Merge pull request #17 from netzbegruenung/refactor-logic

Refactoring and a few new features
Marian Steinbach 2018-04-19 12:41:09 +02:00 committed by GitHub
commit 80d3567eef
9 changed files with 148047 additions and 92643 deletions

KRITERIEN.md (new file, 19 lines)

@@ -0,0 +1,19 @@
# Quality criteria
We check sites against the following criteria:
- `DNS_RESOLVABLE_IPV4`: The hostname in the URL resolves to an IPv4 address.
- `SITE_REACHABLE`: The site is reachable via HTTP(S) (status code 200).
- `HTTPS`: The site is reachable via HTTPS, and the server certificate is valid and issued by a trusted CA.
- `WWW_OPTIONAL`: The `www.` prefix at the start of the home page URL is optional; the site is reachable both with and without it in the hostname.
- `CANONICAL_URL`: If the site can be reached via several URLs, they redirect to one canonical URL or reference it via a `rel=canonical` link.
- `FAVICON`: The site has a favicon.
- `FEEDS`: The site references RSS or Atom feeds via `rel=alternate` link tags.
- `HTTP_RESPONSE_DURATION`: The time elapsed between sending the HTTP request and receiving the response headers (see the scoring sketch below).
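As a reading aid for the last criterion, here is a minimal sketch of how the `HTTP_RESPONSE_DURATION` score could be derived from measured durations. The thresholds mirror those used in `spider.py` in this commit; the function name itself is hypothetical.

```python
import statistics

def http_response_duration_score(durations_ms):
    """Return (mean duration in ms, score) for a non-empty list of durations.

    Thresholds as in spider.py: below 100 ms scores 1, below 1000 ms scores 0.5,
    anything slower scores 0.
    """
    mean_ms = round(statistics.mean(durations_ms))
    if mean_ms < 100:
        return mean_ms, 1
    if mean_ms < 1000:
        return mean_ms, 0.5
    return mean_ms, 0
```

For example, durations of 80 ms and 120 ms average to 100 ms and therefore score 0.5.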

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -17,11 +17,19 @@
background-color: #ffdbdb;
font-size: 1rem;
}
.medium {
background-color: #fce7ac;
font-size: 1rem;
}
.good {
background-color: #cfeaa8;
font-size: 1rem;
}
td.text {
font-size: 0.85rem;
}
.icon {
width: 32px;
height: 32px;
@@ -38,13 +46,20 @@
<div class="container-fluid">
<p>Erläuterungen der Testkriterien am <a href="#docs">Ende der Seite</a>. Gehe zum <a href="https://github.com/netzbegruenung/green-spider/">green-spider GitHub repository</a> um mehr zu erfahren.</p>
<p>Weitere Information: <a href="https://github.com/netzbegruenung/green-spider/">green-spider GitHub repository</a> - <a href="https://github.com/netzbegruenung/green-spider/blob/master/KRITERIEN.md">Kriterien</a></p>
<table class="table">
<thead>
<tr>
<th scope="col">Typ</th>
<th scope="col">Land</th>
<th scope="col">Kreis</th>
<th scope="col">Stadt</th>
<th scope="col">URL</th>
<th scope="col">Score</th>
<th scope="col">IP-Adresse</th>
<th scope="col">Erreichbar</th>
<th scope="col">Antwortzeit</th>
<th scope="col">Icon</th>
<th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
<th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
@@ -57,22 +72,6 @@
</tbody>
</table>
<hr>
<h3 id="docs">Testkriterien</h3>
<h4>www. optional</h4>
<p>Die Site sollte sowohl mit &quot;www.&quot; als auch ohne &quot;www.&quot; am Anfang der URL erreichbar sein.</p>
<h4>Kanonische URL</h4>
<p>Verschiedene URLs zum Aufruf der Site (mit/ohne &quot;www.&quot; Präfix, mit/ohne HTTPS) sollten alle per HTTP Redirect auf eine einzige &quot;kanonische&quot; URL weiter leiten.</p>
<h4>HTTPS</h4>
<p>Die Site ist per verschlüsselter HTTP-Verbindung (HTTPS-Protokoll) erreichbar.</p>
</div>
<script src="bundle.js"></script>

spider.py (265 changed lines)

@@ -14,13 +14,15 @@ import random
import re
import requests
import shutil
import statistics
import sys
import yaml
# configuration
# number of parallel processes to use for crawling
concurrency = 4
concurrency = 3
# connection timeout for website checks (seconds)
connect_timeout = 5
@@ -31,7 +33,7 @@ read_timeout = 10
# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_direcory_data_path = 'data'
green_direcory_data_path = 'data/countries/de'
green_directory_local_path = './cache/green-directory'
result_path = './webapp/dist/data'
@@ -153,11 +155,11 @@ def check_content(r):
# icon
result['icon'] = None
link = soup.find('link', rel='icon')
link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
if link:
result['icon'] = urljoin(r.url, link.get('href'))
else:
link = soup.find('link', rel='shortcut icon')
link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
if link:
result['icon'] = urljoin(r.url, link.get('href'))
@@ -192,7 +194,20 @@ def check_content(r):
return result
def check_site(url):
def collect_ipv4_addresses(hostname_dict):
"""
Return list of unique IPv4 addresses
"""
ips = set()
for item in hostname_dict.values():
if 'ip_addresses' not in item:
continue
for ip in item['ip_addresses']:
ips.add(ip)
return sorted(list(ips))
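# Illustration with a hypothetical hostname dict (made-up 192.0.2.x addresses):
#
#   collect_ipv4_addresses({
#       'example.com':     {'resolvable': True, 'ip_addresses': ['192.0.2.1']},
#       'www.example.com': {'resolvable': True, 'ip_addresses': ['192.0.2.1', '192.0.2.2']},
#       'old.example.com': {'resolvable': False},
#   })
#   # -> ['192.0.2.1', '192.0.2.2']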
def check_site(entry):
"""
Performs our site check and returns results as a dict.
@@ -205,49 +220,88 @@ def check_site(url):
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
}
# all the info we'll return for the site
result = {
'input_url': url,
'hostnames': [],
'resolvable_urls': [],
'canonical_urls': [],
'urlchecks': [],
# input_url: The URL we derived all checks from
'input_url': entry['url'],
# Meta: Regional and type metadata for the site
'meta': {
'level': entry['level'],
'state': entry['state'],
'district': entry['district'],
'city': entry['city'],
},
# Details: All details we collected about the site (which aren't directly related to the report criteria)
'details': {
'hostnames': {},
'ipv4_addresses': [],
'resolvable_urls': [],
'canonical_urls': [],
'urlchecks': [],
'icons': [],
'feeds': [],
},
# The actual report criteria
'result': {
'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0},
'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0},
'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0},
'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0},
'FAVICON': {'type': 'boolean', 'value': False, 'score': 0},
'FEEDS': {'type': 'boolean', 'value': False, 'score': 0},
'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0},
},
'score': 0.0,
}
# derive hostnames to test
parsed = urlparse(url)
# derive hostnames to test (with/without www.)
parsed = urlparse(entry['url'])
hostnames = derive_test_hostnames(parsed.hostname)
processed_hostnames = []
# try to resolve hostnames
processed_hostnames = {}
for hn in hostnames:
record = {
'input_hostname': hn,
processed_hostnames[hn] = {
'resolvable': False,
}
try:
hostname, aliases, ip_addresses = gethostbyname_ex(hn)
record['resolvable'] = True
record['resolved_hostname'] = hostname
record['aliases'] = aliases
record['ip_addresses'] = ip_addresses
processed_hostnames[hn]['resolvable'] = True
processed_hostnames[hn]['resolved_hostname'] = hostname
processed_hostnames[hn]['aliases'] = aliases
processed_hostnames[hn]['ip_addresses'] = ip_addresses
except:
pass
processed_hostnames.append(record)
result['details']['hostnames'] = processed_hostnames
result['hostnames'] = sorted(processed_hostnames, key=lambda hn: hn['input_hostname'])
result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)
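# After this loop, processed_hostnames maps every candidate hostname to a record
# such as {'resolvable': True, 'resolved_hostname': ..., 'aliases': [...],
# 'ip_addresses': [...]}; hostnames that fail to resolve keep only
# {'resolvable': False}. gethostbyname_ex() supplies the
# (canonical_hostname, alias_list, ip_address_list) triple used to fill these fields.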
# check basic HTTP(S) reachability
checked_urls = []
for item in processed_hostnames:
checked_urls_set = set()
for hn in processed_hostnames.keys():
item = processed_hostnames[hn]
if not item['resolvable']:
continue
for scheme in ('http', 'https'):
url = scheme + '://' + item['resolved_hostname'] + '/'
if url in checked_urls_set:
continue
checked_urls_set.add(url)
record = {
'url': scheme + '://' + item['resolved_hostname'] + '/',
'url': url,
'error': None,
'redirects_to': None,
}
@@ -255,7 +309,7 @@ def check_site(url):
try:
r = requests.head(record['url'], headers=headers, allow_redirects=True)
if r.url == url:
logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
logging.info("URL: %s - status %s" % (record['url'], r.status_code))
else:
logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
record['redirects_to'] = r.url
@@ -268,13 +322,13 @@ def check_site(url):
checked_urls.append(record)
result['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
result['canonical_urls'] = sorted(reduce_urls(checked_urls))
result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls))
# Deeper test for the remaining (canonical) URL(s)
for check_url in result['canonical_urls']:
for check_url in result['details']['canonical_urls']:
logging.info("Checking URL %s" % check_url)
logging.info("Downloading URL %s" % check_url)
check = {
'url': check_url,
@@ -306,27 +360,125 @@ def check_site(url):
logging.error(str(e) + " " + check_url)
check['error'] = "unknown"
result['urlchecks'].append(check)
result['details']['urlchecks'].append(check)
result['urlchecks'] = sorted(result['urlchecks'], key=lambda url: url['url'])
result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
# collect icons
icons = set()
for c in result['details']['urlchecks']:
if 'content' not in c:
continue
if c['content'] is None:
logging.warning("No content for %s" % entry['url'])
continue
if c['content']['icon'] is not None:
icons.add(c['content']['icon'])
result['details']['icons'] = sorted(list(icons))
# collect feeds
feeds = set()
for c in result['details']['urlchecks']:
if c['content'] is None:
logging.warning("No content for %s" % entry['url'])
continue
if 'feeds' in c['content'] and len(c['content']['feeds']):
for feed in c['content']['feeds']:
feeds.add(feed)
result['details']['feeds'] = sorted(list(feeds))
### Derive criteria
# DNS_RESOLVABLE_IPV4
if len(result['details']['ipv4_addresses']):
result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
# SITE_REACHABLE
for item in result['details']['resolvable_urls']:
if item['error'] is None:
result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1}
break
# HTTPS
for item in result['details']['urlchecks']:
if item['error'] is None and item['url'].startswith('https://'):
result['result']['HTTPS'] = {'value': True, 'score': 1}
break
# WWW_OPTIONAL
num_hostnames = 0
for hn in result['details']['hostnames'].keys():
item = result['details']['hostnames'][hn]
if not item['resolvable']:
continue
num_hostnames += 1
if num_hostnames > 1:
result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1}
# CANONICAL_URL
# - either there is only one canonical URL (through redirects)
# - or several pages have identical rel=canonical links
if len(result['details']['canonical_urls']) == 1:
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
else:
links = set()
if result['details']['urlchecks'] is None:
logging.warning("No urlchecks for %s" % entry['url'])
else:
for item in result['details']['urlchecks']:
if item['content']['canonical_link'] is not None:
links.add(item['content']['canonical_link'])
if len(links) == 1:
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
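# Example (hypothetical): if http://example.com/ and https://www.example.com/ both
# serve a <link rel="canonical" href="https://www.example.com/">, the links set
# collapses to a single entry and CANONICAL_URL passes even though more than one
# canonical_urls candidate remains.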
# FAVICON
if len(result['details']['icons']):
result['result']['FAVICON'] = {'value': True, 'score': 1}
# FEEDS
if len(result['details']['feeds']):
result['result']['FEEDS'] = {'value': True, 'score': 1}
# HTTP_RESPONSE_DURATION
durations = []
for item in result['details']['urlchecks']:
if item['error'] is None:
durations.append(item['duration'])
val = round(statistics.mean(durations))
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
if val < 100:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
elif val < 1000:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
# Overall score
for item in result['result'].keys():
result['score'] += result['result'][item]['score']
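# With eight criteria, each contributing at most 1 point, the overall score
# ranges from 0.0 to 8.0.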
return result
def main():
"""
Bringing it all together
"""
logging.basicConfig(level=logging.INFO)
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
# refresh our local clone of the green directory
get_green_directory()
urls = []
# build the list of website URLs to run checks for
logging.info("Processing green-directory")
input_entries = []
for entry in dir_entries():
if 'type' not in entry:
logging.error("Entry without type")
continue
if 'urls' not in entry:
logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
continue
@@ -339,36 +491,55 @@ def main():
except NameError as ne:
logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
if website_url:
urls.append(website_url)
input_entries.append({
"url": website_url,
"level": entry.get("level"),
"state": entry.get("state"),
"district": entry.get("district"),
"city": entry.get("city"),
})
# randomize order, to distribute requests over servers
logging.info("Shuffling input URLs")
random.seed()
random.shuffle(urls)
random.shuffle(input_entries)
# run checks
logging.info("Starting checks")
results = {}
if concurrency > 1:
pool = Pool(concurrency)
for url in urls:
results[url] = pool.apply_async(check_site, kwds={"url": url})
pool.close()
pool.join()
else:
for url in urls:
results[url] = check_site(url)
pool = Pool(concurrency)
for ientry in input_entries:
logging.info("Submitting %s to job pool" % ientry['url'])
results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
pool.close()
pool.join()
results2 = []
logging.info("Checks are finished")
# Restructure result from dict of ApplyResult
# to list of dicts and sort in stable way
json_result = []
done = set()
logging.info("Restructuring results")
# convert results from ApplyResult to dict
for url in sorted(results.keys()):
if url not in done:
results2.append(results[url].get())
logging.info("Getting result for %s" % url)
try:
resultsitem = results[url].get()
json_result.append(resultsitem)
except Exception as e:
logging.error("Error ehn getting result for '%s': %s" % (url, e))
done.add(url)
# Write result as JSON
output_filename = os.path.join(result_path, "spider_result.json")
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(results2, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
if __name__ == "__main__":

webapp/dist/bundle.js (vendored, 28 changed lines)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -17,11 +17,19 @@
background-color: #ffdbdb;
font-size: 1rem;
}
.medium {
background-color: #fce7ac;
font-size: 1rem;
}
.good {
background-color: #cfeaa8;
font-size: 1rem;
}
td.text {
font-size: 0.85rem;
}
.icon {
width: 32px;
height: 32px;
@@ -38,13 +46,20 @@
<div class="container-fluid">
<p>Erläuterungen der Testkriterien am <a href="#docs">Ende der Seite</a>. Gehe zum <a href="https://github.com/netzbegruenung/green-spider/">green-spider GitHub repository</a> um mehr zu erfahren.</p>
<p>Weitere Information: <a href="https://github.com/netzbegruenung/green-spider/">green-spider GitHub repository</a> - <a href="https://github.com/netzbegruenung/green-spider/blob/master/KRITERIEN.md">Kriterien</a></p>
<table class="table">
<thead>
<tr>
<th scope="col">Typ</th>
<th scope="col">Land</th>
<th scope="col">Kreis</th>
<th scope="col">Stadt</th>
<th scope="col">URL</th>
<th scope="col">Score</th>
<th scope="col">IP-Adresse</th>
<th scope="col">Erreichbar</th>
<th scope="col">Antwortzeit</th>
<th scope="col">Icon</th>
<th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
<th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
@@ -57,22 +72,6 @@
</tbody>
</table>
<hr>
<h3 id="docs">Testkriterien</h3>
<h4>www. optional</h4>
<p>Die Site sollte sowohl mit &quot;www.&quot; als auch ohne &quot;www.&quot; am Anfang der URL erreichbar sein.</p>
<h4>Kanonische URL</h4>
<p>Verschiedene URLs zum Aufruf der Site (mit/ohne &quot;www.&quot; Präfix, mit/ohne HTTPS) sollten alle per HTTP Redirect auf eine einzige &quot;kanonische&quot; URL weiter leiten.</p>
<h4>HTTPS</h4>
<p>Die Site ist per verschlüsselter HTTP-Verbindung (HTTPS-Protokoll) erreichbar.</p>
</div>
<script src="bundle.js"></script>


@@ -8,6 +8,13 @@ import LazyLoad from 'vanilla-lazyload';
$(function(){
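// Truncate long display strings (such as input URLs) to `length` characters
// and append an ellipsis; used further down to keep long URLs from
// stretching the table layout.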
var trunc = function(s, length) {
if (s.length > length) {
s = s.substring(0, length) + '…';
}
return s;
};
var table = null;
$.getJSON('data/screenshots.json', function(screenshots){
@@ -18,54 +25,71 @@ $(function(){
var row = $(document.createElement('tr'));
// typ
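// Abbreviate the chapter level from green-directory for display:
// OV = Ortsverband (local), KV = Kreisverband (district), LV = Landesverband (state).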
var level = null;
if (item.meta.level === 'DE:ORTSVERBAND') {
level = 'OV';
} else if (item.meta.level === 'DE:KREISVERBAND') {
level = 'KV';
} else if (item.meta.level === 'DE:LANDESVERBAND') {
level = 'LV';
}
row.append('<td>' + (level === null ? '' : level) + '</td>');
// land
row.append('<td>' + (item.meta.state === null ? '' : item.meta.state) + '</td>');
// kreis
row.append('<td>' + (item.meta.district === null ? '' : item.meta.district) + '</td>');
// stadt
row.append('<td>' + (item.meta.city === null ? '' : item.meta.city) + '</td>');
// input URL
row.append('<td><a href="' + item.input_url + '">' + punycode.toUnicode(item.input_url) + '</a></td>');
row.append('<td><a href="' + item.input_url + '">' + trunc(punycode.toUnicode(item.input_url), 60) + '</a></td>');
// score
row.append('<td>' + item.score.toFixed(1) + '</td>');
// IPs
var ips = _.join(_.uniq(_.flatten(_.map(item.hostnames, 'ip_addresses'))), ', ');
row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌ Keine' : ips) + '</td>');
var ips = _.join(item.details.ipv4_addresses, ', ');
row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌' : ips) + '</td>');
// icon
var icons = [];
var icon = false;
icons = _.uniq(_.map(item.urlchecks, 'content.icon'));
if (icons.length > 0 && icons[0]) {
icon = icons[0];
}
row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + icon + '" class="icon"/>') : '❌') + '</td>');
// SITE_REACHABLE
row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center">' + (item.result.SITE_REACHABLE.value ? '✅' : '❌') + '</td>');
// hostnames
var twoHostnames = false;
if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
twoHostnames = true;
};
row.append('<td class="'+ (twoHostnames ? 'good' : 'bad') +' text-center">' + (twoHostnames ? '✅' : '❌') + '</td>');
// HTTP_RESPONSE_DURATION
var durationClass = 'bad';
if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</td>');
// FAVICON
var icon = item.result.FAVICON.value;
row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + item.details.icons[0] + '" class="icon">') : '❌') + '</td>');
// WWW_OPTIONAL
var wwwOptional = item.result.WWW_OPTIONAL.value;
row.append('<td class="'+ (wwwOptional ? 'good' : 'bad') +' text-center">' + (wwwOptional ? '✅' : '❌') + '</td>');
// one canonical URL
var canonical = false;
if (item.canonical_urls.length === 1) canonical = true;
var canonical_links = _.uniq(_.map(item.urlchecks, 'content.canonical_link'));
if (canonical_links.length === 1) canonical = true;
var canonical = item.result.CANONICAL_URL.value;
row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');
// https
var hasHTTPS = false;
hasHTTPS = _.find(item.canonical_urls, function(o){
return o.indexOf('https://') !== -1;
});
var hasHTTPS = item.result.HTTPS.value;
row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');
// feeds
var feeds = false;
feeds = _.uniq(_.flatten(_.map(item.urlchecks, 'content.feeds')));
row.append('<td class="'+ (feeds.length ? 'good' : 'bad') +' text-center">' + (feeds.length ? '✅' : '❌') + '</td>');
var feeds = item.result.FEEDS.value;
row.append('<td class="'+ (feeds ? 'good' : 'bad') +' text-center">' + (feeds ? '✅' : '❌') + '</td>');
// screenshots
var screenshot = false;
if (item.canonical_urls.length > 0) {
if (typeof screenshots[item.canonical_urls[0]] !== 'undefined') {
var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.canonical_urls[0]];
var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.canonical_urls[0]];
if (item.details.canonical_urls.length > 0) {
if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];
screenshot = '<a class="screenshot" href="'+ surl +'" target="_blank" title="Mobile">M</a>';
screenshot += '<a class="screenshot" href="'+ lurl +'" target="_blank" title="Desktop">D</a>';
}