Merge pull request #1 from netzbegruenung/first-version

First working code and results
Marian Steinbach 2018-04-05 18:13:03 +02:00 committed by GitHub
commit daa002b506
11 changed files with 45466 additions and 0 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
venv
cache
webapp/node_modules

README.md

@@ -1,2 +1,27 @@
# green-spider
Collects data on green websites and checks for things like SEO, performance, TLS.
Written and tested in Python 3.
### Ideas
- If the URL does not start with `www.`, will entering `www.<url>` also work?
- If the URL is HTTP, is it possible to access the site via HTTPS (recommended)?
- If the URL is HTTPS, is it possible to access the site via HTTP (recommended: redirect to HTTPS)?
- Check which cookies are set and with what settings (expiry, domain)
- Submit the URL to a service like Google PageSpeed and retrieve the score
- Check against our own webpagetest.org instance
- Detect which of the well-known CMSes is used
- Certificate expiry warning (see the sketch below this list)
- Favicon availability check
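
As a rough illustration of the certificate expiry idea, the following standard-library sketch connects to a host, reads its TLS certificate, and computes the days until expiry. This is not part of spider.py (yet); the function name and the example hostname are only illustrative.

```python
import socket
import ssl
from datetime import datetime

def days_until_cert_expiry(hostname, port=443, timeout=5):
    """Return the number of days until the TLS certificate of hostname expires."""
    context = ssl.create_default_context()
    with socket.create_connection((hostname, port), timeout=timeout) as sock:
        with context.wrap_socket(sock, server_hostname=hostname) as tls:
            cert = tls.getpeercert()
    # 'notAfter' is a string like 'Apr  5 18:13:03 2019 GMT'
    expires = datetime.utcfromtimestamp(ssl.cert_time_to_seconds(cert['notAfter']))
    return (expires - datetime.utcnow()).days

# e.g. print(days_until_cert_expiry('www.gruene.de'))
```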
### Usage
```nohighlight
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python spider.py
```
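
Note: `spider.py` also does `from git import Repo`, i.e. it needs the GitPython package, which is not listed in `requirements.txt`. If the import fails inside the virtualenv, installing it manually with `pip install GitPython` should fix that.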

6
requirements.txt Normal file

@@ -0,0 +1,6 @@
certifi==2018.1.18
chardet==3.0.4
idna==2.6
requests==2.18.4
urllib3==1.22
pyyaml==3.12

289
spider.py Normal file

@@ -0,0 +1,289 @@
# coding: utf8
from git import Repo
from multiprocessing import Pool
from socket import gethostbyname_ex
from urllib.parse import urlparse
import certifi
import json
import logging
import os
import random
import requests
import shutil
import sys
import yaml
# configuration
# number of parallel processes to use for crawling
concurrency = 4
# connection timeout for website checks (seconds)
connect_timeout = 5
# response timeout for website checks
read_timeout = 10
# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_direcory_data_path = 'data'
green_directory_local_path = './cache/green-directory'
result_path = './webapp/dist/data'
# end configuration

def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)

def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
    path = os.path.join(green_directory_local_path, green_direcory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile):
                    yield doc

def repr_entry(entry):
    """
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r

def derive_test_hostnames(hostname):
    """
    Derives the hostname variants to test for a given host name.
    From 'gruene-x.de' or 'www.gruene-x.de' it makes
    ['gruene-x.de', 'www.gruene-x.de']
    which are both plausible web URLs to be used for a domain.
    """
    hostnames = set()
    hostnames.add(hostname)
    if hostname.startswith('www.'):
        hostnames.add(hostname[4:])
    else:
        hostnames.add('www.' + hostname)
    return list(hostnames)

def reduce_urls(urllist):
    """
    Reduce a list of urls with metadata by eliminating those
    that either don't work or lead somewhere else
    """
    targets = set()
    for u in urllist:
        if u['error'] is not None:
            continue
        if u['redirects_to'] is not None:
            targets.add(u['redirects_to'])
        else:
            targets.add(u['url'])
    return list(targets)

def check_site(url):
    """
    Performs our site check and returns results as a dict.

    1. Normalize the input URL and derive the URLs to check for
    2. HEAD the check urls
    3. Determine the canonical URL
    4. Run full check on canonical URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    result = {
        'input_url': url,
        'hostnames': [],
        'resolvable_urls': [],
        'canonical_urls': [],
        'urlchecks': [],
    }

    # derive hostnames to test
    parsed = urlparse(url)
    hostnames = derive_test_hostnames(parsed.hostname)

    processed_hostnames = []
    for hn in hostnames:
        record = {
            'input_hostname': hn,
            'resolvable': False,
        }
        try:
            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
            record['resolvable'] = True
            record['resolved_hostname'] = hostname
            record['aliases'] = aliases
            record['ip_addresses'] = ip_addresses
        except:
            # DNS resolution failed; 'resolvable' stays False
            pass
        processed_hostnames.append(record)

    result['hostnames'] = processed_hostnames

    checked_urls = []
    for item in processed_hostnames:
        if not item['resolvable']:
            continue
        for scheme in ('http', 'https'):
            record = {
                'url': scheme + '://' + item['resolved_hostname'] + '/',
                'error': None,
                'redirects_to': None,
            }
            try:
                r = requests.head(record['url'], headers=headers, allow_redirects=True)
                if r.url == url:
                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
                else:
                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                    record['redirects_to'] = r.url
            except Exception as e:
                record['error'] = {
                    'type': str(type(e)),
                    'message': str(e),
                }
                logging.info("URL %s: %s %s" % (url, str(type(e)), e))
            checked_urls.append(record)

    result['resolvable_urls'] = checked_urls
    result['canonical_urls'] = reduce_urls(checked_urls)

    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['canonical_urls']:
        logging.info("Checking URL %s" % check_url)
        check = {
            'url': check_url,
            'status_code': None,
            'duration': None,
            'error': None,
        }
        try:
            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
            check['status_code'] = r.status_code
            # response time in milliseconds
            check['duration'] = round(r.elapsed.total_seconds() * 1000)
        except requests.exceptions.ConnectionError as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection"
        except requests.exceptions.ReadTimeout as e:
            # ReadTimeout is a subclass of Timeout, so it has to be caught first
            logging.error(str(e) + " " + check_url)
            check['error'] = "read_timeout"
        except requests.exceptions.Timeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection_timeout"
        except Exception as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "unknown"
        result['urlchecks'].append(check)

    return result

def main():
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    get_green_directory()

    urls = []
    for entry in dir_entries():
        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
            continue
        website_url = None
        for n in range(len(entry['urls'])):
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
            except KeyError:
                # a missing 'type' or 'url' key raises KeyError
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
        if website_url:
            urls.append(website_url)

    random.seed()
    random.shuffle(urls)

    results = {}

    if concurrency > 1:
        pool = Pool(concurrency)
        for url in urls:
            results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
    else:
        for url in urls:
            results[url] = check_site(url)

    results2 = []
    done = set()

    # convert results to a plain list
    for url in sorted(results.keys()):
        if url not in done:
            result = results[url]
            if concurrency > 1:
                # with a Pool, results holds AsyncResult objects that still need .get()
                result = result.get()
            results2.append(result)
            done.add(url)

    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(results2, jsonfile, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()
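
For orientation, here is a sketch of the shape of one entry in `spider_result.json`, as assembled by `check_site()` above. The keys follow the code; all concrete values (domain, IP address, timings) are invented for illustration, and in the JSON file Python's `True`/`None` serialize to `true`/`null`.

```python
# Illustrative only - one result record as built by check_site()
{
    "input_url": "http://www.gruene-example.de/",
    "hostnames": [
        {
            "input_hostname": "www.gruene-example.de",
            "resolvable": True,
            "resolved_hostname": "www.gruene-example.de",
            "aliases": [],
            "ip_addresses": ["192.0.2.10"],
        },
        {"input_hostname": "gruene-example.de", "resolvable": False},
    ],
    "resolvable_urls": [
        {"url": "http://www.gruene-example.de/", "error": None, "redirects_to": "https://www.gruene-example.de/"},
        {"url": "https://www.gruene-example.de/", "error": None, "redirects_to": None},
    ],
    "canonical_urls": ["https://www.gruene-example.de/"],
    "urlchecks": [
        {"url": "https://www.gruene-example.de/", "status_code": 200, "duration": 123, "error": None},
    ],
}
```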

85
webapp/dist/bundle.js vendored Normal file

File diff suppressed because one or more lines are too long

37126
webapp/dist/data/spider_result.json vendored Normal file

File diff suppressed because it is too large

60
webapp/dist/index.html vendored Normal file

@@ -0,0 +1,60 @@
<!DOCTYPE html>
<html lang="de">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <title>green spider report: Auswertung einiger Merkmale von Bündnis 90/Die Grünen Websites</title>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
    <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
    <style type="text/css">
      body {
        padding: 20px;
      }
      .bad {
        background-color: #ffdbdb;
      }
      .good {
        background-color: #cfeaa8;
      }
    </style>
  </head>
  <body>
    <div class="container-fluid">
      <p>Erläuterungen der Testkriterien am <a href="#docs">Ende der Seite</a>. Gehe zum <a href="https://github.com/netzbegruenung/green-spider/">green-spider GitHub repository</a> um mehr zu erfahren.</p>
      <table class="table">
        <thead>
          <tr>
            <th scope="col">URL</th>
            <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
            <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
            <th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
          </tr>
        </thead>
        <tbody>
        </tbody>
      </table>
      <hr>
      <h3 id="docs">Testkriterien</h3>
      <h4>www. optional</h4>
      <p>Die Site sollte sowohl mit &quot;www.&quot; als auch ohne &quot;www.&quot; am Anfang der URL erreichbar sein.</p>
      <h4>Kanonische URL</h4>
      <p>Verschiedene URLs zum Aufruf der Site (mit/ohne &quot;www.&quot; Präfix, mit/ohne HTTPS) sollten alle per HTTP Redirect auf eine einzige &quot;kanonische&quot; URL weiter leiten.</p>
      <h4>HTTPS</h4>
      <p>Die Site ist per verschlüsselter HTTP-Verbindung (HTTPS-Protokoll) erreichbar.</p>
    </div>
    <script src="bundle.js"></script>
  </body>
</html>

7782
webapp/package-lock.json generated Normal file

File diff suppressed because it is too large

31
webapp/package.json Normal file

@@ -0,0 +1,31 @@
{
  "name": "green-spider-webapp",
  "version": "0.0.1",
  "description": "Displays spider results",
  "private": true,
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/netzbegruenung/green-spider.git"
  },
  "author": "",
  "license": "Apache-2.0",
  "bugs": {
    "url": "https://github.com/netzbegruenung/green-spider/issues"
  },
  "homepage": "https://github.com/netzbegruenung/green-spider#readme",
  "devDependencies": {
    "webpack": "^4.5.0",
    "webpack-cli": "^2.0.14"
  },
  "dependencies": {
    "bootstrap": "^4.0.0",
    "datatables.net": "^1.10.16",
    "jquery": "^3.3.1",
    "lodash": "^4.17.5",
    "popper.js": "^1.14.3",
    "punycode": "^2.1.0"
  }
}

50
webapp/src/index.js Normal file

@@ -0,0 +1,50 @@
import _ from 'lodash';
import $ from 'jquery';
import 'bootstrap';
import 'popper.js';
import punycode from 'punycode';
import 'datatables.net';
$(function(){
  $.getJSON('data/spider_result.json', function(data){
    var tbody = $('tbody');
    $.each(data, function(index, item) {
      var row = $(document.createElement('tr'));
      row.append('<td><a href="' + item.input_url + '">' + punycode.toUnicode(item.input_url) + '</a></td>');

      // hostnames
      var twoHostnames = false;
      if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
        twoHostnames = true;
      }
      row.append('<td class="'+ (twoHostnames ? 'good' : 'bad') +' text-center">' + (twoHostnames ? '✅' : '❌') + '</td>');

      // one canonical URL
      var canonical = false;
      if (item.canonical_urls.length === 1) canonical = true;
      row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');

      // https
      var hasHTTPS = false;
      hasHTTPS = _.find(item.canonical_urls, function(o){
        return o.indexOf('https://') !== -1;
      });
      row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');

      tbody.append(row);
    });

    // enable data table functions (sorting)
    $('table.table').DataTable({
      order: [[0, "asc"]],
      paging: false,
      pageLength: 10000,
      language: {
        "search": "Suche"
      }
    });
  });
});

9
webapp/webpack.config.js Normal file

@@ -0,0 +1,9 @@
const path = require('path');

module.exports = {
  entry: './src/index.js',
  output: {
    filename: 'bundle.js',
    path: path.resolve(__dirname, 'dist')
  }
};
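
`package.json` defines no build script, so the bundle is presumably built by invoking webpack directly: after `npm install` in the `webapp/` directory, `npx webpack --mode production` should emit `dist/bundle.js` according to this configuration. The exact invocation is an assumption; it is not documented in this commit.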