mirror of https://github.com/netzbegruenung/green-spider.git
synced 2024-05-04 01:53:40 +02:00
Merge pull request #1 from netzbegruenung/first-version
First working code and results
This commit is contained in: commit daa002b506
3 .gitignore vendored Normal file
@@ -0,0 +1,3 @@
venv
cache
webapp/node_modules
25 README.md
@@ -1,2 +1,27 @@
# green-spider

Collects data on websites of Bündnis 90/Die Grünen and checks for things like SEO, performance, and TLS.

Written and tested in Python 3

### Ideas

Checks we might implement (a sketch of the first two follows this list):

- If the URL does not start with `www.`, does entering `www.<url>` also work?
- If the URL is HTTP, is the site also accessible via HTTPS (recommended)?
- If the URL is HTTPS, is the site also accessible via HTTP (recommended: redirect to HTTPS)?
- Check which cookies are set and with which settings (expiry, domain)
- Submit the URL to a service like Google PageSpeed and retrieve the score
- Check against our own webpagetest.org instance
- Detect which of the well-known CMSes is used
- Certificate expiry warning
- Favicon availability check
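A minimal sketch of how the first two checks could work, assuming `requests` is available; the helper name and the hostname in the example call are made up for illustration:

```python
import requests

def probe_variants(hostname):
    # Illustrative helper, not part of spider.py: try the www./non-www.
    # variant of a hostname with both schemes and report what responds.
    if hostname.startswith('www.'):
        variants = [hostname, hostname[4:]]
    else:
        variants = [hostname, 'www.' + hostname]
    reachable = []
    for host in variants:
        for scheme in ('http', 'https'):
            url = '%s://%s/' % (scheme, host)
            try:
                r = requests.head(url, timeout=5, allow_redirects=True)
                reachable.append((url, r.status_code, r.url))
            except requests.RequestException:
                pass  # this variant is not reachable
    return reachable

# hypothetical example:
# probe_variants('gruene-musterstadt.de')
```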
### Usage

```nohighlight
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt

python spider.py
```
6 requirements.txt Normal file
@@ -0,0 +1,6 @@
certifi==2018.1.18
chardet==3.0.4
idna==2.6
requests==2.18.4
urllib3==1.22
pyyaml==3.12
GitPython  # provides the "git" module that spider.py imports
289 spider.py Normal file
@@ -0,0 +1,289 @@
# coding: utf8

from git import Repo
from multiprocessing import Pool
from socket import gethostbyname_ex
from urllib.parse import urlparse
import certifi
import json
import logging
import os
import random
import requests
import shutil
import sys
import yaml

# configuration

# number of parallel processes to use for crawling
concurrency = 4

# connection timeout for website checks (seconds)
connect_timeout = 5

# response timeout for website checks (seconds)
read_timeout = 10

# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_directory_data_path = 'data'
green_directory_local_path = './cache/green-directory'

result_path = './webapp/dist/data'

# end configuration

def get_green_directory():
    """
    Clones the green directory, the source of website URLs,
    into the local file system using git
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)


def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, _, files in os.walk(path):
        for fname in files:

            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue

            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile):
                    yield doc

def repr_entry(entry):
    """
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r

def derive_test_hostnames(hostname):
    """
    Derives the hostname variants to test for a given host name.
    From 'gruene-x.de' or 'www.gruene-x.de' it makes

    ['gruene-x.de', 'www.gruene-x.de']

    which are both plausible web URLs to be used for a domain.
    """

    hostnames = set()

    hostnames.add(hostname)
    if hostname.startswith('www.'):
        hostnames.add(hostname[4:])
    else:
        hostnames.add('www.' + hostname)

    return list(hostnames)

def reduce_urls(urllist):
    """
    Reduce a list of URLs with metadata by eliminating those
    that either don't work or lead somewhere else
    """
    targets = set()
    for u in urllist:
        if u['error'] is not None:
            continue
        if u['redirects_to'] is not None:
            targets.add(u['redirects_to'])
        else:
            targets.add(u['url'])
    return list(targets)

def check_site(url):
    """
    Performs our site check and returns results as a dict.

    1. Normalize the input URL and derive the URLs to check
    2. HEAD the check URLs
    3. Determine the canonical URL
    4. Run the full check on the canonical URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    result = {
        'input_url': url,
        'hostnames': [],
        'resolvable_urls': [],
        'canonical_urls': [],
        'urlchecks': [],
    }

    # derive hostnames to test
    parsed = urlparse(url)
    hostnames = derive_test_hostnames(parsed.hostname)

    processed_hostnames = []
    for hn in hostnames:

        record = {
            'input_hostname': hn,
            'resolvable': False,
        }

        try:
            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
            record['resolvable'] = True
            record['resolved_hostname'] = hostname
            record['aliases'] = aliases
            record['ip_addresses'] = ip_addresses
        except Exception:
            pass

        processed_hostnames.append(record)

    result['hostnames'] = processed_hostnames

    checked_urls = []
    for item in processed_hostnames:
        if not item['resolvable']:
            continue

        for scheme in ('http', 'https'):

            record = {
                'url': scheme + '://' + item['resolved_hostname'] + '/',
                'error': None,
                'redirects_to': None,
            }

            try:
                # timeout so that unresponsive hosts don't stall the spider
                r = requests.head(record['url'], headers=headers,
                                  allow_redirects=True,
                                  timeout=(connect_timeout, read_timeout))
                # compare against the URL we actually requested
                if r.url == record['url']:
                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
                else:
                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                    record['redirects_to'] = r.url
            except Exception as e:
                record['error'] = {
                    'type': str(type(e)),
                    'message': str(e),
                }
                logging.info("URL %s: %s %s" % (url, str(type(e)), e))

            checked_urls.append(record)

    result['resolvable_urls'] = checked_urls
    result['canonical_urls'] = reduce_urls(checked_urls)

    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['canonical_urls']:

        logging.info("Checking URL %s" % check_url)

        check = {
            'url': check_url,
            'status_code': None,
            'duration': None,
            'error': None,
        }

        try:
            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
            check['status_code'] = r.status_code
            # total response time in milliseconds
            check['duration'] = round(r.elapsed.total_seconds() * 1000)
        except requests.exceptions.ConnectionError as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection"
        # ReadTimeout is a subclass of Timeout and must be caught first
        except requests.exceptions.ReadTimeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "read_timeout"
        except requests.exceptions.Timeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection_timeout"
        except Exception as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "unknown"

        result['urlchecks'].append(check)

    return result

def main():
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    get_green_directory()

    urls = []
    for entry in dir_entries():

        if 'type' not in entry:
            logging.error("Entry without type")
            continue

        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
            continue

        website_url = None
        for link in entry['urls']:
            try:
                if link['type'] == "WEBSITE":
                    website_url = link['url']
            except KeyError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), link))
        if website_url:
            urls.append(website_url)

    random.seed()
    random.shuffle(urls)

    results = {}

    if concurrency > 1:
        pool = Pool(concurrency)
        for url in urls:
            results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
    else:
        for url in urls:
            results[url] = check_site(url)

    results2 = []
    done = set()

    # unwrap ApplyResult objects into plain dicts (only needed when the pool was used)
    for url in sorted(results.keys()):
        if url not in done:
            result = results[url]
            if concurrency > 1:
                result = result.get()
            results2.append(result)
            done.add(url)

    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(results2, jsonfile, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()
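For a quick manual check of a single site, check_site can also be called directly from a Python shell; a minimal sketch, assuming the dependencies from requirements.txt are installed (the URL is just an example):

```python
# One-off invocation of the spider's check on a single URL.
import json
from spider import check_site

result = check_site('https://www.gruene.de/')
print(json.dumps(result, indent=2, sort_keys=True))
```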
85 webapp/dist/bundle.js vendored Normal file
File diff suppressed because one or more lines are too long
37126 webapp/dist/data/spider_result.json vendored Normal file
File diff suppressed because it is too large
60 webapp/dist/index.html vendored Normal file
@@ -0,0 +1,60 @@
<!DOCTYPE html>
<html lang="de">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <title>green spider report: Auswertung einiger Merkmale von Bündnis 90/Die Grünen Websites</title>
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
  <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
  <style type="text/css">
    body {
      padding: 20px;
    }
    .bad {
      background-color: #ffdbdb;
    }
    .good {
      background-color: #cfeaa8;
    }
  </style>
</head>
<body>

  <div class="container-fluid">

    <p>Erläuterungen der Testkriterien am <a href="#docs">Ende der Seite</a>. Gehe zum <a href="https://github.com/netzbegruenung/green-spider/">green-spider GitHub repository</a>, um mehr zu erfahren.</p>

    <table class="table">
      <thead>
        <tr>
          <th scope="col">URL</th>
          <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
          <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
          <th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
        </tr>
      </thead>
      <tbody>
      </tbody>
    </table>

    <hr>

    <h3 id="docs">Testkriterien</h3>

    <h4>www. optional</h4>

    <p>Die Site sollte sowohl mit "www." als auch ohne "www." am Anfang der URL erreichbar sein.</p>

    <h4>Kanonische URL</h4>

    <p>Verschiedene URLs zum Aufruf der Site (mit/ohne "www." Präfix, mit/ohne HTTPS) sollten alle per HTTP-Redirect auf eine einzige "kanonische" URL weiterleiten.</p>

    <h4>HTTPS</h4>

    <p>Die Site ist per verschlüsselter HTTP-Verbindung (HTTPS-Protokoll) erreichbar.</p>

  </div>

  <script src="bundle.js"></script>
</body>
</html>
7782 webapp/package-lock.json generated Normal file
File diff suppressed because it is too large
31 webapp/package.json Normal file
@@ -0,0 +1,31 @@
{
  "name": "green-spider-webapp",
  "version": "0.0.1",
  "description": "Displays spider results",
  "private": true,
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/netzbegruenung/green-spider.git"
  },
  "author": "",
  "license": "Apache-2.0",
  "bugs": {
    "url": "https://github.com/netzbegruenung/green-spider/issues"
  },
  "homepage": "https://github.com/netzbegruenung/green-spider#readme",
  "devDependencies": {
    "webpack": "^4.5.0",
    "webpack-cli": "^2.0.14"
  },
  "dependencies": {
    "bootstrap": "^4.0.0",
    "datatables.net": "^1.10.16",
    "jquery": "^3.3.1",
    "lodash": "^4.17.5",
    "popper.js": "^1.14.3",
    "punycode": "^2.1.0"
  }
}
50 webapp/src/index.js Normal file
@@ -0,0 +1,50 @@
import _ from 'lodash';
import $ from 'jquery';
import 'bootstrap';
import 'popper.js';
import punycode from 'punycode';
import 'datatables.net';


$(function(){

  $.getJSON('data/spider_result.json', function(data){
    var tbody = $('tbody');
    $.each(data, function(index, item) {
      var row = $(document.createElement('tr'));
      row.append('<td><a href="' + item.input_url + '">' + punycode.toUnicode(item.input_url) + '</a></td>');

      // hostnames: both the www. and the non-www. variant should resolve
      var twoHostnames = false;
      if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
        twoHostnames = true;
      }
      row.append('<td class="'+ (twoHostnames ? 'good' : 'bad') +' text-center">' + (twoHostnames ? '✅' : '❌') + '</td>');

      // one canonical URL
      var canonical = false;
      if (item.canonical_urls.length === 1) canonical = true;
      row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');

      // https
      var hasHTTPS = false;
      hasHTTPS = _.find(item.canonical_urls, function(o){
        return o.indexOf('https://') !== -1;
      });
      row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');

      tbody.append(row);
    });

    // enable data table functions (sorting)
    $('table.table').DataTable({
      order: [[0, "asc"]],
      paging: false,
      pageLength: 10000,
      language: {
        "search": "Suche"
      }
    });
  });

});
9 webapp/webpack.config.js Normal file
@@ -0,0 +1,9 @@
const path = require('path');

module.exports = {
  entry: './src/index.js',
  output: {
    filename: 'bundle.js',
    path: path.resolve(__dirname, 'dist')
  }
};