Merge pull request #1 from netzbegruenung/first-version

First working code and results
Marian Steinbach 2018-04-05 18:13:03 +02:00 committed by GitHub
commit daa002b506
11 changed files with 45466 additions and 0 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
venv
cache
webapp/node_modules

25
README.md

@@ -1,2 +1,27 @@
# green-spider
Collects data on Bündnis 90/Die Grünen websites and checks for things like SEO, performance, and TLS. Written and tested with Python 3.

### Ideas
- If the URL does not start with `www.`, will entering `www.<url>` also work?
- If the URL is HTTP, is it possible to access the site via HTTPS (recommended)? See the sketch below this list.
- If the URL is HTTPS, is it possible to access the site via HTTP (recommended: redirect to HTTPS)?
- Check which cookies are set and with what settings (expiry, domain)
- Submit the URL to a service like Google PageSpeed Insights and retrieve the score
- Check against our own webpagetest.org instance
- Detect which of the well-known CMSes is used, if any
- Certificate expiry warning
- Favicon (favourite icon) availability check
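
A minimal sketch of how the HTTPS idea above could be probed with `requests`. The `probe_https_upgrade` helper and its timeout values are illustrative assumptions, not part of this commit:

```python
import requests

def probe_https_upgrade(url, timeout=(5, 10)):
    """Return the final URL reached when requesting the https:// variant
    of an http:// URL, or None if the HTTPS variant is not reachable."""
    https_url = url.replace('http://', 'https://', 1)
    try:
        r = requests.head(https_url, allow_redirects=True, timeout=timeout)
        return r.url
    except requests.exceptions.RequestException:
        return None
```
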
### Usage
```nohighlight
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python spider.py
```
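
The spider writes its results to `webapp/dist/data/spider_result.json`, which the web app in `webapp/` loads to render the report table.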

6
requirements.txt Normal file

@@ -0,0 +1,6 @@
certifi==2018.1.18
chardet==3.0.4
idna==2.6
requests==2.18.4
urllib3==1.22
pyyaml==3.12

289
spider.py Normal file

@@ -0,0 +1,289 @@
# coding: utf8
from git import Repo
from multiprocessing import Pool
from socket import gethostbyname_ex
from urllib.parse import urlparse
import certifi
import json
import logging
import os
import random
import requests
import shutil
import sys
import yaml
# configuration
# number of parallel processes to use for crawling
concurrency = 4
# connection timeout for website checks (seconds)
connect_timeout = 5
# response timeout for website checks
read_timeout = 10
# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_directory_data_path = 'data'
green_directory_local_path = './cache/green-directory'
result_path = './webapp/dist/data'
# end configuration

def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)

def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile):
                    yield doc

def repr_entry(entry):
    """
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r

def derive_test_hostnames(hostname):
    """
    Derives the hostname variants to test for a given host name.
    From 'gruene-x.de' or 'www.gruene-x.de' it makes
    ['gruene-x.de', 'www.gruene-x.de']
    which are both plausible web URLs to be used for a domain.
    """
    hostnames = set()
    hostnames.add(hostname)
    if hostname.startswith('www.'):
        hostnames.add(hostname[4:])
    else:
        hostnames.add('www.' + hostname)
    return list(hostnames)

def reduce_urls(urllist):
    """
    Reduce a list of urls with metadata by eliminating those
    that either don't work or lead somewhere else
    """
    targets = set()
    for u in urllist:
        if u['error'] is not None:
            continue
        if u['redirects_to'] is not None:
            targets.add(u['redirects_to'])
        else:
            targets.add(u['url'])
    return list(targets)

def check_site(url):
    """
    Performs our site check and returns results as a dict.

    1. Normalize the input URL and derive the URLs to check for
    2. HEAD the check urls
    3. Determine the canonical URL
    4. Run full check on canonical URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    result = {
        'input_url': url,
        'hostnames': [],
        'resolvable_urls': [],
        'canonical_urls': [],
        'urlchecks': [],
    }

    # derive hostnames to test
    parsed = urlparse(url)
    hostnames = derive_test_hostnames(parsed.hostname)

    processed_hostnames = []
    for hn in hostnames:
        record = {
            'input_hostname': hn,
            'resolvable': False,
        }
        try:
            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
            record['resolvable'] = True
            record['resolved_hostname'] = hostname
            record['aliases'] = aliases
            record['ip_addresses'] = ip_addresses
        except Exception:
            # hostname does not resolve
            pass
        processed_hostnames.append(record)

    result['hostnames'] = processed_hostnames

    checked_urls = []
    for item in processed_hostnames:
        if not item['resolvable']:
            continue
        for scheme in ('http', 'https'):
            record = {
                'url': scheme + '://' + item['resolved_hostname'] + '/',
                'error': None,
                'redirects_to': None,
            }
            try:
                r = requests.head(record['url'], headers=headers, allow_redirects=True)
                if r.url == record['url']:
                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
                else:
                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                    record['redirects_to'] = r.url
            except Exception as e:
                record['error'] = {
                    'type': str(type(e)),
                    'message': str(e),
                }
                logging.info("URL %s: %s %s" % (url, str(type(e)), e))
            checked_urls.append(record)

    result['resolvable_urls'] = checked_urls
    result['canonical_urls'] = reduce_urls(checked_urls)

    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['canonical_urls']:
        logging.info("Checking URL %s" % check_url)
        check = {
            'url': check_url,
            'status_code': None,
            'duration': None,
            'error': None,
        }
        try:
            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
            check['status_code'] = r.status_code
            # request duration in milliseconds
            check['duration'] = round(r.elapsed.total_seconds() * 1000)
        except requests.exceptions.ConnectionError as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection"
        except requests.exceptions.ReadTimeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "read_timeout"
        except requests.exceptions.Timeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection_timeout"
        except Exception as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "unknown"
        result['urlchecks'].append(check)

    return result

def main():
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    get_green_directory()

    urls = []
    for entry in dir_entries():
        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
            continue
        website_url = None
        for n in range(len(entry['urls'])):
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
            except KeyError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
        if website_url:
            urls.append(website_url)

    random.seed()
    random.shuffle(urls)

    results = {}
    if concurrency > 1:
        pool = Pool(concurrency)
        for url in urls:
            results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
    else:
        for url in urls:
            results[url] = check_site(url)

    results2 = []
    done = set()

    # collect results, resolving ApplyResult objects from the pool if needed
    for url in sorted(results.keys()):
        if url in done:
            continue
        done.add(url)
        if concurrency > 1:
            results2.append(results[url].get())
        else:
            results2.append(results[url])

    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(results2, jsonfile, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()
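
For trying the check logic on a single site without cloning the green directory first, `check_site()` can be imported and called directly. A minimal sketch; the URL is just an example:

```python
# run a single site through check_site() and print the result as JSON
import json
from spider import check_site

result = check_site(url="https://www.gruene.de/")
print(json.dumps(result, indent=2, sort_keys=True))
```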

85
webapp/dist/bundle.js vendored Normal file

File diff suppressed because one or more lines are too long

37126
webapp/dist/data/spider_result.json vendored Normal file

File diff suppressed because it is too large

60
webapp/dist/index.html vendored Normal file

@@ -0,0 +1,60 @@
<!DOCTYPE html>
<html lang="de">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <title>green spider report: Auswertung einiger Merkmale von Bündnis 90/Die Grünen Websites</title>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
    <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.css">
    <style type="text/css">
      body {
        padding: 20px;
      }
      .bad {
        background-color: #ffdbdb;
      }
      .good {
        background-color: #cfeaa8;
      }
    </style>
  </head>
  <body>
    <div class="container-fluid">
      <p>Erläuterungen der Testkriterien am <a href="#docs">Ende der Seite</a>. Gehe zum <a href="https://github.com/netzbegruenung/green-spider/">green-spider GitHub repository</a>, um mehr zu erfahren.</p>
      <table class="table">
        <thead>
          <tr>
            <th scope="col">URL</th>
            <th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
            <th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
            <th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
          </tr>
        </thead>
        <tbody>
        </tbody>
      </table>
      <hr>
      <h3 id="docs">Testkriterien</h3>
      <h4>www. optional</h4>
      <p>Die Site sollte sowohl mit &quot;www.&quot; als auch ohne &quot;www.&quot; am Anfang der URL erreichbar sein.</p>
      <h4>Kanonische URL</h4>
      <p>Verschiedene URLs zum Aufruf der Site (mit/ohne &quot;www.&quot; Präfix, mit/ohne HTTPS) sollten alle per HTTP Redirect auf eine einzige &quot;kanonische&quot; URL weiterleiten.</p>
      <h4>HTTPS</h4>
      <p>Die Site ist per verschlüsselter HTTP-Verbindung (HTTPS-Protokoll) erreichbar.</p>
    </div>
    <script src="bundle.js"></script>
  </body>
</html>

7782
webapp/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

31
webapp/package.json Normal file

@@ -0,0 +1,31 @@
{
  "name": "green-spider-webapp",
  "version": "0.0.1",
  "description": "Displays spider results",
  "private": true,
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/netzbegruenung/green-spider.git"
  },
  "author": "",
  "license": "Apache-2.0",
  "bugs": {
    "url": "https://github.com/netzbegruenung/green-spider/issues"
  },
  "homepage": "https://github.com/netzbegruenung/green-spider#readme",
  "devDependencies": {
    "webpack": "^4.5.0",
    "webpack-cli": "^2.0.14"
  },
  "dependencies": {
    "bootstrap": "^4.0.0",
    "datatables.net": "^1.10.16",
    "jquery": "^3.3.1",
    "lodash": "^4.17.5",
    "popper.js": "^1.14.3",
    "punycode": "^2.1.0"
  }
}

50
webapp/src/index.js Normal file

@@ -0,0 +1,50 @@
import _ from 'lodash';
import $ from 'jquery';
import 'bootstrap';
import 'popper.js';
import punycode from 'punycode';
import 'datatables.net';

$(function(){
  $.getJSON('data/spider_result.json', function(data){
    var tbody = $('tbody');
    $.each(data, function(index, item) {
      var row = $(document.createElement('tr'));

      // URL column, shown in Unicode (decoded from punycode)
      row.append('<td><a href="' + item.input_url + '">' + punycode.toUnicode(item.input_url) + '</a></td>');

      // hostnames: both the www. and the non-www. variant should resolve
      var twoHostnames = false;
      if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
        twoHostnames = true;
      }
      row.append('<td class="'+ (twoHostnames ? 'good' : 'bad') +' text-center">' + (twoHostnames ? '✅' : '❌') + '</td>');

      // one canonical URL
      var canonical = false;
      if (item.canonical_urls.length === 1) canonical = true;
      row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');

      // https
      var hasHTTPS = false;
      hasHTTPS = _.find(item.canonical_urls, function(o){
        return o.indexOf('https://') !== -1;
      });
      row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');

      tbody.append(row);
    });

    // enable data table functions (sorting)
    $('table.table').DataTable({
      order: [[0, "asc"]],
      paging: false,
      pageLength: 10000,
      language: {
        "search": "Suche"
      }
    });
  });
});

9
webapp/webpack.config.js Normal file

@@ -0,0 +1,9 @@
const path = require('path');

module.exports = {
  entry: './src/index.js',
  output: {
    filename: 'bundle.js',
    path: path.resolve(__dirname, 'dist')
  }
};
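
With the dependencies from `webapp/package.json` installed (`npm install` in the `webapp` directory), running webpack against this config (for example via `npx webpack`) emits `dist/bundle.js`, which `dist/index.html` loads together with the spider's JSON output.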