From ae6a2e83e92e44378a54fb10e848e95927968c0c Mon Sep 17 00:00:00 2001 From: Marian Steinbach Date: Wed, 3 Oct 2018 11:05:42 +0200 Subject: [PATCH] Refactor and modularize spider (#70) See PR description for details --- .dockerignore | 1 + .gitignore | 1 + .travis.yml | 7 + Dockerfile | 19 +- Makefile | 31 +- checks/__init__.py | 64 +++ checks/abstract_checker.py | 23 + checks/certificate.py | 62 ++ checks/certificate_test.py | 27 + checks/charset.py | 77 +++ checks/charset_test.py | 49 ++ checks/config.py | 29 + checks/dns_resolution.py | 55 ++ checks/domain_variations.py | 44 ++ checks/duplicate_content.py | 107 ++++ checks/generator.py | 76 +++ checks/html_head.py | 152 +++++ checks/http_and_https.py | 27 + checks/load_in_browser.py | 134 +++++ checks/page_content.py | 94 ++++ checks/url_canonicalization.py | 13 + checks/url_reachability.py | 104 ++++ checks/url_reachability_test.py | 71 +++ cli.py | 83 +++ config/__init__.py | 23 + devops/run-job.sh | 33 +- data_export.py => export/__init__.py | 89 +-- jobs/__init__.py | 180 ++++++ rating/__init__.py | 53 ++ rating/abstract_rater.py | 22 + rating/canonical_url.py | 31 + rating/favicon.py | 32 ++ rating/feeds.py | 35 ++ rating/https.py | 47 ++ rating/no_network_errors.py | 48 ++ rating/no_script_errors.py | 42 ++ rating/reachable.py | 36 ++ rating/resolvable.py | 35 ++ rating/response_duration.py | 46 ++ rating/responsive_layout.py | 35 ++ rating/use_specific_fonts.py | 41 ++ rating/www_optional.py | 44 ++ spider.py | 814 --------------------------- spider/__init__.py | 0 spider/spider.py | 106 ++++ spider/spider_test.py | 26 + spider_test.py | 125 ---- 47 files changed, 2289 insertions(+), 1004 deletions(-) create mode 100644 checks/__init__.py create mode 100644 checks/abstract_checker.py create mode 100644 checks/certificate.py create mode 100644 checks/certificate_test.py create mode 100644 checks/charset.py create mode 100644 checks/charset_test.py create mode 100644 checks/config.py create mode 100644 checks/dns_resolution.py create mode 100644 checks/domain_variations.py create mode 100644 checks/duplicate_content.py create mode 100644 checks/generator.py create mode 100644 checks/html_head.py create mode 100644 checks/http_and_https.py create mode 100644 checks/load_in_browser.py create mode 100644 checks/page_content.py create mode 100644 checks/url_canonicalization.py create mode 100644 checks/url_reachability.py create mode 100644 checks/url_reachability_test.py create mode 100644 cli.py create mode 100644 config/__init__.py rename data_export.py => export/__init__.py (56%) create mode 100644 jobs/__init__.py create mode 100644 rating/__init__.py create mode 100644 rating/abstract_rater.py create mode 100644 rating/canonical_url.py create mode 100644 rating/favicon.py create mode 100644 rating/feeds.py create mode 100644 rating/https.py create mode 100644 rating/no_network_errors.py create mode 100644 rating/no_script_errors.py create mode 100644 rating/reachable.py create mode 100644 rating/resolvable.py create mode 100644 rating/response_duration.py create mode 100644 rating/responsive_layout.py create mode 100644 rating/use_specific_fonts.py create mode 100644 rating/www_optional.py delete mode 100644 spider.py create mode 100644 spider/__init__.py create mode 100644 spider/spider.py create mode 100644 spider/spider_test.py delete mode 100644 spider_test.py diff --git a/.dockerignore b/.dockerignore index ae2f869..e5d5555 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,3 +4,4 @@ docs secrets temp venv +/export-* diff 
--git a/.gitignore b/.gitignore index 1d3b3da..a536d3d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__ .vscode/settings.json webapp/dist/bundle.js dev-shm +/export-* diff --git a/.travis.yml b/.travis.yml index df1ac71..bd97de9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,5 +6,12 @@ services: notifications: email: false +language: python +python: + - "3.6" + script: + - pip install --upgrade pip + - pip install --upgrade codecov - make test + - codecov diff --git a/Dockerfile b/Dockerfile index a53042b..186ac63 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,17 +1,20 @@ -FROM python:3.6-alpine3.7 +FROM python:3.6-alpine3.8 # Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296 RUN echo "http://dl-4.alpinelinux.org/alpine/v3.7/main" >> /etc/apk/repositories && \ echo "http://dl-4.alpinelinux.org/alpine/v3.7/community" >> /etc/apk/repositories && \ apk update && \ - apk --no-cache add chromium chromium-chromedriver python3-dev build-base git && \ + apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \ pip3 install --upgrade pip && \ - pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \ + pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \ apk del python3-dev build-base -ADD spider.py / -ADD spider_test.py / -ADD data_export.py / +ADD cli.py / +ADD config /config +ADD jobs /jobs +ADD checks /checks +ADD rating /rating +ADD spider /spider +ADD export /export -ENTRYPOINT ["python3"] -CMD ["/spider.py"] +ENTRYPOINT ["python3", "/cli.py"] diff --git a/Makefile b/Makefile index 075496d..9b85fa4 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,20 @@ +IMAGE := quay.io/netzbegruenung/green-spider:latest +DB_ENTITY := spider-results .PHONY: dockerimage # Build docker image dockerimage: - docker build -t quay.io/netzbegruenung/green-spider:latest . + docker build -t $(IMAGE) . 
# Create spider job queue spiderjobs: dockerimage docker run --rm -ti \ -v $(PWD)/secrets:/secrets \ - quay.io/netzbegruenung/green-spider:latest spider.py \ + $(IMAGE) \ --credentials-path /secrets/datastore-writer.json \ - --loglevel debug \ + --loglevel info \ jobs # Run spider in docker image @@ -21,11 +23,26 @@ spider: dockerimage -v $(PWD)/dev-shm:/dev/shm \ -v $(PWD)/webapp/dist/data:/out \ -v $(PWD)/secrets:/secrets \ - quay.io/netzbegruenung/green-spider:latest spider.py \ + $(IMAGE) \ --credentials-path /secrets/datastore-writer.json \ - --loglevel info \ - spider + --loglevel debug \ + spider --kind $(DB_ENTITY) + +export: dockerimage + docker run --rm -ti \ + -v $(PWD)/export-json:/out \ + -v $(PWD)/secrets:/secrets \ + -v $(PWD)/export-siteicons:/icons \ + $(IMAGE) \ + --credentials-path /secrets/datastore-reader.json \ + --loglevel debug \ + export --kind $(DB_ENTITY) # run spider tests +# FIXME test: dockerimage - docker run --rm -ti quay.io/netzbegruenung/green-spider:latest /spider_test.py + docker run --rm -ti \ + --entrypoint "python3" \ + $(IMAGE) \ + -m unittest discover -p '*_test.py' + diff --git a/checks/__init__.py b/checks/__init__.py new file mode 100644 index 0000000..cc2d0f0 --- /dev/null +++ b/checks/__init__.py @@ -0,0 +1,64 @@ +""" +The checks module contains the functionality to get information and test certain +functionality of a site or individual pages. +""" + +import logging + +from checks import charset +from checks import certificate +from checks import dns_resolution +from checks import duplicate_content +from checks import domain_variations +from checks import generator +from checks import html_head +from checks import http_and_https +from checks import page_content +from checks import load_in_browser +from checks import url_reachability +from checks import url_canonicalization + +from checks.config import Config + + +def perform_checks(input_url): + """ + Executes all our URL/site checks and returns a big-ass result dict. + """ + + # The sequence of checks to run. Order is important! + # Checks which expand the URLs list must come first. + # After that, dependencies (encoded in the checks) have to be fulfilled. 
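+ # Each entry below maps a result key to a module that provides a Checker class + # (a subclass of checks.abstract_checker.AbstractChecker). Checkers run in this + # order, and each one receives the results accumulated by those before it.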
+ check_modules = [ + ('domain_variations', domain_variations), + ('http_and_https', http_and_https), + ('dns_resolution', dns_resolution), + ('url_reachability', url_reachability), + ('certificate', certificate), + ('url_canonicalization', url_canonicalization), + ('page_content', page_content), + ('duplicate_content', duplicate_content), + ('charset', charset), + ('html_head', html_head), + ('generator', generator), + ('load_in_browser', load_in_browser), + ] + + results = {} + + config = Config(urls=[input_url], + user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' + + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' + + 'Safari/537.36 green-spider/0.2') + + for check_name, check in check_modules: + checker = check.Checker(config=config, + previous_results=results) + result = checker.run() + results[check_name] = result + + # update config for the next check + config = checker.config + logging.debug("config after check %s: %r" % (check_name, config)) + + return results diff --git a/checks/abstract_checker.py b/checks/abstract_checker.py new file mode 100644 index 0000000..e9db12a --- /dev/null +++ b/checks/abstract_checker.py @@ -0,0 +1,23 @@ +class AbstractChecker(object): + """ + Our blueprint for checks + """ + + def __init__(self, config, previous_results=None): + self._config = config + + # A dictionary of results from previous checkers. + # Key is the name of the checker that has generated the result. + self._previous_results = previous_results + + def run(self): + """Executes the check routine, returns result dict""" + raise NotImplementedError() + + @property + def config(self): + return self._config + + @property + def previous_results(self): + return self._previous_results diff --git a/checks/certificate.py b/checks/certificate.py new file mode 100644 index 0000000..2539963 --- /dev/null +++ b/checks/certificate.py @@ -0,0 +1,62 @@ +""" +Gathers information on the TLS/SSL certificate used by a server +""" + +from urllib.parse import urlparse +import logging +import ssl +from datetime import datetime +from datetime import timezone + +from OpenSSL import crypto + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + results = {} + + for url in self.config.urls: + if url.startswith('https://'): + results[url] = self.get_certificate(url) + + return results + + def get_certificate(self, url): + result = { + 'exception': None, + 'serial_number': None, + 'subject': None, + 'issuer': None, + 'not_before': None, + 'not_after': None + } + + parsed = urlparse(url) + try: + cert = ssl.get_server_certificate((parsed.hostname, 443)) + x509 = crypto.load_certificate(crypto.FILETYPE_PEM, cert) + result['serial_number'] = str(x509.get_serial_number()) + + nb = x509.get_notBefore().decode('utf-8') + na = x509.get_notAfter().decode('utf-8') + + # parse '2018 06 27 00 00 00Z' + result['not_before'] = datetime(int(nb[0:4]), int(nb[4:6]), int(nb[6:8]), int(nb[8:10]), int(nb[10:12]), int(nb[12:14]), tzinfo=timezone.utc).isoformat() + result['not_after'] = datetime(int(na[0:4]), int(na[4:6]), int(na[6:8]), int(na[8:10]), int(na[10:12]), int(na[12:14]), tzinfo=timezone.utc).isoformat() + + # decode and convert from bytes to unicode + result['subject'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for tup in x509.get_subject().get_components()]) + result['issuer'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for 
tup in x509.get_issuer().get_components()]) + + except Exception as e: + result['exception'] = { + 'type': str(type(e)), + 'message': str(e), + } + logging.warning("Error when getting certificate for %s: %r" % (url, e)) + + return result diff --git a/checks/certificate_test.py b/checks/certificate_test.py new file mode 100644 index 0000000..66c2288 --- /dev/null +++ b/checks/certificate_test.py @@ -0,0 +1,27 @@ +from checks import certificate +from checks.config import Config +import unittest + +class TestCertificateChecker(unittest.TestCase): + + def test_google(self): + url = 'https://www.google.com/' + config = Config(urls=[url]) + checker = certificate.Checker(config=config, previous_results={}) + result = checker.run() + self.assertIn(url, result) + self.assertIsNone(result[url]['exception']) + self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services') + + def test_kaarst(self): + url = 'https://www.gruenekaarst.de/' + config = Config(urls=[url]) + checker = certificate.Checker(config=config, previous_results={}) + result = checker.run() + self.assertIn(url, result) + self.assertIsNone(result[url]['exception']) + self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited') + + +if __name__ == '__main__': + unittest.main() diff --git a/checks/charset.py b/checks/charset.py new file mode 100644 index 0000000..0851dd2 --- /dev/null +++ b/checks/charset.py @@ -0,0 +1,77 @@ +""" +Checks which character set a page has. + +TODO: Check for http-equiv meta tags like + +""" + +import logging + +from bs4 import BeautifulSoup + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + assert 'page_content' in self.previous_results + + results = {} + + for url in self.config.urls: + results[url] = self.get_charset(url) + + return results + + def get_charset(self, url): + """ + Expects page_content_dict['content'] to carry the HTML content + """ + page_content = self.previous_results['page_content'][url] + assert 'content' in page_content + assert 'response_headers' in page_content + logging.debug("%r", page_content['response_headers']) + assert 'content-type' in page_content['response_headers'] + + if page_content['content'] is None: + return + + result = { + 'meta_charset_tag': None, + 'content_type_header_charset': None, + 'charset': 'iso-8859-1', # ISO-8859-1 is the default according to https://www.w3.org/International/articles/http-charset/index + 'valid': None, + 'exception': None, + } + + soup = BeautifulSoup(page_content['content'], 'html.parser') + + # get response header charset + if ('content-type' in page_content['response_headers'] + and 'charset=' in page_content['response_headers']['content-type']): + parts = page_content['response_headers']['content-type'].split("charset=", 1) + result['content_type_header_charset'] = parts[1].lower() + result['charset'] = parts[1].lower() + + # get meta tag charset + metatags = soup.find_all('meta') + for tag in metatags: + if 'charset' in tag.attrs: + result['meta_charset_tag'] = tag['charset'].lower() + # meta tag overrules any previous value + result['charset'] = tag['charset'].lower() + + # check for charset plausibility (only for most common ones) + if result['charset'] in ('iso-8859-1', 'utf-8'): + try: + _ = page_content['content'].encode(result['charset']) + except UnicodeEncodeError as e: + result['valid'] = False + result['exception'] = str(e) + else: + result['valid'] = True 
+ + + return result diff --git a/checks/charset_test.py b/checks/charset_test.py new file mode 100644 index 0000000..cce7677 --- /dev/null +++ b/checks/charset_test.py @@ -0,0 +1,49 @@ +import httpretty +from httpretty import httprettified +import unittest + +from checks import charset +from checks import page_content +from checks.config import Config + +@httprettified +class TestCharsetChecker(unittest.TestCase): + + def test_http_response(self): + url = 'http://www.example.com/' + httpretty.register_uri(httpretty.GET, url, + body=""" + + + + Hello + + """, + adding_headers={ + "Content-Type": "text/html; charset=ISO-8859-1", + }) + + results = {} + + config = Config(urls=[url]) + page_content_checker = page_content.Checker(config=config, previous_results={}) + results['page_content'] = page_content_checker.run() + + self.assertIn(url, results['page_content']) + self.assertIn('response_headers', results['page_content'][url]) + self.assertIn('content-type', results['page_content'][url]['response_headers']) + + charset_checker = charset.Checker(config=page_content_checker.config, previous_results=results) + result = charset_checker.run() + + self.assertIn(url, result) + self.assertEqual(result[url], { + 'meta_charset_tag': 'utf-8', + 'content_type_header_charset': 'iso-8859-1', + 'charset': 'utf-8', + 'valid': True, + 'exception': None, + }) + +if __name__ == '__main__': + unittest.main() diff --git a/checks/config.py b/checks/config.py new file mode 100644 index 0000000..d164b00 --- /dev/null +++ b/checks/config.py @@ -0,0 +1,29 @@ +class Config(object): + """ + Our configuration to be passed to checks + """ + + def __init__(self, urls, user_agent='green-spider/1.0'): + self._urls = set(urls) + self._user_agent = user_agent + + def __repr__(self): + return "Config(urls=%r)" % self._urls + + @property + def urls(self): + return list(self._urls) + + def add_url(self, url): + self._urls.add(url) + + def remove_url(self, url): + """Removes url from urls, if it was in there. Ignores errors.""" + try: + self._urls.remove(url) + except KeyError: + pass + + @property + def user_agent(self): + return self._user_agent diff --git a/checks/dns_resolution.py b/checks/dns_resolution.py new file mode 100644 index 0000000..efd3b05 --- /dev/null +++ b/checks/dns_resolution.py @@ -0,0 +1,55 @@ +""" +This check attempts to resolve all hostnames/domains in the input URLs. + +URLs which are not resolvable are removed from the config. 
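+ +An illustrative result entry for one input URL (hypothetical values): + + 'https://example.com/': {'hostname': 'example.com', 'resolvable': True, + 'aliases': [], 'ipv4_addresses': ['93.184.216.34']}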
+""" + +import logging +from socket import gethostbyname_ex +from urllib.parse import urlparse +from urllib.parse import urlunparse + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + """Executes the check routine, returns result dict""" + + results = {} + + urls = list(self.config.urls) + for url in urls: + parsed = urlparse(url) + + results[url] = self.resolve_hostname(parsed.hostname) + + # remove URL if non-resolvable + if not results[url]['resolvable']: + self.config.remove_url(url) + + return results + + def resolve_hostname(self, hostname): + """ + Resolve one to IPv4 address(es) + """ + result = { + 'hostname': hostname, + 'resolvable': False, + 'aliases': [], + 'ipv4_addresses': [], + } + + try: + hostname, aliases, ipv4_addresses = gethostbyname_ex(hostname) + result['resolvable'] = True + result['aliases'] = aliases + result['ipv4_addresses'] = ipv4_addresses + except Exception as e: + logging.debug("Hostname %s not resolvable. Exception: %r" % (hostname, e)) + + return result diff --git a/checks/domain_variations.py b/checks/domain_variations.py new file mode 100644 index 0000000..ab621ea --- /dev/null +++ b/checks/domain_variations.py @@ -0,0 +1,44 @@ +""" +This adds commonly tried variations of domains/subdomains to the URLs config. +""" + +import logging + +from urllib.parse import urlparse +from urllib.parse import urlunparse + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + urls = list(self.config.urls) + for url in urls: + parsed = urlparse(url) + hostnames = self.expand_hostname(parsed.hostname) + + for hostname in hostnames: + self.config.add_url(urlunparse((parsed.scheme, hostname, + parsed.path, parsed.params, parsed.query, parsed.fragment))) + + return None + + + def expand_hostname(self, hostname): + """ + Create variations of subdomains + """ + hostnames = set() + + hostnames.add(hostname) + if hostname.startswith('www.'): + # remove 'www.' prefix + hostnames.add(hostname[4:]) + else: + # add 'www.' prefix + hostnames.add('www.' 
+ hostname) + + return sorted(list(hostnames)) diff --git a/checks/duplicate_content.py b/checks/duplicate_content.py new file mode 100644 index 0000000..9556902 --- /dev/null +++ b/checks/duplicate_content.py @@ -0,0 +1,107 @@ +""" +This checker looks at the similarity between previously downloaded pages +and removes duplicates from the config URLs +""" + +import logging + +import html_similarity + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + + # value above which we consider a page pair a duplicate + similarity_threshold = 0.99999 + + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + + def run(self): + + if len(self.config.urls) == 1: + # nothing to do for us + return + + urls = list(self.config.urls) + + # get content + content = {} + + assert 'page_content' in self.previous_results + + for url in urls: + page_content = self.previous_results['page_content'][url] + + if page_content['content'] is None: + logging.warn("Content for URL %s is None" % url) + + content[url] = page_content['content'] + + pairs = self.compare_pairwise(content) + + # remove duplicates + for key in pairs: + if pairs[key]['similarity'] is None: + continue + if pairs[key]['similarity'] > self.similarity_threshold: + # this pair is a duplicate. + # Decide which one to keep + url1, url2 = key.split(" ", 1) + reject = self.select_url_to_reject(url1, url2) + self.config.remove_url(reject) + + return pairs + + + def compare_pairwise(self, content): + # compair pairwise + pairs = {} + + for url1 in content: + for url2 in content: + + if url1 == url2: + continue + + # avoid checking pairs twice + pair_key = " ".join(sorted([url1, url2])) + if pair_key in pairs: + continue + + try: + s = html_similarity.similarity(content[url1], content[url2]) + logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s) + pairs[pair_key] = { + 'similarity': s, + 'exception': None, + } + except (AttributeError, ValueError) as e: + logging.error("html_similarity.similarity thre exception for URL pair %s and %s: %s", url1, url2, e) + pairs[pair_key] = { + 'similarity': None, + 'exception': str(e), + } + + return pairs + + + def select_url_to_reject(self, url1, url2): + """Determine which of two URLs to keep, which to reject""" + + # HTTPS takes precedence + if url1.startswith('https://') and not url2.startswith('https://'): + return url2 + elif url2.startswith('https://') and not url1.startswith('https://'): + return url1 + + # Shorter URL wins + if len(url1) < len(url2): + return url2 + elif len(url1) > len(url2): + return url1 + + # default behaviour + return url1 diff --git a/checks/generator.py b/checks/generator.py new file mode 100644 index 0000000..5a1968b --- /dev/null +++ b/checks/generator.py @@ -0,0 +1,76 @@ +""" +Checks the 'generator' meta tag and page content properties +to detect well-known content management systems, themes etc. 
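+ +Typical values produced by this check (see get_generator below) include 'typo3', +'typo3-gruene', 'typo3-gcms', 'wordpress', 'wordpress-urwahl', +'wordpress-josephknowsbest', 'drupal' and 'joomla'; None is returned when no +generator could be identified.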
+""" + +import logging + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + + # IP address of the newthinking GCMS server + gcms_ip = "91.102.13.20" + + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + assert 'page_content' in self.previous_results + assert 'html_head' in self.previous_results + + results = {} + + for url in self.config.urls: + results[url] = self.get_generator(url) + + return results + + + def get_generator(self, url): + page_content = self.previous_results['page_content'][url] + assert 'content' in page_content + + assert 'dns_resolution' in self.previous_results + dns_resolution = self.previous_results['dns_resolution'] + + head = self.previous_results['html_head'][url] + + generator = None + + if 'generator' in head and head['generator'] is not None: + generator = head['generator'].lower() + if 'typo3' in generator: + generator = 'typo3' + if 'wordpress' in generator: + generator = 'wordpress' + if 'drupal' in generator: + generator = 'drupal' + if 'joomla' in generator: + generator = 'joomla' + + # Qualify certain CMS flavours in more detail + if generator == "typo3": + # Typo3-Gruene advertises in the page content + if 'typo3-gruene.de' in page_content['content']: + generator = "typo3-gruene" + # newthinking GCMS in some page hrefs + elif 'ntc_gcms' in page_content['content']: + generator = "typo3-gcms" + # check if one of the IPs matches the well-known GCMS Server IP + elif url in dns_resolution: + for addr in dns_resolution[url]['ipv4_addresses']: + if addr == self.gcms_ip: + generator = "typo3-gcms" + + elif 'Urwahl3000' in page_content['content']: + generator = "wordpress-urwahl" + + elif ('josephknowsbest' in page_content['content'] or + 'Joseph-knows-best' in page_content['content']): + generator = "wordpress-josephknowsbest" + + elif 'wordpress' in page_content['content']: + generator = "wordpress" + + return generator diff --git a/checks/html_head.py b/checks/html_head.py new file mode 100644 index 0000000..4d3391e --- /dev/null +++ b/checks/html_head.py @@ -0,0 +1,152 @@ +""" +Extracts information from the html , like existence and value +of certain meta tags, link tags, title, etc. 
+""" + +import logging +import re +from urllib.parse import urljoin +from urllib.parse import urlparse + +from bs4 import BeautifulSoup + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + results = {} + + for url in self.config.urls: + results[url] = self.get_content(url) + + return results + + def get_content(self, url): + """ + Expects page_content_dict['content'] to carry the HTML content + """ + + page_content = self.previous_results['page_content'][url] + assert 'content' in page_content + assert 'response_headers' in page_content + assert 'content-type' in page_content['response_headers'] + + if page_content['content'] is None: + return + + soup = BeautifulSoup(page_content['content'], 'html.parser') + head = soup.find('head') + + result = { + 'title': self.get_title(head), + 'link_canonical': self.get_link_canonical(head, url), + 'link_rss_atom': self.get_link_rss_atom(head, url), + 'link_icon': self.get_link_icon(head, url), + 'generator': self.get_generator(head), + 'opengraph': self.get_opengraph(head), + 'viewport': self.get_viewport(head), + } + + return result + + + def get_title(self, head): + """Extract and clean up page title""" + if head is None: + return + + title = None + + tag = head.find('title') + if tag is None: + return + + title = tag.get_text() + + # clean up + title = title.replace(u'\u00a0', ' ') + title = title.replace(' ', ' ') + title = title.strip() + + return title + + + def get_link_canonical(self, head, url): + if head is None: + return + link = head.find('link', rel='canonical') + if link: + return urljoin(url, link.get('href')) + + + def get_link_rss_atom(self, head, url): + if head is None: + return + hrefs = [] + rss_links = head.find_all('link', type='application/rss+xml') + atom_links = head.find_all('link', type='application/atom+xml') + + if rss_links: + for link in rss_links: + hrefs.append(link.get('href')) + if atom_links: + for link in rss_links: + hrefs.append(link.get('href')) + + # make URLs absolute + for i in range(len(hrefs)): + parsed = urlparse(hrefs[i]) + if parsed.scheme == '': + hrefs[i] = urljoin(url, hrefs[i]) + + return hrefs + + + def get_link_icon(self, head, url): + if head is None: + return + + tag = head.find('link', rel=lambda x: x and x.lower() == 'icon') + if tag: + return urljoin(url, tag.get('href')) + tag = head.find('link', rel=lambda x: x and x.lower() == 'shortcut icon') + if tag: + return urljoin(url, tag.get('href')) + + + def get_generator(self, head): + if head is None: + return + + tags = head.select('[name=generator]') + if tags: + return tags[0].get('content') + + + def get_opengraph(self, head): + if head is None: + return + + # we find tags by matching this property/itemprop value regex + property_re = re.compile('^og:') + + opengraph = set() + for tag in head.find_all(property=property_re): + opengraph.add(tag.get('property')) + for tag in head.find_all(itemprop=property_re): + opengraph.add(tag.get('itemprop')) + + opengraph = sorted(list(opengraph)) + if opengraph != []: + return opengraph + + + def get_viewport(self, head): + if head is None: + return + tags = head.select('[name=viewport]') + if tags: + return tags[0].get('content') diff --git a/checks/http_and_https.py b/checks/http_and_https.py new file mode 100644 index 0000000..79d0b6b --- /dev/null +++ b/checks/http_and_https.py @@ -0,0 +1,27 @@ +""" +This adds, for every HTTP URL, the HTTPS 
counterpart, +and vice versa, to config.urls + +So it doesn't actually perform tests. It only expands the +URLs to test by other checks. +""" + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + """ + Adds URLs to config.urls, returns nothing + """ + + for url in self.config.urls: + + if url.startswith('https://'): + self.config.add_url('http://' + url[8:]) + elif url.startswith('http://'): + self.config.add_url('https://' + url[7:]) + + return None \ No newline at end of file diff --git a/checks/load_in_browser.py b/checks/load_in_browser.py new file mode 100644 index 0000000..2ab3af6 --- /dev/null +++ b/checks/load_in_browser.py @@ -0,0 +1,134 @@ +""" +Collects information by loading pages in a browser. + +Information includes: + +- whether the document width adapts well to viewports as little as 360 pixels wide +- whether javascript errors or errors from missing resources occur +- collects CSS font-family properties in use +""" + +import logging +import time + +from selenium import webdriver +from selenium.common.exceptions import StaleElementReferenceException +from selenium.common.exceptions import TimeoutException +import tenacity + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + + page_load_timeout = 20 + + # sizes we check for (width, height) + sizes = ( + (360, 640), # rather old smartphone + (768, 1024), # older tablet or newer smartphone + (1024, 768), # older desktop or horiz. tablet + (1920, 1080), # Full HD horizontal + ) + + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + # Our selenium user agent using Chrome headless as an engine + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument('--headless') + chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-extensions') + self.driver = webdriver.Chrome(options=chrome_options) + self.driver.set_page_load_timeout(self.page_load_timeout) + + def run(self): + + results = {} + for url in self.config.urls: + + results[url] = { + 'sizes': None, + 'min_document_width': None, + 'logs': None, + 'font_families': None, + } + + # responsive check + try: + sizes = self.check_responsiveness(url) + results[url] = { + 'sizes': sizes, + 'min_document_width': min([s['document_width'] for s in sizes]), + 'logs': self.capture_log(), + } + except TimeoutException as e: + logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e)) + pass + except tenacity.RetryError as re: + logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re)) + pass + + # CSS collection + font_families = None + + try: + elements = self.driver.find_elements_by_xpath("//*") + font_families = set() + for element in elements: + try: + font_family = element.value_of_css_property('font-family') + if font_family is None: + continue + font_families.add(font_family.lower()) + except StaleElementReferenceException as e: + logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e)) + continue + + results[url]['font_families'] = sorted(list(font_families)) + + except TimeoutException as e: + logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e)) + pass + + self.driver.quit() + + return results + + + 
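+ # If loading a page times out, check_responsiveness is retried up to three + # times (see the tenacity decorator below) before giving up.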
@tenacity.retry(stop=tenacity.stop_after_attempt(3), + retry=tenacity.retry_if_exception_type(TimeoutException)) + def check_responsiveness(self, url): + result = [] + + # set window to the first size initially + self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1]) + self.driver.get(url) + + # give the page some time to load + time.sleep(10) + + for (width, height) in self.sizes: + self.driver.set_window_size(width, height) + + # wait for re-render/re-flow + time.sleep(1.0) + doc_width = self.driver.execute_script("return document.body.scrollWidth") + + result.append({ + 'viewport_width': width, + 'document_width': int(doc_width), + }) + + return result + + def capture_log(self): + """ + Returns log elements with level "SEVERE" + """ + entries = [] + for entry in self.driver.get_log('browser'): + if entry['level'] in ('WARNING', 'SEVERE'): + entries.append(entry) + return entries diff --git a/checks/page_content.py b/checks/page_content.py new file mode 100644 index 0000000..d036274 --- /dev/null +++ b/checks/page_content.py @@ -0,0 +1,94 @@ +""" +This check downloads the HTML page for each URL +""" + +import logging + +import requests + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + + # connection timeout (seconds) + CONNECT_TIMEOUT = 10 + + # response timeout (seconds) + READ_TIMEOUT = 20 + + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + + def run(self): + results = {} + + self.headers = { + "User-Agent": self.config.user_agent, + } + + # copy URLs, as we may be manipulating self.config.urls in the loop + urls = list(self.config.urls) + + for url in urls: + result = self.download_page(url) + results[url] = result + + # remove bad URLs from config, to avoid later checks using them + if 'exception' in result and result['exception'] is not None: + self.config.remove_url(url) + + return results + + + def download_page(self, url): + result = { + 'url': url, + 'content': None, + 'content_type': None, + 'content_length': None, + 'status_code': None, + 'response_headers': None, + 'duration': None, + 'exception': None, + } + + try: + r = requests.get(url, + headers=self.headers, + timeout=(self.CONNECT_TIMEOUT, self.READ_TIMEOUT)) + + result['url'] = r.url + result['status_code'] = r.status_code + result['content'] = r.text + result['content_length'] = len(r.text) + result['response_headers'] = self.get_headers(r.headers) + result['duration'] = round(r.elapsed.total_seconds() * 1000) + + if r.headers.get("content-type") is not None: + result['content_type'] = r.headers.get("content-type").split(";")[0].strip() + + except requests.exceptions.ConnectionError as exc: + logging.error(str(exc) + " " + url) + result['exception'] = "connection" + except requests.exceptions.ReadTimeout as exc: + logging.error(str(exc) + " " + url) + result['exception'] = "read_timeout" + except requests.exceptions.Timeout as exc: + logging.error(str(exc) + " " + url) + result['exception'] = "connection_timeout" + except Exception as exc: + logging.error(str(exc) + " " + url) + result['exception'] = "%s %s" % (str(type(exc)), exc) + + return result + + def get_headers(self, headers): + """ + Transforms CaseInsensitiveDict into dict with lowercase keys + """ + out = {} + for key in headers: + out[key.lower()] = headers[key] + return out diff --git a/checks/url_canonicalization.py b/checks/url_canonicalization.py new file mode 100644 index 0000000..c6ce173 --- /dev/null +++
b/checks/url_canonicalization.py @@ -0,0 +1,13 @@ +""" +This check verifies whether there is a single URL +or several variants left at this point. +""" + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + return self.config.urls diff --git a/checks/url_reachability.py b/checks/url_reachability.py new file mode 100644 index 0000000..b371540 --- /dev/null +++ b/checks/url_reachability.py @@ -0,0 +1,104 @@ +""" +This check verifies whether the URLs in config are reachable. +Additional information regarding redirects and SSL problems +is also recorded and returned as results. + +Non-accessible URLs are removed from config.urls. + +A redirect to facebook.com is not considered reachable, as that +leads to a different website in the sense of this system. + +TODO: Parallelize the work done in this test +""" + +import logging + +from urllib.parse import urlparse +import requests + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + headers = { + "User-Agent": self.config.user_agent + } + + results = {} + urls = list(self.config.urls) + + for url in urls: + logging.debug("Checking URL reachability for %s", url) + + result = { + "url": url, + "redirect_history": [], + "status": None, + "exception": None, + "duration": None, + } + + # Perform HEAD requests, recording redirect log + try: + r = requests.head(url, headers=headers, allow_redirects=True) + result['status'] = r.status_code + result['duration'] = round(r.elapsed.total_seconds() * 1000) + + if len(r.history): + result['redirect_history'] = self.expand_history(r.history) + logging.debug("Redirects: %r", result['redirect_history']) + + if r.url == url: + logging.debug("URL: %s - status %s", url, r.status_code) + else: + logging.debug("URL: %s - status %s - redirects to %s", url, + r.status_code, r.url) + # remove source URL, add target URL to config.urls + self.config.remove_url(url) + self.config.add_url(r.url) + + # remove 404 etc + if r.status_code > 400: + self.config.remove_url(url) + + except Exception as exc: + logging.info("Exception for URL %s: %s %s", url, str(type(exc)), exc) + result['exception'] = { + 'type': str(type(exc)), + 'message': str(exc), + } + + # remove URL to prevent further checks on unreachable URL + self.config.remove_url(url) + + # if redirects end in www.facebook.com or www.denic.de, remove this URL again + # remove if redirect target is facebook + if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0: + parsed = urlparse(result['redirect_history'][-1]['redirect_to']) + if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'): + result['exception'] = { + 'type': 'Bad target domain', + 'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname, + } + self.config.remove_url(url) + + results[url] = result + + return results + + def expand_history(self, history): + """Extracts primitives from a list of requests.Response objects""" + items = [] + for h in history: + item = { + 'status': h.status_code, + 'duration': round(h.elapsed.total_seconds() * 1000), + 'redirect_to': h.headers['location'], + } + items.append(item) + + return items diff --git
a/checks/url_reachability_test.py b/checks/url_reachability_test.py new file mode 100644 index 0000000..b5514d5 --- /dev/null +++ b/checks/url_reachability_test.py @@ -0,0 +1,71 @@ +import httpretty +from httpretty import httprettified +import unittest + +from checks import url_reachability +from checks.config import Config + +@httprettified +class TestUrlReachabilityChecker(unittest.TestCase): + + def test_success(self): + url = 'http://www.example.com/' + httpretty.register_uri(httpretty.HEAD, url, + status=200, body="") + + config = Config(urls=[url]) + checker = url_reachability.Checker(config=config, previous_results={}) + result = checker.run() + + self.assertEqual(result[url]['url'], url) + self.assertEqual(result[url]['redirect_history'], []) + self.assertEqual(result[url]['status'], 200) + self.assertIsNone(result[url]['exception']) + self.assertTrue(0 < result[url]['duration'] < 100) + + + def test_redirect(self): + url = 'http://www.example.com/' + url2 = 'http://www2.example.com/' + httpretty.register_uri(httpretty.HEAD, url, + status=302, body="", + adding_headers={"Location": url2}) + httpretty.register_uri(httpretty.HEAD, url2, + status=200, body="") + + config = Config(urls=[url]) + checker = url_reachability.Checker(config=config, previous_results={}) + result = checker.run() + + self.assertIn(url, result) + self.assertEqual(result[url]['url'], url) + self.assertEqual(result[url]['status'], 200) + self.assertIsNone(result[url]['exception']) + self.assertTrue(0 < result[url]['duration'] < 100) + self.assertEqual(len(result[url]['redirect_history']), 1) + self.assertEqual(result[url]['redirect_history'][0]['status'], 302) + self.assertEqual(result[url]['redirect_history'][0]['redirect_to'], url2) + + + def test_notfound(self): + url = 'http://www.example.com/' + httpretty.register_uri(httpretty.HEAD, url, + status=404, body="Not found") + + config = Config(urls=[url]) + checker = url_reachability.Checker(config=config, previous_results={}) + result = checker.run() + + self.assertEqual(result[url]['url'], url) + self.assertEqual(result[url]['redirect_history'], []) + self.assertEqual(result[url]['status'], 404) + self.assertIsNone(result[url]['exception']) + + newconfig = checker.config + + self.assertEqual(len(newconfig.urls), 0) + + + +if __name__ == '__main__': + unittest.main() diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..f470f26 --- /dev/null +++ b/cli.py @@ -0,0 +1,83 @@ +""" +Command line utility for spider, export etc.
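+ +Example invocations (arguments as defined by the parser below; the credential +paths are the ones used in the Makefile and may differ in other setups): + + python3 cli.py --credentials-path /secrets/datastore-writer.json --loglevel info jobs + python3 cli.py --credentials-path /secrets/datastore-writer.json --loglevel debug spider --kind spider-results + python3 cli.py --credentials-path /secrets/datastore-reader.json --loglevel debug export --kind spider-results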
+""" + +import argparse +import logging +import signal +import sys + +from google.cloud import datastore + +def handle_sigint(signum, frame): + """ + Handles SIGINT, which occurs on Ctrl-C + """ + print("\nInterrupted by SIGINT\n") + sys.exit() + + +if __name__ == "__main__": + signal.signal(signal.SIGINT,handle_sigint) + + parser = argparse.ArgumentParser() + + # global flags + parser.add_argument('--credentials-path', dest='credentials_path', + help='Path to the service account credentials JSON file', + default='/secrets/service-account.json') + + parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)", + default='info') + + # subcommands + subparsers = parser.add_subparsers(help='sub-command help', dest='command') + + # spider subcommand + spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider') + spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)') + + # jobs subcommand + jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.') + jobs_parser.add_argument('--url', help='Add a job to spider a specific URL') + + # export subcommand + export_parser = subparsers.add_parser('export', help='Export JSON data') + export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)') + + + args = parser.parse_args() + + # set log level + logging.getLogger("urllib3").setLevel(logging.CRITICAL) + + loglevel = args.loglevel.lower() + if loglevel == 'error': + logging.basicConfig(level=logging.ERROR) + elif loglevel == 'warn': + logging.basicConfig(level=logging.WARN) + elif loglevel == 'debug': + logging.basicConfig(level=logging.DEBUG) + logging.getLogger("selenium").setLevel(logging.INFO) + else: + logging.basicConfig(level=logging.INFO) + loglevel = 'info' + + logging.debug("Called command %s", args.command) + + datastore_client = datastore.Client.from_service_account_json(args.credentials_path) + + if args.command == 'jobs': + + import jobs + jobs.create_jobs(datastore_client, args.url) + + elif args.command == 'export': + + import export + export.export_screenshots(datastore_client) + export.export_results(datastore_client, args.kind) + + else: + from spider import spider + spider.work_of_queue(datastore_client, args.kind) diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..83b1ba4 --- /dev/null +++ b/config/__init__.py @@ -0,0 +1,23 @@ + + +# connection timeout for website checks (seconds) +CONNECT_TIMEOUT = 5 + +# response timeout for website checks +READ_TIMEOUT = 10 + +# Git repo for our data +GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git' + +# folder in that repo that holds the data +GREEN_DIRECTORY_DATA_PATH = 'data/countries/de' + +# folder we use locally to clone the repo +GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory' + +# IP address of the newthinking GCMS server +GCMS_IP = "91.102.13.20" + +# kind name of the spider job key datastore entities +JOB_DATASTORE_KIND = 'spider-jobs' + diff --git a/devops/run-job.sh b/devops/run-job.sh index 207c205..530ecdb 100755 --- a/devops/run-job.sh +++ b/devops/run-job.sh @@ -19,6 +19,8 @@ # secrets/datastore-writer.json +DOCKERIMAGE="quay.io/netzbegruenung/green-spider:dev" + API_TOKEN_SECRET="secrets/hetzner-api-token.sh" test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; 
exit 1; } source $API_TOKEN_SECRET @@ -29,10 +31,14 @@ if [[ "$1" == "" ]]; then exit 1 fi +SERVERNAME="$1-$(date | md5 | cut -c1-3)" + +# possible values: cx11 (1 core 2 GB), cx21 (2 cores, 4 GB), cx31 (2 cores, 8 GB) +SERVERTYPE="cx21" function create_server() { - echo "Creating server $1" + echo "Creating server $SERVERNAME" # server_type 'cx11' is the smallest, cheapest category. # location 'nbg1' is Nürnberg/Nuremberg, Germany. @@ -44,8 +50,8 @@ function create_server() -H "Content-Type: application/json" \ -H "Authorization: Bearer $API_TOKEN" \ -d "{ - \"name\": \"$1\", - \"server_type\": \"cx11\", + \"name\": \"$SERVERNAME\", + \"server_type\": \"$SERVERTYPE\", \"location\": \"nbg1\", \"start_after_create\": true, \"image\": \"debian-9\", @@ -61,7 +67,7 @@ function create_server() # Get IP: SERVER_IP=$(echo $CREATE_RESPONSE | jq -r .server.public_net.ipv4.ip) - echo "Created server with ID $SERVER_ID and IP $SERVER_IP" + echo "Created server $SERVERNAME with ID $SERVER_ID and IP $SERVER_IP" } @@ -142,22 +148,25 @@ else # Run docker job echo "Starting Docker Job" - ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \ - -v /root/secrets:/secrets \ - quay.io/netzbegruenung/green-spider spider.py \ - --credentials-path /secrets/datastore-writer.json \ - jobs + #ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \ + # -v /root/secrets:/secrets \ + # quay.io/netzbegruenung/green-spider spider.py \ + # --credentials-path /secrets/datastore-writer.json \ + # jobs + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP mkdir -p /dev-shm ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \ + -v /dev-shm:/dev/shm \ -v /root/secrets:/secrets \ - quay.io/netzbegruenung/green-spider spider.py \ + $DOCKERIMAGE \ --credentials-path /secrets/datastore-writer.json \ - spider + --loglevel info \ + spider --kind spider-results-dev fi # Delete the box -echo "Deleting server $SERVER_ID" +echo "Deleting server $SERVERNAME with ID $SERVER_ID" curl -s -X DELETE -H "Content-Type: application/json" \ -H "Authorization: Bearer $API_TOKEN" \ https://api.hetzner.cloud/v1/servers/$SERVER_ID diff --git a/data_export.py b/export/__init__.py similarity index 56% rename from data_export.py rename to export/__init__.py index 2ab23ef..64af277 100644 --- a/data_export.py +++ b/export/__init__.py @@ -2,8 +2,7 @@ Exports data from the database to JSON files for use in a static webapp """ -from google.cloud import datastore -import hashlib +from hashlib import md5 import json import logging import sys @@ -14,44 +13,67 @@ import requests SITEICONS_PATH = "/icons" -client = None - -def export_results(): +def export_results(client, entity_kind): """ Export of the main results data """ out = [] - query = client.query(kind='spider-results') + # Load data from database + query = client.query(kind=entity_kind) for entity in query.fetch(): logging.debug(entity.key.name) - record = dict(entity) - record["results"]["created"] = record["created"].isoformat() - out.append(record["results"]) + out.append({ + 'input_url': entity.key.name, + 'resulting_urls': entity.get('checks').get('url_canonicalization'), + 'created': entity.get('created').isoformat(), + 'meta': entity.get('meta'), + 'checks': entity.get('checks'), + 'rating': entity.get('rating'), + 'score': entity.get('score'), + 'icons': [], + }) # load icons, reformat icons details + icons_downloaded = set() for index in range(len(out)): - if "details" not in out[index]: - continue - if "icons" not in out[index]["details"]: - continue - urls 
= out[index]["details"]["icons"] - out[index]["details"]["icons"] = {} - for url in urls: - if not (url.startswith("http://") or url.startswith("https://")): - logging.debug("Skipping icon %s", url) - continue - logging.debug("Dowloading icon %s", url) - filename = download_icon(url) + assert "checks" in out[index] + assert "html_head" in out[index]["checks"] + + # collect icons urls + icons = set() + for url in out[index]['checks']['html_head']: + assert 'link_icon' in out[index]['checks']['html_head'][url] + if out[index]['checks']['html_head'][url]['link_icon'] is not None: + iconurl = out[index]['checks']['html_head'][url]['link_icon'] + if iconurl.startswith("data:"): + continue + if iconurl in icons_downloaded: + continue + icons.add(iconurl) + + out[index]["icons"] = {} + for iconurl in list(icons): + logging.debug("Dowloading icon %s", iconurl) + icons_downloaded.add(iconurl) + filename = download_icon(iconurl) if filename: - out[index]["details"]["icons"][url] = filename + out[index]["icons"][url] = filename output_filename = "/out/spider_result.json" with open(output_filename, 'w', encoding="utf8") as jsonfile: json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) + + # compact version + output_filename = "/out/spider_result_compact.json" + for i in range(len(out)): + out[i]['cms'] = list(out[i]['checks']['generator'].values()) + del out[i]['checks'] + with open(output_filename, 'w', encoding="utf8") as jsonfile: + json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) -def export_screenshots(): +def export_screenshots(client): """ Export of screenshot meta data """ @@ -78,10 +100,12 @@ def download_icon(icon_url): """ default_endings = { + "image/x-ico": "ico", "image/x-icon": "ico", "image/vnd.microsoft.icon": "ico", "image/png": "png", "image/jpeg": "jpg", + "image/gif": "gif", } # Download the icon @@ -92,7 +116,7 @@ def download_icon(icon_url): if req.status_code >= 400: return None - content_hash = hashlib.md5(req.content).hexdigest() + content_hash = md5(req.content).hexdigest() extension = "" try: @@ -109,6 +133,9 @@ def download_icon(icon_url): if extension == "": # derive from content type ctype = req.headers.get('content-type') + if ctype is None: + return + try: extension = default_endings[ctype] except KeyError: @@ -122,17 +149,3 @@ def download_icon(icon_url): iconfile.write(req.content) return filename - - -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) - - if len(sys.argv) == 1: - print("Error: please provide path to Google Storage API system account JSON file as argument") - sys.exit(1) - - key_path = sys.argv[1] - client = datastore.Client.from_service_account_json(key_path) - - export_screenshots() - export_results() diff --git a/jobs/__init__.py b/jobs/__init__.py new file mode 100644 index 0000000..3e125d5 --- /dev/null +++ b/jobs/__init__.py @@ -0,0 +1,180 @@ +""" +The jobs module allows to create jobs for the queue and take jobs off the queue +""" + +from datetime import datetime +import logging +import os +import random +import shutil + +from git import Repo +import tenacity +import yaml +from google.api_core.exceptions import Aborted +from google.cloud import datastore + +import config + + +def clone_data_directory(): + """ + Clones the source of website URLs, the green directory, + into the local file system using git + """ + if os.path.exists(config.GREEN_DIRECTORY_LOCAL_PATH): + shutil.rmtree(config.GREEN_DIRECTORY_LOCAL_PATH) + Repo.clone_from(config.GREEN_DIRECTORY_REPO, 
config.GREEN_DIRECTORY_LOCAL_PATH) + + +def directory_entries(): + """ + Iterator over all data files in the cloned green directory + """ + path = os.path.join(config.GREEN_DIRECTORY_LOCAL_PATH, config.GREEN_DIRECTORY_DATA_PATH) + for root, _, files in os.walk(path): + for fname in files: + + filepath = os.path.join(root, fname) + if not filepath.endswith(".yaml"): + continue + + with open(filepath, 'r', encoding='utf8') as yamlfile: + for doc in yaml.load_all(yamlfile): + yield doc + + +def chunks(the_list, size): + """ + Yield successive n-sized chunks from list the_list + where n = size. + """ + for i in range(0, len(the_list), size): + yield the_list[i:i + size] + + +def create_jobs(datastore_client, url=None): + """ + Read all URLs from green directory and fill a job database + with one job per URL. + + Alternatively, if the url argument is given, only the given URL + will be added as a spider job. + """ + + # refresh our local clone of the green directory + logging.info("Refreshing green-directory clone") + clone_data_directory() + + # build the list of website URLs to run checks for + logging.info("Processing green-directory") + input_entries = [] + + count = 0 + + random.seed() + + for entry in directory_entries(): + + if 'type' not in entry: + logging.error("Entry without type") + continue + if 'urls' not in entry: + logging.debug("Entry %s does not have any URLs.", repr_entry(entry)) + continue + + website_url = None + for index in range(len(entry['urls'])): + try: + if entry['urls'][index]['type'] == "WEBSITE": + website_url = entry['urls'][index]['url'] + if website_url: + if url is not None and website_url != url: + continue + input_entries.append({ + "url": website_url, + "type": entry.get("type"), + "level": entry.get("level"), + "state": entry.get("state"), + "district": entry.get("district"), + "city": entry.get("city"), + }) + count += 1 + except NameError: + logging.error("Error in %s: 'url' key missing (%s)", + repr_entry(entry), entry['urls'][index]) + + # ensure the passed URL argument is really there, even if not part + # of the directory. 
+ if url and count == 0: + logging.info("Adding job for URL %s which is not part of green-directory", url) + input_entries.append({ + "url": url, + "type": None, + "level": None, + "state": None, + "district": None, + "city": None, + "index": int(random.uniform(1000000, 9999999)), + }) + + count = 0 + logging.info("Writing jobs") + + entities = [] + + for entry in input_entries: + key = datastore_client.key(config.JOB_DATASTORE_KIND, entry["url"]) + entity = datastore.Entity(key=key) + entity.update({ + "created": datetime.utcnow(), + "type": entry["type"], + "level": entry["level"], + "state": entry["state"], + "district": entry["district"], + "city": entry["city"], + "index": int(random.uniform(1000000, 9999999)), + }) + entities.append(entity) + + # commit to DB + for chunk in chunks(entities, 300): + logging.debug("Writing jobs chunk of length %d", len(chunk)) + datastore_client.put_multi(chunk) + count += len(chunk) + + logging.info("Writing jobs done, %s jobs added", count) + + +@tenacity.retry(wait=tenacity.wait_exponential(), + retry=tenacity.retry_if_exception_type(Aborted)) +def get_job_from_queue(datastore_client): + """ + Returns one job from the queue as a dict (including its 'url'), or None if the queue is empty + """ + out = None + + with datastore_client.transaction(): + query = datastore_client.query(kind=config.JOB_DATASTORE_KIND, + order=['index']) + for entity in query.fetch(limit=1): + logging.debug("Got job: %s", entity) + out = dict(entity) + out["url"] = entity.key.name + datastore_client.delete(entity.key) + + return out + +def repr_entry(entry): + """ + Return string representation of a directory entry, + for logging/debugging purposes + """ + ret = entry['type'] + if 'level' in entry: + ret += "/" + entry['level'] + if 'state' in entry: + ret += "/" + entry['state'] + if 'district' in entry: + ret += "/" + entry['district'] + return ret diff --git a/rating/__init__.py b/rating/__init__.py new file mode 100644 index 0000000..197e720 --- /dev/null +++ b/rating/__init__.py @@ -0,0 +1,53 @@ +""" +The rating module contains the functionality to calculate scores for certain +criteria, based on information gathered by the checks before. +""" + +import logging + +from rating import canonical_url +from rating import favicon +from rating import feeds +from rating import https +from rating import no_network_errors +from rating import no_script_errors +from rating import reachable +from rating import resolvable +from rating import response_duration +from rating import responsive_layout +from rating import use_specific_fonts +from rating import www_optional + + +def calculate_rating(results): + """ + Calculates ratings for a number of criteria. + + Params: + results - Results dictionary from checks + """ + + # The raters to execute.
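+ # Keys are the criterion names under which each rating is stored; every module + # provides a Rater class (a subclass of rating.abstract_rater.AbstractRater) + # whose rate() method returns the rating dict for that criterion.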
+ rating_modules = { + 'CANONICAL_URL': canonical_url, + 'DNS_RESOLVABLE_IPV4': resolvable, + 'FAVICON': favicon, + 'FEEDS': feeds, + 'HTTPS': https, + 'HTTP_RESPONSE_DURATION': response_duration, + 'NO_NETWORK_ERRORS': no_network_errors, + 'NO_SCRIPT_ERRORS': no_script_errors, + 'RESPONSIVE': responsive_layout, + 'SITE_REACHABLE': reachable, + 'USE_SPECIFIC_FONTS': use_specific_fonts, + 'WWW_OPTIONAL': www_optional, + } + + output = {} + + for name in rating_modules: + + rater = rating_modules[name].Rater(results) + output[name] = rater.rate() + + return output diff --git a/rating/abstract_rater.py b/rating/abstract_rater.py new file mode 100644 index 0000000..ef2a2f8 --- /dev/null +++ b/rating/abstract_rater.py @@ -0,0 +1,22 @@ +class AbstractRater(object): + + # String 'boolean' or 'number' + rating_type = None + + # The default value to return if no rating given + default_value = None + + max_score = 1 + + # Name of the checks this rater depends on + depends_on_checks = [] + + def __init__(self, check_results): + self.check_results = check_results + + for item in self.depends_on_checks: + assert item in self.check_results + + def rate(self): + raise NotImplementedError() + diff --git a/rating/canonical_url.py b/rating/canonical_url.py new file mode 100644 index 0000000..dbe4024 --- /dev/null +++ b/rating/canonical_url.py @@ -0,0 +1,31 @@ +""" +This looks at remaining resolvable URLs after redirects +and gives score if there is only one URL left. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['url_canonicalization'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + if len(self.check_results['url_canonicalization']) == 1: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/favicon.py b/rating/favicon.py new file mode 100644 index 0000000..5387a1e --- /dev/null +++ b/rating/favicon.py @@ -0,0 +1,32 @@ +""" +This gives a score if the site has an icon. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['html_head'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + for url in self.check_results['html_head']: + if self.check_results['html_head'][url]['link_icon'] is not None: + value = True + score = self.max_score + break + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/feeds.py b/rating/feeds.py new file mode 100644 index 0000000..edc8888 --- /dev/null +++ b/rating/feeds.py @@ -0,0 +1,35 @@ +""" +This gives a score if the site has feeds. 
+""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['html_head'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + for url in self.check_results['html_head']: + if self.check_results['html_head'][url]['link_rss_atom'] is None: + continue + if self.check_results['html_head'][url]['link_rss_atom'] == []: + continue + value = True + score = self.max_score + break + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/https.py b/rating/https.py new file mode 100644 index 0000000..e47550e --- /dev/null +++ b/rating/https.py @@ -0,0 +1,47 @@ +""" +This looks at all HTTPS URLs we checked for reachability. + +If all of them were reachable without errors, we give full score. +If some or all had errors, or no HTTPS URL is reachable, we give zero. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['url_reachability'] + + # HTTPS is very important, so this counts double + max_score = 2 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + reachable_count = 0 + unreachable_count = 0 + + for url in self.check_results['url_reachability']: + if not url.startswith('https://'): + continue + + if self.check_results['url_reachability'][url]['exception'] is None: + reachable_count += 1 + else: + unreachable_count += 1 + + if unreachable_count == 0 and reachable_count > 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/no_network_errors.py b/rating/no_network_errors.py new file mode 100644 index 0000000..ac56247 --- /dev/null +++ b/rating/no_network_errors.py @@ -0,0 +1,48 @@ +""" +If all URLs could be loaded without severe network errors, this rater gives a score. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['load_in_browser'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + found_pageloads = 0 + found_errors = 0 + for url in self.check_results['load_in_browser']: + if (self.check_results['load_in_browser'][url]['logs'] == [] or + self.check_results['load_in_browser'][url]['logs'] is None): + continue + + found_pageloads += 1 + + # scan log entries for script errors + for entry in self.check_results['load_in_browser'][url]['logs']: + if entry['source'] != 'network': + continue + if entry['level'] != 'SEVERE': + continue + + found_errors += 1 + + if found_pageloads > 0 and found_errors == 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/no_script_errors.py b/rating/no_script_errors.py new file mode 100644 index 0000000..32a89c6 --- /dev/null +++ b/rating/no_script_errors.py @@ -0,0 +1,42 @@ +""" +If all URLs could be loaded without JavaScript errors, this rater gives a score. 
+""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['load_in_browser'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + found_pageloads = 0 + found_errors = 0 + for url in self.check_results['load_in_browser']: + if self.check_results['load_in_browser'][url]['logs'] == []: + found_pageloads += 1 + continue + + # scan log entries for script errors + for entry in self.check_results['load_in_browser'][url]['logs']: + if entry['source'] == 'javascript': + found_errors += 1 + + if found_pageloads > 0 and found_errors == 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/reachable.py b/rating/reachable.py new file mode 100644 index 0000000..381cdb0 --- /dev/null +++ b/rating/reachable.py @@ -0,0 +1,36 @@ +""" +This gives a score if one of the checked URL variations was reachable. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['url_reachability'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + count = 0 + for url in self.check_results['url_reachability']: + if self.check_results['url_reachability'][url]['exception'] is not None: + continue + count += 1 + + if count > 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/resolvable.py b/rating/resolvable.py new file mode 100644 index 0000000..01e243e --- /dev/null +++ b/rating/resolvable.py @@ -0,0 +1,35 @@ +""" +This gives a score if one of the input URL's hostnames was resolvable +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['dns_resolution'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + count = 0 + for url in self.check_results['dns_resolution']: + if self.check_results['dns_resolution'][url]['resolvable']: + count += 1 + + if count > 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/response_duration.py b/rating/response_duration.py new file mode 100644 index 0000000..6f22d84 --- /dev/null +++ b/rating/response_duration.py @@ -0,0 +1,46 @@ +""" +This looks at the response duration(s) and scores based on the bucket +the value is in. Fast responses get one point, slower half a point, +more than a seconds gets nothing. 
+""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'number' + default_value = False + depends_on_checks = ['page_content'] + max_score = 1.0 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + duration_sum = 0 + duration_count = 0 + + for url in self.check_results['page_content']: + if self.check_results['page_content'][url]['exception'] is not None: + continue + duration_sum += self.check_results['page_content'][url]['duration'] + duration_count += 1 + + if duration_count > 0: + value = round(duration_sum / duration_count) + + # value is duration in milliseconds + if value < 100: + score = self.max_score + elif value < 1000: + score = self.max_score * 0.5 + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/responsive_layout.py b/rating/responsive_layout.py new file mode 100644 index 0000000..2c198eb --- /dev/null +++ b/rating/responsive_layout.py @@ -0,0 +1,35 @@ +""" +This gives a score if the site's minimal document width during checks +was smaller than or equal to the minimal viewport size tested. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['load_in_browser'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + for url in self.check_results['load_in_browser']: + if (self.check_results['load_in_browser'][url]['min_document_width'] <= + self.check_results['load_in_browser'][url]['sizes'][0]['viewport_width']): + value = True + score = self.max_score + # we use the first URL found here + break + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/use_specific_fonts.py b/rating/use_specific_fonts.py new file mode 100644 index 0000000..8acb907 --- /dev/null +++ b/rating/use_specific_fonts.py @@ -0,0 +1,41 @@ +""" +Checks whether the pages use the font 'Arvo'. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['load_in_browser'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + urls_with_font = 0 + urls_without_font = 0 + for url in self.check_results['load_in_browser']: + if self.check_results['load_in_browser'][url]['font_families'] is None: + urls_without_font += 1 + continue + + fonts = " ".join(self.check_results['load_in_browser'][url]['font_families']) + if 'arvo' in fonts: + urls_with_font += 1 + + if urls_with_font > 0 and urls_without_font == 0: + score = self.max_score + value = True + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/www_optional.py b/rating/www_optional.py new file mode 100644 index 0000000..0afef45 --- /dev/null +++ b/rating/www_optional.py @@ -0,0 +1,44 @@ +""" +This looks at reachable URLs and checks whether (sub)domains +both with and without www. are reachable. 
+""" + +from urllib.parse import urlparse + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['url_reachability'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + hostnames = set() + for url in self.check_results['url_reachability']: + if self.check_results['url_reachability'][url]['exception'] is not None: + continue + parsed = urlparse(url) + hostnames.add(parsed) + + # FIXME + # we simply check whether there is more than one hostname. + # this works with our current input URls but might be too + # simplistic in the future. + if len(list(hostnames)) > 1: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/spider.py b/spider.py deleted file mode 100644 index 4e4f6e6..0000000 --- a/spider.py +++ /dev/null @@ -1,814 +0,0 @@ -""" -Provides the spider functionality (website checks). -""" - -import argparse -import json -import logging -import os -import random -import re -import shutil -import statistics -import time -from datetime import datetime -from socket import gethostbyname_ex -from urllib.parse import urljoin -from urllib.parse import urlparse - -import requests -import yaml -import tenacity - -from bs4 import BeautifulSoup -from git import Repo -from selenium import webdriver -from google.cloud import datastore -from google.api_core.exceptions import Aborted -from google.api_core.exceptions import InvalidArgument - - -# configuration - -# connection timeout for website checks (seconds) -CONNECT_TIMEOUT = 5 - -# response timeout for website checks -READ_TIMEOUT = 10 - -# Git repo for our data -GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git' -# folder in that repo that holds the data -GREEN_DIRECTORY_DATA_PATH = 'data/countries/de' -GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory' - -RESULT_PATH = '/out' - -# IP address of the newthinking GCMS server -GCMS_IP = "91.102.13.20" - -JOB_DATASTORE_KIND = 'spider-jobs' -RESULTS_DATASTORE_KIND = 'spider-results' - -# end configuration - -DATASTORE_CLIENT = None - - -def chunks(the_list, size): - """ - Yield successive n-sized chunks from list the_list - where n = size. - """ - for i in range(0, len(the_list), size): - yield the_list[i:i + size] - - -def create_jobs(url=None): - """ - Read all URLs from green directory and fill a job database - with one job per URL. - - Alternatively, if the url argument is given, only the given URL - will be added as a spider job. 
- """ - - # refresh our local clone of the green directory - logging.info("Refreshing green-directory clone") - get_green_directory() - - # build the list of website URLs to run checks for - logging.info("Processing green-directory") - input_entries = [] - - count = 0 - - for entry in dir_entries(): - - if 'type' not in entry: - logging.error("Entry without type") - continue - if 'urls' not in entry: - logging.debug("Entry %s does not have any URLs.", repr_entry(entry)) - continue - - website_url = None - for index in range(len(entry['urls'])): - try: - if entry['urls'][index]['type'] == "WEBSITE": - website_url = entry['urls'][index]['url'] - if website_url: - if url is not None and website_url != url: - continue - input_entries.append({ - "url": website_url, - "level": entry.get("level"), - "state": entry.get("state"), - "district": entry.get("district"), - "city": entry.get("city"), - }) - count += 1 - except NameError: - logging.error("Error in %s: 'url' key missing (%s)", - repr_entry(entry), entry['urls'][index]) - - # ensure the passed URL argument is really there, even if not part - # of the directory. - if url and count == 0: - logging.info("Adding job for URL %s which is not part of green-directory", url) - input_entries.append({ - "url": url, - "level": None, - "state": None, - "district": None, - "city": None, - }) - - # randomize order, to distribute requests over servers - logging.debug("Shuffling input URLs") - random.seed() - random.shuffle(input_entries) - - count = 0 - logging.info("Writing jobs") - - entities = [] - - for entry in input_entries: - key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"]) - entity = datastore.Entity(key=key) - entity.update({ - "created": datetime.utcnow(), - "level": entry["level"], - "state": entry["state"], - "district": entry["district"], - "city": entry["city"], - }) - entities.append(entity) - - # commmit to DB - for chunk in chunks(entities, 300): - logging.debug("Writing jobs chunk of length %d", len(chunk)) - DATASTORE_CLIENT.put_multi(chunk) - count += len(chunk) - - logging.info("Writing jobs done, %s jobs added", count) - - -def get_green_directory(): - """ - Clones the source of website URLs, the green directory, - into the local file system using git - """ - if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH): - shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH) - Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH) - - -def dir_entries(): - """ - Iterator over all data files in the cloned green directory - """ - path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH) - for root, _, files in os.walk(path): - for fname in files: - - filepath = os.path.join(root, fname) - if not filepath.endswith(".yaml"): - continue - - with open(filepath, 'r', encoding='utf8') as yamlfile: - for doc in yaml.load_all(yamlfile): - yield doc - - -def repr_entry(entry): - """ - Return string representation of a directory entry, - for logging/debugging purposes - """ - ret = entry['type'] - if 'level' in entry: - ret += "/" + entry['level'] - if 'state' in entry: - ret += "/" + entry['state'] - if 'district' in entry: - ret += "/" + entry['district'] - return ret - - -def derive_test_hostnames(hostname): - """ - Derives the hostnames variants to test for a given host name. - From 'gruene-x.de' or 'www.gruene-x.de' it makes - - ['gruene-x.de', 'www.gruene-x.de'] - - which are both plausible web URLs to be used for a domain. 
- """ - - hostnames = set() - - hostnames.add(hostname) - if hostname.startswith('www.'): - hostnames.add(hostname[4:]) - else: - hostnames.add('www.' + hostname) - - return sorted(list(hostnames)) - - -def reduce_urls(urllist): - """ - Reduce a list of urls with metadata by eliminating those - that either don't work or lead somewhere else - """ - targets = set() - for url in urllist: - if url['error'] is not None: - continue - if url['redirects_to'] is not None: - targets.add(url['redirects_to']) - else: - targets.add(url['url']) - return sorted(list(targets)) - - -def normalize_title(title): - """ - Removes garbage from HTML page titles - """ - title = title.replace(u'\u00a0', ' ') - title = title.replace(' ', ' ') - title = title.strip() - return title - - -def check_responsiveness(url): - """ - Checks - - whether a page adapts to different viewport sizes - - whether a viewport meta tag exists - and returns details - """ - details = { - 'document_width': {}, - 'viewport_meta_tag': None, - } - - # sizes we check for (width, height) - sizes = ( - (320, 480), # old smartphone - (768, 1024), # older tablet or newer smartphone - (1024, 768), # older desktop or horiz. tablet - (1920, 1080), # Full HD horizontal - ) - - # Our selenium user agent using Chrome headless as an engine - chrome_options = webdriver.ChromeOptions() - chrome_options.add_argument('--headless') - chrome_options.add_argument('--disable-gpu') - chrome_options.add_argument('--no-sandbox') - chrome_options.add_argument('--disable-extensions') - driver = webdriver.Chrome(chrome_options=chrome_options) - driver.set_page_load_timeout(60) - driver.set_window_size(sizes[0][0], sizes[0][1]) - driver.get(url) - time.sleep(1) - - for (width, height) in sizes: - driver.set_window_size(width, height) - key = "%sx%s" % (width, height) - width = driver.execute_script("return document.body.scrollWidth") - details['document_width'][key] = int(width) - - try: - element = driver.find_element_by_xpath("//meta[@name='viewport']") - details['viewport_meta_tag'] = element.get_attribute('content') - except: - pass - - return details - - -def check_content(req): - """ - Adds details to check regarding content of the page - - check: the dict containing details for this URL - r: requests request/response object - """ - result = {} - - result['encoding'] = req.encoding.lower() - soup = BeautifulSoup(req.text, 'html.parser') - - result['html'] = req.text - - # page title - result['title'] = None - title = None - head = soup.find('head') - if head is not None: - title = head.find('title') - if title is not None: - result['title'] = normalize_title(title.get_text()) - - # canonical link - result['canonical_link'] = None - link = soup.find('link', rel='canonical') - if link: - result['canonical_link'] = urljoin(req.url, link.get('href')) - - # icon - result['icon'] = None - link = soup.find('link', rel=lambda x: x and x.lower() == 'icon') - if link: - result['icon'] = urljoin(req.url, link.get('href')) - else: - link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon') - if link: - result['icon'] = urljoin(req.url, link.get('href')) - - # feed links - result['feeds'] = [] - rss_links = soup.find_all('link', type='application/rss+xml') - atom_links = soup.find_all('link', type='application/atom+xml') - - if rss_links: - for link in rss_links: - result['feeds'].append(urljoin(req.url, link.get('href'))) - if atom_links: - for link in rss_links: - result['feeds'].append(urljoin(req.url, link.get('href'))) - - # generator meta tag - 
result['generator'] = None - if head is not None: - generator = head.select('[name=generator]') - if generator: - result['generator'] = generator[0].get('content') - - # opengraph meta tags - result['opengraph'] = None - opengraph = set() - if head is not None: - for item in head.find_all(property=re.compile('^og:')): - opengraph.add(item.get('property')) - for item in head.find_all(itemprop=re.compile('^og:')): - opengraph.add(item.get('itemprop')) - if opengraph: - result['opengraph'] = sorted(list(opengraph)) - - return result - - -def collect_ipv4_addresses(hostname_dict): - """ - Return list of unique IPv4 addresses - """ - ips = set() - for item in hostname_dict.values(): - if 'ip_addresses' not in item: - continue - for ip_addr in item['ip_addresses']: - ips.add(ip_addr) - return sorted(list(ips)) - - -def parse_generator(generator): - """ - Return well known CMS names from generator - """ - generator = generator.lower() - if 'typo3' in generator: - return "typo3" - if 'wordpress' in generator: - return "wordpress" - if 'drupal' in generator: - return "drupal" - if 'joomla' in generator: - return "joomla" - return generator - -def check_site(entry): - """ - Performs our site check and returns results as a dict. - - 1. Normalize the input URL and derive the URLs to check for - 2. HEAD the check urls - 3. Determine the canonical URL - 4. Run full check on canonical URL - """ - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' + - 'AppleWebKit/537.36 (KHTML, like Gecko) ' + - 'Chrome/65.0.3325.181 green-spider/0.1' - } - - # all the info we'll return for the site - result = { - # input_url: The URL we derived all checks from - 'input_url': entry['url'], - # Meta: Regional and type metadata for the site - 'meta': { - 'level': entry.get('level'), - 'state': entry.get('state'), - 'district': entry.get('district'), - 'city': entry.get('city'), - }, - # Details: All details we collected about the site (which aren't directly - # related to the report criteria) - 'details': { - 'hostnames': {}, - 'ipv4_addresses': [], - 'resolvable_urls': [], - 'canonical_urls': [], - 'urlchecks': [], - 'icons': [], - 'feeds': [], - 'cms': None, - 'responsive': None, - }, - # The actual report criteria - 'result': { - 'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0}, - 'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0}, - 'HTTPS': {'type': 'boolean', 'value': False, 'score': 0}, - 'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0}, - 'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0}, - 'FAVICON': {'type': 'boolean', 'value': False, 'score': 0}, - 'FEEDS': {'type': 'boolean', 'value': False, 'score': 0}, - 'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0}, - 'RESPONSIVE': {'type': 'boolean', 'value': False, 'score': 0}, - }, - 'score': 0.0, - } - - # derive hostnames to test (with/without www.) 
- parsed = urlparse(entry['url']) - hostnames = derive_test_hostnames(parsed.hostname) - - # try to resolve hostnames - processed_hostnames = {} - for hostname in hostnames: - - processed_hostnames[hostname] = { - 'resolvable': False, - } - - try: - hostname, aliases, ip_addresses = gethostbyname_ex(hostname) - processed_hostnames[hostname]['resolvable'] = True - processed_hostnames[hostname]['resolved_hostname'] = hostname - processed_hostnames[hostname]['aliases'] = aliases - processed_hostnames[hostname]['ip_addresses'] = ip_addresses - except: - pass - - result['details']['hostnames'] = processed_hostnames - - result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames) - - # check basic HTTP(S) reachability - checked_urls = [] - checked_urls_set = set() - - for hostname in processed_hostnames.keys(): - - item = processed_hostnames[hostname] - - if not item['resolvable']: - continue - - for scheme in ('http', 'https'): - - url = scheme + '://' + item['resolved_hostname'] + '/' - - if url in checked_urls_set: - continue - - checked_urls_set.add(url) - - record = { - 'url': url, - 'error': None, - 'redirects_to': None, - } - - try: - req = requests.head(record['url'], headers=headers, allow_redirects=True) - if req.url == url: - logging.info("URL: %s - status %s", record['url'], req.status_code) - else: - logging.info("URL: %s - status %s - redirects to %s", record['url'], - req.status_code, req.url) - record['redirects_to'] = req.url - except Exception as exc: - record['error'] = { - 'type': str(type(exc)), - 'message': str(exc), - } - logging.info("URL %s: %s %s", url, str(type(exc)), exc) - - checked_urls.append(record) - - result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url']) - result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls)) - - # Deeper test for the remaining (canonical) URL(s) - for check_url in result['details']['canonical_urls']: - - logging.info("Downloading URL %s", check_url) - - check = { - 'url': check_url, - 'status_code': None, - 'duration': None, - 'error': None, - 'content': None, - 'responsive': None, - } - - try: - req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT)) - check['status_code'] = req.status_code - check['duration'] = round(req.elapsed.microseconds / 1000) - - # Content checks - if req.status_code < 300: - check['content'] = check_content(req) - - # Responsiveness check - try: - check['responsive'] = check_responsiveness(check_url) - except Exception as exc: - logging.error("Error when checking responsiveness for '%s': %s", check_url, exc) - - except requests.exceptions.ConnectionError as exc: - logging.error(str(exc) + " " + check_url) - check['error'] = "connection" - except requests.exceptions.ReadTimeout as exc: - logging.error(str(exc) + " " + check_url) - check['error'] = "read_timeout" - except requests.exceptions.Timeout as exc: - logging.error(str(exc) + " " + check_url) - check['error'] = "connection_timeout" - except Exception as exc: - logging.error(str(exc) + " " + check_url) - check['error'] = "unknown" - - result['details']['urlchecks'].append(check) - - - result['details']['urlchecks'] = sorted(result['details']['urlchecks'], - key=lambda url: url['url']) - - # collect icons - icons = set() - for c in result['details']['urlchecks']: - if 'content' not in c: - continue - if c['content'] is None: - logging.warning("No content for %s", entry['url']) - continue - if c['content']['icon'] is not None: - icons.add(c['content']['icon']) - 
result['details']['icons'] = sorted(list(icons)) - - # collect feeds - feeds = set() - for c in result['details']['urlchecks']: - if c['content'] is None: - logging.warning("No content for %s", entry['url']) - continue - if 'feeds' in c['content'] and len(c['content']['feeds']): - for feed in c['content']['feeds']: - feeds.add(feed) - result['details']['feeds'] = sorted(list(feeds)) - - # detect responsive - viewports = set() - min_width = 2000 - for c in result['details']['urlchecks']: - if c['responsive'] is None: - continue - if c['responsive']['viewport_meta_tag'] is not None: - viewports.add(c['responsive']['viewport_meta_tag']) - widths = c['responsive']['document_width'].values() - if min(widths) < min_width: - min_width = min(widths) - result['details']['responsive'] = { - 'viewport_meta_tag': list(viewports), - 'min_width': min_width, - } - - # detect CMS - for c in result['details']['urlchecks']: - if c['content'] is None: - continue - if 'generator' not in c['content']: - continue - if c['content']['generator'] != "" and c['content']['generator'] is not None: - - result['details']['cms'] = parse_generator(c['content']['generator']) - # Qualify certain CMS flavours in more detail - if result['details']['cms'] == "typo3": - if GCMS_IP in result['details']['ipv4_addresses']: - result['details']['cms'] = "typo3-gcms" - elif 'typo3-gruene.de' in c['content']['html']: - result['details']['cms'] = "typo3-gruene" - elif result['details']['cms'] == "wordpress": - if 'Urwahl3000' in c['content']['html']: - result['details']['cms'] = "wordpress-urwahl" - - else: - # No generator Tag. Use HTML content. - if 'Urwahl3000' in c['content']['html']: - result['details']['cms'] = "wordpress-urwahl" - elif ('josephknowsbest' in c['content']['html'] or - 'Joseph-knows-best' in c['content']['html']): - result['details']['cms'] = "wordpress-josephknowsbest" - elif 'wordpress' in c['content']['html']: - result['details']['cms'] = "wordpress" - - # we can stop here - break - - - ### Derive criteria - - # DNS_RESOLVABLE_IPV4 - if result['details']['ipv4_addresses']: - result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1} - - # SITE_REACHABLE - for item in result['details']['resolvable_urls']: - if item['error'] is None: - result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1} - break - - # HTTPS - for item in result['details']['urlchecks']: - if item['error'] is None and item['url'].startswith('https://'): - result['result']['HTTPS'] = {'value': True, 'score': 2} - break - - # WWW_OPTIONAL - num_hostnames = 0 - for hostname in result['details']['hostnames'].keys(): - item = result['details']['hostnames'][hostname] - if not item['resolvable']: - continue - num_hostnames += 1 - if num_hostnames > 1: - result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1} - - # CANONICAL_URL - # - either there is only one canonical URL (through redirects) - # - or several pages have identical rel=canonical links - if len(result['details']['canonical_urls']) == 1: - result['result']['CANONICAL_URL'] = {'value': True, 'score': 1} - else: - links = set() - if result['details']['urlchecks'] is None: - logging.warning("No urlchecks for %s", entry['url']) - else: - for item in result['details']['urlchecks']: - if item['content'] is not None and item['content']['canonical_link'] is not None: - links.add(item['content']['canonical_link']) - if len(links) == 1: - result['result']['CANONICAL_URL'] = {'value': True, 'score': 1} - - # FAVICON - if result['details']['icons']: - 
result['result']['FAVICON'] = {'value': True, 'score': 1} - - # FEEDS - if result['details']['feeds']: - result['result']['FEEDS'] = {'value': True, 'score': 1} - - # HTTP_RESPONSE_DURATION - durations = [] - for item in result['details']['urlchecks']: - if item['error'] is None: - durations.append(item['duration']) - if durations: - val = round(statistics.mean(durations)) - result['result']['HTTP_RESPONSE_DURATION']['value'] = val - if val < 100: - result['result']['HTTP_RESPONSE_DURATION']['score'] = 1 - elif val < 1000: - result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5 - - # RESPONSIVE - if result['details']['responsive'] is not None: - if (result['details']['responsive']['min_width'] < 500 and - len(result['details']['responsive']['viewport_meta_tag']) > 0): - result['result']['RESPONSIVE']['value'] = True - result['result']['RESPONSIVE']['score'] = 1 - - # Overall score - for item in result['result'].keys(): - result['score'] += result['result'][item]['score'] - - # clean up - remove full HTML - for item in result['details']['urlchecks']: - try: - del item['content']['html'] - except: - pass - - return result - - -@tenacity.retry(wait=tenacity.wait_exponential(), - retry=tenacity.retry_if_exception_type(Aborted)) -def get_job_from_queue(): - """ - Returns a URL from the queue - """ - out = None - - with DATASTORE_CLIENT.transaction(): - query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND) - for entity in query.fetch(limit=1): - logging.debug("Got job: %s", entity) - out = dict(entity) - out["url"] = entity.key.name - DATASTORE_CLIENT.delete(entity.key) - - return out - -def work_of_queue(): - """ - Take job from queue and finish it until there are no more jobs - """ - while True: - job = get_job_from_queue() - if job is None: - logging.info("No more jobs. 
Exiting.") - break - - logging.info("Starting job %s", job["url"]) - result = check_site(entry=job) - #logging.debug(result) - logging.info("Job %s finished checks", job["url"]) - logging.info("Job %s writing to DB", job["url"]) - - key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"]) - entity = datastore.Entity(key=key, exclude_from_indexes=['results']) - record = { - "created": datetime.utcnow(), - "results": result, - } - entity.update(record) - try: - DATASTORE_CLIENT.put(entity) - except InvalidArgument as ex: - logging.error("Could not write result: %s", ex) - except ex: - logging.error("Could not write result: %s", ex) - - -if __name__ == "__main__": - """ - Bringing it all together - """ - parser = argparse.ArgumentParser() - parser.add_argument('--credentials-path', dest='credentials_path', - help='Path to the service account credentials JSON file', - default='/secrets/service-account.json') - parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)", - default='info') - - subparsers = parser.add_subparsers(help='sub-command help', dest='command') - - subparsers.add_parser('spider', help='Take jobs off the queue and spider') - - jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue') - - jobs_parser.add_argument('--url', help='Add a job to spider a URL') - args = parser.parse_args() - - loglevel = args.loglevel.lower() - if loglevel == 'error': - logging.basicConfig(level=logging.ERROR) - elif loglevel == 'warn': - logging.basicConfig(level=logging.WARN) - elif loglevel == 'debug': - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - loglevel = 'info' - - logging.getLogger("urllib3").setLevel(logging.CRITICAL) - - DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path) - - logging.debug("Called command %s", args.command) - - if args.command == 'jobs': - create_jobs(args.url) - else: - work_of_queue() diff --git a/spider/__init__.py b/spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/spider/spider.py b/spider/spider.py new file mode 100644 index 0000000..d1cf6de --- /dev/null +++ b/spider/spider.py @@ -0,0 +1,106 @@ +""" +Provides the spider functionality (website checks). +""" + +import argparse +import json +import logging +import re +import statistics +import time +from datetime import datetime +from pprint import pprint + +from google.api_core.exceptions import InvalidArgument +from google.cloud import datastore + +import checks +import config +import jobs +import rating + +def check_and_rate_site(entry): + """ + Performs our site check and returns results as a dict. + + 1. Normalize the input URL and derive the URLs to check for + 2. HEAD the check urls + 3. Determine the canonical URL + 4. 
Run full check on canonical URL + """ + + # all the info we'll return for the site + result = { + # input_url: The URL we derived all checks from + 'input_url': entry['url'], + # Meta: Regional and type metadata for the site + 'meta': { + 'type': entry.get('type'), + 'level': entry.get('level'), + 'state': entry.get('state'), + 'district': entry.get('district'), + 'city': entry.get('city'), + }, + # checks: Results from our checks + 'checks': {}, + # The actual report scoring criteria + 'rating': {}, + # resulting score + 'score': 0.0, + } + + # Results from our next generation checkers + result['checks'] = checks.perform_checks(entry['url']) + + result['rating'] = rating.calculate_rating(result['checks']) + + # Overall score is the sum of the individual scores + for key in result['rating']: + result['score'] += result['rating'][key]['score'] + + # remove full HTML page content, + # as it's no longer needed + try: + for url in result['checks']['page_content']: + del result['checks']['page_content'][url]['content'] + except: + pass + + return result + + +def work_of_queue(datastore_client, entity_kind): + """ + Take job from queue and finish it until there are no more jobs + """ + while True: + job = jobs.get_job_from_queue(datastore_client) + if job is None: + logging.info("No more jobs. Exiting.") + break + + logging.info("Starting job %s", job["url"]) + result = check_and_rate_site(entry=job) + + logging.debug("Full JSON representation of returned result: %s", json.dumps(result)) + + logging.info("Job %s finished checks", job["url"]) + logging.info("Job %s writing to DB", job["url"]) + + key = datastore_client.key(entity_kind, job["url"]) + entity = datastore.Entity(key=key, exclude_from_indexes=['results']) + record = { + 'created': datetime.utcnow(), + 'meta': result['meta'], + 'checks': result['checks'], + 'rating': result['rating'], + 'score': result['score'], + } + entity.update(record) + try: + datastore_client.put(entity) + except InvalidArgument as ex: + logging.error("Could not write result: %s", ex) + except Exception as ex: + logging.error("Could not write result: %s", ex) + diff --git a/spider/spider_test.py b/spider/spider_test.py new file mode 100644 index 0000000..dda55e7 --- /dev/null +++ b/spider/spider_test.py @@ -0,0 +1,26 @@ +import unittest + +from spider.spider import check_and_rate_site + +from pprint import pprint + +class TestSpiderr(unittest.TestCase): + + def test_url1(self): + + entry = { + "url": "https://httpbin.org/html", + "type": "type", + "state": "state", + "level": "level", + "district": "district", + "city": "city", + } + + url = "https://httpbin.org/html" + result = check_and_rate_site(entry) + + self.assertEqual(result["input_url"], url) + +if __name__ == '__main__': + unittest.main() diff --git a/spider_test.py b/spider_test.py deleted file mode 100644 index a617147..0000000 --- a/spider_test.py +++ /dev/null @@ -1,125 +0,0 @@ -import unittest -import requests -import responses -import spider - - -class TestDeriveHostnames(unittest.TestCase): - - def test_basic1(self): - hn = spider.derive_test_hostnames('www.my-domain.de') - expected = ['my-domain.de', 'www.my-domain.de'] - self.assertEqual(hn, expected) - - def test_basic2(self): - hn = spider.derive_test_hostnames('domain.de') - expected = ['domain.de', 'www.domain.de'] - self.assertEqual(hn, expected) - - -class TestReduceURLs(unittest.TestCase): - - def test_basic(self): - testdata = [ - {'url': 'one', 'error': None, 'redirects_to': None}, - {'url': 'two', 'error': 'Yes', 'redirects_to': 
None}, - {'url': 'three', 'error': None, 'redirects_to': 'five'}, - ] - expected_result = ['five', 'one'] - result = spider.reduce_urls(testdata) - self.assertEqual(result, expected_result) - - -class TestContentChecks(unittest.TestCase): - - @responses.activate - def test_minimal(self): - url = 'http://my.url' - responses.add(responses.GET, url, status=200, - content_type='text/html', - body='') - r = requests.get(url) - result = spider.check_content(r) - - del result['html'] # don't want to have the messy HTML part in comparison - - expected_result = { - 'icon': None, - 'title': None, - 'generator': None, - 'feeds': [], - 'encoding': 'iso-8859-1', - 'canonical_link': None, - 'opengraph': None - } - self.assertDictEqual(result, expected_result) - - @responses.activate - def test_basic(self): - url = 'http://my.url' - responses.add(responses.GET, url, status=200, - content_type='text/html; charset=UTF-8', - body=''' - - - - The page's title - - - - - - - ''') - r = requests.get(url) - result = spider.check_content(r) - - del result['html'] # don't want to have the messy HTML part in comparison - - expected_result = { - 'icon': 'http://foo.bar/image.png', - 'title': 'The page\'s title', - 'generator': 'some-cms/1.0', - 'feeds': [ - 'http://example.com/feed', - ], - 'encoding': 'utf-8', - 'canonical_link': 'https://my.site.com/', - 'opengraph': None - } - self.assertDictEqual(result, expected_result) - - @responses.activate - def test_opengraph(self): - url = 'http://my.url' - responses.add(responses.GET, url, status=200, - content_type='text/html; charset=UTF-8', - body=''' - - - - - - - - - ''') - r = requests.get(url) - result = spider.check_content(r) - - del result['html'] # don't want to have the messy HTML part in comparison - - expected_result = { - 'icon': None, - 'title': None, - 'generator': None, - 'feeds': [], - 'encoding': 'utf-8', - 'canonical_link': None, - 'opengraph': ['og:image', 'og:title', 'og:type', 'og:url'], - } - self.assertDictEqual(result, expected_result) - - -if __name__ == '__main__': - unittest.main()
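
A note on the extension point this modularization creates: a new rating criterion is simply a module under rating/ that exposes a Rater class following the AbstractRater contract above (rating_type, default_value, depends_on_checks, max_score and a rate() method), plus an entry in the rating_modules dict in rating/__init__.py. The sketch below is a hypothetical illustration and not part of this patch; the 'text_length' check it depends on does not exist in the codebase, and the threshold is made up.

# rating/text_length.py -- hypothetical example, not part of this patch
"""
Would give a score if at least one page contains a substantial amount of text.
Depends on a made-up 'text_length' check that maps URLs to character counts.
"""

from rating.abstract_rater import AbstractRater

class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['text_length']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        for url in self.check_results['text_length']:
            # arbitrary threshold, for illustration only
            if self.check_results['text_length'][url] > 500:
                value = True
                score = self.max_score
                break

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

Registered as, say, 'TEXT_LENGTH': text_length in rating_modules, its score would be summed into the overall result by check_and_rate_site like any other criterion; for instance, a site that passes only HTTPS (max_score 2), lands in the 0.5 bucket for HTTP_RESPONSE_DURATION and passes this hypothetical criterion would end up with an overall score of 3.5.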
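
For orientation, a rough sketch of how the refactored modules could be wired together end to end. This is illustrative only and not taken from the patch: the service account path and the 'spider-results' kind are carried over from the deleted spider.py defaults and may not match the actual entry point, which is outside this excerpt.

# Illustrative wiring only -- not part of this patch.
import logging

from google.cloud import datastore

import jobs
from spider.spider import work_of_queue

logging.basicConfig(level=logging.INFO)

# Datastore client, constructed the same way the removed spider.py did.
datastore_client = datastore.Client.from_service_account_json(
    '/secrets/service-account.json')

# 1. Refresh the green-directory clone and enqueue one job per website URL.
jobs.create_jobs(datastore_client)

# 2. Work the queue until it is empty; each finished job is checked, rated and
#    written to the given results kind ('spider-results' in the pre-refactor code).
work_of_queue(datastore_client, 'spider-results')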