Merge pull request #58 from netzbegruenung/write-to-db

Use job queue, write to database
2024-05-03 09:33:42 +02:00 · 2018-08-23 10:16:50 +02:00 · 2018-08-23 10:16:50 +02:00 · bf23478265
parent b61172445b ab84ec8796
commit bf23478265
6 changed files with 425 additions and 245 deletions
--- a/3
+++ b/3
@ -12,7 +12,7 @@ RUN apt-get update \
  && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && dpkg -i google-chrome-stable_current_amd64.deb \
    && rm google-chrome-stable_current_amd64.deb \
-  && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 certifi==2018.1.18 \
+  && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 \
  && wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
    && unzip chromedriver_linux64.zip \
    && rm chromedriver_linux64.zip \
@ -25,6 +25,7 @@ RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x
 ADD spider.py /
 ADD spider_test.py /
 ADD data_export.py /
 ENTRYPOINT ["python3"]
 CMD ["/spider.py"]
--- a/22
+++ b/22
@ -7,24 +7,40 @@ dockerimage:
 	docker pull debian:stretch-slim
 	docker build -t spider .
 # Create spider job queue
 #
 # Runs spider.py with the "jobs" subcommand inside the "spider" image
 # (built by the dockerimage prerequisite). The local secrets directory
 # is mounted at /secrets so that the credentials file
 # /secrets/datastore-writer.json is reachable from within the container.
 spiderjobs: dockerimage
 	docker run --rm -ti \
 		-v $(PWD)/secrets:/secrets \
 		spider spider.py \
 		--credentials-path /secrets/datastore-writer.json \
 		--loglevel debug \
 		jobs
 # Run spider in docker image
 #
 # Runs spider.py with the "spider" subcommand inside the "spider" image.
 # Mounted volumes:
 #   webapp/dist/data -> /out
 #   docs/siteicons   -> /icons
 #   secrets          -> /secrets (holds datastore-writer.json)
 spider: dockerimage
 	docker run --rm -ti \
 		-v $(PWD)/webapp/dist/data:/out \
 		-v $(PWD)/docs/siteicons:/icons \
 		-v $(PWD)/secrets:/secrets \
 		spider spider.py \
 		--credentials-path /secrets/datastore-writer.json \
 		--loglevel debug \
 		spider
 # Run /spider_test.py inside the "spider" image (the image's entrypoint
 # is python3, so the script path is passed as the only argument).
 test: dockerimage
 	docker run --rm -ti spider /spider_test.py
-screenshots: venv
+export:
-	venv/bin/python ./screenshots.py secrets/screenshot-reader.json
+	docker run --rm -ti \
 		-v $(PWD)/webapp/dist/data:/out \
 		-v $(PWD)/secrets:/secrets \
 		spider data_export.py /secrets/datastore-reader.json
 webapp/node_modules:
 	cd webapp && npm install
 # Build webapp
-webapp: webapp/node_modules screenshots
+webapp: webapp/node_modules
 	cd webapp && npx webpack --config webpack.config.js
 	cp -r webapp/dist/* ./docs/
 	cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/
--- a/README.md
+++ b/README.md
@ -1,11 +1,15 @@
 # Green Spider
-Initiative und Tools zur Förderung eines benutzer*innenfreundlichen Auftritts von Bündnis 90/Die Grünen im Web
+Initiative und Tools zur Förderung eines benutzer*innenfreundlichen Auftritts von Bündnis 90/Die Grünen im Web.
 Zur Auswertung: [https://green-spider.netzbegruenung.de/](https://green-spider.netzbegruenung.de/)
 ## Tools
 - Spider: Sammelt Informationen über Websites von B90/GRÜNE Gliederungen
 - Screenshotter: Erstellt Seiten-Screenshots. Siehe [netzbegruenung/green-spider-screenshotter](https://github.com/netzbegruenung/green-spider-screenshotter/)
 - Webapp: Darstellung der Spider-Ergebnisse unter [green-spider.netzbegruenung.de](https://green-spider.netzbegruenung.de/)
 ## Aktivitäten
@ -22,21 +26,28 @@ Zur Kommunikation dient der Chatbegrünung-Kanal [#green-spider](https://chatbeg
 ### Spider ausführen
 Damit werden alle bekannten WWW-Adressen aus [netzbegruenung/green-directory](https://github.com/netzbegruenung/green-directory) geprüft und Daten dazu gesammelt.
 Voraussetzungen:
- GNU make
+- Docker
- Python 3
+- Schlüssel mit Schreibrecht für die Ergebnis-Datenbank
 - virtualenv
-Starte den Vorgang mit diesem Befehl:
+Um alle Sites aus [netzbegruenung/green-directory](https://github.com/netzbegruenung/green-directory) zu spidern:
 ```nohighlight
 make spiderjobs
 make spider
 ```
-Das Ergebnis ist die Datei `webapp/dist/data/spider_result.json`. Wenn Du die neuen Daten ins Repository einspielen möchtest, erstelle bitte einen Pull Request.
+Alternativ kann wie im nachfolgenden Beispiel gezeigt das Spidern einer einzelnen URL angestoßen werden. Diese muss nicht zwingend Teil des `green-directory` sein.
 ```nohighlight
 docker run --rm -ti \
  -v $PWD/secrets:/secrets spider \
  spider.py --credentials-path /secrets/datastore-writer.json \
  jobs --url https://www.trittin.de/
 make spider
 ```
 ### Screenshots erstellen
@ -44,16 +55,17 @@ Siehe [green-spider-screenshotter](https://github.com/netzbegruenung/green-spide
 ### Webapp aktualisieren
 Die unter https://netzbegruenung.github.io/green-spider/ veröffentlichte Webapp zeigt den Inhalt des [docs](https://github.com/netzbegruenung/green-spider/tree/master/docs) Verzeichnisses für den `master` Branch dieses repositories an. Dieser kann automatisch neu erzeugt werden.
 Voraussetzungen:
 - npm
- Service-Account JSON-Datei für den Lesezugriff auf Screenshot-Daten
+- Docker
 - Schlüssel mit Leserecht für Screenshot- und Ergebnis-Datenbank
-Um den Inhalt des docs-Verzeichnisses zu aktualisieren, gibt es im Makefile dieses Kommando:
+Die beiden nachfolgenden Kommandos erzeugen die JSON-Exporte der Spider-Ergebnisse
 und Screenshots und aktualisieren die Webapp.
 ```nohighlight
 make export
 make webapp
 ```
--- a/data_export.py
+++ b/data_export.py
@ -1,17 +1,35 @@
 """
 Exports data from the database to JSON files for use in a static webapp
 """
 from google.cloud import datastore
 import json
 import sys
 import os
-def main():
+client = None
    if len(sys.argv) == 1:
        print("Error: please provide path to Google Storage API system account JSON file as argument")
        sys.exit(1)
-    key_path = sys.argv[1]
+def export_results():
-    client = datastore.Client.from_service_account_json(key_path)
+    """
    Export of the main results data
    """
    out = []
    query = client.query(kind='spider-results')
    for entity in query.fetch():
        print(entity.key.name)
        out.append(dict(entity)["results"])
    output_filename = "/out/spider_result.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
 def export_screenshots():
    """
    Export of screenshot meta data
    """
    out = {}
    query = client.query(kind='webscreenshot')
@ -19,10 +37,18 @@ def main():
        print(item['url'], os.path.basename(item['screenshot_url']))
        out[item['url']] = os.path.basename(item['screenshot_url'])
-    output_filename = "./webapp/dist/data/screenshots.json"
+    output_filename = "/out/screenshots.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
 if __name__ == "__main__":
-    main()
+    if len(sys.argv) == 1:
        print("Error: please provide path to Google Storage API system account JSON file as argument")
        sys.exit(1)
    key_path = sys.argv[1]
    client = datastore.Client.from_service_account_json(key_path)
    export_screenshots()
    export_results()
--- a/spider.py
+++ b/spider.py
@ -1,68 +1,175 @@
-# coding: utf8
+"""
 Provides the spider functionality (website checks).
 """
-from bs4 import BeautifulSoup
+import argparse
 from git import Repo
 from multiprocessing import Pool
 from selenium import webdriver
 from socket import gethostbyname_ex
 from urllib.parse import urljoin
 from urllib.parse import urlparse
 import certifi
 import hashlib
 import json
 import logging
 import os
 import random
 import re
 import requests
 import shutil
 import statistics
-import sys
+from datetime import datetime
 from socket import gethostbyname_ex
 from urllib.parse import urljoin
 from urllib.parse import urlparse
 import requests
 import yaml
 from bs4 import BeautifulSoup
 from git import Repo
 from selenium import webdriver
 from google.cloud import datastore
 from google.api_core.exceptions import InvalidArgument
 # configuration
 # number of parallel processes to use for crawling
 concurrency = 1
 # connection timeout for website checks (seconds)
-connect_timeout = 5
+CONNECT_TIMEOUT = 5
 # response timeout for website checks
-read_timeout = 10
+READ_TIMEOUT = 10
 # Git repo for our data
-green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
+GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
 # folder in that repo that holds the data
-green_direcory_data_path = 'data/countries/de'
+GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
-green_directory_local_path = './cache/green-directory'
+GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
-result_path = '/out'
+RESULT_PATH = '/out'
-siteicons_path = '/icons'
+SITEICONS_PATH = '/icons'
 # IP address of the newthinking GCMS server
-gcms_ip = "91.102.13.20"
+GCMS_IP = "91.102.13.20"
 JOB_DATASTORE_KIND = 'spider-jobs'
 RESULTS_DATASTORE_KIND = 'spider-results'
 # end configuration
 DATASTORE_CLIENT = None
 def chunks(the_list, size):
    """
    Split the_list into consecutive slices holding at most ``size``
    items each and yield those slices one after another.
    """
    total = len(the_list)
    yield from (the_list[start:start + size] for start in range(0, total, size))
 def create_jobs(url=None):
    """
    Fill the job database with one spider job per website URL.

    The local green-directory clone is refreshed and every directory
    entry is scanned for URL records of type "WEBSITE". If the url
    argument is given, only URLs equal to it are queued; if the
    directory contains no such URL, a job without regional metadata is
    created for it anyway.

    Jobs are written via DATASTORE_CLIENT as entities of kind
    JOB_DATASTORE_KIND, keyed by URL, in chunks of 300 entities.
    """

    # refresh our local clone of the green directory
    logging.info("Refreshing green-directory clone")
    get_green_directory()

    # build the list of website URLs to run checks for
    logging.info("Processing green-directory")
    input_entries = []
    count = 0

    for entry in dir_entries():

        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
            continue

        for url_entry in entry['urls']:
            try:
                if url_entry['type'] != "WEBSITE":
                    continue
                website_url = url_entry['url']
            except KeyError:
                # A URL record lacking the 'type' or 'url' key raises
                # KeyError on lookup. Log it and go on with the next
                # record instead of aborting the whole job creation.
                logging.error("Error in %s: 'url' key missing (%s)",
                              repr_entry(entry), url_entry)
                continue

            if not website_url:
                continue
            # when a single URL was requested, ignore all others
            if url is not None and website_url != url:
                continue

            input_entries.append({
                "url": website_url,
                "level": entry.get("level"),
                "state": entry.get("state"),
                "district": entry.get("district"),
                "city": entry.get("city"),
            })
            count += 1

    # ensure the passed URL argument is really there, even if not part
    # of the directory.
    if url and count == 0:
        logging.info("Adding job for URL %s which is not part of green-directory", url)
        input_entries.append({
            "url": url,
            "level": None,
            "state": None,
            "district": None,
            "city": None,
        })

    # randomize order, to distribute requests over servers
    logging.debug("Shuffling input URLs")
    random.seed()
    random.shuffle(input_entries)

    # from here on, count tracks the number of jobs written
    count = 0
    logging.info("Writing jobs")

    entities = []

    for entry in input_entries:
        # the URL serves as the entity key name
        key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
        entity = datastore.Entity(key=key)
        entity.update({
            "created": datetime.utcnow(),
            "level": entry["level"],
            "state": entry["state"],
            "district": entry["district"],
            "city": entry["city"],
        })
        entities.append(entity)

    # commit to DB
    for chunk in chunks(entities, 300):
        logging.debug("Writing jobs chunk of length %d", len(chunk))
        DATASTORE_CLIENT.put_multi(chunk)
        count += len(chunk)

    logging.info("Writing jobs done, %s jobs added", count)
 def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
-    if os.path.exists(green_directory_local_path):
+    if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
-        shutil.rmtree(green_directory_local_path)
+        shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
-    Repo.clone_from(green_directory_repo, green_directory_local_path)
+    Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
 def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
-    path = os.path.join(green_directory_local_path, green_direcory_data_path)
+    path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
    for root, _, files in os.walk(path):
        for fname in files:
@ -80,14 +187,14 @@ def repr_entry(entry):
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
-    r = entry['type']
+    ret = entry['type']
    if 'level' in entry:
-        r += "/" + entry['level']
+        ret += "/" + entry['level']
    if 'state' in entry:
-        r += "/" + entry['state']
+        ret += "/" + entry['state']
    if 'district' in entry:
-        r += "/" + entry['district']
+        ret += "/" + entry['district']
-    return r
+    return ret
 def derive_test_hostnames(hostname):
@ -117,24 +224,25 @@ def reduce_urls(urllist):
    that either don't work or lead somewhere else
    """
    targets = set()
-    for u in urllist:
+    for url in urllist:
-        if u['error'] is not None:
+        if url['error'] is not None:
            continue
-        if u['redirects_to'] is not None:
+        if url['redirects_to'] is not None:
-            targets.add(u['redirects_to'])
+            targets.add(url['redirects_to'])
        else:
-            targets.add(u['url'])
+            targets.add(url['url'])
    return sorted(list(targets))
-def normalize_title(s):
+def normalize_title(title):
    """
    Removes garbage from HTML page titles
    """
-    s = s.replace('\u00a0', ' ')
+    title = title.replace(u'\u00a0', ' ')
-    s = s.replace('  ', ' ')
+    title = title.replace('  ', ' ')
-    s = s.strip()
+    title = title.strip()
-    return s
+    return title
 def download_icon(icon_url):
    """
@ -150,10 +258,10 @@ def download_icon(icon_url):
    }
    # Download the icon
-    r = requests.get(icon_url)
+    req = requests.get(icon_url)
-    r.raise_for_status()
+    req.raise_for_status()
-    content_hash = hashlib.md5(r.content).hexdigest()
+    content_hash = hashlib.md5(req.content).hexdigest()
    extension = ""
    file_name = os.path.basename(icon_url)[-1]
@ -161,24 +269,25 @@ def download_icon(icon_url):
        ext = file_name.split(".")[-1]
        if ext != "":
            extension = ext
-    
+
    if extension == "":
        # derive from content type
-        t = r.headers.get('content-type')
+        ctype = req.headers.get('content-type')
        try:
-            extension = default_endings[t]
+            extension = default_endings[ctype]
        except KeyError:
-            logging.error("No file ending defined for icon type '%s'" % t)
+            logging.error("No file ending defined for icon type '%s'", ctype)
            return None
-    
+
    filename = content_hash + "." + extension.lower()
-    path = siteicons_path + os.path.sep + filename
+    path = SITEICONS_PATH + os.path.sep + filename
    with open(path, 'wb') as iconfile:
-        iconfile.write(r.content)
+        iconfile.write(req.content)
    return filename
 def check_responsiveness(url):
    """
    Checks
@ -193,9 +302,9 @@ def check_responsiveness(url):
    # sizes we check for (width, height)
    sizes = (
-        (320,480), # old smartphone
+        (320, 480), # old smartphone
-        (768,1024), # older tablet or newer smartphone
+        (768, 1024), # older tablet or newer smartphone
-        (1024,768), # older desktop or horiz. tablet
+        (1024, 768), # older desktop or horiz. tablet
        (1920, 1080), # Full HD horizontal
    )
@ -218,7 +327,8 @@ def check_responsiveness(url):
    return details
-def check_content(r):
+
 def check_content(req):
    """
    Adds details to check regarding content of the page
@ -227,10 +337,10 @@ def check_content(r):
    """
    result = {}
-    result['encoding'] = r.encoding.lower()
+    result['encoding'] = req.encoding.lower()
-    soup = BeautifulSoup(r.text, 'html.parser')
+    soup = BeautifulSoup(req.text, 'html.parser')
-    result['html'] = r.text
+    result['html'] = req.text
    # page title
    result['title'] = None
@ -245,47 +355,47 @@ def check_content(r):
    result['canonical_link'] = None
    link = soup.find('link', rel='canonical')
    if link:
-        result['canonical_link'] = urljoin(r.url, link.get('href'))
+        result['canonical_link'] = urljoin(req.url, link.get('href'))
    # icon
    result['icon'] = None
-    link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
+    link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
    if link:
-        result['icon'] = urljoin(r.url, link.get('href'))
+        result['icon'] = urljoin(req.url, link.get('href'))
    else:
-        link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
+        link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
        if link:
-            result['icon'] = urljoin(r.url, link.get('href'))
+            result['icon'] = urljoin(req.url, link.get('href'))
    # feed links
    result['feeds'] = []
    rss_links = soup.find_all('link', type='application/rss+xml')
    atom_links = soup.find_all('link', type='application/atom+xml')
-    if len(rss_links) > 0:
+    if rss_links:
-        for l in rss_links:
+        for link in rss_links:
-            result['feeds'].append(urljoin(r.url, l.get('href')))
+            result['feeds'].append(urljoin(req.url, link.get('href')))
-    if len(atom_links) > 0:
+    if atom_links:
-        for l in rss_links:
+        for link in rss_links:
-            result['feeds'].append(urljoin(r.url, l.get('href')))
+            result['feeds'].append(urljoin(req.url, link.get('href')))
    # generator meta tag
    result['generator'] = None
    if head is not None:
        generator = head.select('[name=generator]')
-        if len(generator):
+        if generator:
            result['generator'] = generator[0].get('content')
    # opengraph meta tags
    result['opengraph'] = None
-    og = set()
+    opengraph = set()
    if head is not None:
        for item in head.find_all(property=re.compile('^og:')):
-            og.add(item.get('property'))
+            opengraph.add(item.get('property'))
        for item in head.find_all(itemprop=re.compile('^og:')):
-            og.add(item.get('itemprop'))
+            opengraph.add(item.get('itemprop'))
-        if len(og):
+        if opengraph:
-            result['opengraph'] = sorted(list(og))
+            result['opengraph'] = sorted(list(opengraph))
    return result
@ -298,8 +408,8 @@ def collect_ipv4_addresses(hostname_dict):
    for item in hostname_dict.values():
        if 'ip_addresses' not in item:
            continue
-        for ip in item['ip_addresses']:
+        for ip_addr in item['ip_addresses']:
-            ips.add(ip)
+            ips.add(ip_addr)
    return sorted(list(ips))
@ -310,11 +420,11 @@ def parse_generator(generator):
    generator = generator.lower()
    if 'typo3' in generator:
        return "typo3"
-    elif 'wordpress' in generator:
+    if 'wordpress' in generator:
        return "wordpress"
-    elif 'drupal' in generator:
+    if 'drupal' in generator:
        return "drupal"
-    elif 'joomla' in generator:
+    if 'joomla' in generator:
        return "joomla"
    return generator
@ -328,7 +438,9 @@ def check_site(entry):
    4. Run full check on canonical URL
    """
    headers = {
-        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
                      'AppleWebKit/537.36 (KHTML, like Gecko) ' +
                      'Chrome/65.0.3325.181 green-spider/0.1'
    }
    # all the info we'll return for the site
@ -337,12 +449,13 @@ def check_site(entry):
        'input_url': entry['url'],
        # Meta: Regional and type metadata for the site
        'meta': {
-            'level': entry['level'],
+            'level': entry.get('level'),
-            'state': entry['state'],
+            'state': entry.get('state'),
-            'district': entry['district'],
+            'district': entry.get('district'),
-            'city': entry['city'],
+            'city': entry.get('city'),
        },
-        # Details: All details we collected about the site (which aren't directly related to the report criteria)
+        # Details: All details we collected about the site (which aren't directly
        # related to the report criteria)
        'details': {
            'hostnames': {},
            'ipv4_addresses': [],
@ -375,18 +488,18 @@ def check_site(entry):
    # try to resolve hostnames
    processed_hostnames = {}
-    for hn in hostnames:
+    for hostname in hostnames:
-        processed_hostnames[hn] = {
+        processed_hostnames[hostname] = {
            'resolvable': False,
        }
        try:
-            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
+            hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
-            processed_hostnames[hn]['resolvable'] = True
+            processed_hostnames[hostname]['resolvable'] = True
-            processed_hostnames[hn]['resolved_hostname'] = hostname
+            processed_hostnames[hostname]['resolved_hostname'] = hostname
-            processed_hostnames[hn]['aliases'] = aliases
+            processed_hostnames[hostname]['aliases'] = aliases
-            processed_hostnames[hn]['ip_addresses'] = ip_addresses
+            processed_hostnames[hostname]['ip_addresses'] = ip_addresses
        except:
            pass
@ -398,9 +511,9 @@ def check_site(entry):
    checked_urls = []
    checked_urls_set = set()
-    for hn in processed_hostnames.keys():
+    for hostname in processed_hostnames.keys():
-        item = processed_hostnames[hn]
+        item = processed_hostnames[hostname]
        if not item['resolvable']:
            continue
@ -421,18 +534,19 @@ def check_site(entry):
            }
            try:
-                r = requests.head(record['url'], headers=headers, allow_redirects=True)
+                req = requests.head(record['url'], headers=headers, allow_redirects=True)
-                if r.url == url:
+                if req.url == url:
-                    logging.info("URL: %s - status %s" % (record['url'], r.status_code))
+                    logging.info("URL: %s - status %s", record['url'], req.status_code)
                else:
-                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
+                    logging.info("URL: %s - status %s - redirects to %s", record['url'],
-                    record['redirects_to'] = r.url
+                                 req.status_code, req.url)
-            except Exception as e:
+                    record['redirects_to'] = req.url
            except Exception as exc:
                record['error'] = {
-                    'type': str(type(e)),
+                    'type': str(type(exc)),
-                    'message': str(e),
+                    'message': str(exc),
                }
-                logging.info("URL %s: %s %s" % (url, str(type(e)), e))
+                logging.info("URL %s: %s %s", url, str(type(exc)), exc)
            checked_urls.append(record)
@ -442,7 +556,7 @@ def check_site(entry):
    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['details']['canonical_urls']:
-        logging.info("Downloading URL %s" % check_url)
+        logging.info("Downloading URL %s", check_url)
        check = {
            'url': check_url,
@ -454,37 +568,38 @@ def check_site(entry):
        }
        try:
-            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
+            req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
-            check['status_code'] = r.status_code
+            check['status_code'] = req.status_code
-            check['duration'] = round(r.elapsed.microseconds / 1000)
+            check['duration'] = round(req.elapsed.microseconds / 1000)
            # Content checks
-            if r.status_code < 300:
+            if req.status_code < 300:
-                check['content'] = check_content(r)
+                check['content'] = check_content(req)
            # Responsiveness check
            try:
                check['responsive'] = check_responsiveness(check_url)
-            except Exception as e:
+            except Exception as exc:
-                logging.error("Error when checking responsiveness for '%s': %s" % (check_url, e))
+                logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
-        except requests.exceptions.ConnectionError as e:
+        except requests.exceptions.ConnectionError as exc:
-            logging.error(str(e) + " " + check_url)
+            logging.error(str(exc) + " " + check_url)
            check['error'] = "connection"
-        except requests.exceptions.ReadTimeout as e:
+        except requests.exceptions.ReadTimeout as exc:
-            logging.error(str(e) + " " + check_url)
+            logging.error(str(exc) + " " + check_url)
            check['error'] = "read_timeout"
-        except requests.exceptions.Timeout as e:
+        except requests.exceptions.Timeout as exc:
-            logging.error(str(e) + " " + check_url)
+            logging.error(str(exc) + " " + check_url)
            check['error'] = "connection_timeout"
-        except Exception as e:
+        except Exception as exc:
-            logging.error(str(e) + " " + check_url)
+            logging.error(str(exc) + " " + check_url)
            check['error'] = "unknown"
        result['details']['urlchecks'].append(check)
-    result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
+    result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
                                            key=lambda url: url['url'])
    # collect icons
    icons = set()
@ -492,24 +607,24 @@ def check_site(entry):
        if 'content' not in c:
            continue
        if c['content'] is None:
-            logging.warning("No content for %s" % entry['url'])
+            logging.warning("No content for %s", entry['url'])
            continue
        if c['content']['icon'] is not None:
            icons.add(c['content']['icon'])
    downloaded_icons = set()
    for icon_url in icons:
-        logging.info("Getting icon %s" % icon_url)
+        logging.info("Getting icon %s", icon_url)
        try:
            downloaded_icons.add(download_icon(icon_url))
        except Exception as e:
-            logging.error("Could not download icon: %s" % e)
+            logging.error("Could not download icon: %s", e)
    result['details']['icons'] = sorted(list(downloaded_icons))
    # collect feeds
    feeds = set()
    for c in result['details']['urlchecks']:
        if c['content'] is None:
-            logging.warning("No content for %s" % entry['url'])
+            logging.warning("No content for %s", entry['url'])
            continue
        if 'feeds' in c['content'] and len(c['content']['feeds']):
            for feed in c['content']['feeds']:
@ -543,7 +658,7 @@ def check_site(entry):
            result['details']['cms'] = parse_generator(c['content']['generator'])
            # Qualify certain CMS flavours in more detail
            if result['details']['cms'] == "typo3":
-                if gcms_ip in result['details']['ipv4_addresses']:
+                if GCMS_IP in result['details']['ipv4_addresses']:
                    result['details']['cms'] = "typo3-gcms"
                elif 'typo3-gruene.de' in c['content']['html']:
                    result['details']['cms'] = "typo3-gruene"
@ -555,7 +670,8 @@ def check_site(entry):
            # No generator Tag. Use HTML content.
            if 'Urwahl3000' in c['content']['html']:
                result['details']['cms'] = "wordpress-urwahl"
-            elif 'josephknowsbest' in c['content']['html'] or 'Joseph-knows-best' in c['content']['html']:
+            elif ('josephknowsbest' in c['content']['html'] or
                  'Joseph-knows-best' in c['content']['html']):
                result['details']['cms'] = "wordpress-josephknowsbest"
            elif 'wordpress' in c['content']['html']:
                result['details']['cms'] = "wordpress"
@ -567,7 +683,7 @@ def check_site(entry):
    ### Derive criteria
    # DNS_RESOLVABLE_IPV4
-    if len(result['details']['ipv4_addresses']):
+    if result['details']['ipv4_addresses']:
        result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
    # SITE_REACHABLE
@ -584,8 +700,8 @@ def check_site(entry):
    # WWW_OPTIONAL
    num_hostnames = 0
-    for hn in result['details']['hostnames'].keys():
+    for hostname in result['details']['hostnames'].keys():
-        item = result['details']['hostnames'][hn]
+        item = result['details']['hostnames'][hostname]
        if not item['resolvable']:
            continue
        num_hostnames += 1
@ -600,20 +716,20 @@ def check_site(entry):
    else:
        links = set()
        if result['details']['urlchecks'] is None:
-            logging.warning("No urlchecks for %s" % entry['url'])
+            logging.warning("No urlchecks for %s", entry['url'])
        else:
            for item in result['details']['urlchecks']:
-                if item['content']['canonical_link'] is not None:
+                if item['content'] is not None and item['content']['canonical_link'] is not None:
                    links.add(item['content']['canonical_link'])
        if len(links) == 1:
            result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
    # FAVICON
-    if len(result['details']['icons']):
+    if result['details']['icons']:
        result['result']['FAVICON'] = {'value': True, 'score': 1}
    # FEEDS
-    if len(result['details']['feeds']):
+    if result['details']['feeds']:
        result['result']['FEEDS'] = {'value': True, 'score': 1}
    # HTTP_RESPONSE_DURATION
@ -621,17 +737,18 @@ def check_site(entry):
    for item in result['details']['urlchecks']:
        if item['error'] is None:
            durations.append(item['duration'])
-    val = round(statistics.mean(durations))
+    if durations:
-    result['result']['HTTP_RESPONSE_DURATION']['value'] = val
+        val = round(statistics.mean(durations))
-    if val < 100:
+        result['result']['HTTP_RESPONSE_DURATION']['value'] = val
-        result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
+        if val < 100:
-    elif val < 1000:
+            result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
-        result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
+        elif val < 1000:
            result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
    # RESPONSIVE
    if result['details']['responsive'] is not None:
        if (result['details']['responsive']['min_width'] < 500 and
-            len(result['details']['responsive']['viewport_meta_tag']) > 0):
+                len(result['details']['responsive']['viewport_meta_tag']) > 0):
            result['result']['RESPONSIVE']['value'] = True
            result['result']['RESPONSIVE']['score'] = 1
@ -649,87 +766,91 @@ def check_site(entry):
    return result
-def main():
+def get_job_from_queue():
    """
-    Bringing it all together
+    Takes one job off the queue and returns it as a dict, or None if the
+    queue query yields no entity.
+
+    The returned dict holds the job entity's properties plus a "url" key
+    set to the entity's key name. The entity is deleted from the datastore
+    while the transaction opened here is still active.
    """
-    logging.basicConfig(level=logging.INFO)
+    out = None
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)
-    # refresh our local clone of the green directory
+    with DATASTORE_CLIENT.transaction():
-    get_green_directory()
+        query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
+        # limit=1: at most one job is fetched (and removed) per call
        for entity in query.fetch(limit=1):
            logging.debug("Got job: %s", entity)
            out = dict(entity)
            out["url"] = entity.key.name
            DATASTORE_CLIENT.delete(entity.key)
-    # build the list of website URLs to run checks for
+    return out
    logging.info("Processing green-directory")
    input_entries = []
-    for entry in dir_entries():
+def work_of_queue():
    """
    Take jobs from the queue and process them until there are no more jobs.

    For each job, check_site() is run and the result is written to the
    datastore as an entity of kind RESULTS_DATASTORE_KIND, keyed by the job
    URL. Failures while writing are logged and do not stop the loop.
    """
    while True:
        job = get_job_from_queue()
        if job is None:
            logging.info("No more jobs. Exiting.")
            break
-        if 'type' not in entry:
+        logging.info("Starting job %s", job["url"])
-            logging.error("Entry without type")
+        result = check_site(entry=job)
-            continue
+        #logging.debug(result)
-        if 'urls' not in entry:
+        logging.info("Job %s finished checks", job["url"])
-            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
+        logging.info("Job %s writing to DB", job["url"])
            continue
-        website_url = None
+        key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
-        for n in range(len(entry['urls'])):
+        entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
-            try:
+        record = {
-                if entry['urls'][n]['type'] == "WEBSITE":
+            "created": datetime.utcnow(),
-                    website_url = entry['urls'][n]['url']
+            "results": result,
-                    if website_url:
+        }
-                        input_entries.append({
+        entity.update(record)
-                            "url": website_url,
+        try:
-                            "level": entry.get("level"),
+            DATASTORE_CLIENT.put(entity)
-                            "state": entry.get("state"),
+        except InvalidArgument as ex:
-                            "district": entry.get("district"),
+            logging.error("Could not write result: %s", ex)
-                            "city": entry.get("city"),
+        # Catch-all: "except ex:" would look up the unbound name `ex` as the
+        # exception class and raise NameError instead of logging the failure.
+        except Exception as ex:
-                        })
+            logging.error("Could not write result: %s", ex)
            except NameError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
    # randomize order, to distribute requests over servers
    logging.info("Shuffling input URLs")
    random.seed()
    random.shuffle(input_entries)
    # run checks
    logging.info("Starting checks")
    results = {}
    pool = Pool(concurrency)
    for ientry in input_entries:
        logging.info("Submitting %s to job pool" % ientry['url'])
        results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
    pool.close()
    pool.join()
    logging.info("Checks are finished")
    # Restructure result from dict of ApplyResult
    # to list of dicts and sort in stable way
    json_result = []
    done = set()
    logging.info("Restructuring results")
    # convert results from ApplyResult to dict
    for url in sorted(results.keys()):
        if url not in done:
            logging.info("Getting result for %s" % url)
            try:
                resultsitem = results[url].get()
                json_result.append(resultsitem)
            except Exception as e:
                logging.error("Error getting result for '%s': %s" % (url, e))
        done.add(url)
    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
 if __name__ == "__main__":
-    main()
+    """
    Bringing it all together
    """
    # Command line: global options plus the sub-commands 'spider' and 'jobs'
    parser = argparse.ArgumentParser()
    parser.add_argument('--credentials-path', dest='credentials_path',
                        help='Path to the service account credentials JSON file',
                        default='/secrets/service-account.json')
    parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
                        default='info')
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')
    subparsers.add_parser('spider', help='Take jobs off the queue and spider')
    jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
    jobs_parser.add_argument('--url', help='Add a job to spider a URL')
    args = parser.parse_args()
    # Map --loglevel (case-insensitive) to a logging level; any value other
    # than error/warn/debug falls back to INFO
    loglevel = args.loglevel.lower()
    if loglevel == 'error':
        logging.basicConfig(level=logging.ERROR)
    elif loglevel == 'warn':
        logging.basicConfig(level=logging.WARN)
    elif loglevel == 'debug':
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        loglevel = 'info'
    # Only CRITICAL messages from urllib3 are let through
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)
    # Module-level datastore client, referenced by name in get_job_from_queue()
    # and work_of_queue()
    DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
    logging.debug("Called command %s", args.command)
    # 'jobs' fills the queue; any other command value works the queue off
    if args.command == 'jobs':
        create_jobs(args.url)
    else:
        work_of_queue()
--- a/webapp/src/index.js
+++ b/webapp/src/index.js
@ -55,7 +55,7 @@ $(function(){
        // IPs
        var ips = _.join(item.details.ipv4_addresses, ', ');
-        row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center" data-order="' + (ips === '' ? no : ips) + '"><span class="tt" title="IPv4-Adresse(n) des Servers bzw. der Server">' + (ips === '' ? no : ips) + '</span></td>');
+        row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center" data-order="' + ips + '"><span class="tt" title="IPv4-Adresse(n) des Servers bzw. der Server">' + (ips === '' ? no : ips) + '</span></td>');
        // SITE_REACHABLE
        var reachable = '<span class="tt" title="Die Site war beim Check erreichbar.">' + yes + '</span>';
@ -65,10 +65,14 @@ $(function(){
        row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center" data-order="'+ (item.result.SITE_REACHABLE.value ? '1' : '0') +'">' + reachable + '</td>');
        // HTTP_RESPONSE_DURATION
-        var durationClass = 'bad';
+        if (!item.result.SITE_REACHABLE.value || item.result.HTTP_RESPONSE_DURATION.value === null) {
-        if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
+          row.append('<td class="text bad text-center" data-order="99999999"><span class="tt" title="Nicht anwendbar">' + no + '</span></td>');
-        if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
+        } else {
-        row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '"><span class="tt" title="Dauer, bis der Server die Seitenanfrage beantwortet. Unter 100 ms ist sehr gut. Unter 1 Sekunde ist okay.">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</span></td>');
+          var durationClass = 'bad';
          if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
          if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
          row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '"><span class="tt" title="Dauer, bis der Server die Seitenanfrage beantwortet. Unter 100 ms ist sehr gut. Unter 1 Sekunde ist okay.">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</span></td>');
        }
        // FAVICON
        var icon = item.result.FAVICON.value && (item.details.icons[0] != null);
@ -103,7 +107,7 @@ $(function(){
        // screenshots
        var screenshot = false;
-        if (item.details.canonical_urls.length > 0) {
+        if (item.details.canonical_urls && item.details.canonical_urls.length > 0) {
          if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
            var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
            var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];