Job management with RQ, and much more (#149)

* CLI: remove 'jobs' command, add 'manager'

* Add job definition

* Move jobs to manage folder

* Rename jobs to manager

* Add rq and redis dependencies

* Add docker-compose YAML

* Downgrade to alpine 3.8

* Adjust paths in Dockerfile, remove entrypoint

* Rename 'make spiderjobs' to 'make jobs'

* Fix docker execution

* Adapt 'make jobs'

* Fix metadata scheme

* Add docker dependency

* Randomize queue (a bit)

* Use latest image, remove debug output

* Make docker-compose file downwards-compatible

* Use latest instead of dev image tag

* Update docker-compose.yaml

* Adapt job start script

* Fix redis connection in manager

* Add support for increasing timeout via environment variable

* Adapt load_in_browser to cookies table schema change

* Fix execution

* Mitigate yaml warning

* Bump some dependency versions

* Report resource usage stats for each job

* checks/load_in_browser: Return DOM size, prevent multiple page loads

* Update .dockerignore

* Code update

* Script update

* Update README.md

* WIP

* WIP commit

* Update Dockerfile to alpine:edge and chromium v90

* Update TestCertificateChecker

* Set defaults for __init__ function

* Detect sunflower theme

* Update unit test for new datetime (zero-basing)

* Set logging prefs from Chromium in a new way

* Move datastore client instantiation

As it is not needed for all commands

* Change green-directory repository URL

* Add git settings for cloning green-directory

* Pin alpine version 3.14, fix py3-cryptography

* Use plain docker build progress output

* Add volumes to 'make test' docker run command

* Fix bug

* Update example command in README

* Update dependencies

* Add creation of Kubernetes jobs
Marian Steinbach 2021-11-11 20:15:43 +01:00 committed by GitHub
parent e59b05fc6c
commit 618e29d763
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
30 changed files with 982 additions and 150 deletions
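In short: instead of writing spider jobs to Datastore, the manager now fills a Redis-backed RQ queue with one job per site, and a worker executes job.run() for each entry, which in turn launches a spider container. A minimal sketch of that flow, using the queue name, Redis URL and function path from the diffs below (the job payload shown here is illustrative):

import redis
from rq import Queue

# Connect to the same Redis instance that the manager and the worker use.
redis_conn = redis.from_url("redis://redis:6379/0")
queue = Queue('low', connection=redis_conn)

# Enqueue one spider job. An `rq worker low` process imports job.py and
# calls job.run(job=...) with this payload. The payload below is made up.
queue.enqueue('job.run',
              job_timeout='300s',
              kwargs={'job': {
                  'url': 'https://www.example.com/',
                  'type': 'REGIONAL_CHAPTER',
                  'level': 'DE:ORTSVERBAND',
                  'state': None, 'district': None, 'city': None,
              }})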

View File

@@ -1,6 +1,6 @@
 .git
-webapp
 docs
+/screenshots
 secrets
 temp
 venv

3
.gitignore vendored
View File

@@ -5,4 +5,5 @@ temp
 __pycache__
 .vscode/settings.json
 kubernetes/green-spider-secret.yaml
 /volumes
+/screenshots

View File

@@ -1,23 +1,26 @@
-FROM python:3.7-alpine3.9
+FROM alpine:3.14
 WORKDIR /workdir
 ADD requirements.txt /workdir/
-RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
-    echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
-    apk update && \
-    apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
-    pip3 install --upgrade pip && \
-    pip3 install -r requirements.txt && \
-    apk del python3-dev build-base
-ADD cli.py /
-ADD config /config
-ADD jobs /jobs
-ADD checks /checks
-ADD rating /rating
-ADD spider /spider
-ADD export /export
-ENTRYPOINT ["python3", "/cli.py"]
+RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
+    echo "http://dl-4.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
+    apk --update --no-cache add ca-certificates chromium chromium-chromedriver \
+    python3-dev py3-grpcio py3-wheel py3-pip py3-lxml \
+    build-base git libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
+    pip install -r requirements.txt && \
+    apk del build-base
+# As alpine's py3-cryptography did not work as of alpine v3.14, we use this hack from
+# https://github.com/pyca/cryptography/issues/3344#issuecomment-650845512
+RUN LDFLAGS="-L/opt/openssl/lib -Wl,-rpath,/opt/openssl/lib" CFLAGS="-I/opt/openssl/include" pip3 install -U cryptography
+ADD cli.py /workdir/
+ADD manager /workdir/manager
+ADD config /workdir/config
+ADD checks /workdir/checks
+ADD rating /workdir/rating
+ADD spider /workdir/spider
+ADD export /workdir/export
+ADD job.py /workdir/

View File

@@ -6,16 +6,17 @@ DB_ENTITY := spider-results
 # Build docker image
 dockerimage:
-	docker build -t $(IMAGE) .
+	docker build --progress plain -t $(IMAGE) .
 
-# Create spider job queue
-spiderjobs:
+# Fill the queue with spider jobs, one for each site.
+jobs:
 	docker run --rm -ti \
 	  -v $(PWD)/secrets:/secrets \
 	  $(IMAGE) \
-	  --credentials-path /secrets/datastore-writer.json \
-	  --loglevel debug \
-	  jobs
+	  python cli.py \
+	  --credentials-path /secrets/datastore-writer.json \
+	  --loglevel debug \
+	  manager
 
 # Run spider in docker image
 spider:
@@ -41,6 +42,9 @@ export:
 # run spider tests
 test:
 	docker run --rm -ti \
+	  -v $(PWD)/volumes/dev-shm:/dev/shm \
+	  -v $(PWD)/secrets:/secrets \
+	  -v $(PWD)/screenshots:/screenshots \
 	  -v $(PWD)/volumes/chrome-userdir:/opt/chrome-userdir \
 	  --entrypoint "python3" \
 	  $(IMAGE) \

View File

@@ -32,7 +32,7 @@ Alle Informationen zum Betrieb befinden sich im Verzeichnis [devops](https://git
 Green Spider ist in Python 3 geschrieben und wird aktuell unter 3.6 getestet und ausgeführt.
 
-Aufgrund zahlreicher Dependencies empfiehlt es sich, den Spider Code lokal in Docker
+Aufgrund zahlreicher Abhängigkeiten empfiehlt es sich, den Spider Code lokal in Docker
 auszuführen.
 
 Das Image wird über den folgenden Befehl erzeugt:
@@ -57,18 +57,19 @@ Am einfachsten geht das über den `make spider` Befehl, so:
 make spider ARGS="--url http://www.example.com/"
 ```
 
-Ohne `ARGS` aufgerufen, arbeitet der Spider eine Jobliste ab. Dies erfordert Zugriff auf die entsprechende Datenank.
+Ohne `ARGS` aufgerufen, arbeitet der Spider eine Jobliste ab. Dies erfordert Zugriff auf die entsprechende Datenbank.
 
 Wenn nur eine einzelne Site gespidert werden soll, die Ergebnisse aber in die Datenbank geschrieben werden sollen, kann der Spider so mit `--job` und einem JSON-Object aufgerufen werden (Beispiel):
 
-```
+```nohighlight
 docker run --rm -ti \
   -v $(pwd)/volumes/dev-shm:/dev/shm \
   -v $(pwd)/secrets:/secrets \
+  -v $(pwd)/screenshots:/screenshots \
   -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
   --shm-size=2g \
-  quay.io/netzbegruenung/green-spider:latest \
+  quay.io/netzbegruenung/green-spider:latest python3 cli.py \
   --credentials-path /secrets/datastore-writer.json \
   --loglevel debug \
-  spider --job '{"url": "https://xn--grne-porta-westfalica-9hc.de/", "meta": {"city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}}'
+  spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
 ```

View File

@@ -54,16 +54,27 @@ def perform_checks(input_url):
     results = {}
 
+    # TODO:
+    # Set screenshot_bucket_name and storage_credentials_path
+    # based on flags.
     config = Config(urls=[input_url],
                     user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
                                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
-                               'Safari/537.36 green-spider/0.2')
+                               'Safari/537.36 green-spider/0.2',
+                    screenshot_bucket_name='green-spider-screenshots.sendung.de',
+                    screenshot_datastore_kind='webscreenshot',
+                    storage_credentials_path='/secrets/screenshots-uploader.json',
+                    datastore_credentials_path='/secrets/datastore-writer.json')
 
+    # Iterate over all checks.
     for check_name, check in check_modules:
+        # checker is the individual test/assertion handler we instantiate
+        # for each check step.
         checker = check.Checker(config=config,
                                 previous_results=results)
 
-        # see if dependencies are met
+        # Ensure that dependencies are met for the checker.
         dependencies = checker.depends_on_results()
         if dependencies != []:
             for dep in dependencies:
@@ -71,10 +82,16 @@ def perform_checks(input_url):
                     logging.debug("Skipping check %s as dependency %s is not met" % (check_name, dep))
                     continue
 
+        # Execute the checker's main function.
         result = checker.run()
         results[check_name] = result
 
-        # update config for the next check
+        # Execute any cleanup/aftermath function (if given) for the checker.
+        modified_results = checker.post_hook(result)
+        if modified_results is not None:
+            results[check_name] = modified_results
+
+        # Update config for the next check(s) in the sequence.
         config = checker.config
         logging.debug("config after check %s: %r" % (check_name, config))

View File

@@ -21,6 +21,20 @@ class AbstractChecker(object):
         """Executes the check routine, returns result dict"""
         raise NotImplementedError()
 
+    def post_hook(self, result):
+        """
+        Optional function to execute after run(). Can be used to post-process
+        results data. Should be defined by the implementing checker.
+
+        Params:
+            result: Result data from the run() function.
+
+        Returns:
+            Dict: Modified results data
+            None: Means that nothing has been done, so should be ignored.
+        """
+        return None
+
     @property
     def config(self):
         return self._config
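To illustrate the new hook: a hypothetical checker (not part of this commit) could override post_hook() to strip bulky data from its own run() output before results are stored; perform_checks() replaces the stored result whenever post_hook() returns something other than None.

from checks.abstract_checker import AbstractChecker

class Checker(AbstractChecker):
    """Hypothetical example checker that post-processes its own results."""

    def run(self):
        # Pretend we collected a large payload for each configured URL.
        return {url: {'payload': 'x' * 10000, 'size': 10000} for url in self.config.urls}

    def post_hook(self, result):
        # Drop the bulky payload; returning the modified dict makes
        # perform_checks() store this version instead of the raw result.
        for url in result:
            del result[url]['payload']
        return result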

View File

@@ -14,7 +14,7 @@ class TestCertificateChecker(unittest.TestCase):
         result = checker.run()
         self.assertIn(url, result)
         self.assertIsNone(result[url]['exception'])
-        self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services')
+        self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services LLC')
 
     def test_kaarst(self):
         """Real-workd example"""
@@ -24,7 +24,7 @@ class TestCertificateChecker(unittest.TestCase):
         result = checker.run()
         self.assertIn(url, result)
         self.assertIsNone(result[url]['exception'])
-        self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited')
+        self.assertEqual(result[url]['issuer']['O'], 'DigiCert Inc')
 
     def test_tls_v_1_0(self):
         """Load a certificate for a TLS v1.0 server"""

View File

@@ -3,9 +3,19 @@ class Config(object):
     Our configuration to be passed to checks
     """
 
-    def __init__(self, urls, user_agent='green-spider/1.0'):
+    def __init__(self,
+                 urls,
+                 screenshot_bucket_name='',
+                 screenshot_datastore_kind='',
+                 storage_credentials_path='',
+                 datastore_credentials_path='',
+                 user_agent='green-spider/1.0'):
         self._urls = set(urls)
         self._user_agent = user_agent
+        self._screenshot_bucket_name = screenshot_bucket_name
+        self._screenshot_datastore_kind = screenshot_datastore_kind
+        self._storage_credentials_path = storage_credentials_path
+        self._datastore_credentials_path = datastore_credentials_path
 
     def __repr__(self):
         return "Config(urls=%r)" % self._urls
@@ -27,3 +37,19 @@ class Config(object):
     @property
     def user_agent(self):
         return self._user_agent
+
+    @property
+    def screenshot_bucket_name(self):
+        return self._screenshot_bucket_name
+
+    @property
+    def storage_credentials_path(self):
+        return self._storage_credentials_path
+
+    @property
+    def datastore_credentials_path(self):
+        return self._datastore_credentials_path
+
+    @property
+    def screenshot_datastore_kind(self):
+        return self._screenshot_datastore_kind

View File

@@ -75,6 +75,9 @@ class Checker(AbstractChecker):
             elif ('Urwahl3000' in page_content['content'] or
                   '/themes/urwahl3000' in page_content['content']):
                 generator = 'wordpress-urwahl'
+
+            elif ('/themes/sunflower' in page_content['content']):
+                generator = 'wordpress-sunflower'
 
             elif ('/themes/sunflower' in page_content['content']):
                 generator = 'wordpress-sunflower'

View File

@@ -60,15 +60,13 @@ class TestFeed(unittest.TestCase):
         result = checker.run()
         pprint(result)
-        self.assertEqual(result, {
-            'http://example.com/feed.xml': {
-                'exception': None,
-                'title': 'Liftoff News',
-                'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
-                'first_entry': datetime(2003, 5, 30, 11, 6, 42),
-                'average_interval': 340359,
-                'num_entries': 2,
-            }
+        self.assertEqual(result['http://example.com/feed.xml'], {
+            'exception': None,
+            'average_interval': 340359,
+            'first_entry': datetime(2003, 5, 30, 11, 6, 42),
+            'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
+            'num_entries': 2,
+            'title': 'Liftoff News',
         })

View File

@ -9,29 +9,38 @@ Information includes:
- what cookies are set during loading the page - what cookies are set during loading the page
""" """
from datetime import datetime
import hashlib
import logging import logging
import math import math
import os
import shutil import shutil
import time import time
import sqlite3 import sqlite3
import json
from selenium import webdriver from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import tenacity import tenacity
from google.cloud import storage
from google.cloud import datastore
from checks.abstract_checker import AbstractChecker from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker): class Checker(AbstractChecker):
page_load_timeout = 30 page_load_timeout = 120
# sizes we check for (width, height) # sizes we check for (width, height)
sizes = ( sizes = (
(360, 640), # rather old smartphone
(768, 1024), # older tablet or newer smartphone
(1024, 768), # older desktop or horiz. tablet
(1920, 1080), # Full HD horizontal (1920, 1080), # Full HD horizontal
(1500, 1500), # useful window size we also use for the main screenshot
(1024, 768), # older desktop or horiz. tablet
(768, 1024), # older tablet or newer smartphone
(360, 640), # rather old smartphone
) )
def __init__(self, config, previous_results=None): def __init__(self, config, previous_results=None):
@ -39,22 +48,50 @@ class Checker(AbstractChecker):
# Our selenium user agent using Chrome headless as an engine # Our selenium user agent using Chrome headless as an engine
chrome_options = webdriver.ChromeOptions() chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('enable-automation')
chrome_options.add_argument('--headless') chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu') chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--dns-prefetch-disable')
chrome_options.add_argument('--disable-extensions') chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--disk-cache-size=0')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--verbose')
chrome_options.page_load_strategy = 'normal'
# path where to get cookies from # path where to get cookies from
chrome_options.add_argument("--user-data-dir=/opt/chrome-userdir") chrome_options.add_argument("--user-data-dir=/opt/chrome-userdir")
# mobile_emulation = {
# "deviceMetrics": { "width": 360, "height": 640, "pixelRatio": 3.0 },
# "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"
# }
#mobile_emulation = { "deviceName": "Nexus 5" }
#chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
# empty /opt/chrome-userdir # empty /opt/chrome-userdir
shutil.rmtree('/opt/chrome-userdir', ignore_errors=True) shutil.rmtree('/opt/chrome-userdir', ignore_errors=True)
self.driver = webdriver.Chrome(options=chrome_options) # activate performance logging (includes network logging)
capabilities = DesiredCapabilities.CHROME
capabilities['goog:loggingPrefs'] = {'performance': 'ALL'}
# TODO: also do this
# (from https://stackoverflow.com/questions/60375633/capture-logs-from-chrome-during-test-is-running-python#comment106827817_60385493)
capabilities['loggingPrefs'] = {'performance': 'ALL'}
self.driver = webdriver.Chrome(options=chrome_options, desired_capabilities=capabilities)
self.driver.set_page_load_timeout(self.page_load_timeout) self.driver.set_page_load_timeout(self.page_load_timeout)
def run(self): # We capture the browser engine's user agent string
# for the record.
self.user_agent = self.driver.execute_script("return navigator.userAgent;")
def run(self):
"""
Main function of this check.
"""
results = {} results = {}
for url in self.config.urls: for url in self.config.urls:
@ -64,15 +101,22 @@ class Checker(AbstractChecker):
'min_document_width': None, 'min_document_width': None,
'logs': None, 'logs': None,
'font_families': None, 'font_families': None,
'performance_log': [],
'screenshots': [],
} }
# responsive check self.driver.get(url)
# Responsive layout check and screenshots.
try: try:
sizes = self.check_responsiveness(url) check_responsiveness_results = self.check_responsiveness(url)
results[url] = { results[url] = {
'sizes': sizes, 'sizes': check_responsiveness_results['sizes'],
'min_document_width': min([s['document_width'] for s in sizes]), 'min_document_width': min([s['document_width'] for s in check_responsiveness_results['sizes']]),
'dom_size': self.get_dom_size(),
'logs': self.capture_log(), 'logs': self.capture_log(),
'performance_log': [],
'screenshots': check_responsiveness_results['screenshots'],
} }
except TimeoutException as e: except TimeoutException as e:
logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e)) logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
@ -81,6 +125,7 @@ class Checker(AbstractChecker):
logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re)) logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
pass pass
# Scroll page to bottom, to load all lazy-loading resources.
try: try:
self.scroll_to_bottom() self.scroll_to_bottom()
except TimeoutException as e: except TimeoutException as e:
@ -112,6 +157,7 @@ class Checker(AbstractChecker):
logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e)) logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
pass pass
# Process cookies.
try: try:
results[url]['cookies'] = self.get_cookies() results[url]['cookies'] = self.get_cookies()
except TimeoutException as e: except TimeoutException as e:
@ -120,10 +166,79 @@ class Checker(AbstractChecker):
except tenacity.RetryError as re: except tenacity.RetryError as re:
logging.warn("RetryError when collecting cookies for %s: %s" % (url, re)) logging.warn("RetryError when collecting cookies for %s: %s" % (url, re))
pass pass
for logentry in self.driver.get_log('performance'):
decoded_logentry = json.loads(logentry['message'])
results[url]['performance_log'].append(decoded_logentry)
self.driver.quit() self.driver.quit()
return results return results
def post_hook(self, result):
"""
Logic executed after run() is done.
Used to upload screenshots and metadata to cloud storage and datastore.
"""
# Upload screenshots and metadata
logging.debug("load_in_browser post_hook 1 - Creating client")
storage_client = storage.Client.from_service_account_json(self.config.storage_credentials_path)
bucket = storage_client.get_bucket(self.config.screenshot_bucket_name)
datastore_client = datastore.Client.from_service_account_json(self.config.datastore_credentials_path)
exclude_from_indexes = ['size', 'screenshot_url', 'user_agent']
for url in result.keys():
for screenshot in result[url]['screenshots']:
# Upload one screenshot
try:
local_file = '%s/%s' % (screenshot['folder'], screenshot['filename'])
logging.debug("Handling screenshot file %s" % local_file)
if not os.path.exists(screenshot['local_path']):
logging.warning("No screenshot created: size=%s, url='%s'" % (screenshot['size'], screenshot['url']))
continue
logging.debug("Uploading %s to %s/%s" % (screenshot['local_path'], screenshot['folder'], screenshot['filename']))
with open(screenshot['local_path'], 'rb') as my_file:
# Create new blob in remote bucket
blob = bucket.blob(local_file)
blob.upload_from_file(my_file, content_type="image/png")
blob.make_public()
except Exception as e:
logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
continue
try:
os.remove(screenshot['local_path'])
except:
pass
# Write metadata for one screenshot
data = {
'url': screenshot['url'],
'size': screenshot['size'],
'screenshot_url': screenshot['screenshot_url'],
'user_agent': screenshot['user_agent'],
'created': screenshot['created'],
}
try:
key = datastore_client.key(self.config.screenshot_datastore_kind, screenshot['screenshot_url'])
entity = datastore.Entity(key=key, exclude_from_indexes=exclude_from_indexes)
entity.update(data)
datastore_client.put(entity)
logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url'])
except Exception as e:
logging.warn("Error in %s: %s" % (screenshot['url'], e))
# Remove screenshots part from results
del result[url]['screenshots']
return result
def get_cookies(self): def get_cookies(self):
# read cookie DB to get 3rd party cookies, too # read cookie DB to get 3rd party cookies, too
@ -131,7 +246,7 @@ class Checker(AbstractChecker):
db = sqlite3.connect('/opt/chrome-userdir/Default/Cookies') db = sqlite3.connect('/opt/chrome-userdir/Default/Cookies')
db.row_factory = sqlite3.Row db.row_factory = sqlite3.Row
c = db.cursor() c = db.cursor()
c.execute("SELECT creation_utc, host_key, name, path, expires_utc, is_secure, is_httponly, has_expires, is_persistent, firstpartyonly FROM cookies") c.execute("SELECT creation_utc, host_key, name, path, expires_utc, is_secure, is_httponly, has_expires, is_persistent FROM cookies")
for row in c.fetchall(): for row in c.fetchall():
cookies.append(dict(row)) cookies.append(dict(row))
c.close() c.close()
@ -142,11 +257,13 @@ class Checker(AbstractChecker):
@tenacity.retry(stop=tenacity.stop_after_attempt(3), @tenacity.retry(stop=tenacity.stop_after_attempt(3),
retry=tenacity.retry_if_exception_type(TimeoutException)) retry=tenacity.retry_if_exception_type(TimeoutException))
def check_responsiveness(self, url): def check_responsiveness(self, url):
result = [] result = {
'sizes': [],
'screenshots': [],
}
# set window to the first size initially # set window to the first size initially
self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1]) self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1])
self.driver.get(url)
for (width, height) in self.sizes: for (width, height) in self.sizes:
self.driver.set_window_size(width, height) self.driver.set_window_size(width, height)
@ -155,13 +272,44 @@ class Checker(AbstractChecker):
time.sleep(1.0) time.sleep(1.0)
doc_width = self.driver.execute_script("return document.body.scrollWidth") doc_width = self.driver.execute_script("return document.body.scrollWidth")
result.append({ result['sizes'].append({
'viewport_width': width, 'viewport_width': width,
'document_width': int(doc_width), 'document_width': int(doc_width),
}) })
# Make screenshot
urlhash = hashlib.md5(bytearray(url, 'utf-8')).hexdigest()
folder = "%sx%s" % (width, height)
abs_folder = "/screenshots/%s" % folder
os.makedirs(abs_folder, exist_ok=True)
filename = urlhash + '.png'
abs_filepath = "%s/%s" % (abs_folder, filename)
created = datetime.utcnow()
success = self.driver.save_screenshot(abs_filepath)
if not success:
logging.warn("Failed to create screenshot %s" % abs_filepath)
continue
result['screenshots'].append({
'local_path': abs_filepath,
'folder': folder,
'filename': filename,
'url': url,
'size': [width, height],
'screenshot_url': 'http://%s/%s/%s' % (
self.config.screenshot_bucket_name, folder, filename),
'user_agent': self.user_agent,
'created': created,
})
return result return result
def get_dom_size(self):
dom_length = self.driver.execute_script("return document.getElementsByTagName('*').length")
return int(dom_length)
def capture_log(self): def capture_log(self):
""" """
Returns log elements with level "SEVERE" or "WARNING" Returns log elements with level "SEVERE" or "WARNING"

19
cli.py
View File

@@ -19,7 +19,7 @@ def handle_sigint(signum, frame):
 if __name__ == "__main__":
 
-    signal.signal(signal.SIGINT,handle_sigint)
+    signal.signal(signal.SIGINT, handle_sigint)
 
     parser = argparse.ArgumentParser()
 
@@ -40,9 +40,9 @@ if __name__ == "__main__":
     spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
     spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')
 
-    # jobs subcommand
-    jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
-    jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
+    # manager subcommand
+    manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
+    manager_parser.add_argument('--url', help='Add a job to spider a specific URL')
 
     # export subcommand
     export_parser = subparsers.add_parser('export', help='Export JSON data')
@@ -68,20 +68,21 @@ if __name__ == "__main__":
     logging.debug("Called command %s", args.command)
 
-    datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
-
-    if args.command == 'jobs':
-        import jobs
-        jobs.create_jobs(datastore_client, args.url)
+    if args.command == 'manager':
+        import manager
+        manager.create_jobs(args.url)
 
     elif args.command == 'export':
         import export
+        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
         export.export_results(datastore_client, args.kind)
 
     else:
        from spider import spider
+       datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
        if args.url:
            # spider one URL for diagnostic purposes
            spider.test_url(args.url)

View File

@@ -7,7 +7,7 @@ CONNECT_TIMEOUT = 5
 READ_TIMEOUT = 10
 
 # Git repo for our data
-GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
+GREEN_DIRECTORY_REPO = 'https://git.verdigado.com/NB-Public/green-directory.git'
 
 # folder in that repo that holds the data
 GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
@@ -15,9 +15,12 @@ GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
 # folder we use locally to clone the repo
 GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
 
-# IP address of the newthinking GCMS server
+# IP address of the verdigado GCMS server
 GCMS_IP = "194.29.234.123"
 
 # kind name of the spider job key datastore entities
 JOB_DATASTORE_KIND = 'spider-jobs'
+
+K8S_JOBS_PATH = './k8s-jobs'
+K8S_JOB_TEMPLATE = './manager/job_template.yaml'
+K8S_JOB_BATCH_SIZE = 10

View File

@@ -50,7 +50,7 @@ devops/ssh.sh
 Hostname: `green-spider.netzbegruenung.de`
 
-```
+```shell
 docker-compose stop webapp
 
 docker run -it --rm -p 443:443 -p 80:80 --name certbot \
   -v /etc/letsencrypt:/etc/letsencrypt \

View File

@@ -127,13 +127,11 @@ ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF
   echo ""
   echo "Install docker"
-  apt-get install -y docker-ce
+  apt-get install -y docker-ce docker-compose
 
   mkdir /root/secrets
 
 EOF
 
-echo "Done with remote setup."
-
 if [[ $1 == "screenshotter" ]]; then
   ### screenshotter
@ -149,6 +147,41 @@ if [[ $1 == "screenshotter" ]]; then
  -v /root/secrets:/secrets \
  quay.io/netzbegruenung/green-spider-screenshotter
elif [[ $1 == "spider-new" ]]
then
# Some dependencies specific to this task
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP apt-get install -y python3-pip build-essential
# Upload some files
scp -o StrictHostKeyChecking=no -q secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
scp -o StrictHostKeyChecking=no -q docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml
scp -o StrictHostKeyChecking=no -q requirements.txt root@$SERVER_IP:/root/requirements.txt
scp -o StrictHostKeyChecking=no -q job.py root@$SERVER_IP:/root/job.py
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP pip3 install -r requirements.txt
# Bring up redis for the queue
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose pull redis
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose up -d redis
sleep 5
# Bring up queue manager
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose pull manager
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose up manager
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq info --url redis://localhost:6379/0
# Start worker and work off the queue once
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq worker --burst high default low --url redis://localhost:6379/0
# Re-queue failed jobs once, then re-execute.
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq requeue --queue low -u redis://localhost:6379 --all
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq info --url redis://localhost:6379/0
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq worker --burst high default low --url redis://localhost:6379/0
echo "Done with queued jobs."
else
  ### spider

View File

@ -1,16 +0,0 @@
#!/bin/bash
# Log in to webapp server via SSH
API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
source $API_TOKEN_SECRET
source devops/functions.bash
get_ip
echo "Use this command for SSH access:"
echo "ssh -o StrictHostKeyChecking=no root@${IP_IP}"
ssh -o StrictHostKeyChecking=no root@${IP_IP}

51
docker-compose.yaml Normal file
View File

@ -0,0 +1,51 @@
version: "2"
services:
redis:
image: redis:5-alpine
command: redis-server --save "" --appendonly no
volumes:
- ${PWD}/volumes/redis-data:/data
restart: unless-stopped
networks:
- internal_network
- external_network
ports:
- "6379:6379"
# manager manages the job queue.
manager:
image: quay.io/netzbegruenung/green-spider:latest
command: >
python3 cli.py
--credentials-path /secrets/datastore-writer.json
--loglevel debug manager
environment:
REDIS_URL: redis://redis:6379/0
GIT_USERNAME: ${GIT_USERNAME}
GIT_PASSWORD: ${GIT_PASSWORD}
volumes:
- ${PWD}/secrets:/secrets
networks:
- internal_network
- external_network
depends_on:
- redis
dashboard:
image: eoranged/rq-dashboard:v0.6.1
environment:
RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
networks:
- internal_network
- external_network
ports:
- "9181:9181"
depends_on:
- redis
networks:
internal_network:
internal: true
external_network:
internal: false

147
job.py Normal file
View File

@ -0,0 +1,147 @@
"""
Dieses Script wird vom RQ worker ausgeführt, um einen einzelnen Job aus der
Spider-Warteschlange abzuarbeiten.
"""
import json
import os
from datetime import datetime
import time
import logging
import docker
from google.cloud import datastore
# Maximum oper-job runtime in seconds. This can be increased for second, third attempt
# via the environment JOB_TIMEOUT variable.
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
CREDENTIALS_PATH = '/secrets/datastore-writer.json'
client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)
pwd = os.path.abspath(".")
secrets_path = pwd + "/secrets"
chromedir_path = pwd + "/volumes/chrome-userdir"
screenshots_path = pwd + "/screenshots"
volumes = {}
volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'}
volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'}
volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'}
logger = logging.getLogger('rq.worker')
logger.setLevel(logging.DEBUG)
def run(job):
"""
Runs a spider container with the given job.
Returns the container logs. If the execution takes longer than the
duration defined by the JOB_TIMEOUT environment variable (in seconds),
the container gets killed.
"""
cmd_template = ("python cli.py --credentials-path={path} "
" --loglevel=debug "
" spider "
" --job='{job_json}'")
cmd = cmd_template.format(path=CREDENTIALS_PATH,
job_json=json.dumps(job))
container = client.containers.run(image=DOCKER_IMAGE,
command=cmd,
detach=True,
remove=True,
shm_size='2G',
stdout=True,
stderr=True,
tty=False,
volumes=volumes)
id = container.id
# Data about this spider run, to be written to datastore
key = datastore_client.key('spider-runs')
entity = datastore.Entity(key=key)
results = {
'datetime': datetime.utcnow(),
'url': job['url'],
'success': True,
'error': '',
'duration_seconds': 0,
'cpu_usage_seconds': 0,
'network_received_bytes': 0,
'network_transmitted_bytes': 0,
'memory_max_bytes': 0,
}
# wait for finish
start = datetime.utcnow()
while True:
time.sleep(1)
clist = client.containers.list(filters={'id': id})
if len(clist) == 0:
break
for c in clist:
# Collect stats
try:
stats = low_level_client.stats(id, stream=False)
cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0
if 'networks' in stats:
network_received_bytes = stats['networks']['eth0']['rx_bytes']
network_transmitted_bytes = stats['networks']['eth0']['tx_bytes']
memory_max_bytes = 0
if 'max_usage' in stats['memory_stats']:
memory_max_bytes = stats['memory_stats']['max_usage']
results['memory_max_bytes'] = memory_max_bytes
#logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000))
if cpu_usage > 0:
results['cpu_usage_seconds'] = round(cpu_usage)
if network_received_bytes > 0:
results['network_received_bytes'] = network_received_bytes
if network_transmitted_bytes > 0:
results['network_transmitted_bytes'] = network_transmitted_bytes
except docker.errors.APIError as e:
logger.error("Could not get stats: %s" % e)
except json.decoder.JSONDecodeError:
# This means we didn't get proper stats
pass
runtime = (datetime.utcnow() - start).seconds
results['duration_seconds'] = round(runtime)
#if c.status != "running":
# logger.info("Container %s status: %s" % (c.id, c.status))
if c.status == "exited":
logger.debug("Container %s is exited." % c.id)
break
if runtime > TIMEOUT:
c.kill()
results['success'] = False
results['error'] = 'TIMEOUT'
entity.update(results)
datastore_client.put(entity)
raise Exception("Execution took too long. Killed container after %s seconds." % TIMEOUT)
entity.update(results)
datastore_client.put(entity)
return results

67
k8s-job-manager.py Normal file
View File

@ -0,0 +1,67 @@
import config
import os
from datetime import datetime
import time
import random
from pathlib import Path
import kubernetes
PENDING_LIMIT = 2
RUNNING_LIMIT = 4
INTERVAL = 10 # Seconds
def main():
# Get jobs
jobs = list(Path("./k8s-jobs").rglob("*.yaml"))
random.seed()
random.shuffle(jobs)
kubernetes.config.load_kube_config(context='giantswarm-5jka7')
v1client = kubernetes.client.CoreV1Api()
k8sclient = kubernetes.client.ApiClient()
start = datetime.utcnow()
jobs_queued = 0
while len(jobs) > 0:
# Check whether there are pods pending
pending_pods = v1client.list_pod_for_all_namespaces(
watch=False,
field_selector='status.phase=Pending',
label_selector='app=green-spider')
pending = list(pending_pods.items)
# Get running pods
running_pods = v1client.list_pod_for_all_namespaces(
watch=False,
field_selector='status.phase=Running',
label_selector='app=green-spider')
running = list(running_pods.items)
now = datetime.utcnow()
duration = now - start
# Add new job to the queue
if len(pending) < PENDING_LIMIT and len(running) < RUNNING_LIMIT:
to_be_queued = RUNNING_LIMIT - len(running)
for _ in range(to_be_queued):
job_path = jobs.pop(0)
jobs_queued += 1
duration_per_job = duration / jobs_queued
jobs_remaining = len(jobs)
print(f'{jobs_queued} jobs queued in {duration} - {jobs_remaining} jobs (estimated {duration_per_job * jobs_remaining}) remaining at {int(duration_per_job.total_seconds())} seconds per job on average')
kubernetes.utils.create_from_yaml(k8sclient, job_path)
os.remove(job_path)
time.sleep(INTERVAL)
print('No more jobs left. Done.')
if __name__ == '__main__':
main()

View File

@ -0,0 +1,67 @@
---
apiVersion: batch/v1
kind: Job
metadata:
name: green-spider-job-1
namespace: marian
labels:
app: green-spider
spec:
activeDeadlineSeconds: 120
ttlSecondsAfterFinished: 600
completions: 1
backoffLimit: 3
# Pod template
template:
metadata:
name: green-spider-job
namespace: marian
labels:
app: green-spider
spec:
restartPolicy: Never
nodeSelector:
giantswarm.io/machine-pool: 5n27k
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- green-spider
topologyKey: topology.kubernetes.io/region
containers:
- name: spider
image: quay.io/netzbegruenung/green-spider:kubernetes
imagePullPolicy: IfNotPresent
command:
- python
- cli.py
- --credentials-path=/secrets/datastore-writer.json
- --loglevel=debug
- spider
- '--job={"url":"https://www.gruene.de/","type":"PARTY","level":"DE:BUNDESVERBAND","state":null,"district":null,"city":null}'
volumeMounts:
- name: secrets
mountPath: "/secrets"
readOnly: true
- name: shared
mountPath: /dev/shm
resources:
requests:
cpu: 1000m
memory: 5000M
volumes:
- name: secrets
secret:
secretName: green-spider
items:
- key: datastore-writer.json
path: datastore-writer.json
- key: screenshots-uploader.json
path: screenshots-uploader.json
- name: shared
emptyDir: {}

18
kubernetes/psp.yaml Normal file
View File

@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
name: green-spider-job-psp
namespace: marian
spec:
privileged: false
seLinux:
rule: RunAsAny
supplementalGroups:
rule: RunAsAny
runAsUser:
rule: RunAsAny
fsGroup:
rule: RunAsAny
volumes:
- emptyDir
- secret

View File

@@ -1,21 +1,26 @@
 """
-The jobs module allows to create jobs for the queue and take jobs off the queue
+The manager module allows to fill the RQ job queue.
 """
 
+from datetime import datetime
 import logging
+import math
 import os
 import random
 import shutil
-from datetime import datetime
+import time
+import json
 
 from git import Repo
-import tenacity
+from rq import Queue
+import redis
 import yaml
-from google.api_core.exceptions import Aborted
-from google.cloud import datastore
+from yaml import Loader
+from hashlib import sha256
 
 import config
 
+REDIS_URL = os.environ.get("REDIS_URL", "redis://redis:6379/0")
+
 def clone_data_directory():
     """
@@ -40,7 +45,7 @@ def directory_entries():
             continue
 
         with open(filepath, 'r', encoding='utf8') as yamlfile:
-            for doc in yaml.load_all(yamlfile):
+            for doc in yaml.load_all(yamlfile, Loader=Loader):
                 yield doc
@@ -53,7 +58,7 @@ def chunks(the_list, size):
         yield the_list[i:i + size]
 
-def create_jobs(datastore_client, url=None):
+def create_jobs(url=None):
     """
     Read all URLs from green directory and fill a job database
     with one job per URL.
@@ -62,6 +67,18 @@ def create_jobs(datastore_client, url=None):
     will be added as a spider job.
     """
 
+    logging.info('Waiting for redis at %s' % REDIS_URL)
+    redis_success = False
+    while not redis_success:
+        try:
+            redis_conn = redis.from_url(REDIS_URL)
+            redis_success = True
+        except Exception as ex:
+            logging.error(ex)
+            time.sleep(5)
+
+    queue = Queue('low', connection=redis_conn)
+
     # refresh our local clone of the green directory
     logging.info("Refreshing green-directory clone")
     clone_data_directory()
@@ -104,7 +121,7 @@ def create_jobs(datastore_client, url=None):
                 logging.error("Error in %s: 'url' key missing (%s)",
                               repr_entry(entry), entry['urls'][index])
 
-    # ensure the passed URL argument is really there, even if not part
+    # Ensure the passed URL argument is really there, even if not part
     # of the directory.
     if url and count == 0:
         logging.info("Adding job for URL %s which is not part of green-directory", url)
@@ -115,55 +132,63 @@ def create_jobs(datastore_client, url=None):
             "state": None,
             "district": None,
             "city": None,
-            "index": int(random.uniform(1000000, 9999999)),
         })
 
     count = 0
+    errorcount = 0
     logging.info("Writing jobs")
-    entities = []
 
+    count = 0
     for entry in input_entries:
-        key = datastore_client.key(config.JOB_DATASTORE_KIND, entry["url"])
-        entity = datastore.Entity(key=key)
-        entity.update({
-            "created": datetime.utcnow(),
-            "type": entry["type"],
-            "level": entry["level"],
-            "state": entry["state"],
-            "district": entry["district"],
-            "city": entry["city"],
-            "index": int(random.uniform(1000000, 9999999)),
-        })
-        entities.append(entity)
+        try:
+            _ = queue.enqueue('job.run',
+                job_timeout='300s',
+                at_front=random.choice([True, False]),
+                # keywords args passes on the job function
+                kwargs={
+                    'job': entry,
+                })
 
-    # commmit to DB
-    for chunk in chunks(entities, 300):
-        logging.debug("Writing jobs chunk of length %d", len(chunk))
-        datastore_client.put_multi(chunk)
-        count += len(chunk)
+            # Print job for debugging purposes
+            print(json.dumps(entry))
+
+            #logging.debug("Added job with ID %s for URL %s" % (enqueued_job.id, entry['url']))
+            count += 1
+        except Exception as e:
+            errorcount += 1
+            logging.error("Error adding job for URL %s: %s" % (entry['url'], e))
+
+        # Write kubernetes Job
+        make_k8s_job(entry, count)
+        count += 1
 
     logging.info("Writing jobs done, %s jobs added", count)
+    logging.info("%d errors while writing jobs", errorcount)
 
-@tenacity.retry(wait=tenacity.wait_exponential(),
-                retry=tenacity.retry_if_exception_type(Aborted))
-def get_job_from_queue(datastore_client):
-    """
-    Returns a URL from the queue
-    """
-    out = None
 
-    with datastore_client.transaction():
-        query = datastore_client.query(kind=config.JOB_DATASTORE_KIND,
-                                       order=['index'])
-        for entity in query.fetch(limit=1):
-            logging.debug("Got job: %s", entity)
-            out = dict(entity)
-            out["url"] = entity.key.name
-            datastore_client.delete(entity.key)
-
-    return out
+def make_k8s_job(job_data, count):
+    now = datetime.utcnow().strftime('%Y%m%d%H%M')
+    urlhash = sha256(job_data['url'].encode('utf-8')).hexdigest()[0:12]
+    job_name = f'gs-{now}-{urlhash}'
+    filename = f'{job_name}.yaml'
+    batch_folder = math.floor(count / config.K8S_JOB_BATCH_SIZE)
+    output_dir = os.path.join(config.K8S_JOBS_PATH, str(batch_folder))
+    os.makedirs(output_dir, exist_ok=True)
+    output_path = os.path.join(output_dir, filename)
+
+    job_json = json.dumps(job_data)
+    job_flag = f'\'--job={job_json}\''
+
+    with open(config.K8S_JOB_TEMPLATE, "r") as template_file:
+        template = template_file.read()
+
+    template = template.replace('JOB_NAME', job_name)
+    template = template.replace('POD_NAME', job_name)
+    template = template.replace('JOB_FLAG', job_flag)
+
+    with open(output_path, "w") as output:
+        output.write(template)
 
 def repr_entry(entry):
     """

67
manager/job_template.yaml Normal file
View File

@ -0,0 +1,67 @@
---
apiVersion: batch/v1
kind: Job
metadata:
name: JOB_NAME
namespace: marian
labels:
app: green-spider
spec:
activeDeadlineSeconds: 600
ttlSecondsAfterFinished: 600
completions: 1
backoffLimit: 3
# Pod template
template:
metadata:
name: POD_NAME
namespace: marian
labels:
app: green-spider
spec:
restartPolicy: Never
nodeSelector:
giantswarm.io/machine-pool: 5n27k
# affinity:
# podAntiAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# - labelSelector:
# matchExpressions:
# - key: app
# operator: In
# values:
# - green-spider
# topologyKey: topology.kubernetes.io/region
containers:
- name: spider
image: quay.io/netzbegruenung/green-spider:20211031-chromium93
imagePullPolicy: IfNotPresent
command:
- python3
- cli.py
- --credentials-path=/secrets/datastore-writer.json
- --loglevel=debug
- spider
- JOB_FLAG
volumeMounts:
- name: secrets
mountPath: "/secrets"
readOnly: true
- name: shared
mountPath: /dev/shm
resources:
requests:
cpu: 1000m
memory: 5000M
volumes:
- name: secrets
secret:
secretName: green-spider
items:
- key: datastore-writer.json
path: datastore-writer.json
- key: screenshots-uploader.json
path: screenshots-uploader.json
- name: shared
emptyDir: {}

View File

@@ -10,6 +10,8 @@ from rating import contact_link
 from rating import favicon
 from rating import feeds
 from rating import https
+from rating import network_payload
+from rating import network_requests
 from rating import no_network_errors
 from rating import no_script_errors
 from rating import no_third_party_cookies
@@ -39,6 +41,8 @@ def calculate_rating(results):
         'FEEDS': feeds,
         'HTTPS': https,
         'HTTP_RESPONSE_DURATION': response_duration,
+        'NETWORK_PAYLOAD': network_payload,
+        'NETWORK_REQUESTS': network_requests,
         'NO_NETWORK_ERRORS': no_network_errors,
         'NO_SCRIPT_ERRORS': no_script_errors,
         'NO_THIRD_PARTY_COOKIES': no_third_party_cookies,

57
rating/network_payload.py Normal file
View File

@ -0,0 +1,57 @@
"""
This rater evaluates the amount of data transferred for a page load.
Currently no score is given. The plan is however to reward site that
cause smaller transfers.
The rater uses Chrome performance log messages of type
'Network.loadingFinished'.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'number'
default_value = 0
depends_on_checks = ['load_in_browser']
max_score = 1.0
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
payloads_for_urls = []
for url in self.check_results['load_in_browser']:
payload = 0
if (self.check_results['load_in_browser'][url]['performance_log'] == [] or
self.check_results['load_in_browser'][url]['performance_log'] is None):
continue
for lentry in self.check_results['load_in_browser'][url]['performance_log']:
if lentry['message']['method'] == 'Network.loadingFinished':
payload += lentry['message']['params']['encodedDataLength']
payloads_for_urls.append(payload)
# Calculate score based on the largest value found for a URL.
# See https://github.com/netzbegruenung/green-spider/issues/11#issuecomment-600307544
# for details.
if len(payloads_for_urls) > 0:
value = max(payloads_for_urls)
if value < 994000:
score = 1
elif value < 1496000:
score = .5
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}
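For reference, the entries iterated over here are the decoded Chrome DevTools performance log messages collected by checks/load_in_browser. A trimmed, illustrative example of one 'Network.loadingFinished' entry (field values made up) as it appears in check_results['load_in_browser'][url]['performance_log']:

# Illustrative shape of a single decoded performance log entry.
loading_finished_entry = {
    'message': {
        'method': 'Network.loadingFinished',
        'params': {
            'requestId': '1000.42',
            'timestamp': 123456.789,
            'encodedDataLength': 15320,  # bytes on the wire; summed up by rate()
        },
    },
    'webview': 'A1B2C3',
}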

View File

@ -0,0 +1,57 @@
"""
This rater evaluates the number of network requests made.
Currently no score is given. The plan is however to reward site that
use only few requests.
The rater uses Chrome performance log messages of type
'Network.requestWillBeSent'.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'number'
default_value = 0
depends_on_checks = ['load_in_browser']
max_score = 1.0
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
num_requests_for_urls = []
for url in self.check_results['load_in_browser']:
num_requests = 0
if (self.check_results['load_in_browser'][url]['performance_log'] == [] or
self.check_results['load_in_browser'][url]['performance_log'] is None):
continue
for lentry in self.check_results['load_in_browser'][url]['performance_log']:
if lentry['message']['method'] == 'Network.requestWillBeSent':
num_requests += 1
num_requests_for_urls.append(num_requests)
# Calculate score based on the largest value found for a URL.
# See https://github.com/netzbegruenung/green-spider/issues/11#issuecomment-600307544
# for details.
if len(num_requests_for_urls) > 0:
value = max(num_requests_for_urls)
if value <= 28:
score = 1.0
elif value <= 38:
score = 0.5
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}

View File

@@ -9,7 +9,7 @@ from rating.abstract_rater import AbstractRater
 class Rater(AbstractRater):
     rating_type = 'number'
-    default_value = False
+    default_value = 0
     depends_on_checks = ['page_content']
     max_score = 1.0

View File

@ -1,16 +1,42 @@
beautifulsoup4==4.8.1 beautifulsoup4==4.9.3
cachetools==4.2.2
certifi==2020.12.5
cffi==1.14.5
chardet==3.0.4
click==7.1.2
cssselect==1.1.0
dnspython==1.16.0 dnspython==1.16.0
feedparser==5.2.1 docker==4.4.1
GitPython feedparser==6.0.8
google-cloud-datastore==1.10.0 gitdb==4.0.7
html-similarity==0.3.2 GitPython==3.1.14
google-api-core==1.26.3
google-auth==1.30.0
google-cloud-core==1.6.0
google-cloud-datastore==1.15.3
google-cloud-storage==1.38.0
googleapis-common-protos==1.53.0
html-similarity==0.3.3
httpretty==0.9.7 httpretty==0.9.7
pyopenssl==18.0.0 idna==2.10
PyYAML parsel==1.6.0
requests==2.22.0 protobuf==3.15.8
responses==0.10.15 pyasn1==0.4.8
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296 pyasn1-modules==0.2.8
selenium==3.8.0 pycparser==2.20
pyOpenSSL==20.0.1
pytz==2021.1
PyYAML==5.4.1
redis==3.5.3
requests==2.25.1
responses==0.13.3
rq==1.8.0
rsa==4.7.2
selenium==3.141.0
smmap==3.0.4
smmap2==2.0.5 smmap2==2.0.5
tenacity==5.0.2 soupsieve==2.2.1
urllib3==1.25.9 tenacity==5.1.5
urllib3==1.26.4
w3lib==1.22.0
websocket-client==0.59.0

View File

@@ -16,7 +16,7 @@ from google.cloud import datastore
 import checks
 import config
-import jobs
+import manager
 import rating
 
 def check_and_rate_site(entry):
@@ -54,10 +54,17 @@ def check_and_rate_site(entry):
     for key in result['rating']:
         result['score'] += result['rating'][key]['score']
 
-    # remove full HTML page content and hyperlinks to safe some storage
+    # Remove bigger result portions to safe some storage:
+    # - HTML page content
+    # - Hyperlinks
+    # - Performnance log
     try:
         for url in result['checks']['page_content']:
             del result['checks']['page_content'][url]['content']
+        for url in result['checks']['load_in_browser']:
+            del result['checks']['load_in_browser'][url]['performance_log']
         del result['checks']['hyperlinks']
     except:
         pass
@@ -80,6 +87,7 @@ def test_url(url):
     result = check_and_rate_site(entry=job)
     pprint(result)
 
+
 def execute_single_job(datastore_client, job, entity_kind):
     """
     Executes spider for one single job
@@ -103,9 +111,11 @@ def execute_single_job(datastore_client, job, entity_kind):
         'rating': result['rating'],
         'score': result['score'],
     }
+
     entity.update(record)
     try:
         datastore_client.put(entity)
+        logging.debug("Successfully wrote record to database")
     except InvalidArgument as ex:
         logging.error("Could not write result: %s", ex)
     except Exception as ex:
@@ -116,7 +126,7 @@ def work_of_queue(datastore_client, entity_kind):
     Take job from queue and finish it until there are no more jobs
     """
     while True:
-        job = jobs.get_job_from_queue(datastore_client)
+        job = manager.get_job_from_queue(datastore_client)
         if job is None:
             logging.info("No more jobs. Exiting.")
             break