Job management with RQ, and much more (#149)
* CLI: remove 'jobs' command, add 'manager'
* Add job definition
* Move jobs to manager folder
* Rename jobs to manager
* Add rq and redis dependencies
* Add docker-compose YAML
* Downgrade to alpine 3.8
* Adjust paths in Dockerfile, remove entrypoint
* Rename 'make spiderjobs' to 'make jobs'
* Fix docker execution
* Adapt 'make jobs'
* Fix metadata scheme
* Add docker dependency
* Randomize queue (a bit)
* Use latest image, remove debug output
* Make docker-compose file downwards-compatible
* Use latest instead of dev image tag
* Update docker-compose.yaml
* Adapt job start script
* Fix redis connection in manager
* Add support for increasing timeout via environment variable
* Adapt load_in_browser to cookies table schema change
* Fix execution
* Mitigate yaml warning
* Bump some dependency versions
* Report resource usage stats for each job
* checks/load_in_browser: Return DOM size, prevent multiple page loads
* Update .dockerignore
* Code update
* Script update
* Update README.md
* WIP
* WIP commit
* Update Dockerfile to alpine:edge and chromium v90
* Update TestCertificateChecker
* Set defaults for __init__ function
* Detect sunflower theme
* Update unit test for new datetime (zero-basing)
* Set logging prefs from Chromium in a new way
* Move datastore client instantiation, as it is not needed for all commands
* Change green-directory repository URL
* Add git settings for cloning green-directory
* Pin alpine version 3.14, fix py3-cryptography
* Use plain docker build progress output
* Add volumes to 'make test' docker run command
* Fix bug
* Update example command in README
* Update dependencies
* Add creation of Kubernetes jobs
This commit is contained in:

Parent: e59b05fc6c
Commit: 618e29d763
.dockerignore

@@ -1,6 +1,6 @@
 .git
-webapp
 docs
+/screenshots
 secrets
 temp
 venv
.gitignore

@@ -5,4 +5,5 @@ temp
 __pycache__
 .vscode/settings.json
 kubernetes/green-spider-secret.yaml
 /volumes
+/screenshots
Dockerfile (35 lines changed)

@@ -1,23 +1,26 @@
-FROM python:3.7-alpine3.9
+FROM alpine:3.14
 
 WORKDIR /workdir
 
 ADD requirements.txt /workdir/
 
-RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
-    echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
-    apk update && \
-    apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
-    pip3 install --upgrade pip && \
-    pip3 install -r requirements.txt && \
-    apk del python3-dev build-base
+RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
+    echo "http://dl-4.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
+    apk --update --no-cache add ca-certificates chromium chromium-chromedriver \
+        python3-dev py3-grpcio py3-wheel py3-pip py3-lxml \
+        build-base git libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
+    pip install -r requirements.txt && \
+    apk del build-base
 
-ADD cli.py /
-ADD config /config
-ADD jobs /jobs
-ADD checks /checks
-ADD rating /rating
-ADD spider /spider
-ADD export /export
+# As alpine's py3-cryptography did not work as of alpine v3.14, we use this hack from
+# https://github.com/pyca/cryptography/issues/3344#issuecomment-650845512
+RUN LDFLAGS="-L/opt/openssl/lib -Wl,-rpath,/opt/openssl/lib" CFLAGS="-I/opt/openssl/include" pip3 install -U cryptography
 
-ENTRYPOINT ["python3", "/cli.py"]
+ADD cli.py /workdir/
+ADD manager /workdir/manager
+ADD config /workdir/config
+ADD checks /workdir/checks
+ADD rating /workdir/rating
+ADD spider /workdir/spider
+ADD export /workdir/export
+ADD job.py /workdir/
Makefile (16 lines changed)

@@ -6,16 +6,17 @@ DB_ENTITY := spider-results
 
 # Build docker image
 dockerimage:
-    docker build -t $(IMAGE) .
+    docker build --progress plain -t $(IMAGE) .
 
-# Create spider job queue
-spiderjobs:
+# Fill the queue with spider jobs, one for each site.
+jobs:
     docker run --rm -ti \
        -v $(PWD)/secrets:/secrets \
        $(IMAGE) \
-       --credentials-path /secrets/datastore-writer.json \
-       --loglevel debug \
-       jobs
+       python cli.py \
+       --credentials-path /secrets/datastore-writer.json \
+       --loglevel debug \
+       manager
 
 # Run spider in docker image
 spider:
@@ -41,6 +42,9 @@ export:
 
 # run spider tests
 test:
     docker run --rm -ti \
+       -v $(PWD)/volumes/dev-shm:/dev/shm \
+       -v $(PWD)/secrets:/secrets \
+       -v $(PWD)/screenshots:/screenshots \
        -v $(PWD)/volumes/chrome-userdir:/opt/chrome-userdir \
        --entrypoint "python3" \
        $(IMAGE) \
README.md (11 lines changed)

@@ -32,7 +32,7 @@ Alle Informationen zum Betrieb befinden sich im Verzeichnis [devops](https://git
 
 Green Spider ist in Python 3 geschrieben und wird aktuell unter 3.6 getestet und ausgeführt.
 
-Aufgrund zahlreicher Dependencies empfiehlt es sich, den Spider Code lokal in Docker
+Aufgrund zahlreicher Abhängigkeiten empfiehlt es sich, den Spider Code lokal in Docker
 auszuführen.
 
 Das Image wird über den folgenden Befehl erzeugt:
@@ -57,18 +57,19 @@ Am einfachsten geht das über den `make spider` Befehl, so:
 make spider ARGS="--url http://www.example.com/"
 ```
 
-Ohne `ARGS` aufgerufen, arbeitet der Spider eine Jobliste ab. Dies erfordert Zugriff auf die entsprechende Datenank.
+Ohne `ARGS` aufgerufen, arbeitet der Spider eine Jobliste ab. Dies erfordert Zugriff auf die entsprechende Datenbank.
 
 Wenn nur eine einzelne Site gespidert werden soll, die Ergebnisse aber in die Datenbank geschrieben werden sollen, kann der Spider so mit `--job` und einem JSON-Object aufgerufen werden (Beispiel):
 
-```
+```nohighlight
 docker run --rm -ti \
   -v $(pwd)/volumes/dev-shm:/dev/shm \
   -v $(pwd)/secrets:/secrets \
+  -v $(pwd)/screenshots:/screenshots \
  -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
   --shm-size=2g \
-  quay.io/netzbegruenung/green-spider:latest \
+  quay.io/netzbegruenung/green-spider:latest python3 cli.py \
   --credentials-path /secrets/datastore-writer.json \
   --loglevel debug \
-  spider --job '{"url": "https://xn--grne-porta-westfalica-9hc.de/", "meta": {"city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}}'
+  spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
 ```
@@ -54,16 +54,27 @@ def perform_checks(input_url):
 
     results = {}
 
+    # TODO:
+    # Set screenshot_bucket_name and storage_credentials_path
+    # based on flags.
     config = Config(urls=[input_url],
                     user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
                                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
-                               'Safari/537.36 green-spider/0.2')
+                               'Safari/537.36 green-spider/0.2',
+                    screenshot_bucket_name='green-spider-screenshots.sendung.de',
+                    screenshot_datastore_kind='webscreenshot',
+                    storage_credentials_path='/secrets/screenshots-uploader.json',
+                    datastore_credentials_path='/secrets/datastore-writer.json')
 
+    # Iterate over all checks.
     for check_name, check in check_modules:
 
+        # checker is the individual test/assertion handler we instantiate
+        # for each check step.
         checker = check.Checker(config=config,
                                 previous_results=results)
 
-        # see if dependencies are met
+        # Ensure that dependencies are met for the checker.
         dependencies = checker.depends_on_results()
         if dependencies != []:
             for dep in dependencies:
@@ -71,10 +82,16 @@ def perform_checks(input_url):
                 logging.debug("Skipping check %s as dependency %s is not met" % (check_name, dep))
                 continue
 
+        # Execute the checker's main function.
         result = checker.run()
         results[check_name] = result
 
-        # update config for the next check
+        # Execute any cleanup/aftermath function (if given) for the checker.
+        modified_results = checker.post_hook(result)
+        if modified_results is not None:
+            results[check_name] = modified_results
+
+        # Update config for the next check(s) in the sequence.
         config = checker.config
         logging.debug("config after check %s: %r" % (check_name, config))
|
@ -21,6 +21,20 @@ class AbstractChecker(object):
|
||||||
"""Executes the check routine, returns result dict"""
|
"""Executes the check routine, returns result dict"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def post_hook(self, result):
|
||||||
|
"""
|
||||||
|
Optional function to execute after run(). Can be used to post-process
|
||||||
|
results data. Should be defined by the implementing checker.
|
||||||
|
|
||||||
|
Params:
|
||||||
|
result: Result data from the run() function.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict: Modified results data
|
||||||
|
None: Means that nothing has been done, so should be ignored.
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def config(self):
|
def config(self):
|
||||||
return self._config
|
return self._config
|
||||||
|
|
|
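To illustrate the new hook, a minimal sketch of a hypothetical checker (not part of this commit) that post-processes its own results:

```python
# Hypothetical example, for illustration only: a checker that trims
# oversized log data from its results after run() has finished.
from checks.abstract_checker import AbstractChecker

class Checker(AbstractChecker):
    def run(self):
        return {'logs': ['line'] * 10000}

    def post_hook(self, result):
        # Returning a dict replaces the stored result; returning None
        # (the default inherited behavior) leaves it unchanged.
        result['logs'] = result['logs'][:100]
        return result
```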
@@ -14,7 +14,7 @@ class TestCertificateChecker(unittest.TestCase):
         result = checker.run()
         self.assertIn(url, result)
         self.assertIsNone(result[url]['exception'])
-        self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services')
+        self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services LLC')
 
     def test_kaarst(self):
         """Real-workd example"""
@@ -24,7 +24,7 @@ class TestCertificateChecker(unittest.TestCase):
         result = checker.run()
         self.assertIn(url, result)
         self.assertIsNone(result[url]['exception'])
-        self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited')
+        self.assertEqual(result[url]['issuer']['O'], 'DigiCert Inc')
 
     def test_tls_v_1_0(self):
         """Load a certificate for a TLS v1.0 server"""
@@ -3,9 +3,19 @@ class Config(object):
     Our configuration to be passed to checks
     """
 
-    def __init__(self, urls, user_agent='green-spider/1.0'):
+    def __init__(self,
+                 urls,
+                 screenshot_bucket_name='',
+                 screenshot_datastore_kind='',
+                 storage_credentials_path='',
+                 datastore_credentials_path='',
+                 user_agent='green-spider/1.0'):
         self._urls = set(urls)
         self._user_agent = user_agent
+        self._screenshot_bucket_name = screenshot_bucket_name
+        self._screenshot_datastore_kind = screenshot_datastore_kind
+        self._storage_credentials_path = storage_credentials_path
+        self._datastore_credentials_path = datastore_credentials_path
 
     def __repr__(self):
         return "Config(urls=%r)" % self._urls
@@ -27,3 +37,19 @@ class Config(object):
     @property
     def user_agent(self):
         return self._user_agent
+
+    @property
+    def screenshot_bucket_name(self):
+        return self._screenshot_bucket_name
+
+    @property
+    def storage_credentials_path(self):
+        return self._storage_credentials_path
+
+    @property
+    def datastore_credentials_path(self):
+        return self._datastore_credentials_path
+
+    @property
+    def screenshot_datastore_kind(self):
+        return self._screenshot_datastore_kind
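A usage sketch for the extended constructor; the values are those passed by `perform_checks` above (the import path for `Config` is an assumption, as it is not shown in this view):

```python
# Sketch: constructing the extended Config the way perform_checks now does.
config = Config(urls=['https://www.example.com/'],
                screenshot_bucket_name='green-spider-screenshots.sendung.de',
                screenshot_datastore_kind='webscreenshot',
                storage_credentials_path='/secrets/screenshots-uploader.json',
                datastore_credentials_path='/secrets/datastore-writer.json')
print(config.screenshot_bucket_name)  # accessible via the new read-only properties
```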
@@ -75,6 +75,9 @@ class Checker(AbstractChecker):
         elif ('Urwahl3000' in page_content['content'] or
               '/themes/urwahl3000' in page_content['content']):
             generator = 'wordpress-urwahl'
 
+        elif ('/themes/sunflower' in page_content['content']):
+            generator = 'wordpress-sunflower'
+
         elif ('/themes/sunflower' in page_content['content']):
             generator = 'wordpress-sunflower'
@@ -60,15 +60,13 @@ class TestFeed(unittest.TestCase):
         result = checker.run()
         pprint(result)
 
-        self.assertEqual(result, {
-            'http://example.com/feed.xml': {
-                'exception': None,
-                'title': 'Liftoff News',
-                'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
-                'first_entry': datetime(2003, 5, 30, 11, 6, 42),
-                'average_interval': 340359,
-                'num_entries': 2,
-            }
+        self.assertEqual(result['http://example.com/feed.xml'], {
+            'exception': None,
+            'average_interval': 340359,
+            'first_entry': datetime(2003, 5, 30, 11, 6, 42),
+            'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
+            'num_entries': 2,
+            'title': 'Liftoff News',
         })
checks/load_in_browser.py

@@ -9,29 +9,38 @@ Information includes:
 - what cookies are set during loading the page
 """
 
+from datetime import datetime
+import hashlib
 import logging
 import math
+import os
 import shutil
 import time
 import sqlite3
+import json
 
 from selenium import webdriver
 from selenium.common.exceptions import StaleElementReferenceException
 from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 import tenacity
 
+from google.cloud import storage
+from google.cloud import datastore
+
 from checks.abstract_checker import AbstractChecker
 
 class Checker(AbstractChecker):
 
-    page_load_timeout = 30
+    page_load_timeout = 120
 
     # sizes we check for (width, height)
     sizes = (
-        (360, 640), # rather old smartphone
-        (768, 1024), # older tablet or newer smartphone
-        (1024, 768), # older desktop or horiz. tablet
         (1920, 1080), # Full HD horizontal
+        (1500, 1500), # useful window size we also use for the main screenshot
+        (1024, 768), # older desktop or horiz. tablet
+        (768, 1024), # older tablet or newer smartphone
+        (360, 640), # rather old smartphone
     )
 
     def __init__(self, config, previous_results=None):
@@ -39,22 +48,50 @@ class Checker(AbstractChecker):
 
         # Our selenium user agent using Chrome headless as an engine
         chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_argument('enable-automation')
         chrome_options.add_argument('--headless')
         chrome_options.add_argument('--disable-gpu')
         chrome_options.add_argument('--no-sandbox')
+        chrome_options.add_argument('--dns-prefetch-disable')
         chrome_options.add_argument('--disable-extensions')
+        chrome_options.add_argument('--disk-cache-size=0')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument('--verbose')
+        chrome_options.page_load_strategy = 'normal'
 
         # path where to get cookies from
         chrome_options.add_argument("--user-data-dir=/opt/chrome-userdir")
 
+        # mobile_emulation = {
+        #     "deviceMetrics": { "width": 360, "height": 640, "pixelRatio": 3.0 },
+        #     "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"
+        # }
+        #mobile_emulation = { "deviceName": "Nexus 5" }
+        #chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
+
         # empty /opt/chrome-userdir
         shutil.rmtree('/opt/chrome-userdir', ignore_errors=True)
 
-        self.driver = webdriver.Chrome(options=chrome_options)
+        # activate performance logging (includes network logging)
+        capabilities = DesiredCapabilities.CHROME
+        capabilities['goog:loggingPrefs'] = {'performance': 'ALL'}
+
+        # TODO: also do this
+        # (from https://stackoverflow.com/questions/60375633/capture-logs-from-chrome-during-test-is-running-python#comment106827817_60385493)
+        capabilities['loggingPrefs'] = {'performance': 'ALL'}
+
+        self.driver = webdriver.Chrome(options=chrome_options, desired_capabilities=capabilities)
         self.driver.set_page_load_timeout(self.page_load_timeout)
 
-    def run(self):
+        # We capture the browser engine's user agent string
+        # for the record.
+        self.user_agent = self.driver.execute_script("return navigator.userAgent;")
+
+    def run(self):
+        """
+        Main function of this check.
+        """
         results = {}
         for url in self.config.urls:
@@ -64,15 +101,22 @@ class Checker(AbstractChecker):
                 'min_document_width': None,
                 'logs': None,
                 'font_families': None,
+                'performance_log': [],
+                'screenshots': [],
             }
 
-            # responsive check
+            self.driver.get(url)
 
+            # Responsive layout check and screenshots.
             try:
-                sizes = self.check_responsiveness(url)
+                check_responsiveness_results = self.check_responsiveness(url)
                 results[url] = {
-                    'sizes': sizes,
-                    'min_document_width': min([s['document_width'] for s in sizes]),
+                    'sizes': check_responsiveness_results['sizes'],
+                    'min_document_width': min([s['document_width'] for s in check_responsiveness_results['sizes']]),
+                    'dom_size': self.get_dom_size(),
                     'logs': self.capture_log(),
+                    'performance_log': [],
+                    'screenshots': check_responsiveness_results['screenshots'],
                 }
             except TimeoutException as e:
                 logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
@@ -81,6 +125,7 @@ class Checker(AbstractChecker):
                 logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
                 pass
 
+            # Scroll page to bottom, to load all lazy-loading resources.
             try:
                 self.scroll_to_bottom()
             except TimeoutException as e:
@@ -112,6 +157,7 @@ class Checker(AbstractChecker):
                 logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
                 pass
 
+            # Process cookies.
             try:
                 results[url]['cookies'] = self.get_cookies()
             except TimeoutException as e:
@@ -120,10 +166,79 @@ class Checker(AbstractChecker):
             except tenacity.RetryError as re:
                 logging.warn("RetryError when collecting cookies for %s: %s" % (url, re))
                 pass
 
+            for logentry in self.driver.get_log('performance'):
+                decoded_logentry = json.loads(logentry['message'])
+                results[url]['performance_log'].append(decoded_logentry)
+
         self.driver.quit()
 
         return results
 
+    def post_hook(self, result):
+        """
+        Logic executed after run() is done.
+        Used to upload screenshots and metadata to cloud storage and datastore.
+        """
+        # Upload screenshots and metadata
+
+        logging.debug("load_in_browser post_hook 1 - Creating client")
+
+        storage_client = storage.Client.from_service_account_json(self.config.storage_credentials_path)
+        bucket = storage_client.get_bucket(self.config.screenshot_bucket_name)
+
+        datastore_client = datastore.Client.from_service_account_json(self.config.datastore_credentials_path)
+        exclude_from_indexes = ['size', 'screenshot_url', 'user_agent']
+
+        for url in result.keys():
+            for screenshot in result[url]['screenshots']:
+                # Upload one screenshot
+                try:
+                    local_file = '%s/%s' % (screenshot['folder'], screenshot['filename'])
+
+                    logging.debug("Handling screenshot file %s" % local_file)
+
+                    if not os.path.exists(screenshot['local_path']):
+                        logging.warning("No screenshot created: size=%s, url='%s'" % (screenshot['size'], screenshot['url']))
+                        continue
+
+                    logging.debug("Uploading %s to %s/%s" % (screenshot['local_path'], screenshot['folder'], screenshot['filename']))
+                    with open(screenshot['local_path'], 'rb') as my_file:
+                        # Create new blob in remote bucket
+                        blob = bucket.blob(local_file)
+                        blob.upload_from_file(my_file, content_type="image/png")
+                        blob.make_public()
+                except Exception as e:
+                    logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
+                    continue
+
+                try:
+                    os.remove(screenshot['local_path'])
+                except:
+                    pass
+
+                # Write metadata for one screenshot
+                data = {
+                    'url': screenshot['url'],
+                    'size': screenshot['size'],
+                    'screenshot_url': screenshot['screenshot_url'],
+                    'user_agent': screenshot['user_agent'],
+                    'created': screenshot['created'],
+                }
+                try:
+                    key = datastore_client.key(self.config.screenshot_datastore_kind, screenshot['screenshot_url'])
+                    entity = datastore.Entity(key=key, exclude_from_indexes=exclude_from_indexes)
+                    entity.update(data)
+                    datastore_client.put(entity)
+                    logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url'])
+                except Exception as e:
+                    logging.warn("Error in %s: %s" % (screenshot['url'], e))
+
+            # Remove screenshots part from results
+            del result[url]['screenshots']
+
+        return result
+
     def get_cookies(self):
         # read cookie DB to get 3rd party cookies, too
@@ -131,7 +246,7 @@ class Checker(AbstractChecker):
         db = sqlite3.connect('/opt/chrome-userdir/Default/Cookies')
         db.row_factory = sqlite3.Row
         c = db.cursor()
-        c.execute("SELECT creation_utc, host_key, name, path, expires_utc, is_secure, is_httponly, has_expires, is_persistent, firstpartyonly FROM cookies")
+        c.execute("SELECT creation_utc, host_key, name, path, expires_utc, is_secure, is_httponly, has_expires, is_persistent FROM cookies")
         for row in c.fetchall():
             cookies.append(dict(row))
         c.close()
@@ -142,11 +257,13 @@ class Checker(AbstractChecker):
     @tenacity.retry(stop=tenacity.stop_after_attempt(3),
                     retry=tenacity.retry_if_exception_type(TimeoutException))
     def check_responsiveness(self, url):
-        result = []
+        result = {
+            'sizes': [],
+            'screenshots': [],
+        }
 
         # set window to the first size initially
         self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1])
-        self.driver.get(url)
 
         for (width, height) in self.sizes:
             self.driver.set_window_size(width, height)
@@ -155,13 +272,44 @@ class Checker(AbstractChecker):
             time.sleep(1.0)
             doc_width = self.driver.execute_script("return document.body.scrollWidth")
 
-            result.append({
+            result['sizes'].append({
                 'viewport_width': width,
                 'document_width': int(doc_width),
             })
 
+            # Make screenshot
+            urlhash = hashlib.md5(bytearray(url, 'utf-8')).hexdigest()
+            folder = "%sx%s" % (width, height)
+            abs_folder = "/screenshots/%s" % folder
+            os.makedirs(abs_folder, exist_ok=True)
+            filename = urlhash + '.png'
+            abs_filepath = "%s/%s" % (abs_folder, filename)
+            created = datetime.utcnow()
+
+            success = self.driver.save_screenshot(abs_filepath)
+
+            if not success:
+                logging.warn("Failed to create screenshot %s" % abs_filepath)
+                continue
+
+            result['screenshots'].append({
+                'local_path': abs_filepath,
+                'folder': folder,
+                'filename': filename,
+                'url': url,
+                'size': [width, height],
+                'screenshot_url': 'http://%s/%s/%s' % (
+                    self.config.screenshot_bucket_name, folder, filename),
+                'user_agent': self.user_agent,
+                'created': created,
+            })
+
         return result
 
+    def get_dom_size(self):
+        dom_length = self.driver.execute_script("return document.getElementsByTagName('*').length")
+        return int(dom_length)
+
     def capture_log(self):
         """
         Returns log elements with level "SEVERE" or "WARNING"
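The decoded performance log entries follow the Chrome DevTools protocol envelope (`{'message': {'method': ..., 'params': ...}}` after `json.loads(logentry['message'])`). A sketch, not part of this commit, of how a consumer of the new `performance_log` field might count network responses:

```python
# Sketch: count DevTools network events in a decoded performance log,
# assuming the entry structure produced by json.loads above.
def count_responses(performance_log):
    return sum(
        1 for entry in performance_log
        if entry.get('message', {}).get('method') == 'Network.responseReceived'
    )
```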
cli.py (19 lines changed)

@@ -19,7 +19,7 @@ def handle_sigint(signum, frame):
 
 
 if __name__ == "__main__":
-    signal.signal(signal.SIGINT,handle_sigint)
+    signal.signal(signal.SIGINT, handle_sigint)
 
     parser = argparse.ArgumentParser()
 
@@ -40,9 +40,9 @@ if __name__ == "__main__":
    spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
     spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')
 
-    # jobs subcommand
-    jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
-    jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
+    # manager subcommand
+    manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
+    manager_parser.add_argument('--url', help='Add a job to spider a specific URL')
 
     # export subcommand
     export_parser = subparsers.add_parser('export', help='Export JSON data')
@@ -68,20 +68,21 @@ if __name__ == "__main__":
 
     logging.debug("Called command %s", args.command)
 
-    datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
+    if args.command == 'manager':
 
-    if args.command == 'jobs':
-
-        import jobs
-        jobs.create_jobs(datastore_client, args.url)
+        import manager
+        manager.create_jobs(args.url)
 
     elif args.command == 'export':
 
         import export
+        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
        export.export_results(datastore_client, args.kind)
 
     else:
         from spider import spider
+        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
 
         if args.url:
             # spider one URL for diagnostic purposes
             spider.test_url(args.url)
@@ -7,7 +7,7 @@ CONNECT_TIMEOUT = 5
 READ_TIMEOUT = 10
 
 # Git repo for our data
-GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
+GREEN_DIRECTORY_REPO = 'https://git.verdigado.com/NB-Public/green-directory.git'
 
 # folder in that repo that holds the data
 GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
@@ -15,9 +15,12 @@ GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
 # folder we use locally to clone the repo
 GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
 
-# IP address of the newthinking GCMS server
+# IP address of the verdigado GCMS server
 GCMS_IP = "194.29.234.123"
 
 # kind name of the spider job key datastore entities
 JOB_DATASTORE_KIND = 'spider-jobs'
+
+K8S_JOBS_PATH = './k8s-jobs'
+K8S_JOB_TEMPLATE = './manager/job_template.yaml'
+K8S_JOB_BATCH_SIZE = 10
@@ -50,7 +50,7 @@ devops/ssh.sh
 
 Hostname: `green-spider.netzbegruenung.de`
 
-```
+```shell
 docker-compose stop webapp
 docker run -it --rm -p 443:443 -p 80:80 --name certbot \
   -v /etc/letsencrypt:/etc/letsencrypt \
@@ -127,13 +127,11 @@ ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF
 
   echo ""
   echo "Install docker"
-  apt-get install -y docker-ce
+  apt-get install -y docker-ce docker-compose
 
   mkdir /root/secrets
 EOF
 
-echo "Done with remote setup."
-
 if [[ $1 == "screenshotter" ]]; then
   ### screenshotter
 
@@ -149,6 +147,41 @@ if [[ $1 == "screenshotter" ]]; then
     -v /root/secrets:/secrets \
     quay.io/netzbegruenung/green-spider-screenshotter
 
+elif [[ $1 == "spider-new" ]]
+then
+  # Some dependencies specific to this task
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP apt-get install -y python3-pip build-essential
+
+  # Upload some files
+  scp -o StrictHostKeyChecking=no -q secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
+  scp -o StrictHostKeyChecking=no -q docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml
+  scp -o StrictHostKeyChecking=no -q requirements.txt root@$SERVER_IP:/root/requirements.txt
+  scp -o StrictHostKeyChecking=no -q job.py root@$SERVER_IP:/root/job.py
+
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP pip3 install -r requirements.txt
+
+  # Bring up redis for the queue
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose pull redis
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose up -d redis
+  sleep 5
+
+  # Bring up queue manager
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose pull manager
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose up manager
+
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq info --url redis://localhost:6379/0
+
+  # Start worker and work off the queue once
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq worker --burst high default low --url redis://localhost:6379/0
+
+  # Re-queue failed jobs once, then re-execute.
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq requeue --queue low -u redis://localhost:6379 --all
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq info --url redis://localhost:6379/0
+
+  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq worker --burst high default low --url redis://localhost:6379/0
+
+  echo "Done with queued jobs."
+
 else
   ### spider
 
(deleted file)

@@ -1,16 +0,0 @@
-#!/bin/bash
-
-# Log in to webapp server via SSH
-
-API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
-test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
-source $API_TOKEN_SECRET
-
-source devops/functions.bash
-
-get_ip
-
-echo "Use this command for SSH access:"
-echo "ssh -o StrictHostKeyChecking=no root@${IP_IP}"
-
-ssh -o StrictHostKeyChecking=no root@${IP_IP}
docker-compose.yaml (new file)

@@ -0,0 +1,51 @@
+version: "2"
+services:
+
+  redis:
+    image: redis:5-alpine
+    command: redis-server --save "" --appendonly no
+    volumes:
+      - ${PWD}/volumes/redis-data:/data
+    restart: unless-stopped
+    networks:
+      - internal_network
+      - external_network
+    ports:
+      - "6379:6379"
+
+  # manager manages the job queue.
+  manager:
+    image: quay.io/netzbegruenung/green-spider:latest
+    command: >
+      python3 cli.py
+      --credentials-path /secrets/datastore-writer.json
+      --loglevel debug manager
+    environment:
+      REDIS_URL: redis://redis:6379/0
+      GIT_USERNAME: ${GIT_USERNAME}
+      GIT_PASSWORD: ${GIT_PASSWORD}
+    volumes:
+      - ${PWD}/secrets:/secrets
+    networks:
+      - internal_network
+      - external_network
+    depends_on:
+      - redis
+
+  dashboard:
+    image: eoranged/rq-dashboard:v0.6.1
+    environment:
+      RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
+    networks:
+      - internal_network
+      - external_network
+    ports:
+      - "9181:9181"
+    depends_on:
+      - redis
+
+networks:
+  internal_network:
+    internal: true
+  external_network:
+    internal: false
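With this stack running, the queue the manager fills can be inspected from Python. A minimal sketch, assuming the Redis port published above is reachable on localhost:

```python
# Sketch: inspect the 'low' queue that the manager service fills.
import os
import redis
from rq import Queue

redis_conn = redis.from_url(os.environ.get("REDIS_URL", "redis://localhost:6379/0"))
queue = Queue('low', connection=redis_conn)
print("%d jobs waiting in queue '%s'" % (len(queue), queue.name))
```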
job.py (new file)

@@ -0,0 +1,147 @@
+"""
+Dieses Script wird vom RQ worker ausgeführt, um einen einzelnen Job aus der
+Spider-Warteschlange abzuarbeiten.
+"""
+
+import json
+import os
+from datetime import datetime
+import time
+import logging
+
+import docker
+from google.cloud import datastore
+
+# Maximum oper-job runtime in seconds. This can be increased for second, third attempt
+# via the environment JOB_TIMEOUT variable.
+TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
+
+DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
+
+CREDENTIALS_PATH = '/secrets/datastore-writer.json'
+
+client = docker.from_env()
+low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
+
+datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)
+
+pwd = os.path.abspath(".")
+secrets_path = pwd + "/secrets"
+chromedir_path = pwd + "/volumes/chrome-userdir"
+screenshots_path = pwd + "/screenshots"
+
+volumes = {}
+volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'}
+volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'}
+volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'}
+
+logger = logging.getLogger('rq.worker')
+logger.setLevel(logging.DEBUG)
+
+
+def run(job):
+    """
+    Runs a spider container with the given job.
+
+    Returns the container logs. If the execution takes longer than the
+    duration defined by the JOB_TIMEOUT environment variable (in seconds),
+    the container gets killed.
+    """
+    cmd_template = ("python cli.py --credentials-path={path} "
+                    " --loglevel=debug "
+                    " spider "
+                    " --job='{job_json}'")
+
+    cmd = cmd_template.format(path=CREDENTIALS_PATH,
+                              job_json=json.dumps(job))
+
+    container = client.containers.run(image=DOCKER_IMAGE,
+                                      command=cmd,
+                                      detach=True,
+                                      remove=True,
+                                      shm_size='2G',
+                                      stdout=True,
+                                      stderr=True,
+                                      tty=False,
+                                      volumes=volumes)
+
+    id = container.id
+
+    # Data about this spider run, to be written to datastore
+    key = datastore_client.key('spider-runs')
+    entity = datastore.Entity(key=key)
+    results = {
+        'datetime': datetime.utcnow(),
+        'url': job['url'],
+        'success': True,
+        'error': '',
+        'duration_seconds': 0,
+        'cpu_usage_seconds': 0,
+        'network_received_bytes': 0,
+        'network_transmitted_bytes': 0,
+        'memory_max_bytes': 0,
+    }
+
+    # wait for finish
+    start = datetime.utcnow()
+    while True:
+        time.sleep(1)
+
+        clist = client.containers.list(filters={'id': id})
+        if len(clist) == 0:
+            break
+
+        for c in clist:
+
+            # Collect stats
+            try:
+                stats = low_level_client.stats(id, stream=False)
+
+                cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0
+                if 'networks' in stats:
+                    network_received_bytes = stats['networks']['eth0']['rx_bytes']
+                    network_transmitted_bytes = stats['networks']['eth0']['tx_bytes']
+
+                memory_max_bytes = 0
+                if 'max_usage' in stats['memory_stats']:
+                    memory_max_bytes = stats['memory_stats']['max_usage']
+                    results['memory_max_bytes'] = memory_max_bytes
+
+                #logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000))
+
+                if cpu_usage > 0:
+                    results['cpu_usage_seconds'] = round(cpu_usage)
+
+                if network_received_bytes > 0:
+                    results['network_received_bytes'] = network_received_bytes
+
+                if network_transmitted_bytes > 0:
+                    results['network_transmitted_bytes'] = network_transmitted_bytes
+
+            except docker.errors.APIError as e:
+                logger.error("Could not get stats: %s" % e)
+            except json.decoder.JSONDecodeError:
+                # This means we didn't get proper stats
+                pass
+
+            runtime = (datetime.utcnow() - start).seconds
+            results['duration_seconds'] = round(runtime)
+
+            #if c.status != "running":
+            #    logger.info("Container %s status: %s" % (c.id, c.status))
+
+            if c.status == "exited":
+                logger.debug("Container %s is exited." % c.id)
+                break
+
+            if runtime > TIMEOUT:
+                c.kill()
+                results['success'] = False
+                results['error'] = 'TIMEOUT'
+                entity.update(results)
+                datastore_client.put(entity)
+                raise Exception("Execution took too long. Killed container after %s seconds." % TIMEOUT)
+
+    entity.update(results)
+    datastore_client.put(entity)
+    return results
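In production, RQ invokes `job.run(job=...)` with an entry dict taken off the queue; for local testing it can be called directly. A sketch with illustrative values (requires a local Docker daemon and `./secrets/datastore-writer.json`, as the module sets up at import time):

```python
# Sketch: direct invocation for testing. The URL and job fields here are
# hypothetical examples, not taken from the commit.
import job

results = job.run({"url": "https://www.example.com/", "type": "REGIONAL_CHAPTER",
                   "level": "DE:ORTSVERBAND", "state": None, "district": None, "city": None})
print(results['success'], results['duration_seconds'])
```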
(new file)

@@ -0,0 +1,67 @@
+import config
+
+import os
+from datetime import datetime
+import time
+import random
+from pathlib import Path
+
+import kubernetes
+
+PENDING_LIMIT = 2
+RUNNING_LIMIT = 4
+
+INTERVAL = 10 # Seconds
+
+def main():
+
+    # Get jobs
+    jobs = list(Path("./k8s-jobs").rglob("*.yaml"))
+    random.seed()
+    random.shuffle(jobs)
+
+    kubernetes.config.load_kube_config(context='giantswarm-5jka7')
+    v1client = kubernetes.client.CoreV1Api()
+    k8sclient = kubernetes.client.ApiClient()
+
+    start = datetime.utcnow()
+    jobs_queued = 0
+
+    while len(jobs) > 0:
+        # Check whether there are pods pending
+        pending_pods = v1client.list_pod_for_all_namespaces(
+            watch=False,
+            field_selector='status.phase=Pending',
+            label_selector='app=green-spider')
+        pending = list(pending_pods.items)
+
+        # Get running pods
+        running_pods = v1client.list_pod_for_all_namespaces(
+            watch=False,
+            field_selector='status.phase=Running',
+            label_selector='app=green-spider')
+        running = list(running_pods.items)
+
+        now = datetime.utcnow()
+        duration = now - start
+
+        # Add new job to the queue
+        if len(pending) < PENDING_LIMIT and len(running) < RUNNING_LIMIT:
+            to_be_queued = RUNNING_LIMIT - len(running)
+            for _ in range(to_be_queued):
+                job_path = jobs.pop(0)
+                jobs_queued += 1
+
+                duration_per_job = duration / jobs_queued
+                jobs_remaining = len(jobs)
+
+                print(f'{jobs_queued} jobs queued in {duration} - {jobs_remaining} jobs (estimated {duration_per_job * jobs_remaining}) remaining at {int(duration_per_job.total_seconds())} seconds per job on average')
+                kubernetes.utils.create_from_yaml(k8sclient, job_path)
+                os.remove(job_path)
+
+        time.sleep(INTERVAL)
+
+    print('No more jobs left. Done.')
+
+if __name__ == '__main__':
+    main()
(new file: Kubernetes Job manifest)

@@ -0,0 +1,67 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: green-spider-job-1
+  namespace: marian
+  labels:
+    app: green-spider
+spec:
+  activeDeadlineSeconds: 120
+  ttlSecondsAfterFinished: 600
+  completions: 1
+  backoffLimit: 3
+
+  # Pod template
+  template:
+    metadata:
+      name: green-spider-job
+      namespace: marian
+      labels:
+        app: green-spider
+    spec:
+      restartPolicy: Never
+      nodeSelector:
+        giantswarm.io/machine-pool: 5n27k
+      affinity:
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - labelSelector:
+                matchExpressions:
+                  - key: app
+                    operator: In
+                    values:
+                      - green-spider
+              topologyKey: topology.kubernetes.io/region
+      containers:
+        - name: spider
+          image: quay.io/netzbegruenung/green-spider:kubernetes
+          imagePullPolicy: IfNotPresent
+          command:
+            - python
+            - cli.py
+            - --credentials-path=/secrets/datastore-writer.json
+            - --loglevel=debug
+            - spider
+            - '--job={"url":"https://www.gruene.de/","type":"PARTY","level":"DE:BUNDESVERBAND","state":null,"district":null,"city":null}'
+          volumeMounts:
+            - name: secrets
+              mountPath: "/secrets"
+              readOnly: true
+            - name: shared
+              mountPath: /dev/shm
+          resources:
+            requests:
+              cpu: 1000m
+              memory: 5000M
+      volumes:
+        - name: secrets
+          secret:
+            secretName: green-spider
+            items:
+              - key: datastore-writer.json
+                path: datastore-writer.json
+              - key: screenshots-uploader.json
+                path: screenshots-uploader.json
+        - name: shared
+          emptyDir: {}
(new file: PodSecurityPolicy manifest)

@@ -0,0 +1,18 @@
+apiVersion: policy/v1beta1
+kind: PodSecurityPolicy
+metadata:
+  name: green-spider-job-psp
+  namespace: marian
+spec:
+  privileged: false
+  seLinux:
+    rule: RunAsAny
+  supplementalGroups:
+    rule: RunAsAny
+  runAsUser:
+    rule: RunAsAny
+  fsGroup:
+    rule: RunAsAny
+  volumes:
+    - emptyDir
+    - secret
@ -1,21 +1,26 @@
|
||||||
"""
|
"""
|
||||||
The jobs module allows to create jobs for the queue and take jobs off the queue
|
The manager module allows to fill the RQ job queue.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
import logging
|
import logging
|
||||||
|
import math
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import shutil
|
import shutil
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
from git import Repo
|
from git import Repo
|
||||||
import tenacity
|
from rq import Queue
|
||||||
|
import redis
|
||||||
import yaml
|
import yaml
|
||||||
from google.api_core.exceptions import Aborted
|
from yaml import Loader
|
||||||
from google.cloud import datastore
|
from hashlib import sha256
|
||||||
|
|
||||||
import config
|
import config
|
||||||
|
|
||||||
|
REDIS_URL = os.environ.get("REDIS_URL", "redis://redis:6379/0")
|
||||||
|
|
||||||
def clone_data_directory():
|
def clone_data_directory():
|
||||||
"""
|
"""
|
||||||
|
@ -40,7 +45,7 @@ def directory_entries():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
with open(filepath, 'r', encoding='utf8') as yamlfile:
|
with open(filepath, 'r', encoding='utf8') as yamlfile:
|
||||||
for doc in yaml.load_all(yamlfile):
|
for doc in yaml.load_all(yamlfile, Loader=Loader):
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
|
|
||||||
|
@ -53,7 +58,7 @@ def chunks(the_list, size):
|
||||||
yield the_list[i:i + size]
|
yield the_list[i:i + size]
|
||||||
|
|
||||||
|
|
||||||
def create_jobs(datastore_client, url=None):
|
def create_jobs(url=None):
|
||||||
"""
|
"""
|
||||||
Read all URLs from green directory and fill a job database
|
Read all URLs from green directory and fill a job database
|
||||||
with one job per URL.
|
with one job per URL.
|
||||||
|
@ -62,6 +67,18 @@ def create_jobs(datastore_client, url=None):
|
||||||
will be added as a spider job.
|
will be added as a spider job.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
logging.info('Waiting for redis at %s' % REDIS_URL)
|
||||||
|
redis_success = False
|
||||||
|
while not redis_success:
|
||||||
|
try:
|
||||||
|
redis_conn = redis.from_url(REDIS_URL)
|
||||||
|
redis_success = True
|
||||||
|
except Exception as ex:
|
||||||
|
logging.error(ex)
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
queue = Queue('low', connection=redis_conn)
|
||||||
|
|
||||||
# refresh our local clone of the green directory
|
# refresh our local clone of the green directory
|
||||||
logging.info("Refreshing green-directory clone")
|
logging.info("Refreshing green-directory clone")
|
||||||
clone_data_directory()
|
clone_data_directory()
|
||||||
|
@ -104,7 +121,7 @@ def create_jobs(datastore_client, url=None):
|
||||||
logging.error("Error in %s: 'url' key missing (%s)",
|
logging.error("Error in %s: 'url' key missing (%s)",
|
||||||
repr_entry(entry), entry['urls'][index])
|
repr_entry(entry), entry['urls'][index])
|
||||||
|
|
||||||
# ensure the passed URL argument is really there, even if not part
|
# Ensure the passed URL argument is really there, even if not part
|
||||||
# of the directory.
|
# of the directory.
|
||||||
if url and count == 0:
|
if url and count == 0:
|
||||||
logging.info("Adding job for URL %s which is not part of green-directory", url)
|
logging.info("Adding job for URL %s which is not part of green-directory", url)
|
||||||
|
@@ -115,55 +132,63 @@ def create_jobs(datastore_client, url=None):
             "state": None,
             "district": None,
             "city": None,
-            "index": int(random.uniform(1000000, 9999999)),
         })
 
     count = 0
+    errorcount = 0
     logging.info("Writing jobs")
 
-    entities = []
+    count = 0
 
     for entry in input_entries:
-        key = datastore_client.key(config.JOB_DATASTORE_KIND, entry["url"])
-        entity = datastore.Entity(key=key)
-        entity.update({
-            "created": datetime.utcnow(),
-            "type": entry["type"],
-            "level": entry["level"],
-            "state": entry["state"],
-            "district": entry["district"],
-            "city": entry["city"],
-            "index": int(random.uniform(1000000, 9999999)),
-        })
-        entities.append(entity)
-
-    # commmit to DB
-    for chunk in chunks(entities, 300):
-        logging.debug("Writing jobs chunk of length %d", len(chunk))
-        datastore_client.put_multi(chunk)
-        count += len(chunk)
+        try:
+            _ = queue.enqueue('job.run',
+                job_timeout='300s',
+                at_front=random.choice([True, False]),
+                # keyword args passed on to the job function
+                kwargs={
+                    'job': entry,
+                })
+
+            # Print job for debugging purposes
+            print(json.dumps(entry))
+
+            #logging.debug("Added job with ID %s for URL %s" % (enqueued_job.id, entry['url']))
+            count += 1
+        except Exception as e:
+            errorcount += 1
+            logging.error("Error adding job for URL %s: %s" % (entry['url'], e))
+
+        # Write kubernetes Job
+        make_k8s_job(entry, count)
+
+        count += 1
 
     logging.info("Writing jobs done, %s jobs added", count)
+    logging.info("%d errors while writing jobs", errorcount)
 
 
-@tenacity.retry(wait=tenacity.wait_exponential(),
-                retry=tenacity.retry_if_exception_type(Aborted))
-def get_job_from_queue(datastore_client):
-    """
-    Returns a URL from the queue
-    """
-    out = None
-
-    with datastore_client.transaction():
-        query = datastore_client.query(kind=config.JOB_DATASTORE_KIND,
-                                       order=['index'])
-        for entity in query.fetch(limit=1):
-            logging.debug("Got job: %s", entity)
-            out = dict(entity)
-            out["url"] = entity.key.name
-            datastore_client.delete(entity.key)
-
-    return out
+def make_k8s_job(job_data, count):
+    now = datetime.utcnow().strftime('%Y%m%d%H%M')
+    urlhash = sha256(job_data['url'].encode('utf-8')).hexdigest()[0:12]
+    job_name = f'gs-{now}-{urlhash}'
+    filename = f'{job_name}.yaml'
+    batch_folder = math.floor(count / config.K8S_JOB_BATCH_SIZE)
+    output_dir = os.path.join(config.K8S_JOBS_PATH, str(batch_folder))
+    os.makedirs(output_dir, exist_ok=True)
+    output_path = os.path.join(output_dir, filename)
+    job_json = json.dumps(job_data)
+    job_flag = f'\'--job={job_json}\''
+
+    with open(config.K8S_JOB_TEMPLATE, "r") as template_file:
+        template = template_file.read()
+
+    template = template.replace('JOB_NAME', job_name)
+    template = template.replace('POD_NAME', job_name)
+    template = template.replace('JOB_FLAG', job_flag)
+
+    with open(output_path, "w") as output:
+        output.write(template)
 
 
 def repr_entry(entry):
     """
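To illustrate the templating in make_k8s_job above, a small sketch reproducing its name and JOB_FLAG derivation for one entry; the sample entry contents below are invented for the example.

# Illustration only: mirrors make_k8s_job's derivation of the job name and
# the --job flag. The sample entry is made up.
import json
from datetime import datetime
from hashlib import sha256

job_data = {'url': 'https://example-kv.de/', 'type': 'REGIONAL_CHAPTER'}

now = datetime.utcnow().strftime('%Y%m%d%H%M')
urlhash = sha256(job_data['url'].encode('utf-8')).hexdigest()[0:12]
print(f'gs-{now}-{urlhash}')                # job/pod name, e.g. gs-202110312359-a1b2c3d4e5f6
print(f'\'--job={json.dumps(job_data)}\'')  # value substituted for JOB_FLAG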
@@ -0,0 +1,67 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: JOB_NAME
+  namespace: marian
+  labels:
+    app: green-spider
+spec:
+  activeDeadlineSeconds: 600
+  ttlSecondsAfterFinished: 600
+  completions: 1
+  backoffLimit: 3
+
+  # Pod template
+  template:
+    metadata:
+      name: POD_NAME
+      namespace: marian
+      labels:
+        app: green-spider
+    spec:
+      restartPolicy: Never
+      nodeSelector:
+        giantswarm.io/machine-pool: 5n27k
+      # affinity:
+      #   podAntiAffinity:
+      #     requiredDuringSchedulingIgnoredDuringExecution:
+      #       - labelSelector:
+      #           matchExpressions:
+      #             - key: app
+      #               operator: In
+      #               values:
+      #                 - green-spider
+      #         topologyKey: topology.kubernetes.io/region
+      containers:
+        - name: spider
+          image: quay.io/netzbegruenung/green-spider:20211031-chromium93
+          imagePullPolicy: IfNotPresent
+          command:
+            - python3
+            - cli.py
+            - --credentials-path=/secrets/datastore-writer.json
+            - --loglevel=debug
+            - spider
+            - JOB_FLAG
+          volumeMounts:
+            - name: secrets
+              mountPath: "/secrets"
+              readOnly: true
+            - name: shared
+              mountPath: /dev/shm
+          resources:
+            requests:
+              cpu: 1000m
+              memory: 5000M
+      volumes:
+        - name: secrets
+          secret:
+            secretName: green-spider
+            items:
+              - key: datastore-writer.json
+                path: datastore-writer.json
+              - key: screenshots-uploader.json
+                path: screenshots-uploader.json
+        - name: shared
+          emptyDir: {}
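This is the template that make_k8s_job fills in (JOB_NAME, POD_NAME, JOB_FLAG) per URL. A hedged sketch of submitting the generated batch folders follows; it assumes kubectl is configured for the target cluster and that config.K8S_JOBS_PATH points at 'k8s-jobs'.

# Hypothetical submit helper (not in this commit): apply generated job
# manifests batch folder by batch folder.
import os
import subprocess

K8S_JOBS_PATH = 'k8s-jobs'  # assumption; the real value lives in config

for batch in sorted(os.listdir(K8S_JOBS_PATH)):
    # 'kubectl apply -f <dir>' applies every manifest in that directory.
    subprocess.run(['kubectl', 'apply', '-f', os.path.join(K8S_JOBS_PATH, batch)],
                   check=True)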
@@ -10,6 +10,8 @@ from rating import contact_link
 from rating import favicon
 from rating import feeds
 from rating import https
+from rating import network_payload
+from rating import network_requests
 from rating import no_network_errors
 from rating import no_script_errors
 from rating import no_third_party_cookies
@@ -39,6 +41,8 @@ def calculate_rating(results):
         'FEEDS': feeds,
         'HTTPS': https,
         'HTTP_RESPONSE_DURATION': response_duration,
+        'NETWORK_PAYLOAD': network_payload,
+        'NETWORK_REQUESTS': network_requests,
         'NO_NETWORK_ERRORS': no_network_errors,
         'NO_SCRIPT_ERRORS': no_script_errors,
         'NO_THIRD_PARTY_COOKIES': no_third_party_cookies,
@@ -0,0 +1,57 @@
+"""
+This rater evaluates the amount of data transferred for a page load.
+
+Currently no score is given. The plan, however, is to reward sites that
+cause smaller transfers.
+
+The rater uses Chrome performance log messages of type
+'Network.loadingFinished'.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'number'
+    default_value = 0
+    depends_on_checks = ['load_in_browser']
+    max_score = 1.0
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        payloads_for_urls = []
+
+        for url in self.check_results['load_in_browser']:
+            payload = 0
+
+            if (self.check_results['load_in_browser'][url]['performance_log'] == [] or
+                    self.check_results['load_in_browser'][url]['performance_log'] is None):
+                continue
+
+            for lentry in self.check_results['load_in_browser'][url]['performance_log']:
+                if lentry['message']['method'] == 'Network.loadingFinished':
+                    payload += lentry['message']['params']['encodedDataLength']
+
+            payloads_for_urls.append(payload)
+
+        # Calculate score based on the largest value found for a URL.
+        # See https://github.com/netzbegruenung/green-spider/issues/11#issuecomment-600307544
+        # for details.
+        if len(payloads_for_urls) > 0:
+            value = max(payloads_for_urls)
+            if value < 994000:
+                score = 1
+            elif value < 1496000:
+                score = .5
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
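The rater above expects load_in_browser results to carry Chrome DevTools performance log entries whose 'message' field is already decoded to a dict. As a sketch of how such entries can be captured with Selenium 3 and Chromium (the capability setup is an assumption, not a copy of the check's code):

# Sketch, assumptions flagged: capture performance log entries in the shape
# the raters read (entry['message']['method'], entry['message']['params']).
import json

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

capabilities = DesiredCapabilities.CHROME.copy()
capabilities['goog:loggingPrefs'] = {'performance': 'ALL'}  # enable the log

driver = webdriver.Chrome(desired_capabilities=capabilities)
driver.get('https://example.org/')

performance_log = []
for raw in driver.get_log('performance'):
    entry = dict(raw)
    # Selenium returns 'message' as a JSON string; decode it once so the
    # raters can index entry['message']['method'] directly.
    entry['message'] = json.loads(entry['message'])['message']
    performance_log.append(entry)

driver.quit()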
@@ -0,0 +1,57 @@
+"""
+This rater evaluates the number of network requests made.
+
+Currently no score is given. The plan, however, is to reward sites that
+use only a few requests.
+
+The rater uses Chrome performance log messages of type
+'Network.requestWillBeSent'.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'number'
+    default_value = 0
+    depends_on_checks = ['load_in_browser']
+    max_score = 1.0
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        num_requests_for_urls = []
+
+        for url in self.check_results['load_in_browser']:
+            num_requests = 0
+
+            if (self.check_results['load_in_browser'][url]['performance_log'] == [] or
+                    self.check_results['load_in_browser'][url]['performance_log'] is None):
+                continue
+
+            for lentry in self.check_results['load_in_browser'][url]['performance_log']:
+                if lentry['message']['method'] == 'Network.requestWillBeSent':
+                    num_requests += 1
+
+            num_requests_for_urls.append(num_requests)
+
+        # Calculate score based on the largest value found for a URL.
+        # See https://github.com/netzbegruenung/green-spider/issues/11#issuecomment-600307544
+        # for details.
+        if len(num_requests_for_urls) > 0:
+            value = max(num_requests_for_urls)
+            if value <= 28:
+                score = 1.0
+            elif value <= 38:
+                score = 0.5
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
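A usage sketch for this rater with a fabricated check result: 25 'Network.requestWillBeSent' entries stay below the 28-request threshold, so the score is 1.0. It assumes AbstractRater's constructor simply stores check_results.

# Fabricated input for illustration; not real check output.
from rating.network_requests import Rater

check_results = {
    'load_in_browser': {
        'https://example.org/': {
            'performance_log': [
                {'message': {'method': 'Network.requestWillBeSent', 'params': {}}}
            ] * 25,
        },
    },
}

print(Rater(check_results).rate())
# -> {'type': 'number', 'value': 25, 'score': 1.0, 'max_score': 1.0}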
@@ -9,7 +9,7 @@ from rating.abstract_rater import AbstractRater
 class Rater(AbstractRater):
 
     rating_type = 'number'
-    default_value = False
+    default_value = 0
     depends_on_checks = ['page_content']
     max_score = 1.0
 
@@ -1,16 +1,42 @@
-beautifulsoup4==4.8.1
+beautifulsoup4==4.9.3
+cachetools==4.2.2
+certifi==2020.12.5
+cffi==1.14.5
+chardet==3.0.4
+click==7.1.2
+cssselect==1.1.0
 dnspython==1.16.0
-feedparser==5.2.1
-GitPython
-google-cloud-datastore==1.10.0
-html-similarity==0.3.2
+docker==4.4.1
+feedparser==6.0.8
+gitdb==4.0.7
+GitPython==3.1.14
+google-api-core==1.26.3
+google-auth==1.30.0
+google-cloud-core==1.6.0
+google-cloud-datastore==1.15.3
+google-cloud-storage==1.38.0
+googleapis-common-protos==1.53.0
+html-similarity==0.3.3
 httpretty==0.9.7
-pyopenssl==18.0.0
-PyYAML
-requests==2.22.0
-responses==0.10.15
-# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
-selenium==3.8.0
+idna==2.10
+parsel==1.6.0
+protobuf==3.15.8
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
+pycparser==2.20
+pyOpenSSL==20.0.1
+pytz==2021.1
+PyYAML==5.4.1
+redis==3.5.3
+requests==2.25.1
+responses==0.13.3
+rq==1.8.0
+rsa==4.7.2
+selenium==3.141.0
+smmap==3.0.4
 smmap2==2.0.5
-tenacity==5.0.2
-urllib3==1.25.9
+soupsieve==2.2.1
+tenacity==5.1.5
+urllib3==1.26.4
+w3lib==1.22.0
+websocket-client==0.59.0
@@ -16,7 +16,7 @@ from google.cloud import datastore
 
 import checks
 import config
-import jobs
+import manager
 import rating
 
 def check_and_rate_site(entry):
@@ -54,10 +54,17 @@ def check_and_rate_site(entry):
     for key in result['rating']:
         result['score'] += result['rating'][key]['score']
 
-    # remove full HTML page content and hyperlinks to safe some storage
+    # Remove bigger result portions to save some storage:
+    # - HTML page content
+    # - Hyperlinks
+    # - Performance log
     try:
         for url in result['checks']['page_content']:
             del result['checks']['page_content'][url]['content']
+
+        for url in result['checks']['load_in_browser']:
+            del result['checks']['load_in_browser'][url]['performance_log']
+
         del result['checks']['hyperlinks']
     except:
         pass
@@ -80,6 +87,7 @@ def test_url(url):
     result = check_and_rate_site(entry=job)
     pprint(result)
 
+
 def execute_single_job(datastore_client, job, entity_kind):
     """
     Executes spider for one single job
@@ -103,9 +111,11 @@ def execute_single_job(datastore_client, job, entity_kind):
         'rating': result['rating'],
         'score': result['score'],
     }
+
     entity.update(record)
     try:
         datastore_client.put(entity)
+        logging.debug("Successfully wrote record to database")
     except InvalidArgument as ex:
         logging.error("Could not write result: %s", ex)
     except Exception as ex:
@@ -116,7 +126,7 @@ def work_of_queue(datastore_client, entity_kind):
     Take job from queue and finish it until there are no more jobs
     """
     while True:
-        job = jobs.get_job_from_queue(datastore_client)
+        job = manager.get_job_from_queue(datastore_client)
         if job is None:
             logging.info("No more jobs. Exiting.")
             break