diff --git a/Dockerfile b/Dockerfile
index ff3afa2..70e5e82 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,15 @@
 FROM python:3.6-alpine3.8
 
-# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
+WORKDIR /workdir
+
+ADD requirements.txt /workdir/
+
 RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
     echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
     apk update && \
     apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
     pip3 install --upgrade pip && \
-    pip3 install dnspython==1.16.0 selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 feedparser==5.2.1 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
+    pip3 install -r requirements.txt && \
     apk del python3-dev build-base
 
 ADD cli.py /
diff --git a/README.md b/README.md
index e2c32a7..cadb2bc 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ After the first successful run, a new invocation of `make` takes only a
 
 In short: `make test`
 
-### Running the spider
+### Running the spider for testing (debugging)
 
 The spider can process individual URLs without writing the results to a database. The easiest way to do this is via the `make spider` command, like so:
 
@@ -58,3 +58,17 @@ make spider ARGS="--url http://www.example.com/"
 ```
 
 Called without `ARGS`, the spider works through a job list. This requires access to the corresponding database.
+
+To spider a single site while still writing the results to the database, the spider can be invoked with `--job` and a JSON object, like this (example):
+
+```
+docker run --rm -ti \
+  -v $(pwd)/volumes/dev-shm:/dev/shm \
+  -v $(pwd)/secrets:/secrets \
+  -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
+  --shm-size=2g \
+  quay.io/netzbegruenung/green-spider:latest \
+  --credentials-path /secrets/datastore-writer.json \
+  --loglevel debug \
+  spider --job '{"url": "https://xn--grne-porta-westfalica-9hc.de/", "meta": {"city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state": "Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}}'
+```
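+
+Since `validate_job` in `spider/spider.py` only requires the `url` attribute, the `meta` object can presumably be omitted for a quick test run. A minimal sketch (hypothetical URL, otherwise the same call as above):
+
+```
+docker run --rm -ti \
+  -v $(pwd)/volumes/dev-shm:/dev/shm \
+  -v $(pwd)/secrets:/secrets \
+  -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
+  --shm-size=2g \
+  quay.io/netzbegruenung/green-spider:latest \
+  --credentials-path /secrets/datastore-writer.json \
+  --loglevel debug \
+  spider --job '{"url": "https://www.example.com/"}'
+```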
diff --git a/checks/load_feeds.py b/checks/load_feeds.py
index 9cae450..6d3a4e0 100644
--- a/checks/load_feeds.py
+++ b/checks/load_feeds.py
@@ -67,7 +67,7 @@ class Checker(AbstractChecker):
         data = feedparser.parse(feed_url)
 
         if 'bozo_exception' in data:
-            result['exception'] = data['bozo_exception']
+            result['exception'] = str(data['bozo_exception'])
 
         if 'headers' not in data:
             return result
diff --git a/cli.py b/cli.py
index 2d41d4b..0dcb236 100644
--- a/cli.py
+++ b/cli.py
@@ -6,6 +6,7 @@ import argparse
 import logging
 import signal
 import sys
+import json
 
 from google.cloud import datastore
 
@@ -37,7 +38,8 @@ if __name__ == "__main__":
     spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
     spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
     spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
-
+    spider_parser.add_argument('--job', help='Job JSON object. Spider the URL in this job, write the result back, and exit.')
+
     # jobs subcommand
     jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
     jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
@@ -83,5 +85,8 @@ if __name__ == "__main__":
     if args.url:
         # spider one URL for diagnostic purposes
         spider.test_url(args.url)
+    elif args.job:
+        job = json.loads(args.job)
+        spider.execute_single_job(datastore_client, job, args.kind)
     else:
         spider.work_of_queue(datastore_client, args.kind)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d0a34d5
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+beautifulsoup4==4.6.0
+dnspython==1.16.0
+feedparser==5.2.1
+GitPython
+google-cloud-datastore==1.10.0
+html-similarity==0.3.2
+httpretty==0.9.4
+pyopenssl==18.0.0
+PyYAML
+requests==2.18.4
+responses==0.9.0
+# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
+selenium==3.8.0
+smmap2==2.0.3
+tenacity==5.0.2
+urllib3==1.22
diff --git a/spider/spider.py b/spider/spider.py
index 8ad110e..2b08251 100644
--- a/spider/spider.py
+++ b/spider/spider.py
@@ -78,7 +78,38 @@ def test_url(url):
     }
 
     result = check_and_rate_site(entry=job)
-    pprint(result['rating'])
+    pprint(result)
+
+def execute_single_job(datastore_client, job, entity_kind):
+    """
+    Executes the spider for a single job
+    """
+    validate_job(job)
+
+    logging.info("Starting job %s", job["url"])
+    result = check_and_rate_site(entry=job)
+
+    logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
+
+    logging.info("Job %s finished checks", job["url"])
+    logging.info("Job %s writing to DB", job["url"])
+
+    key = datastore_client.key(entity_kind, job["url"])
+    entity = datastore.Entity(key=key)
+    record = {
+        'created': datetime.utcnow(),
+        'meta': result['meta'],
+        'checks': result['checks'],
+        'rating': result['rating'],
+        'score': result['score'],
+    }
+    entity.update(record)
+    try:
+        datastore_client.put(entity)
+    except InvalidArgument as ex:
+        logging.error("Could not write result: %s", ex)
+    except Exception as ex:
+        logging.error("Could not write result: %s", ex)
 
 def work_of_queue(datastore_client, entity_kind):
     """
@@ -90,28 +121,8 @@ def work_of_queue(datastore_client, entity_kind):
     """
             logging.info("No more jobs. Exiting.")
             break
 
-        logging.info("Starting job %s", job["url"])
-        result = check_and_rate_site(entry=job)
-
-        logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
-
-        logging.info("Job %s finished checks", job["url"])
-        logging.info("Job %s writing to DB", job["url"])
-
-        key = datastore_client.key(entity_kind, job["url"])
-        entity = datastore.Entity(key=key)
-        record = {
-            'created': datetime.utcnow(),
-            'meta': result['meta'],
-            'checks': result['checks'],
-            'rating': result['rating'],
-            'score': result['score'],
-        }
-        entity.update(record)
-        try:
-            datastore_client.put(entity)
-        except InvalidArgument as ex:
-            logging.error("Could not write result: %s", ex)
-        except Exception as ex:
-            logging.error("Could not write result: %s", ex)
+        execute_single_job(datastore_client, job, entity_kind)
+
+def validate_job(jobdict):
+    if "url" not in jobdict:
+        raise Exception("Job does not have required 'url' attribute")
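For reference, the new `--job` branch in `cli.py` boils down to the following Python-level flow. This is a minimal sketch, assuming the `spider` package layout shown in this diff and that Datastore credentials are supplied via the standard `GOOGLE_APPLICATION_CREDENTIALS` environment variable instead of the `--credentials-path` flag:

```
import json

from google.cloud import datastore

from spider import spider

# The job JSON string, exactly as it would be passed to --job on the CLI.
# Only "url" is required; validate_job() raises if it is missing.
job = json.loads('{"url": "https://www.example.com/"}')

# execute_single_job() runs all checks, rates the site, and writes one
# entity of kind "spider-results" (the --kind default) back to Datastore.
client = datastore.Client()
spider.execute_single_job(client, job, "spider-results")
```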