Fix for exception data in the result that cannot be written, and spidering of individual sites (#132)

* WIP commit for single job execution

* Convert exception to string

* Pass more arguments

* Move Python modules list into requirements.txt

* Document single site spidering

* Remove debugging
Marian Steinbach 2019-11-22 23:13:57 +01:00 committed by GitHub
parent 725ed5439d
commit b3bb8f34c3
6 changed files with 79 additions and 30 deletions

Dockerfile

@@ -1,12 +1,15 @@
FROM python:3.6-alpine3.8
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
WORKDIR /workdir
ADD requirements.txt /workdir/
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
apk update && \
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
pip3 install --upgrade pip && \
pip3 install dnspython==1.16.0 selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 feedparser==5.2.1 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
pip3 install -r requirements.txt && \
apk del python3-dev build-base
ADD cli.py /


@@ -48,7 +48,7 @@ After the first successful run, a new invocation of `make` takes only …
In short: `make test`
### Run the spider
### Run the spider for testing (debugging)
The spider can process individual URLs without writing the results to a database.
The easiest way to do this is via the `make spider` command, like so:
@@ -58,3 +58,17 @@ make spider ARGS="--url http://www.example.com/"
```
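Behind the scenes, the `--url` mode wraps the URL into a throwaway job, runs the checks, and prints the full result instead of writing it anywhere (see the `test_url()` change in spider.py at the bottom of this diff). A rough, self-contained sketch of that flow; `check_and_rate_site` is stubbed here, the real implementation lives in spider.py:

```python
from pprint import pprint

def check_and_rate_site(entry):
    # Stub: the real function in spider.py runs all checks and computes the rating.
    return {"url": entry["url"], "checks": {}, "rating": {}}

def test_url(url):
    job = {"url": url}                       # one-off job, never queued
    result = check_and_rate_site(entry=job)
    pprint(result)                           # printed for debugging, nothing is written

test_url("http://www.example.com/")
```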
When called without `ARGS`, the spider works through a job list. This requires access to the corresponding database.
If only a single site is to be spidered, but the results should still be written to the database, the spider can be invoked with `--job` and a JSON object like this (example):
```
docker run --rm -ti \
-v $(pwd)/volumes/dev-shm:/dev/shm \
-v $(pwd)/secrets:/secrets \
-v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
--shm-size=2g \
quay.io/netzbegruenung/green-spider:latest \
--credentials-path /secrets/datastore-writer.json \
--loglevel debug \
spider --job '{"url": "https://xn--grne-porta-westfalica-9hc.de/", "meta": {"city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}}'
```
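Per `validate_job()` (see the spider.py diff at the bottom), only the `url` attribute is strictly required; the `meta` block in the example above is additional context. A minimal sketch of how the `--job` string becomes a job dict, mirroring the `json.loads` call added to cli.py:

```python
import json

# The --job argument arrives as a JSON string and is parsed into a plain dict.
job = json.loads('{"url": "https://www.example.com/"}')  # hypothetical minimal job
assert "url" in job  # validate_job() in spider.py raises if this is missing
```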


@@ -67,7 +67,7 @@ class Checker(AbstractChecker):
data = feedparser.parse(feed_url)
if 'bozo_exception' in data:
result['exception'] = data['bozo_exception']
result['exception'] = str(data['bozo_exception'])
if 'headers' not in data:
return result
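This is the fix referenced in the commit title: when a feed cannot be parsed, feedparser stores the error as an exception object under `bozo_exception`, and an exception instance cannot be serialized when the result is written. Storing `str(...)` keeps the information while making the result writable. A minimal, self-contained illustration (the malformed feed string is made up):

```python
import feedparser

data = feedparser.parse("<rss><channel><title>broken")  # deliberately malformed feed
result = {}
if 'bozo_exception' in data:
    # Store the string form; the raw exception object could not be written
    # to the results store.
    result['exception'] = str(data['bozo_exception'])
print(result)
```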

cli.py

@@ -6,6 +6,7 @@ import argparse
import logging
import signal
import sys
import json
from google.cloud import datastore
@@ -37,6 +38,7 @@ if __name__ == "__main__":
spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')
# jobs subcommand
jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
@@ -83,5 +85,8 @@ if __name__ == "__main__":
if args.url:
# spider one URL for diagnostic purposes
spider.test_url(args.url)
elif args.job:
job = json.loads(args.job)
spider.execute_single_job(datastore_client, job, args.kind)
else:
spider.work_of_queue(datastore_client, args.kind)

requirements.txt (new file)

@@ -0,0 +1,16 @@
beautifulsoup4==4.6.0
dnspython==1.16.0
feedparser==5.2.1
GitPython
google-cloud-datastore==1.10.0
html-similarity==0.3.2
httpretty==0.9.4
pyopenssl==18.0.0
PyYAML
requests==2.18.4
responses==0.9.0
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
selenium==3.8.0
smmap2==2.0.3
tenacity==5.0.2
urllib3==1.22

spider.py

@@ -78,17 +78,13 @@ def test_url(url):
}
result = check_and_rate_site(entry=job)
pprint(result['rating'])
pprint(result)
def work_of_queue(datastore_client, entity_kind):
def execute_single_job(datastore_client, job, entity_kind):
"""
Take job from queue and finish it until there are no more jobs
Executes spider for one single job
"""
while True:
job = jobs.get_job_from_queue(datastore_client)
if job is None:
logging.info("No more jobs. Exiting.")
break
validate_job(job)
logging.info("Starting job %s", job["url"])
result = check_and_rate_site(entry=job)
@@ -115,3 +111,18 @@ def work_of_queue(datastore_client, entity_kind):
except Exception as ex:
logging.error("Could not write result: %s", ex)
def work_of_queue(datastore_client, entity_kind):
"""
Take job from queue and finish it until there are no more jobs
"""
while True:
job = jobs.get_job_from_queue(datastore_client)
if job is None:
logging.info("No more jobs. Exiting.")
break
execute_single_job(datastore_client, job, entity_kind)
def validate_job(jobdict):
if "url" not in jobdict:
raise Exception("Job does not have required 'url' attribute")
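Taken together, the refactoring pulls the per-job work out of the queue loop into `execute_single_job()`, so the new `--job` mode and regular queue processing share the same code path. A simplified, self-contained sketch of the resulting control flow (the queue and the spidering step are stubbed with hypothetical stand-ins; the real functions use the Datastore client):

```python
QUEUE = [{"url": "https://www.example.com/"}]  # stand-in for the Datastore job queue

def get_job_from_queue():
    return QUEUE.pop() if QUEUE else None

def execute_single_job(job):
    if "url" not in job:  # what validate_job() checks
        raise Exception("Job does not have required 'url' attribute")
    print("Starting job", job["url"])  # stands in for check_and_rate_site() plus the write

def work_of_queue():
    while True:
        job = get_job_from_queue()
        if job is None:
            break  # no more jobs, exit
        execute_single_job(job)

work_of_queue()
```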