Problembehebung mit Exception-Daten im Ergebnis, die nicht geschrieben werden können, und Spidern einzelner Sites (#132)
* WIP commit for single job execution * Convert exception to string * Pass more arguments * Move python modules list into requirements.txt * Document single site spidering * Remove debugging
This commit is contained in:
parent
725ed5439d
commit
b3bb8f34c3
|
@ -1,12 +1,15 @@
|
|||
FROM python:3.6-alpine3.8
|
||||
|
||||
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
|
||||
WORKDIR /workdir
|
||||
|
||||
ADD requirements.txt /workdir/
|
||||
|
||||
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
|
||||
echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
|
||||
apk update && \
|
||||
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
|
||||
pip3 install --upgrade pip && \
|
||||
pip3 install dnspython==1.16.0 selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 feedparser==5.2.1 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
|
||||
pip3 install -r requirements.txt && \
|
||||
apk del python3-dev build-base
|
||||
|
||||
ADD cli.py /
|
||||
|
|
16
README.md
16
README.md
|
@ -48,7 +48,7 @@ Nach dem ersten erfolgreichen Durchlauf dauert ein neuer Aufruf von `make` nur n
|
|||
|
||||
In aller Kürze: `make test`
|
||||
|
||||
### Spider ausführen
|
||||
### Spider testweise ausführen (Debugging)
|
||||
|
||||
Der Spider kann einzelne URLs verarbeiten, ohne die Ergebnisse in eine Datenbank zu schreiben.
|
||||
Am einfachsten geht das über den `make spider` Befehl, so:
|
||||
|
@ -58,3 +58,17 @@ make spider ARGS="--url http://www.example.com/"
|
|||
```
|
||||
|
||||
Ohne `ARGS` aufgerufen, arbeitet der Spider eine Jobliste ab. Dies erfordert Zugriff auf die entsprechende Datenbank.
|
||||
|
||||
Wenn nur eine einzelne Site gespidert werden soll, die Ergebnisse aber in die Datenbank geschrieben werden sollen, kann der Spider so mit `--job` und einem JSON-Object aufgerufen werden (Beispiel):
|
||||
|
||||
```
|
||||
docker run --rm -ti \
|
||||
-v $(pwd)/volumes/dev-shm:/dev/shm \
|
||||
-v $(pwd)/secrets:/secrets \
|
||||
-v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
|
||||
--shm-size=2g \
|
||||
quay.io/netzbegruenung/green-spider:latest \
|
||||
--credentials-path /secrets/datastore-writer.json \
|
||||
--loglevel debug \
|
||||
spider --job '{"url": "https://xn--grne-porta-westfalica-9hc.de/", "meta": {"city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state": "Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}}'
|
||||
```
|
||||
|
|
|
@ -67,7 +67,7 @@ class Checker(AbstractChecker):
|
|||
data = feedparser.parse(feed_url)
|
||||
|
||||
if 'bozo_exception' in data:
|
||||
result['exception'] = data['bozo_exception']
|
||||
result['exception'] = str(data['bozo_exception'])
|
||||
|
||||
if 'headers' not in data:
|
||||
return result
|
||||
|
|
7
cli.py
7
cli.py
|
@ -6,6 +6,7 @@ import argparse
|
|||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import json
|
||||
|
||||
from google.cloud import datastore
|
||||
|
||||
|
@ -37,7 +38,8 @@ if __name__ == "__main__":
|
|||
spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
|
||||
spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
|
||||
spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
|
||||
|
||||
spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')
|
||||
|
||||
# jobs subcommand
|
||||
jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
|
||||
jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
|
||||
|
@ -83,5 +85,8 @@ if __name__ == "__main__":
|
|||
if args.url:
|
||||
# spider one URL for diagnostic purposes
|
||||
spider.test_url(args.url)
|
||||
elif args.job:
|
||||
job = json.loads(args.job)
|
||||
spider.execute_single_job(datastore_client, job, args.kind)
|
||||
else:
|
||||
spider.work_of_queue(datastore_client, args.kind)
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
beautifulsoup4==4.6.0
|
||||
dnspython==1.16.0
|
||||
feedparser==5.2.1
|
||||
GitPython
|
||||
google-cloud-datastore==1.10.0
|
||||
html-similarity==0.3.2
|
||||
httpretty==0.9.4
|
||||
pyopenssl==18.0.0
|
||||
PyYAML
|
||||
requests==2.18.4
|
||||
responses==0.9.0
|
||||
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
|
||||
selenium==3.8.0
|
||||
smmap2==2.0.3
|
||||
tenacity==5.0.2
|
||||
urllib3==1.22
|
|
@ -78,7 +78,38 @@ def test_url(url):
|
|||
}
|
||||
|
||||
result = check_and_rate_site(entry=job)
|
||||
pprint(result['rating'])
|
||||
pprint(result)
|
||||
|
||||
def execute_single_job(datastore_client, job, entity_kind):
    """
    Execute the spider for one single job and write the result to the datastore.

    Args:
        datastore_client: google.cloud.datastore client used to write the result.
        job: Job dict; must contain a 'url' key (enforced via validate_job).
        entity_kind: Datastore entity kind the result entity is written as.

    Raises:
        Whatever validate_job raises for a malformed job. Datastore write
        errors are logged, not raised, so a single bad result does not
        abort the caller.
    """
    validate_job(job)

    logging.info("Starting job %s", job["url"])
    result = check_and_rate_site(entry=job)

    # default=str makes otherwise non-serializable values (datetimes,
    # exception objects) representable in the debug output.
    logging.debug("Full JSON representation of returned result: %s",
                  json.dumps(result, default=str))

    logging.info("Job %s finished checks", job["url"])
    logging.info("Job %s writing to DB", job["url"])

    # The job URL doubles as the entity key, so re-spidering a site
    # overwrites its previous result.
    key = datastore_client.key(entity_kind, job["url"])
    entity = datastore.Entity(key=key)
    entity.update({
        'created': datetime.utcnow(),
        'meta': result['meta'],
        'checks': result['checks'],
        'rating': result['rating'],
        'score': result['score'],
    })
    try:
        datastore_client.put(entity)
    except Exception as ex:
        # The original code had separate handlers for InvalidArgument and
        # Exception with identical bodies; one broad handler is equivalent
        # and keeps a failed write from killing the run.
        logging.error("Could not write result: %s", ex)
|
||||
|
||||
def work_of_queue(datastore_client, entity_kind):
|
||||
"""
|
||||
|
@ -90,28 +121,8 @@ def work_of_queue(datastore_client, entity_kind):
|
|||
logging.info("No more jobs. Exiting.")
|
||||
break
|
||||
|
||||
logging.info("Starting job %s", job["url"])
|
||||
result = check_and_rate_site(entry=job)
|
||||
|
||||
logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
|
||||
|
||||
logging.info("Job %s finished checks", job["url"])
|
||||
logging.info("Job %s writing to DB", job["url"])
|
||||
|
||||
key = datastore_client.key(entity_kind, job["url"])
|
||||
entity = datastore.Entity(key=key)
|
||||
record = {
|
||||
'created': datetime.utcnow(),
|
||||
'meta': result['meta'],
|
||||
'checks': result['checks'],
|
||||
'rating': result['rating'],
|
||||
'score': result['score'],
|
||||
}
|
||||
entity.update(record)
|
||||
try:
|
||||
datastore_client.put(entity)
|
||||
except InvalidArgument as ex:
|
||||
logging.error("Could not write result: %s", ex)
|
||||
except Exception as ex:
|
||||
logging.error("Could not write result: %s", ex)
|
||||
execute_single_job(datastore_client, job, entity_kind)
|
||||
|
||||
def validate_job(jobdict):
    """
    Validate that a spider job dict carries the required fields.

    Args:
        jobdict: Job description as a dict.

    Raises:
        ValueError: if the required 'url' attribute is missing.
            (ValueError is a subclass of Exception, so existing callers
            catching Exception keep working.)
    """
    if "url" not in jobdict:
        raise ValueError("Job does not have required 'url' attribute")
|
||||
|
|
Loading…
Reference in New Issue