Fix for exception data in the result that cannot be written, and for spidering individual sites (#132)

* WIP commit for single job execution

* Convert exception to string

* Pass more arguments

* Move Python modules list into requirements.txt

* Document single site spidering

* Remove debugging
Marian Steinbach committed (via GitHub)
parent 725ed5439d
commit b3bb8f34c3
  1. Dockerfile (7 changes)
  2. README.md (16 changes)
  3. checks/load_feeds.py (2 changes)
  4. cli.py (7 changes)
  5. requirements.txt (16 changes)
  6. spider/spider.py (61 changes)

@@ -1,12 +1,15 @@
FROM python:3.6-alpine3.8
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
WORKDIR /workdir
+ADD requirements.txt /workdir/
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
    echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
    apk update && \
    apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
    pip3 install --upgrade pip && \
-   pip3 install dnspython==1.16.0 selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 feedparser==5.2.1 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
+   pip3 install -r requirements.txt && \
    apk del python3-dev build-base
ADD cli.py /

@@ -48,7 +48,7 @@ After the first successful run, a new invocation of `make` only takes…
In short: `make test`
-### Running the spider
+### Running the spider for testing (debugging)
The spider can process individual URLs without writing the results to a database.
The easiest way to do this is via the `make spider` command, like so:
@@ -58,3 +58,17 @@ make spider ARGS="--url http://www.example.com/"
```
Called without `ARGS`, the spider works through a job list. This requires access to the corresponding database.
If only a single site is to be spidered, but the results should still be written to the database, the spider can be invoked with `--job` and a JSON object, for example:
```
docker run --rm -ti \
-v $(pwd)/volumes/dev-shm:/dev/shm \
-v $(pwd)/secrets:/secrets \
-v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
--shm-size=2g \
quay.io/netzbegruenung/green-spider:latest \
--credentials-path /secrets/datastore-writer.json \
--loglevel debug \
spider --job '{"url": "https://xn--grne-porta-westfalica-9hc.de/", "meta": {"city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state": "Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}}'
```
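Because quoting a JSON object on the shell command line is error-prone, it can help to generate the `--job` argument programmatically. A minimal sketch in Python, reusing the field values from the example above:

```
import json

# Job payload mirroring the example above; "url" is required,
# everything under "meta" is stored alongside the spider result.
job = {
    "url": "https://xn--grne-porta-westfalica-9hc.de/",
    "meta": {
        "city": "Porta Westfalica",
        "country": "DE",
        "district": "Minden-Lübbecke",
        "level": "DE:ORTSVERBAND",
        "state": "Nordrhein-Westfalen",
        "type": "REGIONAL_CHAPTER",
    },
}

# json.dumps emits double quotes only, so the result can be wrapped
# in single quotes for the shell.
print("--job '%s'" % json.dumps(job, ensure_ascii=False))
```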

@@ -67,7 +67,7 @@ class Checker(AbstractChecker):
        data = feedparser.parse(feed_url)
        if 'bozo_exception' in data:
-            result['exception'] = data['bozo_exception']
+            result['exception'] = str(data['bozo_exception'])
        if 'headers' not in data:
            return result
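For context: `bozo_exception` is an exception instance that feedparser attaches when a feed is malformed, and Cloud Datastore cannot serialize arbitrary Python objects, which is what made such results unwritable. A small sketch of the behavior, with a hypothetical broken-feed URL:

```
import feedparser

# Hypothetical URL serving malformed XML; any unparseable feed
# sets the "bozo" flag and attaches an exception object.
data = feedparser.parse("https://example.com/broken-feed.xml")

if data.get("bozo"):
    # str() keeps the error message while staying serializable.
    print(type(data.bozo_exception), str(data.bozo_exception))
```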

@@ -6,6 +6,7 @@ import argparse
import logging
import signal
import sys
+import json
from google.cloud import datastore
@@ -37,7 +38,8 @@ if __name__ == "__main__":
    spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
    spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
    spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
+    spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')
    # jobs subcommand
    jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
    jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
@@ -83,5 +85,8 @@ if __name__ == "__main__":
    if args.url:
        # spider one URL for diagnostic purposes
        spider.test_url(args.url)
+    elif args.job:
+        job = json.loads(args.job)
+        spider.execute_single_job(datastore_client, job, args.kind)
    else:
        spider.work_of_queue(datastore_client, args.kind)
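For illustration, the `--job` value arrives from argparse as a plain string and goes through `json.loads` before reaching the spider, so malformed JSON fails fast. A quick sketch with placeholder values:

```
import json

# Placeholder for the string argparse delivers for --job.
raw = '{"url": "https://example.com/", "meta": {"type": "REGIONAL_CHAPTER"}}'

# json.loads raises a ValueError subclass on malformed input, so shell
# quoting mistakes surface before any spidering starts.
job = json.loads(raw)
assert job["url"] == "https://example.com/"
```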

@@ -0,0 +1,16 @@
beautifulsoup4==4.6.0
dnspython==1.16.0
feedparser==5.2.1
GitPython
google-cloud-datastore==1.10.0
html-similarity==0.3.2
httpretty==0.9.4
pyopenssl==18.0.0
PyYAML
requests==2.18.4
responses==0.9.0
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
selenium==3.8.0
smmap2==2.0.3
tenacity==5.0.2
urllib3==1.22

@@ -78,7 +78,38 @@ def test_url(url):
    }
    result = check_and_rate_site(entry=job)
-    pprint(result['rating'])
+    pprint(result)

def execute_single_job(datastore_client, job, entity_kind):
    """
    Executes spider for one single job
    """
    validate_job(job)
    logging.info("Starting job %s", job["url"])
    result = check_and_rate_site(entry=job)
    logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
    logging.info("Job %s finished checks", job["url"])
    logging.info("Job %s writing to DB", job["url"])
    key = datastore_client.key(entity_kind, job["url"])
    entity = datastore.Entity(key=key)
    record = {
        'created': datetime.utcnow(),
        'meta': result['meta'],
        'checks': result['checks'],
        'rating': result['rating'],
        'score': result['score'],
    }
    entity.update(record)
    try:
        datastore_client.put(entity)
    except InvalidArgument as ex:
        logging.error("Could not write result: %s", ex)
    except Exception as ex:
        logging.error("Could not write result: %s", ex)
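One detail worth noting in `execute_single_job` is `json.dumps(result, default=str)`: check results contain values such as `datetime` objects that the JSON encoder cannot serialize natively, and `default=str` stringifies them instead of raising. A quick illustration:

```
import json
from datetime import datetime

result = {"created": datetime.utcnow(), "score": 7.5}

# Without default=str this raises TypeError ("Object of type ...
# is not JSON serializable"); with it, datetime falls back to str().
print(json.dumps(result, default=str))
```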
def work_of_queue(datastore_client, entity_kind):
    """
@@ -90,28 +121,8 @@ def work_of_queue(datastore_client, entity_kind):
            logging.info("No more jobs. Exiting.")
            break
        logging.info("Starting job %s", job["url"])
-        result = check_and_rate_site(entry=job)
-        logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
-        logging.info("Job %s finished checks", job["url"])
-        logging.info("Job %s writing to DB", job["url"])
-        key = datastore_client.key(entity_kind, job["url"])
-        entity = datastore.Entity(key=key)
-        record = {
-            'created': datetime.utcnow(),
-            'meta': result['meta'],
-            'checks': result['checks'],
-            'rating': result['rating'],
-            'score': result['score'],
-        }
-        entity.update(record)
-        try:
-            datastore_client.put(entity)
-        except InvalidArgument as ex:
-            logging.error("Could not write result: %s", ex)
-        except Exception as ex:
-            logging.error("Could not write result: %s", ex)
+        execute_single_job(datastore_client, job, entity_kind)
def validate_job(jobdict):
    if "url" not in jobdict:
        raise Exception("Job does not have required 'url' attribute")
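Only the `url` check of `validate_job` is visible in this hunk; under that assumption, a usage sketch:

```
# Minimal job dict accepted by validate_job as shown above;
# "url" is the only attribute the visible check requires.
validate_job({"url": "https://example.com/", "meta": {}})  # passes

try:
    validate_job({"meta": {}})  # missing "url"
except Exception as ex:
    print(ex)  # Job does not have required 'url' attribute
```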
