Fix exception data in the result that cannot be written, and spider single sites (#132)
* WIP commit for single job execution
* Convert exception to string
* Pass more arguments
* Move python modules list into requirements.txt
* Document single site spidering
* Remove debugging
This commit is contained in:
parent 725ed5439d
commit b3bb8f34c3
Dockerfile
@@ -1,12 +1,15 @@
 FROM python:3.6-alpine3.8
 
-# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
+WORKDIR /workdir
+
+ADD requirements.txt /workdir/
+
 RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
     echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
     apk update && \
     apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
     pip3 install --upgrade pip && \
-    pip3 install dnspython==1.16.0 selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 feedparser==5.2.1 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
+    pip3 install -r requirements.txt && \
     apk del python3-dev build-base
 
 ADD cli.py /
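Copying requirements.txt into its own layer before the `RUN` step means Docker caches the slow dependency-install layer and only rebuilds it when requirements.txt changes — presumably the motivation for this reorganization, along with keeping the version pins in a single place instead of duplicating them in the Dockerfile.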
README.md
@@ -48,7 +48,7 @@ After the first successful run, a new invocation of `make` only takes a…
 
 In short: `make test`
 
-### Running the spider
+### Running the spider for testing (debugging)
 
 The spider can process individual URLs without writing the results to a database.
 The easiest way to do this is via the `make spider` command, like so:
 
@@ -58,3 +58,17 @@ make spider ARGS="--url http://www.example.com/"
 ```
 
 Called without `ARGS`, the spider works through a job list. This requires access to the corresponding database.
+
+To spider just a single site while still writing the results to the database, the spider can be invoked with `--job` and a JSON object, for example:
+
+```
+docker run --rm -ti \
+  -v $(pwd)/volumes/dev-shm:/dev/shm \
+  -v $(pwd)/secrets:/secrets \
+  -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
+  --shm-size=2g \
+  quay.io/netzbegruenung/green-spider:latest \
+  --credentials-path /secrets/datastore-writer.json \
+  --loglevel debug \
+  spider --job '{"url": "https://xn--grne-porta-westfalica-9hc.de/", "meta": {"city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state": "Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}}'
+```
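Since the job object is passed as a single quoted JSON string, a quick way to check the payload before running the container is to round-trip it through Python. A minimal sketch (not part of the commit); the URL and meta values are taken from the example above:

```python
import json

job = {
    "url": "https://xn--grne-porta-westfalica-9hc.de/",
    "meta": {
        "city": "Porta Westfalica",
        "country": "DE",
        "district": "Minden-Lübbecke",
        "level": "DE:ORTSVERBAND",
        "state": "Nordrhein-Westfalen",
        "type": "REGIONAL_CHAPTER",
    },
}

# The exact string to put after --job (single-quoted in the shell):
print(json.dumps(job, ensure_ascii=False))
```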
@@ -67,7 +67,7 @@ class Checker(AbstractChecker):
         data = feedparser.parse(feed_url)
 
         if 'bozo_exception' in data:
-            result['exception'] = data['bozo_exception']
+            result['exception'] = str(data['bozo_exception'])
 
         if 'headers' not in data:
             return result
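This one-line change is the actual bug fix from the commit title: `bozo_exception` is an exception instance, and the Datastore client refuses values it cannot encode, so results containing one could not be written. Converting to `str` makes the value serializable. A minimal sketch of the failure mode (not part of the commit), using `json.dumps` as a stand-in for the Datastore write:

```python
import json

import feedparser

# Parsing something that is not a feed sets bozo_exception on the result.
data = feedparser.parse("this is not a feed")

result = {}
if 'bozo_exception' in data:
    # json.dumps(data['bozo_exception']) would raise TypeError, much like
    # the Datastore client rejects raw exception objects; str() makes the
    # value writable while keeping the error message for diagnostics.
    result['exception'] = str(data['bozo_exception'])

print(json.dumps(result))
```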
cli.py
@@ -6,6 +6,7 @@ import argparse
 import logging
 import signal
 import sys
+import json
 
 from google.cloud import datastore
 
@@ -37,7 +38,8 @@ if __name__ == "__main__":
     spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
     spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
     spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
+    spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')
 
     # jobs subcommand
     jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
     jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
 
@@ -83,5 +85,8 @@ if __name__ == "__main__":
     if args.url:
         # spider one URL for diagnostic purposes
         spider.test_url(args.url)
+    elif args.job:
+        job = json.loads(args.job)
+        spider.execute_single_job(datastore_client, job, args.kind)
     else:
         spider.work_of_queue(datastore_client, args.kind)
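For reference, a self-contained sketch of how the new `--job` flag flows through argparse and `json.loads` before `spider.execute_single_job` is called. Names mirror cli.py above; the job string is a hypothetical example:

```python
import argparse
import json

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest='command')

spider_parser = subparsers.add_parser('spider')
spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')

# Simulates: cli.py spider --job '{"url": "https://example.com/"}'
args = parser.parse_args(['spider', '--job', '{"url": "https://example.com/"}'])

job = json.loads(args.job)  # a dict with at least a "url" key
print(job["url"])           # -> https://example.com/
```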
requirements.txt
@@ -0,0 +1,16 @@
+beautifulsoup4==4.6.0
+dnspython==1.16.0
+feedparser==5.2.1
+GitPython
+google-cloud-datastore==1.10.0
+html-similarity==0.3.2
+httpretty==0.9.4
+pyopenssl==18.0.0
+PyYAML
+requests==2.18.4
+responses==0.9.0
+# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
+selenium==3.8.0
+smmap2==2.0.3
+tenacity==5.0.2
+urllib3==1.22
spider.py
@@ -78,7 +78,38 @@ def test_url(url):
     }
 
     result = check_and_rate_site(entry=job)
-    pprint(result['rating'])
+    pprint(result)
+
+
+def execute_single_job(datastore_client, job, entity_kind):
+    """
+    Executes spider for one single job
+    """
+    validate_job(job)
+
+    logging.info("Starting job %s", job["url"])
+    result = check_and_rate_site(entry=job)
+
+    logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
+
+    logging.info("Job %s finished checks", job["url"])
+    logging.info("Job %s writing to DB", job["url"])
+
+    key = datastore_client.key(entity_kind, job["url"])
+    entity = datastore.Entity(key=key)
+    record = {
+        'created': datetime.utcnow(),
+        'meta': result['meta'],
+        'checks': result['checks'],
+        'rating': result['rating'],
+        'score': result['score'],
+    }
+    entity.update(record)
+    try:
+        datastore_client.put(entity)
+    except InvalidArgument as ex:
+        logging.error("Could not write result: %s", ex)
+    except Exception as ex:
+        logging.error("Could not write result: %s", ex)
+
 
 def work_of_queue(datastore_client, entity_kind):
     """
@@ -90,28 +121,8 @@ def work_of_queue(datastore_client, entity_kind):
             logging.info("No more jobs. Exiting.")
             break
 
-        logging.info("Starting job %s", job["url"])
-        result = check_and_rate_site(entry=job)
-
-        logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
-
-        logging.info("Job %s finished checks", job["url"])
-        logging.info("Job %s writing to DB", job["url"])
-
-        key = datastore_client.key(entity_kind, job["url"])
-        entity = datastore.Entity(key=key)
-        record = {
-            'created': datetime.utcnow(),
-            'meta': result['meta'],
-            'checks': result['checks'],
-            'rating': result['rating'],
-            'score': result['score'],
-        }
-        entity.update(record)
-        try:
-            datastore_client.put(entity)
-        except InvalidArgument as ex:
-            logging.error("Could not write result: %s", ex)
-        except Exception as ex:
-            logging.error("Could not write result: %s", ex)
+        execute_single_job(datastore_client, job, entity_kind)
+
+
+def validate_job(jobdict):
+    if "url" not in jobdict:
+        raise Exception("Job does not have required 'url' attribute")
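Two details the hunks above leave implicit: `InvalidArgument` presumably comes from `google.api_core.exceptions` (an assumption — the import lies outside the diff), and `validate_job` is the only gate before a job is executed, checking nothing beyond the presence of `url`. A self-contained sketch of that guard; the function body is copied from the diff, the example dicts are hypothetical:

```python
def validate_job(jobdict):
    if "url" not in jobdict:
        raise Exception("Job does not have required 'url' attribute")

validate_job({"url": "https://example.com/"})  # passes silently

try:
    validate_job({"meta": {"city": "Porta Westfalica"}})  # no "url" key
except Exception as ex:
    print(ex)  # -> Job does not have required 'url' attribute
```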