Mirror of https://github.com/netzbegruenung/green-spider.git
Synced 2024-05-11 13:06:08 +02:00

Make and use a versioned docker image (#279)

* Revert redis module to 4.1.0
* Revert dnspython to 2.1.0
* Revert click to 8.0.3
* Specify alpine 3.16.2, reorganize into multiple steps
* Replace 'latest' with 'main' everywhere
* Fix deprecation warnings
* Add Google root certificates
* Re-order APK packages, write list after installing
* Create VERSION file during docker image build
* Pin chromium version

Parent: 024ef118dd
Commit: 5e723c94db
.gitignore (vendored) | 1

@@ -8,4 +8,5 @@ kubernetes/green-spider-secret.yaml
 /volumes
 /screenshots
 /k8s-jobs
+/VERSION
 .env
Dockerfile | 33

@@ -1,16 +1,30 @@
-FROM alpine:3.16
+FROM alpine:3.16.2
 
+ENV CHROMIUM_VERSION=106.0.5249.119-r1
+
+RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
+    echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
+    apk --update --no-cache add ca-certificates \
+        chromium=$CHROMIUM_VERSION \
+        chromium-chromedriver=$CHROMIUM_VERSION \
+        py3-cryptography python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
+        build-base git icu-libs libssl1.1 libssl3 libxml2 libxml2-dev libxslt libxslt-dev \
+        libffi-dev openssl-dev cargo
+
+RUN apk info -v | sort
+
 WORKDIR /workdir
 
-ADD requirements.txt /workdir/
+# Execute time consuming compilations in a separate step
+RUN python3 -m pip install libcst==0.4.7 sgmllib3k==1.0.0
 
-RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main/" >> /etc/apk/repositories && \
-    echo "http://dl-4.alpinelinux.org/alpine/edge/community/" >> /etc/apk/repositories && \
-    apk --update --no-cache add ca-certificates chromium chromium-chromedriver py3-cryptography \
-    python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
-    build-base git icu-libs libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
-    pip install -r requirements.txt && \
-    apk del build-base
+ADD https://pki.google.com/roots.pem /google_roots.pem
+ENV GRPC_DEFAULT_SSL_ROOTS_FILE_PATH=/google_roots.pem
+
+ADD requirements.txt /workdir/
+RUN pip install -r requirements.txt
+
+RUN python3 -m pip freeze
 
 ADD cli.py /workdir/
 ADD manager /workdir/manager
@@ -20,3 +34,4 @@ ADD rating /workdir/rating
 ADD spider /workdir/spider
 ADD export /workdir/export
 ADD job.py /workdir/
+ADD VERSION /workdir/VERSION
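Because `CHROMIUM_VERSION` is set with `ENV`, it persists into the running container, so the pin can be sanity-checked at runtime. The following Python sketch is illustrative only and not part of this commit; it assumes it runs inside the built image, where `apk` is available:

```
import os
import subprocess

def installed_version(package):
    # `apk info -v` lists installed packages as "name-version" lines,
    # e.g. "chromium-106.0.5249.119-r1".
    out = subprocess.run(["apk", "info", "-v"],
                         capture_output=True, text=True, check=True)
    for line in sorted(out.stdout.splitlines()):
        if line.startswith(package + "-") and not line.startswith(package + "-chromedriver"):
            return line[len(package) + 1:]
    return None

expected = os.environ.get("CHROMIUM_VERSION")  # set via ENV in the Dockerfile
actual = installed_version("chromium")
print("chromium pin OK" if actual == expected else "pin mismatch: %s" % actual)
```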
Makefile | 8

@@ -1,11 +1,13 @@
-IMAGE := quay.io/netzbegruenung/green-spider:latest
+IMAGE := quay.io/netzbegruenung/green-spider:main
 
 DB_ENTITY := spider-results
 
+VERSION = $(shell git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD)
+
 .PHONY: dockerimage spider export
 
 # Build docker image
-dockerimage:
+dockerimage: VERSION
 	docker build --progress plain -t $(IMAGE) .
 
 # Fill the queue with spider jobs, one for each site.
@@ -50,3 +52,5 @@ test:
 		$(IMAGE) \
 		-m unittest discover -p '*_test.py' -v
 
+VERSION:
+	@echo $(VERSION) > VERSION
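The new `VERSION` variable resolves to an exact tag name when HEAD is tagged and otherwise falls back to the commit hash; `dockerimage` now depends on the `VERSION` target, so the file is written before `docker build` runs and can be copied into the image. For illustration, here is the same fallback logic as a Python sketch (the repository implements it in make, not Python):

```
import subprocess

def version():
    # Mirrors `git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD`:
    # prefer an exact tag match, fall back to the current commit hash.
    try:
        return subprocess.check_output(
            ["git", "describe", "--exact-match", "--tags"],
            stderr=subprocess.DEVNULL, text=True).strip()
    except subprocess.CalledProcessError:
        return subprocess.check_output(
            ["git", "rev-parse", "HEAD"], text=True).strip()

print(version())
```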
@@ -68,8 +68,8 @@ docker run --rm -ti \
   -v $(pwd)/screenshots:/screenshots \
   -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
   --shm-size=2g \
-  quay.io/netzbegruenung/green-spider:latest python3 cli.py \
+  quay.io/netzbegruenung/green-spider:main python3 cli.py \
   --credentials-path /secrets/datastore-writer.json \
   --loglevel debug \
-  spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
+  spider --job '{"url": "https://gruene-porta-westfalica.de/home/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
 ```
@@ -49,7 +49,7 @@ class Checker(AbstractChecker):
 
         # IPv4
         try:
-            answers = dns.resolver.query(hostname, "A")
+            answers = dns.resolver.resolve(hostname, "A")
             result['resolvable_ipv4'] = True
             for rdata in answers:
                 result['ipv4_addresses'].append(rdata.address)
@@ -58,7 +58,7 @@ class Checker(AbstractChecker):
 
         # IPv6
         try:
-            answers = dns.resolver.query(hostname, "AAAA")
+            answers = dns.resolver.resolve(hostname, "AAAA")
             result['resolvable_ipv6'] = True
             for rdata in answers:
                 result['ipv6_addresses'].append(rdata.address)
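`dns.resolver.query()` is deprecated in dnspython 2.x; `dns.resolver.resolve()` is the replacement and already exists in the pinned dnspython 2.1.0, so the rename fixes the deprecation warning without changing behavior. A minimal standalone sketch of the new call, with a placeholder hostname:

```
import dns.resolver

hostname = "example.com"  # placeholder, not taken from the spider's data
try:
    answers = dns.resolver.resolve(hostname, "A")
    print([rdata.address for rdata in answers])
except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
    print("no IPv4 address found")
```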
@@ -36,7 +36,7 @@ class Checker(AbstractChecker):
             page_content = self.previous_results['page_content'][url]
 
             if page_content['content'] is None:
-                logging.warn("Content for URL %s is None" % url)
+                logging.warning("Content for URL %s is None" % url)
 
             content[url] = page_content['content']
 
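`logging.warn()` has been a deprecated alias of `logging.warning()` since Python 3.3, which is why every call site in the checkers is renamed. A minimal sketch of the preferred spelling:

```
import logging

logging.basicConfig(level=logging.WARNING)

url = "https://example.com/"  # placeholder URL
# logging.warn(...) emits a DeprecationWarning on current Python;
# logging.warning(...) is the canonical method.
logging.warning("Content for URL %s is None", url)
```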
@@ -119,20 +119,20 @@ class Checker(AbstractChecker):
                     'screenshots': check_responsiveness_results['screenshots'],
                 }
             except TimeoutException as e:
-                logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
+                logging.warning("TimeoutException when checking responsiveness for %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
+                logging.warning("RetryError when checking responsiveness for %s: %s" % (url, re))
                 pass
 
             # Scroll page to bottom, to load all lazy-loading resources.
             try:
                 self.scroll_to_bottom()
             except TimeoutException as e:
-                logging.warn("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
+                logging.warning("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError in scroll_to_bottom for %s: %s" % (url, re))
+                logging.warning("RetryError in scroll_to_bottom for %s: %s" % (url, re))
                 pass
 
             # CSS collection
@@ -148,23 +148,23 @@ class Checker(AbstractChecker):
                             continue
                         font_families.add(font_family.lower())
                     except StaleElementReferenceException as e:
-                        logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
+                        logging.warning("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
                         continue
 
                 results[url]['font_families'] = sorted(list(font_families))
 
             except TimeoutException as e:
-                logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
+                logging.warning("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
                 pass
 
             # Process cookies.
             try:
                 results[url]['cookies'] = self.get_cookies()
             except TimeoutException as e:
-                logging.warn("TimeoutException when collecting cookies %s: %s" % (url, e))
+                logging.warning("TimeoutException when collecting cookies %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError when collecting cookies for %s: %s" % (url, re))
+                logging.warning("RetryError when collecting cookies for %s: %s" % (url, re))
                 pass
 
             for logentry in self.driver.get_log('performance'):
@@ -209,7 +209,7 @@ class Checker(AbstractChecker):
                 blob.upload_from_file(my_file, content_type="image/png")
                 blob.make_public()
             except Exception as e:
-                logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
+                logging.warning("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
                 continue
 
             try:
@@ -232,7 +232,7 @@ class Checker(AbstractChecker):
                 datastore_client.put(entity)
                 logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url'])
             except Exception as e:
-                logging.warn("Error in %s: %s" % (screenshot['url'], e))
+                logging.warning("Error in %s: %s" % (screenshot['url'], e))
 
 
         # Remove screenshots part from results
@@ -289,7 +289,7 @@ class Checker(AbstractChecker):
             success = self.driver.save_screenshot(abs_filepath)
 
             if not success:
-                logging.warn("Failed to create screenshot %s" % abs_filepath)
+                logging.warning("Failed to create screenshot %s" % abs_filepath)
                 continue
 
             result['screenshots'].append({
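The screenshot-upload code above is unchanged apart from the logging rename, and it stays compatible with the google-cloud-storage jump from 1.43.0 to 2.5.0 in requirements.txt: `Blob.upload_from_file()` and `Blob.make_public()` exist in both major versions. A condensed standalone sketch of that upload flow, with placeholder bucket and object names:

```
from google.cloud import storage

def upload_screenshot(local_path, object_name):
    # Upload a PNG and make it publicly readable, returning its public URL.
    client = storage.Client()
    bucket = client.get_bucket("example-screenshots-bucket")  # placeholder
    blob = bucket.blob(object_name)
    with open(local_path, "rb") as my_file:
        blob.upload_from_file(my_file, content_type="image/png")
    blob.make_public()
    return blob.public_url
```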
@@ -15,7 +15,7 @@ services:
 
   # manager manages the job queue.
   manager:
-    image: quay.io/netzbegruenung/green-spider:latest
+    image: quay.io/netzbegruenung/green-spider:main
     command: >
       python3 cli.py
       --credentials-path /secrets/datastore-writer.json
job.py | 2

@@ -16,7 +16,7 @@ from google.cloud import datastore
 # via the environment JOB_TIMEOUT variable.
 TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
 
-DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
+DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:main'
 
 CREDENTIALS_PATH = '/secrets/datastore-writer.json'
 
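With `ADD VERSION /workdir/VERSION` in the Dockerfile, every image now carries its own build version. This commit does not read the file anywhere yet; a hypothetical sketch of how code inside the container could pick it up, for example to log which spider build produced a result:

```
from pathlib import Path

def image_version(path="/workdir/VERSION"):
    # Return the version baked into the image, or "unknown" when the
    # file is absent (e.g. when running outside the container).
    try:
        return Path(path).read_text().strip()
    except FileNotFoundError:
        return "unknown"

print(image_version())
```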
@@ -12,7 +12,7 @@ spec:
     spec:
       containers:
       - name: spider
-        image: quay.io/netzbegruenung/green-spider:latest
+        image: quay.io/netzbegruenung/green-spider:main
         imagePullPolicy: Always
         args:
         - "--credentials-path=/secrets/datastore-writer.json"
@@ -1,36 +0,0 @@
-apiVersion: batch/v1beta1
-kind: CronJob
-metadata:
-  name: green-spider-screenshotter
-spec:
-  # Saturday at 1:05 UTC
-  schedule: "5 1 * * 6"
-  jobTemplate:
-    spec:
-      parallelism: 1
-      template:
-        spec:
-          containers:
-          - name: screenshotter
-            image: quay.io/netzbegruenung/green-spider-screenshotter:latest
-            imagePullPolicy: Always
-            volumeMounts:
-            - name: secrets
-              mountPath: "/secrets"
-              readOnly: true
-            resources:
-              requests:
-                cpu: 800m
-                memory: 4000M
-          # No restarts, as this would mean to start over.
-          # TODO: Maintain a queue and change this.
-          restartPolicy: Never
-          volumes:
-          - name: secrets
-            secret:
-              secretName: green-spider
-              items:
-              - key: datastore-writer.json
-                path: datastore-writer.json
-              - key: screenshots-uploader.json
-                path: screenshots-uploader.json
@@ -12,7 +12,7 @@ spec:
     spec:
      containers:
       - name: spider
-        image: quay.io/netzbegruenung/green-spider:latest
+        image: quay.io/netzbegruenung/green-spider:main
         imagePullPolicy: Always
         args:
         - "--credentials-path=/secrets/datastore-writer.json"
@@ -3,19 +3,19 @@ cachetools==4.2.4
 certifi==2021.10.8
 cffi==1.15.1
 chardet==3.0.4
-click==8.1.3
+click==8.0.3
 cssselect==1.1.0
-dnspython==2.2.1
+dnspython==2.1.0
 docker==4.4.1
 feedparser==6.0.8
 gitdb==4.0.9
 GitPython==3.1.24
-google-api-core==2.2.2
-google-auth==2.3.3
-google-cloud-core==2.2.1
-google-cloud-datastore==2.4.0
-google-cloud-storage==1.43.0
-googleapis-common-protos==1.53.0
+google-api-core==2.10.2
+google-auth==2.13.0
+google-cloud-core==2.3.2
+google-cloud-datastore==2.9.0
+google-cloud-storage==2.5.0
+googleapis-common-protos==1.56.4
 html-similarity==0.3.3
 httpretty==1.1.4
 idna==2.10
@@ -25,9 +25,9 @@ protobuf==4.21.8
 pyasn1==0.4.8
 pyasn1-modules==0.2.8
 pycparser==2.21
-pyOpenSSL==22.1.0
+pyOpenSSL==22.0.0
 pytz==2021.3
-redis==4.3.4
+redis==4.1.0
 requests==2.26.0
 responses==0.22.0
 rq==1.8.0
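A quick way to confirm that the reverted pins (redis 4.1.0, dnspython 2.1.0, click 8.0.3) are what actually ends up installed is to compare them against the installed package metadata. This stdlib-only helper is illustrative and not part of the repository:

```
from importlib.metadata import PackageNotFoundError, version

pins = {"redis": "4.1.0", "dnspython": "2.1.0", "click": "8.0.3"}
for name, expected in pins.items():
    try:
        actual = version(name)
    except PackageNotFoundError:
        actual = None
    status = "ok" if actual == expected else "MISMATCH (installed: %s)" % actual
    print("%s==%s: %s" % (name, expected, status))
```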