Make and use a versioned docker image (#279)

* Revert redis module to 4.1.0

* Revert dnspython to 2.1.0

* Revert click to 8.0.3

* Specify alpine 3.16.2, reorganize into multiple steps

* Replace 'latest' with 'main' everywhere

* Fix deprecation warnings

* Add Google root certificates

* Re-order APK packages, write list after installing

* Create VERSION file during docker image build

* Pin chromium version
Marian Steinbach 2022-10-24 21:35:15 +02:00 committed by GitHub
parent 024ef118dd
commit 5e723c94db
13 changed files with 61 additions and 77 deletions
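For orientation: the "versioned" part of the image is a VERSION file written at build time, whose content is derived from git, as introduced in the Makefile change below. Roughly (output values are illustrative):

    # exact tag name if HEAD is tagged, otherwise the full commit hash
    git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD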

.gitignore

@@ -8,4 +8,5 @@ kubernetes/green-spider-secret.yaml
 /volumes
 /screenshots
 /k8s-jobs
+/VERSION
 .env

Dockerfile

@@ -1,16 +1,30 @@
-FROM alpine:3.16
+FROM alpine:3.16.2
+
+ENV CHROMIUM_VERSION=106.0.5249.119-r1
+
+RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
+    echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
+    apk --update --no-cache add ca-certificates \
+    chromium=$CHROMIUM_VERSION \
+    chromium-chromedriver=$CHROMIUM_VERSION \
+    py3-cryptography python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
+    build-base git icu-libs libssl1.1 libssl3 libxml2 libxml2-dev libxslt libxslt-dev \
+    libffi-dev openssl-dev cargo
+
+RUN apk info -v | sort

 WORKDIR /workdir

-ADD requirements.txt /workdir/
-
-RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main/" >> /etc/apk/repositories && \
-    echo "http://dl-4.alpinelinux.org/alpine/edge/community/" >> /etc/apk/repositories && \
-    apk --update --no-cache add ca-certificates chromium chromium-chromedriver py3-cryptography \
-    python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
-    build-base git icu-libs libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
-    pip install -r requirements.txt && \
-    apk del build-base
+# Execute time consuming compilations in a separate step
+RUN python3 -m pip install libcst==0.4.7 sgmllib3k==1.0.0
+
+ADD https://pki.google.com/roots.pem /google_roots.pem
+ENV GRPC_DEFAULT_SSL_ROOTS_FILE_PATH=/google_roots.pem
+
+ADD requirements.txt /workdir/
+RUN pip install -r requirements.txt
+
+RUN python3 -m pip freeze

 ADD cli.py /workdir/
 ADD manager /workdir/manager
@@ -20,3 +34,4 @@ ADD rating /workdir/rating
 ADD spider /workdir/spider
 ADD export /workdir/export
 ADD job.py /workdir/
+ADD VERSION /workdir/VERSION
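With Chromium and Chromedriver pinned via apk, the versions shipped in the image can be checked directly. A quick sanity check might look like this (the chromium-browser/chromedriver binary names and the :main tag are assumptions based on the Alpine packages and image tag used here):

    docker run --rm quay.io/netzbegruenung/green-spider:main chromium-browser --version
    docker run --rm quay.io/netzbegruenung/green-spider:main chromedriver --version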

Makefile

@@ -1,11 +1,13 @@
-IMAGE := quay.io/netzbegruenung/green-spider:latest
+IMAGE := quay.io/netzbegruenung/green-spider:main
 DB_ENTITY := spider-results
+VERSION = $(shell git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD)

 .PHONY: dockerimage spider export

 # Build docker image
-dockerimage:
+dockerimage: VERSION
 	docker build --progress plain -t $(IMAGE) .

 # Fill the queue with spider jobs, one for each site.
@@ -50,3 +52,5 @@ test:
 		$(IMAGE) \
 		-m unittest discover -p '*_test.py' -v
+
+VERSION:
+	@echo $(VERSION) > VERSION
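Because dockerimage now depends on the VERSION target, a build on a fresh checkout first writes the version file and then builds the image. A rough sketch of a run (the tag value is illustrative; an untagged commit yields the full commit hash instead):

    $ make dockerimage
    docker build --progress plain -t quay.io/netzbegruenung/green-spider:main .
    $ cat VERSION
    v1.2.3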

@@ -68,8 +68,8 @@ docker run --rm -ti \
   -v $(pwd)/screenshots:/screenshots \
   -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
   --shm-size=2g \
-  quay.io/netzbegruenung/green-spider:latest python3 cli.py \
+  quay.io/netzbegruenung/green-spider:main python3 cli.py \
   --credentials-path /secrets/datastore-writer.json \
   --loglevel debug \
-  spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
+  spider --job '{"url": "https://gruene-porta-westfalica.de/home/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
 ```

@@ -49,7 +49,7 @@ class Checker(AbstractChecker):

         # IPv4
         try:
-            answers = dns.resolver.query(hostname, "A")
+            answers = dns.resolver.resolve(hostname, "A")
             result['resolvable_ipv4'] = True
             for rdata in answers:
                 result['ipv4_addresses'].append(rdata.address)
@@ -58,7 +58,7 @@ class Checker(AbstractChecker):

         # IPv6
         try:
-            answers = dns.resolver.query(hostname, "AAAA")
+            answers = dns.resolver.resolve(hostname, "AAAA")
             result['resolvable_ipv6'] = True
             for rdata in answers:
                 result['ipv6_addresses'].append(rdata.address)
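For context: dns.resolver.query() is deprecated in dnspython 2.x and resolve() is its replacement, which is why the calls above were renamed. A minimal standalone sketch of the same pattern (function and variable names here are illustrative, not taken from the spider code):

    import dns.resolver  # dnspython >= 2.0

    def resolve_addresses(hostname):
        # resolve() replaces the deprecated query(); iterating the answers is unchanged.
        addresses = {"A": [], "AAAA": []}
        for rdtype in ("A", "AAAA"):
            try:
                answers = dns.resolver.resolve(hostname, rdtype)
                addresses[rdtype] = [rdata.address for rdata in answers]
            except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer, dns.resolver.NoNameservers):
                pass
        return addresses

    print(resolve_addresses("gruene.de"))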

@@ -36,7 +36,7 @@ class Checker(AbstractChecker):
             page_content = self.previous_results['page_content'][url]

             if page_content['content'] is None:
-                logging.warn("Content for URL %s is None" % url)
+                logging.warning("Content for URL %s is None" % url)

             content[url] = page_content['content']
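The logging.warn to logging.warning renames here and in the checks below address one of the deprecation warnings mentioned in the commit message: warn() is a deprecated alias that emits a DeprecationWarning on current Python versions. A two-line illustration:

    import logging

    logging.warn("deprecated alias, emits a DeprecationWarning")
    logging.warning("supported call, same behaviour otherwise")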

@@ -119,20 +119,20 @@ class Checker(AbstractChecker):
                     'screenshots': check_responsiveness_results['screenshots'],
                 }
             except TimeoutException as e:
-                logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
+                logging.warning("TimeoutException when checking responsiveness for %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
+                logging.warning("RetryError when checking responsiveness for %s: %s" % (url, re))
                 pass

             # Scroll page to bottom, to load all lazy-loading resources.
             try:
                 self.scroll_to_bottom()
             except TimeoutException as e:
-                logging.warn("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
+                logging.warning("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError in scroll_to_bottom for %s: %s" % (url, re))
+                logging.warning("RetryError in scroll_to_bottom for %s: %s" % (url, re))
                 pass

             # CSS collection
@@ -148,23 +148,23 @@ class Checker(AbstractChecker):
                         continue
                     font_families.add(font_family.lower())
                 except StaleElementReferenceException as e:
-                    logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
+                    logging.warning("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
                     continue

                 results[url]['font_families'] = sorted(list(font_families))
             except TimeoutException as e:
-                logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
+                logging.warning("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
                 pass

             # Process cookies.
             try:
                 results[url]['cookies'] = self.get_cookies()
             except TimeoutException as e:
-                logging.warn("TimeoutException when collecting cookies %s: %s" % (url, e))
+                logging.warning("TimeoutException when collecting cookies %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError when collecting cookies for %s: %s" % (url, re))
+                logging.warning("RetryError when collecting cookies for %s: %s" % (url, re))
                 pass

             for logentry in self.driver.get_log('performance'):
@@ -209,7 +209,7 @@ class Checker(AbstractChecker):
                 blob.upload_from_file(my_file, content_type="image/png")
                 blob.make_public()
             except Exception as e:
-                logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
+                logging.warning("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
                 continue

             try:
@@ -232,7 +232,7 @@
                 datastore_client.put(entity)
                 logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url'])
             except Exception as e:
-                logging.warn("Error in %s: %s" % (screenshot['url'], e))
+                logging.warning("Error in %s: %s" % (screenshot['url'], e))

         # Remove screenshots part from results
@@ -289,7 +289,7 @@ class Checker(AbstractChecker):
             success = self.driver.save_screenshot(abs_filepath)

             if not success:
-                logging.warn("Failed to create screenshot %s" % abs_filepath)
+                logging.warning("Failed to create screenshot %s" % abs_filepath)
                 continue

             result['screenshots'].append({

@@ -15,7 +15,7 @@ services:

   # manager manages the job queue.
   manager:
-    image: quay.io/netzbegruenung/green-spider:latest
+    image: quay.io/netzbegruenung/green-spider:main
     command: >
       python3 cli.py
       --credentials-path /secrets/datastore-writer.json

job.py

@@ -16,7 +16,7 @@ from google.cloud import datastore
 # via the environment JOB_TIMEOUT variable.
 TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))

-DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
+DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:main'

 CREDENTIALS_PATH = '/secrets/datastore-writer.json'
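Since the timeout is read from the environment, it can be overridden per run without touching the code. A hypothetical invocation (how job.py is actually launched depends on the deployment):

    JOB_TIMEOUT=120 python3 job.py    # raises the limit from the default of 50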

@@ -12,7 +12,7 @@ spec:
     spec:
       containers:
       - name: spider
-        image: quay.io/netzbegruenung/green-spider:latest
+        image: quay.io/netzbegruenung/green-spider:main
         imagePullPolicy: Always
         args:
         - "--credentials-path=/secrets/datastore-writer.json"

@@ -1,36 +0,0 @@
-apiVersion: batch/v1beta1
-kind: CronJob
-metadata:
-  name: green-spider-screenshotter
-spec:
-  # Saturday at 1:05 UTC
-  schedule: "5 1 * * 6"
-  jobTemplate:
-    spec:
-      parallelism: 1
-      template:
-        spec:
-          containers:
-          - name: screenshotter
-            image: quay.io/netzbegruenung/green-spider-screenshotter:latest
-            imagePullPolicy: Always
-            volumeMounts:
-            - name: secrets
-              mountPath: "/secrets"
-              readOnly: true
-            resources:
-              requests:
-                cpu: 800m
-                memory: 4000M
-          # No restarts, as this would mean to start over.
-          # TODO: Maintain a queue and change this.
-          restartPolicy: Never
-          volumes:
-          - name: secrets
-            secret:
-              secretName: green-spider
-              items:
-              - key: datastore-writer.json
-                path: datastore-writer.json
-              - key: screenshots-uploader.json
-                path: screenshots-uploader.json

@@ -12,7 +12,7 @@ spec:
     spec:
      containers:
       - name: spider
-        image: quay.io/netzbegruenung/green-spider:latest
+        image: quay.io/netzbegruenung/green-spider:main
         imagePullPolicy: Always
         args:
         - "--credentials-path=/secrets/datastore-writer.json"

requirements.txt

@@ -3,19 +3,19 @@ cachetools==4.2.4
 certifi==2021.10.8
 cffi==1.15.1
 chardet==3.0.4
-click==8.1.3
+click==8.0.3
 cssselect==1.1.0
-dnspython==2.2.1
+dnspython==2.1.0
 docker==4.4.1
 feedparser==6.0.8
 gitdb==4.0.9
 GitPython==3.1.24
-google-api-core==2.2.2
-google-auth==2.3.3
-google-cloud-core==2.2.1
-google-cloud-datastore==2.4.0
-google-cloud-storage==1.43.0
-googleapis-common-protos==1.53.0
+google-api-core==2.10.2
+google-auth==2.13.0
+google-cloud-core==2.3.2
+google-cloud-datastore==2.9.0
+google-cloud-storage==2.5.0
+googleapis-common-protos==1.56.4
 html-similarity==0.3.3
 httpretty==1.1.4
 idna==2.10
@@ -25,9 +25,9 @@ protobuf==4.21.8
 pyasn1==0.4.8
 pyasn1-modules==0.2.8
 pycparser==2.21
-pyOpenSSL==22.1.0
+pyOpenSSL==22.0.0
 pytz==2021.3
-redis==4.3.4
+redis==4.1.0
 requests==2.26.0
 responses==0.22.0
 rq==1.8.0
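Because the Dockerfile now runs python3 -m pip freeze during the build, the reverted pins are easy to cross-check against the finished image. A quick spot check along these lines (the :main tag comes from this commit; the grep pattern is just an example):

    docker run --rm quay.io/netzbegruenung/green-spider:main python3 -m pip freeze | grep -Ei 'click|dnspython|redis'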