Make and use a versioned Docker image (#279)

* Revert redis module to 4.1.0
* Revert dnspython to 2.1.0
* Revert click to 8.0.3
* Specify Alpine 3.16.2, reorganize the Dockerfile into multiple steps
* Replace 'latest' with 'main' everywhere
* Fix deprecation warnings
* Add Google root certificates
* Re-order APK packages, log the installed package list after installing
* Create a VERSION file during the Docker image build
* Pin the Chromium version
Marian Steinbach, 2022-10-24 21:35:15 +02:00 (committed via GitHub)
commit 5e723c94db, parent 024ef118dd

13 changed files with 61 additions and 77 deletions

.gitignore

@@ -8,4 +8,5 @@ kubernetes/green-spider-secret.yaml
 /volumes
 /screenshots
 /k8s-jobs
+/VERSION
 .env

Dockerfile

@@ -1,16 +1,30 @@
-FROM alpine:3.16
+FROM alpine:3.16.2
+
+ENV CHROMIUM_VERSION=106.0.5249.119-r1
+
+RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
+    echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
+    apk --update --no-cache add ca-certificates \
+        chromium=$CHROMIUM_VERSION \
+        chromium-chromedriver=$CHROMIUM_VERSION \
+        py3-cryptography python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
+        build-base git icu-libs libssl1.1 libssl3 libxml2 libxml2-dev libxslt libxslt-dev \
+        libffi-dev openssl-dev cargo
+
+RUN apk info -v | sort
+
+WORKDIR /workdir
+
+ADD requirements.txt /workdir/
+
+# Execute time consuming compilations in a separate step
+RUN python3 -m pip install libcst==0.4.7 sgmllib3k==1.0.0
-RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main/" >> /etc/apk/repositories && \
-    echo "http://dl-4.alpinelinux.org/alpine/edge/community/" >> /etc/apk/repositories && \
-    apk --update --no-cache add ca-certificates chromium chromium-chromedriver py3-cryptography \
-    python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
-    build-base git icu-libs libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
-    pip install -r requirements.txt && \
-    apk del build-base
 
 ADD https://pki.google.com/roots.pem /google_roots.pem
 ENV GRPC_DEFAULT_SSL_ROOTS_FILE_PATH=/google_roots.pem
 
-ADD requirements.txt /workdir/
+RUN pip install -r requirements.txt
+
+RUN python3 -m pip freeze
 
 ADD cli.py /workdir/
 ADD manager /workdir/manager
@@ -20,3 +34,4 @@ ADD rating /workdir/rating
 ADD spider /workdir/spider
 ADD export /workdir/export
 ADD job.py /workdir/
+ADD VERSION /workdir/VERSION
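
Note: the image now ships a VERSION file, generated by the Makefile below and copied in the final ADD step. A minimal sketch of how code running inside the container could report that version — the helper is illustrative, not part of this commit:

```python
# Illustrative only: read the VERSION file baked into the image.
from pathlib import Path

def image_version(path: str = "/workdir/VERSION") -> str:
    """Return the baked-in image version, or 'unknown' outside the image."""
    try:
        return Path(path).read_text(encoding="utf-8").strip()
    except FileNotFoundError:
        return "unknown"
```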

Makefile

@@ -1,11 +1,13 @@
-IMAGE := quay.io/netzbegruenung/green-spider:latest
+IMAGE := quay.io/netzbegruenung/green-spider:main
 
 DB_ENTITY := spider-results
 
+VERSION = $(shell git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD)
+
 .PHONY: dockerimage spider export
 
 # Build docker image
-dockerimage:
+dockerimage: VERSION
 	docker build --progress plain -t $(IMAGE) .
 
 # Fill the queue with spider jobs, one for each site.
@@ -50,3 +52,5 @@ test:
 	$(IMAGE) \
 	-m unittest discover -p '*_test.py' -v
+
+VERSION:
+	@echo $(VERSION) > VERSION
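
The VERSION variable resolves to the exact tag when HEAD is tagged and falls back to the full commit hash otherwise; the VERSION target writes it to a file that the Dockerfile copies into the image. A rough Python equivalent of that shell logic, assuming git is on the PATH:

```python
# Sketch of the Makefile's VERSION logic: exact tag if HEAD is tagged, else commit hash.
import subprocess

def describe_version() -> str:
    try:
        # Succeeds only when HEAD points exactly at a tag.
        out = subprocess.run(["git", "describe", "--exact-match", "--tags"],
                             capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        # Fall back to the full commit hash, like `git rev-parse HEAD`.
        out = subprocess.run(["git", "rev-parse", "HEAD"],
                             capture_output=True, text=True, check=True)
    return out.stdout.strip()
```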

README.md

@@ -68,8 +68,8 @@ docker run --rm -ti \
   -v $(pwd)/screenshots:/screenshots \
   -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
   --shm-size=2g \
-  quay.io/netzbegruenung/green-spider:latest python3 cli.py \
+  quay.io/netzbegruenung/green-spider:main python3 cli.py \
   --credentials-path /secrets/datastore-writer.json \
   --loglevel debug \
-  spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
+  spider --job '{"url": "https://gruene-porta-westfalica.de/home/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
 ```

(DNS resolution checker)

@@ -49,7 +49,7 @@ class Checker(AbstractChecker):
 
         # IPv4
         try:
-            answers = dns.resolver.query(hostname, "A")
+            answers = dns.resolver.resolve(hostname, "A")
             result['resolvable_ipv4'] = True
             for rdata in answers:
                 result['ipv4_addresses'].append(rdata.address)
@@ -58,7 +58,7 @@ class Checker(AbstractChecker):
 
         # IPv6
         try:
-            answers = dns.resolver.query(hostname, "AAAA")
+            answers = dns.resolver.resolve(hostname, "AAAA")
             result['resolvable_ipv6'] = True
             for rdata in answers:
                 result['ipv6_addresses'].append(rdata.address)
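
dnspython 2.x deprecates dns.resolver.query() in favour of dns.resolver.resolve(), which is what this rename addresses. A self-contained sketch of the same lookup pattern, with error handling trimmed to the two most common cases:

```python
import dns.resolver

def lookup(hostname: str) -> dict:
    """Collect A and AAAA records, tolerating names missing one record type."""
    result = {"ipv4_addresses": [], "ipv6_addresses": []}
    for rdtype, key in (("A", "ipv4_addresses"), ("AAAA", "ipv6_addresses")):
        try:
            for rdata in dns.resolver.resolve(hostname, rdtype):
                result[key].append(rdata.address)
        except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN):
            pass
    return result
```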

(page content checker)

@@ -36,7 +36,7 @@ class Checker(AbstractChecker):
 
             page_content = self.previous_results['page_content'][url]
             if page_content['content'] is None:
-                logging.warn("Content for URL %s is None" % url)
+                logging.warning("Content for URL %s is None" % url)
 
             content[url] = page_content['content']
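
logging.warn() has been a deprecated alias of logging.warning() since Python 3.3 and emits a DeprecationWarning; this diff and the following one replace it throughout the checkers:

```python
import logging

logging.basicConfig(level=logging.WARNING)
logging.warn("deprecated alias, emits a DeprecationWarning")  # old spelling
logging.warning("preferred call, produces the same record")   # replacement
```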

(browser / screenshot checker)

@@ -119,20 +119,20 @@ class Checker(AbstractChecker):
                     'screenshots': check_responsiveness_results['screenshots'],
                 }
             except TimeoutException as e:
-                logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
+                logging.warning("TimeoutException when checking responsiveness for %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
+                logging.warning("RetryError when checking responsiveness for %s: %s" % (url, re))
                 pass
 
             # Scroll page to bottom, to load all lazy-loading resources.
             try:
                 self.scroll_to_bottom()
             except TimeoutException as e:
-                logging.warn("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
+                logging.warning("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError in scroll_to_bottom for %s: %s" % (url, re))
+                logging.warning("RetryError in scroll_to_bottom for %s: %s" % (url, re))
                 pass
 
             # CSS collection
@@ -148,23 +148,23 @@ class Checker(AbstractChecker):
                         continue
                     font_families.add(font_family.lower())
                 except StaleElementReferenceException as e:
-                    logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
+                    logging.warning("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
                     continue
 
                 results[url]['font_families'] = sorted(list(font_families))
             except TimeoutException as e:
-                logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
+                logging.warning("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
                 pass
 
             # Process cookies.
             try:
                 results[url]['cookies'] = self.get_cookies()
             except TimeoutException as e:
-                logging.warn("TimeoutException when collecting cookies %s: %s" % (url, e))
+                logging.warning("TimeoutException when collecting cookies %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError when collecting cookies for %s: %s" % (url, re))
+                logging.warning("RetryError when collecting cookies for %s: %s" % (url, re))
                 pass
 
             for logentry in self.driver.get_log('performance'):
@@ -209,7 +209,7 @@ class Checker(AbstractChecker):
                 blob.upload_from_file(my_file, content_type="image/png")
                 blob.make_public()
             except Exception as e:
-                logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
+                logging.warning("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
                 continue
 
             try:
@@ -232,7 +232,7 @@ class Checker(AbstractChecker):
                 datastore_client.put(entity)
                 logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url'])
             except Exception as e:
-                logging.warn("Error in %s: %s" % (screenshot['url'], e))
+                logging.warning("Error in %s: %s" % (screenshot['url'], e))
 
         # Remove screenshots part from results
@@ -289,7 +289,7 @@ class Checker(AbstractChecker):
             success = self.driver.save_screenshot(abs_filepath)
 
             if not success:
-                logging.warn("Failed to create screenshot %s" % abs_filepath)
+                logging.warning("Failed to create screenshot %s" % abs_filepath)
                 continue
 
             result['screenshots'].append({

(Docker Compose manifest)

@@ -15,7 +15,7 @@ services:
 
   # manager manages the job queue.
   manager:
-    image: quay.io/netzbegruenung/green-spider:latest
+    image: quay.io/netzbegruenung/green-spider:main
     command: >
       python3 cli.py
       --credentials-path /secrets/datastore-writer.json

job.py

@@ -16,7 +16,7 @@ from google.cloud import datastore
 # via the environment JOB_TIMEOUT variable.
 TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
 
-DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
+DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:main'
 
 CREDENTIALS_PATH = '/secrets/datastore-writer.json'

(Kubernetes job manifest)

@@ -12,7 +12,7 @@ spec:
     spec:
       containers:
       - name: spider
-        image: quay.io/netzbegruenung/green-spider:latest
+        image: quay.io/netzbegruenung/green-spider:main
         imagePullPolicy: Always
         args:
        - "--credentials-path=/secrets/datastore-writer.json"

(Kubernetes CronJob manifest for the screenshotter, deleted in this commit)

@@ -1,36 +0,0 @@
-apiVersion: batch/v1beta1
-kind: CronJob
-metadata:
-  name: green-spider-screenshotter
-spec:
-  # Saturday at 1:05 UTC
-  schedule: "5 1 * * 6"
-  jobTemplate:
-    spec:
-      parallelism: 1
-      template:
-        spec:
-          containers:
-          - name: screenshotter
-            image: quay.io/netzbegruenung/green-spider-screenshotter:latest
-            imagePullPolicy: Always
-            volumeMounts:
-            - name: secrets
-              mountPath: "/secrets"
-              readOnly: true
-            resources:
-              requests:
-                cpu: 800m
-                memory: 4000M
-          # No restarts, as this would mean to start over.
-          # TODO: Maintain a queue and change this.
-          restartPolicy: Never
-          volumes:
-          - name: secrets
-            secret:
-              secretName: green-spider
-              items:
-              - key: datastore-writer.json
-                path: datastore-writer.json
-              - key: screenshots-uploader.json
-                path: screenshots-uploader.json

(Kubernetes job manifest)

@@ -12,7 +12,7 @@ spec:
     spec:
      containers:
       - name: spider
-        image: quay.io/netzbegruenung/green-spider:latest
+        image: quay.io/netzbegruenung/green-spider:main
         imagePullPolicy: Always
         args:
         - "--credentials-path=/secrets/datastore-writer.json"

requirements.txt

@@ -3,19 +3,19 @@ cachetools==4.2.4
 certifi==2021.10.8
 cffi==1.15.1
 chardet==3.0.4
-click==8.1.3
+click==8.0.3
 cssselect==1.1.0
-dnspython==2.2.1
+dnspython==2.1.0
 docker==4.4.1
 feedparser==6.0.8
 gitdb==4.0.9
 GitPython==3.1.24
-google-api-core==2.2.2
-google-auth==2.3.3
-google-cloud-core==2.2.1
-google-cloud-datastore==2.4.0
-google-cloud-storage==1.43.0
-googleapis-common-protos==1.53.0
+google-api-core==2.10.2
+google-auth==2.13.0
+google-cloud-core==2.3.2
+google-cloud-datastore==2.9.0
+google-cloud-storage==2.5.0
+googleapis-common-protos==1.56.4
 html-similarity==0.3.3
 httpretty==1.1.4
 idna==2.10
@@ -25,9 +25,9 @@ protobuf==4.21.8
 pyasn1==0.4.8
 pyasn1-modules==0.2.8
 pycparser==2.21
-pyOpenSSL==22.1.0
+pyOpenSSL==22.0.0
 pytz==2021.3
-redis==4.3.4
+redis==4.1.0
 requests==2.26.0
 responses==0.22.0
 rq==1.8.0
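
The Dockerfile's `python3 -m pip freeze` step prints the full installed list at build time. To double-check at runtime that the reverted pins are what actually ended up in the image, a small sketch, not part of this commit:

```python
# Compare a few reverted pins against the installed distributions (names as on PyPI).
from importlib.metadata import version

for pkg, pinned in {"click": "8.0.3", "dnspython": "2.1.0", "redis": "4.1.0"}.items():
    installed = version(pkg)
    status = "OK" if installed == pinned else f"MISMATCH (pinned {pinned})"
    print(f"{pkg}=={installed}: {status}")
```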