* CLI: remove 'jobs' command, add 'manager'
* Add job definition
* Move jobs to manage folder
* Rename jobs to manager
* Add rq and redis dependencies
* Add docker-compose YAML
* Downgrade to alpine 3.8
* Adjust paths in Dockerfile, remove entrypoint
* Rename 'make spiderjobs' to 'make jobs'
* Fix docker execution
* Adapt 'make jobs'
* Fix metadata scheme
* Add docker dependency
* Randomize queue (a bit)
* Use latest image, remove debug output
* Make docker-compose file downwards-compatible
* Use latest instead of dev image tag
* Update docker-compose.yaml
* Adapt job start script
* Fix redis connection in manager
* Add support for increasing timeout via environment variable
* Adapt load_in_browser to cookies table schema change
* Fix execution
* Mitigate yaml warning
* Bump some dependency versions
* Report resource usage stats for each job
* checks/load_in_browser: Return DOM size, prevent multiple page loads
* Update .dockerignore
* Code update
* Script update
* Update README.md
* WIP
* WIP commit
* Update Dockerfile to alpine:edge and chromium v90
* Update TestCertificateChecker
* Set defaults for __init__ function
* Detect sunflower theme
* Update unit test for new datetime (zero-basing)
* Set logging prefs from Chromium in a new way
* Move datastore client instantiation, as it is not needed for all commands
* Change green-directory repository URL
* Add git settings for cloning green-directory
* Pin alpine version 3.14, fix py3-cryptography
* Use plain docker build progress output
* Add volumes to 'make test' docker run command
* Fix bug
* Update example command in README
* Update dependencies
* Add creation of Kubernetes jobs
30 changed files with 986 additions and 154 deletions
Dockerfile:
@@ -1,23 +1,26 @@
-FROM python:3.7-alpine3.9
+FROM alpine:3.14

 WORKDIR /workdir

 ADD requirements.txt /workdir/

-RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
-    echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
-    apk update && \
-    apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
-    pip3 install --upgrade pip && \
-    pip3 install -r requirements.txt && \
-    apk del python3-dev build-base
+RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
+    echo "http://dl-4.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
+    apk --update --no-cache add ca-certificates chromium chromium-chromedriver \
+        python3-dev py3-grpcio py3-wheel py3-pip py3-lxml \
+        build-base git libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
+    pip install -r requirements.txt && \
+    apk del build-base

-ADD cli.py /
-ADD config /config
-ADD jobs /jobs
-ADD checks /checks
-ADD rating /rating
-ADD spider /spider
-ADD export /export
-
-ENTRYPOINT ["python3", "/cli.py"]
+# As alpine's py3-cryptography did not work as of alpine v3.14, we use this hack from
+# https://github.com/pyca/cryptography/issues/3344#issuecomment-650845512
+RUN LDFLAGS="-L/opt/openssl/lib -Wl,-rpath,/opt/openssl/lib" CFLAGS="-I/opt/openssl/include" pip3 install -U cryptography
+
+ADD cli.py /workdir/
+ADD manager /workdir/manager
+ADD config /workdir/config
+ADD checks /workdir/checks
+ADD rating /workdir/rating
+ADD spider /workdir/spider
+ADD export /workdir/export
+ADD job.py /workdir/
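Note that the rebuilt image no longer declares an ENTRYPOINT ("remove entrypoint" in the commit list above): callers now have to spell out the full command themselves, which is exactly what the docker-compose.yaml manager service (python3 cli.py ...) and job.py (python cli.py ...) below do.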
@@ -1,16 +0,0 @@
#!/bin/bash

# Log in to webapp server via SSH

API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
source $API_TOKEN_SECRET

source devops/functions.bash

get_ip

echo "Use this command for SSH access:"
echo "ssh -o StrictHostKeyChecking=no root@${IP_IP}"

ssh -o StrictHostKeyChecking=no root@${IP_IP}
docker-compose.yaml:
@@ -0,0 +1,51 @@
version: "2"
services:

  redis:
    image: redis:5-alpine
    command: redis-server --save "" --appendonly no
    volumes:
      - ${PWD}/volumes/redis-data:/data
    restart: unless-stopped
    networks:
      - internal_network
      - external_network
    ports:
      - "6379:6379"

  # manager manages the job queue.
  manager:
    image: quay.io/netzbegruenung/green-spider:latest
    command: >
      python3 cli.py
      --credentials-path /secrets/datastore-writer.json
      --loglevel debug manager
    environment:
      REDIS_URL: redis://redis:6379/0
      GIT_USERNAME: ${GIT_USERNAME}
      GIT_PASSWORD: ${GIT_PASSWORD}
    volumes:
      - ${PWD}/secrets:/secrets
    networks:
      - internal_network
      - external_network
    depends_on:
      - redis

  dashboard:
    image: eoranged/rq-dashboard:v0.6.1
    environment:
      RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
    networks:
      - internal_network
      - external_network
    ports:
      - "9181:9181"
    depends_on:
      - redis

networks:
  internal_network:
    internal: true
  external_network:
    internal: false
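With this file in place, the stack (Redis, the job queue manager, and the rq-dashboard web UI on port 9181) comes up with a plain docker-compose up -d; the version: "2" schema keeps it usable with older docker-compose releases, per the "Make docker-compose file downwards-compatible" commit. GIT_USERNAME and GIT_PASSWORD are expected in the caller's environment, and the datastore credentials under ./secrets.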
job.py:
@@ -0,0 +1,147 @@
"""
This script is executed by the RQ worker to process a single job
from the spider queue.
"""

import json
import os
from datetime import datetime
import time
import logging

import docker
from google.cloud import datastore

# Maximum per-job runtime in seconds. This can be increased for the second and
# third attempt via the JOB_TIMEOUT environment variable.
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))

DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'

CREDENTIALS_PATH = '/secrets/datastore-writer.json'

client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')

# This script runs on the host, so the credentials are resolved relative
# to the working directory rather than the container's /secrets mount.
datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)

pwd = os.path.abspath(".")
secrets_path = pwd + "/secrets"
chromedir_path = pwd + "/volumes/chrome-userdir"
screenshots_path = pwd + "/screenshots"

volumes = {}
volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'}
volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'}
volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'}

logger = logging.getLogger('rq.worker')
logger.setLevel(logging.DEBUG)


def run(job):
    """
    Runs a spider container with the given job.

    Returns the container logs. If the execution takes longer than the
    duration defined by the JOB_TIMEOUT environment variable (in seconds),
    the container gets killed.
    """
    cmd_template = ("python cli.py --credentials-path={path} "
                    " --loglevel=debug "
                    " spider "
                    " --job='{job_json}'")

    cmd = cmd_template.format(path=CREDENTIALS_PATH,
                              job_json=json.dumps(job))

    container = client.containers.run(image=DOCKER_IMAGE,
                                      command=cmd,
                                      detach=True,
                                      remove=True,
                                      shm_size='2G',
                                      stdout=True,
                                      stderr=True,
                                      tty=False,
                                      volumes=volumes)

    container_id = container.id

    # Data about this spider run, to be written to datastore
    key = datastore_client.key('spider-runs')
    entity = datastore.Entity(key=key)
    results = {
        'datetime': datetime.utcnow(),
        'url': job['url'],
        'success': True,
        'error': '',
        'duration_seconds': 0,
        'cpu_usage_seconds': 0,
        'network_received_bytes': 0,
        'network_transmitted_bytes': 0,
        'memory_max_bytes': 0,
    }

    # Wait for the container to finish.
    start = datetime.utcnow()
    while True:
        time.sleep(1)

        clist = client.containers.list(filters={'id': container_id})
        if len(clist) == 0:
            break

        for c in clist:

            # Collect resource usage stats for the running container.
            try:
                stats = low_level_client.stats(container_id, stream=False)

                cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0

                # Initialize to 0 so the values are defined even when the
                # stats payload carries no 'networks' entry.
                network_received_bytes = 0
                network_transmitted_bytes = 0
                if 'networks' in stats:
                    network_received_bytes = stats['networks']['eth0']['rx_bytes']
                    network_transmitted_bytes = stats['networks']['eth0']['tx_bytes']

                memory_max_bytes = 0
                if 'max_usage' in stats['memory_stats']:
                    memory_max_bytes = stats['memory_stats']['max_usage']
                results['memory_max_bytes'] = memory_max_bytes

                #logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000))

                if cpu_usage > 0:
                    results['cpu_usage_seconds'] = round(cpu_usage)

                if network_received_bytes > 0:
                    results['network_received_bytes'] = network_received_bytes

                if network_transmitted_bytes > 0:
                    results['network_transmitted_bytes'] = network_transmitted_bytes

            except docker.errors.APIError as e:
                logger.error("Could not get stats: %s" % e)
            except json.decoder.JSONDecodeError:
                # This means we didn't get proper stats
                pass

            runtime = (datetime.utcnow() - start).seconds
            results['duration_seconds'] = round(runtime)

            #if c.status != "running":
            #    logger.info("Container %s status: %s" % (c.id, c.status))

            if c.status == "exited":
                logger.debug("Container %s is exited." % c.id)
                break

            if runtime > TIMEOUT:
                c.kill()
                results['success'] = False
                results['error'] = 'TIMEOUT'
                entity.update(results)
                datastore_client.put(entity)
                raise Exception("Execution took too long. Killed container after %s seconds." % TIMEOUT)

    entity.update(results)
    datastore_client.put(entity)
    return results
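For orientation, here is a minimal, hypothetical sketch of how the manager side could enqueue a job for this worker with RQ. The queue name, the job_timeout value, and the payload (copied from the Job manifest below) are assumptions; the actual enqueueing lives in the manager package, which this diff does not show in full:

import redis
from rq import Queue

# Assumption: the same Redis instance the docker-compose file exposes.
redis_conn = redis.from_url('redis://redis:6379/0')
queue = Queue('jobs', connection=redis_conn)  # queue name is an assumption

job = {
    'url': 'https://www.gruene.de/',
    'type': 'PARTY',
    'level': 'DE:BUNDESVERBAND',
    'state': None,
    'district': None,
    'city': None,
}

# RQ resolves the string 'job.run' to the run() function in job.py above.
# job_timeout (rq >= 1.0) caps the RQ-side runtime of the job.
queue.enqueue('job.run', job, job_timeout=100)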
@@ -0,0 +1,67 @@
import config

import os
from datetime import datetime
import time
import random
from pathlib import Path

import kubernetes

PENDING_LIMIT = 2
RUNNING_LIMIT = 4

INTERVAL = 10  # Seconds


def main():

    # Get jobs
    jobs = list(Path("./k8s-jobs").rglob("*.yaml"))
    random.seed()
    random.shuffle(jobs)

    kubernetes.config.load_kube_config(context='giantswarm-5jka7')
    v1client = kubernetes.client.CoreV1Api()
    k8sclient = kubernetes.client.ApiClient()

    start = datetime.utcnow()
    jobs_queued = 0

    while len(jobs) > 0:
        # Check whether there are pods pending
        pending_pods = v1client.list_pod_for_all_namespaces(
            watch=False,
            field_selector='status.phase=Pending',
            label_selector='app=green-spider')
        pending = list(pending_pods.items)

        # Get running pods
        running_pods = v1client.list_pod_for_all_namespaces(
            watch=False,
            field_selector='status.phase=Running',
            label_selector='app=green-spider')
        running = list(running_pods.items)

        now = datetime.utcnow()
        duration = now - start

        # Add new jobs to the queue
        if len(pending) < PENDING_LIMIT and len(running) < RUNNING_LIMIT:
            to_be_queued = RUNNING_LIMIT - len(running)
            for _ in range(to_be_queued):
                if not jobs:
                    # Avoid popping from an empty list on the last round.
                    break
                job_path = jobs.pop(0)
                jobs_queued += 1

                duration_per_job = duration / jobs_queued
                jobs_remaining = len(jobs)

                print(f'{jobs_queued} jobs queued in {duration} - '
                      f'{jobs_remaining} jobs (estimated {duration_per_job * jobs_remaining}) remaining '
                      f'at {int(duration_per_job.total_seconds())} seconds per job on average')
                kubernetes.utils.create_from_yaml(k8sclient, job_path)
                os.remove(job_path)

        time.sleep(INTERVAL)

    print('No more jobs left. Done.')


if __name__ == '__main__':
    main()
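The script above consumes manifests from ./k8s-jobs but does not create them; that happens in the "Add creation of Kubernetes jobs" part of this changeset, which is not shown here. As a hypothetical sketch of what generating those per-job manifests could look like, assuming the Job manifest below serves as a template (the template path, file naming, and helper name are all assumptions):

import json
import os

import yaml  # PyYAML


def write_job_manifest(index, job, template_path="k8s-job-template.yaml"):
    # Load the template Job manifest (assumed to look like the one below).
    with open(template_path) as f:
        manifest = yaml.safe_load(f)

    # Give each Job a unique name and inject this job's JSON payload
    # as the final --job argument of the container command.
    manifest["metadata"]["name"] = f"green-spider-job-{index}"
    container = manifest["spec"]["template"]["spec"]["containers"][0]
    container["command"][-1] = "--job=" + json.dumps(job)

    # Write the manifest where the queue runner above picks it up.
    os.makedirs("k8s-jobs", exist_ok=True)
    with open(f"k8s-jobs/green-spider-job-{index}.yaml", "w") as f:
        yaml.safe_dump(manifest, f)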
@@ -0,0 +1,67 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: green-spider-job-1
  namespace: marian
  labels:
    app: green-spider
spec:
  activeDeadlineSeconds: 120
  ttlSecondsAfterFinished: 600
  completions: 1
  backoffLimit: 3

  # Pod template
  template:
    metadata:
      name: green-spider-job
      namespace: marian
      labels:
        app: green-spider
    spec:
      restartPolicy: Never
      nodeSelector:
        giantswarm.io/machine-pool: 5n27k
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - green-spider
            topologyKey: topology.kubernetes.io/region
      containers:
      - name: spider
        image: quay.io/netzbegruenung/green-spider:kubernetes
        imagePullPolicy: IfNotPresent
        command:
          - python
          - cli.py
          - --credentials-path=/secrets/datastore-writer.json
          - --loglevel=debug
          - spider
          - '--job={"url":"https://www.gruene.de/","type":"PARTY","level":"DE:BUNDESVERBAND","state":null,"district":null,"city":null}'
        volumeMounts:
          - name: secrets
            mountPath: "/secrets"
            readOnly: true
          - name: shared
            mountPath: /dev/shm
        resources:
          requests:
            cpu: 1000m
            memory: 5000M
      volumes:
        - name: secrets
          secret:
            secretName: green-spider
            items:
              - key: datastore-writer.json
                path: datastore-writer.json
              - key: screenshots-uploader.json
                path: screenshots-uploader.json
        - name: shared
          emptyDir: {}
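In the Kubernetes setup, activeDeadlineSeconds: 120 takes over the role of the JOB_TIMEOUT kill logic in job.py, and backoffLimit: 3 provides the second and third attempts its timeout comment refers to. The required pod anti-affinity on topology.kubernetes.io/region keeps any two green-spider pods out of the same region, and the emptyDir volume mounted at /dev/shm serves the same purpose as shm_size='2G' in the Docker-based runner.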
@@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: green-spider-job-psp
  namespace: marian
spec:
  privileged: false
  seLinux:
    rule: RunAsAny
  supplementalGroups:
    rule: RunAsAny
  runAsUser:
    rule: RunAsAny
  fsGroup:
    rule: RunAsAny
  volumes:
    - emptyDir
    - secret
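Two caveats about this manifest: PodSecurityPolicy is a cluster-scoped resource, so the namespace field is ignored, and the policy/v1beta1 API was deprecated in Kubernetes 1.21 and removed in 1.25, so it only applies on clusters up to 1.24.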