Job management with RQ, and much more (#149)
* CLI: remove 'jobs' command, add 'manager'
* Add job definition
* Move jobs to manage folder
* Rename jobs to manager
* Add rq and redis dependencies
* Add docker-compose YAML
* Downgrade to alpine 3.8
* Adjust paths in Dockerfile, remove entrypoint
* Rename 'make spiderjobs' to 'make jobs'
* Fix docker execution
* Adapt 'make jobs'
* Fix metadata scheme
* Add docker dependency
* Randomize queue (a bit)
* Use latest image, remove debug output
* Make docker-compose file backwards-compatible
* Use latest instead of dev image tag
* Update docker-compose.yaml
* Adapt job start script
* Fix redis connection in manager
* Add support for increasing timeout via environment variable
* Adapt load_in_browser to cookies table schema change
* Fix execution
* Mitigate yaml warning
* Bump some dependency versions
* Report resource usage stats for each job
* checks/load_in_browser: Return DOM size, prevent multiple page loads
* Update .dockerignore
* Code update
* Script update
* Update README.md
* WIP
* WIP commit
* Update Dockerfile to alpine:edge and chromium v90
* Update TestCertificateChecker
* Set defaults for __init__ function
* Detect sunflower theme
* Update unit test for new datetime (zero-basing)
* Set logging prefs from Chromium in a new way
* Move datastore client instantiation, as it is not needed for all commands
* Change green-directory repository URL
* Add git settings for cloning green-directory
* Pin alpine version 3.14, fix py3-cryptography
* Use plain docker build progress output
* Add volumes to 'make test' docker run command
* Fix bug
* Update example command in README
* Update dependencies
* Add creation of Kubernetes jobs
parent e59b05fc6c
commit 618e29d763
@@ -1,23 +1,26 @@
FROM python:3.7-alpine3.9
FROM alpine:3.14

WORKDIR /workdir

ADD requirements.txt /workdir/

RUN echo "http://dl-4.alpinelinux.org/alpine/v3.8/main" >> /etc/apk/repositories && \
    echo "http://dl-4.alpinelinux.org/alpine/v3.8/community" >> /etc/apk/repositories && \
    apk update && \
    apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
    pip3 install --upgrade pip && \
    pip3 install -r requirements.txt && \
    apk del python3-dev build-base
RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
    echo "http://dl-4.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
    apk --update --no-cache add ca-certificates chromium chromium-chromedriver \
        python3-dev py3-grpcio py3-wheel py3-pip py3-lxml \
        build-base git libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
    pip install -r requirements.txt && \
    apk del build-base

ADD cli.py /
ADD config /config
ADD jobs /jobs
ADD checks /checks
ADD rating /rating
ADD spider /spider
ADD export /export
# As alpine's py3-cryptography did not work as of alpine v3.14, we use this hack from
# https://github.com/pyca/cryptography/issues/3344#issuecomment-650845512
RUN LDFLAGS="-L/opt/openssl/lib -Wl,-rpath,/opt/openssl/lib" CFLAGS="-I/opt/openssl/include" pip3 install -U cryptography

ENTRYPOINT ["python3", "/cli.py"]
ADD cli.py /workdir/
ADD manager /workdir/manager
ADD config /workdir/config
ADD checks /workdir/checks
ADD rating /workdir/rating
ADD spider /workdir/spider
ADD export /workdir/export
ADD job.py /workdir/
@@ -1,16 +0,0 @@
#!/bin/bash

# Log in to webapp server via SSH

API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
source $API_TOKEN_SECRET

source devops/functions.bash

get_ip

echo "Use this command for SSH access:"
echo "ssh -o StrictHostKeyChecking=no root@${IP_IP}"

ssh -o StrictHostKeyChecking=no root@${IP_IP}
@@ -0,0 +1,51 @@
version: "2"
services:

  redis:
    image: redis:5-alpine
    command: redis-server --save "" --appendonly no
    volumes:
      - ${PWD}/volumes/redis-data:/data
    restart: unless-stopped
    networks:
      - internal_network
      - external_network
    ports:
      - "6379:6379"

  # manager manages the job queue.
  manager:
    image: quay.io/netzbegruenung/green-spider:latest
    command: >
      python3 cli.py
      --credentials-path /secrets/datastore-writer.json
      --loglevel debug manager
    environment:
      REDIS_URL: redis://redis:6379/0
      GIT_USERNAME: ${GIT_USERNAME}
      GIT_PASSWORD: ${GIT_PASSWORD}
    volumes:
      - ${PWD}/secrets:/secrets
    networks:
      - internal_network
      - external_network
    depends_on:
      - redis

  dashboard:
    image: eoranged/rq-dashboard:v0.6.1
    environment:
      RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
    networks:
      - internal_network
      - external_network
    ports:
      - "9181:9181"
    depends_on:
      - redis

networks:
  internal_network:
    internal: true
  external_network:
    internal: false
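For local testing, running docker-compose up -d redis manager dashboard should be enough to bring up the Redis queue, the manager and the RQ dashboard (reachable on port 9181 as configured above), provided GIT_USERNAME and GIT_PASSWORD are set in the environment and secrets/datastore-writer.json exists, since both are passed into the manager container.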
@@ -0,0 +1,147 @@
"""
This script is executed by the RQ worker to process a single job from
the spider queue.
"""

import json
import os
from datetime import datetime
import time
import logging

import docker
from google.cloud import datastore

# Maximum per-job runtime in seconds. This can be increased for the second and
# third attempt via the JOB_TIMEOUT environment variable.
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))

DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'

CREDENTIALS_PATH = '/secrets/datastore-writer.json'

client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')

datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)

pwd = os.path.abspath(".")
secrets_path = pwd + "/secrets"
chromedir_path = pwd + "/volumes/chrome-userdir"
screenshots_path = pwd + "/screenshots"

volumes = {}
volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'}
volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'}
volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'}

logger = logging.getLogger('rq.worker')
logger.setLevel(logging.DEBUG)

def run(job):
    """
    Runs a spider container with the given job.

    Returns a dict with stats about the spider run. If the execution takes
    longer than the duration defined by the JOB_TIMEOUT environment variable
    (in seconds), the container gets killed.
    """
    cmd_template = ("python cli.py --credentials-path={path} "
                    " --loglevel=debug "
                    " spider "
                    " --job='{job_json}'")

    cmd = cmd_template.format(path=CREDENTIALS_PATH,
                              job_json=json.dumps(job))

    container = client.containers.run(image=DOCKER_IMAGE,
                                      command=cmd,
                                      detach=True,
                                      remove=True,
                                      shm_size='2G',
                                      stdout=True,
                                      stderr=True,
                                      tty=False,
                                      volumes=volumes)

    id = container.id

    # Data about this spider run, to be written to datastore
    key = datastore_client.key('spider-runs')
    entity = datastore.Entity(key=key)
    results = {
        'datetime': datetime.utcnow(),
        'url': job['url'],
        'success': True,
        'error': '',
        'duration_seconds': 0,
        'cpu_usage_seconds': 0,
        'network_received_bytes': 0,
        'network_transmitted_bytes': 0,
        'memory_max_bytes': 0,
    }

    # wait for finish
    start = datetime.utcnow()
    while True:
        time.sleep(1)

        clist = client.containers.list(filters={'id': id})
        if len(clist) == 0:
            break

        for c in clist:

            # Collect stats
            try:
                stats = low_level_client.stats(id, stream=False)

                cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0

                # Default to 0 in case the stats response has no 'networks' section.
                network_received_bytes = 0
                network_transmitted_bytes = 0
                if 'networks' in stats:
                    network_received_bytes = stats['networks']['eth0']['rx_bytes']
                    network_transmitted_bytes = stats['networks']['eth0']['tx_bytes']

                memory_max_bytes = 0
                if 'max_usage' in stats['memory_stats']:
                    memory_max_bytes = stats['memory_stats']['max_usage']
                results['memory_max_bytes'] = memory_max_bytes

                #logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000))

                if cpu_usage > 0:
                    results['cpu_usage_seconds'] = round(cpu_usage)

                if network_received_bytes > 0:
                    results['network_received_bytes'] = network_received_bytes

                if network_transmitted_bytes > 0:
                    results['network_transmitted_bytes'] = network_transmitted_bytes


            except docker.errors.APIError as e:
                logger.error("Could not get stats: %s" % e)
            except json.decoder.JSONDecodeError:
                # This means we didn't get proper stats
                pass

            runtime = (datetime.utcnow() - start).seconds
            results['duration_seconds'] = round(runtime)

            #if c.status != "running":
            #    logger.info("Container %s status: %s" % (c.id, c.status))

            if c.status == "exited":
                logger.debug("Container %s is exited." % c.id)
                break

            if runtime > TIMEOUT:
                c.kill()
                results['success'] = False
                results['error'] = 'TIMEOUT'
                entity.update(results)
                datastore_client.put(entity)
                raise Exception("Execution took too long. Killed container after %s seconds." % TIMEOUT)

    entity.update(results)
    datastore_client.put(entity)
    return results
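Nothing in this diff shows the enqueueing side, but given the rq/redis dependencies and the REDIS_URL passed to the manager service above, the manager presumably enqueues calls to run() roughly like the following sketch. Queue name and RQ timeout here are assumptions, not taken from this commit.

import redis
from rq import Queue

import job  # the module shown above

# REDIS_URL as set for the manager container in docker-compose.yaml
redis_conn = redis.from_url("redis://redis:6379/0")
queue = Queue("spider", connection=redis_conn)  # queue name is an assumption

# Job payload, mirroring the --job value used in the Kubernetes manifests below
spider_job = {
    "url": "https://www.gruene.de/",
    "type": "PARTY",
    "level": "DE:BUNDESVERBAND",
    "state": None,
    "district": None,
    "city": None,
}

# The RQ-level timeout is kept generous; job.run() itself enforces the tighter
# per-container JOB_TIMEOUT.
queue.enqueue(job.run, spider_job, job_timeout=600)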
@@ -0,0 +1,67 @@
import config

import os
from datetime import datetime
import time
import random
from pathlib import Path

import kubernetes

PENDING_LIMIT = 2
RUNNING_LIMIT = 4

INTERVAL = 10 # Seconds

def main():

    # Get jobs
    jobs = list(Path("./k8s-jobs").rglob("*.yaml"))
    random.seed()
    random.shuffle(jobs)

    kubernetes.config.load_kube_config(context='giantswarm-5jka7')
    v1client = kubernetes.client.CoreV1Api()
    k8sclient = kubernetes.client.ApiClient()

    start = datetime.utcnow()
    jobs_queued = 0

    while len(jobs) > 0:
        # Check whether there are pods pending
        pending_pods = v1client.list_pod_for_all_namespaces(
            watch=False,
            field_selector='status.phase=Pending',
            label_selector='app=green-spider')
        pending = list(pending_pods.items)

        # Get running pods
        running_pods = v1client.list_pod_for_all_namespaces(
            watch=False,
            field_selector='status.phase=Running',
            label_selector='app=green-spider')
        running = list(running_pods.items)

        now = datetime.utcnow()
        duration = now - start

        # Add new jobs to the queue
        if len(pending) < PENDING_LIMIT and len(running) < RUNNING_LIMIT:
            to_be_queued = RUNNING_LIMIT - len(running)
            for _ in range(to_be_queued):
                # Guard against running out of jobs before all free slots are filled.
                if len(jobs) == 0:
                    break
                job_path = jobs.pop(0)
                jobs_queued += 1

                duration_per_job = duration / jobs_queued
                jobs_remaining = len(jobs)

                print(f'{jobs_queued} jobs queued in {duration} - {jobs_remaining} jobs (estimated {duration_per_job * jobs_remaining}) remaining at {int(duration_per_job.total_seconds())} seconds per job on average')
                kubernetes.utils.create_from_yaml(k8sclient, job_path)
                os.remove(job_path)

        time.sleep(INTERVAL)

    print('No more jobs left. Done.')

if __name__ == '__main__':
    main()
@@ -0,0 +1,67 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: green-spider-job-1
  namespace: marian
  labels:
    app: green-spider
spec:
  activeDeadlineSeconds: 120
  ttlSecondsAfterFinished: 600
  completions: 1
  backoffLimit: 3

  # Pod template
  template:
    metadata:
      name: green-spider-job
      namespace: marian
      labels:
        app: green-spider
    spec:
      restartPolicy: Never
      nodeSelector:
        giantswarm.io/machine-pool: 5n27k
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: app
                operator: In
                values:
                - green-spider
            topologyKey: topology.kubernetes.io/region
      containers:
      - name: spider
        image: quay.io/netzbegruenung/green-spider:kubernetes
        imagePullPolicy: IfNotPresent
        command:
        - python
        - cli.py
        - --credentials-path=/secrets/datastore-writer.json
        - --loglevel=debug
        - spider
        - '--job={"url":"https://www.gruene.de/","type":"PARTY","level":"DE:BUNDESVERBAND","state":null,"district":null,"city":null}'
        volumeMounts:
        - name: secrets
          mountPath: "/secrets"
          readOnly: true
        - name: shared
          mountPath: /dev/shm
        resources:
          requests:
            cpu: 1000m
            memory: 5000M
      volumes:
      - name: secrets
        secret:
          secretName: green-spider
          items:
          - key: datastore-writer.json
            path: datastore-writer.json
          - key: screenshots-uploader.json
            path: screenshots-uploader.json
      - name: shared
        emptyDir: {}
@@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: green-spider-job-psp
  namespace: marian
spec:
  privileged: false
  seLinux:
    rule: RunAsAny
  supplementalGroups:
    rule: RunAsAny
  runAsUser:
    rule: RunAsAny
  fsGroup:
    rule: RunAsAny
  volumes:
  - emptyDir
  - secret
@@ -0,0 +1,67 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: JOB_NAME
  namespace: marian
  labels:
    app: green-spider
spec:
  activeDeadlineSeconds: 600
  ttlSecondsAfterFinished: 600
  completions: 1
  backoffLimit: 3

  # Pod template
  template:
    metadata:
      name: POD_NAME
      namespace: marian
      labels:
        app: green-spider
    spec:
      restartPolicy: Never
      nodeSelector:
        giantswarm.io/machine-pool: 5n27k
      # affinity:
      #   podAntiAffinity:
      #     requiredDuringSchedulingIgnoredDuringExecution:
      #     - labelSelector:
      #         matchExpressions:
      #         - key: app
      #           operator: In
      #           values:
      #           - green-spider
      #       topologyKey: topology.kubernetes.io/region
      containers:
      - name: spider
        image: quay.io/netzbegruenung/green-spider:20211031-chromium93
        imagePullPolicy: IfNotPresent
        command:
        - python3
        - cli.py
        - --credentials-path=/secrets/datastore-writer.json
        - --loglevel=debug
        - spider
        - JOB_FLAG
        volumeMounts:
        - name: secrets
          mountPath: "/secrets"
          readOnly: true
        - name: shared
          mountPath: /dev/shm
        resources:
          requests:
            cpu: 1000m
            memory: 5000M
      volumes:
      - name: secrets
        secret:
          secretName: green-spider
          items:
          - key: datastore-writer.json
            path: datastore-writer.json
          - key: screenshots-uploader.json
            path: screenshots-uploader.json
      - name: shared
        emptyDir: {}
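The JOB_NAME, POD_NAME and JOB_FLAG placeholders suggest that the per-job manifests consumed from ./k8s-jobs by the queueing script above are generated by plain string substitution over this template. A minimal, hypothetical sketch of that step follows; the helper and the file names are assumptions, not part of this commit.

import json

def render_job_manifest(template_text, index, job):
    # Single-quote the flag so the embedded JSON survives YAML parsing,
    # mirroring the hand-written example manifest above.
    job_flag = "'--job=" + json.dumps(job, separators=(",", ":")) + "'"
    return (template_text
            .replace("JOB_NAME", "green-spider-job-%d" % index)
            .replace("POD_NAME", "green-spider-pod-%d" % index)
            .replace("JOB_FLAG", job_flag))

# Example: write one manifest into the directory the queueing script polls.
job = {"url": "https://www.gruene.de/", "type": "PARTY",
       "level": "DE:BUNDESVERBAND", "state": None, "district": None, "city": None}

with open("k8s-job-template.yaml") as f:  # template file name is an assumption
    manifest = render_job_manifest(f.read(), 1, job)

with open("k8s-jobs/green-spider-job-1.yaml", "w") as f:
    f.write(manifest)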
@@ -0,0 +1,57 @@
"""
This rater evaluates the amount of data transferred for a page load.

Currently no score is given. The plan is, however, to reward sites that
cause smaller transfers.