green-spider/job.py
Marian Steinbach 618e29d763
Job management with RQ, and much more (#149)
* CLI: remove 'jobs' command, add 'manager'

* Add job definition

* Move jobs to manage folder

* Rename jobs to manager

* Add rq and redis dependencies

* Add docker-compose YAML

* Downgrade to alpine 3.8

* Adjust paths in Dockerfile, remove entrypoint

* Rename 'make spiderjobs' to 'make jobs'

* Fix docker execution

* Adapt 'make jobs'

* Fix metadata scheme

* Add docker dependency

* Randomize queue (a bit)

* Use latest image, remove debug output

* Make docker-compose file backwards-compatible

* Use latest instead of dev image tag

* Update docker-compose.yaml

* Adapt job start script

* Fix redis connection in manager

* Add support for increasing timeout via environment variable

* Adapt load_in_browser to cookies table schema change

* Fix execution

* Mitigate yaml warning

* Bump some dependency versions

* Report resource usage stats for each job

* checks/load_in_browser: Return DOM size, prevent multiple page loads

* Update .dockerignore

* Code update

* Script update

* Update README.md

* WIP

* WIP commit

* Update Dockerfile to alpine:edge and chromium v90

* Update TestCertificateChecker

* Set defaults for __init__ function

* Detect sunflower theme

* Update unit test for new datetime (zero-basing)

* Set logging prefs from Chromium in a new way

* Move datastore client instantiation

As it is not needed for all commands

* Change green-directory repository URL

* Add git settings for cloning green-directory

* Pin alpine version 3.14, fix py3-cryptography

* Use plain docker build progress output

* Add volumes to 'make test' docker run command

* Fix bug

* Update example command in README

* Update dependencies

* Add creation of Kubernetes jobs
2021-11-11 20:15:43 +01:00


"""
Dieses Script wird vom RQ worker ausgeführt, um einen einzelnen Job aus der
Spider-Warteschlange abzuarbeiten.
"""
import json
import logging
import os
import time
from datetime import datetime

import docker
from google.cloud import datastore

# Maximum per-job runtime in seconds. This can be increased for second and
# third attempts via the JOB_TIMEOUT environment variable.
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
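
# For retries, the limit can be raised by setting JOB_TIMEOUT in the worker's
# environment, e.g. (illustrative invocation, queue name 'low' is an assumption):
#   JOB_TIMEOUT=100 rq worker --url redis://redis:6379/0 low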

DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'

CREDENTIALS_PATH = '/secrets/datastore-writer.json'

client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')

datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)

# Host directories mounted into every spider container.
pwd = os.path.abspath(".")
secrets_path = pwd + "/secrets"
chromedir_path = pwd + "/volumes/chrome-userdir"
screenshots_path = pwd + "/screenshots"

volumes = {}
volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'}
volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'}
volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'}

logger = logging.getLogger('rq.worker')
logger.setLevel(logging.DEBUG)
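
# A job is a dict describing a single website check. The only key this script
# itself uses is 'url' (see run() below); the full dict is passed on to the
# spider CLI unchanged, e.g. {"url": "https://example.org/"}.
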
def run(job):
"""
Runs a spider container with the given job.
Returns the container logs. If the execution takes longer than the
duration defined by the JOB_TIMEOUT environment variable (in seconds),
the container gets killed.
"""
cmd_template = ("python cli.py --credentials-path={path} "
" --loglevel=debug "
" spider "
" --job='{job_json}'")
cmd = cmd_template.format(path=CREDENTIALS_PATH,
job_json=json.dumps(job))
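
    # For illustration, with job = {"url": "https://example.org/"} the command
    # rendered above is roughly:
    #   python cli.py --credentials-path=/secrets/datastore-writer.json \
    #       --loglevel=debug spider --job='{"url": "https://example.org/"}'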

    container = client.containers.run(image=DOCKER_IMAGE,
                                      command=cmd,
                                      detach=True,
                                      remove=True,
                                      shm_size='2G',
                                      stdout=True,
                                      stderr=True,
                                      tty=False,
                                      volumes=volumes)

    id = container.id
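
    # Because the container was started with remove=True, it disappears from
    # the container list as soon as it exits; the wait loop below relies on that.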

    # Data about this spider run, to be written to datastore
    key = datastore_client.key('spider-runs')
    entity = datastore.Entity(key=key)
    results = {
        'datetime': datetime.utcnow(),
        'url': job['url'],
        'success': True,
        'error': '',
        'duration_seconds': 0,
        'cpu_usage_seconds': 0,
        'network_received_bytes': 0,
        'network_transmitted_bytes': 0,
        'memory_max_bytes': 0,
    }
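
    # The zero values above are just defaults; they are overwritten below as
    # container stats become available.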

    # Wait for the container to finish.
    start = datetime.utcnow()
    while True:
        time.sleep(1)

        clist = client.containers.list(filters={'id': id})
        if len(clist) == 0:
            break

        for c in clist:
            # Collect resource usage stats.
            try:
                stats = low_level_client.stats(id, stream=False)

                # CPU usage is reported in nanoseconds; convert to seconds.
                cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0

                network_received_bytes = 0
                network_transmitted_bytes = 0
                if 'networks' in stats:
                    network_received_bytes = stats['networks']['eth0']['rx_bytes']
                    network_transmitted_bytes = stats['networks']['eth0']['tx_bytes']

                memory_max_bytes = 0
                if 'max_usage' in stats['memory_stats']:
                    memory_max_bytes = stats['memory_stats']['max_usage']
                    results['memory_max_bytes'] = memory_max_bytes

                #logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000))

                if cpu_usage > 0:
                    results['cpu_usage_seconds'] = round(cpu_usage)
                if network_received_bytes > 0:
                    results['network_received_bytes'] = network_received_bytes
                if network_transmitted_bytes > 0:
                    results['network_transmitted_bytes'] = network_transmitted_bytes

            except docker.errors.APIError as e:
                logger.error("Could not get stats: %s" % e)
            except json.decoder.JSONDecodeError:
                # This means we didn't get proper stats.
                pass

            runtime = (datetime.utcnow() - start).seconds
            results['duration_seconds'] = round(runtime)

            #if c.status != "running":
            #    logger.info("Container %s status: %s" % (c.id, c.status))

            if c.status == "exited":
                logger.debug("Container %s is exited." % c.id)
                break

            if runtime > TIMEOUT:
                c.kill()
                results['success'] = False
                results['error'] = 'TIMEOUT'
                entity.update(results)
                datastore_client.put(entity)
                raise Exception("Execution took too long. Killed container after %s seconds." % TIMEOUT)

    entity.update(results)
    datastore_client.put(entity)

    return results
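
# Local smoke test (illustrative only; in production the RQ worker imports this
# module and calls run() with a job taken from the queue):
#   python -c "import job; print(job.run({'url': 'https://example.org/'}))"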