Helps you optimize your BÜNDNIS 90/DIE GRÜNEN website
https://green-spider.netzbegruenung.de/
147 lines
4.9 KiB
""" |
|
Dieses Script wird vom RQ worker ausgeführt, um einen einzelnen Job aus der |
|
Spider-Warteschlange abzuarbeiten. |
|
""" |

import json
import os
from datetime import datetime
import time
import logging

import docker
from google.cloud import datastore

# Maximum per-job runtime in seconds. This can be increased for the second and
# third attempt via the JOB_TIMEOUT environment variable.
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))

DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'

CREDENTIALS_PATH = '/secrets/datastore-writer.json'

client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')

# On the host, the credentials file lives below the current working directory,
# hence the "." prefix in front of the container path.
datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)

pwd = os.path.abspath(".")
secrets_path = pwd + "/secrets"
chromedir_path = pwd + "/volumes/chrome-userdir"
screenshots_path = pwd + "/screenshots"

volumes = {}
volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'}
volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'}
volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'}
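
# For reference only (an assumption about manual reproduction, not part of the
# original file): these mounts correspond roughly to
#   docker run -v $PWD/secrets:/secrets:ro \
#     -v $PWD/volumes/chrome-userdir:/opt/chrome-userdir \
#     -v $PWD/screenshots:/screenshots \
#     quay.io/netzbegruenung/green-spider:latest ...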

logger = logging.getLogger('rq.worker')
logger.setLevel(logging.DEBUG)


def run(job):
    """
    Runs a spider container for the given job.

    Returns a dict with metadata and statistics for this spider run. If the
    execution takes longer than the duration defined by the JOB_TIMEOUT
    environment variable (in seconds), the container gets killed.
    """
    cmd_template = ("python cli.py --credentials-path={path} "
                    " --loglevel=debug "
                    " spider "
                    " --job='{job_json}'")

    cmd = cmd_template.format(path=CREDENTIALS_PATH,
                              job_json=json.dumps(job))
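
    # For illustration: with the constants above, the rendered command looks
    # roughly like this (the actual job JSON depends on what was enqueued):
    #   python cli.py --credentials-path=/secrets/datastore-writer.json \
    #     --loglevel=debug  spider  --job='{"url": "https://gruene-example.de/"}'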

    container = client.containers.run(image=DOCKER_IMAGE,
                                      command=cmd,
                                      detach=True,
                                      remove=True,
                                      shm_size='2G',
                                      stdout=True,
                                      stderr=True,
                                      tty=False,
                                      volumes=volumes)

    id = container.id

    # Data about this spider run, to be written to datastore
    key = datastore_client.key('spider-runs')
    entity = datastore.Entity(key=key)
    results = {
        'datetime': datetime.utcnow(),
        'url': job['url'],
        'success': True,
        'error': '',
        'duration_seconds': 0,
        'cpu_usage_seconds': 0,
        'network_received_bytes': 0,
        'network_transmitted_bytes': 0,
        'memory_max_bytes': 0,
    }

    # wait for finish
    start = datetime.utcnow()
    while True:
        time.sleep(1)

        clist = client.containers.list(filters={'id': id})
        if len(clist) == 0:
            break

        for c in clist:

            # Collect stats
            try:
                stats = low_level_client.stats(id, stream=False)

                cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0

                # The stats payload does not always contain network counters,
                # so default them to zero before reading.
                network_received_bytes = 0
                network_transmitted_bytes = 0
                if 'networks' in stats:
                    network_received_bytes = stats['networks']['eth0']['rx_bytes']
                    network_transmitted_bytes = stats['networks']['eth0']['tx_bytes']

                memory_max_bytes = 0
                if 'max_usage' in stats['memory_stats']:
                    memory_max_bytes = stats['memory_stats']['max_usage']
                results['memory_max_bytes'] = memory_max_bytes

                #logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000))

                if cpu_usage > 0:
                    results['cpu_usage_seconds'] = round(cpu_usage)

                if network_received_bytes > 0:
                    results['network_received_bytes'] = network_received_bytes

                if network_transmitted_bytes > 0:
                    results['network_transmitted_bytes'] = network_transmitted_bytes

            except docker.errors.APIError as e:
                logger.error("Could not get stats: %s" % e)
            except json.decoder.JSONDecodeError:
                # This means we didn't get proper stats
                pass

            runtime = (datetime.utcnow() - start).seconds
            results['duration_seconds'] = round(runtime)

            #if c.status != "running":
            #    logger.info("Container %s status: %s" % (c.id, c.status))

            if c.status == "exited":
                logger.debug("Container %s is exited." % c.id)
                break

            if runtime > TIMEOUT:
                c.kill()
                results['success'] = False
                results['error'] = 'TIMEOUT'
                entity.update(results)
                datastore_client.put(entity)
                raise Exception("Execution took too long. Killed container after %s seconds." % TIMEOUT)

    entity.update(results)
    datastore_client.put(entity)
    return results
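
# Minimal local test sketch (assumption, not part of the original module):
# run() only reads the 'url' key from the job dict here; the full dict is
# passed through to the spider CLI inside the container.
#
#   if __name__ == '__main__':
#       print(run({'url': 'https://gruene-example.de/'}))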