""" Dieses Script wird vom RQ worker ausgeführt, um einen einzelnen Job aus der Spider-Warteschlange abzuarbeiten. """ import json import os from datetime import datetime import time import logging import docker from google.cloud import datastore # Maximum oper-job runtime in seconds. This can be increased for second, third attempt # via the environment JOB_TIMEOUT variable. TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50")) DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest' CREDENTIALS_PATH = '/secrets/datastore-writer.json' client = docker.from_env() low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock') datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH) pwd = os.path.abspath(".") secrets_path = pwd + "/secrets" chromedir_path = pwd + "/volumes/chrome-userdir" screenshots_path = pwd + "/screenshots" volumes = {} volumes[secrets_path] = {'bind': '/secrets', 'mode': 'ro'} volumes[chromedir_path] = {'bind': '/opt/chrome-userdir', 'mode': 'rw'} volumes[screenshots_path] = {'bind': '/screenshots', 'mode': 'rw'} logger = logging.getLogger('rq.worker') logger.setLevel(logging.DEBUG) def run(job): """ Runs a spider container with the given job. Returns the container logs. If the execution takes longer than the duration defined by the JOB_TIMEOUT environment variable (in seconds), the container gets killed. """ cmd_template = ("python cli.py --credentials-path={path} " " --loglevel=debug " " spider " " --job='{job_json}'") cmd = cmd_template.format(path=CREDENTIALS_PATH, job_json=json.dumps(job)) container = client.containers.run(image=DOCKER_IMAGE, command=cmd, detach=True, remove=True, shm_size='2G', stdout=True, stderr=True, tty=False, volumes=volumes) id = container.id # Data about this spider run, to be written to datastore key = datastore_client.key('spider-runs') entity = datastore.Entity(key=key) results = { 'datetime': datetime.utcnow(), 'url': job['url'], 'success': True, 'error': '', 'duration_seconds': 0, 'cpu_usage_seconds': 0, 'network_received_bytes': 0, 'network_transmitted_bytes': 0, 'memory_max_bytes': 0, } # wait for finish start = datetime.utcnow() while True: time.sleep(1) clist = client.containers.list(filters={'id': id}) if len(clist) == 0: break for c in clist: # Collect stats try: stats = low_level_client.stats(id, stream=False) cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0 if 'networks' in stats: network_received_bytes = stats['networks']['eth0']['rx_bytes'] network_transmitted_bytes = stats['networks']['eth0']['tx_bytes'] memory_max_bytes = 0 if 'max_usage' in stats['memory_stats']: memory_max_bytes = stats['memory_stats']['max_usage'] results['memory_max_bytes'] = memory_max_bytes #logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000)) if cpu_usage > 0: results['cpu_usage_seconds'] = round(cpu_usage) if network_received_bytes > 0: results['network_received_bytes'] = network_received_bytes if network_transmitted_bytes > 0: results['network_transmitted_bytes'] = network_transmitted_bytes except docker.errors.APIError as e: logger.error("Could not get stats: %s" % e) except json.decoder.JSONDecodeError: # This means we didn't get proper stats pass runtime = (datetime.utcnow() - start).seconds results['duration_seconds'] = round(runtime) #if c.status != "running": # logger.info("Container %s status: %s" % (c.id, c.status)) if c.status == "exited": logger.debug("Container %s is exited." 
% c.id) break if runtime > TIMEOUT: c.kill() results['success'] = False results['error'] = 'TIMEOUT' entity.update(results) datastore_client.put(entity) raise Exception("Execution took too long. Killed container after %s seconds." % TIMEOUT) entity.update(results) datastore_client.put(entity) return results
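

# A minimal manual-test entry point; this is a sketch, not part of the
# original module. It assumes a job dict with at least a 'url' key (run()
# reads job['url']; the example URL is hypothetical), a local Docker daemon,
# and the Datastore credentials file at ./secrets/datastore-writer.json.
if __name__ == '__main__':
    sample_job = {'url': 'https://example.com/'}  # hypothetical payload
    print(run(sample_job))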