# green-spider/job.py

"""
This script is executed by the RQ worker to process a single job from the
spider queue.
"""
import json
import os
from datetime import datetime
import time
import logging
import docker
from google.cloud import datastore

# Maximum per-job runtime in seconds. This can be increased for the second and
# third attempt via the JOB_TIMEOUT environment variable.
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
CREDENTIALS_PATH = '/secrets/datastore-writer.json'
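
# Two Docker clients: the high-level client starts and lists containers,
# while the low-level APIClient is needed for the per-container stats endpoint.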
client = docker.from_env()
low_level_client = docker.APIClient(base_url='unix://var/run/docker.sock')
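
# The service account file is read from ./secrets on the host; the same
# directory is mounted read-only into the spider container at /secrets.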
datastore_client = datastore.Client.from_service_account_json("." + CREDENTIALS_PATH)
pwd = os.path.abspath(".")
secrets_path = pwd + "/secrets"
chromedir_path = pwd + "/volumes/chrome-userdir"
screenshots_path = pwd + "/screenshots"
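
# Host directories mounted into each spider container: credentials
# (read-only), a Chromium user data dir, and the screenshot output directory.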
volumes = {
    secrets_path: {'bind': '/secrets', 'mode': 'ro'},
    chromedir_path: {'bind': '/opt/chrome-userdir', 'mode': 'rw'},
    screenshots_path: {'bind': '/screenshots', 'mode': 'rw'},
}
logger = logging.getLogger('rq.worker')
logger.setLevel(logging.DEBUG)


def run(job):
    """
    Runs a spider container for the given job.

    Returns a dict with metadata and resource usage stats for this spider
    run. If the execution takes longer than the duration defined by the
    JOB_TIMEOUT environment variable (in seconds), the container gets killed.
    """
    cmd_template = ("python cli.py --credentials-path={path} "
                    " --loglevel=debug "
                    " spider "
                    " --job='{job_json}'")

    cmd = cmd_template.format(path=CREDENTIALS_PATH,
                              job_json=json.dumps(job))
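
    # Start the container detached so this worker can poll its status and
    # collect resource stats while it runs; remove=True lets the Docker
    # daemon clean up the container once it exits.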
    container = client.containers.run(image=DOCKER_IMAGE,
                                      command=cmd,
                                      detach=True,
                                      remove=True,
                                      shm_size='2G',
                                      stdout=True,
                                      stderr=True,
                                      tty=False,
                                      volumes=volumes)
    container_id = container.id

    # Data about this spider run, to be written to datastore
    key = datastore_client.key('spider-runs')
    entity = datastore.Entity(key=key)
    results = {
        'datetime': datetime.utcnow(),
        'url': job['url'],
        'success': True,
        'error': '',
        'duration_seconds': 0,
        'cpu_usage_seconds': 0,
        'network_received_bytes': 0,
        'network_transmitted_bytes': 0,
        'memory_max_bytes': 0,
    }

    # Wait for the container to finish.
    start = datetime.utcnow()
    while True:
        time.sleep(1)

        # An empty result means the container has exited and, due to
        # remove=True, has already been removed by the Docker daemon.
        clist = client.containers.list(filters={'id': container_id})
        if len(clist) == 0:
            break

        for c in clist:
            # Collect stats
            try:
                stats = low_level_client.stats(container_id, stream=False)

                # cpu_stats reports total CPU time in nanoseconds.
                cpu_usage = stats['cpu_stats']['cpu_usage']['total_usage'] / 1000000000.0

                # Default to 0 so the values are defined even when the stats
                # payload contains no 'networks' entry.
                network_received_bytes = 0
                network_transmitted_bytes = 0
                if 'networks' in stats:
                    network_received_bytes = stats['networks']['eth0']['rx_bytes']
                    network_transmitted_bytes = stats['networks']['eth0']['tx_bytes']

                memory_max_bytes = 0
                if 'max_usage' in stats['memory_stats']:
                    memory_max_bytes = stats['memory_stats']['max_usage']
                results['memory_max_bytes'] = memory_max_bytes

                #logger.debug("Stats: CPU time %d Sec, RX %d KB, Mem %d MB" % (cpu_usage, network_received_bytes/1000, memory_max_bytes/1000000))

                if cpu_usage > 0:
                    results['cpu_usage_seconds'] = round(cpu_usage)
                if network_received_bytes > 0:
                    results['network_received_bytes'] = network_received_bytes
                if network_transmitted_bytes > 0:
                    results['network_transmitted_bytes'] = network_transmitted_bytes
            except docker.errors.APIError as e:
                logger.error("Could not get stats: %s" % e)
            except json.decoder.JSONDecodeError:
                # This means we didn't get proper stats
                pass

            runtime = (datetime.utcnow() - start).seconds
            results['duration_seconds'] = round(runtime)

            #if c.status != "running":
            #    logger.info("Container %s status: %s" % (c.id, c.status))

            if c.status == "exited":
                logger.debug("Container %s has exited." % c.id)
                break

            if runtime > TIMEOUT:
                c.kill()
                results['success'] = False
                results['error'] = 'TIMEOUT'
                entity.update(results)
                datastore_client.put(entity)
                raise Exception("Execution took too long. Killed container after %s seconds." % TIMEOUT)

    entity.update(results)
    datastore_client.put(entity)
    return results
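

# A hypothetical sketch of how the manager side might enqueue work for this
# worker via RQ. Queue name, Redis host and the exact job payload are
# assumptions; only the 'url' field is actually read by run() above.
#
#   import redis
#   from rq import Queue
#
#   queue = Queue('low', connection=redis.Redis(host='redis'))
#   queue.enqueue('job.run', {'url': 'https://example.de/'})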