From c59db691a003d61f0c3b33723efb2e7a823d17e2 Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Mon, 4 Mar 2024 17:18:37 +0100
Subject: [PATCH] =?UTF-8?q?Reparatur=20und=20Aufr=C3=A4umen=20an=20der=20j?=
 =?UTF-8?q?ob=20execution=20(#340)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update jq URL
* Improve docker compose setup
* Script makeover: only one spider job, debian 11, add git clone
* Update image name
* Add some docs
* Pin click to v7 due to problems with rq
* Newline
* Improve manager code
* Add make target venv
* Remove obsolete 'spider' command from cli
* Remove git clone from manager code
* Remove worker functions from spider code
* Let 'make jobs' execute git clone and use docker compose
* Add 'spider' make target
* Update .dockerignore
* Add dryrun target to spider a URL without storing results
* Remove unused config entry
---
 .dockerignore             |   8 +--
 Makefile                  |  33 +++++----
 README.md                 |  38 ++++------
 cli.py                    |  30 ++++----
 config/__init__.py        |   3 -
 devops/run-job.sh         | 148 ++++++++++++++++++--------------------
 docker-compose.yaml       |  30 ++++----
 export/__init__.py        |  14 +---
 export/datetimeencoder.py |  13 ++++
 manager/__init__.py       |  33 ++++-----
 requirements.txt          |   2 +-
 spider/spider.py          |  45 ------------
 12 files changed, 165 insertions(+), 232 deletions(-)
 create mode 100644 export/datetimeencoder.py

diff --git a/.dockerignore b/.dockerignore
index 3d3c549..338f9ab 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,7 +1,5 @@
.git
-docs
/screenshots
-secrets
-temp
-venv
-/export-*
+/secrets
+/venv
+/cache
diff --git a/Makefile b/Makefile
index 4898cf9..ff35ff9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,10 @@
-IMAGE := ghcr.io/netzbegruenung/green-spider:main
+IMAGE := ghcr.io/netzbegruenung/green-spider:latest

DB_ENTITY := spider-results

VERSION = $(shell git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD)

-.PHONY: dockerimage spider export
+.PHONY: dockerimage spider export dryrun test

# Build docker image
dockerimage: VERSION
@@ -12,16 +12,14 @@ dockerimage: VERSION

# Fill the queue with spider jobs, one for each site.
jobs:
-	docker run --rm -ti \
-	  -v $(PWD)/secrets:/secrets \
-	  $(IMAGE) \
-	  python cli.py \
-	  --credentials-path /secrets/datastore-writer.json \
-	  --loglevel debug \
-	  manager
+	mkdir -p cache
+	test -d cache/green-directory || git clone --depth 1 https://git.verdigado.com/NB-Public/green-directory.git cache/green-directory
+	git -C cache/green-directory fetch && git -C cache/green-directory pull
+	docker compose up manager
+	venv/bin/rq info

-# Run spider in docker image
-spider:
+# Spider a single URL and inspect the result
+dryrun:
	docker run --rm -ti \
	  -v $(PWD)/volumes/dev-shm:/dev/shm \
	  -v $(PWD)/secrets:/secrets \
@@ -31,7 +29,12 @@ spider:
	  python3 cli.py \
	  --credentials-path /secrets/datastore-writer.json \
	  --loglevel debug \
-	  spider --kind $(DB_ENTITY) ${ARGS}
+	  dryrun ${ARGS}
+
+# Run the spider.
+# OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES is a workaround for macOS.
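+# Note: this assumes the local virtualenv created by 'make venv' and a redis
+# instance reachable on localhost:6379, e.g. the one started via 'make jobs'
+# (docker compose). With --burst the worker drains the 'high', 'default' and
+# 'low' queues once and then exits.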
+spider:
+	OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq worker --verbose --burst high default low

export:
	docker run --rm -ti \
@@ -53,5 +56,11 @@ test:
	  $(IMAGE) \
	  python3 -m unittest discover -p '*_test.py' -v

+# Create Python virtual environment
+venv:
+	python3 -m venv venv
+	venv/bin/pip install --upgrade pip
+	venv/bin/pip install -r requirements.txt
+
VERSION:
	@echo $(VERSION) > VERSION
diff --git a/README.md b/README.md
index 3287d0d..e1dc7fa 100644
--- a/README.md
+++ b/README.md
@@ -25,46 +25,36 @@ Alle Informationen zum Betrieb befinden sich im Verzeichnis [devops](https://git

## Entwicklung

-Green Spider ist in Python 3 geschrieben und wird aktuell unter 3.6 getestet und ausgeführt.
+Green Spider ist in Python geschrieben. Der Code ist darauf ausgelegt, in einem Docker Container ausführbar zu sein. Darüber hinaus _kann_ er möglicherweise in einer lokalen Python-Umgebung funktionieren. Für reproduzierbare Bedingungen beim Ausführen des headless Browsers (chromium, chromedriver) empfiehlt es sich jedoch, in einer Container-Umgebung zu testen.

-Aufgrund zahlreicher Abhängigkeiten empfiehlt es sich, den Spider Code lokal in Docker
-auszuführen.
+Das aktuellste Container Image steht unter `ghcr.io/netzbegruenung/green-spider:latest` zur Verfügung. Alternative Versionen und Tags sind unter [Packages](https://github.com/netzbegruenung/green-spider/pkgs/container/green-spider) auffindbar.

-Das Image wird über den folgenden Befehl erzeugt:
+Lokal kann das Image mit diesem Befehl gebaut werden:

```nohighlight
-make
+make dockerimage
```

-Das dauert beim ersten Ausführen einige Zeit, wiel einige Python-Module das Kompilieren diverser Libraries erfordern.
-Nach dem ersten erfolgreichen Durchlauf dauert ein neuer Aufruf von `make` nur noch wenige Sekunden.
+### Unittests ausführen

-### Tests ausführen
-
-In aller Kürze: `make test`
+Nach dem Bauen des Container Image (siehe oben) werden die Unit Tests im Container über `make test` ausgeführt.

### Spider testweise ausführen (Debugging)

Der Spider kann einzelne URLs verarbeiten, ohne die Ergebnisse in eine Datenbank zu schreiben.
-Am einfachsten geht das über den `make spider` Befehl, so:
+Am einfachsten geht das über den `make dryrun` Befehl, so:

```nohighlight
-make spider ARGS="--url http://www.example.com/"
+make dryrun ARGS="http://www.example.com/"
```

-Ohne `ARGS` aufgerufen, arbeitet der Spider eine Jobliste ab. Dies erfordert Zugriff auf die entsprechende Datenbank.
+### Warteschlange und Worker

-Wenn nur eine einzelne Site gespidert werden soll, die Ergebnisse aber in die Datenbank geschrieben werden sollen, kann der Spider so mit `--job` und einem JSON-Object aufgerufen werden (Beispiel):
+Für einen kompletten Durchlauf wird die Warteschlange gefüllt und dann abgearbeitet. Das passiert im Betrieb über das Script [devops/run-job.sh](https://github.com/netzbegruenung/green-spider/blob/main/devops/run-job.sh).
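+
+Der Inhalt der Warteschlange lässt sich jederzeit mit `rq info` einsehen, zum Beispiel so (sofern der Redis-Container aus `docker-compose.yaml` läuft und Port 6379 wie dort konfiguriert lokal gemappt ist):
+
+```nohighlight
+venv/bin/rq info --url redis://localhost:6379/0
+```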
+
+Lokal kann das über die folgenden Befehle getestet werden:

```nohighlight
-docker run --rm -ti \
-  -v $(pwd)/volumes/dev-shm:/dev/shm \
-  -v $(pwd)/secrets:/secrets \
-  -v $(pwd)/screenshots:/screenshots \
-  -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
-  --shm-size=2g \
-  ghcr.io/netzbegruenung/green-spider:latest python3 cli.py \
-  --credentials-path /secrets/datastore-writer.json \
-  --loglevel debug \
-  spider --job '{"url": "https://gruene-porta-westfalica.de/home/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
+make jobs
+make spider
```
diff --git a/cli.py b/cli.py
index 3c4ee6f..ad9329d 100644
--- a/cli.py
+++ b/cli.py
@@ -34,11 +34,9 @@ if __name__ == "__main__":
    # subcommands
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')

-    # spider subcommand
-    spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
-    spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
-    spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
-    spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.')
+    # 'dryrun' subcommand to spider one URL without writing results back.
+    dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results.')
+    dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')

    # manager subcommand
    manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
@@ -69,25 +67,21 @@ if __name__ == "__main__":

    logging.debug("Called command %s", args.command)

    if args.command == 'manager':
-        import manager
        manager.create_jobs(args.url)

    elif args.command == 'export':
-        import export
        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
        export.export_results(datastore_client, args.kind)

-    else:
+    elif args.command == 'dryrun':
        from spider import spider
-        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
-
-        if args.url:
-            # spider one URL for diagnostic purposes
-            spider.test_url(args.url)
-        elif args.job:
-            job = json.loads(args.job)
-            spider.execute_single_job(datastore_client, job, args.kind)
-        else:
-            spider.work_of_queue(datastore_client, args.kind)
+        from export.datetimeencoder import DateTimeEncoder
+
+        result = spider.check_and_rate_site({"url": args.url, "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"})
+        print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))
+
+    else:
+        parser.print_help()
+        sys.exit(1)
diff --git a/config/__init__.py b/config/__init__.py
index 78d772d..c904cdc 100644
--- a/config/__init__.py
+++ b/config/__init__.py
@@ -6,9 +6,6 @@ CONNECT_TIMEOUT = 5

# response timeout for website checks
READ_TIMEOUT = 10

-# Git repo for our data
-GREEN_DIRECTORY_REPO = 'https://git.verdigado.com/NB-Public/green-directory.git'
-
# folder in that repo that holds the data
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
diff --git a/devops/run-job.sh b/devops/run-job.sh
index 8a5ee43..f364506 100755
--- a/devops/run-job.sh
+++ b/devops/run-job.sh
@@ -1,6 +1,6 @@
#!/bin/bash

-# Creates a server, installs Docker, runs a job, tears down the server.
+# Creates a server, installs Docker, clones green-directory, creates jobs, runs spider jobs, tears down the server.
#
# This will take several hours. For a complete, clean run it is required to leave the
# terminal running the script open. Otherwise the server won't be deleted properly
@@ -12,12 +12,13 @@
# Requirements:
#
# - curl
-# - jq (https://stedolan.github.io/jq/)
+# - jq (https://jqlang.github.io/jq/)
# - ssh
# - SSH key referenced in the server details ("ssh_keys")
-# - Service account with write permission for Storage and Datastore in
-#   secrets/datastore-writer.json
-
+# - Credentials:
+#   - Hetzner API token in secrets/hetzner-api-token.sh
+#   - Service account with write permission for Storage and Datastore in secrets/datastore-writer.json
+#   - Git token for read access to https://git.verdigado.com/NB-Public/green-directory.git in secrets/git-clone-token.sh

DOCKERIMAGE="ghcr.io/netzbegruenung/green-spider:latest"

@@ -27,13 +28,11 @@
API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
source $API_TOKEN_SECRET

+GIT_TOKEN_SECRET="secrets/git-clone-token.sh"
+test -f $GIT_TOKEN_SECRET || { echo >&2 "File $GIT_TOKEN_SECRET does not exist."; exit 1; }
+source $GIT_TOKEN_SECRET

-if [[ "$1" = "" ]]; then
-  echo "No argument given. Please use 'spider-new' or 'spider' as arguments."
-  exit 1
-fi
-
-SERVERNAME="$1-$(date | md5 | cut -c1-3)"
+SERVERNAME="spider-$(date | md5 | cut -c1-3)"

# possible values: cx11 (1 core 2 GB), cx21 (2 cores, 4 GB), cx31 (2 cores, 8 GB)
SERVERTYPE="cx21"

@@ -42,21 +41,20 @@ function create_server()
{
  echo "Creating server $SERVERNAME"

-  # server_type 'cx11' is the smallest, cheapest category.
-  # location 'nbg1' is Nürnberg/Nuremberg, Germany.
-  # image 'debian-9' is a plain Debian stretch.
  # ssh_keys ['Marian'] adds Marian's public key to the server and can be extended.
  # user_data: Ensures that we can detect when the cloud-init setup is done.
  #
+  # For the rest: https://docs.hetzner.cloud/#servers-create-a-server
+  #
  CREATE_RESPONSE=$(curl -s -X POST https://api.hetzner.cloud/v1/servers \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $API_TOKEN" \
    -d "{
      \"name\": \"$SERVERNAME\",
      \"server_type\": \"$SERVERTYPE\",
-      \"location\": \"nbg1\",
+      \"location\": \"fsn1\",
      \"start_after_create\": true,
-      \"image\": \"debian-9\",
+      \"image\": \"debian-11\",
      \"ssh_keys\": [
        \"Marian\"
      ],
@@ -100,9 +98,12 @@ function wait_for_server()
create_server $1
wait_for_server

-echo "Executing remote commands..."
+echo "\nExecuting remote commands..."

-ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF
+SSHCMD="ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP"
+SCPCMD="scp -o StrictHostKeyChecking=no -q"
+
+$SSHCMD << EOF

  DEBIAN_FRONTEND=noninteractive
  echo ""
@@ -111,90 +112,83 @@ ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF
  echo ""
  echo "Install dependencies"
-  apt-get install -y curl apt-transport-https gnupg2 software-properties-common
+  apt-get install -y apt-transport-https ca-certificates curl git gnupg2 lsb-release software-properties-common

  echo ""
-  echo "Add docker repo key"
-  curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add -
+  echo "Add Docker key"
+  install -m 0755 -d /etc/apt/keyrings
+  curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc && chmod a+r /etc/apt/keyrings/docker.asc
+
+  # Add the repository to Apt sources
+  echo ""
+  #echo "Get distro name"
+  #. /etc/os-release && echo "$VERSION_CODENAME"
+
+  echo \
+    "deb [arch=amd64 signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \
+    bullseye stable" | \
+    tee /etc/apt/sources.list.d/docker.list > /dev/null

  echo ""
-  echo "Add repo"
-  add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian stretch stable"
+  echo "Resulting /etc/apt/sources.list.d/docker.list"
+  cat /etc/apt/sources.list.d/docker.list

  echo ""
-  echo "Update package sources again"
-  apt-get update -q
+  echo "Install Docker packages"
+  apt-get update
+  apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

  echo ""
-  echo "Install docker"
-  apt-get install -y docker-ce docker-compose
  echo ""
+  echo "Test docker"
+  docker run --rm hello-world

  mkdir /root/secrets
EOF

-if [[ $1 == "spider-new" ]]; then
-  # Some dependencies specific to this task
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP apt-get install -y python3-pip build-essential
+echo "\nCopying files to server"
+$SCPCMD secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
+$SCPCMD docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml
+$SCPCMD job.py root@$SERVER_IP:/root/job.py
+$SCPCMD requirements.txt root@$SERVER_IP:/root/requirements.txt

-  # Upload some files
-  scp -o StrictHostKeyChecking=no -q secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
-  scp -o StrictHostKeyChecking=no -q docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml
-  scp -o StrictHostKeyChecking=no -q requirements.txt root@$SERVER_IP:/root/requirements.txt
-  scp -o StrictHostKeyChecking=no -q job.py root@$SERVER_IP:/root/job.py
+echo "\nInstalling Python dependencies"
+$SSHCMD apt-get install -y python3-pip build-essential
+$SSHCMD pip3 install -r requirements.txt

-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP pip3 install -r requirements.txt
+echo "\nCloning green-directory"
+$SSHCMD git clone --progress --depth 1 https://$GIT_TOKEN@git.verdigado.com/NB-Public/green-directory.git /root/cache/green-directory

-  # Bring up redis for the queue
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose pull redis
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose up -d redis
-  sleep 5
+echo "\nPulling Docker images"
+$SSHCMD docker compose pull --quiet redis manager

-  # Bring up queue manager
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose pull manager
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker-compose up manager
+echo "\nStarting redis in background"
+$SSHCMD docker compose up -d redis
+sleep 5

-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq info --url redis://localhost:6379/0
+echo "\nCreating jobs"
+$SSHCMD docker compose up manager

-  # Start worker and work off the queue once
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq worker --burst high default low --url redis://localhost:6379/0
+echo "\nQueue status:"
+$SSHCMD rq info --url redis://localhost:6379/0

-  # Re-queue failed jobs once, then re-execute.
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq requeue --queue low -u redis://localhost:6379 --all
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq info --url redis://localhost:6379/0
+echo "\nStarting worker (first run)"
+$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0

-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP rq worker --burst high default low --url redis://localhost:6379/0
+echo "\nRe-queuing failed jobs"
+$SSHCMD rq requeue --queue low --all --url redis://localhost:6379

-  echo "Done with queued jobs."
+echo "\nQueue status:"
+$SSHCMD rq info --url redis://localhost:6379/0
+
+echo "\nStarting worker (second run)"
+$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
+
+echo "\nDone."

-else
-  ### spider
-  # Copy service account secret to server
-  echo "Copying secret to /root/secrets/datastore-writer.json"
-  scp -o StrictHostKeyChecking=no -q secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
-
-  # Run docker job
-  echo "Starting Docker Job"
-  #ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
-  #  -v /root/secrets:/secrets \
-  #  ghcr.io/netzbegruenung/green-spider:latest spider.py \
-  #  --credentials-path /secrets/datastore-writer.json \
-  #  jobs
-
-  #ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP mkdir -p /dev-shm
-  ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
-    --shm-size=2g \
-    -v /dev/shm:/dev/shm \
-    -v /root/secrets:/secrets \
-    $DOCKERIMAGE \
-    --credentials-path /secrets/datastore-writer.json \
-    --loglevel info \
-    spider --kind $RESULTS_ENTITY_KIND
-
-fi

# Delete the box
-echo "Deleting server $SERVERNAME with ID $SERVER_ID"
+echo "\nDeleting server $SERVERNAME with ID $SERVER_ID"
curl -s -X DELETE -H "Content-Type: application/json" \
  -H "Authorization: Bearer $API_TOKEN" \
  https://api.hetzner.cloud/v1/servers/$SERVER_ID
diff --git a/docker-compose.yaml b/docker-compose.yaml
index f94dc4f..72f8a00 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -13,13 +13,14 @@ services:
    ports:
      - "6379:6379"

-  # manager manages the job queue.
+  # manager fills the job queue with spider jobs.
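+  # It reads the green-directory data from the local clone (created by
+  # 'make jobs' or devops/run-job.sh) and enqueues one 'job.run' job per
+  # site into the 'low' queue in redis.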
  manager:
    image: ghcr.io/netzbegruenung/green-spider:latest
    command: >
      python3 cli.py
      --credentials-path /secrets/datastore-writer.json
-      --loglevel debug manager
+      --loglevel debug
+      manager
    environment:
      REDIS_URL: redis://redis:6379/0
      GIT_USERNAME: ${GIT_USERNAME}
@@ -33,20 +34,21 @@ services:
    depends_on:
      - redis

-  dashboard:
-    image: eoranged/rq-dashboard:v0.6.1
-    environment:
-      RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
-    networks:
-      - internal_network
-      - external_network
-    ports:
-      - "9181:9181"
-    depends_on:
-      - redis
+  # rq dashboard for debugging/development
+  # dashboard:
+  #   image: eoranged/rq-dashboard:v0.6.1
+  #   environment:
+  #     RQ_DASHBOARD_REDIS_URL: redis://redis:6379/0
+  #   networks:
+  #     - internal_network
+  #     - external_network
+  #   ports:
+  #     - "9181:9181"
+  #   depends_on:
+  #     - redis

networks:
  internal_network:
    internal: true
  external_network:
-    internal: false
\ No newline at end of file
+    internal: false
diff --git a/export/__init__.py b/export/__init__.py
index 639f011..1d37f1b 100644
--- a/export/__init__.py
+++ b/export/__init__.py
@@ -11,17 +11,7 @@ from hashlib import md5
import json

import requests
-
-class DateTimeEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, datetime.datetime):
-            return obj.isoformat()
-        elif isinstance(obj, datetime.date):
-            return obj.isoformat()
-        elif isinstance(obj, datetime.timedelta):
-            return (datetime.datetime.min + obj).time().isoformat()
-        else:
-            return super(DateTimeEncoder, self).default(obj)
+from export import datetimeencoder

def export_results(client, entity_kind):
    """
@@ -45,4 +35,4 @@ def export_results(client, entity_kind):
    output_filename = "/json-export/spider_result.json"

    with open(output_filename, 'w', encoding="utf8") as jsonfile:
-        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder)
+        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False, cls=datetimeencoder.DateTimeEncoder)
diff --git a/export/datetimeencoder.py b/export/datetimeencoder.py
new file mode 100644
index 0000000..2aa2ab5
--- /dev/null
+++ b/export/datetimeencoder.py
@@ -0,0 +1,13 @@
+import json
+import datetime
+
+class DateTimeEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, datetime.datetime):
+            return obj.isoformat()
+        elif isinstance(obj, datetime.date):
+            return obj.isoformat()
+        elif isinstance(obj, datetime.timedelta):
+            return (datetime.datetime.min + obj).time().isoformat()
+        else:
+            return super(DateTimeEncoder, self).default(obj)
diff --git a/manager/__init__.py b/manager/__init__.py
index cf70bfe..f03266c 100644
--- a/manager/__init__.py
+++ b/manager/__init__.py
@@ -6,12 +6,10 @@ import logging
import math
import os
import random
-import shutil
import time
import json

from datetime import datetime
-from git import Repo
from rq import Queue
import redis
import yaml
@@ -20,16 +18,12 @@ from hashlib import sha256

import config

-REDIS_URL = os.environ.get("REDIS_URL", "redis://redis:6379/0")
+# Maximum age for an active spider job
+JOB_TTL = '300s'

-def clone_data_directory():
-    """
-    Clones the source of website URLs, the green directory,
-    into the local file system using git
-    """
-    if os.path.exists(config.GREEN_DIRECTORY_LOCAL_PATH):
-        return
-    Repo.clone_from(config.GREEN_DIRECTORY_REPO, config.GREEN_DIRECTORY_LOCAL_PATH)
+QUEUE_NAME = 'low'
+
+REDIS_URL = os.environ.get("REDIS_URL", "redis://redis:6379/0")


def directory_entries():
@@ -77,11 +71,7 @@ def create_jobs(url=None):
            logging.error(ex)
            time.sleep(5)

-    queue = Queue('low', connection=redis_conn)
-
-    # refresh our local clone of the green directory
-    logging.info("Refreshing green-directory clone")
-    clone_data_directory()
+    queue = Queue(QUEUE_NAME, connection=redis_conn)

    # build the list of website URLs to run checks for
    logging.info("Processing green-directory")
@@ -142,17 +132,15 @@ def create_jobs(url=None):
    for entry in input_entries:
        try:
            _ = queue.enqueue('job.run',
-                job_timeout='300s',
-                at_front=random.choice([True, False]),
+                job_timeout=JOB_TTL,
+                at_front=random.choice([True, False]),  # queue shuffling
                # keywords args passes on the job function
                kwargs={
                    'job': entry,
                })

            # Print job for debugging purposes
-            print(json.dumps(entry))
-
-            #logging.debug("Added job with ID %s for URL %s" % (enqueued_job.id, entry['url']))
+            logging.debug(f"Created job: {json.dumps(entry)}")

            count += 1
        except Exception as e:
            errorcount += 1
@@ -168,6 +156,9 @@ def create_jobs(url=None):


def make_k8s_job(job_data, count):
+    """
+    Generate a Kubernetes Job resource for this spider job.
+    """
    now = datetime.utcnow().strftime('%Y%m%d%H%M')
    urlhash = sha256(job_data['url'].encode('utf-8')).hexdigest()[0:12]
    job_name = f'gs-{now}-{urlhash}'
diff --git a/requirements.txt b/requirements.txt
index aeac976..b6ddbe8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ cachetools==5.3.3
certifi==2023.7.22
cffi==1.15.1
chardet==5.2.0
-click==8.0.3
+click>=7,<8
cssselect==1.2.0
dnspython==2.6.1
docker==4.4.1
diff --git a/spider/spider.py b/spider/spider.py
index c2912d0..9850a32 100644
--- a/spider/spider.py
+++ b/spider/spider.py
@@ -88,51 +88,6 @@ def test_url(url):

    pprint(result)

-def execute_single_job(datastore_client, job, entity_kind):
-    """
-    Executes spider for one single job
-    """
-    validate_job(job)
-
-    logging.info("Starting job %s", job["url"])
-    result = check_and_rate_site(entry=job)
-
-    logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
-
-    logging.info("Job %s finished checks", job["url"])
-    logging.info("Job %s writing to DB", job["url"])
-
-    key = datastore_client.key(entity_kind, job["url"])
-    entity = datastore.Entity(key=key)
-    record = {
-        'created': datetime.utcnow(),
-        'meta': result['meta'],
-        'checks': result['checks'],
-        'rating': result['rating'],
-        'score': result['score'],
-    }
-
-    entity.update(record)
-    try:
-        datastore_client.put(entity)
-        logging.debug("Successfully wrote record to database")
-    except InvalidArgument as ex:
-        logging.error("Could not write result: %s", ex)
-    except Exception as ex:
-        logging.error("Could not write result: %s", ex)
-
-def work_of_queue(datastore_client, entity_kind):
-    """
-    Take job from queue and finish it until there are no more jobs
-    """
-    while True:
-        job = manager.get_job_from_queue(datastore_client)
-        if job is None:
-            logging.info("No more jobs. Exiting.")
-            break
-
-        execute_single_job(datastore_client, job, entity_kind)
-
def validate_job(jobdict):
    if "url" not in jobdict:
        raise Exception("Job does not have required 'url' attribute")