From 0c0bcbf54e3aa32033b9088249d3f2babf9d0077 Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Thu, 7 Mar 2024 11:31:16 +0100
Subject: [PATCH] Several fixes and improvements (#343)

* Use UTC for feed item age calculation
* Improvements in run-job.sh script
* Prevent output buffering in job creation
* Remove unused environment variable references
* Print more detailed results count
* Bring back function to execute a single spider job
* Fix 'make spider' command
* Upgrade docker to 5.0.3
---
 Makefile                  |  2 +-
 checks/load_feeds.py      |  5 ++--
 checks/load_feeds_test.py |  5 ++--
 cli.py                    | 10 +++++++
 devops/deploy-webapp.sh   |  2 +-
 devops/run-job.sh         | 61 ++++++++++++++++++++++++---------------
 docker-compose.yaml       |  4 +--
 job.py                    |  2 +-
 manager/__init__.py       | 12 ++++----
 requirements.txt          |  2 +-
 spider/spider.py          | 32 ++++++++++++++++++++
 11 files changed, 97 insertions(+), 40 deletions(-)

diff --git a/Makefile b/Makefile
index ff35ff9..03632c1 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ dryrun:
 # Run the spider.
 # OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES is a workaround for mac OS.
 spider:
-	OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq --verbose --burst high default low
+	OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq worker --burst high default low
 
 export:
 	docker run --rm -ti \
diff --git a/checks/load_feeds.py b/checks/load_feeds.py
index 6d3a4e0..40c6f53 100644
--- a/checks/load_feeds.py
+++ b/checks/load_feeds.py
@@ -5,6 +5,7 @@ Loads feeds linked from pages and collects information on the contained content
 import logging
 from time import mktime
 from datetime import datetime
+from datetime import timezone
 
 import feedparser
 
@@ -102,7 +103,7 @@ class Checker(AbstractChecker):
                 max_date = timestamp
 
         if max_date is not None:
-            return datetime.fromtimestamp(max_date)
+            return datetime.fromtimestamp(max_date).replace(tzinfo=timezone.utc)
 
 
     def find_first_entry(self, entries):
@@ -117,4 +118,4 @@ class Checker(AbstractChecker):
                 min_date = timestamp
 
         if min_date is not None:
-            return datetime.fromtimestamp(min_date)
+            return datetime.fromtimestamp(min_date).replace(tzinfo=timezone.utc)
diff --git a/checks/load_feeds_test.py b/checks/load_feeds_test.py
index 3850aac..aea7bad 100644
--- a/checks/load_feeds_test.py
+++ b/checks/load_feeds_test.py
@@ -6,6 +6,7 @@ from checks import html_head, page_content
 from checks import load_feeds
 from checks.config import Config
 from datetime import datetime
+from datetime import timezone
 
 from pprint import pprint
 
@@ -63,8 +64,8 @@ class TestFeed(unittest.TestCase):
         self.assertEqual(result['http://example.com/feed.xml'], {
             'exception': None,
             'average_interval': 340359,
-            'first_entry': datetime(2003, 5, 30, 11, 6, 42),
-            'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
+            'first_entry': datetime(2003, 5, 30, 11, 6, 42, tzinfo=timezone.utc),
+            'latest_entry': datetime(2003, 6, 3, 9, 39, 21, tzinfo=timezone.utc),
             'num_entries': 2,
             'title': 'Liftoff News',
         })
diff --git a/cli.py b/cli.py
index ad9329d..b55ccb3 100644
--- a/cli.py
+++ b/cli.py
@@ -34,6 +34,10 @@ if __name__ == "__main__":
     # subcommands
     subparsers = parser.add_subparsers(help='sub-command help', dest='command')
 
+    # 'spider' subcommand to execute a job from the queue and store the result.
+    spider_parser = subparsers.add_parser('spider', help='Execute a spider job from the queue and store the result.')
+    spider_parser.add_argument('--job', help='JSON job data')
+
     # 'dryrun' subcommand to spider one URL without writing results.
     dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results.')
     dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
@@ -82,6 +86,12 @@ if __name__ == "__main__":
         result = spider.check_and_rate_site({"url": args.url, "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"})
         print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))
 
+    elif args.command == 'spider':
+        from spider import spider
+        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
+        job = json.loads(args.job)
+        spider.execute_single_job(datastore_client, job, "spider-results")
+
     else:
         parser.print_help()
         sys.exit(1)
diff --git a/devops/deploy-webapp.sh b/devops/deploy-webapp.sh
index 2e432bc..7ddb409 100755
--- a/devops/deploy-webapp.sh
+++ b/devops/deploy-webapp.sh
@@ -135,7 +135,7 @@ wait_for_server
 echo "Executing remote commands..."
 ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF
 
-  DEBIAN_FRONTEND=noninteractive
+  export DEBIAN_FRONTEND=noninteractive
 
   echo ""
   echo "Update package sources"
diff --git a/devops/run-job.sh b/devops/run-job.sh
index f364506..8462fa0 100755
--- a/devops/run-job.sh
+++ b/devops/run-job.sh
@@ -98,21 +98,22 @@ function wait_for_server()
 create_server $1
 wait_for_server
 
-echo "\nExecuting remote commands..."
+echo ""
+echo "Executing remote commands..."
 
 SSHCMD="ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP"
 SCPCMD="scp -o StrictHostKeyChecking=no -q"
 
 $SSHCMD << EOF
-  DEBIAN_FRONTEND=noninteractive
-  
+  export DEBIAN_FRONTEND=noninteractive
+
   echo ""
   echo "Update package sources"
   apt-get update -q
 
   echo ""
   echo "Install dependencies"
-  apt-get install -y apt-transport-https ca-certificates curl git gnupg2 lsb-release software-properties-common
+  apt-get install -y apt-transport-https git gnupg2 software-properties-common
 
   echo ""
   echo "Add Docker key"
@@ -140,55 +141,69 @@ $SSHCMD << EOF
 
   echo ""
   echo "Test docker"
+  docker pull hello-world
   docker run --rm hello-world
 
   mkdir /root/secrets
 EOF
 
-echo "\nCopying files to server"
+echo ""
+echo "Copying files to server"
 $SCPCMD secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
 $SCPCMD docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml
 $SCPCMD job.py root@$SERVER_IP:/root/job.py
 $SCPCMD requirements.txt root@$SERVER_IP:/root/requirements.txt
 
-echo "\nInstalling Python dependencies"
-$SSHCMD apt-get install -y python3-pip build-essential
+echo ""
+echo "Installing Python dependencies"
+$SSHCMD DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip build-essential
 $SSHCMD pip3 install -r requirements.txt
 
-echo "\nCloning green-directory"
+echo ""
+echo "Cloning green-directory"
 $SSHCMD git clone --progress --depth 1 https://$GIT_TOKEN@git.verdigado.com/NB-Public/green-directory.git /root/cache/green-directory
 
-echo "\nPulling Docker images"
+echo ""
+echo "Pulling container images"
 $SSHCMD docker compose pull --quiet redis manager
 
-echo "\nStarting redis in background"
-$SSHCMD docker compose up -d redis
+echo ""
+echo "Starting redis in background"
+$SSHCMD docker compose up --detach redis
 
 sleep 5
 
-echo "\nCreating jobs"
+echo ""
+echo "Creating jobs"
 $SSHCMD docker compose up manager
 
-echo "\nQueue status:"
-$SSHCMD rq info --url redis://localhost:6379/0
+echo ""
+echo "Queue status"
+$SSHCMD rq info --url redis://localhost:6379
 
-echo "\nStarting worker (first run)"
-$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
+echo ""
+echo "Starting worker for first run"
+$SSHCMD rq worker --burst high default low --url redis://localhost:6379
 
-echo "\nRe-queuing failed jobs"
+echo ""
+echo "Re-queuing failed jobs"
 $SSHCMD rq requeue --queue low --all --url redis://localhost:6379
 
-echo "\nQueue status:"
-$SSHCMD rq info --url redis://localhost:6379/0
+echo ""
+echo "Queue status:"
+$SSHCMD rq info --url redis://localhost:6379
 
-echo "\nStarting worker (second run)"
-$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
+echo ""
+echo "Starting worker for second run"
+$SSHCMD JOB_TIMEOUT=100 rq worker --burst high default low --url redis://localhost:6379
 
-echo "\nDone."
+echo ""
+echo "Done."
 
 # Delete the box
-echo "\nDeleting server $SERVERNAME with ID $SERVER_ID"
+echo ""
+echo "Deleting server $SERVERNAME with ID $SERVER_ID"
 curl -s -X DELETE -H "Content-Type: application/json" \
   -H "Authorization: Bearer $API_TOKEN" \
   https://api.hetzner.cloud/v1/servers/$SERVER_ID
 
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 72f8a00..2a23c7d 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -17,14 +17,12 @@ services:
   manager:
     image: ghcr.io/netzbegruenung/green-spider:latest
     command: >
-      python3 cli.py
+      python3 -u cli.py
      --credentials-path /secrets/datastore-writer.json
       --loglevel debug
       manager
     environment:
       REDIS_URL: redis://redis:6379/0
-      GIT_USERNAME: ${GIT_USERNAME}
-      GIT_PASSWORD: ${GIT_PASSWORD}
     volumes:
       - ${PWD}/secrets:/secrets
       - ${PWD}/cache/green-directory:/workdir/cache/green-directory
diff --git a/job.py b/job.py
index 3f5bd11..02df9db 100644
--- a/job.py
+++ b/job.py
@@ -12,7 +12,7 @@ import logging
 import docker
 from google.cloud import datastore
 
-# Maximum oper-job runtime in seconds. This can be increased for second, third attempt
+# Maximum per-job runtime in seconds. This can be increased for second, third attempt
 # via the environment JOB_TIMEOUT variable.
 TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
 
diff --git a/manager/__init__.py b/manager/__init__.py
index f03266c..e02d9df 100644
--- a/manager/__init__.py
+++ b/manager/__init__.py
@@ -126,10 +126,11 @@ def create_jobs(url=None):
 
     count = 0
     errorcount = 0
+    jobscount = 0
     logging.info("Writing jobs")
 
-    count = 0
     for entry in input_entries:
+        count += 1
         try:
             _ = queue.enqueue('job.run',
                 job_timeout=JOB_TTL,
@@ -141,7 +142,7 @@ def create_jobs(url=None):
 
             # Print job for debugging purposes
             logging.debug(f"Created job: {json.dumps(entry)}")
-            count += 1
+            jobscount += 1
         except Exception as e:
             errorcount += 1
             logging.error("Error adding job for URL %s: %s" % (entry['url'], e))
@@ -149,10 +150,9 @@
 
         # Write kubernetes Job
         make_k8s_job(entry, count)
-        count += 1
-
-    logging.info("Writing jobs done, %s jobs added", count)
-    logging.info("%d errors while writing jobs", errorcount)
+    logging.info("Processed %s entries", count)
+    logging.info("Created %s jobs", jobscount)
+    logging.info("%d errors", errorcount)
 
 
 def make_k8s_job(job_data, count):
diff --git a/requirements.txt b/requirements.txt
index b6ddbe8..7b38a76 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ chardet==5.2.0
 click>=7,<8
 cssselect==1.2.0
 dnspython==2.6.1
-docker==4.4.1
+docker==5.0.3
 feedparser==6.0.8
 gitdb==4.0.9
 GitPython==3.1.41
diff --git a/spider/spider.py b/spider/spider.py
index 9850a32..3136210 100644
--- a/spider/spider.py
+++ b/spider/spider.py
@@ -72,6 +72,38 @@ def check_and_rate_site(entry):
 
     return result
 
+def execute_single_job(datastore_client, job, entity_kind):
+    """
+    Executes spider for one single job
+    """
+    validate_job(job)
+
+    logging.info("Starting job %s", job["url"])
+    result = check_and_rate_site(entry=job)
+
+    logging.info("Job %s finished checks", job["url"])
+    logging.info("Job %s writing to DB", job["url"])
+
+    key = datastore_client.key(entity_kind, job["url"])
+    entity = datastore.Entity(key=key)
+    record = {
+        'created': datetime.utcnow(),
+        'meta': result['meta'],
+        'checks': result['checks'],
+        'rating': result['rating'],
+        'score': result['score'],
+    }
+
+    entity.update(record)
+    try:
+        datastore_client.put(entity)
+        logging.debug("Successfully wrote record to database")
+    except InvalidArgument as ex:
+        logging.error("Could not write result: %s", ex)
+    except Exception as ex:
+        logging.error("Could not write result: %s", ex)
+
+
 def test_url(url):
     """
     Run the spider for a single URL and print the result.
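
Note on exercising the restored single-job path: the new 'spider' subcommand in
cli.py can be invoked directly. The command below is a sketch, not part of the
patch; the credentials path matches the file mounted in docker-compose.yaml, and
the job fields mirror those the dryrun branch passes to check_and_rate_site. The
exact payload required by validate_job is not visible in this diff.

    # Sketch (assumed paths and job fields):
    python3 cli.py \
        --credentials-path secrets/datastore-writer.json \
        spider \
        --job '{"url": "https://example.com/", "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"}'

The result is written to the 'spider-results' datastore kind, as hard-coded in
the cli.py hunk above.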