Multiple fixes and improvements (#343)

* Use UTC for feed item age calculation

* Improve the run-job.sh script

* Prevent output buffering in job creation

* Remove unused environment variable references

* Print more detailed results count

* Bring back function to execute a single spider job

* Fix 'make spider' command

* Upgrade docker to 5.0.3
Marian Steinbach 2024-03-07 11:31:16 +01:00 committed by GitHub
parent c59db691a0
commit 0c0bcbf54e
11 changed files with 97 additions and 40 deletions


@@ -34,7 +34,7 @@ dryrun:
# Run the spider.
# OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES is a workaround for mac OS.
spider:
OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq --verbose --burst high default low
OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq worker --burst high default low
export:
docker run --rm -ti \
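
The previous recipe was missing rq's `worker` subcommand; the fixed `spider` target now runs a worker in burst mode over the high, default and low queues. For reference, a rough Python-API counterpart of `rq worker --burst high default low` (illustration only, not part of this commit):

```python
# Rough Python-API counterpart of `rq worker --burst high default low`
# (illustration only; the Makefile keeps using the rq CLI):
from redis import Redis
from rq import Worker

worker = Worker(["high", "default", "low"], connection=Redis())
worker.work(burst=True)  # drain the queues once, then exit
```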


@@ -5,6 +5,7 @@ Loads feeds linked from pages and collects information on the contained content
import logging
from time import mktime
from datetime import datetime
from datetime import timezone
import feedparser
@@ -102,7 +103,7 @@ class Checker(AbstractChecker):
max_date = timestamp
if max_date is not None:
return datetime.fromtimestamp(max_date)
return datetime.fromtimestamp(max_date).replace(tzinfo=timezone.utc)
def find_first_entry(self, entries):
@@ -117,4 +118,4 @@ class Checker(AbstractChecker):
min_date = timestamp
if min_date is not None:
return datetime.fromtimestamp(min_date)
return datetime.fromtimestamp(min_date).replace(tzinfo=timezone.utc)
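
Both return values are now timezone-aware: `.replace(tzinfo=timezone.utc)` attaches the UTC zone to the datetime without shifting its wall-clock value, so feed item ages are computed against UTC-aware timestamps. A minimal sketch of that behaviour (not part of the diff):

```python
from datetime import datetime, timezone

naive = datetime(2003, 6, 3, 9, 39, 21)       # no tzinfo attached
aware = naive.replace(tzinfo=timezone.utc)    # same wall clock, now labelled UTC

print(naive.tzinfo)                           # None
print(aware.tzinfo)                           # UTC
print(datetime.now(timezone.utc) - aware)     # aware datetimes can be compared and subtracted
```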


@@ -6,6 +6,7 @@ from checks import html_head, page_content
from checks import load_feeds
from checks.config import Config
from datetime import datetime
from datetime import timezone
from pprint import pprint
@@ -63,8 +64,8 @@ class TestFeed(unittest.TestCase):
self.assertEqual(result['http://example.com/feed.xml'], {
'exception': None,
'average_interval': 340359,
'first_entry': datetime(2003, 5, 30, 11, 6, 42),
'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
'first_entry': datetime(2003, 5, 30, 11, 6, 42, tzinfo=timezone.utc),
'latest_entry': datetime(2003, 6, 3, 9, 39, 21, tzinfo=timezone.utc),
'num_entries': 2,
'title': 'Liftoff News',
})

cli.py

@@ -34,6 +34,10 @@ if __name__ == "__main__":
# subcommands
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
# 'spider' subcommand to execute a job from the queue and store the result.
spider_parser = subparsers.add_parser('spider', help='Execute a spider job from the queue and store the result.')
spider_parser.add_argument('--job', help='JSON job data')
# 'dryrun' subcommand to spider one URL without writing results back.
dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results. ')
dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
@@ -82,6 +86,12 @@ if __name__ == "__main__":
result = spider.check_and_rate_site({"url": args.url, "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"})
print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))
elif args.command == 'spider':
from spider import spider
datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
job = json.loads(args.job)
spider.execute_single_job(datastore_client, job, "spider-results")
else:
parser.print_help()
sys.exit(1)
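
With the restored subcommand, a single job can be spidered from the command line by passing the job as JSON. A hypothetical invocation (the payload fields mirror the dryrun example above; the credentials path is an assumption):

```python
# Hypothetical invocation of the restored 'spider' subcommand (example payload and paths):
import json
import subprocess

job = {"url": "https://example.com/", "type": "REGIONAL_CHAPTER",
       "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"}

subprocess.run(
    ["python3", "cli.py",
     "--credentials-path", "secrets/datastore-writer.json",
     "spider", "--job", json.dumps(job)],
    check=True,
)
```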


@@ -135,7 +135,7 @@ wait_for_server
echo "Executing remote commands..."
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF
DEBIAN_FRONTEND=noninteractive
export DEBIAN_FRONTEND=noninteractive
echo ""
echo "Update package sources"


@@ -98,21 +98,22 @@ function wait_for_server()
create_server $1
wait_for_server
echo "\nExecuting remote commands..."
echo ""
echo "Executing remote commands..."
SSHCMD="ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP"
SCPCMD="scp -o StrictHostKeyChecking=no -q"
$SSHCMD << EOF
DEBIAN_FRONTEND=noninteractive
export DEBIAN_FRONTEND=noninteractive
echo ""
echo "Update package sources"
apt-get update -q
echo ""
echo "Install dependencies"
apt-get install -y apt-transport-https ca-certificates curl git gnupg2 lsb-release software-properties-common
apt-get install -y apt-transport-https git gnupg2 software-properties-common
echo ""
echo "Add Docker key"
@@ -140,55 +141,69 @@ $SSHCMD << EOF
echo ""
echo "Test docker"
docker pull hello-world
docker run --rm hello-world
mkdir /root/secrets
EOF
echo "\nCopying files to server"
echo ""
echo "Copying files to server"
$SCPCMD secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
$SCPCMD docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml
$SCPCMD job.py root@$SERVER_IP:/root/job.py
$SCPCMD requirements.txt root@$SERVER_IP:/root/requirements.txt
echo "\nInstalling Python dependencies"
$SSHCMD apt-get install -y python3-pip build-essential
echo ""
echo "Installing Python dependencies"
$SSHCMD DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip build-essential
$SSHCMD pip3 install -r requirements.txt
echo "\nCloning green-directory"
echo ""
echo "Cloning green-directory"
$SSHCMD git clone --progress --depth 1 https://$GIT_TOKEN@git.verdigado.com/NB-Public/green-directory.git /root/cache/green-directory
echo "\nPulling Docker images"
echo ""
echo "Pulling container images"
$SSHCMD docker compose pull --quiet redis manager
echo "\nStarting redis in background"
$SSHCMD docker compose up -d redis
echo ""
echo "Starting redis in background"
$SSHCMD docker compose up --detach redis
sleep 5
echo "\nCreating jobs"
echo ""
echo "Creating jobs"
$SSHCMD docker compose up manager
echo "\nQueue status:"
$SSHCMD rq info --url redis://localhost:6379/0
echo ""
echo "Queue status"
$SSHCMD rq info --url redis://localhost:6379
echo "\nStarting worker (first run)"
$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
echo ""
echo "Starting worker for first run"
$SSHCMD rq worker --burst high default low --url redis://localhost:6379
echo "\nRe-queuing failed jobs"
echo ""
echo "Re-queuing failed jobs"
$SSHCMD rq requeue --queue low --all --url redis://localhost:6379
echo "\nQueue status:"
$SSHCMD rq info --url redis://localhost:6379/0
echo ""
echo "Queue status:"
$SSHCMD rq info --url redis://localhost:6379
echo "\nStarting worker (second run)"
$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
echo ""
echo "Starting worker for second run"
$SSHCMD JOB_TIMEOUT=100 rq worker --burst high default low --url redis://localhost:6379
echo "\nDone."
echo ""
echo "Done."
# Delete the box
echo "\nDeleting server $SERVERNAME with ID $SERVER_ID"
echo ""
echo "Deleting server $SERVERNAME with ID $SERVER_ID"
curl -s -X DELETE -H "Content-Type: application/json" \
-H "Authorization: Bearer $API_TOKEN" \
https://api.hetzner.cloud/v1/servers/$SERVER_ID
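
The re-queuing step above uses the rq CLI on the remote host. A rough Python-API counterpart of `rq requeue --queue low --all` (assumed equivalent; the script itself sticks to the CLI):

```python
# Rough Python-API counterpart of `rq requeue --queue low --all`
# (assumed equivalent; run-job.sh uses the rq CLI directly):
from redis import Redis
from rq import Queue
from rq.registry import FailedJobRegistry

queue = Queue("low", connection=Redis.from_url("redis://localhost:6379"))
registry = FailedJobRegistry(queue=queue)

for job_id in registry.get_job_ids():
    registry.requeue(job_id)  # move each failed job back onto the 'low' queue
```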


@@ -17,14 +17,12 @@ services:
manager:
image: ghcr.io/netzbegruenung/green-spider:latest
command: >
python3 cli.py
python3 -u cli.py
--credentials-path /secrets/datastore-writer.json
--loglevel debug
manager
environment:
REDIS_URL: redis://redis:6379/0
GIT_USERNAME: ${GIT_USERNAME}
GIT_PASSWORD: ${GIT_PASSWORD}
volumes:
- ${PWD}/secrets:/secrets
- ${PWD}/cache/green-directory:/workdir/cache/green-directory
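
Running the manager with `python3 -u` disables stdout/stderr buffering, so progress output appears in `docker compose` logs as it happens rather than when the buffer fills. Setting `PYTHONUNBUFFERED=1` or flushing explicitly would have the same effect; a minimal sketch (not part of the change):

```python
# Alternatives to `python3 -u` with the same effect (sketch only):
import sys

print("created job for https://example.com/", flush=True)  # flush this line immediately
sys.stdout.flush()                                          # or flush the stream after a batch of writes
```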

job.py

@@ -12,7 +12,7 @@ import logging
import docker
from google.cloud import datastore
# Maximum oper-job runtime in seconds. This can be increased for second, third attempt
# Maximum per-job runtime in seconds. This can be increased for second, third attempt
# via the environment JOB_TIMEOUT variable.
TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))


@@ -126,10 +126,11 @@ def create_jobs(url=None):
count = 0
errorcount = 0
jobscount = 0
logging.info("Writing jobs")
count = 0
for entry in input_entries:
count += 1
try:
_ = queue.enqueue('job.run',
job_timeout=JOB_TTL,
@@ -141,7 +142,7 @@ def create_jobs(url=None):
# Print job for debugging purposes
logging.debug(f"Created job: {json.dumps(entry)}")
count += 1
jobscount += 1
except Exception as e:
errorcount += 1
logging.error("Error adding job for URL %s: %s" % (entry['url'], e))
@@ -149,10 +150,9 @@ def create_jobs(url=None):
# Write kubernetes Job
make_k8s_job(entry, count)
count += 1
logging.info("Writing jobs done, %s jobs added", count)
logging.info("%d errors while writing jobs", errorcount)
logging.info("Processed %s entries", count)
logging.info("Created %s jobs", jobscount)
logging.info("%d errors", errorcount)
def make_k8s_job(job_data, count):


@@ -6,7 +6,7 @@ chardet==5.2.0
click>=7,<8
cssselect==1.2.0
dnspython==2.6.1
docker==4.4.1
docker==5.0.3
feedparser==6.0.8
gitdb==4.0.9
GitPython==3.1.41
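
The pinned package is the Docker SDK for Python, which job.py imports (see the hunk above); 5.0.3 keeps the same high-level client API as the old 4.4.1 pin. A generic usage sketch (illustration only, not taken from job.py):

```python
# Generic Docker SDK usage sketch (illustration only, not from job.py):
import docker

client = docker.from_env()                                # connect via the local Docker daemon
output = client.containers.run("hello-world", remove=True)
print(output.decode())
```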


@@ -72,6 +72,38 @@ def check_and_rate_site(entry):
return result
def execute_single_job(datastore_client, job, entity_kind):
"""
Executes spider for one single job
"""
validate_job(job)
logging.info("Starting job %s", job["url"])
result = check_and_rate_site(entry=job)
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])
key = datastore_client.key(entity_kind, job["url"])
entity = datastore.Entity(key=key)
record = {
'created': datetime.utcnow(),
'meta': result['meta'],
'checks': result['checks'],
'rating': result['rating'],
'score': result['score'],
}
entity.update(record)
try:
datastore_client.put(entity)
logging.debug("Successfully wrote record to database")
except InvalidArgument as ex:
logging.error("Could not write result: %s", ex)
except Exception as ex:
logging.error("Could not write result: %s", ex)
def test_url(url):
"""
Run the spider for a single URL and print the result.