Mirror of https://github.com/netzbegruenung/green-spider.git
Multiple fixes and improvements (#343)
* Use UTC for feed item age calculation
* Improvements in run-job.sh script
* Prevent output buffering in job creation
* Remove unused environment variable references
* Print more detailed results count
* Bring back function to execute a single spider job
* Fix 'make spider' command
* Upgrade docker to 5.0.3
parent c59db691a0
commit 0c0bcbf54e
Makefile (2 changed lines)
@@ -34,7 +34,7 @@ dryrun:
 # Run the spider.
 # OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES is a workaround for mac OS.
 spider:
-	OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq --verbose --burst high default low
+	OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES venv/bin/rq worker --burst high default low

 export:
 	docker run --rm -ti \
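The old recipe invoked `rq` without the `worker` subcommand, so `make spider` failed; restoring the subcommand is the "Fix 'make spider' command" bullet. The next hunks, in the feed checker module, implement "Use UTC for feed item age calculation": a timezone import plus UTC-aware return values.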
@@ -5,6 +5,7 @@ Loads feeds linked from pages and collects information on the contained content
 import logging
 from time import mktime
 from datetime import datetime
+from datetime import timezone

 import feedparser
@@ -102,7 +103,7 @@ class Checker(AbstractChecker):
                 max_date = timestamp

        if max_date is not None:
-            return datetime.fromtimestamp(max_date)
+            return datetime.fromtimestamp(max_date).replace(tzinfo=timezone.utc)


    def find_first_entry(self, entries):
@@ -117,4 +118,4 @@ class Checker(AbstractChecker):
                 min_date = timestamp

        if min_date is not None:
-            return datetime.fromtimestamp(min_date)
+            return datetime.fromtimestamp(min_date).replace(tzinfo=timezone.utc)
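Why this matters, as a minimal standalone sketch (illustrative, not repository code): `datetime.fromtimestamp()` returns a naive datetime, and naive datetimes cannot be subtracted from timezone-aware ones, so computing a feed item's age against the current UTC time would raise. The example timestamp is hypothetical, standing in for what `mktime(entry.published_parsed)` produces above.

from datetime import datetime, timezone

timestamp = 1054633161.0  # epoch seconds, as produced by mktime() on a parsed entry date

naive = datetime.fromtimestamp(timestamp)                               # tzinfo is None
aware = datetime.fromtimestamp(timestamp).replace(tzinfo=timezone.utc)  # the pattern used above

now = datetime.now(timezone.utc)
# now - naive   # TypeError: can't subtract offset-naive and offset-aware datetimes
age = now - aware  # well-defined feed item age
print(age)

The test expectations change accordingly: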
@@ -6,6 +6,7 @@ from checks import html_head, page_content
 from checks import load_feeds
 from checks.config import Config
 from datetime import datetime
+from datetime import timezone

 from pprint import pprint
@@ -63,8 +64,8 @@ class TestFeed(unittest.TestCase):
         self.assertEqual(result['http://example.com/feed.xml'], {
             'exception': None,
             'average_interval': 340359,
-            'first_entry': datetime(2003, 5, 30, 11, 6, 42),
-            'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
+            'first_entry': datetime(2003, 5, 30, 11, 6, 42, tzinfo=timezone.utc),
+            'latest_entry': datetime(2003, 6, 3, 9, 39, 21, tzinfo=timezone.utc),
             'num_entries': 2,
             'title': 'Liftoff News',
         })
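With `timezone` imported, the expected values become timezone-aware and match the UTC-aware datetimes the checker now returns; a naive expected value compared against an aware actual value would make these assertions fail.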
cli.py (10 changed lines)
@@ -34,6 +34,10 @@ if __name__ == "__main__":
     # subcommands
     subparsers = parser.add_subparsers(help='sub-command help', dest='command')

+    # 'spider' subcommand to execute a job from the queue and store the result.
+    spider_parser = subparsers.add_parser('spider', help='Execute a spider job from the queue and store the result.')
+    spider_parser.add_argument('--job', help='JSON job data')
+
     # 'dryrun' subcommand to spider one URL without writing results back.
     dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results. ')
     dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
@@ -82,6 +86,12 @@ if __name__ == "__main__":
         result = spider.check_and_rate_site({"url": args.url, "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"})
         print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))

+    elif args.command == 'spider':
+        from spider import spider
+        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
+        job = json.loads(args.job)
+        spider.execute_single_job(datastore_client, job, "spider-results")
+
     else:
         parser.print_help()
         sys.exit(1)
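For illustration, a hypothetical equivalent of what the restored subcommand does, mirroring the wiring above (the job fields are copied from the dryrun example; the credentials path matches the one the deploy script copies to the server; the URL is a placeholder):

import json
from google.cloud import datastore
from spider import spider

client = datastore.Client.from_service_account_json('secrets/datastore-writer.json')
job = json.loads('{"url": "https://example.com/", "type": "REGIONAL_CHAPTER", "level": "DE:KREISVERBAND", "state": "Unnamed", "district": "Unnamed"}')
spider.execute_single_job(client, job, "spider-results")

The remaining hunks touch the deployment scripts: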
@@ -135,7 +135,7 @@ wait_for_server
 echo "Executing remote commands..."

 ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP << EOF
-DEBIAN_FRONTEND=noninteractive
+export DEBIAN_FRONTEND=noninteractive

 echo ""
 echo "Update package sources"
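Both deployment scripts get the same heredoc fix: a bare `DEBIAN_FRONTEND=noninteractive` line only assigns a shell variable inside the remote session, and child processes such as `apt-get` never see it. With `export`, the variable enters the environment of every subsequent command, so apt actually runs non-interactively. The larger run-job.sh changes follow: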
@@ -98,21 +98,22 @@ function wait_for_server()
 create_server $1
 wait_for_server

-echo "\nExecuting remote commands..."
+echo ""
+echo "Executing remote commands..."

 SSHCMD="ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP"
 SCPCMD="scp -o StrictHostKeyChecking=no -q"

 $SSHCMD << EOF
-DEBIAN_FRONTEND=noninteractive
+export DEBIAN_FRONTEND=noninteractive

 echo ""
 echo "Update package sources"
 apt-get update -q

 echo ""
 echo "Install dependencies"
-apt-get install -y apt-transport-https ca-certificates curl git gnupg2 lsb-release software-properties-common
+apt-get install -y apt-transport-https git gnupg2 software-properties-common

 echo ""
 echo "Add Docker key"
@@ -140,55 +141,69 @@ $SSHCMD << EOF

 echo ""
 echo "Test docker"
-docker pull hello-world
+docker run --rm hello-world

 mkdir /root/secrets
 EOF

-echo "\nCopying files to server"
+echo ""
+echo "Copying files to server"
 $SCPCMD secrets/datastore-writer.json root@$SERVER_IP:/root/secrets/datastore-writer.json
 $SCPCMD docker-compose.yaml root@$SERVER_IP:/root/docker-compose.yaml
 $SCPCMD job.py root@$SERVER_IP:/root/job.py
 $SCPCMD requirements.txt root@$SERVER_IP:/root/requirements.txt

-echo "\nInstalling Python dependencies"
-$SSHCMD apt-get install -y python3-pip build-essential
+echo ""
+echo "Installing Python dependencies"
+$SSHCMD DEBIAN_FRONTEND=noninteractive apt-get install -y python3-pip build-essential
 $SSHCMD pip3 install -r requirements.txt

-echo "\nCloning green-directory"
+echo ""
+echo "Cloning green-directory"
 $SSHCMD git clone --progress --depth 1 https://$GIT_TOKEN@git.verdigado.com/NB-Public/green-directory.git /root/cache/green-directory

-echo "\nPulling Docker images"
+echo ""
+echo "Pulling container images"
 $SSHCMD docker compose pull --quiet redis manager

-echo "\nStarting redis in background"
-$SSHCMD docker compose up -d redis
+echo ""
+echo "Starting redis in background"
+$SSHCMD docker compose up --detach redis
 sleep 5

-echo "\nCreating jobs"
+echo ""
+echo "Creating jobs"
 $SSHCMD docker compose up manager

-echo "\nQueue status:"
-$SSHCMD rq info --url redis://localhost:6379/0
+echo ""
+echo "Queue status"
+$SSHCMD rq info --url redis://localhost:6379

-echo "\nStarting worker (first run)"
-$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
+echo ""
+echo "Starting worker for first run"
+$SSHCMD rq worker --burst high default low --url redis://localhost:6379

-echo "\nRe-queuing failed jobs"
+echo ""
+echo "Re-queuing failed jobs"
 $SSHCMD rq requeue --queue low --all --url redis://localhost:6379

-echo "\nQueue status:"
-$SSHCMD rq info --url redis://localhost:6379/0
+echo ""
+echo "Queue status:"
+$SSHCMD rq info --url redis://localhost:6379

-echo "\nStarting worker (second run)"
-$SSHCMD rq worker --burst high default low --url redis://localhost:6379/0
+echo ""
+echo "Starting worker for second run"
+$SSHCMD JOB_TIMEOUT=100 rq worker --burst high default low --url redis://localhost:6379

-echo "\nDone."
+echo ""
+echo "Done."


 # Delete the box
-echo "\nDeleting server $SERVERNAME with ID $SERVER_ID"
+echo ""
+echo "Deleting server $SERVERNAME with ID $SERVER_ID"
 curl -s -X DELETE -H "Content-Type: application/json" \
     -H "Authorization: Bearer $API_TOKEN" \
     https://api.hetzner.cloud/v1/servers/$SERVER_ID
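Two details in this hunk: bash's builtin `echo` does not interpret `\n` without `-e`, so `echo "\nFoo"` printed a literal backslash-n; each banner is now an explicit `echo ""` plus `echo "Foo"` pair. Dropping the `/0` suffix from the Redis URLs is equivalent, since database 0 is the default. The compose service definition changes next: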
@@ -17,14 +17,12 @@ services:
   manager:
     image: ghcr.io/netzbegruenung/green-spider:latest
     command: >
-      python3 cli.py
+      python3 -u cli.py
       --credentials-path /secrets/datastore-writer.json
       --loglevel debug
       manager
     environment:
       REDIS_URL: redis://redis:6379/0
-      GIT_USERNAME: ${GIT_USERNAME}
-      GIT_PASSWORD: ${GIT_PASSWORD}
     volumes:
       - ${PWD}/secrets:/secrets
       - ${PWD}/cache/green-directory:/workdir/cache/green-directory
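`python3 -u` forces unbuffered stdout/stderr, which is the "Prevent output buffering in job creation" bullet: when stdout is a pipe rather than a TTY (as under docker compose), Python block-buffers it and the manager's log lines can show up long after the fact. A minimal standalone sketch of the same per-line effect from inside Python (setting `PYTHONUNBUFFERED=1` in the environment is another equivalent):

import time

# Without flushing (or python3 -u), these lines may sit in the buffer
# until the process exits when stdout is a pipe rather than a terminal.
for i in range(3):
    print(f"created job {i}", flush=True)
    time.sleep(1)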
job.py (2 changed lines)
@@ -12,7 +12,7 @@ import logging
 import docker
 from google.cloud import datastore

-# Maximum oper-job runtime in seconds. This can be increased for second, third attempt
+# Maximum per-job runtime in seconds. This can be increased for second, third attempt
 # via the environment JOB_TIMEOUT variable.
 TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
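This default is the knob the second worker run in run-job.sh now turns: `JOB_TIMEOUT=100` doubles the 50-second per-job budget for jobs retried after failing in the first run. The manager's job creation loop gets more precise counters: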
@@ -126,10 +126,11 @@ def create_jobs(url=None):

-    count = 0
     errorcount = 0
+    jobscount = 0
     logging.info("Writing jobs")

+    count = 0
     for entry in input_entries:
+        count += 1
         try:
             _ = queue.enqueue('job.run',
                 job_timeout=JOB_TTL,
@@ -141,7 +142,7 @@ def create_jobs(url=None):

             # Print job for debugging purposes
             logging.debug(f"Created job: {json.dumps(entry)}")
-            count += 1
+            jobscount += 1
         except Exception as e:
             errorcount += 1
             logging.error("Error adding job for URL %s: %s" % (entry['url'], e))
@@ -149,10 +150,9 @@ def create_jobs(url=None):
         # Write kubernetes Job
         make_k8s_job(entry, count)

-        count += 1
-
-    logging.info("Writing jobs done, %s jobs added", count)
-    logging.info("%d errors while writing jobs", errorcount)
+    logging.info("Processed %s entries", count)
+    logging.info("Created %s jobs", jobscount)
+    logging.info("%d errors", errorcount)


 def make_k8s_job(job_data, count):
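Together these three hunks produce the "more detailed results count": `count` now counts every processed input entry (incremented at the top of the loop), `jobscount` counts only successful enqueues, and `errorcount` the failures, where previously a single `count` conflated entries and created jobs.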
@@ -6,7 +6,7 @@ chardet==5.2.0
 click>=7,<8
 cssselect==1.2.0
 dnspython==2.6.1
-docker==4.4.1
+docker==5.0.3
 feedparser==6.0.8
 gitdb==4.0.9
 GitPython==3.1.41
@@ -72,6 +72,38 @@ def check_and_rate_site(entry):
     return result


+def execute_single_job(datastore_client, job, entity_kind):
+    """
+    Executes spider for one single job
+    """
+    validate_job(job)
+
+    logging.info("Starting job %s", job["url"])
+    result = check_and_rate_site(entry=job)
+
+    logging.info("Job %s finished checks", job["url"])
+    logging.info("Job %s writing to DB", job["url"])
+
+    key = datastore_client.key(entity_kind, job["url"])
+    entity = datastore.Entity(key=key)
+    record = {
+        'created': datetime.utcnow(),
+        'meta': result['meta'],
+        'checks': result['checks'],
+        'rating': result['rating'],
+        'score': result['score'],
+    }
+
+    entity.update(record)
+    try:
+        datastore_client.put(entity)
+        logging.debug("Successfully wrote record to database")
+    except InvalidArgument as ex:
+        logging.error("Could not write result: %s", ex)
+    except Exception as ex:
+        logging.error("Could not write result: %s", ex)
+
+
 def test_url(url):
     """
     Run the spider for a single URL and print the result.