green-spider/cli.py

"""
Command line utility for spider, export etc.
"""
import argparse
import json
import logging
import signal
import sys

from google.cloud import datastore


def handle_sigint(signum, frame):
    """
    Handles SIGINT, which occurs on Ctrl-C
    """
    print("\nInterrupted by SIGINT\n")
    # sys.exit() without an argument exits with status 0.
    sys.exit()


if __name__ == "__main__":
    signal.signal(signal.SIGINT, handle_sigint)

    parser = argparse.ArgumentParser()

    # global flags
    parser.add_argument('--credentials-path', dest='credentials_path',
                        help='Path to the service account credentials JSON file',
                        default='/secrets/service-account.json')
    parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
                        default='info')

    # subcommands
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')

    # 'spider' subcommand to execute a job from the queue and store the result.
    spider_parser = subparsers.add_parser('spider', help='Execute a spider job from the queue and store the result.')
    spider_parser.add_argument('--job', help='JSON job data')
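    # The --job payload is a JSON object; the dryrun branch below shows the
    # fields a job carries (url, type, level, state, district).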

    # 'dryrun' subcommand to spider one URL without writing results back.
    dryrun_parser = subparsers.add_parser('dryrun', help='Spider an arbitrary URL without storing results.')
    dryrun_parser.add_argument('url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')

    # 'manager' subcommand
    manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
    manager_parser.add_argument('--url', help='Add a job to spider a specific URL')

    # 'export' subcommand
    export_parser = subparsers.add_parser('export', help='Export JSON data')
    export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)')

    args = parser.parse_args()

    # Set the log level. urllib3 is silenced separately to keep
    # per-connection noise out of the output.
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)
    loglevel = args.loglevel.lower()
    if loglevel == 'error':
        logging.basicConfig(level=logging.ERROR)
    elif loglevel == 'warn':
        logging.basicConfig(level=logging.WARNING)
    elif loglevel == 'debug':
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger("selenium").setLevel(logging.INFO)
    else:
        # Unknown values fall back to the 'info' default.
        logging.basicConfig(level=logging.INFO)
        loglevel = 'info'

    logging.debug("Called command %s", args.command)

    # Subcommand modules are imported lazily, so only the dependencies of
    # the chosen command are loaded.
    if args.command == 'manager':
        import manager
        manager.create_jobs(args.url)

    elif args.command == 'export':
        import export
        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
        export.export_results(datastore_client, args.kind)

    elif args.command == 'dryrun':
        from spider import spider
        from export.datetimeencoder import DateTimeEncoder
        result = spider.check_and_rate_site({
            "url": args.url,
            "type": "REGIONAL_CHAPTER",
            "level": "DE:KREISVERBAND",
            "state": "Unnamed",
            "district": "Unnamed",
        })
        print(json.dumps(result, indent=2, sort_keys=True, ensure_ascii=False, cls=DateTimeEncoder))

    elif args.command == 'spider':
        from spider import spider
        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
        # Guard against a missing --job value, which would otherwise make
        # json.loads raise a TypeError.
        if args.job is None:
            parser.error("the 'spider' command requires --job with JSON job data")
        job = json.loads(args.job)
        spider.execute_single_job(datastore_client, job, "spider-results")

    else:
        parser.print_help()
        sys.exit(1)