Helps you optimize your BÜNDNIS 90/DIE GRÜNEN website
https://green-spider.netzbegruenung.de/
""" |
|
Command line utility for spider, export etc. |
|
""" |
|
|
|
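
# Illustrative invocations (the filename "cli.py" is an assumption here; use
# the actual name of this module in your checkout):
#
#   python cli.py manager                             # queue jobs for all green-directory URLs
#   python cli.py spider                              # work off jobs from the queue
#   python cli.py spider --url https://example.com/   # spider one URL (testing/debugging)
#   python cli.py export --kind spider-results        # export result entities as JSON
#
# Note that global flags such as --loglevel and --credentials-path go before
# the subcommand.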

import argparse
import logging
import signal
import sys
import json

from google.cloud import datastore

def handle_sigint(signum, frame):
    """
    Handles SIGINT, which occurs on Ctrl-C
    """
    print("\nInterrupted by SIGINT\n")
    sys.exit()

if __name__ == "__main__":
    signal.signal(signal.SIGINT, handle_sigint)

    parser = argparse.ArgumentParser()

    # global flags
    parser.add_argument('--credentials-path', dest='credentials_path',
                        help='Path to the service account credentials JSON file',
                        default='/secrets/service-account.json')

    parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
                        default='info')

    # subcommands
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')

    # spider subcommand
    spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider them')
    spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
    spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
    spider_parser.add_argument('--job', help='Job as a JSON object. Spiders the given URL, writes the result back, and exits.')
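
    # The expected structure of the --job JSON is defined by the spider
    # module, not in this file. A hypothetical value (field name assumed)
    # might look like: --job '{"url": "https://example.com/"}'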

    # manager subcommand
    manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
    manager_parser.add_argument('--url', help='Add a job to spider a specific URL')

    # export subcommand
    export_parser = subparsers.add_parser('export', help='Export JSON data')
    export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)')

    args = parser.parse_args()

    # set log level
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    loglevel = args.loglevel.lower()
    if loglevel == 'error':
        logging.basicConfig(level=logging.ERROR)
    elif loglevel == 'warn':
        logging.basicConfig(level=logging.WARN)
    elif loglevel == 'debug':
        logging.basicConfig(level=logging.DEBUG)
        # keep selenium at INFO so it does not flood the debug output
        logging.getLogger("selenium").setLevel(logging.INFO)
    else:
        # 'info' is the default; unrecognized values fall back to it as well
        logging.basicConfig(level=logging.INFO)
        loglevel = 'info'

    logging.debug("Called command %s", args.command)

    if args.command == 'manager':

        # deferred import, so each subcommand only loads its own dependencies
        import manager
        manager.create_jobs(args.url)

    elif args.command == 'export':

        import export
        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
        export.export_results(datastore_client, args.kind)

    else:
        from spider import spider
        datastore_client = datastore.Client.from_service_account_json(args.credentials_path)

        if args.url:
            # spider one URL for diagnostic purposes
            spider.test_url(args.url)
        elif args.job:
            job = json.loads(args.job)
            spider.execute_single_job(datastore_client, job, args.kind)
        else:
            spider.work_of_queue(datastore_client, args.kind)