""" Command line utility for spider, export etc. """ import argparse import logging import signal import sys import json from google.cloud import datastore def handle_sigint(signum, frame): """ Handles SIGINT, which occurs on Ctrl-C """ print("\nInterrupted by SIGINT\n") sys.exit() if __name__ == "__main__": signal.signal(signal.SIGINT, handle_sigint) parser = argparse.ArgumentParser() # global flags parser.add_argument('--credentials-path', dest='credentials_path', help='Path to the service account credentials JSON file', default='/secrets/service-account.json') parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)", default='info') # subcommands subparsers = parser.add_subparsers(help='sub-command help', dest='command') # spider subcommand spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider') spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)') spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.') spider_parser.add_argument('--job', help='Job JSON object. To spider one URL, write the result back and exit.') # manager subcommand manager_parser = subparsers.add_parser('manager', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.') manager_parser.add_argument('--url', help='Add a job to spider a specific URL') # export subcommand export_parser = subparsers.add_parser('export', help='Export JSON data') export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)') args = parser.parse_args() # set log level logging.getLogger("urllib3").setLevel(logging.CRITICAL) loglevel = args.loglevel.lower() if loglevel == 'error': logging.basicConfig(level=logging.ERROR) elif loglevel == 'warn': logging.basicConfig(level=logging.WARN) elif loglevel == 'debug': logging.basicConfig(level=logging.DEBUG) logging.getLogger("selenium").setLevel(logging.INFO) else: logging.basicConfig(level=logging.INFO) loglevel = 'info' logging.debug("Called command %s", args.command) if args.command == 'manager': import manager manager.create_jobs(args.url) elif args.command == 'export': import export datastore_client = datastore.Client.from_service_account_json(args.credentials_path) export.export_results(datastore_client, args.kind) else: from spider import spider datastore_client = datastore.Client.from_service_account_json(args.credentials_path) if args.url: # spider one URL for diagnostic purposes spider.test_url(args.url) elif args.job: job = json.loads(args.job) spider.execute_single_job(datastore_client, job, args.kind) else: spider.work_of_queue(datastore_client, args.kind)