Reorganize command line invocation

Marian Steinbach 2018-10-02 18:10:10 +02:00
parent b7b8e0feb4
commit 7b2c1fe7b5
9 changed files with 146 additions and 81 deletions

@@ -3,9 +3,10 @@ class Config(object):
Our configuration to be passed to checks
"""
def __init__(self, urls, user_agent):
def __init__(self, urls, user_agent=None):
self._urls = set(urls)
self._user_agent = user_agent
if user_agent:
self._user_agent = user_agent
def __repr__(self):
return "Config(urls=%r)" % self._urls
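
A minimal sketch of how the changed constructor can now be called; the URL and user-agent string below are placeholders, not values from the commit:

# user_agent is now optional and is only stored when a value is passed.
from checks.config import Config

default_config = Config(urls=['https://example.org/'])
custom_config = Config(urls=['https://example.org/'], user_agent='example-agent/1.0')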

@@ -0,0 +1,27 @@
from checks import certificate
from checks.config import Config
import unittest
class TestCertificateChecker(unittest.TestCase):
def test_google(self):
url = 'https://www.google.com/'
config = Config(urls=[url])
checker = certificate.Checker(config=config, previous_results={})
result = checker.run()
self.assertIn(url, result)
self.assertIsNone(result[url]['exception'])
self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services')
def test_kaarst(self):
url = 'https://www.gruenekaarst.de/'
config = Config(urls=[url])
checker = certificate.Checker(config=config, previous_results={})
result = checker.run()
self.assertIn(url, result)
self.assertIsNone(result[url]['exception'])
self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')
if __name__ == '__main__':
unittest.main()

cli.py Normal file

@@ -0,0 +1,83 @@
"""
Command line utility for spider, export etc.
"""
import argparse
import logging
import signal
import sys
from google.cloud import datastore
def handle_sigint(signum, frame):
"""
Handles SIGINT, which occurs on Ctrl-C
"""
print("\nInterrupted by SIGINT\n")
sys.exit()
if __name__ == "__main__":
signal.signal(signal.SIGINT,handle_sigint)
parser = argparse.ArgumentParser()
# global flags
parser.add_argument('--credentials-path', dest='credentials_path',
help='Path to the service account credentials JSON file',
default='/secrets/service-account.json')
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
default='info')
# subcommands
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
# spider subcommand
spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
# jobs subcommand
jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
# export subcommand
export_parser = subparsers.add_parser('export', help='Export JSON data')
export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)')
args = parser.parse_args()
# set log level
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
loglevel = args.loglevel.lower()
if loglevel == 'error':
logging.basicConfig(level=logging.ERROR)
elif loglevel == 'warn':
logging.basicConfig(level=logging.WARN)
elif loglevel == 'debug':
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("selenium").setLevel(logging.INFO)
else:
logging.basicConfig(level=logging.INFO)
loglevel = 'info'
logging.debug("Called command %s", args.command)
datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
if args.command == 'jobs':
import jobs
jobs.create_jobs(datastore_client, args.url)
elif args.command == 'export':
import export
export.export_screenshots(datastore_client)
export.export_results(datastore_client, args.kind)
else:
from spider import spider
spider.work_of_queue(datastore_client, args.kind)
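
To illustrate the new subcommand layout, here is a small standalone sketch that mirrors the argparse structure above (it is not the cli.py module itself):

import argparse

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
subparsers.add_parser('spider').add_argument('--kind', default='spider-results')
subparsers.add_parser('jobs').add_argument('--url')
subparsers.add_parser('export').add_argument('--kind', default='spider-results')

# e.g. invoking the spider subcommand with an explicit entity kind
args = parser.parse_args(['spider', '--kind', 'spider-results'])
print(args.command, args.kind)  # prints: spider spider-results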

@@ -21,6 +21,3 @@ GCMS_IP = "91.102.13.20"
# kind name of the spider job key datastore entities
JOB_DATASTORE_KIND = 'spider-jobs'
# kind name of the spider results datastore entities
# TODO: change back to 'spider-results'
RESULTS_DATASTORE_KIND = 'spider-results-dev'

@@ -2,8 +2,7 @@
Exports data from the database to JSON files for use in a static webapp
"""
from google.cloud import datastore
import hashlib
from hashlib import md5
import json
import logging
import sys
@@ -14,18 +13,14 @@ import requests
SITEICONS_PATH = "/icons"
SPIDER_RESULTS_ENTITY_KIND = 'spider-results-dev'
client = None
def export_results():
def export_results(client, entity_kind):
"""
Export of the main results data
"""
out = []
# Load data from database
query = client.query(kind=SPIDER_RESULTS_ENTITY_KIND)
query = client.query(kind=entity_kind)
for entity in query.fetch():
logging.debug(entity.key.name)
out.append({
@@ -40,6 +35,7 @@ def export_results():
})
# load icons, reformat icons details
icons_downloaded = set()
for index in range(len(out)):
assert "checks" in out[index]
assert "html_head" in out[index]["checks"]
@@ -49,11 +45,17 @@ def export_results():
for url in out[index]['checks']['html_head']:
assert 'link_icon' in out[index]['checks']['html_head'][url]
if out[index]['checks']['html_head'][url]['link_icon'] is not None:
icons.add(out[index]['checks']['html_head'][url]['link_icon'])
iconurl = out[index]['checks']['html_head'][url]['link_icon']
if iconurl.startswith("data:"):
continue
if iconurl in icons_downloaded:
continue
icons.add(iconurl)
out[index]["icons"] = {}
for iconurl in list(icons):
logging.debug("Dowloading icon %s", iconurl)
icons_downloaded.add(iconurl)
filename = download_icon(iconurl)
if filename:
out[index]["icons"][url] = filename
@@ -61,9 +63,17 @@ def export_results():
output_filename = "/out/spider_result.json"
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
# compact version
output_filename = "/out/spider_result_compact.json"
for i in range(len(out)):
out[i]['cms'] = list(out[i]['checks']['generator'].values())
del out[i]['checks']
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
def export_screenshots():
def export_screenshots(client):
"""
Export of screenshot meta data
"""
@@ -90,10 +100,12 @@ def download_icon(icon_url):
"""
default_endings = {
"image/x-ico": "ico",
"image/x-icon": "ico",
"image/vnd.microsoft.icon": "ico",
"image/png": "png",
"image/jpeg": "jpg",
"image/gif": "gif",
}
# Download the icon
@@ -104,7 +116,7 @@ def download_icon(icon_url):
if req.status_code >= 400:
return None
content_hash = hashlib.md5(req.content).hexdigest()
content_hash = md5(req.content).hexdigest()
extension = ""
try:
@@ -121,6 +133,9 @@ def download_icon(icon_url):
if extension == "":
# derive from content type
ctype = req.headers.get('content-type')
if ctype is None:
return
try:
extension = default_endings[ctype]
except KeyError:
@@ -134,18 +149,3 @@ def download_icon(icon_url):
iconfile.write(req.content)
return filename
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
if len(sys.argv) == 1:
print("Error: please provide path to Google Storage API system account JSON file as argument")
sys.exit(1)
key_path = sys.argv[1]
client = datastore.Client.from_service_account_json(key_path)
# TODO: Re-enable
#export_screenshots()
export_results()
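
As a usage sketch, the export functions are now driven with an explicit datastore client and entity kind, the way cli.py calls them; the credentials path and kind below are the cli.py defaults:

from google.cloud import datastore
import export

# The client and entity kind are passed in explicitly instead of module-level globals.
client = datastore.Client.from_service_account_json('/secrets/service-account.json')
export.export_screenshots(client)
export.export_results(client, 'spider-results')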

@@ -15,6 +15,7 @@ from rating import reachable
from rating import resolvable
from rating import response_duration
from rating import responsive_layout
from rating import use_specific_fonts
from rating import www_optional
@@ -38,6 +39,7 @@ def calculate_rating(results):
'NO_SCRIPT_ERRORS': no_script_errors,
'RESPONSIVE': responsive_layout,
'SITE_REACHABLE': reachable,
'USE_SPECIFIC_FONTS': use_specific_fonts,
'WWW_OPTIONAL': www_optional,
}

spider/__init__.py Normal file

@@ -19,9 +19,6 @@ import config
import jobs
import rating
DATASTORE_CLIENT = None
def check_and_rate_site(entry):
"""
Performs our site check and returns results as a dict.
@@ -72,12 +69,12 @@ def check_and_rate_site(entry):
return result
def work_of_queue():
def work_of_queue(datastore_client, entity_kind):
"""
Take job from queue and finish it until there are no more jobs
"""
while True:
job = jobs.get_job_from_queue(DATASTORE_CLIENT)
job = jobs.get_job_from_queue(datastore_client)
if job is None:
logging.info("No more jobs. Exiting.")
break
@@ -90,7 +87,7 @@ def work_of_queue():
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])
key = DATASTORE_CLIENT.key(config.RESULTS_DATASTORE_KIND, job["url"])
key = datastore_client.key(entity_kind, job["url"])
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
record = {
'created': datetime.utcnow(),
@@ -101,51 +98,9 @@ def work_of_queue():
}
entity.update(record)
try:
DATASTORE_CLIENT.put(entity)
datastore_client.put(entity)
except InvalidArgument as ex:
logging.error("Could not write result: %s", ex)
except Exception as ex:
logging.error("Could not write result: %s", ex)
if __name__ == "__main__":
"""
Bringing it all together
"""
parser = argparse.ArgumentParser()
parser.add_argument('--credentials-path', dest='credentials_path',
help='Path to the service account credentials JSON file',
default='/secrets/service-account.json')
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
default='info')
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
subparsers.add_parser('spider', help='Take jobs off the queue and spider')
jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
jobs_parser.add_argument('--url', help='Add a job to spider a URL')
args = parser.parse_args()
loglevel = args.loglevel.lower()
if loglevel == 'error':
logging.basicConfig(level=logging.ERROR)
elif loglevel == 'warn':
logging.basicConfig(level=logging.WARN)
elif loglevel == 'debug':
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
loglevel = 'info'
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
logging.debug("Called command %s", args.command)
if args.command == 'jobs':
jobs.create_jobs(DATASTORE_CLIENT, args.url)
else:
work_of_queue()
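
Correspondingly, a sketch of how the spider is now started with an explicit datastore client and entity kind, matching the call in cli.py:

from google.cloud import datastore
from spider import spider

client = datastore.Client.from_service_account_json('/secrets/service-account.json')
spider.work_of_queue(client, 'spider-results')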

@@ -1,7 +1,7 @@
import unittest
import requests
import responses
import spider
from spider import spider
class TestDeriveHostnames(unittest.TestCase):