mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-01 08:34:51 +02:00
Reorganize command line invocation
This commit is contained in:
parent
b7b8e0feb4
commit
7b2c1fe7b5
|
@ -3,9 +3,10 @@ class Config(object):
|
|||
Our configuration to be passed to checks
|
||||
"""
|
||||
|
||||
def __init__(self, urls, user_agent):
|
||||
def __init__(self, urls, user_agent=None):
|
||||
self._urls = set(urls)
|
||||
self._user_agent = user_agent
|
||||
if user_agent:
|
||||
self._user_agent = user_agent
|
||||
|
||||
def __repr__(self):
|
||||
return "Config(urls=%r)" % self._urls
|
||||
|
|
27
checks/test_certificate.py
Normal file
27
checks/test_certificate.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
from checks import certificate
|
||||
from checks.config import Config
|
||||
import unittest
|
||||
|
||||
class TestCertificateChecker(unittest.TestCase):
    """
    Runs the certificate checker against real HTTPS URLs and compares the
    reported issuer organization (presumably requires network access).
    """

    def _assert_issuer_organization(self, url, organization):
        """
        Runs the certificate check for a single URL and verifies that the
        result contains the URL, carries no exception and names the
        expected issuer organization.
        """
        checker = certificate.Checker(config=Config(urls=[url]), previous_results={})
        result = checker.run()

        self.assertIn(url, result)
        entry = result[url]
        self.assertIsNone(entry['exception'])
        self.assertEqual(entry['issuer']['O'], organization)

    def test_google(self):
        self._assert_issuer_organization('https://www.google.com/',
                                         'Google Trust Services')

    def test_kaarst(self):
        self._assert_issuer_organization('https://www.gruenekaarst.de/',
                                         'COMODO CA Limited')
|
||||
|
||||
|
||||
# Allow running this test module directly as a script; unittest.main()
# then runs the test cases defined in this module.
if __name__ == '__main__':
    unittest.main()
|
83
cli.py
Normal file
83
cli.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
"""
|
||||
Command line utility for spider, export etc.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
|
||||
from google.cloud import datastore
|
||||
|
||||
def handle_sigint(signum, frame):
    """
    Signal handler for SIGINT (sent on Ctrl-C): prints a short notice
    and terminates the process.

    Both arguments are part of the signal handler calling convention
    and are not used here.
    """
    notice = "\nInterrupted by SIGINT\n"
    print(notice)
    # Same effect as sys.exit() without arguments: SystemExit with code None.
    raise SystemExit
|
||||
|
||||
|
||||
def build_parser():
    """
    Builds the command line parser with the global flags and the
    'spider', 'jobs' and 'export' subcommands.

    A subcommand is mandatory: the '--kind' and '--url' options only exist
    on their subparsers, so dispatching without a subcommand would fail on
    a missing attribute. argparse reports the omission as a usage error.
    """
    parser = argparse.ArgumentParser()

    # global flags
    parser.add_argument('--credentials-path', dest='credentials_path',
                        help='Path to the service account credentials JSON file',
                        default='/secrets/service-account.json')

    parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
                        default='info')

    # subcommands
    subparsers = parser.add_subparsers(help='sub-command help', dest='command')
    # Set as an attribute (instead of the required= keyword) so this also
    # works on Python versions older than 3.7.
    subparsers.required = True

    # spider subcommand
    spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
    spider_parser.add_argument('--kind', default='spider-results',
                               help='Datastore entity kind to write (default: spider-results)')

    # jobs subcommand
    jobs_parser = subparsers.add_parser(
        'jobs',
        help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
    jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')

    # export subcommand
    export_parser = subparsers.add_parser('export', help='Export JSON data')
    export_parser.add_argument('--kind', default='spider-results',
                               help='Datastore entity kind to export (default: spider-results)')

    return parser


def configure_logging(level_name):
    """
    Configures the root logger from a level name given on the command line.

    Recognized names (case-insensitive) are 'error', 'warn' and 'debug';
    any other value results in the INFO level.
    """
    # urllib3 only gets to log critical messages, whatever level was chosen
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    levels = {
        'error': logging.ERROR,
        'warn': logging.WARN,
        'debug': logging.DEBUG,
    }
    level = levels.get(level_name.lower(), logging.INFO)
    logging.basicConfig(level=level)

    if level == logging.DEBUG:
        # keep selenium at INFO even when everything else logs at DEBUG
        logging.getLogger("selenium").setLevel(logging.INFO)


def main(argv=None):
    """
    Command line entry point: parses the arguments, sets up logging and
    the datastore client, then runs the selected subcommand.

    argv is the list of arguments to parse; None means sys.argv[1:].
    """
    signal.signal(signal.SIGINT, handle_sigint)

    args = build_parser().parse_args(argv)

    configure_logging(args.loglevel)
    logging.debug("Called command %s", args.command)

    datastore_client = datastore.Client.from_service_account_json(args.credentials_path)

    # The command modules are imported lazily, so only the module needed
    # by the chosen subcommand gets loaded.
    if args.command == 'jobs':
        import jobs
        jobs.create_jobs(datastore_client, args.url)

    elif args.command == 'export':
        import export
        export.export_screenshots(datastore_client)
        export.export_results(datastore_client, args.kind)

    else:
        # 'spider' is the only remaining subcommand
        from spider import spider
        spider.work_of_queue(datastore_client, args.kind)


if __name__ == "__main__":
    main()
|
|
@ -21,6 +21,3 @@ GCMS_IP = "91.102.13.20"
|
|||
# kind name of the spider job key datastore entities
|
||||
JOB_DATASTORE_KIND = 'spider-jobs'
|
||||
|
||||
# kind name of the spider results datastore entities
|
||||
# TODO: change back to 'spider-results'
|
||||
RESULTS_DATASTORE_KIND = 'spider-results-dev'
|
||||
|
|
|
@ -2,8 +2,7 @@
|
|||
Exports data from the database to JSON files for use in a static webapp
|
||||
"""
|
||||
|
||||
from google.cloud import datastore
|
||||
import hashlib
|
||||
from hashlib import md5
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
|
@ -14,18 +13,14 @@ import requests
|
|||
|
||||
SITEICONS_PATH = "/icons"
|
||||
|
||||
SPIDER_RESULTS_ENTITY_KIND = 'spider-results-dev'
|
||||
|
||||
client = None
|
||||
|
||||
def export_results():
|
||||
def export_results(client, entity_kind):
|
||||
"""
|
||||
Export of the main results data
|
||||
"""
|
||||
out = []
|
||||
|
||||
# Load data from database
|
||||
query = client.query(kind=SPIDER_RESULTS_ENTITY_KIND)
|
||||
query = client.query(kind=entity_kind)
|
||||
for entity in query.fetch():
|
||||
logging.debug(entity.key.name)
|
||||
out.append({
|
||||
|
@ -40,6 +35,7 @@ def export_results():
|
|||
})
|
||||
|
||||
# load icons, reformat icons details
|
||||
icons_downloaded = set()
|
||||
for index in range(len(out)):
|
||||
assert "checks" in out[index]
|
||||
assert "html_head" in out[index]["checks"]
|
||||
|
@ -49,11 +45,17 @@ def export_results():
|
|||
for url in out[index]['checks']['html_head']:
|
||||
assert 'link_icon' in out[index]['checks']['html_head'][url]
|
||||
if out[index]['checks']['html_head'][url]['link_icon'] is not None:
|
||||
icons.add(out[index]['checks']['html_head'][url]['link_icon'])
|
||||
iconurl = out[index]['checks']['html_head'][url]['link_icon']
|
||||
if iconurl.startswith("data:"):
|
||||
continue
|
||||
if iconurl in icons_downloaded:
|
||||
continue
|
||||
icons.add(iconurl)
|
||||
|
||||
out[index]["icons"] = {}
|
||||
for iconurl in list(icons):
|
||||
logging.debug("Dowloading icon %s", iconurl)
|
||||
icons_downloaded.add(iconurl)
|
||||
filename = download_icon(iconurl)
|
||||
if filename:
|
||||
out[index]["icons"][url] = filename
|
||||
|
@ -61,9 +63,17 @@ def export_results():
|
|||
output_filename = "/out/spider_result.json"
|
||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
# compact version
|
||||
output_filename = "/out/spider_result_compact.json"
|
||||
for i in range(len(out)):
|
||||
out[i]['cms'] = list(out[i]['checks']['generator'].values())
|
||||
del out[i]['checks']
|
||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
|
||||
def export_screenshots():
|
||||
def export_screenshots(client):
|
||||
"""
|
||||
Export of screenshot meta data
|
||||
"""
|
||||
|
@ -90,10 +100,12 @@ def download_icon(icon_url):
|
|||
"""
|
||||
|
||||
default_endings = {
|
||||
"image/x-ico": "ico",
|
||||
"image/x-icon": "ico",
|
||||
"image/vnd.microsoft.icon": "ico",
|
||||
"image/png": "png",
|
||||
"image/jpeg": "jpg",
|
||||
"image/gif": "gif",
|
||||
}
|
||||
|
||||
# Download the icon
|
||||
|
@ -104,7 +116,7 @@ def download_icon(icon_url):
|
|||
if req.status_code >= 400:
|
||||
return None
|
||||
|
||||
content_hash = hashlib.md5(req.content).hexdigest()
|
||||
content_hash = md5(req.content).hexdigest()
|
||||
extension = ""
|
||||
|
||||
try:
|
||||
|
@ -121,6 +133,9 @@ def download_icon(icon_url):
|
|||
if extension == "":
|
||||
# derive from content type
|
||||
ctype = req.headers.get('content-type')
|
||||
if ctype is None:
|
||||
return
|
||||
|
||||
try:
|
||||
extension = default_endings[ctype]
|
||||
except KeyError:
|
||||
|
@ -134,18 +149,3 @@ def download_icon(icon_url):
|
|||
iconfile.write(req.content)
|
||||
|
||||
return filename
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
if len(sys.argv) == 1:
|
||||
print("Error: please provide path to Google Storage API system account JSON file as argument")
|
||||
sys.exit(1)
|
||||
|
||||
key_path = sys.argv[1]
|
||||
client = datastore.Client.from_service_account_json(key_path)
|
||||
|
||||
# TODO: Re-enable
|
||||
#export_screenshots()
|
||||
export_results()
|
|
@ -15,6 +15,7 @@ from rating import reachable
|
|||
from rating import resolvable
|
||||
from rating import response_duration
|
||||
from rating import responsive_layout
|
||||
from rating import use_specific_fonts
|
||||
from rating import www_optional
|
||||
|
||||
|
||||
|
@ -38,6 +39,7 @@ def calculate_rating(results):
|
|||
'NO_SCRIPT_ERRORS': no_script_errors,
|
||||
'RESPONSIVE': responsive_layout,
|
||||
'SITE_REACHABLE': reachable,
|
||||
'USE_SPECIFIC_FONTS': use_specific_fonts,
|
||||
'WWW_OPTIONAL': www_optional,
|
||||
}
|
||||
|
||||
|
|
0
spider/__init__.py
Normal file
0
spider/__init__.py
Normal file
|
@ -19,9 +19,6 @@ import config
|
|||
import jobs
|
||||
import rating
|
||||
|
||||
DATASTORE_CLIENT = None
|
||||
|
||||
|
||||
def check_and_rate_site(entry):
|
||||
"""
|
||||
Performs our site check and returns results as a dict.
|
||||
|
@ -72,12 +69,12 @@ def check_and_rate_site(entry):
|
|||
return result
|
||||
|
||||
|
||||
def work_of_queue():
|
||||
def work_of_queue(datastore_client, entity_kind):
|
||||
"""
|
||||
Take job from queue and finish it until there are no more jobs
|
||||
"""
|
||||
while True:
|
||||
job = jobs.get_job_from_queue(DATASTORE_CLIENT)
|
||||
job = jobs.get_job_from_queue(datastore_client)
|
||||
if job is None:
|
||||
logging.info("No more jobs. Exiting.")
|
||||
break
|
||||
|
@ -90,7 +87,7 @@ def work_of_queue():
|
|||
logging.info("Job %s finished checks", job["url"])
|
||||
logging.info("Job %s writing to DB", job["url"])
|
||||
|
||||
key = DATASTORE_CLIENT.key(config.RESULTS_DATASTORE_KIND, job["url"])
|
||||
key = datastore_client.key(entity_kind, job["url"])
|
||||
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
|
||||
record = {
|
||||
'created': datetime.utcnow(),
|
||||
|
@ -101,51 +98,9 @@ def work_of_queue():
|
|||
}
|
||||
entity.update(record)
|
||||
try:
|
||||
DATASTORE_CLIENT.put(entity)
|
||||
datastore_client.put(entity)
|
||||
except InvalidArgument as ex:
|
||||
logging.error("Could not write result: %s", ex)
|
||||
except Exception as ex:
|
||||
logging.error("Could not write result: %s", ex)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
Bringing it all together
|
||||
"""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--credentials-path', dest='credentials_path',
|
||||
help='Path to the service account credentials JSON file',
|
||||
default='/secrets/service-account.json')
|
||||
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
|
||||
default='info')
|
||||
|
||||
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
|
||||
|
||||
subparsers.add_parser('spider', help='Take jobs off the queue and spider')
|
||||
|
||||
jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
|
||||
|
||||
jobs_parser.add_argument('--url', help='Add a job to spider a URL')
|
||||
args = parser.parse_args()
|
||||
|
||||
loglevel = args.loglevel.lower()
|
||||
if loglevel == 'error':
|
||||
logging.basicConfig(level=logging.ERROR)
|
||||
elif loglevel == 'warn':
|
||||
logging.basicConfig(level=logging.WARN)
|
||||
elif loglevel == 'debug':
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
loglevel = 'info'
|
||||
|
||||
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
|
||||
|
||||
DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
|
||||
|
||||
logging.debug("Called command %s", args.command)
|
||||
|
||||
if args.command == 'jobs':
|
||||
jobs.create_jobs(DATASTORE_CLIENT, args.url)
|
||||
else:
|
||||
work_of_queue()
|
|
@ -1,7 +1,7 @@
|
|||
import unittest
|
||||
import requests
|
||||
import responses
|
||||
import spider
|
||||
from spider import spider
|
||||
|
||||
|
||||
class TestDeriveHostnames(unittest.TestCase):
|
Loading…
Reference in a new issue