diff --git a/.travis.yml b/.travis.yml
index bd97de9..6db4ab9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,7 +11,5 @@ python:
   - "3.6"
 
 script:
-  - pip install --upgrade pip
-  - pip install --upgrade codecov
+  - make
   - make test
-  - codecov
diff --git a/Makefile b/Makefile
index 80332a7..3e964ef 100644
--- a/Makefile
+++ b/Makefile
@@ -25,7 +25,7 @@ spider:
 		$(IMAGE) \
 		--credentials-path /secrets/datastore-writer.json \
 		--loglevel debug \
-		spider --kind $(DB_ENTITY)
+		spider --kind $(DB_ENTITY) ${ARGS}
 
 export:
 	docker run --rm -ti \
diff --git a/checks/certificate_test.py b/checks/certificate_test.py
index 53588fd..b7f2ef2 100644
--- a/checks/certificate_test.py
+++ b/checks/certificate_test.py
@@ -24,7 +24,7 @@ class TestCertificateChecker(unittest.TestCase):
         result = checker.run()
         self.assertIn(url, result)
         self.assertIsNone(result[url]['exception'])
-        self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')
+        self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited')
 
     def test_tls_v_1_0(self):
         """Load a certificate for a TLS v1.0 server"""
diff --git a/checks/load_feeds_test.py b/checks/load_feeds_test.py
index b0f2c3f..3e8dfc2 100644
--- a/checks/load_feeds_test.py
+++ b/checks/load_feeds_test.py
@@ -7,6 +7,8 @@ from checks import load_feeds
 from checks.config import Config
 from datetime import datetime
 
+from pprint import pprint
+
 
 @httprettified
 class TestFeed(unittest.TestCase):
@@ -56,14 +58,14 @@ class TestFeed(unittest.TestCase):
 
         checker = load_feeds.Checker(config=config, previous_results=results)
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
                 'exception': None,
                 'title': 'Liftoff News',
-                'latest_entry': datetime(2003, 6, 3, 10, 39, 21),
-                'first_entry': datetime(2003, 5, 30, 12, 6, 42),
+                'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
+                'first_entry': datetime(2003, 5, 30, 11, 6, 42),
                 'average_interval': 340359,
                 'num_entries': 2,
             }
@@ -104,7 +106,7 @@ class TestFeed(unittest.TestCase):
 
         checker = load_feeds.Checker(config=config, previous_results=results)
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
@@ -161,7 +163,7 @@ class TestFeed(unittest.TestCase):
 
         checker = load_feeds.Checker(config=config, previous_results=results)
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
diff --git a/checks/load_in_browser_test.py b/checks/load_in_browser_test.py
index c30b068..bd7c448 100644
--- a/checks/load_in_browser_test.py
+++ b/checks/load_in_browser_test.py
@@ -38,7 +38,7 @@ class TestLoadInBrowser(unittest.TestCase):
             'httpOnly': False,
             'name': 'cookiename',
             'path': '/',
-            'secure': True,
+            'secure': False,
             'value': 'cookievalue'
         }])
 
diff --git a/checks/url_reachability.py b/checks/url_reachability.py
index 24b19a9..2ae52c1 100644
--- a/checks/url_reachability.py
+++ b/checks/url_reachability.py
@@ -80,17 +80,19 @@ class Checker(AbstractChecker):
 
             # if redirects end in www.facebook.com or www.denic.de, remove this URL again
             # remove if redirect target is facebook
-            if result['exception'] is not None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
-                parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
-                if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
-                    result[url]['exception'] = {
+            if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
+                target_url = result['redirect_history'][-1]['redirect_to']
+                parsed = urlparse(target_url)
+                if parsed.netloc in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
+                    result['exception'] = {
                         'type': 'Bad target domain',
-                        'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname,
+                        'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.netloc,
                     }
-                    self.config.remove_url(url)
+                    self.config.remove_url(target_url)
+                    print("Removing URL %s" % target_url)
 
             results[url] = result
-
+
         return results
 
     def expand_history(self, history):
diff --git a/cli.py b/cli.py
index 3851163..2d41d4b 100644
--- a/cli.py
+++ b/cli.py
@@ -36,6 +36,7 @@ if __name__ == "__main__":
     # spider subcommand
     spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
     spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
+    spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
 
     # jobs subcommand
     jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
@@ -79,4 +80,8 @@ if __name__ == "__main__":
     else:
         from spider import spider
 
-        spider.work_of_queue(datastore_client, args.kind)
+        if args.url:
+            # spider one URL for diagnostic purposes
+            spider.test_url(args.url)
+        else:
+            spider.work_of_queue(datastore_client, args.kind)
diff --git a/spider/spider.py b/spider/spider.py
index e47c701..8ad110e 100644
--- a/spider/spider.py
+++ b/spider/spider.py
@@ -65,6 +65,21 @@ def check_and_rate_site(entry):
 
     return result
 
+def test_url(url):
+    """
+    Run the spider for a single URL and print the result.
+    Doesn't write anything to the database.
+    """
+    logging.info("Crawling URL %s", url)
+
+    # mock job
+    job = {
+        "url": url,
+    }
+
+    result = check_and_rate_site(entry=job)
+    pprint(result['rating'])
+
 def work_of_queue(datastore_client, entity_kind):
     """
     Take job from queue and finish it until there are no more jobs
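
Usage note (not part of the patch): the ${ARGS} passthrough added to the Makefile's spider target composes with the new --url option in cli.py. A sketch of the intended diagnostic invocation, assuming the image entrypoint is cli.py as in the existing Makefile targets; the URL is a placeholder:

    # spider a single site without pulling jobs from the queue
    make spider ARGS="--url https://example.com/"

Inside the container this expands to `spider --kind $(DB_ENTITY) --url https://example.com/`, so cli.py calls spider.test_url(), which runs check_and_rate_site() once on a mock job and pretty-prints the rating instead of writing to the datastore.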
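Side note on the parsed.hostname -> parsed.netloc switch in checks/url_reachability.py: for a plain redirect target like https://www.facebook.com/... the two attributes are identical, but netloc preserves case and keeps any port or userinfo, while hostname is lowercased with both stripped. A minimal standard-library illustration (the URLs are made up):

    from urllib.parse import urlparse

    urlparse('https://www.facebook.com/some-page').netloc       # 'www.facebook.com'
    urlparse('https://WWW.Facebook.com:443/some-page').netloc   # 'WWW.Facebook.com:443'
    urlparse('https://WWW.Facebook.com:443/some-page').hostname # 'www.facebook.com'

Matching against the blocklist tuple therefore behaves the same for lowercase, port-free targets, which is presumably what these redirect histories contain.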