Bug fix in the url_reachability check (#108)
* Fix detection of redirects to bad domains
* Fix bad domain check
* Add --url flag to spider for faster debugging
* Pass args to make spider
* Add spidering of a single URL for debugging purposes
* Fix tests
* Fix test in CI
* Remove pip upgrade
parent 2dfcf61cc0
commit 5e8347916c
@@ -11,7 +11,5 @@ python:
 - "3.6"
 
 script:
-- pip install --upgrade pip
-- pip install --upgrade codecov
+- make
 - make test
-- codecov
Makefile (2 changed lines)
@@ -25,7 +25,7 @@ spider:
 		$(IMAGE) \
 		--credentials-path /secrets/datastore-writer.json \
 		--loglevel debug \
-		spider --kind $(DB_ENTITY)
+		spider --kind $(DB_ENTITY) ${ARGS}
 
 export:
 	docker run --rm -ti \
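In GNU make, ${ARGS} and $(ARGS) are equivalent variable references, and an unset ARGS expands to nothing, so existing invocations keep working. The variable can be set per invocation on the command line; combined with the new --url flag further down, a one-off debugging run presumably looks like this (the URL is a placeholder):

    make spider ARGS="--url https://example.com/"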
@@ -24,7 +24,7 @@ class TestCertificateChecker(unittest.TestCase):
         result = checker.run()
         self.assertIn(url, result)
         self.assertIsNone(result[url]['exception'])
-        self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')
+        self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited')
 
     def test_tls_v_1_0(self):
         """Load a certificate for a TLS v1.0 server"""

(Comodo CA was rebranded as Sectigo in late 2018, so the issuer organization reported for the test host's certificate changed; this is presumably the CI test failure mentioned in the commit message.)
@@ -7,6 +7,8 @@ from checks import load_feeds
 from checks.config import Config
 from datetime import datetime
 
+from pprint import pprint
+
 @httprettified
 class TestFeed(unittest.TestCase):
 
@@ -56,14 +58,14 @@ class TestFeed(unittest.TestCase):
         checker = load_feeds.Checker(config=config, previous_results=results)
 
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
                 'exception': None,
                 'title': 'Liftoff News',
-                'latest_entry': datetime(2003, 6, 3, 10, 39, 21),
-                'first_entry': datetime(2003, 5, 30, 12, 6, 42),
+                'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
+                'first_entry': datetime(2003, 5, 30, 11, 6, 42),
                 'average_interval': 340359,
                 'num_entries': 2,
             }
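The expected entry timestamps move back one hour (10:39:21 → 9:39:21, 12:06:42 → 11:06:42). The fixture is the well-known "Liftoff News" sample feed, whose pubDate values are given in GMT (e.g. "Tue, 03 Jun 2003 09:39:21 GMT"), so the new expectations read the timestamps as UTC rather than as a locally shifted hour. A minimal sketch of that behavior, assuming the checker parses feeds with feedparser (the library is not named in this diff):

    import feedparser
    from datetime import datetime

    RSS = """<rss version="2.0"><channel><title>Liftoff News</title>
    <item><title>Star City</title>
    <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate></item>
    </channel></rss>"""

    # feedparser normalizes every parsed date to a UTC struct_time, so the
    # GMT pubDate comes back as 09:39:21 rather than a locally shifted hour.
    feed = feedparser.parse(RSS)
    latest_entry = datetime(*feed.entries[0].published_parsed[:6])
    print(latest_entry)  # -> 2003-06-03 09:39:21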
@@ -104,7 +106,7 @@ class TestFeed(unittest.TestCase):
         checker = load_feeds.Checker(config=config, previous_results=results)
 
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
@@ -161,7 +163,7 @@ class TestFeed(unittest.TestCase):
         checker = load_feeds.Checker(config=config, previous_results=results)
 
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
@@ -38,7 +38,7 @@ class TestLoadInBrowser(unittest.TestCase):
             'httpOnly': False,
             'name': 'cookiename',
             'path': '/',
-            'secure': True,
+            'secure': False,
             'value': 'cookievalue'
         }])
 
@@ -80,17 +80,19 @@ class Checker(AbstractChecker):
 
             # if redirects end in www.facebook.com or www.denic.de, remove this URL again
             # remove if redirect target is facebook
-            if result['exception'] is not None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
-                parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
-                if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
-                    result[url]['exception'] = {
+            if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
+                target_url = result['redirect_history'][-1]['redirect_to']
+                parsed = urlparse(target_url)
+                if parsed.netloc in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
+                    result['exception'] = {
                         'type': 'Bad target domain',
-                        'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname,
+                        'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.netloc,
                     }
-                    self.config.remove_url(url)
+                    self.config.remove_url(target_url)
+                    print("Removing URL %s" % target_url)
 
             results[url] = result
 
         return results
 
     def expand_history(self, history):
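The removed lines contained two bugs: the guard only ran when result['exception'] was *not* None, so redirects from successfully fetched URLs were never checked against the bad-domain list, and the exception was written to result[url]['exception'] even though result is already the per-URL dict. The switch from parsed.hostname to parsed.netloc makes no difference for these literal comparisons (netloc additionally carries userinfo and port, hostname is the bare lowercased host). A self-contained sketch of the corrected check; is_bad_redirect_target and BAD_TARGET_DOMAINS are hypothetical names for illustration, not part of the codebase:

    from urllib.parse import urlparse

    # Redirect targets that indicate a parked domain or a platform page
    # rather than a website owned by the organization itself.
    BAD_TARGET_DOMAINS = ('www.facebook.com', 'www.denic.de', 'sedo.com')

    def is_bad_redirect_target(result):
        """Return the final redirect target if it lands on an unsupported
        domain, otherwise None. Mirrors the corrected logic above."""
        # Only a request that succeeded has a trustworthy redirect chain.
        if result['exception'] is not None:
            return None
        history = result.get('redirect_history') or []
        if not history:
            return None
        target_url = history[-1]['redirect_to']
        if urlparse(target_url).netloc in BAD_TARGET_DOMAINS:
            return target_url
        return None

    # Example: a successful request whose redirects end on a domain broker.
    result = {
        'exception': None,
        'redirect_history': [{'redirect_to': 'https://sedo.com/search/?domain=example.org'}],
    }
    print(is_bad_redirect_target(result))  # -> the sedo.com URL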
cli.py (7 changed lines)
@@ -36,6 +36,7 @@ if __name__ == "__main__":
     # spider subcommand
     spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
     spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
+    spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
 
     # jobs subcommand
     jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
@@ -79,4 +80,8 @@ if __name__ == "__main__":
 
     else:
         from spider import spider
-        spider.work_of_queue(datastore_client, args.kind)
+        if args.url:
+            # spider one URL for diagnostic purposes
+            spider.test_url(args.url)
+        else:
+            spider.work_of_queue(datastore_client, args.kind)
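Together with the Makefile change, this makes one-off debugging runs possible without touching the job queue. Outside the container that presumably looks like the following (path and URL are placeholders modeled on the Makefile above):

    python cli.py --credentials-path /secrets/datastore-writer.json --loglevel debug spider --url https://example.com/

Per the docstring of test_url below, such a run only pretty-prints the rating and writes nothing to the datastore.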
@@ -65,6 +65,21 @@ def check_and_rate_site(entry):
     return result
 
 
+def test_url(url):
+    """
+    Run the spider for a single URL and print the result.
+    Doesn't write anything to the database.
+    """
+    logging.info("Crawling URL %s", url)
+
+    # mock job
+    job = {
+        "url": url,
+    }
+
+    result = check_and_rate_site(entry=job)
+    pprint(result['rating'])
+
 def work_of_queue(datastore_client, entity_kind):
     """
     Take job from queue and finish it until there are no more jobs