Browse Source

Fehlerbehebung im url_reachability check (#108)

* Fix detection of redirects to bad domains

* Fix bad domain check

* Add --url flag to spider for faster debugging

* Pass args to make spider

* Add spidering of a single URL for debugging purposes

* Fix tests

* Fix test in CI

* Remove pip upgrade
pull/115/head
Marian Steinbach 3 years ago committed by GitHub
parent
commit
5e8347916c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 4
      .travis.yml
  2. 2
      Makefile
  3. 2
      checks/certificate_test.py
  4. 12
      checks/load_feeds_test.py
  5. 2
      checks/load_in_browser_test.py
  6. 16
      checks/url_reachability.py
  7. 7
      cli.py
  8. 15
      spider/spider.py

4
.travis.yml

@ -11,7 +11,5 @@ python:
- "3.6"
script:
- pip install --upgrade pip
- pip install --upgrade codecov
- make
- make test
- codecov

2
Makefile

@ -25,7 +25,7 @@ spider:
$(IMAGE) \
--credentials-path /secrets/datastore-writer.json \
--loglevel debug \
spider --kind $(DB_ENTITY)
spider --kind $(DB_ENTITY) ${ARGS}
export:
docker run --rm -ti \

2
checks/certificate_test.py

@ -24,7 +24,7 @@ class TestCertificateChecker(unittest.TestCase):
result = checker.run()
self.assertIn(url, result)
self.assertIsNone(result[url]['exception'])
self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')
self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited')
def test_tls_v_1_0(self):
"""Load a certificate for a TLS v1.0 server"""

12
checks/load_feeds_test.py

@ -7,6 +7,8 @@ from checks import load_feeds
from checks.config import Config
from datetime import datetime
from pprint import pprint
@httprettified
class TestFeed(unittest.TestCase):
@ -56,14 +58,14 @@ class TestFeed(unittest.TestCase):
checker = load_feeds.Checker(config=config, previous_results=results)
result = checker.run()
print(result)
pprint(result)
self.assertEqual(result, {
'http://example.com/feed.xml': {
'exception': None,
'title': 'Liftoff News',
'latest_entry': datetime(2003, 6, 3, 10, 39, 21),
'first_entry': datetime(2003, 5, 30, 12, 6, 42),
'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
'first_entry': datetime(2003, 5, 30, 11, 6, 42),
'average_interval': 340359,
'num_entries': 2,
}
@ -104,7 +106,7 @@ class TestFeed(unittest.TestCase):
checker = load_feeds.Checker(config=config, previous_results=results)
result = checker.run()
print(result)
pprint(result)
self.assertEqual(result, {
'http://example.com/feed.xml': {
@ -161,7 +163,7 @@ class TestFeed(unittest.TestCase):
checker = load_feeds.Checker(config=config, previous_results=results)
result = checker.run()
print(result)
pprint(result)
self.assertEqual(result, {
'http://example.com/feed.xml': {

2
checks/load_in_browser_test.py

@ -38,7 +38,7 @@ class TestLoadInBrowser(unittest.TestCase):
'httpOnly': False,
'name': 'cookiename',
'path': '/',
'secure': True,
'secure': False,
'value': 'cookievalue'
}])

16
checks/url_reachability.py

@ -80,17 +80,19 @@ class Checker(AbstractChecker):
# if redirects end in www.facebook.com or www.denic.de, remove this URL again
# remove if redirect target is facebook
if result['exception'] is not None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
result[url]['exception'] = {
if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
target_url = result['redirect_history'][-1]['redirect_to']
parsed = urlparse(target_url)
if parsed.netloc in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
result['exception'] = {
'type': 'Bad target domain',
'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname,
'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.netloc,
}
self.config.remove_url(url)
self.config.remove_url(target_url)
print("Removing URL %s" % target_url)
results[url] = result
return results
def expand_history(self, history):

7
cli.py

@ -36,6 +36,7 @@ if __name__ == "__main__":
# spider subcommand
spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
# jobs subcommand
jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
@ -79,4 +80,8 @@ if __name__ == "__main__":
else:
from spider import spider
spider.work_of_queue(datastore_client, args.kind)
if args.url:
# spider one URL for diagnostic purposes
spider.test_url(args.url)
else:
spider.work_of_queue(datastore_client, args.kind)

15
spider/spider.py

@ -65,6 +65,21 @@ def check_and_rate_site(entry):
return result
def test_url(url):
    """
    Spider exactly one URL and pretty-print the resulting rating.

    Meant for testing/debugging: per the original description, nothing
    is written to the database.
    """
    logging.info("Crawling URL %s", url)

    # Stand-in for a queue job; the URL is the only field supplied.
    outcome = check_and_rate_site(entry={"url": url})

    pprint(outcome['rating'])
def work_of_queue(datastore_client, entity_kind):
"""
Take job from queue and finish it until there are no more jobs

Loading…
Cancel
Save