Bug fix in the url_reachability check (#108)
* Fix detection of redirects to bad domains
* Fix bad domain check
* Add --url flag to spider for faster debugging
* Pass args to make spider
* Add spidering of a single URL for debugging purposes
* Fix tests
* Fix test in CI
* Remove pip upgrade
parent 2dfcf61cc0
commit 5e8347916c
@@ -11,7 +11,5 @@ python:
 - "3.6"
 
 script:
-- pip install --upgrade pip
-- pip install --upgrade codecov
+- make
 - make test
-- codecov
Makefile (2 changed lines)
@@ -25,7 +25,7 @@ spider:
 		$(IMAGE) \
 		--credentials-path /secrets/datastore-writer.json \
 		--loglevel debug \
-		spider --kind $(DB_ENTITY)
+		spider --kind $(DB_ENTITY) ${ARGS}
 
 export:
 	docker run --rm -ti \
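In GNU make, ${ARGS} and $(ARGS) are equivalent variable references, and an unset ARGS expands to nothing, so existing invocations keep working. The variable can be set per invocation on the command line; combined with the new --url flag further down, a one-off debugging run presumably looks like this (the URL is a placeholder):

    make spider ARGS="--url https://example.com/"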
@@ -24,7 +24,7 @@ class TestCertificateChecker(unittest.TestCase):
         result = checker.run()
         self.assertIn(url, result)
         self.assertIsNone(result[url]['exception'])
-        self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')
+        self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited')
 
     def test_tls_v_1_0(self):
         """Load a certificate for a TLS v1.0 server"""

(Comodo CA was rebranded as Sectigo in late 2018, so the issuer organization reported for the test host's certificate changed; this is presumably the CI test failure mentioned in the commit message.)
@@ -7,6 +7,8 @@ from checks import load_feeds
 from checks.config import Config
 from datetime import datetime
 
+from pprint import pprint
+
 @httprettified
 class TestFeed(unittest.TestCase):
 
@@ -56,14 +58,14 @@ class TestFeed(unittest.TestCase):
         checker = load_feeds.Checker(config=config, previous_results=results)
 
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
                 'exception': None,
                 'title': 'Liftoff News',
-                'latest_entry': datetime(2003, 6, 3, 10, 39, 21),
-                'first_entry': datetime(2003, 5, 30, 12, 6, 42),
+                'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
+                'first_entry': datetime(2003, 5, 30, 11, 6, 42),
                 'average_interval': 340359,
                 'num_entries': 2,
             }
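The expected entry timestamps move back one hour (10:39:21 → 9:39:21, 12:06:42 → 11:06:42). The fixture is the well-known "Liftoff News" sample feed, whose pubDate values are given in GMT (e.g. "Tue, 03 Jun 2003 09:39:21 GMT"), so the new expectations read the timestamps as UTC rather than as a locally shifted hour. A minimal sketch of that behavior, assuming the checker parses feeds with feedparser (the library is not named in this diff):

    import feedparser
    from datetime import datetime

    RSS = """<rss version="2.0"><channel><title>Liftoff News</title>
    <item><title>Star City</title>
    <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate></item>
    </channel></rss>"""

    # feedparser normalizes every parsed date to a UTC struct_time, so the
    # GMT pubDate comes back as 09:39:21 rather than a locally shifted hour.
    feed = feedparser.parse(RSS)
    latest_entry = datetime(*feed.entries[0].published_parsed[:6])
    print(latest_entry)  # -> 2003-06-03 09:39:21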
@@ -104,7 +106,7 @@ class TestFeed(unittest.TestCase):
         checker = load_feeds.Checker(config=config, previous_results=results)
 
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
@@ -161,7 +163,7 @@ class TestFeed(unittest.TestCase):
         checker = load_feeds.Checker(config=config, previous_results=results)
 
         result = checker.run()
-        print(result)
+        pprint(result)
 
         self.assertEqual(result, {
             'http://example.com/feed.xml': {
@@ -38,7 +38,7 @@ class TestLoadInBrowser(unittest.TestCase):
             'httpOnly': False,
             'name': 'cookiename',
             'path': '/',
-            'secure': True,
+            'secure': False,
             'value': 'cookievalue'
         }])
 
@@ -80,17 +80,19 @@ class Checker(AbstractChecker):
 
             # if redirects end in www.facebook.com or www.denic.de, remove this URL again
             # remove if redirect target is facebook
-            if result['exception'] is not None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
-                parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
-                if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
-                    result[url]['exception'] = {
+            if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
+                target_url = result['redirect_history'][-1]['redirect_to']
+                parsed = urlparse(target_url)
+                if parsed.netloc in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
+                    result['exception'] = {
                         'type': 'Bad target domain',
-                        'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname,
+                        'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.netloc,
                     }
-                    self.config.remove_url(url)
+                    self.config.remove_url(target_url)
+                    print("Removing URL %s" % target_url)
 
             results[url] = result
 
         return results
 
     def expand_history(self, history):
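The removed lines contained two bugs: the guard only ran when result['exception'] was *not* None, so redirects from successfully fetched URLs were never checked against the bad-domain list, and the exception was written to result[url]['exception'] even though result is already the per-URL dict. The switch from parsed.hostname to parsed.netloc makes no difference for these literal comparisons (netloc additionally carries userinfo and port, hostname is the bare lowercased host). A self-contained sketch of the corrected check; is_bad_redirect_target and BAD_TARGET_DOMAINS are hypothetical names for illustration, not part of the codebase:

    from urllib.parse import urlparse

    # Redirect targets that indicate a parked domain or a platform page
    # rather than a website owned by the organization itself.
    BAD_TARGET_DOMAINS = ('www.facebook.com', 'www.denic.de', 'sedo.com')

    def is_bad_redirect_target(result):
        """Return the final redirect target if it lands on an unsupported
        domain, otherwise None. Mirrors the corrected logic above."""
        # Only a request that succeeded has a trustworthy redirect chain.
        if result['exception'] is not None:
            return None
        history = result.get('redirect_history') or []
        if not history:
            return None
        target_url = history[-1]['redirect_to']
        if urlparse(target_url).netloc in BAD_TARGET_DOMAINS:
            return target_url
        return None

    # Example: a successful request whose redirects end on a domain broker.
    result = {
        'exception': None,
        'redirect_history': [{'redirect_to': 'https://sedo.com/search/?domain=example.org'}],
    }
    print(is_bad_redirect_target(result))  # -> the sedo.com URL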
cli.py (7 changed lines)
@@ -36,6 +36,7 @@ if __name__ == "__main__":
     # spider subcommand
     spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
     spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
+    spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
 
     # jobs subcommand
     jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
@@ -79,4 +80,8 @@ if __name__ == "__main__":
 
     else:
         from spider import spider
-        spider.work_of_queue(datastore_client, args.kind)
+        if args.url:
+            # spider one URL for diagnostic purposes
+            spider.test_url(args.url)
+        else:
+            spider.work_of_queue(datastore_client, args.kind)
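Together with the Makefile change, this makes one-off debugging runs possible without touching the job queue. Outside the container that presumably looks like the following (path and URL are placeholders modeled on the Makefile above):

    python cli.py --credentials-path /secrets/datastore-writer.json --loglevel debug spider --url https://example.com/

Per the docstring of test_url below, such a run only pretty-prints the rating and writes nothing to the datastore.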
@@ -65,6 +65,21 @@ def check_and_rate_site(entry):
     return result
 
 
+def test_url(url):
+    """
+    Run the spider for a single URL and print the result.
+    Doesn't write anything to the database.
+    """
+    logging.info("Crawling URL %s", url)
+
+    # mock job
+    job = {
+        "url": url,
+    }
+
+    result = check_and_rate_site(entry=job)
+    pprint(result['rating'])
+
 def work_of_queue(datastore_client, entity_kind):
     """
     Take job from queue and finish it until there are no more jobs