mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-04-26 06:20:06 +02:00
Fehlerbehebung im url_reachability check (#108)
* Fix detection of redirects to bad domains
* Fix bad domain check
* Add --url flag to spider for faster debugging
* Pass args to make spider
* Add spidering of a single URL for debugging purposes
* Fix tests
* Fix test in CI
* Remove pip upgrade
This commit is contained in:
parent
2dfcf61cc0
commit
5e8347916c
|
@ -11,7 +11,5 @@ python:
|
|||
- "3.6"
|
||||
|
||||
script:
|
||||
- pip install --upgrade pip
|
||||
- pip install --upgrade codecov
|
||||
- make
|
||||
- make test
|
||||
- codecov
|
||||
|
|
2
Makefile
2
Makefile
|
@ -25,7 +25,7 @@ spider:
|
|||
$(IMAGE) \
|
||||
--credentials-path /secrets/datastore-writer.json \
|
||||
--loglevel debug \
|
||||
spider --kind $(DB_ENTITY)
|
||||
spider --kind $(DB_ENTITY) ${ARGS}
|
||||
|
||||
export:
|
||||
docker run --rm -ti \
|
||||
|
|
|
@ -24,7 +24,7 @@ class TestCertificateChecker(unittest.TestCase):
|
|||
result = checker.run()
|
||||
self.assertIn(url, result)
|
||||
self.assertIsNone(result[url]['exception'])
|
||||
self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')
|
||||
self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited')
|
||||
|
||||
def test_tls_v_1_0(self):
|
||||
"""Load a certificate for a TLS v1.0 server"""
|
||||
|
|
|
@ -7,6 +7,8 @@ from checks import load_feeds
|
|||
from checks.config import Config
|
||||
from datetime import datetime
|
||||
|
||||
from pprint import pprint
|
||||
|
||||
@httprettified
|
||||
class TestFeed(unittest.TestCase):
|
||||
|
||||
|
@ -56,14 +58,14 @@ class TestFeed(unittest.TestCase):
|
|||
checker = load_feeds.Checker(config=config, previous_results=results)
|
||||
|
||||
result = checker.run()
|
||||
print(result)
|
||||
pprint(result)
|
||||
|
||||
self.assertEqual(result, {
|
||||
'http://example.com/feed.xml': {
|
||||
'exception': None,
|
||||
'title': 'Liftoff News',
|
||||
'latest_entry': datetime(2003, 6, 3, 10, 39, 21),
|
||||
'first_entry': datetime(2003, 5, 30, 12, 6, 42),
|
||||
'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
|
||||
'first_entry': datetime(2003, 5, 30, 11, 6, 42),
|
||||
'average_interval': 340359,
|
||||
'num_entries': 2,
|
||||
}
|
||||
|
@ -104,7 +106,7 @@ class TestFeed(unittest.TestCase):
|
|||
checker = load_feeds.Checker(config=config, previous_results=results)
|
||||
|
||||
result = checker.run()
|
||||
print(result)
|
||||
pprint(result)
|
||||
|
||||
self.assertEqual(result, {
|
||||
'http://example.com/feed.xml': {
|
||||
|
@ -161,7 +163,7 @@ class TestFeed(unittest.TestCase):
|
|||
checker = load_feeds.Checker(config=config, previous_results=results)
|
||||
|
||||
result = checker.run()
|
||||
print(result)
|
||||
pprint(result)
|
||||
|
||||
self.assertEqual(result, {
|
||||
'http://example.com/feed.xml': {
|
||||
|
|
|
@ -38,7 +38,7 @@ class TestLoadInBrowser(unittest.TestCase):
|
|||
'httpOnly': False,
|
||||
'name': 'cookiename',
|
||||
'path': '/',
|
||||
'secure': True,
|
||||
'secure': False,
|
||||
'value': 'cookievalue'
|
||||
}])
|
||||
|
||||
|
|
|
@ -80,14 +80,16 @@ class Checker(AbstractChecker):
|
|||
|
||||
# if redirects end in www.facebook.com or www.denic.de, remove this URL again
|
||||
# remove if redirect target is facebook
|
||||
if result['exception'] is not None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
|
||||
parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
|
||||
if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
|
||||
result[url]['exception'] = {
|
||||
if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
|
||||
target_url = result['redirect_history'][-1]['redirect_to']
|
||||
parsed = urlparse(target_url)
|
||||
if parsed.netloc in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
|
||||
result['exception'] = {
|
||||
'type': 'Bad target domain',
|
||||
'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname,
|
||||
'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.netloc,
|
||||
}
|
||||
self.config.remove_url(url)
|
||||
self.config.remove_url(target_url)
|
||||
print("Removing URL %s" % target_url)
|
||||
|
||||
results[url] = result
|
||||
|
||||
|
|
5
cli.py
5
cli.py
|
@ -36,6 +36,7 @@ if __name__ == "__main__":
|
|||
# spider subcommand
|
||||
spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
|
||||
spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
|
||||
spider_parser.add_argument('--url', help='Spider a URL instead of using jobs from the queue. For testing/debugging only.')
|
||||
|
||||
# jobs subcommand
|
||||
jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
|
||||
|
@ -79,4 +80,8 @@ if __name__ == "__main__":
|
|||
|
||||
else:
|
||||
from spider import spider
|
||||
if args.url:
|
||||
# spider one URL for diagnostic purposes
|
||||
spider.test_url(args.url)
|
||||
else:
|
||||
spider.work_of_queue(datastore_client, args.kind)
|
||||
|
|
|
@ -65,6 +65,21 @@ def check_and_rate_site(entry):
|
|||
return result
|
||||
|
||||
|
||||
def test_url(url):
    """
    Check and rate a single URL, then pretty-print its rating.

    Intended for debugging: the URL is wrapped in a minimal mock job
    (a dict holding only the ``url`` key) and handed to
    ``check_and_rate_site``. This function itself performs no datastore
    writes; it only logs the URL and prints the ``rating`` entry of the
    returned result.
    """
    logging.info("Crawling URL %s", url)

    # Stand-in for a job that would normally be taken off the queue.
    mock_job = dict(url=url)

    outcome = check_and_rate_site(entry=mock_job)
    rating = outcome['rating']
    pprint(rating)
|
||||
|
||||
def work_of_queue(datastore_client, entity_kind):
|
||||
"""
|
||||
Take job from queue and finish it until there are no more jobs
|
||||
|
|
Loading…
Reference in a new issue