green-spider/checks/url_reachability.py

"""
This check verifies whether the URLs in config are reachable.
Some additional information regarding redirects and SSL problems
is also recorded and returned as results.

Non-accessible URLs are removed from config.urls.

A redirect to facebook.com is not considered reachable, as that
leads to a different website in the sense of this system.
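
Illustrative shape of a single entry in the returned results dict
(the values here are invented for the example):

    "https://example.com/": {
        "url": "https://example.com/",
        "redirect_history": [
            {"status": 301, "duration": 180,
             "redirect_to": "https://www.example.com/"}
        ],
        "status": 200,
        "exception": None,
        "duration": 320,
    }
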
TODO: Parallelize the work done in this test
"""
import logging
from urllib.parse import urlparse

import requests

from checks.abstract_checker import AbstractChecker


class Checker(AbstractChecker):
    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)
    def run(self):
        headers = {
            "User-Agent": self.config.user_agent
        }

        results = {}
        urls = list(self.config.urls)

        for url in urls:
            logging.debug("Checking URL reachability for %s", url)

            result = {
                "url": url,
                "redirect_history": [],
                "status": None,
                "exception": None,
                "duration": None,
            }

            # Perform a HEAD request, following redirects and recording them.
            try:
                r = requests.head(url, headers=headers, allow_redirects=True)
                result['status'] = r.status_code
                result['duration'] = round(r.elapsed.total_seconds() * 1000)
                if len(r.history):
                    result['redirect_history'] = self.expand_history(r.history)
                    logging.debug("Redirects: %r", result['redirect_history'])

                if r.url == url:
                    logging.debug("URL: %s - status %s", url, r.status_code)
                else:
                    logging.debug("URL: %s - status %s - redirects to %s", url,
                                  r.status_code, r.url)
                    # Replace the source URL with the redirect target in config.urls.
                    self.config.remove_url(url)
                    self.config.add_url(r.url)
                # Remove URLs answering with a client or server error (404 etc.).
                if r.status_code >= 400:
                    self.config.remove_url(url)
            except Exception as exc:
                logging.info("Exception for URL %s: %s %s", url, str(type(exc)), exc)
                result['exception'] = {
                    'type': str(type(exc)),
                    'message': str(exc),
                }

                # Remove the URL to prevent further checks on an unreachable URL.
                self.config.remove_url(url)

            # If the redirect chain of a successful request ends at an unsupported
            # host such as www.facebook.com or www.denic.de, remove this URL as well.
            if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
                parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
                if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
                    result['exception'] = {
                        'type': 'Bad target domain',
                        'message': ('The URL redirects to %s, which is unsupported by '
                                    'green-spider as it doesn\'t qualify as an owned '
                                    'website' % parsed.hostname),
                    }
                    self.config.remove_url(url)

            results[url] = result

        return results
    def expand_history(self, history):
        """Extracts primitives from a list of requests.Response objects."""
        items = []
        for h in history:
            item = {
                'status': h.status_code,
                'duration': round(h.elapsed.total_seconds() * 1000),
                'redirect_to': h.headers['location'],
            }
            items.append(item)
        return items
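

# Minimal usage sketch (illustrative, not part of the original module): in
# green-spider the config object is supplied by the spider setup. `StubConfig`
# below is a hypothetical stand-in that implements only the attributes and
# methods this checker touches (urls, user_agent, add_url, remove_url), and
# the user agent string is invented.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    class StubConfig:
        def __init__(self, urls, user_agent):
            self.urls = list(urls)
            self.user_agent = user_agent

        def add_url(self, url):
            if url not in self.urls:
                self.urls.append(url)

        def remove_url(self, url):
            if url in self.urls:
                self.urls.remove(url)

    config = StubConfig(["https://www.gruene.de/"], "green-spider/1.0")
    print(Checker(config).run())
    print("Remaining URLs:", config.urls)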