green-spider/checks/url_reachability.py

"""
This check verifies whether the URLs in config are reachable.
Some additional information regarding redirects and SSL problems
is also recorded and returned as results.

Non-accessible URLs are removed from config.urls.

A redirect to facebook.com is not considered reachable, as that
leads to a different website in the sense of this system.
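
Illustrative shape of a single entry in the returned results dict
(the values here are invented for the example):

    "https://example.com/": {
        "url": "https://example.com/",
        "redirect_history": [
            {"status": 301, "duration": 180,
             "redirect_to": "https://www.example.com/"}
        ],
        "status": 200,
        "exception": None,
        "duration": 320,
    }
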
TODO: Parallelize the work done in this test
"""
import logging
from urllib.parse import urlparse

import requests

from checks.abstract_checker import AbstractChecker


class Checker(AbstractChecker):
    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)
    def run(self):
        headers = {
            "User-Agent": self.config.user_agent
        }

        results = {}
        urls = list(self.config.urls)

        for url in urls:
            logging.debug("Checking URL reachability for %s", url)

            result = {
                "url": url,
                "redirect_history": [],
                "status": None,
                "exception": None,
                "duration": None,
            }

            # Perform a HEAD request, following redirects and recording them.
            try:
                r = requests.head(url, headers=headers, allow_redirects=True)
                result['status'] = r.status_code
                result['duration'] = round(r.elapsed.total_seconds() * 1000)
                if len(r.history):
                    result['redirect_history'] = self.expand_history(r.history)
                    logging.debug("Redirects: %r", result['redirect_history'])

                if r.url == url:
                    logging.debug("URL: %s - status %s", url, r.status_code)
                else:
                    logging.debug("URL: %s - status %s - redirects to %s", url,
                                  r.status_code, r.url)
                    # Replace the source URL with the redirect target in config.urls.
                    self.config.remove_url(url)
                    self.config.add_url(r.url)
                # Remove URLs answering with a client or server error (404 etc.).
                if r.status_code >= 400:
                    self.config.remove_url(url)
            except Exception as exc:
                logging.info("Exception for URL %s: %s %s", url, str(type(exc)), exc)
                result['exception'] = {
                    'type': str(type(exc)),
                    'message': str(exc),
                }

                # Remove the URL to prevent further checks on an unreachable URL.
                self.config.remove_url(url)

            # If the redirect chain of a successful request ends at an unsupported
            # host such as www.facebook.com or www.denic.de, remove this URL as well.
            if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
                parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
                if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
                    result['exception'] = {
                        'type': 'Bad target domain',
                        'message': ('The URL redirects to %s, which is unsupported by '
                                    'green-spider as it doesn\'t qualify as an owned '
                                    'website' % parsed.hostname),
                    }
                    self.config.remove_url(url)

            results[url] = result

        return results
    def expand_history(self, history):
        """Extracts primitives from a list of requests.Response objects."""
        items = []
        for h in history:
            item = {
                'status': h.status_code,
                'duration': round(h.elapsed.total_seconds() * 1000),
                'redirect_to': h.headers['location'],
            }
            items.append(item)
        return items
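

# Minimal usage sketch (illustrative, not part of the original module): in
# green-spider the config object is supplied by the spider setup. `StubConfig`
# below is a hypothetical stand-in that implements only the attributes and
# methods this checker touches (urls, user_agent, add_url, remove_url), and
# the user agent string is invented.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    class StubConfig:
        def __init__(self, urls, user_agent):
            self.urls = list(urls)
            self.user_agent = user_agent

        def add_url(self, url):
            if url not in self.urls:
                self.urls.append(url)

        def remove_url(self, url):
            if url in self.urls:
                self.urls.remove(url)

    config = StubConfig(["https://www.gruene.de/"], "green-spider/1.0")
    print(Checker(config).run())
    print("Remaining URLs:", config.urls)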