From ae6a2e83e92e44378a54fb10e848e95927968c0c Mon Sep 17 00:00:00 2001 From: Marian Steinbach Date: Wed, 3 Oct 2018 11:05:42 +0200 Subject: [PATCH] Refactor and modularize spider (#70) See PR description for details --- .dockerignore | 1 + .gitignore | 1 + .travis.yml | 7 + Dockerfile | 19 +- Makefile | 31 +- checks/__init__.py | 64 +++ checks/abstract_checker.py | 23 + checks/certificate.py | 62 ++ checks/certificate_test.py | 27 + checks/charset.py | 77 +++ checks/charset_test.py | 49 ++ checks/config.py | 29 + checks/dns_resolution.py | 55 ++ checks/domain_variations.py | 44 ++ checks/duplicate_content.py | 107 ++++ checks/generator.py | 76 +++ checks/html_head.py | 152 +++++ checks/http_and_https.py | 27 + checks/load_in_browser.py | 134 +++++ checks/page_content.py | 94 ++++ checks/url_canonicalization.py | 13 + checks/url_reachability.py | 104 ++++ checks/url_reachability_test.py | 71 +++ cli.py | 83 +++ config/__init__.py | 23 + devops/run-job.sh | 33 +- data_export.py => export/__init__.py | 89 +-- jobs/__init__.py | 180 ++++++ rating/__init__.py | 53 ++ rating/abstract_rater.py | 22 + rating/canonical_url.py | 31 + rating/favicon.py | 32 ++ rating/feeds.py | 35 ++ rating/https.py | 47 ++ rating/no_network_errors.py | 48 ++ rating/no_script_errors.py | 42 ++ rating/reachable.py | 36 ++ rating/resolvable.py | 35 ++ rating/response_duration.py | 46 ++ rating/responsive_layout.py | 35 ++ rating/use_specific_fonts.py | 41 ++ rating/www_optional.py | 44 ++ spider.py | 814 --------------------------- spider/__init__.py | 0 spider/spider.py | 106 ++++ spider/spider_test.py | 26 + spider_test.py | 125 ---- 47 files changed, 2289 insertions(+), 1004 deletions(-) create mode 100644 checks/__init__.py create mode 100644 checks/abstract_checker.py create mode 100644 checks/certificate.py create mode 100644 checks/certificate_test.py create mode 100644 checks/charset.py create mode 100644 checks/charset_test.py create mode 100644 checks/config.py create mode 100644 checks/dns_resolution.py create mode 100644 checks/domain_variations.py create mode 100644 checks/duplicate_content.py create mode 100644 checks/generator.py create mode 100644 checks/html_head.py create mode 100644 checks/http_and_https.py create mode 100644 checks/load_in_browser.py create mode 100644 checks/page_content.py create mode 100644 checks/url_canonicalization.py create mode 100644 checks/url_reachability.py create mode 100644 checks/url_reachability_test.py create mode 100644 cli.py create mode 100644 config/__init__.py rename data_export.py => export/__init__.py (56%) create mode 100644 jobs/__init__.py create mode 100644 rating/__init__.py create mode 100644 rating/abstract_rater.py create mode 100644 rating/canonical_url.py create mode 100644 rating/favicon.py create mode 100644 rating/feeds.py create mode 100644 rating/https.py create mode 100644 rating/no_network_errors.py create mode 100644 rating/no_script_errors.py create mode 100644 rating/reachable.py create mode 100644 rating/resolvable.py create mode 100644 rating/response_duration.py create mode 100644 rating/responsive_layout.py create mode 100644 rating/use_specific_fonts.py create mode 100644 rating/www_optional.py delete mode 100644 spider.py create mode 100644 spider/__init__.py create mode 100644 spider/spider.py create mode 100644 spider/spider_test.py delete mode 100644 spider_test.py diff --git a/.dockerignore b/.dockerignore index ae2f869..e5d5555 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,3 +4,4 @@ docs secrets temp venv +/export-* diff 
--git a/.gitignore b/.gitignore index 1d3b3da..a536d3d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__ .vscode/settings.json webapp/dist/bundle.js dev-shm +/export-* diff --git a/.travis.yml b/.travis.yml index df1ac71..bd97de9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,5 +6,12 @@ services: notifications: email: false +language: python +python: + - "3.6" + script: + - pip install --upgrade pip + - pip install --upgrade codecov - make test + - codecov diff --git a/Dockerfile b/Dockerfile index a53042b..186ac63 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,17 +1,20 @@ -FROM python:3.6-alpine3.7 +FROM python:3.6-alpine3.8 # Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296 RUN echo "http://dl-4.alpinelinux.org/alpine/v3.7/main" >> /etc/apk/repositories && \ echo "http://dl-4.alpinelinux.org/alpine/v3.7/community" >> /etc/apk/repositories && \ apk update && \ - apk --no-cache add chromium chromium-chromedriver python3-dev build-base git && \ + apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \ pip3 install --upgrade pip && \ - pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \ + pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \ apk del python3-dev build-base -ADD spider.py / -ADD spider_test.py / -ADD data_export.py / +ADD cli.py / +ADD config /config +ADD jobs /jobs +ADD checks /checks +ADD rating /rating +ADD spider /spider +ADD export /export -ENTRYPOINT ["python3"] -CMD ["/spider.py"] +ENTRYPOINT ["python3", "/cli.py"] diff --git a/Makefile b/Makefile index 075496d..9b85fa4 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,20 @@ +IMAGE := quay.io/netzbegruenung/green-spider:latest +DB_ENTITY := spider-results .PHONY: dockerimage # Build docker image dockerimage: - docker build -t quay.io/netzbegruenung/green-spider:latest . + docker build -t $(IMAGE) . 
# Create spider job queue spiderjobs: dockerimage docker run --rm -ti \ -v $(PWD)/secrets:/secrets \ - quay.io/netzbegruenung/green-spider:latest spider.py \ + $(IMAGE) \ --credentials-path /secrets/datastore-writer.json \ - --loglevel debug \ + --loglevel info \ jobs # Run spider in docker image @@ -21,11 +23,26 @@ spider: dockerimage -v $(PWD)/dev-shm:/dev/shm \ -v $(PWD)/webapp/dist/data:/out \ -v $(PWD)/secrets:/secrets \ - quay.io/netzbegruenung/green-spider:latest spider.py \ + $(IMAGE) \ --credentials-path /secrets/datastore-writer.json \ - --loglevel info \ - spider + --loglevel debug \ + spider --kind $(DB_ENTITY) + +export: dockerimage + docker run --rm -ti \ + -v $(PWD)/export-json:/out \ + -v $(PWD)/secrets:/secrets \ + -v $(PWD)/export-siteicons:/icons \ + $(IMAGE) \ + --credentials-path /secrets/datastore-reader.json \ + --loglevel debug \ + export --kind $(DB_ENTITY) # run spider tests +# FIXME test: dockerimage - docker run --rm -ti quay.io/netzbegruenung/green-spider:latest /spider_test.py + docker run --rm -ti \ + --entrypoint "python3" \ + $(IMAGE) \ + -m unittest discover -p '*_test.py' + diff --git a/checks/__init__.py b/checks/__init__.py new file mode 100644 index 0000000..cc2d0f0 --- /dev/null +++ b/checks/__init__.py @@ -0,0 +1,64 @@ +""" +The checks module contains the functionality to get information and test certain +functionality of a site or individual pages. +""" + +import logging + +from checks import charset +from checks import certificate +from checks import dns_resolution +from checks import duplicate_content +from checks import domain_variations +from checks import generator +from checks import html_head +from checks import http_and_https +from checks import page_content +from checks import load_in_browser +from checks import url_reachability +from checks import url_canonicalization + +from checks.config import Config + + +def perform_checks(input_url): + """ + Executes all our URL/site checks and returns a big-ass result dict. + """ + + # The sequence of checks to run. Order is important! + # Checks which expand the URLs list must come first. + # After that, dependencies (encoded in the checks) have to be fulfilled. 
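+ # Each entry below maps a result key to a module that provides a Checker class + # (a subclass of checks.abstract_checker.AbstractChecker). Checkers run in this + # order, and each one receives the results accumulated by those before it.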
+ check_modules = [ + ('domain_variations', domain_variations), + ('http_and_https', http_and_https), + ('dns_resolution', dns_resolution), + ('url_reachability', url_reachability), + ('certificate', certificate), + ('url_canonicalization', url_canonicalization), + ('page_content', page_content), + ('duplicate_content', duplicate_content), + ('charset', charset), + ('html_head', html_head), + ('generator', generator), + ('load_in_browser', load_in_browser), + ] + + results = {} + + config = Config(urls=[input_url], + user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' + + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' + + 'Safari/537.36 green-spider/0.2') + + for check_name, check in check_modules: + checker = check.Checker(config=config, + previous_results=results) + result = checker.run() + results[check_name] = result + + # update config for the next check + config = checker.config + logging.debug("config after check %s: %r" % (check_name, config)) + + return results diff --git a/checks/abstract_checker.py b/checks/abstract_checker.py new file mode 100644 index 0000000..e9db12a --- /dev/null +++ b/checks/abstract_checker.py @@ -0,0 +1,23 @@ +class AbstractChecker(object): + """ + Our blueprint for checks + """ + + def __init__(self, config, previous_results=None): + self._config = config + + # A dictionary of results from previous checkers. + # Key is the name of the checker that has generated the result. + self._previous_results = previous_results + + def run(self): + """Executes the check routine, returns result dict""" + raise NotImplementedError() + + @property + def config(self): + return self._config + + @property + def previous_results(self): + return self._previous_results diff --git a/checks/certificate.py b/checks/certificate.py new file mode 100644 index 0000000..2539963 --- /dev/null +++ b/checks/certificate.py @@ -0,0 +1,62 @@ +""" +Gathers information on the TLS/SSL certificate used by a server +""" + +from urllib.parse import urlparse +import logging +import ssl +from datetime import datetime +from datetime import timezone + +from OpenSSL import crypto + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + results = {} + + for url in self.config.urls: + if url.startswith('https://'): + results[url] = self.get_certificate(url) + + return results + + def get_certificate(self, url): + result = { + 'exception': None, + 'serial_number': None, + 'subject': None, + 'issuer': None, + 'not_before': None, + 'not_after': None + } + + parsed = urlparse(url) + try: + cert = ssl.get_server_certificate((parsed.hostname, 443)) + x509 = crypto.load_certificate(crypto.FILETYPE_PEM, cert) + result['serial_number'] = str(x509.get_serial_number()) + + nb = x509.get_notBefore().decode('utf-8') + na = x509.get_notAfter().decode('utf-8') + + # parse '2018 06 27 00 00 00Z' + result['not_before'] = datetime(int(nb[0:4]), int(nb[4:6]), int(nb[6:8]), int(nb[8:10]), int(nb[10:12]), int(nb[12:14]), tzinfo=timezone.utc).isoformat() + result['not_after'] = datetime(int(na[0:4]), int(na[4:6]), int(na[6:8]), int(na[8:10]), int(na[10:12]), int(na[12:14]), tzinfo=timezone.utc).isoformat() + + # decode and convert from bytes to unicode + result['subject'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for tup in x509.get_subject().get_components()]) + result['issuer'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for 
tup in x509.get_issuer().get_components()]) + + except Exception as e: + result['exception'] = { + 'type': str(type(e)), + 'message': str(e), + } + logging.warning("Error when getting certificate for %s: %r" % (url, e)) + + return result diff --git a/checks/certificate_test.py b/checks/certificate_test.py new file mode 100644 index 0000000..66c2288 --- /dev/null +++ b/checks/certificate_test.py @@ -0,0 +1,27 @@ +from checks import certificate +from checks.config import Config +import unittest + +class TestCertificateChecker(unittest.TestCase): + + def test_google(self): + url = 'https://www.google.com/' + config = Config(urls=[url]) + checker = certificate.Checker(config=config, previous_results={}) + result = checker.run() + self.assertIn(url, result) + self.assertIsNone(result[url]['exception']) + self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services') + + def test_kaarst(self): + url = 'https://www.gruenekaarst.de/' + config = Config(urls=[url]) + checker = certificate.Checker(config=config, previous_results={}) + result = checker.run() + self.assertIn(url, result) + self.assertIsNone(result[url]['exception']) + self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited') + + +if __name__ == '__main__': + unittest.main() diff --git a/checks/charset.py b/checks/charset.py new file mode 100644 index 0000000..0851dd2 --- /dev/null +++ b/checks/charset.py @@ -0,0 +1,77 @@ +""" +Checks which character set a page has. + +TODO: Check for http-equiv meta tags like + +""" + +import logging + +from bs4 import BeautifulSoup + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + assert 'page_content' in self.previous_results + + results = {} + + for url in self.config.urls: + results[url] = self.get_charset(url) + + return results + + def get_charset(self, url): + """ + Expects page_content_dict['content'] to carry the HTML content + """ + page_content = self.previous_results['page_content'][url] + assert 'content' in page_content + assert 'response_headers' in page_content + logging.debug("%r", page_content['response_headers']) + assert 'content-type' in page_content['response_headers'] + + if page_content['content'] is None: + return + + result = { + 'meta_charset_tag': None, + 'content_type_header_charset': None, + 'charset': 'iso-8859-1', # ISO-8859-1 is the default according to https://www.w3.org/International/articles/http-charset/index + 'valid': None, + 'exception': None, + } + + soup = BeautifulSoup(page_content['content'], 'html.parser') + + # get response header charset + if ('content-type' in page_content['response_headers'] + and 'charset=' in page_content['response_headers']['content-type']): + parts = page_content['response_headers']['content-type'].split("charset=", 1) + result['content_type_header_charset'] = parts[1].lower() + result['charset'] = parts[1].lower() + + # get meta tag charset + metatags = soup.find_all('meta') + for tag in metatags: + if 'charset' in tag.attrs: + result['meta_charset_tag'] = tag['charset'].lower() + # meta tag overrules any previous value + result['charset'] = tag['charset'].lower() + + # check for charset plausibility (only for most common ones) + if result['charset'] in ('iso-8859-1', 'utf-8'): + try: + _ = page_content['content'].encode(result['charset']) + except UnicodeEncodeError as e: + result['valid'] = False + result['exception'] = str(e) + else: + result['valid'] = True 
+ + + return result diff --git a/checks/charset_test.py b/checks/charset_test.py new file mode 100644 index 0000000..cce7677 --- /dev/null +++ b/checks/charset_test.py @@ -0,0 +1,49 @@ +import httpretty +from httpretty import httprettified +import unittest + +from checks import charset +from checks import page_content +from checks.config import Config + +@httprettified +class TestCharsetChecker(unittest.TestCase): + + def test_http_response(self): + url = 'http://www.example.com/' + httpretty.register_uri(httpretty.GET, url, + body=""" + + + + Hello + + """, + adding_headers={ + "Content-Type": "text/html; charset=ISO-8859-1", + }) + + results = {} + + config = Config(urls=[url]) + page_content_checker = page_content.Checker(config=config, previous_results={}) + results['page_content'] = page_content_checker.run() + + self.assertIn(url, results['page_content']) + self.assertIn('response_headers', results['page_content'][url]) + self.assertIn('content-type', results['page_content'][url]['response_headers']) + + charset_checker = charset.Checker(config=page_content_checker.config, previous_results=results) + result = charset_checker.run() + + self.assertIn(url, result) + self.assertEqual(result[url], { + 'meta_charset_tag': 'utf-8', + 'content_type_header_charset': 'iso-8859-1', + 'charset': 'utf-8', + 'valid': True, + 'exception': None, + }) + +if __name__ == '__main__': + unittest.main() diff --git a/checks/config.py b/checks/config.py new file mode 100644 index 0000000..d164b00 --- /dev/null +++ b/checks/config.py @@ -0,0 +1,29 @@ +class Config(object): + """ + Our configuration to be passed to checks + """ + + def __init__(self, urls, user_agent='green-spider/1.0'): + self._urls = set(urls) + self._user_agent = user_agent + + def __repr__(self): + return "Config(urls=%r)" % self._urls + + @property + def urls(self): + return list(self._urls) + + def add_url(self, url): + self._urls.add(url) + + def remove_url(self, url): + """Removes url from urls, if it was in there. Ignores errors.""" + try: + self._urls.remove(url) + except KeyError: + pass + + @property + def user_agent(self): + return self._user_agent diff --git a/checks/dns_resolution.py b/checks/dns_resolution.py new file mode 100644 index 0000000..efd3b05 --- /dev/null +++ b/checks/dns_resolution.py @@ -0,0 +1,55 @@ +""" +This check attempts to resolve all hostnames/domains in the input URLs. + +URLs which are not resolvable are removed from the config. 
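+ +An illustrative result entry for one input URL (hypothetical values): + + 'https://example.com/': {'hostname': 'example.com', 'resolvable': True, + 'aliases': [], 'ipv4_addresses': ['93.184.216.34']}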
+""" + +import logging +from socket import gethostbyname_ex +from urllib.parse import urlparse +from urllib.parse import urlunparse + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + """Executes the check routine, returns result dict""" + + results = {} + + urls = list(self.config.urls) + for url in urls: + parsed = urlparse(url) + + results[url] = self.resolve_hostname(parsed.hostname) + + # remove URL if non-resolvable + if not results[url]['resolvable']: + self.config.remove_url(url) + + return results + + def resolve_hostname(self, hostname): + """ + Resolve one to IPv4 address(es) + """ + result = { + 'hostname': hostname, + 'resolvable': False, + 'aliases': [], + 'ipv4_addresses': [], + } + + try: + hostname, aliases, ipv4_addresses = gethostbyname_ex(hostname) + result['resolvable'] = True + result['aliases'] = aliases + result['ipv4_addresses'] = ipv4_addresses + except Exception as e: + logging.debug("Hostname %s not resolvable. Exception: %r" % (hostname, e)) + + return result diff --git a/checks/domain_variations.py b/checks/domain_variations.py new file mode 100644 index 0000000..ab621ea --- /dev/null +++ b/checks/domain_variations.py @@ -0,0 +1,44 @@ +""" +This adds commonly tried variations of domains/subdomains to the URLs config. +""" + +import logging + +from urllib.parse import urlparse +from urllib.parse import urlunparse + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + urls = list(self.config.urls) + for url in urls: + parsed = urlparse(url) + hostnames = self.expand_hostname(parsed.hostname) + + for hostname in hostnames: + self.config.add_url(urlunparse((parsed.scheme, hostname, + parsed.path, parsed.params, parsed.query, parsed.fragment))) + + return None + + + def expand_hostname(self, hostname): + """ + Create variations of subdomains + """ + hostnames = set() + + hostnames.add(hostname) + if hostname.startswith('www.'): + # remove 'www.' prefix + hostnames.add(hostname[4:]) + else: + # add 'www.' prefix + hostnames.add('www.' 
+ hostname) + + return sorted(list(hostnames)) diff --git a/checks/duplicate_content.py b/checks/duplicate_content.py new file mode 100644 index 0000000..9556902 --- /dev/null +++ b/checks/duplicate_content.py @@ -0,0 +1,107 @@ +""" +This checker looks at the similarity between previously downloaded pages +and removes duplicates from the config URLs +""" + +import logging + +import html_similarity + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + + # value above which we consider a page pair a duplicate + similarity_threshold = 0.99999 + + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + + def run(self): + + if len(self.config.urls) == 1: + # nothing to do for us + return + + urls = list(self.config.urls) + + # get content + content = {} + + assert 'page_content' in self.previous_results + + for url in urls: + page_content = self.previous_results['page_content'][url] + + if page_content['content'] is None: + logging.warn("Content for URL %s is None" % url) + + content[url] = page_content['content'] + + pairs = self.compare_pairwise(content) + + # remove duplicates + for key in pairs: + if pairs[key]['similarity'] is None: + continue + if pairs[key]['similarity'] > self.similarity_threshold: + # this pair is a duplicate. + # Decide which one to keep + url1, url2 = key.split(" ", 1) + reject = self.select_url_to_reject(url1, url2) + self.config.remove_url(reject) + + return pairs + + + def compare_pairwise(self, content): + # compair pairwise + pairs = {} + + for url1 in content: + for url2 in content: + + if url1 == url2: + continue + + # avoid checking pairs twice + pair_key = " ".join(sorted([url1, url2])) + if pair_key in pairs: + continue + + try: + s = html_similarity.similarity(content[url1], content[url2]) + logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s) + pairs[pair_key] = { + 'similarity': s, + 'exception': None, + } + except (AttributeError, ValueError) as e: + logging.error("html_similarity.similarity thre exception for URL pair %s and %s: %s", url1, url2, e) + pairs[pair_key] = { + 'similarity': None, + 'exception': str(e), + } + + return pairs + + + def select_url_to_reject(self, url1, url2): + """Determine which of two URLs to keep, which to reject""" + + # HTTPS takes precedence + if url1.startswith('https://') and not url2.startswith('https://'): + return url2 + elif url2.startswith('https://') and not url1.startswith('https://'): + return url1 + + # Shorter URL wins + if len(url1) < len(url2): + return url2 + elif len(url1) > len(url2): + return url1 + + # default behaviour + return url1 diff --git a/checks/generator.py b/checks/generator.py new file mode 100644 index 0000000..5a1968b --- /dev/null +++ b/checks/generator.py @@ -0,0 +1,76 @@ +""" +Checks the 'generator' meta tag and page content properties +to detect well-known content management systems, themes etc. 
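+ +Typical values produced by this check (see get_generator below) include 'typo3', +'typo3-gruene', 'typo3-gcms', 'wordpress', 'wordpress-urwahl', +'wordpress-josephknowsbest', 'drupal' and 'joomla'; None is returned when no +generator could be identified.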
+""" + +import logging + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + + # IP address of the newthinking GCMS server + gcms_ip = "91.102.13.20" + + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + assert 'page_content' in self.previous_results + assert 'html_head' in self.previous_results + + results = {} + + for url in self.config.urls: + results[url] = self.get_generator(url) + + return results + + + def get_generator(self, url): + page_content = self.previous_results['page_content'][url] + assert 'content' in page_content + + assert 'dns_resolution' in self.previous_results + dns_resolution = self.previous_results['dns_resolution'] + + head = self.previous_results['html_head'][url] + + generator = None + + if 'generator' in head and head['generator'] is not None: + generator = head['generator'].lower() + if 'typo3' in generator: + generator = 'typo3' + if 'wordpress' in generator: + generator = 'wordpress' + if 'drupal' in generator: + generator = 'drupal' + if 'joomla' in generator: + generator = 'joomla' + + # Qualify certain CMS flavours in more detail + if generator == "typo3": + # Typo3-Gruene advertises in the page content + if 'typo3-gruene.de' in page_content['content']: + generator = "typo3-gruene" + # newthinking GCMS in some page hrefs + elif 'ntc_gcms' in page_content['content']: + generator = "typo3-gcms" + # check if one of the IPs matches the well-known GCMS Server IP + elif url in dns_resolution: + for addr in dns_resolution[url]['ipv4_addresses']: + if addr == self.gcms_ip: + generator = "typo3-gcms" + + elif 'Urwahl3000' in page_content['content']: + generator = "wordpress-urwahl" + + elif ('josephknowsbest' in page_content['content'] or + 'Joseph-knows-best' in page_content['content']): + generator = "wordpress-josephknowsbest" + + elif 'wordpress' in page_content['content']: + generator = "wordpress" + + return generator diff --git a/checks/html_head.py b/checks/html_head.py new file mode 100644 index 0000000..4d3391e --- /dev/null +++ b/checks/html_head.py @@ -0,0 +1,152 @@ +""" +Extracts information from the html , like existence and value +of certain meta tags, link tags, title, etc. 
+""" + +import logging +import re +from urllib.parse import urljoin +from urllib.parse import urlparse + +from bs4 import BeautifulSoup + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + results = {} + + for url in self.config.urls: + results[url] = self.get_content(url) + + return results + + def get_content(self, url): + """ + Expects page_content_dict['content'] to carry the HTML content + """ + + page_content = self.previous_results['page_content'][url] + assert 'content' in page_content + assert 'response_headers' in page_content + assert 'content-type' in page_content['response_headers'] + + if page_content['content'] is None: + return + + soup = BeautifulSoup(page_content['content'], 'html.parser') + head = soup.find('head') + + result = { + 'title': self.get_title(head), + 'link_canonical': self.get_link_canonical(head, url), + 'link_rss_atom': self.get_link_rss_atom(head, url), + 'link_icon': self.get_link_icon(head, url), + 'generator': self.get_generator(head), + 'opengraph': self.get_opengraph(head), + 'viewport': self.get_viewport(head), + } + + return result + + + def get_title(self, head): + """Extract and clean up page title""" + if head is None: + return + + title = None + + tag = head.find('title') + if tag is None: + return + + title = tag.get_text() + + # clean up + title = title.replace(u'\u00a0', ' ') + title = title.replace(' ', ' ') + title = title.strip() + + return title + + + def get_link_canonical(self, head, url): + if head is None: + return + link = head.find('link', rel='canonical') + if link: + return urljoin(url, link.get('href')) + + + def get_link_rss_atom(self, head, url): + if head is None: + return + hrefs = [] + rss_links = head.find_all('link', type='application/rss+xml') + atom_links = head.find_all('link', type='application/atom+xml') + + if rss_links: + for link in rss_links: + hrefs.append(link.get('href')) + if atom_links: + for link in rss_links: + hrefs.append(link.get('href')) + + # make URLs absolute + for i in range(len(hrefs)): + parsed = urlparse(hrefs[i]) + if parsed.scheme == '': + hrefs[i] = urljoin(url, hrefs[i]) + + return hrefs + + + def get_link_icon(self, head, url): + if head is None: + return + + tag = head.find('link', rel=lambda x: x and x.lower() == 'icon') + if tag: + return urljoin(url, tag.get('href')) + tag = head.find('link', rel=lambda x: x and x.lower() == 'shortcut icon') + if tag: + return urljoin(url, tag.get('href')) + + + def get_generator(self, head): + if head is None: + return + + tags = head.select('[name=generator]') + if tags: + return tags[0].get('content') + + + def get_opengraph(self, head): + if head is None: + return + + # we find tags by matching this property/itemprop value regex + property_re = re.compile('^og:') + + opengraph = set() + for tag in head.find_all(property=property_re): + opengraph.add(tag.get('property')) + for tag in head.find_all(itemprop=property_re): + opengraph.add(tag.get('itemprop')) + + opengraph = sorted(list(opengraph)) + if opengraph != []: + return opengraph + + + def get_viewport(self, head): + if head is None: + return + tags = head.select('[name=viewport]') + if tags: + return tags[0].get('content') diff --git a/checks/http_and_https.py b/checks/http_and_https.py new file mode 100644 index 0000000..79d0b6b --- /dev/null +++ b/checks/http_and_https.py @@ -0,0 +1,27 @@ +""" +This adds, for every HTTP URL, the HTTPS 
counterpart, +and vice versa, to config.urls + +So it doesn't actually perform tests. It only expands the +URLs to test by other checks. +""" + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + """ + Adds URLs to config.urls, returns nothing + """ + + for url in self.config.urls: + + if url.startswith('https://'): + self.config.add_url('http://' + url[8:]) + elif url.startswith('http://'): + self.config.add_url('https://' + url[7:]) + + return None \ No newline at end of file diff --git a/checks/load_in_browser.py b/checks/load_in_browser.py new file mode 100644 index 0000000..2ab3af6 --- /dev/null +++ b/checks/load_in_browser.py @@ -0,0 +1,134 @@ +""" +Collects information by loading pages in a browser. + +Information includes: + +- whether the document width adapts well to viewports as little as 360 pixels wide +- whether javascript errors or errors from missing resources occur +- collects CSS font-family properties in use +""" + +import logging +import time + +from selenium import webdriver +from selenium.common.exceptions import StaleElementReferenceException +from selenium.common.exceptions import TimeoutException +import tenacity + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + + page_load_timeout = 20 + + # sizes we check for (width, height) + sizes = ( + (360, 640), # rather old smartphone + (768, 1024), # older tablet or newer smartphone + (1024, 768), # older desktop or horiz. tablet + (1920, 1080), # Full HD horizontal + ) + + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + # Our selenium user agent using Chrome headless as an engine + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument('--headless') + chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-extensions') + self.driver = webdriver.Chrome(options=chrome_options) + self.driver.set_page_load_timeout(self.page_load_timeout) + + def run(self): + + results = {} + for url in self.config.urls: + + results[url] = { + 'sizes': None, + 'min_document_width': None, + 'logs': None, + 'font_families': None, + } + + # responsive check + try: + sizes = self.check_responsiveness(url) + results[url] = { + 'sizes': sizes, + 'min_document_width': min([s['document_width'] for s in sizes]), + 'logs': self.capture_log(), + } + except TimeoutException as e: + logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e)) + pass + except tenacity.RetryError as re: + logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re)) + pass + + # CSS collection + font_families = None + + try: + elements = self.driver.find_elements_by_xpath("//*") + font_families = set() + for element in elements: + try: + font_family = element.value_of_css_property('font-family') + if font_family is None: + continue + font_families.add(font_family.lower()) + except StaleElementReferenceException as e: + logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e)) + continue + + results[url]['font_families'] = sorted(list(font_families)) + + except TimeoutException as e: + logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e)) + pass + + self.driver.quit() + + return results + + + 
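+ # If loading a page times out, check_responsiveness is retried up to three + # times (see the tenacity decorator below) before giving up.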
@tenacity.retry(stop=tenacity.stop_after_attempt(3), + retry=tenacity.retry_if_exception_type(TimeoutException)) + def check_responsiveness(self, url): + result = [] + + # set window to the first size initially + self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1]) + self.driver.get(url) + + # give the page some time to load + time.sleep(10) + + for (width, height) in self.sizes: + self.driver.set_window_size(width, height) + + # wait for re-render/re-flow + time.sleep(1.0) + doc_width = self.driver.execute_script("return document.body.scrollWidth") + + result.append({ + 'viewport_width': width, + 'document_width': int(doc_width), + }) + + return result + + def capture_log(self): + """ + Returns log elements with level "SEVERE" + """ + entries = [] + for entry in self.driver.get_log('browser'): + if entry['level'] in ('WARNING', 'SEVERE'): + entries.append(entry) + return entries diff --git a/checks/page_content.py b/checks/page_content.py new file mode 100644 index 0000000..d036274 --- /dev/null +++ b/checks/page_content.py @@ -0,0 +1,94 @@ +""" +This check downloads the HTML page for each URL +""" + +import logging + +import requests + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + + # connection timeout (seconds) + CONNECT_TIMEOUT = 10 + + # response timeout (seconds) + READ_TIMEOUT = 20 + + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + + def run(self): + results = {} + + self.headers = { + "User-Agent": self.config.user_agent, + } + + # copy URLs, as we may be manipulating self.config.urls in the loop + urls = list(self.config.urls) + + for url in urls: + result = self.download_page(url) + results[url] = result + + # remove bad URLs from config, to avoid later checks using them + if 'exception' in result and result['exception'] is not None: + self.config.remove_url(url) + + return results + + + def download_page(self, url): + result = { + 'url': url, + 'content': None, + 'content_type': None, + 'content_length': None, + 'status_code': None, + 'response_headers': None, + 'duration': None, + 'exception': None, + } + + try: + r = requests.get(url, + headers=self.headers, + timeout=(self.CONNECT_TIMEOUT, self.READ_TIMEOUT)) + + result['url'] = r.url + result['status_code'] = r.status_code + result['content'] = r.text + result['content_length'] = len(r.text) + result['response_headers'] = self.get_headers(r.headers) + result['duration'] = round(r.elapsed.total_seconds() * 1000) + + if r.headers.get("content-type") is not None: + result['content_type'] = r.headers.get("content-type").split(";")[0].strip() + + except requests.exceptions.ConnectionError as exc: + logging.error(str(exc) + " " + url) + result['exception'] = "connection" + except requests.exceptions.ReadTimeout as exc: + logging.error(str(exc) + " " + url) + result['exception'] = "read_timeout" + except requests.exceptions.Timeout as exc: + logging.error(str(exc) + " " + url) + result['exception'] = "connection_timeout" + except Exception as exc: + logging.error(str(exc) + " " + url) + result['exception'] = "%s %s" % (str(type(exc)), exc) + + return result + + def get_headers(self, headers): + """ + Transforms CaseInsensitiveDict into dict with lowercase keys + """ + out = {} + for key in headers: + out[key.lower()] = headers[key] + return out diff --git a/checks/url_canonicalization.py b/checks/url_canonicalization.py new file mode 100644 index 0000000..c6ce173 --- /dev/null +++
b/checks/url_canonicalization.py @@ -0,0 +1,13 @@ +""" +This check verifies whether there is a single URL +or several variants left at this point. +""" + +from checks.abstract_checker import AbstractChecker + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + return self.config.urls diff --git a/checks/url_reachability.py b/checks/url_reachability.py new file mode 100644 index 0000000..b371540 --- /dev/null +++ b/checks/url_reachability.py @@ -0,0 +1,104 @@ +""" +This check verifies whether the URLs in config are reachable. +Additional information regarding redirects and SSL problems +is also recorded and returned as results. + +Non-accessible URLs are removed from config.urls. + +A redirect to facebook.com is not considered reachable, as that +leads to a different website in the sense of this system. + +TODO: Parallelize the work done in this test +""" + +import logging + +from urllib.parse import urlparse +import requests + +from checks.abstract_checker import AbstractChecker + + +class Checker(AbstractChecker): + def __init__(self, config, previous_results=None): + super().__init__(config, previous_results) + + def run(self): + headers = { + "User-Agent": self.config.user_agent + } + + results = {} + urls = list(self.config.urls) + + for url in urls: + logging.debug("Checking URL reachability for %s", url) + + result = { + "url": url, + "redirect_history": [], + "status": None, + "exception": None, + "duration": None, + } + + # Perform HEAD requests, recording redirect log + try: + r = requests.head(url, headers=headers, allow_redirects=True) + result['status'] = r.status_code + result['duration'] = round(r.elapsed.total_seconds() * 1000) + + if len(r.history): + result['redirect_history'] = self.expand_history(r.history) + logging.debug("Redirects: %r", result['redirect_history']) + + if r.url == url: + logging.debug("URL: %s - status %s", url, r.status_code) + else: + logging.debug("URL: %s - status %s - redirects to %s", url, + r.status_code, r.url) + # remove source URL, add target URL to config.urls + self.config.remove_url(url) + self.config.add_url(r.url) + + # remove 404 etc + if r.status_code > 400: + self.config.remove_url(url) + + except Exception as exc: + logging.info("Exception for URL %s: %s %s", url, str(type(exc)), exc) + result['exception'] = { + 'type': str(type(exc)), + 'message': str(exc), + } + + # remove URL to prevent further checks on unreachable URL + self.config.remove_url(url) + + # if redirects end in www.facebook.com or www.denic.de, remove this URL again + # remove if redirect target is facebook + if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0: + parsed = urlparse(result['redirect_history'][-1]['redirect_to']) + if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'): + result['exception'] = { + 'type': 'Bad target domain', + 'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname, + } + self.config.remove_url(url) + + results[url] = result + + return results + + def expand_history(self, history): + """Extracts primitives from a list of requests.Response objects""" + items = [] + for h in history: + item = { + 'status': h.status_code, + 'duration': round(h.elapsed.total_seconds() * 1000), + 'redirect_to': h.headers['location'], + } + items.append(item) + + return items diff --git
a/checks/url_reachability_test.py b/checks/url_reachability_test.py new file mode 100644 index 0000000..b5514d5 --- /dev/null +++ b/checks/url_reachability_test.py @@ -0,0 +1,71 @@ +import httpretty +from httpretty import httprettified +import unittest + +from checks import url_reachability +from checks.config import Config + +@httprettified +class TestUrlReachabilityChecker(unittest.TestCase): + + def test_success(self): + url = 'http://www.example.com/' + httpretty.register_uri(httpretty.HEAD, url, + status=200, body="") + + config = Config(urls=[url]) + checker = url_reachability.Checker(config=config, previous_results={}) + result = checker.run() + + self.assertEqual(result[url]['url'], url) + self.assertEqual(result[url]['redirect_history'], []) + self.assertEqual(result[url]['status'], 200) + self.assertIsNone(result[url]['exception']) + self.assertTrue(0 < result[url]['duration'] < 100) + + + def test_redirect(self): + url = 'http://www.example.com/' + url2 = 'http://www2.example.com/' + httpretty.register_uri(httpretty.HEAD, url, + status=302, body="", + adding_headers={"Location": url2}) + httpretty.register_uri(httpretty.HEAD, url2, + status=200, body="") + + config = Config(urls=[url]) + checker = url_reachability.Checker(config=config, previous_results={}) + result = checker.run() + + self.assertIn(url, result) + self.assertEqual(result[url]['url'], url) + self.assertEqual(result[url]['status'], 200) + self.assertIsNone(result[url]['exception']) + self.assertTrue(0 < result[url]['duration'] < 100) + self.assertEqual(len(result[url]['redirect_history']), 1) + self.assertEqual(result[url]['redirect_history'][0]['status'], 302) + self.assertEqual(result[url]['redirect_history'][0]['redirect_to'], url2) + + + def test_notfound(self): + url = 'http://www.example.com/' + httpretty.register_uri(httpretty.HEAD, url, + status=404, body="Not found") + + config = Config(urls=[url]) + checker = url_reachability.Checker(config=config, previous_results={}) + result = checker.run() + + self.assertEqual(result[url]['url'], url) + self.assertEqual(result[url]['redirect_history'], []) + self.assertEqual(result[url]['status'], 404) + self.assertIsNone(result[url]['exception']) + + newconfig = checker.config + + self.assertEqual(len(newconfig.urls), 0) + + + +if __name__ == '__main__': + unittest.main() diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..f470f26 --- /dev/null +++ b/cli.py @@ -0,0 +1,83 @@ +""" +Command line utility for spider, export etc.
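+ +Example invocations (arguments as defined by the parser below; the credential +paths are the ones used in the Makefile and may differ in other setups): + + python3 cli.py --credentials-path /secrets/datastore-writer.json --loglevel info jobs + python3 cli.py --credentials-path /secrets/datastore-writer.json --loglevel debug spider --kind spider-results + python3 cli.py --credentials-path /secrets/datastore-reader.json --loglevel debug export --kind spider-results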
+""" + +import argparse +import logging +import signal +import sys + +from google.cloud import datastore + +def handle_sigint(signum, frame): + """ + Handles SIGINT, which occurs on Ctrl-C + """ + print("\nInterrupted by SIGINT\n") + sys.exit() + + +if __name__ == "__main__": + signal.signal(signal.SIGINT,handle_sigint) + + parser = argparse.ArgumentParser() + + # global flags + parser.add_argument('--credentials-path', dest='credentials_path', + help='Path to the service account credentials JSON file', + default='/secrets/service-account.json') + + parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)", + default='info') + + # subcommands + subparsers = parser.add_subparsers(help='sub-command help', dest='command') + + # spider subcommand + spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider') + spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)') + + # jobs subcommand + jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.') + jobs_parser.add_argument('--url', help='Add a job to spider a specific URL') + + # export subcommand + export_parser = subparsers.add_parser('export', help='Export JSON data') + export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)') + + + args = parser.parse_args() + + # set log level + logging.getLogger("urllib3").setLevel(logging.CRITICAL) + + loglevel = args.loglevel.lower() + if loglevel == 'error': + logging.basicConfig(level=logging.ERROR) + elif loglevel == 'warn': + logging.basicConfig(level=logging.WARN) + elif loglevel == 'debug': + logging.basicConfig(level=logging.DEBUG) + logging.getLogger("selenium").setLevel(logging.INFO) + else: + logging.basicConfig(level=logging.INFO) + loglevel = 'info' + + logging.debug("Called command %s", args.command) + + datastore_client = datastore.Client.from_service_account_json(args.credentials_path) + + if args.command == 'jobs': + + import jobs + jobs.create_jobs(datastore_client, args.url) + + elif args.command == 'export': + + import export + export.export_screenshots(datastore_client) + export.export_results(datastore_client, args.kind) + + else: + from spider import spider + spider.work_of_queue(datastore_client, args.kind) diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..83b1ba4 --- /dev/null +++ b/config/__init__.py @@ -0,0 +1,23 @@ + + +# connection timeout for website checks (seconds) +CONNECT_TIMEOUT = 5 + +# response timeout for website checks +READ_TIMEOUT = 10 + +# Git repo for our data +GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git' + +# folder in that repo that holds the data +GREEN_DIRECTORY_DATA_PATH = 'data/countries/de' + +# folder we use locally to clone the repo +GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory' + +# IP address of the newthinking GCMS server +GCMS_IP = "91.102.13.20" + +# kind name of the spider job key datastore entities +JOB_DATASTORE_KIND = 'spider-jobs' + diff --git a/devops/run-job.sh b/devops/run-job.sh index 207c205..530ecdb 100755 --- a/devops/run-job.sh +++ b/devops/run-job.sh @@ -19,6 +19,8 @@ # secrets/datastore-writer.json +DOCKERIMAGE="quay.io/netzbegruenung/green-spider:dev" + API_TOKEN_SECRET="secrets/hetzner-api-token.sh" test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; 
exit 1; } source $API_TOKEN_SECRET @@ -29,10 +31,14 @@ if [[ "$1" == "" ]]; then exit 1 fi +SERVERNAME="$1-$(date | md5 | cut -c1-3)" + +# possible values: cx11 (1 core 2 GB), cx21 (2 cores, 4 GB), cx31 (2 cores, 8 GB) +SERVERTYPE="cx21" function create_server() { - echo "Creating server $1" + echo "Creating server $SERVERNAME" # server_type 'cx11' is the smallest, cheapest category. # location 'nbg1' is Nürnberg/Nuremberg, Germany. @@ -44,8 +50,8 @@ function create_server() -H "Content-Type: application/json" \ -H "Authorization: Bearer $API_TOKEN" \ -d "{ - \"name\": \"$1\", - \"server_type\": \"cx11\", + \"name\": \"$SERVERNAME\", + \"server_type\": \"$SERVERTYPE\", \"location\": \"nbg1\", \"start_after_create\": true, \"image\": \"debian-9\", @@ -61,7 +67,7 @@ function create_server() # Get IP: SERVER_IP=$(echo $CREATE_RESPONSE | jq -r .server.public_net.ipv4.ip) - echo "Created server with ID $SERVER_ID and IP $SERVER_IP" + echo "Created server $SERVERNAME with ID $SERVER_ID and IP $SERVER_IP" } @@ -142,22 +148,25 @@ else # Run docker job echo "Starting Docker Job" - ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \ - -v /root/secrets:/secrets \ - quay.io/netzbegruenung/green-spider spider.py \ - --credentials-path /secrets/datastore-writer.json \ - jobs + #ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \ + # -v /root/secrets:/secrets \ + # quay.io/netzbegruenung/green-spider spider.py \ + # --credentials-path /secrets/datastore-writer.json \ + # jobs + ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP mkdir -p /dev-shm ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \ + -v /dev-shm:/dev/shm \ -v /root/secrets:/secrets \ - quay.io/netzbegruenung/green-spider spider.py \ + $DOCKERIMAGE \ --credentials-path /secrets/datastore-writer.json \ - spider + --loglevel info \ + spider --kind spider-results-dev fi # Delete the box -echo "Deleting server $SERVER_ID" +echo "Deleting server $SERVERNAME with ID $SERVER_ID" curl -s -X DELETE -H "Content-Type: application/json" \ -H "Authorization: Bearer $API_TOKEN" \ https://api.hetzner.cloud/v1/servers/$SERVER_ID diff --git a/data_export.py b/export/__init__.py similarity index 56% rename from data_export.py rename to export/__init__.py index 2ab23ef..64af277 100644 --- a/data_export.py +++ b/export/__init__.py @@ -2,8 +2,7 @@ Exports data from the database to JSON files for use in a static webapp """ -from google.cloud import datastore -import hashlib +from hashlib import md5 import json import logging import sys @@ -14,44 +13,67 @@ import requests SITEICONS_PATH = "/icons" -client = None - -def export_results(): +def export_results(client, entity_kind): """ Export of the main results data """ out = [] - query = client.query(kind='spider-results') + # Load data from database + query = client.query(kind=entity_kind) for entity in query.fetch(): logging.debug(entity.key.name) - record = dict(entity) - record["results"]["created"] = record["created"].isoformat() - out.append(record["results"]) + out.append({ + 'input_url': entity.key.name, + 'resulting_urls': entity.get('checks').get('url_canonicalization'), + 'created': entity.get('created').isoformat(), + 'meta': entity.get('meta'), + 'checks': entity.get('checks'), + 'rating': entity.get('rating'), + 'score': entity.get('score'), + 'icons': [], + }) # load icons, reformat icons details + icons_downloaded = set() for index in range(len(out)): - if "details" not in out[index]: - continue - if "icons" not in out[index]["details"]: - continue - urls 
= out[index]["details"]["icons"] - out[index]["details"]["icons"] = {} - for url in urls: - if not (url.startswith("http://") or url.startswith("https://")): - logging.debug("Skipping icon %s", url) - continue - logging.debug("Dowloading icon %s", url) - filename = download_icon(url) + assert "checks" in out[index] + assert "html_head" in out[index]["checks"] + + # collect icons urls + icons = set() + for url in out[index]['checks']['html_head']: + assert 'link_icon' in out[index]['checks']['html_head'][url] + if out[index]['checks']['html_head'][url]['link_icon'] is not None: + iconurl = out[index]['checks']['html_head'][url]['link_icon'] + if iconurl.startswith("data:"): + continue + if iconurl in icons_downloaded: + continue + icons.add(iconurl) + + out[index]["icons"] = {} + for iconurl in list(icons): + logging.debug("Dowloading icon %s", iconurl) + icons_downloaded.add(iconurl) + filename = download_icon(iconurl) if filename: - out[index]["details"]["icons"][url] = filename + out[index]["icons"][url] = filename output_filename = "/out/spider_result.json" with open(output_filename, 'w', encoding="utf8") as jsonfile: json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) + + # compact version + output_filename = "/out/spider_result_compact.json" + for i in range(len(out)): + out[i]['cms'] = list(out[i]['checks']['generator'].values()) + del out[i]['checks'] + with open(output_filename, 'w', encoding="utf8") as jsonfile: + json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) -def export_screenshots(): +def export_screenshots(client): """ Export of screenshot meta data """ @@ -78,10 +100,12 @@ def download_icon(icon_url): """ default_endings = { + "image/x-ico": "ico", "image/x-icon": "ico", "image/vnd.microsoft.icon": "ico", "image/png": "png", "image/jpeg": "jpg", + "image/gif": "gif", } # Download the icon @@ -92,7 +116,7 @@ def download_icon(icon_url): if req.status_code >= 400: return None - content_hash = hashlib.md5(req.content).hexdigest() + content_hash = md5(req.content).hexdigest() extension = "" try: @@ -109,6 +133,9 @@ def download_icon(icon_url): if extension == "": # derive from content type ctype = req.headers.get('content-type') + if ctype is None: + return + try: extension = default_endings[ctype] except KeyError: @@ -122,17 +149,3 @@ def download_icon(icon_url): iconfile.write(req.content) return filename - - -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) - - if len(sys.argv) == 1: - print("Error: please provide path to Google Storage API system account JSON file as argument") - sys.exit(1) - - key_path = sys.argv[1] - client = datastore.Client.from_service_account_json(key_path) - - export_screenshots() - export_results() diff --git a/jobs/__init__.py b/jobs/__init__.py new file mode 100644 index 0000000..3e125d5 --- /dev/null +++ b/jobs/__init__.py @@ -0,0 +1,180 @@ +""" +The jobs module allows to create jobs for the queue and take jobs off the queue +""" + +from datetime import datetime +import logging +import os +import random +import shutil + +from git import Repo +import tenacity +import yaml +from google.api_core.exceptions import Aborted +from google.cloud import datastore + +import config + + +def clone_data_directory(): + """ + Clones the source of website URLs, the green directory, + into the local file system using git + """ + if os.path.exists(config.GREEN_DIRECTORY_LOCAL_PATH): + shutil.rmtree(config.GREEN_DIRECTORY_LOCAL_PATH) + Repo.clone_from(config.GREEN_DIRECTORY_REPO, 
config.GREEN_DIRECTORY_LOCAL_PATH) + + +def directory_entries(): + """ + Iterator over all data files in the cloned green directory + """ + path = os.path.join(config.GREEN_DIRECTORY_LOCAL_PATH, config.GREEN_DIRECTORY_DATA_PATH) + for root, _, files in os.walk(path): + for fname in files: + + filepath = os.path.join(root, fname) + if not filepath.endswith(".yaml"): + continue + + with open(filepath, 'r', encoding='utf8') as yamlfile: + for doc in yaml.load_all(yamlfile): + yield doc + + +def chunks(the_list, size): + """ + Yield successive n-sized chunks from list the_list + where n = size. + """ + for i in range(0, len(the_list), size): + yield the_list[i:i + size] + + +def create_jobs(datastore_client, url=None): + """ + Read all URLs from green directory and fill a job database + with one job per URL. + + Alternatively, if the url argument is given, only the given URL + will be added as a spider job. + """ + + # refresh our local clone of the green directory + logging.info("Refreshing green-directory clone") + clone_data_directory() + + # build the list of website URLs to run checks for + logging.info("Processing green-directory") + input_entries = [] + + count = 0 + + random.seed() + + for entry in directory_entries(): + + if 'type' not in entry: + logging.error("Entry without type") + continue + if 'urls' not in entry: + logging.debug("Entry %s does not have any URLs.", repr_entry(entry)) + continue + + website_url = None + for index in range(len(entry['urls'])): + try: + if entry['urls'][index]['type'] == "WEBSITE": + website_url = entry['urls'][index]['url'] + if website_url: + if url is not None and website_url != url: + continue + input_entries.append({ + "url": website_url, + "type": entry.get("type"), + "level": entry.get("level"), + "state": entry.get("state"), + "district": entry.get("district"), + "city": entry.get("city"), + }) + count += 1 + except NameError: + logging.error("Error in %s: 'url' key missing (%s)", + repr_entry(entry), entry['urls'][index]) + + # ensure the passed URL argument is really there, even if not part + # of the directory. 
+ if url and count == 0: + logging.info("Adding job for URL %s which is not part of green-directory", url) + input_entries.append({ + "url": url, + "type": None, + "level": None, + "state": None, + "district": None, + "city": None, + "index": int(random.uniform(1000000, 9999999)), + }) + + count = 0 + logging.info("Writing jobs") + + entities = [] + + for entry in input_entries: + key = datastore_client.key(config.JOB_DATASTORE_KIND, entry["url"]) + entity = datastore.Entity(key=key) + entity.update({ + "created": datetime.utcnow(), + "type": entry["type"], + "level": entry["level"], + "state": entry["state"], + "district": entry["district"], + "city": entry["city"], + "index": int(random.uniform(1000000, 9999999)), + }) + entities.append(entity) + + # commit to DB + for chunk in chunks(entities, 300): + logging.debug("Writing jobs chunk of length %d", len(chunk)) + datastore_client.put_multi(chunk) + count += len(chunk) + + logging.info("Writing jobs done, %s jobs added", count) + + +@tenacity.retry(wait=tenacity.wait_exponential(), + retry=tenacity.retry_if_exception_type(Aborted)) +def get_job_from_queue(datastore_client): + """ + Returns one job from the queue as a dict (including its 'url'), or None if the queue is empty + """ + out = None + + with datastore_client.transaction(): + query = datastore_client.query(kind=config.JOB_DATASTORE_KIND, + order=['index']) + for entity in query.fetch(limit=1): + logging.debug("Got job: %s", entity) + out = dict(entity) + out["url"] = entity.key.name + datastore_client.delete(entity.key) + + return out + +def repr_entry(entry): + """ + Return string representation of a directory entry, + for logging/debugging purposes + """ + ret = entry['type'] + if 'level' in entry: + ret += "/" + entry['level'] + if 'state' in entry: + ret += "/" + entry['state'] + if 'district' in entry: + ret += "/" + entry['district'] + return ret diff --git a/rating/__init__.py b/rating/__init__.py new file mode 100644 index 0000000..197e720 --- /dev/null +++ b/rating/__init__.py @@ -0,0 +1,53 @@ +""" +The rating module contains the functionality to calculate scores for certain +criteria, based on information gathered by the checks before. +""" + +import logging + +from rating import canonical_url +from rating import favicon +from rating import feeds +from rating import https +from rating import no_network_errors +from rating import no_script_errors +from rating import reachable +from rating import resolvable +from rating import response_duration +from rating import responsive_layout +from rating import use_specific_fonts +from rating import www_optional + + +def calculate_rating(results): + """ + Calculates ratings for a number of criteria. + + Params: + results - Results dictionary from checks + """ + + # The raters to execute.
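+ # Keys are the criterion names under which each rating is stored; every module + # provides a Rater class (a subclass of rating.abstract_rater.AbstractRater) + # whose rate() method returns the rating dict for that criterion.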
+ rating_modules = { + 'CANONICAL_URL': canonical_url, + 'DNS_RESOLVABLE_IPV4': resolvable, + 'FAVICON': favicon, + 'FEEDS': feeds, + 'HTTPS': https, + 'HTTP_RESPONSE_DURATION': response_duration, + 'NO_NETWORK_ERRORS': no_network_errors, + 'NO_SCRIPT_ERRORS': no_script_errors, + 'RESPONSIVE': responsive_layout, + 'SITE_REACHABLE': reachable, + 'USE_SPECIFIC_FONTS': use_specific_fonts, + 'WWW_OPTIONAL': www_optional, + } + + output = {} + + for name in rating_modules: + + rater = rating_modules[name].Rater(results) + output[name] = rater.rate() + + return output diff --git a/rating/abstract_rater.py b/rating/abstract_rater.py new file mode 100644 index 0000000..ef2a2f8 --- /dev/null +++ b/rating/abstract_rater.py @@ -0,0 +1,22 @@ +class AbstractRater(object): + + # String 'boolean' or 'number' + rating_type = None + + # The default value to return if no rating given + default_value = None + + max_score = 1 + + # Name of the checks this rater depends on + depends_on_checks = [] + + def __init__(self, check_results): + self.check_results = check_results + + for item in self.depends_on_checks: + assert item in self.check_results + + def rate(self): + raise NotImplementedError() + diff --git a/rating/canonical_url.py b/rating/canonical_url.py new file mode 100644 index 0000000..dbe4024 --- /dev/null +++ b/rating/canonical_url.py @@ -0,0 +1,31 @@ +""" +This looks at remaining resolvable URLs after redirects +and gives score if there is only one URL left. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['url_canonicalization'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + if len(self.check_results['url_canonicalization']) == 1: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/favicon.py b/rating/favicon.py new file mode 100644 index 0000000..5387a1e --- /dev/null +++ b/rating/favicon.py @@ -0,0 +1,32 @@ +""" +This gives a score if the site has an icon. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['html_head'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + for url in self.check_results['html_head']: + if self.check_results['html_head'][url]['link_icon'] is not None: + value = True + score = self.max_score + break + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/feeds.py b/rating/feeds.py new file mode 100644 index 0000000..edc8888 --- /dev/null +++ b/rating/feeds.py @@ -0,0 +1,35 @@ +""" +This gives a score if the site has feeds. 
+""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['html_head'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + for url in self.check_results['html_head']: + if self.check_results['html_head'][url]['link_rss_atom'] is None: + continue + if self.check_results['html_head'][url]['link_rss_atom'] == []: + continue + value = True + score = self.max_score + break + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/https.py b/rating/https.py new file mode 100644 index 0000000..e47550e --- /dev/null +++ b/rating/https.py @@ -0,0 +1,47 @@ +""" +This looks at all HTTPS URLs we checked for reachability. + +If all of them were reachable without errors, we give full score. +If some or all had errors, or no HTTPS URL is reachable, we give zero. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['url_reachability'] + + # HTTPS is very important, so this counts double + max_score = 2 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + reachable_count = 0 + unreachable_count = 0 + + for url in self.check_results['url_reachability']: + if not url.startswith('https://'): + continue + + if self.check_results['url_reachability'][url]['exception'] is None: + reachable_count += 1 + else: + unreachable_count += 1 + + if unreachable_count == 0 and reachable_count > 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/no_network_errors.py b/rating/no_network_errors.py new file mode 100644 index 0000000..ac56247 --- /dev/null +++ b/rating/no_network_errors.py @@ -0,0 +1,48 @@ +""" +If all URLs could be loaded without severe network errors, this rater gives a score. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['load_in_browser'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + found_pageloads = 0 + found_errors = 0 + for url in self.check_results['load_in_browser']: + if (self.check_results['load_in_browser'][url]['logs'] == [] or + self.check_results['load_in_browser'][url]['logs'] is None): + continue + + found_pageloads += 1 + + # scan log entries for script errors + for entry in self.check_results['load_in_browser'][url]['logs']: + if entry['source'] != 'network': + continue + if entry['level'] != 'SEVERE': + continue + + found_errors += 1 + + if found_pageloads > 0 and found_errors == 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/no_script_errors.py b/rating/no_script_errors.py new file mode 100644 index 0000000..32a89c6 --- /dev/null +++ b/rating/no_script_errors.py @@ -0,0 +1,42 @@ +""" +If all URLs could be loaded without JavaScript errors, this rater gives a score. 
+""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['load_in_browser'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + found_pageloads = 0 + found_errors = 0 + for url in self.check_results['load_in_browser']: + if self.check_results['load_in_browser'][url]['logs'] == []: + found_pageloads += 1 + continue + + # scan log entries for script errors + for entry in self.check_results['load_in_browser'][url]['logs']: + if entry['source'] == 'javascript': + found_errors += 1 + + if found_pageloads > 0 and found_errors == 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/reachable.py b/rating/reachable.py new file mode 100644 index 0000000..381cdb0 --- /dev/null +++ b/rating/reachable.py @@ -0,0 +1,36 @@ +""" +This gives a score if one of the checked URL variations was reachable. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['url_reachability'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + count = 0 + for url in self.check_results['url_reachability']: + if self.check_results['url_reachability'][url]['exception'] is not None: + continue + count += 1 + + if count > 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/resolvable.py b/rating/resolvable.py new file mode 100644 index 0000000..01e243e --- /dev/null +++ b/rating/resolvable.py @@ -0,0 +1,35 @@ +""" +This gives a score if one of the input URL's hostnames was resolvable +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['dns_resolution'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + count = 0 + for url in self.check_results['dns_resolution']: + if self.check_results['dns_resolution'][url]['resolvable']: + count += 1 + + if count > 0: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/response_duration.py b/rating/response_duration.py new file mode 100644 index 0000000..6f22d84 --- /dev/null +++ b/rating/response_duration.py @@ -0,0 +1,46 @@ +""" +This looks at the response duration(s) and scores based on the bucket +the value is in. Fast responses get one point, slower half a point, +more than a seconds gets nothing. 
+""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'number' + default_value = False + depends_on_checks = ['page_content'] + max_score = 1.0 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + duration_sum = 0 + duration_count = 0 + + for url in self.check_results['page_content']: + if self.check_results['page_content'][url]['exception'] is not None: + continue + duration_sum += self.check_results['page_content'][url]['duration'] + duration_count += 1 + + if duration_count > 0: + value = round(duration_sum / duration_count) + + # value is duration in milliseconds + if value < 100: + score = self.max_score + elif value < 1000: + score = self.max_score * 0.5 + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/responsive_layout.py b/rating/responsive_layout.py new file mode 100644 index 0000000..2c198eb --- /dev/null +++ b/rating/responsive_layout.py @@ -0,0 +1,35 @@ +""" +This gives a score if the site's minimal document width during checks +was smaller than or equal to the minimal viewport size tested. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['load_in_browser'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + for url in self.check_results['load_in_browser']: + if (self.check_results['load_in_browser'][url]['min_document_width'] <= + self.check_results['load_in_browser'][url]['sizes'][0]['viewport_width']): + value = True + score = self.max_score + # we use the first URL found here + break + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/use_specific_fonts.py b/rating/use_specific_fonts.py new file mode 100644 index 0000000..8acb907 --- /dev/null +++ b/rating/use_specific_fonts.py @@ -0,0 +1,41 @@ +""" +Checks whether the pages use the font 'Arvo'. +""" + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['load_in_browser'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + urls_with_font = 0 + urls_without_font = 0 + for url in self.check_results['load_in_browser']: + if self.check_results['load_in_browser'][url]['font_families'] is None: + urls_without_font += 1 + continue + + fonts = " ".join(self.check_results['load_in_browser'][url]['font_families']) + if 'arvo' in fonts: + urls_with_font += 1 + + if urls_with_font > 0 and urls_without_font == 0: + score = self.max_score + value = True + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/rating/www_optional.py b/rating/www_optional.py new file mode 100644 index 0000000..0afef45 --- /dev/null +++ b/rating/www_optional.py @@ -0,0 +1,44 @@ +""" +This looks at reachable URLs and checks whether (sub)domains +both with and without www. are reachable. 
+""" + +from urllib.parse import urlparse + +from rating.abstract_rater import AbstractRater + +class Rater(AbstractRater): + + rating_type = 'boolean' + default_value = False + depends_on_checks = ['url_reachability'] + max_score = 1 + + def __init__(self, check_results): + super().__init__(check_results) + + def rate(self): + value = self.default_value + score = 0 + + hostnames = set() + for url in self.check_results['url_reachability']: + if self.check_results['url_reachability'][url]['exception'] is not None: + continue + parsed = urlparse(url) + hostnames.add(parsed) + + # FIXME + # we simply check whether there is more than one hostname. + # this works with our current input URls but might be too + # simplistic in the future. + if len(list(hostnames)) > 1: + value = True + score = self.max_score + + return { + 'type': self.rating_type, + 'value': value, + 'score': score, + 'max_score': self.max_score, + } diff --git a/spider.py b/spider.py deleted file mode 100644 index 4e4f6e6..0000000 --- a/spider.py +++ /dev/null @@ -1,814 +0,0 @@ -""" -Provides the spider functionality (website checks). -""" - -import argparse -import json -import logging -import os -import random -import re -import shutil -import statistics -import time -from datetime import datetime -from socket import gethostbyname_ex -from urllib.parse import urljoin -from urllib.parse import urlparse - -import requests -import yaml -import tenacity - -from bs4 import BeautifulSoup -from git import Repo -from selenium import webdriver -from google.cloud import datastore -from google.api_core.exceptions import Aborted -from google.api_core.exceptions import InvalidArgument - - -# configuration - -# connection timeout for website checks (seconds) -CONNECT_TIMEOUT = 5 - -# response timeout for website checks -READ_TIMEOUT = 10 - -# Git repo for our data -GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git' -# folder in that repo that holds the data -GREEN_DIRECTORY_DATA_PATH = 'data/countries/de' -GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory' - -RESULT_PATH = '/out' - -# IP address of the newthinking GCMS server -GCMS_IP = "91.102.13.20" - -JOB_DATASTORE_KIND = 'spider-jobs' -RESULTS_DATASTORE_KIND = 'spider-results' - -# end configuration - -DATASTORE_CLIENT = None - - -def chunks(the_list, size): - """ - Yield successive n-sized chunks from list the_list - where n = size. - """ - for i in range(0, len(the_list), size): - yield the_list[i:i + size] - - -def create_jobs(url=None): - """ - Read all URLs from green directory and fill a job database - with one job per URL. - - Alternatively, if the url argument is given, only the given URL - will be added as a spider job. 
- """ - - # refresh our local clone of the green directory - logging.info("Refreshing green-directory clone") - get_green_directory() - - # build the list of website URLs to run checks for - logging.info("Processing green-directory") - input_entries = [] - - count = 0 - - for entry in dir_entries(): - - if 'type' not in entry: - logging.error("Entry without type") - continue - if 'urls' not in entry: - logging.debug("Entry %s does not have any URLs.", repr_entry(entry)) - continue - - website_url = None - for index in range(len(entry['urls'])): - try: - if entry['urls'][index]['type'] == "WEBSITE": - website_url = entry['urls'][index]['url'] - if website_url: - if url is not None and website_url != url: - continue - input_entries.append({ - "url": website_url, - "level": entry.get("level"), - "state": entry.get("state"), - "district": entry.get("district"), - "city": entry.get("city"), - }) - count += 1 - except NameError: - logging.error("Error in %s: 'url' key missing (%s)", - repr_entry(entry), entry['urls'][index]) - - # ensure the passed URL argument is really there, even if not part - # of the directory. - if url and count == 0: - logging.info("Adding job for URL %s which is not part of green-directory", url) - input_entries.append({ - "url": url, - "level": None, - "state": None, - "district": None, - "city": None, - }) - - # randomize order, to distribute requests over servers - logging.debug("Shuffling input URLs") - random.seed() - random.shuffle(input_entries) - - count = 0 - logging.info("Writing jobs") - - entities = [] - - for entry in input_entries: - key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"]) - entity = datastore.Entity(key=key) - entity.update({ - "created": datetime.utcnow(), - "level": entry["level"], - "state": entry["state"], - "district": entry["district"], - "city": entry["city"], - }) - entities.append(entity) - - # commmit to DB - for chunk in chunks(entities, 300): - logging.debug("Writing jobs chunk of length %d", len(chunk)) - DATASTORE_CLIENT.put_multi(chunk) - count += len(chunk) - - logging.info("Writing jobs done, %s jobs added", count) - - -def get_green_directory(): - """ - Clones the source of website URLs, the green directory, - into the local file system using git - """ - if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH): - shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH) - Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH) - - -def dir_entries(): - """ - Iterator over all data files in the cloned green directory - """ - path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH) - for root, _, files in os.walk(path): - for fname in files: - - filepath = os.path.join(root, fname) - if not filepath.endswith(".yaml"): - continue - - with open(filepath, 'r', encoding='utf8') as yamlfile: - for doc in yaml.load_all(yamlfile): - yield doc - - -def repr_entry(entry): - """ - Return string representation of a directory entry, - for logging/debugging purposes - """ - ret = entry['type'] - if 'level' in entry: - ret += "/" + entry['level'] - if 'state' in entry: - ret += "/" + entry['state'] - if 'district' in entry: - ret += "/" + entry['district'] - return ret - - -def derive_test_hostnames(hostname): - """ - Derives the hostnames variants to test for a given host name. - From 'gruene-x.de' or 'www.gruene-x.de' it makes - - ['gruene-x.de', 'www.gruene-x.de'] - - which are both plausible web URLs to be used for a domain. 
- """ - - hostnames = set() - - hostnames.add(hostname) - if hostname.startswith('www.'): - hostnames.add(hostname[4:]) - else: - hostnames.add('www.' + hostname) - - return sorted(list(hostnames)) - - -def reduce_urls(urllist): - """ - Reduce a list of urls with metadata by eliminating those - that either don't work or lead somewhere else - """ - targets = set() - for url in urllist: - if url['error'] is not None: - continue - if url['redirects_to'] is not None: - targets.add(url['redirects_to']) - else: - targets.add(url['url']) - return sorted(list(targets)) - - -def normalize_title(title): - """ - Removes garbage from HTML page titles - """ - title = title.replace(u'\u00a0', ' ') - title = title.replace(' ', ' ') - title = title.strip() - return title - - -def check_responsiveness(url): - """ - Checks - - whether a page adapts to different viewport sizes - - whether a viewport meta tag exists - and returns details - """ - details = { - 'document_width': {}, - 'viewport_meta_tag': None, - } - - # sizes we check for (width, height) - sizes = ( - (320, 480), # old smartphone - (768, 1024), # older tablet or newer smartphone - (1024, 768), # older desktop or horiz. tablet - (1920, 1080), # Full HD horizontal - ) - - # Our selenium user agent using Chrome headless as an engine - chrome_options = webdriver.ChromeOptions() - chrome_options.add_argument('--headless') - chrome_options.add_argument('--disable-gpu') - chrome_options.add_argument('--no-sandbox') - chrome_options.add_argument('--disable-extensions') - driver = webdriver.Chrome(chrome_options=chrome_options) - driver.set_page_load_timeout(60) - driver.set_window_size(sizes[0][0], sizes[0][1]) - driver.get(url) - time.sleep(1) - - for (width, height) in sizes: - driver.set_window_size(width, height) - key = "%sx%s" % (width, height) - width = driver.execute_script("return document.body.scrollWidth") - details['document_width'][key] = int(width) - - try: - element = driver.find_element_by_xpath("//meta[@name='viewport']") - details['viewport_meta_tag'] = element.get_attribute('content') - except: - pass - - return details - - -def check_content(req): - """ - Adds details to check regarding content of the page - - check: the dict containing details for this URL - r: requests request/response object - """ - result = {} - - result['encoding'] = req.encoding.lower() - soup = BeautifulSoup(req.text, 'html.parser') - - result['html'] = req.text - - # page title - result['title'] = None - title = None - head = soup.find('head') - if head is not None: - title = head.find('title') - if title is not None: - result['title'] = normalize_title(title.get_text()) - - # canonical link - result['canonical_link'] = None - link = soup.find('link', rel='canonical') - if link: - result['canonical_link'] = urljoin(req.url, link.get('href')) - - # icon - result['icon'] = None - link = soup.find('link', rel=lambda x: x and x.lower() == 'icon') - if link: - result['icon'] = urljoin(req.url, link.get('href')) - else: - link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon') - if link: - result['icon'] = urljoin(req.url, link.get('href')) - - # feed links - result['feeds'] = [] - rss_links = soup.find_all('link', type='application/rss+xml') - atom_links = soup.find_all('link', type='application/atom+xml') - - if rss_links: - for link in rss_links: - result['feeds'].append(urljoin(req.url, link.get('href'))) - if atom_links: - for link in rss_links: - result['feeds'].append(urljoin(req.url, link.get('href'))) - - # generator meta tag - 
result['generator'] = None - if head is not None: - generator = head.select('[name=generator]') - if generator: - result['generator'] = generator[0].get('content') - - # opengraph meta tags - result['opengraph'] = None - opengraph = set() - if head is not None: - for item in head.find_all(property=re.compile('^og:')): - opengraph.add(item.get('property')) - for item in head.find_all(itemprop=re.compile('^og:')): - opengraph.add(item.get('itemprop')) - if opengraph: - result['opengraph'] = sorted(list(opengraph)) - - return result - - -def collect_ipv4_addresses(hostname_dict): - """ - Return list of unique IPv4 addresses - """ - ips = set() - for item in hostname_dict.values(): - if 'ip_addresses' not in item: - continue - for ip_addr in item['ip_addresses']: - ips.add(ip_addr) - return sorted(list(ips)) - - -def parse_generator(generator): - """ - Return well known CMS names from generator - """ - generator = generator.lower() - if 'typo3' in generator: - return "typo3" - if 'wordpress' in generator: - return "wordpress" - if 'drupal' in generator: - return "drupal" - if 'joomla' in generator: - return "joomla" - return generator - -def check_site(entry): - """ - Performs our site check and returns results as a dict. - - 1. Normalize the input URL and derive the URLs to check for - 2. HEAD the check urls - 3. Determine the canonical URL - 4. Run full check on canonical URL - """ - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' + - 'AppleWebKit/537.36 (KHTML, like Gecko) ' + - 'Chrome/65.0.3325.181 green-spider/0.1' - } - - # all the info we'll return for the site - result = { - # input_url: The URL we derived all checks from - 'input_url': entry['url'], - # Meta: Regional and type metadata for the site - 'meta': { - 'level': entry.get('level'), - 'state': entry.get('state'), - 'district': entry.get('district'), - 'city': entry.get('city'), - }, - # Details: All details we collected about the site (which aren't directly - # related to the report criteria) - 'details': { - 'hostnames': {}, - 'ipv4_addresses': [], - 'resolvable_urls': [], - 'canonical_urls': [], - 'urlchecks': [], - 'icons': [], - 'feeds': [], - 'cms': None, - 'responsive': None, - }, - # The actual report criteria - 'result': { - 'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0}, - 'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0}, - 'HTTPS': {'type': 'boolean', 'value': False, 'score': 0}, - 'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0}, - 'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0}, - 'FAVICON': {'type': 'boolean', 'value': False, 'score': 0}, - 'FEEDS': {'type': 'boolean', 'value': False, 'score': 0}, - 'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0}, - 'RESPONSIVE': {'type': 'boolean', 'value': False, 'score': 0}, - }, - 'score': 0.0, - } - - # derive hostnames to test (with/without www.) 
- parsed = urlparse(entry['url']) - hostnames = derive_test_hostnames(parsed.hostname) - - # try to resolve hostnames - processed_hostnames = {} - for hostname in hostnames: - - processed_hostnames[hostname] = { - 'resolvable': False, - } - - try: - hostname, aliases, ip_addresses = gethostbyname_ex(hostname) - processed_hostnames[hostname]['resolvable'] = True - processed_hostnames[hostname]['resolved_hostname'] = hostname - processed_hostnames[hostname]['aliases'] = aliases - processed_hostnames[hostname]['ip_addresses'] = ip_addresses - except: - pass - - result['details']['hostnames'] = processed_hostnames - - result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames) - - # check basic HTTP(S) reachability - checked_urls = [] - checked_urls_set = set() - - for hostname in processed_hostnames.keys(): - - item = processed_hostnames[hostname] - - if not item['resolvable']: - continue - - for scheme in ('http', 'https'): - - url = scheme + '://' + item['resolved_hostname'] + '/' - - if url in checked_urls_set: - continue - - checked_urls_set.add(url) - - record = { - 'url': url, - 'error': None, - 'redirects_to': None, - } - - try: - req = requests.head(record['url'], headers=headers, allow_redirects=True) - if req.url == url: - logging.info("URL: %s - status %s", record['url'], req.status_code) - else: - logging.info("URL: %s - status %s - redirects to %s", record['url'], - req.status_code, req.url) - record['redirects_to'] = req.url - except Exception as exc: - record['error'] = { - 'type': str(type(exc)), - 'message': str(exc), - } - logging.info("URL %s: %s %s", url, str(type(exc)), exc) - - checked_urls.append(record) - - result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url']) - result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls)) - - # Deeper test for the remaining (canonical) URL(s) - for check_url in result['details']['canonical_urls']: - - logging.info("Downloading URL %s", check_url) - - check = { - 'url': check_url, - 'status_code': None, - 'duration': None, - 'error': None, - 'content': None, - 'responsive': None, - } - - try: - req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT)) - check['status_code'] = req.status_code - check['duration'] = round(req.elapsed.microseconds / 1000) - - # Content checks - if req.status_code < 300: - check['content'] = check_content(req) - - # Responsiveness check - try: - check['responsive'] = check_responsiveness(check_url) - except Exception as exc: - logging.error("Error when checking responsiveness for '%s': %s", check_url, exc) - - except requests.exceptions.ConnectionError as exc: - logging.error(str(exc) + " " + check_url) - check['error'] = "connection" - except requests.exceptions.ReadTimeout as exc: - logging.error(str(exc) + " " + check_url) - check['error'] = "read_timeout" - except requests.exceptions.Timeout as exc: - logging.error(str(exc) + " " + check_url) - check['error'] = "connection_timeout" - except Exception as exc: - logging.error(str(exc) + " " + check_url) - check['error'] = "unknown" - - result['details']['urlchecks'].append(check) - - - result['details']['urlchecks'] = sorted(result['details']['urlchecks'], - key=lambda url: url['url']) - - # collect icons - icons = set() - for c in result['details']['urlchecks']: - if 'content' not in c: - continue - if c['content'] is None: - logging.warning("No content for %s", entry['url']) - continue - if c['content']['icon'] is not None: - icons.add(c['content']['icon']) - 
result['details']['icons'] = sorted(list(icons)) - - # collect feeds - feeds = set() - for c in result['details']['urlchecks']: - if c['content'] is None: - logging.warning("No content for %s", entry['url']) - continue - if 'feeds' in c['content'] and len(c['content']['feeds']): - for feed in c['content']['feeds']: - feeds.add(feed) - result['details']['feeds'] = sorted(list(feeds)) - - # detect responsive - viewports = set() - min_width = 2000 - for c in result['details']['urlchecks']: - if c['responsive'] is None: - continue - if c['responsive']['viewport_meta_tag'] is not None: - viewports.add(c['responsive']['viewport_meta_tag']) - widths = c['responsive']['document_width'].values() - if min(widths) < min_width: - min_width = min(widths) - result['details']['responsive'] = { - 'viewport_meta_tag': list(viewports), - 'min_width': min_width, - } - - # detect CMS - for c in result['details']['urlchecks']: - if c['content'] is None: - continue - if 'generator' not in c['content']: - continue - if c['content']['generator'] != "" and c['content']['generator'] is not None: - - result['details']['cms'] = parse_generator(c['content']['generator']) - # Qualify certain CMS flavours in more detail - if result['details']['cms'] == "typo3": - if GCMS_IP in result['details']['ipv4_addresses']: - result['details']['cms'] = "typo3-gcms" - elif 'typo3-gruene.de' in c['content']['html']: - result['details']['cms'] = "typo3-gruene" - elif result['details']['cms'] == "wordpress": - if 'Urwahl3000' in c['content']['html']: - result['details']['cms'] = "wordpress-urwahl" - - else: - # No generator Tag. Use HTML content. - if 'Urwahl3000' in c['content']['html']: - result['details']['cms'] = "wordpress-urwahl" - elif ('josephknowsbest' in c['content']['html'] or - 'Joseph-knows-best' in c['content']['html']): - result['details']['cms'] = "wordpress-josephknowsbest" - elif 'wordpress' in c['content']['html']: - result['details']['cms'] = "wordpress" - - # we can stop here - break - - - ### Derive criteria - - # DNS_RESOLVABLE_IPV4 - if result['details']['ipv4_addresses']: - result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1} - - # SITE_REACHABLE - for item in result['details']['resolvable_urls']: - if item['error'] is None: - result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1} - break - - # HTTPS - for item in result['details']['urlchecks']: - if item['error'] is None and item['url'].startswith('https://'): - result['result']['HTTPS'] = {'value': True, 'score': 2} - break - - # WWW_OPTIONAL - num_hostnames = 0 - for hostname in result['details']['hostnames'].keys(): - item = result['details']['hostnames'][hostname] - if not item['resolvable']: - continue - num_hostnames += 1 - if num_hostnames > 1: - result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1} - - # CANONICAL_URL - # - either there is only one canonical URL (through redirects) - # - or several pages have identical rel=canonical links - if len(result['details']['canonical_urls']) == 1: - result['result']['CANONICAL_URL'] = {'value': True, 'score': 1} - else: - links = set() - if result['details']['urlchecks'] is None: - logging.warning("No urlchecks for %s", entry['url']) - else: - for item in result['details']['urlchecks']: - if item['content'] is not None and item['content']['canonical_link'] is not None: - links.add(item['content']['canonical_link']) - if len(links) == 1: - result['result']['CANONICAL_URL'] = {'value': True, 'score': 1} - - # FAVICON - if result['details']['icons']: - 
result['result']['FAVICON'] = {'value': True, 'score': 1} - - # FEEDS - if result['details']['feeds']: - result['result']['FEEDS'] = {'value': True, 'score': 1} - - # HTTP_RESPONSE_DURATION - durations = [] - for item in result['details']['urlchecks']: - if item['error'] is None: - durations.append(item['duration']) - if durations: - val = round(statistics.mean(durations)) - result['result']['HTTP_RESPONSE_DURATION']['value'] = val - if val < 100: - result['result']['HTTP_RESPONSE_DURATION']['score'] = 1 - elif val < 1000: - result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5 - - # RESPONSIVE - if result['details']['responsive'] is not None: - if (result['details']['responsive']['min_width'] < 500 and - len(result['details']['responsive']['viewport_meta_tag']) > 0): - result['result']['RESPONSIVE']['value'] = True - result['result']['RESPONSIVE']['score'] = 1 - - # Overall score - for item in result['result'].keys(): - result['score'] += result['result'][item]['score'] - - # clean up - remove full HTML - for item in result['details']['urlchecks']: - try: - del item['content']['html'] - except: - pass - - return result - - -@tenacity.retry(wait=tenacity.wait_exponential(), - retry=tenacity.retry_if_exception_type(Aborted)) -def get_job_from_queue(): - """ - Returns a URL from the queue - """ - out = None - - with DATASTORE_CLIENT.transaction(): - query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND) - for entity in query.fetch(limit=1): - logging.debug("Got job: %s", entity) - out = dict(entity) - out["url"] = entity.key.name - DATASTORE_CLIENT.delete(entity.key) - - return out - -def work_of_queue(): - """ - Take job from queue and finish it until there are no more jobs - """ - while True: - job = get_job_from_queue() - if job is None: - logging.info("No more jobs. 
Exiting.") - break - - logging.info("Starting job %s", job["url"]) - result = check_site(entry=job) - #logging.debug(result) - logging.info("Job %s finished checks", job["url"]) - logging.info("Job %s writing to DB", job["url"]) - - key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"]) - entity = datastore.Entity(key=key, exclude_from_indexes=['results']) - record = { - "created": datetime.utcnow(), - "results": result, - } - entity.update(record) - try: - DATASTORE_CLIENT.put(entity) - except InvalidArgument as ex: - logging.error("Could not write result: %s", ex) - except ex: - logging.error("Could not write result: %s", ex) - - -if __name__ == "__main__": - """ - Bringing it all together - """ - parser = argparse.ArgumentParser() - parser.add_argument('--credentials-path', dest='credentials_path', - help='Path to the service account credentials JSON file', - default='/secrets/service-account.json') - parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)", - default='info') - - subparsers = parser.add_subparsers(help='sub-command help', dest='command') - - subparsers.add_parser('spider', help='Take jobs off the queue and spider') - - jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue') - - jobs_parser.add_argument('--url', help='Add a job to spider a URL') - args = parser.parse_args() - - loglevel = args.loglevel.lower() - if loglevel == 'error': - logging.basicConfig(level=logging.ERROR) - elif loglevel == 'warn': - logging.basicConfig(level=logging.WARN) - elif loglevel == 'debug': - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - loglevel = 'info' - - logging.getLogger("urllib3").setLevel(logging.CRITICAL) - - DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path) - - logging.debug("Called command %s", args.command) - - if args.command == 'jobs': - create_jobs(args.url) - else: - work_of_queue() diff --git a/spider/__init__.py b/spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/spider/spider.py b/spider/spider.py new file mode 100644 index 0000000..d1cf6de --- /dev/null +++ b/spider/spider.py @@ -0,0 +1,106 @@ +""" +Provides the spider functionality (website checks). +""" + +import argparse +import json +import logging +import re +import statistics +import time +from datetime import datetime +from pprint import pprint + +from google.api_core.exceptions import InvalidArgument +from google.cloud import datastore + +import checks +import config +import jobs +import rating + +def check_and_rate_site(entry): + """ + Performs our site check and returns results as a dict. + + 1. Normalize the input URL and derive the URLs to check for + 2. HEAD the check urls + 3. Determine the canonical URL + 4. 
Run full check on canonical URL + """ + + # all the info we'll return for the site + result = { + # input_url: The URL we derived all checks from + 'input_url': entry['url'], + # Meta: Regional and type metadata for the site + 'meta': { + 'type': entry.get('type'), + 'level': entry.get('level'), + 'state': entry.get('state'), + 'district': entry.get('district'), + 'city': entry.get('city'), + }, + # checks: Results from our checks + 'checks': {}, + # The actual report scoring criteria + 'rating': {}, + # resulting score + 'score': 0.0, + } + + # Results from our next generation checkers + result['checks'] = checks.perform_checks(entry['url']) + + result['rating'] = rating.calculate_rating(result['checks']) + + # Overall score is the sum of the individual scores + for key in result['rating']: + result['score'] += result['rating'][key]['score'] + + # remove full HTML page content, + # as it's no longer needed + try: + for url in result['checks']['page_content']: + del result['checks']['page_content'][url]['content'] + except: + pass + + return result + + +def work_of_queue(datastore_client, entity_kind): + """ + Take job from queue and finish it until there are no more jobs + """ + while True: + job = jobs.get_job_from_queue(datastore_client) + if job is None: + logging.info("No more jobs. Exiting.") + break + + logging.info("Starting job %s", job["url"]) + result = check_and_rate_site(entry=job) + + logging.debug("Full JSON representation of returned result: %s", json.dumps(result)) + + logging.info("Job %s finished checks", job["url"]) + logging.info("Job %s writing to DB", job["url"]) + + key = datastore_client.key(entity_kind, job["url"]) + entity = datastore.Entity(key=key, exclude_from_indexes=['results']) + record = { + 'created': datetime.utcnow(), + 'meta': result['meta'], + 'checks': result['checks'], + 'rating': result['rating'], + 'score': result['score'], + } + entity.update(record) + try: + datastore_client.put(entity) + except InvalidArgument as ex: + logging.error("Could not write result: %s", ex) + except Exception as ex: + logging.error("Could not write result: %s", ex) + diff --git a/spider/spider_test.py b/spider/spider_test.py new file mode 100644 index 0000000..dda55e7 --- /dev/null +++ b/spider/spider_test.py @@ -0,0 +1,26 @@ +import unittest + +from spider.spider import check_and_rate_site + +from pprint import pprint + +class TestSpiderr(unittest.TestCase): + + def test_url1(self): + + entry = { + "url": "https://httpbin.org/html", + "type": "type", + "state": "state", + "level": "level", + "district": "district", + "city": "city", + } + + url = "https://httpbin.org/html" + result = check_and_rate_site(entry) + + self.assertEqual(result["input_url"], url) + +if __name__ == '__main__': + unittest.main() diff --git a/spider_test.py b/spider_test.py deleted file mode 100644 index a617147..0000000 --- a/spider_test.py +++ /dev/null @@ -1,125 +0,0 @@ -import unittest -import requests -import responses -import spider - - -class TestDeriveHostnames(unittest.TestCase): - - def test_basic1(self): - hn = spider.derive_test_hostnames('www.my-domain.de') - expected = ['my-domain.de', 'www.my-domain.de'] - self.assertEqual(hn, expected) - - def test_basic2(self): - hn = spider.derive_test_hostnames('domain.de') - expected = ['domain.de', 'www.domain.de'] - self.assertEqual(hn, expected) - - -class TestReduceURLs(unittest.TestCase): - - def test_basic(self): - testdata = [ - {'url': 'one', 'error': None, 'redirects_to': None}, - {'url': 'two', 'error': 'Yes', 'redirects_to': 
None}, - {'url': 'three', 'error': None, 'redirects_to': 'five'}, - ] - expected_result = ['five', 'one'] - result = spider.reduce_urls(testdata) - self.assertEqual(result, expected_result) - - -class TestContentChecks(unittest.TestCase): - - @responses.activate - def test_minimal(self): - url = 'http://my.url' - responses.add(responses.GET, url, status=200, - content_type='text/html', - body='') - r = requests.get(url) - result = spider.check_content(r) - - del result['html'] # don't want to have the messy HTML part in comparison - - expected_result = { - 'icon': None, - 'title': None, - 'generator': None, - 'feeds': [], - 'encoding': 'iso-8859-1', - 'canonical_link': None, - 'opengraph': None - } - self.assertDictEqual(result, expected_result) - - @responses.activate - def test_basic(self): - url = 'http://my.url' - responses.add(responses.GET, url, status=200, - content_type='text/html; charset=UTF-8', - body=''' - - - - The page's title - - - - - - - ''') - r = requests.get(url) - result = spider.check_content(r) - - del result['html'] # don't want to have the messy HTML part in comparison - - expected_result = { - 'icon': 'http://foo.bar/image.png', - 'title': 'The page\'s title', - 'generator': 'some-cms/1.0', - 'feeds': [ - 'http://example.com/feed', - ], - 'encoding': 'utf-8', - 'canonical_link': 'https://my.site.com/', - 'opengraph': None - } - self.assertDictEqual(result, expected_result) - - @responses.activate - def test_opengraph(self): - url = 'http://my.url' - responses.add(responses.GET, url, status=200, - content_type='text/html; charset=UTF-8', - body=''' - - - - - - - - - ''') - r = requests.get(url) - result = spider.check_content(r) - - del result['html'] # don't want to have the messy HTML part in comparison - - expected_result = { - 'icon': None, - 'title': None, - 'generator': None, - 'feeds': [], - 'encoding': 'utf-8', - 'canonical_link': None, - 'opengraph': ['og:image', 'og:title', 'og:type', 'og:url'], - } - self.assertDictEqual(result, expected_result) - - -if __name__ == '__main__': - unittest.main()
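
A note on the extension point this modularization creates: a new rating criterion is simply a module under rating/ that exposes a Rater class following the AbstractRater contract above (rating_type, default_value, depends_on_checks, max_score and a rate() method), plus an entry in the rating_modules dict in rating/__init__.py. The sketch below is a hypothetical illustration and not part of this patch; the 'text_length' check it depends on does not exist in the codebase, and the threshold is made up.

# rating/text_length.py -- hypothetical example, not part of this patch
"""
Would give a score if at least one page contains a substantial amount of text.
Depends on a made-up 'text_length' check that maps URLs to character counts.
"""

from rating.abstract_rater import AbstractRater

class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['text_length']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        for url in self.check_results['text_length']:
            # arbitrary threshold, for illustration only
            if self.check_results['text_length'][url] > 500:
                value = True
                score = self.max_score
                break

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

Registered as, say, 'TEXT_LENGTH': text_length in rating_modules, its score would be summed into the overall result by check_and_rate_site like any other criterion; for instance, a site that passes only HTTPS (max_score 2), lands in the 0.5 bucket for HTTP_RESPONSE_DURATION and passes this hypothetical criterion would end up with an overall score of 3.5.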
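
For orientation, a rough sketch of how the refactored modules could be wired together end to end. This is illustrative only and not taken from the patch: the service account path and the 'spider-results' kind are carried over from the deleted spider.py defaults and may not match the actual entry point, which is outside this excerpt.

# Illustrative wiring only -- not part of this patch.
import logging

from google.cloud import datastore

import jobs
from spider.spider import work_of_queue

logging.basicConfig(level=logging.INFO)

# Datastore client, constructed the same way the removed spider.py did.
datastore_client = datastore.Client.from_service_account_json(
    '/secrets/service-account.json')

# 1. Refresh the green-directory clone and enqueue one job per website URL.
jobs.create_jobs(datastore_client)

# 2. Work the queue until it is empty; each finished job is checked, rated and
#    written to the given results kind ('spider-results' in the pre-refactor code).
work_of_queue(datastore_client, 'spider-results')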