diff --git a/.dockerignore b/.dockerignore
index ae2f869..e5d5555 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -4,3 +4,4 @@ docs
secrets
temp
venv
+/export-*
diff --git a/.gitignore b/.gitignore
index 1d3b3da..a536d3d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ __pycache__
.vscode/settings.json
webapp/dist/bundle.js
dev-shm
+/export-*
diff --git a/.travis.yml b/.travis.yml
index df1ac71..bd97de9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,5 +6,12 @@ services:
notifications:
email: false
+language: python
+python:
+ - "3.6"
+
script:
+ - pip install --upgrade pip
+ - pip install --upgrade codecov
- make test
+ - codecov
diff --git a/Dockerfile b/Dockerfile
index a53042b..186ac63 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,17 +1,20 @@
-FROM python:3.6-alpine3.7
+FROM python:3.6-alpine3.8
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.7/main" >> /etc/apk/repositories && \
echo "http://dl-4.alpinelinux.org/alpine/v3.7/community" >> /etc/apk/repositories && \
apk update && \
- apk --no-cache add chromium chromium-chromedriver python3-dev build-base git && \
+ apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
pip3 install --upgrade pip && \
- pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
+ pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
apk del python3-dev build-base
-ADD spider.py /
-ADD spider_test.py /
-ADD data_export.py /
+ADD cli.py /
+ADD config /config
+ADD jobs /jobs
+ADD checks /checks
+ADD rating /rating
+ADD spider /spider
+ADD export /export
-ENTRYPOINT ["python3"]
-CMD ["/spider.py"]
+ENTRYPOINT ["python3", "/cli.py"]
diff --git a/Makefile b/Makefile
index 075496d..9b85fa4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,20 @@
+IMAGE := quay.io/netzbegruenung/green-spider:latest
+DB_ENTITY := spider-results
.PHONY: dockerimage
# Build docker image
dockerimage:
- docker build -t quay.io/netzbegruenung/green-spider:latest .
+ docker build -t $(IMAGE) .
# Create spider job queue
spiderjobs: dockerimage
docker run --rm -ti \
-v $(PWD)/secrets:/secrets \
- quay.io/netzbegruenung/green-spider:latest spider.py \
+ $(IMAGE) \
--credentials-path /secrets/datastore-writer.json \
- --loglevel debug \
+ --loglevel info \
jobs
# Run spider in docker image
@@ -21,11 +23,26 @@ spider: dockerimage
-v $(PWD)/dev-shm:/dev/shm \
-v $(PWD)/webapp/dist/data:/out \
-v $(PWD)/secrets:/secrets \
- quay.io/netzbegruenung/green-spider:latest spider.py \
+ $(IMAGE) \
--credentials-path /secrets/datastore-writer.json \
- --loglevel info \
- spider
+ --loglevel debug \
+ spider --kind $(DB_ENTITY)
+
+export: dockerimage
+ docker run --rm -ti \
+ -v $(PWD)/export-json:/out \
+ -v $(PWD)/secrets:/secrets \
+ -v $(PWD)/export-siteicons:/icons \
+ $(IMAGE) \
+ --credentials-path /secrets/datastore-reader.json \
+ --loglevel debug \
+ export --kind $(DB_ENTITY)
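+
+# Example (illustrative): export from a non-default entity kind by overriding the make variable:
+#   make export DB_ENTITY=spider-results-dev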
# run spider tests
+# FIXME
test: dockerimage
- docker run --rm -ti quay.io/netzbegruenung/green-spider:latest /spider_test.py
+ docker run --rm -ti \
+ --entrypoint "python3" \
+ $(IMAGE) \
+ -m unittest discover -p '*_test.py'
+
diff --git a/checks/__init__.py b/checks/__init__.py
new file mode 100644
index 0000000..cc2d0f0
--- /dev/null
+++ b/checks/__init__.py
@@ -0,0 +1,64 @@
+"""
+The checks module contains the functionality to get information and test certain
+functionality of a site or individual pages.
+"""
+
+import logging
+
+from checks import charset
+from checks import certificate
+from checks import dns_resolution
+from checks import duplicate_content
+from checks import domain_variations
+from checks import generator
+from checks import html_head
+from checks import http_and_https
+from checks import page_content
+from checks import load_in_browser
+from checks import url_reachability
+from checks import url_canonicalization
+
+from checks.config import Config
+
+
+def perform_checks(input_url):
+ """
+ Executes all our URL/site checks and returns a dict of results, keyed by check name.
+ """
+
+ # The sequence of checks to run. Order is important!
+ # Checks which expand the URLs list must come first.
+ # After that, dependencies (encoded in the checks) have to be fulfilled.
+ check_modules = [
+ ('domain_variations', domain_variations),
+ ('http_and_https', http_and_https),
+ ('dns_resolution', dns_resolution),
+ ('url_reachability', url_reachability),
+ ('certificate', certificate),
+ ('url_canonicalization', url_canonicalization),
+ ('page_content', page_content),
+ ('duplicate_content', duplicate_content),
+ ('charset', charset),
+ ('html_head', html_head),
+ ('generator', generator),
+ ('load_in_browser', load_in_browser),
+ ]
+
+ results = {}
+
+ config = Config(urls=[input_url],
+ user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
+ 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
+ 'Safari/537.36 green-spider/0.2')
+
+ for check_name, check in check_modules:
+ checker = check.Checker(config=config,
+ previous_results=results)
+ result = checker.run()
+ results[check_name] = result
+
+ # update config for the next check
+ config = checker.config
+ logging.debug("config after check %s: %r" % (check_name, config))
+
+ return results
diff --git a/checks/abstract_checker.py b/checks/abstract_checker.py
new file mode 100644
index 0000000..e9db12a
--- /dev/null
+++ b/checks/abstract_checker.py
@@ -0,0 +1,23 @@
+class AbstractChecker(object):
+ """
+ Our blueprint for checks
+ """
+
+ def __init__(self, config, previous_results=None):
+ self._config = config
+
+ # A dictionary of results from previous checkers.
+ # Key is the name of the checker that has generated the result.
+ self._previous_results = previous_results
+
+ def run(self):
+ """Executes the check routine, returns result dict"""
+ raise NotImplementedError()
+
+ @property
+ def config(self):
+ return self._config
+
+ @property
+ def previous_results(self):
+ return self._previous_results
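+
+
+# Minimal sketch of a concrete checker built on this blueprint (illustrative only):
+#
+#     class DummyChecker(AbstractChecker):
+#         def run(self):
+#             # return one result entry per configured URL
+#             return {url: {'ok': True} for url in self.config.urls}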
diff --git a/checks/certificate.py b/checks/certificate.py
new file mode 100644
index 0000000..2539963
--- /dev/null
+++ b/checks/certificate.py
@@ -0,0 +1,62 @@
+"""
+Gathers information on the TLS/SSL certificate used by a server
+"""
+
+from urllib.parse import urlparse
+import logging
+import ssl
+from datetime import datetime
+from datetime import timezone
+
+from OpenSSL import crypto
+
+from checks.abstract_checker import AbstractChecker
+
+class Checker(AbstractChecker):
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ results = {}
+
+ for url in self.config.urls:
+ if url.startswith('https://'):
+ results[url] = self.get_certificate(url)
+
+ return results
+
+ def get_certificate(self, url):
+ result = {
+ 'exception': None,
+ 'serial_number': None,
+ 'subject': None,
+ 'issuer': None,
+ 'not_before': None,
+ 'not_after': None
+ }
+
+ parsed = urlparse(url)
+ try:
+ cert = ssl.get_server_certificate((parsed.hostname, 443))
+ x509 = crypto.load_certificate(crypto.FILETYPE_PEM, cert)
+ result['serial_number'] = str(x509.get_serial_number())
+
+ nb = x509.get_notBefore().decode('utf-8')
+ na = x509.get_notAfter().decode('utf-8')
+
+ # parse timestamps like '20180627000000Z'
+ result['not_before'] = datetime(int(nb[0:4]), int(nb[4:6]), int(nb[6:8]), int(nb[8:10]), int(nb[10:12]), int(nb[12:14]), tzinfo=timezone.utc).isoformat()
+ result['not_after'] = datetime(int(na[0:4]), int(na[4:6]), int(na[6:8]), int(na[8:10]), int(na[10:12]), int(na[12:14]), tzinfo=timezone.utc).isoformat()
+
+ # decode and convert from bytes to unicode
+ result['subject'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for tup in x509.get_subject().get_components()])
+ result['issuer'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for tup in x509.get_issuer().get_components()])
+
+ except Exception as e:
+ result['exception'] = {
+ 'type': str(type(e)),
+ 'message': str(e),
+ }
+ logging.warning("Error when getting certificate for %s: %r" % (url, e))
+
+ return result
diff --git a/checks/certificate_test.py b/checks/certificate_test.py
new file mode 100644
index 0000000..66c2288
--- /dev/null
+++ b/checks/certificate_test.py
@@ -0,0 +1,27 @@
+from checks import certificate
+from checks.config import Config
+import unittest
+
+class TestCertificateChecker(unittest.TestCase):
+
+ def test_google(self):
+ url = 'https://www.google.com/'
+ config = Config(urls=[url])
+ checker = certificate.Checker(config=config, previous_results={})
+ result = checker.run()
+ self.assertIn(url, result)
+ self.assertIsNone(result[url]['exception'])
+ self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services')
+
+ def test_kaarst(self):
+ url = 'https://www.gruenekaarst.de/'
+ config = Config(urls=[url])
+ checker = certificate.Checker(config=config, previous_results={})
+ result = checker.run()
+ self.assertIn(url, result)
+ self.assertIsNone(result[url]['exception'])
+ self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/checks/charset.py b/checks/charset.py
new file mode 100644
index 0000000..0851dd2
--- /dev/null
+++ b/checks/charset.py
@@ -0,0 +1,77 @@
+"""
+Checks which character set a page has.
+
+TODO: Check for http-equiv meta tags like <meta http-equiv="content-type" content="text/html; charset=utf-8">
+
+"""
+
+import logging
+
+from bs4 import BeautifulSoup
+
+from checks.abstract_checker import AbstractChecker
+
+class Checker(AbstractChecker):
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ assert 'page_content' in self.previous_results
+
+ results = {}
+
+ for url in self.config.urls:
+ results[url] = self.get_charset(url)
+
+ return results
+
+ def get_charset(self, url):
+ """
+ Expects page_content_dict['content'] to carry the HTML content
+ """
+ page_content = self.previous_results['page_content'][url]
+ assert 'content' in page_content
+ assert 'response_headers' in page_content
+ logging.debug("%r", page_content['response_headers'])
+ assert 'content-type' in page_content['response_headers']
+
+ if page_content['content'] is None:
+ return
+
+ result = {
+ 'meta_charset_tag': None,
+ 'content_type_header_charset': None,
+ 'charset': 'iso-8859-1', # ISO-8859-1 is the default according to https://www.w3.org/International/articles/http-charset/index
+ 'valid': None,
+ 'exception': None,
+ }
+
+ soup = BeautifulSoup(page_content['content'], 'html.parser')
+
+ # get response header charset
+ if ('content-type' in page_content['response_headers']
+ and 'charset=' in page_content['response_headers']['content-type']):
+ parts = page_content['response_headers']['content-type'].split("charset=", 1)
+ result['content_type_header_charset'] = parts[1].lower()
+ result['charset'] = parts[1].lower()
+
+ # get meta tag charset
+ metatags = soup.find_all('meta')
+ for tag in metatags:
+ if 'charset' in tag.attrs:
+ result['meta_charset_tag'] = tag['charset'].lower()
+ # meta tag overrules any previous value
+ result['charset'] = tag['charset'].lower()
+
+ # check for charset plausibility (only for most common ones)
+ if result['charset'] in ('iso-8859-1', 'utf-8'):
+ try:
+ _ = page_content['content'].encode(result['charset'])
+ except UnicodeEncodeError as e:
+ result['valid'] = False
+ result['exception'] = str(e)
+ else:
+ result['valid'] = True
+
+
+ return result
diff --git a/checks/charset_test.py b/checks/charset_test.py
new file mode 100644
index 0000000..cce7677
--- /dev/null
+++ b/checks/charset_test.py
@@ -0,0 +1,49 @@
+import httpretty
+from httpretty import httprettified
+import unittest
+
+from checks import charset
+from checks import page_content
+from checks.config import Config
+
+@httprettified
+class TestCharsetChecker(unittest.TestCase):
+
+ def test_http_response(self):
+ url = 'http://www.example.com/'
+ httpretty.register_uri(httpretty.GET, url,
+ body="""
+
+
+
+ Hello
+
+ """,
+ adding_headers={
+ "Content-Type": "text/html; charset=ISO-8859-1",
+ })
+
+ results = {}
+
+ config = Config(urls=[url])
+ page_content_checker = page_content.Checker(config=config, previous_results={})
+ results['page_content'] = page_content_checker.run()
+
+ self.assertIn(url, results['page_content'])
+ self.assertIn('response_headers', results['page_content'][url])
+ self.assertIn('content-type', results['page_content'][url]['response_headers'])
+
+ charset_checker = charset.Checker(config=page_content_checker.config, previous_results=results)
+ result = charset_checker.run()
+
+ self.assertIn(url, result)
+ self.assertEqual(result[url], {
+ 'meta_charset_tag': 'utf-8',
+ 'content_type_header_charset': 'iso-8859-1',
+ 'charset': 'utf-8',
+ 'valid': True,
+ 'exception': None,
+ })
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/checks/config.py b/checks/config.py
new file mode 100644
index 0000000..d164b00
--- /dev/null
+++ b/checks/config.py
@@ -0,0 +1,29 @@
+class Config(object):
+ """
+ Our configuration to be passed to checks
+ """
+
+ def __init__(self, urls, user_agent='green-spider/1.0'):
+ self._urls = set(urls)
+ self._user_agent = user_agent
+
+ def __repr__(self):
+ return "Config(urls=%r)" % self._urls
+
+ @property
+ def urls(self):
+ return list(self._urls)
+
+ def add_url(self, url):
+ self._urls.add(url)
+
+ def remove_url(self, url):
+ """Removes url from urls, if it was in there. Ignores errors."""
+ try:
+ self._urls.remove(url)
+ except KeyError:
+ pass
+
+ @property
+ def user_agent(self):
+ return self._user_agent
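+
+
+# Illustrative usage:
+#
+#     config = Config(urls=['https://example.com/'], user_agent='green-spider/test')
+#     config.add_url('http://example.com/')
+#     config.urls  # -> deduplicated list of URLs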
diff --git a/checks/dns_resolution.py b/checks/dns_resolution.py
new file mode 100644
index 0000000..efd3b05
--- /dev/null
+++ b/checks/dns_resolution.py
@@ -0,0 +1,55 @@
+"""
+This check attempts to resolve all hostnames/domains in the input URLs.
+
+URLs which are not resolvable are removed from the config.
+"""
+
+import logging
+from socket import gethostbyname_ex
+from urllib.parse import urlparse
+from urllib.parse import urlunparse
+
+from checks.abstract_checker import AbstractChecker
+
+
+class Checker(AbstractChecker):
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ """Executes the check routine, returns result dict"""
+
+ results = {}
+
+ urls = list(self.config.urls)
+ for url in urls:
+ parsed = urlparse(url)
+
+ results[url] = self.resolve_hostname(parsed.hostname)
+
+ # remove URL if non-resolvable
+ if not results[url]['resolvable']:
+ self.config.remove_url(url)
+
+ return results
+
+ def resolve_hostname(self, hostname):
+ """
+ Resolve one hostname to IPv4 address(es)
+ """
+ result = {
+ 'hostname': hostname,
+ 'resolvable': False,
+ 'aliases': [],
+ 'ipv4_addresses': [],
+ }
+
+ try:
+ hostname, aliases, ipv4_addresses = gethostbyname_ex(hostname)
+ result['resolvable'] = True
+ result['aliases'] = aliases
+ result['ipv4_addresses'] = ipv4_addresses
+ except Exception as e:
+ logging.debug("Hostname %s not resolvable. Exception: %r" % (hostname, e))
+
+ return result
diff --git a/checks/domain_variations.py b/checks/domain_variations.py
new file mode 100644
index 0000000..ab621ea
--- /dev/null
+++ b/checks/domain_variations.py
@@ -0,0 +1,44 @@
+"""
+This adds commonly tried variations of domains/subdomains to the URLs config.
+"""
+
+import logging
+
+from urllib.parse import urlparse
+from urllib.parse import urlunparse
+
+from checks.abstract_checker import AbstractChecker
+
+
+class Checker(AbstractChecker):
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ urls = list(self.config.urls)
+ for url in urls:
+ parsed = urlparse(url)
+ hostnames = self.expand_hostname(parsed.hostname)
+
+ for hostname in hostnames:
+ self.config.add_url(urlunparse((parsed.scheme, hostname,
+ parsed.path, parsed.params, parsed.query, parsed.fragment)))
+
+ return None
+
+
+ def expand_hostname(self, hostname):
+ """
+ Create variations of subdomains
+ """
+ hostnames = set()
+
+ hostnames.add(hostname)
+ if hostname.startswith('www.'):
+ # remove 'www.' prefix
+ hostnames.add(hostname[4:])
+ else:
+ # add 'www.' prefix
+ hostnames.add('www.' + hostname)
+
+ return sorted(list(hostnames))
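+
+
+# Illustrative: expand_hostname('gruene-beispiel.de') returns
+# ['gruene-beispiel.de', 'www.gruene-beispiel.de'], so both variants
+# end up in config.urls.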
diff --git a/checks/duplicate_content.py b/checks/duplicate_content.py
new file mode 100644
index 0000000..9556902
--- /dev/null
+++ b/checks/duplicate_content.py
@@ -0,0 +1,107 @@
+"""
+This checker looks at the similarity between previously downloaded pages
+and removes duplicates from the config URLs
+"""
+
+import logging
+
+import html_similarity
+
+from checks.abstract_checker import AbstractChecker
+
+
+class Checker(AbstractChecker):
+
+ # value above which we consider a page pair a duplicate
+ similarity_threshold = 0.99999
+
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+
+ def run(self):
+
+ if len(self.config.urls) == 1:
+ # nothing to do for us
+ return
+
+ urls = list(self.config.urls)
+
+ # get content
+ content = {}
+
+ assert 'page_content' in self.previous_results
+
+ for url in urls:
+ page_content = self.previous_results['page_content'][url]
+
+ if page_content['content'] is None:
+ logging.warn("Content for URL %s is None" % url)
+
+ content[url] = page_content['content']
+
+ pairs = self.compare_pairwise(content)
+
+ # remove duplicates
+ for key in pairs:
+ if pairs[key]['similarity'] is None:
+ continue
+ if pairs[key]['similarity'] > self.similarity_threshold:
+ # this pair is a duplicate.
+ # Decide which one to keep
+ url1, url2 = key.split(" ", 1)
+ reject = self.select_url_to_reject(url1, url2)
+ self.config.remove_url(reject)
+
+ return pairs
+
+
+ def compare_pairwise(self, content):
+ # compare pairwise
+ pairs = {}
+
+ for url1 in content:
+ for url2 in content:
+
+ if url1 == url2:
+ continue
+
+ # avoid checking pairs twice
+ pair_key = " ".join(sorted([url1, url2]))
+ if pair_key in pairs:
+ continue
+
+ try:
+ s = html_similarity.similarity(content[url1], content[url2])
+ logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s)
+ pairs[pair_key] = {
+ 'similarity': s,
+ 'exception': None,
+ }
+ except (AttributeError, ValueError) as e:
+ logging.error("html_similarity.similarity thre exception for URL pair %s and %s: %s", url1, url2, e)
+ pairs[pair_key] = {
+ 'similarity': None,
+ 'exception': str(e),
+ }
+
+ return pairs
+
+
+ def select_url_to_reject(self, url1, url2):
+ """Determine which of two URLs to keep, which to reject"""
+
+ # HTTPS takes precedence
+ if url1.startswith('https://') and not url2.startswith('https://'):
+ return url2
+ elif url2.startswith('https://') and not url1.startswith('https://'):
+ return url1
+
+ # Shorter URL wins
+ if len(url1) < len(url2):
+ return url2
+ elif len(url1) > len(url2):
+ return url1
+
+ # default behaviour
+ return url1
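+
+
+# Illustrative behaviour of select_url_to_reject:
+#   ('https://a.de/', 'http://a.de/')    -> rejects 'http://a.de/' (HTTPS wins)
+#   ('http://a.de/', 'http://www.a.de/') -> rejects 'http://www.a.de/' (shorter URL wins)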
diff --git a/checks/generator.py b/checks/generator.py
new file mode 100644
index 0000000..5a1968b
--- /dev/null
+++ b/checks/generator.py
@@ -0,0 +1,76 @@
+"""
+Checks the 'generator' meta tag and page content properties
+to detect well-known content management systems, themes etc.
+"""
+
+import logging
+
+from checks.abstract_checker import AbstractChecker
+
+class Checker(AbstractChecker):
+
+ # IP address of the newthinking GCMS server
+ gcms_ip = "91.102.13.20"
+
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ assert 'page_content' in self.previous_results
+ assert 'html_head' in self.previous_results
+
+ results = {}
+
+ for url in self.config.urls:
+ results[url] = self.get_generator(url)
+
+ return results
+
+
+ def get_generator(self, url):
+ page_content = self.previous_results['page_content'][url]
+ assert 'content' in page_content
+
+ assert 'dns_resolution' in self.previous_results
+ dns_resolution = self.previous_results['dns_resolution']
+
+ head = self.previous_results['html_head'][url]
+
+ generator = None
+
+ if 'generator' in head and head['generator'] is not None:
+ generator = head['generator'].lower()
+ if 'typo3' in generator:
+ generator = 'typo3'
+ if 'wordpress' in generator:
+ generator = 'wordpress'
+ if 'drupal' in generator:
+ generator = 'drupal'
+ if 'joomla' in generator:
+ generator = 'joomla'
+
+ # Qualify certain CMS flavours in more detail
+ if generator == "typo3":
+ # Typo3-Gruene advertises in the page content
+ if 'typo3-gruene.de' in page_content['content']:
+ generator = "typo3-gruene"
+ # newthinking GCMS in some page hrefs
+ elif 'ntc_gcms' in page_content['content']:
+ generator = "typo3-gcms"
+ # check if one of the IPs matches the well-known GCMS Server IP
+ elif url in dns_resolution:
+ for addr in dns_resolution[url]['ipv4_addresses']:
+ if addr == self.gcms_ip:
+ generator = "typo3-gcms"
+
+ elif 'Urwahl3000' in page_content['content']:
+ generator = "wordpress-urwahl"
+
+ elif ('josephknowsbest' in page_content['content'] or
+ 'Joseph-knows-best' in page_content['content']):
+ generator = "wordpress-josephknowsbest"
+
+ elif 'wordpress' in page_content['content']:
+ generator = "wordpress"
+
+ return generator
diff --git a/checks/html_head.py b/checks/html_head.py
new file mode 100644
index 0000000..4d3391e
--- /dev/null
+++ b/checks/html_head.py
@@ -0,0 +1,152 @@
+"""
+Extracts information from the html head, like existence and value
+of certain meta tags, link tags, title, etc.
+"""
+
+import logging
+import re
+from urllib.parse import urljoin
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+
+from checks.abstract_checker import AbstractChecker
+
+class Checker(AbstractChecker):
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ results = {}
+
+ for url in self.config.urls:
+ results[url] = self.get_content(url)
+
+ return results
+
+ def get_content(self, url):
+ """
+ Expects page_content_dict['content'] to carry the HTML content
+ """
+
+ page_content = self.previous_results['page_content'][url]
+ assert 'content' in page_content
+ assert 'response_headers' in page_content
+ assert 'content-type' in page_content['response_headers']
+
+ if page_content['content'] is None:
+ return
+
+ soup = BeautifulSoup(page_content['content'], 'html.parser')
+ head = soup.find('head')
+
+ result = {
+ 'title': self.get_title(head),
+ 'link_canonical': self.get_link_canonical(head, url),
+ 'link_rss_atom': self.get_link_rss_atom(head, url),
+ 'link_icon': self.get_link_icon(head, url),
+ 'generator': self.get_generator(head),
+ 'opengraph': self.get_opengraph(head),
+ 'viewport': self.get_viewport(head),
+ }
+
+ return result
+
+
+ def get_title(self, head):
+ """Extract and clean up page title"""
+ if head is None:
+ return
+
+ title = None
+
+ tag = head.find('title')
+ if tag is None:
+ return
+
+ title = tag.get_text()
+
+ # clean up
+ title = title.replace(u'\u00a0', ' ')
+ title = title.replace(' ', ' ')
+ title = title.strip()
+
+ return title
+
+
+ def get_link_canonical(self, head, url):
+ if head is None:
+ return
+ link = head.find('link', rel='canonical')
+ if link:
+ return urljoin(url, link.get('href'))
+
+
+ def get_link_rss_atom(self, head, url):
+ if head is None:
+ return
+ hrefs = []
+ rss_links = head.find_all('link', type='application/rss+xml')
+ atom_links = head.find_all('link', type='application/atom+xml')
+
+ if rss_links:
+ for link in rss_links:
+ hrefs.append(link.get('href'))
+ if atom_links:
+ for link in atom_links:
+ hrefs.append(link.get('href'))
+
+ # make URLs absolute
+ for i in range(len(hrefs)):
+ parsed = urlparse(hrefs[i])
+ if parsed.scheme == '':
+ hrefs[i] = urljoin(url, hrefs[i])
+
+ return hrefs
+
+
+ def get_link_icon(self, head, url):
+ if head is None:
+ return
+
+ tag = head.find('link', rel=lambda x: x and x.lower() == 'icon')
+ if tag:
+ return urljoin(url, tag.get('href'))
+ tag = head.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
+ if tag:
+ return urljoin(url, tag.get('href'))
+
+
+ def get_generator(self, head):
+ if head is None:
+ return
+
+ tags = head.select('[name=generator]')
+ if tags:
+ return tags[0].get('content')
+
+
+ def get_opengraph(self, head):
+ if head is None:
+ return
+
+ # we find tags by matching this property/itemprop value regex
+ property_re = re.compile('^og:')
+
+ opengraph = set()
+ for tag in head.find_all(property=property_re):
+ opengraph.add(tag.get('property'))
+ for tag in head.find_all(itemprop=property_re):
+ opengraph.add(tag.get('itemprop'))
+
+ opengraph = sorted(list(opengraph))
+ if opengraph != []:
+ return opengraph
+
+
+ def get_viewport(self, head):
+ if head is None:
+ return
+ tags = head.select('[name=viewport]')
+ if tags:
+ return tags[0].get('content')
diff --git a/checks/http_and_https.py b/checks/http_and_https.py
new file mode 100644
index 0000000..79d0b6b
--- /dev/null
+++ b/checks/http_and_https.py
@@ -0,0 +1,27 @@
+"""
+This adds, for every HTTP URL, the HTTPS counterpart,
+and vice versa, to config.urls
+
+So it doesn't actually perform tests. It only expands the
+URLs to test by other checks.
+"""
+
+from checks.abstract_checker import AbstractChecker
+
+class Checker(AbstractChecker):
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ """
+ Adds URLs to config.urls, returns nothing
+ """
+
+ for url in self.config.urls:
+
+ if url.startswith('https://'):
+ self.config.add_url('http://' + url[8:])
+ elif url.startswith('http://'):
+ self.config.add_url('https://' + url[7:])
+
+ return None
\ No newline at end of file
diff --git a/checks/load_in_browser.py b/checks/load_in_browser.py
new file mode 100644
index 0000000..2ab3af6
--- /dev/null
+++ b/checks/load_in_browser.py
@@ -0,0 +1,134 @@
+"""
+Collects information by loading pages in a browser.
+
+Information includes:
+
+- whether the document width adapts well to viewports as narrow as 360 pixels
+- whether javascript errors or errors from missing resources occur
+- collects CSS font-family properties in use
+"""
+
+import logging
+import time
+
+from selenium import webdriver
+from selenium.common.exceptions import StaleElementReferenceException
+from selenium.common.exceptions import TimeoutException
+import tenacity
+
+from checks.abstract_checker import AbstractChecker
+
+
+class Checker(AbstractChecker):
+
+ page_load_timeout = 20
+
+ # sizes we check for (width, height)
+ sizes = (
+ (360, 640), # rather old smartphone
+ (768, 1024), # older tablet or newer smartphone
+ (1024, 768), # older desktop or horiz. tablet
+ (1920, 1080), # Full HD horizontal
+ )
+
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ # Our selenium user agent using Chrome headless as an engine
+ chrome_options = webdriver.ChromeOptions()
+ chrome_options.add_argument('--headless')
+ chrome_options.add_argument('--disable-gpu')
+ chrome_options.add_argument('--no-sandbox')
+ chrome_options.add_argument('--disable-extensions')
+ self.driver = webdriver.Chrome(options=chrome_options)
+ self.driver.set_page_load_timeout(self.page_load_timeout)
+
+ def run(self):
+
+ results = {}
+ for url in self.config.urls:
+
+ results[url] = {
+ 'sizes': None,
+ 'min_document_width': None,
+ 'logs': None,
+ 'font_families': None,
+ }
+
+ # responsive check
+ try:
+ sizes = self.check_responsiveness(url)
+ results[url] = {
+ 'sizes': sizes,
+ 'min_document_width': min([s['document_width'] for s in sizes]),
+ 'logs': self.capture_log(),
+ }
+ except TimeoutException as e:
+ logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
+ pass
+ except tenacity.RetryError as re:
+ logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
+ pass
+
+ # CSS collection
+ font_families = None
+
+ try:
+ elements = self.driver.find_elements_by_xpath("//*")
+ font_families = set()
+ for element in elements:
+ try:
+ font_family = element.value_of_css_property('font-family')
+ if font_family is None:
+ continue
+ font_families.add(font_family.lower())
+ except StaleElementReferenceException as e:
+ logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
+ continue
+
+ results[url]['font_families'] = sorted(list(font_families))
+
+ except TimeoutException as e:
+ logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
+ pass
+
+ self.driver.quit()
+
+ return results
+
+
+ @tenacity.retry(stop=tenacity.stop_after_attempt(3),
+ retry=tenacity.retry_if_exception_type(TimeoutException))
+ def check_responsiveness(self, url):
+ result = []
+
+ # set window to the first size initially
+ self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1])
+ self.driver.get(url)
+
+ # give the page some time to load
+ time.sleep(10)
+
+ for (width, height) in self.sizes:
+ self.driver.set_window_size(width, height)
+
+ # wait for re-render/re-flow
+ time.sleep(1.0)
+ doc_width = self.driver.execute_script("return document.body.scrollWidth")
+
+ result.append({
+ 'viewport_width': width,
+ 'document_width': int(doc_width),
+ })
+
+ return result
+
+ def capture_log(self):
+ """
+ Returns browser log entries with level WARNING or SEVERE
+ """
+ entries = []
+ for entry in self.driver.get_log('browser'):
+ if entry['level'] in ('WARNING', 'SEVERE'):
+ entries.append(entry)
+ return entries
diff --git a/checks/page_content.py b/checks/page_content.py
new file mode 100644
index 0000000..d036274
--- /dev/null
+++ b/checks/page_content.py
@@ -0,0 +1,94 @@
+"""
+This check downloads the HTML page for each URL
+"""
+
+import logging
+
+import requests
+
+from checks.abstract_checker import AbstractChecker
+
+
+class Checker(AbstractChecker):
+
+ # connection timeout (seconds)
+ CONNECT_TIMEOUT = 10
+
+ # response timeout (seconds)
+ READ_TIMEOUT = 20
+
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+
+ def run(self):
+ results = {}
+
+ self.headers = {
+ "User-Agent": self.config.user_agent,
+ }
+
+ # copy URLs, as we may be manipulating self.config.urls in the loop
+ urls = list(self.config.urls)
+
+ for url in urls:
+ result = self.download_page(url)
+ results[url] = result
+
+ # remove bad URLs from config, to avoid later checks using them
+ if 'exception' in result and result['exception'] is not None:
+ self.config.remove_url(url)
+
+ return results
+
+
+ def download_page(self, url):
+ result = {
+ 'url': url,
+ 'content': None,
+ 'content_type': None,
+ 'content_length': None,
+ 'status_code': None,
+ 'response_headers': None,
+ 'duration': None,
+ 'exception': None,
+ }
+
+ try:
+ r = requests.get(url,
+ headers=self.headers,
+ timeout=(self.CONNECT_TIMEOUT, self.READ_TIMEOUT))
+
+ result['url'] = r.url
+ result['status_code'] = r.status_code
+ result['content'] = r.text
+ result['content_length'] = len(r.text)
+ result['response_headers'] = self.get_headers(r.headers)
+ result['duration'] = round(r.elapsed.total_seconds() * 1000)
+
+ if r.headers.get("content-type") is not None:
+ result['content_type'] = r.headers.get("content-type").split(";")[0].strip()
+
+ except requests.exceptions.ConnectionError as exc:
+ logging.error(str(exc) + " " + url)
+ result['exception'] = "connection"
+ except requests.exceptions.ReadTimeout as exc:
+ logging.error(str(exc) + " " + url)
+ result['exception'] = "read_timeout"
+ except requests.exceptions.Timeout as exc:
+ logging.error(str(exc) + " " + url)
+ result['exception'] = "connection_timeout"
+ except Exception as exc:
+ logging.error(str(exc) + " " + url)
+ result['exception'] = "%s %s" % (str(type(exc)), exc)
+
+ return result
+
+ def get_headers(self, headers):
+ """
+ Transforms CaseInsensitiveDict into dict with lowercase keys
+ """
+ out = {}
+ for key in headers:
+ out[key.lower()] = headers[key]
+ return out
diff --git a/checks/url_canonicalization.py b/checks/url_canonicalization.py
new file mode 100644
index 0000000..c6ce173
--- /dev/null
+++ b/checks/url_canonicalization.py
@@ -0,0 +1,13 @@
+"""
+This check verifies whether there is a single URL
+or several variants left at this point.
+"""
+
+from checks.abstract_checker import AbstractChecker
+
+class Checker(AbstractChecker):
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ return self.config.urls
diff --git a/checks/url_reachability.py b/checks/url_reachability.py
new file mode 100644
index 0000000..b371540
--- /dev/null
+++ b/checks/url_reachability.py
@@ -0,0 +1,104 @@
+"""
+This check verifies whether the urls in config are reachable.
+Some additional information regarding redirects and SSL problems
+are also recorded and returned as results.
+
+Non-accessible URLs are removed from config.urls.
+
+A redirect to facebook.com is not considered reachable, as that
+leads to a different website in the sense of this system.
+
+TODO: Parallelize the work done in this test
+"""
+
+import logging
+
+from urllib.parse import urlparse
+import requests
+
+from checks.abstract_checker import AbstractChecker
+
+
+class Checker(AbstractChecker):
+ def __init__(self, config, previous_results=None):
+ super().__init__(config, previous_results)
+
+ def run(self):
+ headers = {
+ "User-Agent": self.config.user_agent
+ }
+
+ results = {}
+ urls = list(self.config.urls)
+
+ for url in urls:
+ logging.debug("Checking URL reachability for %s", url)
+
+ result = {
+ "url": url,
+ "redirect_history": [],
+ "status": None,
+ "exception": None,
+ "duration": None,
+ }
+
+ # Perform HEAD requests, recording redirect log
+ try:
+ r = requests.head(url, headers=headers, allow_redirects=True)
+ result['status'] = r.status_code
+ result['duration'] = round(r.elapsed.total_seconds() * 1000)
+
+ if len(r.history):
+ result['redirect_history'] = self.expand_history(r.history)
+ logging.debug("Redirects: %r", result['redirect_history'])
+
+ if r.url == url:
+ logging.debug("URL: %s - status %s", url, r.status_code)
+ else:
+ logging.debug("URL: %s - status %s - redirects to %s", url,
+ r.status_code, r.url)
+ # remove source URL, add target URL to config.urls
+ self.config.remove_url(url)
+ self.config.add_url(r.url)
+
+ # remove 404 etc
+ if r.status_code > 400:
+ self.config.remove_url(url)
+
+ except Exception as exc:
+ logging.info("Exception for URL %s: %s %s", url, str(type(exc)), exc)
+ result['exception'] = {
+ 'type': str(type(exc)),
+ 'message': str(exc),
+ }
+
+ # remove URL to prevent further checks on unreachable URL
+ self.config.remove_url(url)
+
+ # if redirects end at an unsupported target domain like www.facebook.com,
+ # www.denic.de or sedo.com, remove this URL again
+ if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
+ parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
+ if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
+ result['exception'] = {
+ 'type': 'Bad target domain',
+ 'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname,
+ }
+ self.config.remove_url(url)
+
+ results[url] = result
+
+ return results
+
+ def expand_history(self, history):
+ """Extracts primitives from a list of requests.Response objects"""
+ items = []
+ for h in history:
+ item = {
+ 'status': h.status_code,
+ 'duration': round(h.elapsed.total_seconds() * 1000),
+ 'redirect_to': h.headers['location'],
+ }
+ items.append(item)
+
+ return items
diff --git a/checks/url_reachability_test.py b/checks/url_reachability_test.py
new file mode 100644
index 0000000..b5514d5
--- /dev/null
+++ b/checks/url_reachability_test.py
@@ -0,0 +1,71 @@
+import httpretty
+from httpretty import httprettified
+import unittest
+
+from checks import url_reachability
+from checks.config import Config
+
+@httprettified
+class TestUrlReachabilityChecker(unittest.TestCase):
+
+ def test_success(self):
+ url = 'http://www.example.com/'
+ httpretty.register_uri(httpretty.HEAD, url,
+ status=200, body="")
+
+ config = Config(urls=[url])
+ checker = url_reachability.Checker(config=config, previous_results={})
+ result = checker.run()
+
+ self.assertEqual(result[url]['url'], url)
+ self.assertEqual(result[url]['redirect_history'], [])
+ self.assertEqual(result[url]['status'], 200)
+ self.assertIsNone(result[url]['exception'])
+ self.assertTrue(0 < result[url]['duration'] < 100)
+
+
+ def test_redirect(self):
+ url = 'http://www.example.com/'
+ url2 = 'http://www2.example.com/'
+ httpretty.register_uri(httpretty.HEAD, url,
+ status=302, body="",
+ adding_headers={"Location": url2})
+ httpretty.register_uri(httpretty.HEAD, url2,
+ status=200, body="")
+
+ config = Config(urls=[url])
+ checker = url_reachability.Checker(config=config, previous_results={})
+ result = checker.run()
+
+ self.assertIn(url, result)
+ self.assertEqual(result[url]['url'], url)
+ self.assertEqual(result[url]['status'], 200)
+ self.assertIsNone(result[url]['exception'])
+ self.assertTrue(0 < result[url]['duration'] < 100)
+ self.assertEqual(len(result[url]['redirect_history']), 1)
+ self.assertEqual(result[url]['redirect_history'][0]['status'], 302)
+ self.assertEqual(result[url]['redirect_history'][0]['redirect_to'], url2)
+
+
+ def test_notfound(self):
+ url = 'http://www.example.com/'
+ httpretty.register_uri(httpretty.HEAD, url,
+ status=404, body="Not found")
+
+ config = Config(urls=[url])
+ checker = url_reachability.Checker(config=config, previous_results={})
+ result = checker.run()
+
+ self.assertEqual(result[url]['url'], url)
+ self.assertEqual(result[url]['redirect_history'], [])
+ self.assertEqual(result[url]['status'], 404)
+ self.assertIsNone(result[url]['exception'])
+
+ newconfig = checker.config
+
+ self.assertEqual(len(newconfig.urls), 0)
+
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/cli.py b/cli.py
new file mode 100644
index 0000000..f470f26
--- /dev/null
+++ b/cli.py
@@ -0,0 +1,83 @@
+"""
+Command line utility for spider, export etc.
+"""
+
+import argparse
+import logging
+import signal
+import sys
+
+from google.cloud import datastore
+
+def handle_sigint(signum, frame):
+ """
+ Handles SIGINT, which occurs on Ctrl-C
+ """
+ print("\nInterrupted by SIGINT\n")
+ sys.exit()
+
+
+if __name__ == "__main__":
+ signal.signal(signal.SIGINT,handle_sigint)
+
+ parser = argparse.ArgumentParser()
+
+ # global flags
+ parser.add_argument('--credentials-path', dest='credentials_path',
+ help='Path to the service account credentials JSON file',
+ default='/secrets/service-account.json')
+
+ parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
+ default='info')
+
+ # subcommands
+ subparsers = parser.add_subparsers(help='sub-command help', dest='command')
+
+ # spider subcommand
+ spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
+ spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
+
+ # jobs subcommand
+ jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
+ jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
+
+ # export subcommand
+ export_parser = subparsers.add_parser('export', help='Export JSON data')
+ export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)')
+
+
+ args = parser.parse_args()
+
+ # set log level
+ logging.getLogger("urllib3").setLevel(logging.CRITICAL)
+
+ loglevel = args.loglevel.lower()
+ if loglevel == 'error':
+ logging.basicConfig(level=logging.ERROR)
+ elif loglevel == 'warn':
+ logging.basicConfig(level=logging.WARN)
+ elif loglevel == 'debug':
+ logging.basicConfig(level=logging.DEBUG)
+ logging.getLogger("selenium").setLevel(logging.INFO)
+ else:
+ logging.basicConfig(level=logging.INFO)
+ loglevel = 'info'
+
+ logging.debug("Called command %s", args.command)
+
+ datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
+
+ if args.command == 'jobs':
+
+ import jobs
+ jobs.create_jobs(datastore_client, args.url)
+
+ elif args.command == 'export':
+
+ import export
+ export.export_screenshots(datastore_client)
+ export.export_results(datastore_client, args.kind)
+
+ else:
+ from spider import spider
+ spider.work_of_queue(datastore_client, args.kind)
diff --git a/config/__init__.py b/config/__init__.py
new file mode 100644
index 0000000..83b1ba4
--- /dev/null
+++ b/config/__init__.py
@@ -0,0 +1,23 @@
+
+
+# connection timeout for website checks (seconds)
+CONNECT_TIMEOUT = 5
+
+# response timeout for website checks
+READ_TIMEOUT = 10
+
+# Git repo for our data
+GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
+
+# folder in that repo that holds the data
+GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
+
+# folder we use locally to clone the repo
+GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
+
+# IP address of the newthinking GCMS server
+GCMS_IP = "91.102.13.20"
+
+# kind name of the spider job key datastore entities
+JOB_DATASTORE_KIND = 'spider-jobs'
+
diff --git a/devops/run-job.sh b/devops/run-job.sh
index 207c205..530ecdb 100755
--- a/devops/run-job.sh
+++ b/devops/run-job.sh
@@ -19,6 +19,8 @@
# secrets/datastore-writer.json
+DOCKERIMAGE="quay.io/netzbegruenung/green-spider:dev"
+
API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
source $API_TOKEN_SECRET
@@ -29,10 +31,14 @@ if [[ "$1" == "" ]]; then
exit 1
fi
+SERVERNAME="$1-$(date | md5 | cut -c1-3)"
+
+# possible values: cx11 (1 core 2 GB), cx21 (2 cores, 4 GB), cx31 (2 cores, 8 GB)
+SERVERTYPE="cx21"
function create_server()
{
- echo "Creating server $1"
+ echo "Creating server $SERVERNAME"
# server_type 'cx11' is the smallest, cheapest category.
# location 'nbg1' is Nürnberg/Nuremberg, Germany.
@@ -44,8 +50,8 @@ function create_server()
-H "Content-Type: application/json" \
-H "Authorization: Bearer $API_TOKEN" \
-d "{
- \"name\": \"$1\",
- \"server_type\": \"cx11\",
+ \"name\": \"$SERVERNAME\",
+ \"server_type\": \"$SERVERTYPE\",
\"location\": \"nbg1\",
\"start_after_create\": true,
\"image\": \"debian-9\",
@@ -61,7 +67,7 @@ function create_server()
# Get IP:
SERVER_IP=$(echo $CREATE_RESPONSE | jq -r .server.public_net.ipv4.ip)
- echo "Created server with ID $SERVER_ID and IP $SERVER_IP"
+ echo "Created server $SERVERNAME with ID $SERVER_ID and IP $SERVER_IP"
}
@@ -142,22 +148,25 @@ else
# Run docker job
echo "Starting Docker Job"
- ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
- -v /root/secrets:/secrets \
- quay.io/netzbegruenung/green-spider spider.py \
- --credentials-path /secrets/datastore-writer.json \
- jobs
+ #ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
+ # -v /root/secrets:/secrets \
+ # quay.io/netzbegruenung/green-spider spider.py \
+ # --credentials-path /secrets/datastore-writer.json \
+ # jobs
+ ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP mkdir -p /dev-shm
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
+ -v /dev-shm:/dev/shm \
-v /root/secrets:/secrets \
- quay.io/netzbegruenung/green-spider spider.py \
+ $DOCKERIMAGE \
--credentials-path /secrets/datastore-writer.json \
- spider
+ --loglevel info \
+ spider --kind spider-results-dev
fi
# Delete the box
-echo "Deleting server $SERVER_ID"
+echo "Deleting server $SERVERNAME with ID $SERVER_ID"
curl -s -X DELETE -H "Content-Type: application/json" \
-H "Authorization: Bearer $API_TOKEN" \
https://api.hetzner.cloud/v1/servers/$SERVER_ID
diff --git a/data_export.py b/export/__init__.py
similarity index 56%
rename from data_export.py
rename to export/__init__.py
index 2ab23ef..64af277 100644
--- a/data_export.py
+++ b/export/__init__.py
@@ -2,8 +2,7 @@
Exports data from the database to JSON files for use in a static webapp
"""
-from google.cloud import datastore
-import hashlib
+from hashlib import md5
import json
import logging
import sys
@@ -14,44 +13,67 @@ import requests
SITEICONS_PATH = "/icons"
-client = None
-
-def export_results():
+def export_results(client, entity_kind):
"""
Export of the main results data
"""
out = []
- query = client.query(kind='spider-results')
+ # Load data from database
+ query = client.query(kind=entity_kind)
for entity in query.fetch():
logging.debug(entity.key.name)
- record = dict(entity)
- record["results"]["created"] = record["created"].isoformat()
- out.append(record["results"])
+ out.append({
+ 'input_url': entity.key.name,
+ 'resulting_urls': entity.get('checks').get('url_canonicalization'),
+ 'created': entity.get('created').isoformat(),
+ 'meta': entity.get('meta'),
+ 'checks': entity.get('checks'),
+ 'rating': entity.get('rating'),
+ 'score': entity.get('score'),
+ 'icons': [],
+ })
# load icons, reformat icons details
+ icons_downloaded = set()
for index in range(len(out)):
- if "details" not in out[index]:
- continue
- if "icons" not in out[index]["details"]:
- continue
- urls = out[index]["details"]["icons"]
- out[index]["details"]["icons"] = {}
- for url in urls:
- if not (url.startswith("http://") or url.startswith("https://")):
- logging.debug("Skipping icon %s", url)
- continue
- logging.debug("Dowloading icon %s", url)
- filename = download_icon(url)
+ assert "checks" in out[index]
+ assert "html_head" in out[index]["checks"]
+
+ # collect icons urls
+ icons = set()
+ for url in out[index]['checks']['html_head']:
+ assert 'link_icon' in out[index]['checks']['html_head'][url]
+ if out[index]['checks']['html_head'][url]['link_icon'] is not None:
+ iconurl = out[index]['checks']['html_head'][url]['link_icon']
+ if iconurl.startswith("data:"):
+ continue
+ if iconurl in icons_downloaded:
+ continue
+ icons.add(iconurl)
+
+ out[index]["icons"] = {}
+ for iconurl in list(icons):
+ logging.debug("Dowloading icon %s", iconurl)
+ icons_downloaded.add(iconurl)
+ filename = download_icon(iconurl)
if filename:
- out[index]["details"]["icons"][url] = filename
+ out[index]["icons"][url] = filename
output_filename = "/out/spider_result.json"
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
+
+ # compact version
+ output_filename = "/out/spider_result_compact.json"
+ for i in range(len(out)):
+ out[i]['cms'] = list(out[i]['checks']['generator'].values())
+ del out[i]['checks']
+ with open(output_filename, 'w', encoding="utf8") as jsonfile:
+ json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
-def export_screenshots():
+def export_screenshots(client):
"""
Export of screenshot meta data
"""
@@ -78,10 +100,12 @@ def download_icon(icon_url):
"""
default_endings = {
+ "image/x-ico": "ico",
"image/x-icon": "ico",
"image/vnd.microsoft.icon": "ico",
"image/png": "png",
"image/jpeg": "jpg",
+ "image/gif": "gif",
}
# Download the icon
@@ -92,7 +116,7 @@ def download_icon(icon_url):
if req.status_code >= 400:
return None
- content_hash = hashlib.md5(req.content).hexdigest()
+ content_hash = md5(req.content).hexdigest()
extension = ""
try:
@@ -109,6 +133,9 @@ def download_icon(icon_url):
if extension == "":
# derive from content type
ctype = req.headers.get('content-type')
+ if ctype is None:
+ return
+
try:
extension = default_endings[ctype]
except KeyError:
@@ -122,17 +149,3 @@ def download_icon(icon_url):
iconfile.write(req.content)
return filename
-
-
-if __name__ == "__main__":
- logging.basicConfig(level=logging.DEBUG)
-
- if len(sys.argv) == 1:
- print("Error: please provide path to Google Storage API system account JSON file as argument")
- sys.exit(1)
-
- key_path = sys.argv[1]
- client = datastore.Client.from_service_account_json(key_path)
-
- export_screenshots()
- export_results()
diff --git a/jobs/__init__.py b/jobs/__init__.py
new file mode 100644
index 0000000..3e125d5
--- /dev/null
+++ b/jobs/__init__.py
@@ -0,0 +1,180 @@
+"""
+The jobs module allows creating jobs for the queue and taking jobs off the queue
+"""
+
+from datetime import datetime
+import logging
+import os
+import random
+import shutil
+
+from git import Repo
+import tenacity
+import yaml
+from google.api_core.exceptions import Aborted
+from google.cloud import datastore
+
+import config
+
+
+def clone_data_directory():
+ """
+ Clones the source of website URLs, the green directory,
+ into the local file system using git
+ """
+ if os.path.exists(config.GREEN_DIRECTORY_LOCAL_PATH):
+ shutil.rmtree(config.GREEN_DIRECTORY_LOCAL_PATH)
+ Repo.clone_from(config.GREEN_DIRECTORY_REPO, config.GREEN_DIRECTORY_LOCAL_PATH)
+
+
+def directory_entries():
+ """
+ Iterator over all data files in the cloned green directory
+ """
+ path = os.path.join(config.GREEN_DIRECTORY_LOCAL_PATH, config.GREEN_DIRECTORY_DATA_PATH)
+ for root, _, files in os.walk(path):
+ for fname in files:
+
+ filepath = os.path.join(root, fname)
+ if not filepath.endswith(".yaml"):
+ continue
+
+ with open(filepath, 'r', encoding='utf8') as yamlfile:
+ for doc in yaml.load_all(yamlfile):
+ yield doc
+
+
+def chunks(the_list, size):
+ """
+ Yield successive n-sized chunks from list the_list
+ where n = size.
+ """
+ for i in range(0, len(the_list), size):
+ yield the_list[i:i + size]
+
+
+def create_jobs(datastore_client, url=None):
+ """
+ Read all URLs from green directory and fill a job database
+ with one job per URL.
+
+ Alternatively, if the url argument is given, only the given URL
+ will be added as a spider job.
+ """
+
+ # refresh our local clone of the green directory
+ logging.info("Refreshing green-directory clone")
+ clone_data_directory()
+
+ # build the list of website URLs to run checks for
+ logging.info("Processing green-directory")
+ input_entries = []
+
+ count = 0
+
+ random.seed()
+
+ for entry in directory_entries():
+
+ if 'type' not in entry:
+ logging.error("Entry without type")
+ continue
+ if 'urls' not in entry:
+ logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
+ continue
+
+ website_url = None
+ for index in range(len(entry['urls'])):
+ try:
+ if entry['urls'][index]['type'] == "WEBSITE":
+ website_url = entry['urls'][index]['url']
+ if website_url:
+ if url is not None and website_url != url:
+ continue
+ input_entries.append({
+ "url": website_url,
+ "type": entry.get("type"),
+ "level": entry.get("level"),
+ "state": entry.get("state"),
+ "district": entry.get("district"),
+ "city": entry.get("city"),
+ })
+ count += 1
+ except KeyError:
+ logging.error("Error in %s: 'url' key missing (%s)",
+ repr_entry(entry), entry['urls'][index])
+
+ # ensure the passed URL argument is really there, even if not part
+ # of the directory.
+ if url and count == 0:
+ logging.info("Adding job for URL %s which is not part of green-directory", url)
+ input_entries.append({
+ "url": url,
+ "type": None,
+ "level": None,
+ "state": None,
+ "district": None,
+ "city": None,
+ "index": int(random.uniform(1000000, 9999999)),
+ })
+
+ count = 0
+ logging.info("Writing jobs")
+
+ entities = []
+
+ for entry in input_entries:
+ key = datastore_client.key(config.JOB_DATASTORE_KIND, entry["url"])
+ entity = datastore.Entity(key=key)
+ entity.update({
+ "created": datetime.utcnow(),
+ "type": entry["type"],
+ "level": entry["level"],
+ "state": entry["state"],
+ "district": entry["district"],
+ "city": entry["city"],
+ "index": int(random.uniform(1000000, 9999999)),
+ })
+ entities.append(entity)
+
+ # commit to DB
+ for chunk in chunks(entities, 300):
+ logging.debug("Writing jobs chunk of length %d", len(chunk))
+ datastore_client.put_multi(chunk)
+ count += len(chunk)
+
+ logging.info("Writing jobs done, %s jobs added", count)
+
+
+@tenacity.retry(wait=tenacity.wait_exponential(),
+ retry=tenacity.retry_if_exception_type(Aborted))
+def get_job_from_queue(datastore_client):
+ """
+ Returns a URL from the queue
+ """
+ out = None
+
+ with datastore_client.transaction():
+ query = datastore_client.query(kind=config.JOB_DATASTORE_KIND,
+ order=['index'])
+ for entity in query.fetch(limit=1):
+ logging.debug("Got job: %s", entity)
+ out = dict(entity)
+ out["url"] = entity.key.name
+ datastore_client.delete(entity.key)
+
+ return out
+
+def repr_entry(entry):
+ """
+ Return string representation of a directory entry,
+ for logging/debugging purposes
+ """
+ ret = entry['type']
+ if 'level' in entry:
+ ret += "/" + entry['level']
+ if 'state' in entry:
+ ret += "/" + entry['state']
+ if 'district' in entry:
+ ret += "/" + entry['district']
+ return ret
diff --git a/rating/__init__.py b/rating/__init__.py
new file mode 100644
index 0000000..197e720
--- /dev/null
+++ b/rating/__init__.py
@@ -0,0 +1,53 @@
+"""
+The rating module contains the functionality to calculate scores for certain
+criteria based on the information gathered by the checks.
+"""
+
+import logging
+
+from rating import canonical_url
+from rating import favicon
+from rating import feeds
+from rating import https
+from rating import no_network_errors
+from rating import no_script_errors
+from rating import reachable
+from rating import resolvable
+from rating import response_duration
+from rating import responsive_layout
+from rating import use_specific_fonts
+from rating import www_optional
+
+
+def calculate_rating(results):
+ """
+ Calculates ratings for a number of criteria.
+
+ Params:
+ results - Results dictionary from checks
+ """
+
+ # The raters to execute.
+ rating_modules = {
+ 'CANONICAL_URL': canonical_url,
+ 'DNS_RESOLVABLE_IPV4': resolvable,
+ 'FAVICON': favicon,
+ 'FEEDS': feeds,
+ 'HTTPS': https,
+ 'HTTP_RESPONSE_DURATION': response_duration,
+ 'NO_NETWORK_ERRORS': no_network_errors,
+ 'NO_SCRIPT_ERRORS': no_script_errors,
+ 'RESPONSIVE': responsive_layout,
+ 'SITE_REACHABLE': reachable,
+ 'USE_SPECIFIC_FONTS': use_specific_fonts,
+ 'WWW_OPTIONAL': www_optional,
+ }
+
+ output = {}
+
+ for name in rating_modules:
+
+ rater = rating_modules[name].Rater(results)
+ output[name] = rater.rate()
+
+ return output
diff --git a/rating/abstract_rater.py b/rating/abstract_rater.py
new file mode 100644
index 0000000..ef2a2f8
--- /dev/null
+++ b/rating/abstract_rater.py
@@ -0,0 +1,22 @@
+class AbstractRater(object):
+
+ # String 'boolean' or 'number'
+ rating_type = None
+
+ # The default value to return if no rating given
+ default_value = None
+
+ max_score = 1
+
+ # Names of the checks this rater depends on
+ depends_on_checks = []
+
+ def __init__(self, check_results):
+ self.check_results = check_results
+
+ for item in self.depends_on_checks:
+ assert item in self.check_results
+
+ def rate(self):
+ raise NotImplementedError()
+
diff --git a/rating/canonical_url.py b/rating/canonical_url.py
new file mode 100644
index 0000000..dbe4024
--- /dev/null
+++ b/rating/canonical_url.py
@@ -0,0 +1,31 @@
+"""
+This looks at remaining resolvable URLs after redirects
+and gives a score if there is only one URL left.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['url_canonicalization']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ if len(self.check_results['url_canonicalization']) == 1:
+ value = True
+ score = self.max_score
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
diff --git a/rating/favicon.py b/rating/favicon.py
new file mode 100644
index 0000000..5387a1e
--- /dev/null
+++ b/rating/favicon.py
@@ -0,0 +1,32 @@
+"""
+This gives a score if the site has an icon.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['html_head']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ for url in self.check_results['html_head']:
+ if self.check_results['html_head'][url]['link_icon'] is not None:
+ value = True
+ score = self.max_score
+ break
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
diff --git a/rating/feeds.py b/rating/feeds.py
new file mode 100644
index 0000000..edc8888
--- /dev/null
+++ b/rating/feeds.py
@@ -0,0 +1,35 @@
+"""
+This gives a score if the site has feeds.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['html_head']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ for url in self.check_results['html_head']:
+ if self.check_results['html_head'][url]['link_rss_atom'] is None:
+ continue
+ if self.check_results['html_head'][url]['link_rss_atom'] == []:
+ continue
+ value = True
+ score = self.max_score
+ break
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
diff --git a/rating/https.py b/rating/https.py
new file mode 100644
index 0000000..e47550e
--- /dev/null
+++ b/rating/https.py
@@ -0,0 +1,47 @@
+"""
+This looks at all HTTPS URLs we checked for reachability.
+
+If all of them were reachable without errors, we give the full score.
+If any had errors, or if no HTTPS URL is reachable at all, we give zero.
+(See the illustrative example at the end of this module.)
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['url_reachability']
+
+ # HTTPS is very important, so this counts double
+ max_score = 2
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ reachable_count = 0
+ unreachable_count = 0
+
+ for url in self.check_results['url_reachability']:
+ if not url.startswith('https://'):
+ continue
+
+ if self.check_results['url_reachability'][url]['exception'] is None:
+ reachable_count += 1
+ else:
+ unreachable_count += 1
+
+ if unreachable_count == 0 and reachable_count > 0:
+ value = True
+ score = self.max_score
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
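+
+
+# Illustrative example (assumed shape of the check results): with
+#
+#   check_results['url_reachability'] = {
+#       'https://www.example.de/': {'exception': None},
+#       'http://www.example.de/': {'exception': None},
+#   }
+#
+# only the HTTPS URL is considered; it loaded without an exception, so
+# rate() returns {'type': 'boolean', 'value': True, 'score': 2, 'max_score': 2}.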
diff --git a/rating/no_network_errors.py b/rating/no_network_errors.py
new file mode 100644
index 0000000..ac56247
--- /dev/null
+++ b/rating/no_network_errors.py
@@ -0,0 +1,48 @@
+"""
+If all URLs could be loaded without severe network errors, this rater gives a score.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['load_in_browser']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ found_pageloads = 0
+ found_errors = 0
+ for url in self.check_results['load_in_browser']:
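+            # Note: URLs without any browser log entries are skipped and
+            # do not count as page loads for this rater.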
+ if (self.check_results['load_in_browser'][url]['logs'] == [] or
+ self.check_results['load_in_browser'][url]['logs'] is None):
+ continue
+
+ found_pageloads += 1
+
+            # scan log entries for severe network errors
+ for entry in self.check_results['load_in_browser'][url]['logs']:
+ if entry['source'] != 'network':
+ continue
+ if entry['level'] != 'SEVERE':
+ continue
+
+ found_errors += 1
+
+ if found_pageloads > 0 and found_errors == 0:
+ value = True
+ score = self.max_score
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
diff --git a/rating/no_script_errors.py b/rating/no_script_errors.py
new file mode 100644
index 0000000..32a89c6
--- /dev/null
+++ b/rating/no_script_errors.py
@@ -0,0 +1,42 @@
+"""
+If all URLs could be loaded without JavaScript errors, this rater gives a score.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['load_in_browser']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ found_pageloads = 0
+ found_errors = 0
+ for url in self.check_results['load_in_browser']:
+ if self.check_results['load_in_browser'][url]['logs'] == []:
+ found_pageloads += 1
+ continue
+
+ # scan log entries for script errors
+ for entry in self.check_results['load_in_browser'][url]['logs']:
+ if entry['source'] == 'javascript':
+ found_errors += 1
+
+ if found_pageloads > 0 and found_errors == 0:
+ value = True
+ score = self.max_score
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
diff --git a/rating/reachable.py b/rating/reachable.py
new file mode 100644
index 0000000..381cdb0
--- /dev/null
+++ b/rating/reachable.py
@@ -0,0 +1,36 @@
+"""
+This gives a score if one of the checked URL variations was reachable.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['url_reachability']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ count = 0
+ for url in self.check_results['url_reachability']:
+ if self.check_results['url_reachability'][url]['exception'] is not None:
+ continue
+ count += 1
+
+ if count > 0:
+ value = True
+ score = self.max_score
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
diff --git a/rating/resolvable.py b/rating/resolvable.py
new file mode 100644
index 0000000..01e243e
--- /dev/null
+++ b/rating/resolvable.py
@@ -0,0 +1,35 @@
+"""
+This gives a score if at least one of the hostnames derived from the input URL was resolvable.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['dns_resolution']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ count = 0
+ for url in self.check_results['dns_resolution']:
+ if self.check_results['dns_resolution'][url]['resolvable']:
+ count += 1
+
+ if count > 0:
+ value = True
+ score = self.max_score
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
diff --git a/rating/response_duration.py b/rating/response_duration.py
new file mode 100644
index 0000000..6f22d84
--- /dev/null
+++ b/rating/response_duration.py
@@ -0,0 +1,46 @@
+"""
+This looks at the average response duration and scores based on the
+bucket it falls into: less than 100 ms gets the full score, less than
+one second gets half the score, anything slower gets nothing.
+(See the illustrative example at the end of this module.)
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'number'
+ default_value = False
+ depends_on_checks = ['page_content']
+ max_score = 1.0
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ duration_sum = 0
+ duration_count = 0
+
+ for url in self.check_results['page_content']:
+ if self.check_results['page_content'][url]['exception'] is not None:
+ continue
+ duration_sum += self.check_results['page_content'][url]['duration']
+ duration_count += 1
+
+ if duration_count > 0:
+ value = round(duration_sum / duration_count)
+
+ # value is duration in milliseconds
+ if value < 100:
+ score = self.max_score
+ elif value < 1000:
+ score = self.max_score * 0.5
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
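+
+
+# Illustrative example: assuming two URLs with durations of 80 ms and
+# 820 ms and no exceptions, the average is 450 ms, so rate() returns
+# {'type': 'number', 'value': 450, 'score': 0.5, 'max_score': 1.0}.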
diff --git a/rating/responsive_layout.py b/rating/responsive_layout.py
new file mode 100644
index 0000000..2c198eb
--- /dev/null
+++ b/rating/responsive_layout.py
@@ -0,0 +1,35 @@
+"""
+This gives a score if the site's minimum document width measured during
+the checks was smaller than or equal to the smallest viewport width tested.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['load_in_browser']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ for url in self.check_results['load_in_browser']:
+ if (self.check_results['load_in_browser'][url]['min_document_width'] <=
+ self.check_results['load_in_browser'][url]['sizes'][0]['viewport_width']):
+ value = True
+ score = self.max_score
+ # we use the first URL found here
+ break
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
diff --git a/rating/use_specific_fonts.py b/rating/use_specific_fonts.py
new file mode 100644
index 0000000..8acb907
--- /dev/null
+++ b/rating/use_specific_fonts.py
@@ -0,0 +1,41 @@
+"""
+Checks whether the pages use the font 'Arvo'.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['load_in_browser']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ urls_with_font = 0
+ urls_without_font = 0
+ for url in self.check_results['load_in_browser']:
+ if self.check_results['load_in_browser'][url]['font_families'] is None:
+ urls_without_font += 1
+ continue
+
+ fonts = " ".join(self.check_results['load_in_browser'][url]['font_families'])
+ if 'arvo' in fonts:
+ urls_with_font += 1
+
+ if urls_with_font > 0 and urls_without_font == 0:
+ score = self.max_score
+ value = True
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
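+
+
+# Note: the substring check above assumes that the font family names
+# reported by the load_in_browser check are already lowercased;
+# a reported value like 'Arvo' would otherwise not match 'arvo'.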
diff --git a/rating/www_optional.py b/rating/www_optional.py
new file mode 100644
index 0000000..0afef45
--- /dev/null
+++ b/rating/www_optional.py
@@ -0,0 +1,44 @@
+"""
+This looks at reachable URLs and checks whether (sub)domains
+both with and without www. are reachable.
+"""
+
+from urllib.parse import urlparse
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+ rating_type = 'boolean'
+ default_value = False
+ depends_on_checks = ['url_reachability']
+ max_score = 1
+
+ def __init__(self, check_results):
+ super().__init__(check_results)
+
+ def rate(self):
+ value = self.default_value
+ score = 0
+
+ hostnames = set()
+ for url in self.check_results['url_reachability']:
+ if self.check_results['url_reachability'][url]['exception'] is not None:
+ continue
+ parsed = urlparse(url)
+            hostnames.add(parsed.hostname)
+
+ # FIXME
+        # We simply check whether there is more than one reachable hostname.
+        # This works with our current input URLs but might be too
+        # simplistic in the future.
+ if len(list(hostnames)) > 1:
+ value = True
+ score = self.max_score
+
+ return {
+ 'type': self.rating_type,
+ 'value': value,
+ 'score': score,
+ 'max_score': self.max_score,
+ }
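+
+
+# Illustrative example: if both 'https://gruene-x.de/' and
+# 'https://www.gruene-x.de/' were reachable without an exception,
+# the hostname set has two entries and the full score is given.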
diff --git a/spider.py b/spider.py
deleted file mode 100644
index 4e4f6e6..0000000
--- a/spider.py
+++ /dev/null
@@ -1,814 +0,0 @@
-"""
-Provides the spider functionality (website checks).
-"""
-
-import argparse
-import json
-import logging
-import os
-import random
-import re
-import shutil
-import statistics
-import time
-from datetime import datetime
-from socket import gethostbyname_ex
-from urllib.parse import urljoin
-from urllib.parse import urlparse
-
-import requests
-import yaml
-import tenacity
-
-from bs4 import BeautifulSoup
-from git import Repo
-from selenium import webdriver
-from google.cloud import datastore
-from google.api_core.exceptions import Aborted
-from google.api_core.exceptions import InvalidArgument
-
-
-# configuration
-
-# connection timeout for website checks (seconds)
-CONNECT_TIMEOUT = 5
-
-# response timeout for website checks
-READ_TIMEOUT = 10
-
-# Git repo for our data
-GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
-# folder in that repo that holds the data
-GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
-GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
-
-RESULT_PATH = '/out'
-
-# IP address of the newthinking GCMS server
-GCMS_IP = "91.102.13.20"
-
-JOB_DATASTORE_KIND = 'spider-jobs'
-RESULTS_DATASTORE_KIND = 'spider-results'
-
-# end configuration
-
-DATASTORE_CLIENT = None
-
-
-def chunks(the_list, size):
- """
- Yield successive n-sized chunks from list the_list
- where n = size.
- """
- for i in range(0, len(the_list), size):
- yield the_list[i:i + size]
-
-
-def create_jobs(url=None):
- """
- Read all URLs from green directory and fill a job database
- with one job per URL.
-
- Alternatively, if the url argument is given, only the given URL
- will be added as a spider job.
- """
-
- # refresh our local clone of the green directory
- logging.info("Refreshing green-directory clone")
- get_green_directory()
-
- # build the list of website URLs to run checks for
- logging.info("Processing green-directory")
- input_entries = []
-
- count = 0
-
- for entry in dir_entries():
-
- if 'type' not in entry:
- logging.error("Entry without type")
- continue
- if 'urls' not in entry:
- logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
- continue
-
- website_url = None
- for index in range(len(entry['urls'])):
- try:
- if entry['urls'][index]['type'] == "WEBSITE":
- website_url = entry['urls'][index]['url']
- if website_url:
- if url is not None and website_url != url:
- continue
- input_entries.append({
- "url": website_url,
- "level": entry.get("level"),
- "state": entry.get("state"),
- "district": entry.get("district"),
- "city": entry.get("city"),
- })
- count += 1
- except NameError:
- logging.error("Error in %s: 'url' key missing (%s)",
- repr_entry(entry), entry['urls'][index])
-
- # ensure the passed URL argument is really there, even if not part
- # of the directory.
- if url and count == 0:
- logging.info("Adding job for URL %s which is not part of green-directory", url)
- input_entries.append({
- "url": url,
- "level": None,
- "state": None,
- "district": None,
- "city": None,
- })
-
- # randomize order, to distribute requests over servers
- logging.debug("Shuffling input URLs")
- random.seed()
- random.shuffle(input_entries)
-
- count = 0
- logging.info("Writing jobs")
-
- entities = []
-
- for entry in input_entries:
- key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
- entity = datastore.Entity(key=key)
- entity.update({
- "created": datetime.utcnow(),
- "level": entry["level"],
- "state": entry["state"],
- "district": entry["district"],
- "city": entry["city"],
- })
- entities.append(entity)
-
- # commmit to DB
- for chunk in chunks(entities, 300):
- logging.debug("Writing jobs chunk of length %d", len(chunk))
- DATASTORE_CLIENT.put_multi(chunk)
- count += len(chunk)
-
- logging.info("Writing jobs done, %s jobs added", count)
-
-
-def get_green_directory():
- """
- Clones the source of website URLs, the green directory,
- into the local file system using git
- """
- if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
- shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
- Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
-
-
-def dir_entries():
- """
- Iterator over all data files in the cloned green directory
- """
- path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
- for root, _, files in os.walk(path):
- for fname in files:
-
- filepath = os.path.join(root, fname)
- if not filepath.endswith(".yaml"):
- continue
-
- with open(filepath, 'r', encoding='utf8') as yamlfile:
- for doc in yaml.load_all(yamlfile):
- yield doc
-
-
-def repr_entry(entry):
- """
- Return string representation of a directory entry,
- for logging/debugging purposes
- """
- ret = entry['type']
- if 'level' in entry:
- ret += "/" + entry['level']
- if 'state' in entry:
- ret += "/" + entry['state']
- if 'district' in entry:
- ret += "/" + entry['district']
- return ret
-
-
-def derive_test_hostnames(hostname):
- """
- Derives the hostnames variants to test for a given host name.
- From 'gruene-x.de' or 'www.gruene-x.de' it makes
-
- ['gruene-x.de', 'www.gruene-x.de']
-
- which are both plausible web URLs to be used for a domain.
- """
-
- hostnames = set()
-
- hostnames.add(hostname)
- if hostname.startswith('www.'):
- hostnames.add(hostname[4:])
- else:
- hostnames.add('www.' + hostname)
-
- return sorted(list(hostnames))
-
-
-def reduce_urls(urllist):
- """
- Reduce a list of urls with metadata by eliminating those
- that either don't work or lead somewhere else
- """
- targets = set()
- for url in urllist:
- if url['error'] is not None:
- continue
- if url['redirects_to'] is not None:
- targets.add(url['redirects_to'])
- else:
- targets.add(url['url'])
- return sorted(list(targets))
-
-
-def normalize_title(title):
- """
- Removes garbage from HTML page titles
- """
- title = title.replace(u'\u00a0', ' ')
- title = title.replace(' ', ' ')
- title = title.strip()
- return title
-
-
-def check_responsiveness(url):
- """
- Checks
- - whether a page adapts to different viewport sizes
- - whether a viewport meta tag exists
- and returns details
- """
- details = {
- 'document_width': {},
- 'viewport_meta_tag': None,
- }
-
- # sizes we check for (width, height)
- sizes = (
- (320, 480), # old smartphone
- (768, 1024), # older tablet or newer smartphone
- (1024, 768), # older desktop or horiz. tablet
- (1920, 1080), # Full HD horizontal
- )
-
- # Our selenium user agent using Chrome headless as an engine
- chrome_options = webdriver.ChromeOptions()
- chrome_options.add_argument('--headless')
- chrome_options.add_argument('--disable-gpu')
- chrome_options.add_argument('--no-sandbox')
- chrome_options.add_argument('--disable-extensions')
- driver = webdriver.Chrome(chrome_options=chrome_options)
- driver.set_page_load_timeout(60)
- driver.set_window_size(sizes[0][0], sizes[0][1])
- driver.get(url)
- time.sleep(1)
-
- for (width, height) in sizes:
- driver.set_window_size(width, height)
- key = "%sx%s" % (width, height)
- width = driver.execute_script("return document.body.scrollWidth")
- details['document_width'][key] = int(width)
-
- try:
- element = driver.find_element_by_xpath("//meta[@name='viewport']")
- details['viewport_meta_tag'] = element.get_attribute('content')
- except:
- pass
-
- return details
-
-
-def check_content(req):
- """
- Adds details to check regarding content of the page
-
- check: the dict containing details for this URL
- r: requests request/response object
- """
- result = {}
-
- result['encoding'] = req.encoding.lower()
- soup = BeautifulSoup(req.text, 'html.parser')
-
- result['html'] = req.text
-
- # page title
- result['title'] = None
- title = None
- head = soup.find('head')
- if head is not None:
- title = head.find('title')
- if title is not None:
- result['title'] = normalize_title(title.get_text())
-
- # canonical link
- result['canonical_link'] = None
- link = soup.find('link', rel='canonical')
- if link:
- result['canonical_link'] = urljoin(req.url, link.get('href'))
-
- # icon
- result['icon'] = None
- link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
- if link:
- result['icon'] = urljoin(req.url, link.get('href'))
- else:
- link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
- if link:
- result['icon'] = urljoin(req.url, link.get('href'))
-
- # feed links
- result['feeds'] = []
- rss_links = soup.find_all('link', type='application/rss+xml')
- atom_links = soup.find_all('link', type='application/atom+xml')
-
- if rss_links:
- for link in rss_links:
- result['feeds'].append(urljoin(req.url, link.get('href')))
- if atom_links:
- for link in rss_links:
- result['feeds'].append(urljoin(req.url, link.get('href')))
-
- # generator meta tag
- result['generator'] = None
- if head is not None:
- generator = head.select('[name=generator]')
- if generator:
- result['generator'] = generator[0].get('content')
-
- # opengraph meta tags
- result['opengraph'] = None
- opengraph = set()
- if head is not None:
- for item in head.find_all(property=re.compile('^og:')):
- opengraph.add(item.get('property'))
- for item in head.find_all(itemprop=re.compile('^og:')):
- opengraph.add(item.get('itemprop'))
- if opengraph:
- result['opengraph'] = sorted(list(opengraph))
-
- return result
-
-
-def collect_ipv4_addresses(hostname_dict):
- """
- Return list of unique IPv4 addresses
- """
- ips = set()
- for item in hostname_dict.values():
- if 'ip_addresses' not in item:
- continue
- for ip_addr in item['ip_addresses']:
- ips.add(ip_addr)
- return sorted(list(ips))
-
-
-def parse_generator(generator):
- """
- Return well known CMS names from generator
- """
- generator = generator.lower()
- if 'typo3' in generator:
- return "typo3"
- if 'wordpress' in generator:
- return "wordpress"
- if 'drupal' in generator:
- return "drupal"
- if 'joomla' in generator:
- return "joomla"
- return generator
-
-def check_site(entry):
- """
- Performs our site check and returns results as a dict.
-
- 1. Normalize the input URL and derive the URLs to check for
- 2. HEAD the check urls
- 3. Determine the canonical URL
- 4. Run full check on canonical URL
- """
- headers = {
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
- 'AppleWebKit/537.36 (KHTML, like Gecko) ' +
- 'Chrome/65.0.3325.181 green-spider/0.1'
- }
-
- # all the info we'll return for the site
- result = {
- # input_url: The URL we derived all checks from
- 'input_url': entry['url'],
- # Meta: Regional and type metadata for the site
- 'meta': {
- 'level': entry.get('level'),
- 'state': entry.get('state'),
- 'district': entry.get('district'),
- 'city': entry.get('city'),
- },
- # Details: All details we collected about the site (which aren't directly
- # related to the report criteria)
- 'details': {
- 'hostnames': {},
- 'ipv4_addresses': [],
- 'resolvable_urls': [],
- 'canonical_urls': [],
- 'urlchecks': [],
- 'icons': [],
- 'feeds': [],
- 'cms': None,
- 'responsive': None,
- },
- # The actual report criteria
- 'result': {
- 'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0},
- 'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0},
- 'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
- 'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0},
- 'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0},
- 'FAVICON': {'type': 'boolean', 'value': False, 'score': 0},
- 'FEEDS': {'type': 'boolean', 'value': False, 'score': 0},
- 'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0},
- 'RESPONSIVE': {'type': 'boolean', 'value': False, 'score': 0},
- },
- 'score': 0.0,
- }
-
- # derive hostnames to test (with/without www.)
- parsed = urlparse(entry['url'])
- hostnames = derive_test_hostnames(parsed.hostname)
-
- # try to resolve hostnames
- processed_hostnames = {}
- for hostname in hostnames:
-
- processed_hostnames[hostname] = {
- 'resolvable': False,
- }
-
- try:
- hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
- processed_hostnames[hostname]['resolvable'] = True
- processed_hostnames[hostname]['resolved_hostname'] = hostname
- processed_hostnames[hostname]['aliases'] = aliases
- processed_hostnames[hostname]['ip_addresses'] = ip_addresses
- except:
- pass
-
- result['details']['hostnames'] = processed_hostnames
-
- result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)
-
- # check basic HTTP(S) reachability
- checked_urls = []
- checked_urls_set = set()
-
- for hostname in processed_hostnames.keys():
-
- item = processed_hostnames[hostname]
-
- if not item['resolvable']:
- continue
-
- for scheme in ('http', 'https'):
-
- url = scheme + '://' + item['resolved_hostname'] + '/'
-
- if url in checked_urls_set:
- continue
-
- checked_urls_set.add(url)
-
- record = {
- 'url': url,
- 'error': None,
- 'redirects_to': None,
- }
-
- try:
- req = requests.head(record['url'], headers=headers, allow_redirects=True)
- if req.url == url:
- logging.info("URL: %s - status %s", record['url'], req.status_code)
- else:
- logging.info("URL: %s - status %s - redirects to %s", record['url'],
- req.status_code, req.url)
- record['redirects_to'] = req.url
- except Exception as exc:
- record['error'] = {
- 'type': str(type(exc)),
- 'message': str(exc),
- }
- logging.info("URL %s: %s %s", url, str(type(exc)), exc)
-
- checked_urls.append(record)
-
- result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
- result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls))
-
- # Deeper test for the remaining (canonical) URL(s)
- for check_url in result['details']['canonical_urls']:
-
- logging.info("Downloading URL %s", check_url)
-
- check = {
- 'url': check_url,
- 'status_code': None,
- 'duration': None,
- 'error': None,
- 'content': None,
- 'responsive': None,
- }
-
- try:
- req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
- check['status_code'] = req.status_code
- check['duration'] = round(req.elapsed.microseconds / 1000)
-
- # Content checks
- if req.status_code < 300:
- check['content'] = check_content(req)
-
- # Responsiveness check
- try:
- check['responsive'] = check_responsiveness(check_url)
- except Exception as exc:
- logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
-
- except requests.exceptions.ConnectionError as exc:
- logging.error(str(exc) + " " + check_url)
- check['error'] = "connection"
- except requests.exceptions.ReadTimeout as exc:
- logging.error(str(exc) + " " + check_url)
- check['error'] = "read_timeout"
- except requests.exceptions.Timeout as exc:
- logging.error(str(exc) + " " + check_url)
- check['error'] = "connection_timeout"
- except Exception as exc:
- logging.error(str(exc) + " " + check_url)
- check['error'] = "unknown"
-
- result['details']['urlchecks'].append(check)
-
-
- result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
- key=lambda url: url['url'])
-
- # collect icons
- icons = set()
- for c in result['details']['urlchecks']:
- if 'content' not in c:
- continue
- if c['content'] is None:
- logging.warning("No content for %s", entry['url'])
- continue
- if c['content']['icon'] is not None:
- icons.add(c['content']['icon'])
- result['details']['icons'] = sorted(list(icons))
-
- # collect feeds
- feeds = set()
- for c in result['details']['urlchecks']:
- if c['content'] is None:
- logging.warning("No content for %s", entry['url'])
- continue
- if 'feeds' in c['content'] and len(c['content']['feeds']):
- for feed in c['content']['feeds']:
- feeds.add(feed)
- result['details']['feeds'] = sorted(list(feeds))
-
- # detect responsive
- viewports = set()
- min_width = 2000
- for c in result['details']['urlchecks']:
- if c['responsive'] is None:
- continue
- if c['responsive']['viewport_meta_tag'] is not None:
- viewports.add(c['responsive']['viewport_meta_tag'])
- widths = c['responsive']['document_width'].values()
- if min(widths) < min_width:
- min_width = min(widths)
- result['details']['responsive'] = {
- 'viewport_meta_tag': list(viewports),
- 'min_width': min_width,
- }
-
- # detect CMS
- for c in result['details']['urlchecks']:
- if c['content'] is None:
- continue
- if 'generator' not in c['content']:
- continue
- if c['content']['generator'] != "" and c['content']['generator'] is not None:
-
- result['details']['cms'] = parse_generator(c['content']['generator'])
- # Qualify certain CMS flavours in more detail
- if result['details']['cms'] == "typo3":
- if GCMS_IP in result['details']['ipv4_addresses']:
- result['details']['cms'] = "typo3-gcms"
- elif 'typo3-gruene.de' in c['content']['html']:
- result['details']['cms'] = "typo3-gruene"
- elif result['details']['cms'] == "wordpress":
- if 'Urwahl3000' in c['content']['html']:
- result['details']['cms'] = "wordpress-urwahl"
-
- else:
- # No generator Tag. Use HTML content.
- if 'Urwahl3000' in c['content']['html']:
- result['details']['cms'] = "wordpress-urwahl"
- elif ('josephknowsbest' in c['content']['html'] or
- 'Joseph-knows-best' in c['content']['html']):
- result['details']['cms'] = "wordpress-josephknowsbest"
- elif 'wordpress' in c['content']['html']:
- result['details']['cms'] = "wordpress"
-
- # we can stop here
- break
-
-
- ### Derive criteria
-
- # DNS_RESOLVABLE_IPV4
- if result['details']['ipv4_addresses']:
- result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
-
- # SITE_REACHABLE
- for item in result['details']['resolvable_urls']:
- if item['error'] is None:
- result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1}
- break
-
- # HTTPS
- for item in result['details']['urlchecks']:
- if item['error'] is None and item['url'].startswith('https://'):
- result['result']['HTTPS'] = {'value': True, 'score': 2}
- break
-
- # WWW_OPTIONAL
- num_hostnames = 0
- for hostname in result['details']['hostnames'].keys():
- item = result['details']['hostnames'][hostname]
- if not item['resolvable']:
- continue
- num_hostnames += 1
- if num_hostnames > 1:
- result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1}
-
- # CANONICAL_URL
- # - either there is only one canonical URL (through redirects)
- # - or several pages have identical rel=canonical links
- if len(result['details']['canonical_urls']) == 1:
- result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
- else:
- links = set()
- if result['details']['urlchecks'] is None:
- logging.warning("No urlchecks for %s", entry['url'])
- else:
- for item in result['details']['urlchecks']:
- if item['content'] is not None and item['content']['canonical_link'] is not None:
- links.add(item['content']['canonical_link'])
- if len(links) == 1:
- result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
-
- # FAVICON
- if result['details']['icons']:
- result['result']['FAVICON'] = {'value': True, 'score': 1}
-
- # FEEDS
- if result['details']['feeds']:
- result['result']['FEEDS'] = {'value': True, 'score': 1}
-
- # HTTP_RESPONSE_DURATION
- durations = []
- for item in result['details']['urlchecks']:
- if item['error'] is None:
- durations.append(item['duration'])
- if durations:
- val = round(statistics.mean(durations))
- result['result']['HTTP_RESPONSE_DURATION']['value'] = val
- if val < 100:
- result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
- elif val < 1000:
- result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
-
- # RESPONSIVE
- if result['details']['responsive'] is not None:
- if (result['details']['responsive']['min_width'] < 500 and
- len(result['details']['responsive']['viewport_meta_tag']) > 0):
- result['result']['RESPONSIVE']['value'] = True
- result['result']['RESPONSIVE']['score'] = 1
-
- # Overall score
- for item in result['result'].keys():
- result['score'] += result['result'][item]['score']
-
- # clean up - remove full HTML
- for item in result['details']['urlchecks']:
- try:
- del item['content']['html']
- except:
- pass
-
- return result
-
-
-@tenacity.retry(wait=tenacity.wait_exponential(),
- retry=tenacity.retry_if_exception_type(Aborted))
-def get_job_from_queue():
- """
- Returns a URL from the queue
- """
- out = None
-
- with DATASTORE_CLIENT.transaction():
- query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
- for entity in query.fetch(limit=1):
- logging.debug("Got job: %s", entity)
- out = dict(entity)
- out["url"] = entity.key.name
- DATASTORE_CLIENT.delete(entity.key)
-
- return out
-
-def work_of_queue():
- """
- Take job from queue and finish it until there are no more jobs
- """
- while True:
- job = get_job_from_queue()
- if job is None:
- logging.info("No more jobs. Exiting.")
- break
-
- logging.info("Starting job %s", job["url"])
- result = check_site(entry=job)
- #logging.debug(result)
- logging.info("Job %s finished checks", job["url"])
- logging.info("Job %s writing to DB", job["url"])
-
- key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
- entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
- record = {
- "created": datetime.utcnow(),
- "results": result,
- }
- entity.update(record)
- try:
- DATASTORE_CLIENT.put(entity)
- except InvalidArgument as ex:
- logging.error("Could not write result: %s", ex)
- except ex:
- logging.error("Could not write result: %s", ex)
-
-
-if __name__ == "__main__":
- """
- Bringing it all together
- """
- parser = argparse.ArgumentParser()
- parser.add_argument('--credentials-path', dest='credentials_path',
- help='Path to the service account credentials JSON file',
- default='/secrets/service-account.json')
- parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
- default='info')
-
- subparsers = parser.add_subparsers(help='sub-command help', dest='command')
-
- subparsers.add_parser('spider', help='Take jobs off the queue and spider')
-
- jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
-
- jobs_parser.add_argument('--url', help='Add a job to spider a URL')
- args = parser.parse_args()
-
- loglevel = args.loglevel.lower()
- if loglevel == 'error':
- logging.basicConfig(level=logging.ERROR)
- elif loglevel == 'warn':
- logging.basicConfig(level=logging.WARN)
- elif loglevel == 'debug':
- logging.basicConfig(level=logging.DEBUG)
- else:
- logging.basicConfig(level=logging.INFO)
- loglevel = 'info'
-
- logging.getLogger("urllib3").setLevel(logging.CRITICAL)
-
- DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
-
- logging.debug("Called command %s", args.command)
-
- if args.command == 'jobs':
- create_jobs(args.url)
- else:
- work_of_queue()
diff --git a/spider/__init__.py b/spider/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/spider/spider.py b/spider/spider.py
new file mode 100644
index 0000000..d1cf6de
--- /dev/null
+++ b/spider/spider.py
@@ -0,0 +1,106 @@
+"""
+Provides the spider functionality (website checks).
+"""
+
+import argparse
+import json
+import logging
+import re
+import statistics
+import time
+from datetime import datetime
+from pprint import pprint
+
+from google.api_core.exceptions import InvalidArgument
+from google.cloud import datastore
+
+import checks
+import config
+import jobs
+import rating
+
+def check_and_rate_site(entry):
+ """
+    Performs our site checks and rating and returns the results as a dict.
+    The check steps, implemented in the `checks` package, are roughly:
+
+    1. Normalize the input URL and derive the URLs to check for
+    2. HEAD the check URLs
+    3. Determine the canonical URL
+    4. Run the full check on the canonical URL
+ """
+
+ # all the info we'll return for the site
+ result = {
+ # input_url: The URL we derived all checks from
+ 'input_url': entry['url'],
+ # Meta: Regional and type metadata for the site
+ 'meta': {
+ 'type': entry.get('type'),
+ 'level': entry.get('level'),
+ 'state': entry.get('state'),
+ 'district': entry.get('district'),
+ 'city': entry.get('city'),
+ },
+ # checks: Results from our checks
+ 'checks': {},
+ # The actual report scoring criteria
+ 'rating': {},
+ # resulting score
+ 'score': 0.0,
+ }
+
+ # Results from our next generation checkers
+ result['checks'] = checks.perform_checks(entry['url'])
+
+ result['rating'] = rating.calculate_rating(result['checks'])
+
+ # Overall score is the sum of the individual scores
+ for key in result['rating']:
+ result['score'] += result['rating'][key]['score']
+
+ # remove full HTML page content,
+ # as it's no longer needed
+ try:
+ for url in result['checks']['page_content']:
+ del result['checks']['page_content'][url]['content']
+ except:
+ pass
+
+ return result
+
+
+def work_of_queue(datastore_client, entity_kind):
+ """
+    Take jobs from the queue and process them until no jobs are left.
+ """
+ while True:
+ job = jobs.get_job_from_queue(datastore_client)
+ if job is None:
+ logging.info("No more jobs. Exiting.")
+ break
+
+ logging.info("Starting job %s", job["url"])
+ result = check_and_rate_site(entry=job)
+
+ logging.debug("Full JSON representation of returned result: %s", json.dumps(result))
+
+ logging.info("Job %s finished checks", job["url"])
+ logging.info("Job %s writing to DB", job["url"])
+
+ key = datastore_client.key(entity_kind, job["url"])
+ entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
+ record = {
+ 'created': datetime.utcnow(),
+ 'meta': result['meta'],
+ 'checks': result['checks'],
+ 'rating': result['rating'],
+ 'score': result['score'],
+ }
+ entity.update(record)
+ try:
+ datastore_client.put(entity)
+ except InvalidArgument as ex:
+ logging.error("Could not write result: %s", ex)
+ except Exception as ex:
+ logging.error("Could not write result: %s", ex)
+
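+
+# Illustrative usage (assumes valid Datastore credentials; in this project
+# the entry point is cli.py, which wires this up):
+#
+#   from google.cloud import datastore
+#   client = datastore.Client.from_service_account_json('/secrets/datastore-writer.json')
+#   work_of_queue(client, 'spider-results')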
diff --git a/spider/spider_test.py b/spider/spider_test.py
new file mode 100644
index 0000000..dda55e7
--- /dev/null
+++ b/spider/spider_test.py
@@ -0,0 +1,26 @@
+import unittest
+
+from spider.spider import check_and_rate_site
+
+from pprint import pprint
+
+class TestSpider(unittest.TestCase):
+
+ def test_url1(self):
+
+ entry = {
+ "url": "https://httpbin.org/html",
+ "type": "type",
+ "state": "state",
+ "level": "level",
+ "district": "district",
+ "city": "city",
+ }
+
+ url = "https://httpbin.org/html"
+ result = check_and_rate_site(entry)
+
+ self.assertEqual(result["input_url"], url)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/spider_test.py b/spider_test.py
deleted file mode 100644
index a617147..0000000
--- a/spider_test.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import unittest
-import requests
-import responses
-import spider
-
-
-class TestDeriveHostnames(unittest.TestCase):
-
- def test_basic1(self):
- hn = spider.derive_test_hostnames('www.my-domain.de')
- expected = ['my-domain.de', 'www.my-domain.de']
- self.assertEqual(hn, expected)
-
- def test_basic2(self):
- hn = spider.derive_test_hostnames('domain.de')
- expected = ['domain.de', 'www.domain.de']
- self.assertEqual(hn, expected)
-
-
-class TestReduceURLs(unittest.TestCase):
-
- def test_basic(self):
- testdata = [
- {'url': 'one', 'error': None, 'redirects_to': None},
- {'url': 'two', 'error': 'Yes', 'redirects_to': None},
- {'url': 'three', 'error': None, 'redirects_to': 'five'},
- ]
- expected_result = ['five', 'one']
- result = spider.reduce_urls(testdata)
- self.assertEqual(result, expected_result)
-
-
-class TestContentChecks(unittest.TestCase):
-
- @responses.activate
- def test_minimal(self):
- url = 'http://my.url'
- responses.add(responses.GET, url, status=200,
- content_type='text/html',
- body='')
- r = requests.get(url)
- result = spider.check_content(r)
-
- del result['html'] # don't want to have the messy HTML part in comparison
-
- expected_result = {
- 'icon': None,
- 'title': None,
- 'generator': None,
- 'feeds': [],
- 'encoding': 'iso-8859-1',
- 'canonical_link': None,
- 'opengraph': None
- }
- self.assertDictEqual(result, expected_result)
-
- @responses.activate
- def test_basic(self):
- url = 'http://my.url'
- responses.add(responses.GET, url, status=200,
- content_type='text/html; charset=UTF-8',
- body='''
-<html>
-  <head>
-    <title>The page's title</title>
-    <link rel="icon" href="http://foo.bar/image.png">
-    <link rel="canonical" href="https://my.site.com/">
-    <link rel="alternate" type="application/rss+xml" href="http://example.com/feed">
-    <meta name="generator" content="some-cms/1.0">
-  </head>
-  <body></body>
-</html>
- ''')
- r = requests.get(url)
- result = spider.check_content(r)
-
- del result['html'] # don't want to have the messy HTML part in comparison
-
- expected_result = {
- 'icon': 'http://foo.bar/image.png',
- 'title': 'The page\'s title',
- 'generator': 'some-cms/1.0',
- 'feeds': [
- 'http://example.com/feed',
- ],
- 'encoding': 'utf-8',
- 'canonical_link': 'https://my.site.com/',
- 'opengraph': None
- }
- self.assertDictEqual(result, expected_result)
-
- @responses.activate
- def test_opengraph(self):
- url = 'http://my.url'
- responses.add(responses.GET, url, status=200,
- content_type='text/html; charset=UTF-8',
- body='''
-<html>
-  <head>
-    <meta property="og:title" content="og title">
-    <meta property="og:type" content="website">
-    <meta property="og:url" content="http://my.url/">
-    <meta property="og:image" content="http://my.url/image.png">
-  </head>
-</html>
- ''')
- r = requests.get(url)
- result = spider.check_content(r)
-
- del result['html'] # don't want to have the messy HTML part in comparison
-
- expected_result = {
- 'icon': None,
- 'title': None,
- 'generator': None,
- 'feeds': [],
- 'encoding': 'utf-8',
- 'canonical_link': None,
- 'opengraph': ['og:image', 'og:title', 'og:type', 'og:url'],
- }
- self.assertDictEqual(result, expected_result)
-
-
-if __name__ == '__main__':
- unittest.main()