Refactor and modularize spider (#70)

See PR description for details
Marian Steinbach 2018-10-03 11:05:42 +02:00 committed by GitHub
parent 7514aeb542
commit ae6a2e83e9
47 changed files with 2289 additions and 1004 deletions


@ -4,3 +4,4 @@ docs
secrets
temp
venv
/export-*

1 .gitignore vendored

@ -7,3 +7,4 @@ __pycache__
.vscode/settings.json
webapp/dist/bundle.js
dev-shm
/export-*


@ -6,5 +6,12 @@ services:
notifications:
email: false
language: python
python:
- "3.6"
script:
- pip install --upgrade pip
- pip install --upgrade codecov
- make test
- codecov


@ -1,17 +1,20 @@
FROM python:3.6-alpine3.7
FROM python:3.6-alpine3.8
# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.7/main" >> /etc/apk/repositories && \
echo "http://dl-4.alpinelinux.org/alpine/v3.7/community" >> /etc/apk/repositories && \
apk update && \
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git && \
apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
pip3 install --upgrade pip && \
pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
apk del python3-dev build-base
ADD spider.py /
ADD spider_test.py /
ADD data_export.py /
ADD cli.py /
ADD config /config
ADD jobs /jobs
ADD checks /checks
ADD rating /rating
ADD spider /spider
ADD export /export
ENTRYPOINT ["python3"]
CMD ["/spider.py"]
ENTRYPOINT ["python3", "/cli.py"]


@ -1,18 +1,20 @@
IMAGE := quay.io/netzbegruenung/green-spider:latest
DB_ENTITY := spider-results
.PHONY: dockerimage
# Build docker image
dockerimage:
docker build -t quay.io/netzbegruenung/green-spider:latest .
docker build -t $(IMAGE) .
# Create spider job queue
spiderjobs: dockerimage
docker run --rm -ti \
-v $(PWD)/secrets:/secrets \
quay.io/netzbegruenung/green-spider:latest spider.py \
$(IMAGE) \
--credentials-path /secrets/datastore-writer.json \
--loglevel debug \
--loglevel info \
jobs
# Run spider in docker image
@ -21,11 +23,26 @@ spider: dockerimage
-v $(PWD)/dev-shm:/dev/shm \
-v $(PWD)/webapp/dist/data:/out \
-v $(PWD)/secrets:/secrets \
quay.io/netzbegruenung/green-spider:latest spider.py \
$(IMAGE) \
--credentials-path /secrets/datastore-writer.json \
--loglevel info \
spider
--loglevel debug \
spider --kind $(DB_ENTITY)
export: dockerimage
docker run --rm -ti \
-v $(PWD)/export-json:/out \
-v $(PWD)/secrets:/secrets \
-v $(PWD)/export-siteicons:/icons \
$(IMAGE) \
--credentials-path /secrets/datastore-reader.json \
--loglevel debug \
export --kind $(DB_ENTITY)
# run spider tests
# FIXME
test: dockerimage
docker run --rm -ti quay.io/netzbegruenung/green-spider:latest /spider_test.py
docker run --rm -ti \
--entrypoint "python3" \
$(IMAGE) \
-m unittest discover -p '*_test.py'

64 checks/__init__.py Normal file

@ -0,0 +1,64 @@
"""
The checks module gathers information about a site or individual pages
and tests certain aspects of their functionality.
"""
import logging
from checks import charset
from checks import certificate
from checks import dns_resolution
from checks import duplicate_content
from checks import domain_variations
from checks import generator
from checks import html_head
from checks import http_and_https
from checks import page_content
from checks import load_in_browser
from checks import url_reachability
from checks import url_canonicalization
from checks.config import Config
def perform_checks(input_url):
"""
Executes all our URL/site checks and returns one large result dict.
"""
# The sequence of checks to run. Order is important!
# Checks which expand the URLs list must come first.
# After that, dependencies (encoded in the checks) have to be fulfilled.
check_modules = [
('domain_variations', domain_variations),
('http_and_https', http_and_https),
('dns_resolution', dns_resolution),
('url_reachability', url_reachability),
('certificate', certificate),
('url_canonicalization', url_canonicalization),
('page_content', page_content),
('duplicate_content', duplicate_content),
('charset', charset),
('html_head', html_head),
('generator', generator),
('load_in_browser', load_in_browser),
]
results = {}
config = Config(urls=[input_url],
user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
'Safari/537.36 green-spider/0.2')
for check_name, check in check_modules:
checker = check.Checker(config=config,
previous_results=results)
result = checker.run()
results[check_name] = result
# update config for the next check
config = checker.config
logging.debug("config after check %s: %r" % (check_name, config))
return results
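For reference, a minimal usage sketch of the new checks package (the URL is only an example; perform_checks issues real network and browser requests):

from checks import perform_checks

# run the full check sequence for one input URL
results = perform_checks('https://www.example.com/')

# results is a dict keyed by check name ('dns_resolution', 'page_content', ...)
for check_name, check_result in results.items():
    print(check_name, type(check_result))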


@ -0,0 +1,23 @@
class AbstractChecker(object):
"""
Our blueprint for checks
"""
def __init__(self, config, previous_results=None):
self._config = config
# A dictionary of results from previous checkers.
# Key is the name of the checker that has generated the result.
self._previous_results = previous_results
def run(self):
"""Executes the check routine, returns result dict"""
raise NotImplementedError()
@property
def config(self):
return self._config
@property
def previous_results(self):
return self._previous_results
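A hypothetical checker following this blueprint could look like the sketch below; the check and its return value are invented for illustration and are not part of this PR:

from checks.abstract_checker import AbstractChecker

class Checker(AbstractChecker):
    """Illustrative no-op checker: reports how many URLs are configured."""

    def run(self):
        # a checker returns an arbitrary (serializable) result structure
        return {'url_count': len(self.config.urls)}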

62 checks/certificate.py Normal file

@ -0,0 +1,62 @@
"""
Gathers information on the TLS/SSL certificate used by a server
"""
from urllib.parse import urlparse
import logging
import ssl
from datetime import datetime
from datetime import timezone
from OpenSSL import crypto
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
results = {}
for url in self.config.urls:
if url.startswith('https://'):
results[url] = self.get_certificate(url)
return results
def get_certificate(self, url):
result = {
'exception': None,
'serial_number': None,
'subject': None,
'issuer': None,
'not_before': None,
'not_after': None
}
parsed = urlparse(url)
try:
cert = ssl.get_server_certificate((parsed.hostname, 443))
x509 = crypto.load_certificate(crypto.FILETYPE_PEM, cert)
result['serial_number'] = str(x509.get_serial_number())
nb = x509.get_notBefore().decode('utf-8')
na = x509.get_notAfter().decode('utf-8')
# parse timestamps like '20180627000000Z' (YYYYMMDDHHMMSSZ)
result['not_before'] = datetime(int(nb[0:4]), int(nb[4:6]), int(nb[6:8]), int(nb[8:10]), int(nb[10:12]), int(nb[12:14]), tzinfo=timezone.utc).isoformat()
result['not_after'] = datetime(int(na[0:4]), int(na[4:6]), int(na[6:8]), int(na[8:10]), int(na[10:12]), int(na[12:14]), tzinfo=timezone.utc).isoformat()
# decode and convert from bytes to unicode
result['subject'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for tup in x509.get_subject().get_components()])
result['issuer'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for tup in x509.get_issuer().get_components()])
except Exception as e:
result['exception'] = {
'type': str(type(e)),
'message': str(e),
}
logging.warning("Error when getting certificate for %s: %r" % (url, e))
return result


@ -0,0 +1,27 @@
from checks import certificate
from checks.config import Config
import unittest
class TestCertificateChecker(unittest.TestCase):
def test_google(self):
url = 'https://www.google.com/'
config = Config(urls=[url])
checker = certificate.Checker(config=config, previous_results={})
result = checker.run()
self.assertIn(url, result)
self.assertIsNone(result[url]['exception'])
self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services')
def test_kaarst(self):
url = 'https://www.gruenekaarst.de/'
config = Config(urls=[url])
checker = certificate.Checker(config=config, previous_results={})
result = checker.run()
self.assertIn(url, result)
self.assertIsNone(result[url]['exception'])
self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')
if __name__ == '__main__':
unittest.main()

77 checks/charset.py Normal file

@ -0,0 +1,77 @@
"""
Checks which character set a page has.
TODO: Check for http-equiv meta tags like
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1" />
"""
import logging
from bs4 import BeautifulSoup
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
assert 'page_content' in self.previous_results
results = {}
for url in self.config.urls:
results[url] = self.get_charset(url)
return results
def get_charset(self, url):
"""
Expects the 'page_content' check result for this URL to carry the HTML content
"""
page_content = self.previous_results['page_content'][url]
assert 'content' in page_content
assert 'response_headers' in page_content
logging.debug("%r", page_content['response_headers'])
assert 'content-type' in page_content['response_headers']
if page_content['content'] is None:
return
result = {
'meta_charset_tag': None,
'content_type_header_charset': None,
'charset': 'iso-8859-1', # ISO-8859-1 is the default according to https://www.w3.org/International/articles/http-charset/index
'valid': None,
'exception': None,
}
soup = BeautifulSoup(page_content['content'], 'html.parser')
# get response header charset
if ('content-type' in page_content['response_headers']
and 'charset=' in page_content['response_headers']['content-type']):
parts = page_content['response_headers']['content-type'].split("charset=", 1)
result['content_type_header_charset'] = parts[1].lower()
result['charset'] = parts[1].lower()
# get meta tag charset
metatags = soup.find_all('meta')
for tag in metatags:
if 'charset' in tag.attrs:
result['meta_charset_tag'] = tag['charset'].lower()
# meta tag overrules any previous value
result['charset'] = tag['charset'].lower()
# check for charset plausibility (only for most common ones)
if result['charset'] in ('iso-8859-1', 'utf-8'):
try:
_ = page_content['content'].encode(result['charset'])
except UnicodeEncodeError as e:
result['valid'] = False
result['exception'] = str(e)
else:
result['valid'] = True
return result

49 checks/charset_test.py Normal file

@ -0,0 +1,49 @@
import httpretty
from httpretty import httprettified
import unittest
from checks import charset
from checks import page_content
from checks.config import Config
@httprettified
class TestCharsetChecker(unittest.TestCase):
def test_http_response(self):
url = 'http://www.example.com/'
httpretty.register_uri(httpretty.GET, url,
body="""<html>
<head>
<meta http-equiv="Content-type" value="text/html; charset=foo">
<meta charset="utf-8">
<title>Hello</title>
</head>
</html>""",
adding_headers={
"Content-Type": "text/html; charset=ISO-8859-1",
})
results = {}
config = Config(urls=[url])
page_content_checker = page_content.Checker(config=config, previous_results={})
results['page_content'] = page_content_checker.run()
self.assertIn(url, results['page_content'])
self.assertIn('response_headers', results['page_content'][url])
self.assertIn('content-type', results['page_content'][url]['response_headers'])
charset_checker = charset.Checker(config=page_content_checker.config, previous_results=results)
result = charset_checker.run()
self.assertIn(url, result)
self.assertEqual(result[url], {
'meta_charset_tag': 'utf-8',
'content_type_header_charset': 'iso-8859-1',
'charset': 'utf-8',
'valid': True,
'exception': None,
})
if __name__ == '__main__':
unittest.main()

29 checks/config.py Normal file

@ -0,0 +1,29 @@
class Config(object):
"""
Our configuration to be passed to checks
"""
def __init__(self, urls, user_agent='green-spider/1.0'):
self._urls = set(urls)
self._user_agent = user_agent
def __repr__(self):
return "Config(urls=%r)" % self._urls
@property
def urls(self):
return list(self._urls)
def add_url(self, url):
self._urls.add(url)
def remove_url(self, url):
"""Removes url from urls, if it was in there. Ignores errors."""
try:
self._urls.remove(url)
except KeyError:
pass
@property
def user_agent(self):
return self._user_agent
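A short illustration of how Config deduplicates and mutates the URL set (URLs are examples only):

from checks.config import Config

config = Config(urls=['http://example.com/', 'http://example.com/'])
print(config.urls)       # duplicates collapse to a single entry

config.add_url('https://example.com/')
config.remove_url('http://not-in-the-set.example/')  # silently ignored
print(len(config.urls))  # 2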

55 checks/dns_resolution.py Normal file

@ -0,0 +1,55 @@
"""
This check attempts to resolve all hostnames/domains in the input URLs.
URLs which are not resolvable are removed from the config.
"""
import logging
from socket import gethostbyname_ex
from urllib.parse import urlparse
from urllib.parse import urlunparse
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
"""Executes the check routine, returns result dict"""
results = {}
urls = list(self.config.urls)
for url in urls:
parsed = urlparse(url)
results[url] = self.resolve_hostname(parsed.hostname)
# remove URL if non-resolvable
if not results[url]['resolvable']:
self.config.remove_url(url)
return results
def resolve_hostname(self, hostname):
"""
Resolve a single hostname to IPv4 address(es)
"""
result = {
'hostname': hostname,
'resolvable': False,
'aliases': [],
'ipv4_addresses': [],
}
try:
hostname, aliases, ipv4_addresses = gethostbyname_ex(hostname)
result['resolvable'] = True
result['aliases'] = aliases
result['ipv4_addresses'] = ipv4_addresses
except Exception as e:
logging.debug("Hostname %s not resolvable. Exception: %r" % (hostname, e))
return result
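The result fields mirror what socket.gethostbyname_ex returns; for illustration (values are made up):

from socket import gethostbyname_ex

# returns (canonical hostname, alias list, IPv4 address list)
print(gethostbyname_ex('www.example.com'))
# e.g. ('www.example.com', [], ['93.184.216.34'])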


@ -0,0 +1,44 @@
"""
This adds commonly tried variations of domains/subdomains to the URLs config.
"""
import logging
from urllib.parse import urlparse
from urllib.parse import urlunparse
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
urls = list(self.config.urls)
for url in urls:
parsed = urlparse(url)
hostnames = self.expand_hostname(parsed.hostname)
for hostname in hostnames:
self.config.add_url(urlunparse((parsed.scheme, hostname,
parsed.path, parsed.params, parsed.query, parsed.fragment)))
return None
def expand_hostname(self, hostname):
"""
Create variations of subdomains
"""
hostnames = set()
hostnames.add(hostname)
if hostname.startswith('www.'):
# remove 'www.' prefix
hostnames.add(hostname[4:])
else:
# add 'www.' prefix
hostnames.add('www.' + hostname)
return sorted(list(hostnames))

107 checks/duplicate_content.py Normal file

@ -0,0 +1,107 @@
"""
This checker looks at the similarity between previously downloaded pages
and removes duplicates from the config URLs
"""
import logging
import html_similarity
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
# value above which we consider a page pair a duplicate
similarity_threshold = 0.99999
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
if len(self.config.urls) == 1:
# nothing to do for us
return
urls = list(self.config.urls)
# get content
content = {}
assert 'page_content' in self.previous_results
for url in urls:
page_content = self.previous_results['page_content'][url]
if page_content['content'] is None:
logging.warn("Content for URL %s is None" % url)
content[url] = page_content['content']
pairs = self.compare_pairwise(content)
# remove duplicates
for key in pairs:
if pairs[key]['similarity'] is None:
continue
if pairs[key]['similarity'] > self.similarity_threshold:
# this pair is a duplicate.
# Decide which one to keep
url1, url2 = key.split(" ", 1)
reject = self.select_url_to_reject(url1, url2)
self.config.remove_url(reject)
return pairs
def compare_pairwise(self, content):
# compare pairwise
pairs = {}
for url1 in content:
for url2 in content:
if url1 == url2:
continue
# avoid checking pairs twice
pair_key = " ".join(sorted([url1, url2]))
if pair_key in pairs:
continue
try:
s = html_similarity.similarity(content[url1], content[url2])
logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s)
pairs[pair_key] = {
'similarity': s,
'exception': None,
}
except (AttributeError, ValueError) as e:
logging.error("html_similarity.similarity thre exception for URL pair %s and %s: %s", url1, url2, e)
pairs[pair_key] = {
'similarity': None,
'exception': str(e),
}
return pairs
def select_url_to_reject(self, url1, url2):
"""Determine which of two URLs to keep, which to reject"""
# HTTPS takes precedence
if url1.startswith('https://') and not url2.startswith('https://'):
return url2
elif url2.startswith('https://') and not url1.startswith('https://'):
return url1
# Shorter URL wins
if len(url1) < len(url2):
return url2
elif len(url1) > len(url2):
return url1
# default behaviour
return url1

76 checks/generator.py Normal file

@ -0,0 +1,76 @@
"""
Checks the 'generator' meta tag and page content properties
to detect well-known content management systems, themes etc.
"""
import logging
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
# IP address of the newthinking GCMS server
gcms_ip = "91.102.13.20"
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
assert 'page_content' in self.previous_results
assert 'html_head' in self.previous_results
results = {}
for url in self.config.urls:
results[url] = self.get_generator(url)
return results
def get_generator(self, url):
page_content = self.previous_results['page_content'][url]
assert 'content' in page_content
assert 'dns_resolution' in self.previous_results
dns_resolution = self.previous_results['dns_resolution']
head = self.previous_results['html_head'][url]
generator = None
if 'generator' in head and head['generator'] is not None:
generator = head['generator'].lower()
if 'typo3' in generator:
generator = 'typo3'
if 'wordpress' in generator:
generator = 'wordpress'
if 'drupal' in generator:
generator = 'drupal'
if 'joomla' in generator:
generator = 'joomla'
# Qualify certain CMS flavours in more detail
if generator == "typo3":
# Typo3-Gruene advertises in the page content
if 'typo3-gruene.de' in page_content['content']:
generator = "typo3-gruene"
# newthinking GCMS in some page hrefs
elif 'ntc_gcms' in page_content['content']:
generator = "typo3-gcms"
# check if one of the IPs matches the well-known GCMS Server IP
elif url in dns_resolution:
for addr in dns_resolution[url]['ipv4_addresses']:
if addr == self.gcms_ip:
generator = "typo3-gcms"
elif 'Urwahl3000' in page_content['content']:
generator = "wordpress-urwahl"
elif ('josephknowsbest' in page_content['content'] or
'Joseph-knows-best' in page_content['content']):
generator = "wordpress-josephknowsbest"
elif 'wordpress' in page_content['content']:
generator = "wordpress"
return generator

152 checks/html_head.py Normal file

@ -0,0 +1,152 @@
"""
Extracts information from the html <head>, like existence and value
of certain meta tags, link tags, title, etc.
"""
import logging
import re
from urllib.parse import urljoin
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
results = {}
for url in self.config.urls:
results[url] = self.get_content(url)
return results
def get_content(self, url):
"""
Expects the 'page_content' check result for this URL to carry the HTML content
"""
page_content = self.previous_results['page_content'][url]
assert 'content' in page_content
assert 'response_headers' in page_content
assert 'content-type' in page_content['response_headers']
if page_content['content'] is None:
return
soup = BeautifulSoup(page_content['content'], 'html.parser')
head = soup.find('head')
result = {
'title': self.get_title(head),
'link_canonical': self.get_link_canonical(head, url),
'link_rss_atom': self.get_link_rss_atom(head, url),
'link_icon': self.get_link_icon(head, url),
'generator': self.get_generator(head),
'opengraph': self.get_opengraph(head),
'viewport': self.get_viewport(head),
}
return result
def get_title(self, head):
"""Extract and clean up page title"""
if head is None:
return
title = None
tag = head.find('title')
if tag is None:
return
title = tag.get_text()
# clean up
title = title.replace(u'\u00a0', ' ')
title = title.replace(' ', ' ')
title = title.strip()
return title
def get_link_canonical(self, head, url):
if head is None:
return
link = head.find('link', rel='canonical')
if link:
return urljoin(url, link.get('href'))
def get_link_rss_atom(self, head, url):
if head is None:
return
hrefs = []
rss_links = head.find_all('link', type='application/rss+xml')
atom_links = head.find_all('link', type='application/atom+xml')
if rss_links:
for link in rss_links:
hrefs.append(link.get('href'))
if atom_links:
for link in atom_links:
hrefs.append(link.get('href'))
# make URLs absolute
for i in range(len(hrefs)):
parsed = urlparse(hrefs[i])
if parsed.scheme == '':
hrefs[i] = urljoin(url, hrefs[i])
return hrefs
def get_link_icon(self, head, url):
if head is None:
return
tag = head.find('link', rel=lambda x: x and x.lower() == 'icon')
if tag:
return urljoin(url, tag.get('href'))
tag = head.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
if tag:
return urljoin(url, tag.get('href'))
def get_generator(self, head):
if head is None:
return
tags = head.select('[name=generator]')
if tags:
return tags[0].get('content')
def get_opengraph(self, head):
if head is None:
return
# we find tags by matching this property/itemprop value regex
property_re = re.compile('^og:')
opengraph = set()
for tag in head.find_all(property=property_re):
opengraph.add(tag.get('property'))
for tag in head.find_all(itemprop=property_re):
opengraph.add(tag.get('itemprop'))
opengraph = sorted(list(opengraph))
if opengraph != []:
return opengraph
def get_viewport(self, head):
if head is None:
return
tags = head.select('[name=viewport]')
if tags:
return tags[0].get('content')

27 checks/http_and_https.py Normal file

@ -0,0 +1,27 @@
"""
This adds, for every HTTP URL, the HTTPS counterpart,
and vice versa, to config.urls.
So it doesn't actually perform tests; it only expands the
set of URLs to be tested by other checks.
"""
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
"""
Adds URLs to config.urls, returns nothing
"""
for url in self.config.urls:
if url.startswith('https://'):
self.config.add_url('http://' + url[8:])
elif url.startswith('http://'):
self.config.add_url('https://' + url[7:])
return None

134 checks/load_in_browser.py Normal file

@ -0,0 +1,134 @@
"""
Collects information by loading pages in a browser.
Information includes:
- whether the document width adapts well to viewports as narrow as 360 pixels
- whether javascript errors or errors from missing resources occur
- collects CSS font-family properties in use
"""
import logging
import time
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
import tenacity
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
page_load_timeout = 20
# sizes we check for (width, height)
sizes = (
(360, 640), # rather old smartphone
(768, 1024), # older tablet or newer smartphone
(1024, 768), # older desktop or horiz. tablet
(1920, 1080), # Full HD horizontal
)
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
# Our selenium user agent using Chrome headless as an engine
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-extensions')
self.driver = webdriver.Chrome(options=chrome_options)
self.driver.set_page_load_timeout(self.page_load_timeout)
def run(self):
results = {}
for url in self.config.urls:
results[url] = {
'sizes': None,
'min_document_width': None,
'logs': None,
'font_families': None,
}
# responsive check
try:
sizes = self.check_responsiveness(url)
results[url] = {
'sizes': sizes,
'min_document_width': min([s['document_width'] for s in sizes]),
'logs': self.capture_log(),
}
except TimeoutException as e:
logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
pass
except tenacity.RetryError as re:
logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
pass
# CSS collection
font_families = None
try:
elements = self.driver.find_elements_by_xpath("//*")
font_families = set()
for element in elements:
try:
font_family = element.value_of_css_property('font-family')
if font_family is None:
continue
font_families.add(font_family.lower())
except StaleElementReferenceException as e:
logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
continue
results[url]['font_families'] = sorted(list(font_families))
except TimeoutException as e:
logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
pass
self.driver.quit()
return results
@tenacity.retry(stop=tenacity.stop_after_attempt(3),
retry=tenacity.retry_if_exception_type(TimeoutException))
def check_responsiveness(self, url):
result = []
# set window to the first size initially
self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1])
self.driver.get(url)
# give the page some time to load
time.sleep(10)
for (width, height) in self.sizes:
self.driver.set_window_size(width, height)
# wait for re-render/re-flow
time.sleep(1.0)
doc_width = self.driver.execute_script("return document.body.scrollWidth")
result.append({
'viewport_width': width,
'document_width': int(doc_width),
})
return result
def capture_log(self):
"""
Returns log elements with level "SEVERE"
"""
entries = []
for entry in self.driver.get_log('browser'):
if entry['level'] in ('WARNING', 'SEVERE'):
entries.append(entry)
return entries

94 checks/page_content.py Normal file

@ -0,0 +1,94 @@
"""
This check downloads the HTML page for each URL
"""
import logging
import requests
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
# connection timeout (seconds)
CONNECT_TIMEOUT = 10
# response timeout (seconds)
READ_TIMEOUT = 20
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
results = {}
self.headers = {
"User-Agent": self.config.user_agent,
}
# copy URLs, as we may be manipulating self.config.urls in the loop
urls = list(self.config.urls)
for url in urls:
result = self.download_page(url)
results[url] = result
# remove bad URLs from config, to avoid later checks using them
if 'exception' in result and result['exception'] is not None:
self.config.remove_url(url)
return results
def download_page(self, url):
result = {
'url': url,
'content': None,
'content_type': None,
'content_length': None,
'status_code': None,
'response_headers': None,
'duration': None,
'exception': None,
}
try:
r = requests.get(url,
headers=self.headers,
timeout=(self.CONNECT_TIMEOUT, self.READ_TIMEOUT))
result['url'] = r.url
result['status_code'] = r.status_code
result['content'] = r.text
result['content_length'] = len(r.text)
result['response_headers'] = self.get_headers(r.headers)
result['duration'] = round(r.elapsed.total_seconds() * 1000)
if r.headers.get("content-type") is not None:
result['content_type'] = r.headers.get("content-type").split(";")[0].strip()
except requests.exceptions.ConnectionError as exc:
logging.error(str(exc) + " " + url)
result['exception'] = "connection"
except requests.exceptions.ReadTimeout as exc:
logging.error(str(exc) + " " + url)
result['exception'] = "read_timeout"
except requests.exceptions.Timeout as exc:
logging.error(str(exc) + " " + url)
result['exception'] = "connection_timeout"
except Exception as exc:
logging.error(str(exc) + " " + url)
result['exception'] = "%s %s" % (str(type(exc)), exc)
return result
def get_headers(self, headers):
"""
Transforms CaseInsensitiveDict into dict with lowercase keys
"""
out = {}
for key in headers:
out[key.lower()] = headers[key]
return out


@ -0,0 +1,13 @@
"""
This check verifies whether there is a single URL
or several variants left at this point.
"""
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
return self.config.urls

104 checks/url_reachability.py Normal file

@ -0,0 +1,104 @@
"""
This check verifies whether the URLs in config are reachable.
Some additional information regarding redirects and SSL problems
is also recorded and returned as results.
Non-accessible URLs are removed from config.urls.
A redirect to facebook.com is not considered reachable, as that
leads to a different website in the sense of this system.
TODO: Parallelize the work done in this test
"""
import logging
from urllib.parse import urlparse
import requests
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
headers = {
"User-Agent": self.config.user_agent
}
results = {}
urls = list(self.config.urls)
for url in urls:
logging.debug("Checking URL reachability for %s", url)
result = {
"url": url,
"redirect_history": [],
"status": None,
"exception": None,
"duration": None,
}
# Perform HEAD requests, recording redirect log
try:
r = requests.head(url, headers=headers, allow_redirects=True)
result['status'] = r.status_code
result['duration'] = round(r.elapsed.total_seconds() * 1000)
if len(r.history):
result['redirect_history'] = self.expand_history(r.history)
logging.debug("Redirects: %r", result['redirect_history'])
if r.url == url:
logging.debug("URL: %s - status %s", url, r.status_code)
else:
logging.debug("URL: %s - status %s - redirects to %s", url,
r.status_code, r.url)
# remove source URL, add target URL to config.urls
self.config.remove_url(url)
self.config.add_url(r.url)
# remove 404 etc
if r.status_code > 400:
self.config.remove_url(url)
except Exception as exc:
logging.info("Exception for URL %s: %s %s", url, str(type(exc)), exc)
result['exception'] = {
'type': str(type(exc)),
'message': str(exc),
}
# remove URL to prevent further checks on unreachable URL
self.config.remove_url(url)
# if the redirects end at an unsupported target domain
# (www.facebook.com, www.denic.de, sedo.com), remove this URL again
if result['exception'] is None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
result['exception'] = {
'type': 'Bad target domain',
'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname,
}
self.config.remove_url(url)
results[url] = result
return results
def expand_history(self, history):
"""Extracts primitives from a list of requests.Response objects"""
items = []
for h in history:
item = {
'status': h.status_code,
'duration': round(h.elapsed.total_seconds() * 1000),
'redirect_to': h.headers['location'],
}
items.append(item)
return items


@ -0,0 +1,71 @@
import httpretty
from httpretty import httprettified
import unittest
from checks import url_reachability
from checks.config import Config
@httprettified
class TestUrlReachabilityChecker(unittest.TestCase):
def test_success(self):
url = 'http://www.example.com/'
httpretty.register_uri(httpretty.HEAD, url,
status=200, body="<html></html>")
config = Config(urls=[url])
checker = url_reachability.Checker(config=config, previous_results={})
result = checker.run()
self.assertEqual(result[url]['url'], url)
self.assertEqual(result[url]['redirect_history'], [])
self.assertEqual(result[url]['status'], 200)
self.assertIsNone(result[url]['exception'])
self.assertTrue(0 < result[url]['duration'] < 100)
def test_redirect(self):
url = 'http://www.example.com/'
url2 = 'http://www2.example.com/'
httpretty.register_uri(httpretty.HEAD, url,
status=302, body="",
adding_headers={"Location": url2})
httpretty.register_uri(httpretty.HEAD, url2,
status=200, body="<html></html>")
config = Config(urls=[url])
checker = url_reachability.Checker(config=config, previous_results={})
result = checker.run()
self.assertIn(url, result)
self.assertEqual(result[url]['url'], url)
self.assertEqual(result[url]['status'], 200)
self.assertIsNone(result[url]['exception'])
self.assertTrue(0 < result[url]['duration'] < 100)
self.assertEqual(len(result[url]['redirect_history']), 1)
self.assertEqual(result[url]['redirect_history'][0]['status'], 302)
self.assertEqual(result[url]['redirect_history'][0]['redirect_to'], url2)
def test_notfound(self):
url = 'http://www.example.com/'
httpretty.register_uri(httpretty.HEAD, url,
status=404, body="<html><body>Not found</body></html>")
config = Config(urls=[url])
checker = url_reachability.Checker(config=config, previous_results={})
result = checker.run()
self.assertEqual(result[url]['url'], url)
self.assertEqual(result[url]['redirect_history'], [])
self.assertEqual(result[url]['status'], 404)
self.assertIsNone(result[url]['exception'])
newconfig = checker.config
self.assertEqual(len(newconfig.urls), 0)
if __name__ == '__main__':
unittest.main()

83 cli.py Normal file

@ -0,0 +1,83 @@
"""
Command line utility for spider, export etc.
"""
import argparse
import logging
import signal
import sys
from google.cloud import datastore
def handle_sigint(signum, frame):
"""
Handles SIGINT, which occurs on Ctrl-C
"""
print("\nInterrupted by SIGINT\n")
sys.exit()
if __name__ == "__main__":
signal.signal(signal.SIGINT,handle_sigint)
parser = argparse.ArgumentParser()
# global flags
parser.add_argument('--credentials-path', dest='credentials_path',
help='Path to the service account credentials JSON file',
default='/secrets/service-account.json')
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
default='info')
# subcommands
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
# spider subcommand
spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
# jobs subcommand
jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
# export subcommand
export_parser = subparsers.add_parser('export', help='Export JSON data')
export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)')
args = parser.parse_args()
# set log level
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
loglevel = args.loglevel.lower()
if loglevel == 'error':
logging.basicConfig(level=logging.ERROR)
elif loglevel == 'warn':
logging.basicConfig(level=logging.WARN)
elif loglevel == 'debug':
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("selenium").setLevel(logging.INFO)
else:
logging.basicConfig(level=logging.INFO)
loglevel = 'info'
logging.debug("Called command %s", args.command)
datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
if args.command == 'jobs':
import jobs
jobs.create_jobs(datastore_client, args.url)
elif args.command == 'export':
import export
export.export_screenshots(datastore_client)
export.export_results(datastore_client, args.kind)
else:
from spider import spider
spider.work_of_queue(datastore_client, args.kind)

23 config/__init__.py Normal file

@ -0,0 +1,23 @@
# connection timeout for website checks (seconds)
CONNECT_TIMEOUT = 5
# response timeout for website checks
READ_TIMEOUT = 10
# Git repo for our data
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
# folder we use locally to clone the repo
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
# IP address of the newthinking GCMS server
GCMS_IP = "91.102.13.20"
# kind name of the spider job key datastore entities
JOB_DATASTORE_KIND = 'spider-jobs'


@ -19,6 +19,8 @@
# secrets/datastore-writer.json
DOCKERIMAGE="quay.io/netzbegruenung/green-spider:dev"
API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
source $API_TOKEN_SECRET
@ -29,10 +31,14 @@ if [[ "$1" == "" ]]; then
exit 1
fi
SERVERNAME="$1-$(date | md5 | cut -c1-3)"
# possible values: cx11 (1 core 2 GB), cx21 (2 cores, 4 GB), cx31 (2 cores, 8 GB)
SERVERTYPE="cx21"
function create_server()
{
echo "Creating server $1"
echo "Creating server $SERVERNAME"
# server_type 'cx11' is the smallest, cheapest category.
# location 'nbg1' is Nürnberg/Nuremberg, Germany.
@ -44,8 +50,8 @@ function create_server()
-H "Content-Type: application/json" \
-H "Authorization: Bearer $API_TOKEN" \
-d "{
\"name\": \"$1\",
\"server_type\": \"cx11\",
\"name\": \"$SERVERNAME\",
\"server_type\": \"$SERVERTYPE\",
\"location\": \"nbg1\",
\"start_after_create\": true,
\"image\": \"debian-9\",
@ -61,7 +67,7 @@ function create_server()
# Get IP:
SERVER_IP=$(echo $CREATE_RESPONSE | jq -r .server.public_net.ipv4.ip)
echo "Created server with ID $SERVER_ID and IP $SERVER_IP"
echo "Created server $SERVERNAME with ID $SERVER_ID and IP $SERVER_IP"
}
@ -142,22 +148,25 @@ else
# Run docker job
echo "Starting Docker Job"
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
-v /root/secrets:/secrets \
quay.io/netzbegruenung/green-spider spider.py \
--credentials-path /secrets/datastore-writer.json \
jobs
#ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
# -v /root/secrets:/secrets \
# quay.io/netzbegruenung/green-spider spider.py \
# --credentials-path /secrets/datastore-writer.json \
# jobs
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP mkdir -p /dev-shm
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
-v /dev-shm:/dev/shm \
-v /root/secrets:/secrets \
quay.io/netzbegruenung/green-spider spider.py \
$DOCKERIMAGE \
--credentials-path /secrets/datastore-writer.json \
spider
--loglevel info \
spider --kind spider-results-dev
fi
# Delete the box
echo "Deleting server $SERVER_ID"
echo "Deleting server $SERVERNAME with ID $SERVER_ID"
curl -s -X DELETE -H "Content-Type: application/json" \
-H "Authorization: Bearer $API_TOKEN" \
https://api.hetzner.cloud/v1/servers/$SERVER_ID


@ -2,8 +2,7 @@
Exports data from the database to JSON files for use in a static webapp
"""
from google.cloud import datastore
import hashlib
from hashlib import md5
import json
import logging
import sys
@ -14,44 +13,67 @@ import requests
SITEICONS_PATH = "/icons"
client = None
def export_results():
def export_results(client, entity_kind):
"""
Export of the main results data
"""
out = []
query = client.query(kind='spider-results')
# Load data from database
query = client.query(kind=entity_kind)
for entity in query.fetch():
logging.debug(entity.key.name)
record = dict(entity)
record["results"]["created"] = record["created"].isoformat()
out.append(record["results"])
out.append({
'input_url': entity.key.name,
'resulting_urls': entity.get('checks').get('url_canonicalization'),
'created': entity.get('created').isoformat(),
'meta': entity.get('meta'),
'checks': entity.get('checks'),
'rating': entity.get('rating'),
'score': entity.get('score'),
'icons': [],
})
# load icons, reformat icons details
icons_downloaded = set()
for index in range(len(out)):
if "details" not in out[index]:
assert "checks" in out[index]
assert "html_head" in out[index]["checks"]
# collect icons urls
icons = set()
for url in out[index]['checks']['html_head']:
assert 'link_icon' in out[index]['checks']['html_head'][url]
if out[index]['checks']['html_head'][url]['link_icon'] is not None:
iconurl = out[index]['checks']['html_head'][url]['link_icon']
if iconurl.startswith("data:"):
continue
if "icons" not in out[index]["details"]:
if iconurl in icons_downloaded:
continue
urls = out[index]["details"]["icons"]
out[index]["details"]["icons"] = {}
for url in urls:
if not (url.startswith("http://") or url.startswith("https://")):
logging.debug("Skipping icon %s", url)
continue
logging.debug("Dowloading icon %s", url)
filename = download_icon(url)
icons.add(iconurl)
out[index]["icons"] = {}
for iconurl in list(icons):
logging.debug("Dowloading icon %s", iconurl)
icons_downloaded.add(iconurl)
filename = download_icon(iconurl)
if filename:
out[index]["details"]["icons"][url] = filename
out[index]["icons"][url] = filename
output_filename = "/out/spider_result.json"
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
# compact version
output_filename = "/out/spider_result_compact.json"
for i in range(len(out)):
out[i]['cms'] = list(out[i]['checks']['generator'].values())
del out[i]['checks']
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
def export_screenshots():
def export_screenshots(client):
"""
Export of screenshot meta data
"""
@ -78,10 +100,12 @@ def download_icon(icon_url):
"""
default_endings = {
"image/x-ico": "ico",
"image/x-icon": "ico",
"image/vnd.microsoft.icon": "ico",
"image/png": "png",
"image/jpeg": "jpg",
"image/gif": "gif",
}
# Download the icon
@ -92,7 +116,7 @@ def download_icon(icon_url):
if req.status_code >= 400:
return None
content_hash = hashlib.md5(req.content).hexdigest()
content_hash = md5(req.content).hexdigest()
extension = ""
try:
@ -109,6 +133,9 @@ def download_icon(icon_url):
if extension == "":
# derive from content type
ctype = req.headers.get('content-type')
if ctype is None:
return
try:
extension = default_endings[ctype]
except KeyError:
@ -122,17 +149,3 @@ def download_icon(icon_url):
iconfile.write(req.content)
return filename
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
if len(sys.argv) == 1:
print("Error: please provide path to Google Storage API system account JSON file as argument")
sys.exit(1)
key_path = sys.argv[1]
client = datastore.Client.from_service_account_json(key_path)
export_screenshots()
export_results()

180 jobs/__init__.py Normal file

@ -0,0 +1,180 @@
"""
The jobs module allows creating jobs for the queue and taking jobs off the queue
"""
from datetime import datetime
import logging
import os
import random
import shutil
from git import Repo
import tenacity
import yaml
from google.api_core.exceptions import Aborted
from google.cloud import datastore
import config
def clone_data_directory():
"""
Clones the source of website URLs, the green directory,
into the local file system using git
"""
if os.path.exists(config.GREEN_DIRECTORY_LOCAL_PATH):
shutil.rmtree(config.GREEN_DIRECTORY_LOCAL_PATH)
Repo.clone_from(config.GREEN_DIRECTORY_REPO, config.GREEN_DIRECTORY_LOCAL_PATH)
def directory_entries():
"""
Iterator over all data files in the cloned green directory
"""
path = os.path.join(config.GREEN_DIRECTORY_LOCAL_PATH, config.GREEN_DIRECTORY_DATA_PATH)
for root, _, files in os.walk(path):
for fname in files:
filepath = os.path.join(root, fname)
if not filepath.endswith(".yaml"):
continue
with open(filepath, 'r', encoding='utf8') as yamlfile:
for doc in yaml.load_all(yamlfile):
yield doc
def chunks(the_list, size):
"""
Yield successive n-sized chunks from list the_list
where n = size.
"""
for i in range(0, len(the_list), size):
yield the_list[i:i + size]
def create_jobs(datastore_client, url=None):
"""
Read all URLs from green directory and fill a job database
with one job per URL.
Alternatively, if the url argument is given, only the given URL
will be added as a spider job.
"""
# refresh our local clone of the green directory
logging.info("Refreshing green-directory clone")
clone_data_directory()
# build the list of website URLs to run checks for
logging.info("Processing green-directory")
input_entries = []
count = 0
random.seed()
for entry in directory_entries():
if 'type' not in entry:
logging.error("Entry without type")
continue
if 'urls' not in entry:
logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
continue
website_url = None
for index in range(len(entry['urls'])):
try:
if entry['urls'][index]['type'] == "WEBSITE":
website_url = entry['urls'][index]['url']
if website_url:
if url is not None and website_url != url:
continue
input_entries.append({
"url": website_url,
"type": entry.get("type"),
"level": entry.get("level"),
"state": entry.get("state"),
"district": entry.get("district"),
"city": entry.get("city"),
})
count += 1
except NameError:
logging.error("Error in %s: 'url' key missing (%s)",
repr_entry(entry), entry['urls'][index])
# ensure the passed URL argument is really there, even if not part
# of the directory.
if url and count == 0:
logging.info("Adding job for URL %s which is not part of green-directory", url)
input_entries.append({
"url": url,
"type": None,
"level": None,
"state": None,
"district": None,
"city": None,
"index": int(random.uniform(1000000, 9999999)),
})
count = 0
logging.info("Writing jobs")
entities = []
for entry in input_entries:
key = datastore_client.key(config.JOB_DATASTORE_KIND, entry["url"])
entity = datastore.Entity(key=key)
entity.update({
"created": datetime.utcnow(),
"type": entry["type"],
"level": entry["level"],
"state": entry["state"],
"district": entry["district"],
"city": entry["city"],
"index": int(random.uniform(1000000, 9999999)),
})
entities.append(entity)
# commit to DB
for chunk in chunks(entities, 300):
logging.debug("Writing jobs chunk of length %d", len(chunk))
datastore_client.put_multi(chunk)
count += len(chunk)
logging.info("Writing jobs done, %s jobs added", count)
@tenacity.retry(wait=tenacity.wait_exponential(),
retry=tenacity.retry_if_exception_type(Aborted))
def get_job_from_queue(datastore_client):
"""
Returns one job from the queue as a dict, or None if the queue is empty
"""
out = None
with datastore_client.transaction():
query = datastore_client.query(kind=config.JOB_DATASTORE_KIND,
order=['index'])
for entity in query.fetch(limit=1):
logging.debug("Got job: %s", entity)
out = dict(entity)
out["url"] = entity.key.name
datastore_client.delete(entity.key)
return out
def repr_entry(entry):
"""
Return string representation of a directory entry,
for logging/debugging purposes
"""
ret = entry['type']
if 'level' in entry:
ret += "/" + entry['level']
if 'state' in entry:
ret += "/" + entry['state']
if 'district' in entry:
ret += "/" + entry['district']
return ret
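A rough sketch of how a consumer could drain the queue created here, using only functions from this module (the credentials path matches the one used in the Makefile):

from google.cloud import datastore
import jobs

client = datastore.Client.from_service_account_json('/secrets/datastore-writer.json')

while True:
    job = jobs.get_job_from_queue(client)
    if job is None:
        break  # queue is empty
    print('would spider', job['url'])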

53 rating/__init__.py Normal file

@ -0,0 +1,53 @@
"""
The rating module contains the functionality to calculate scores for certain
criteria, based on information gathered by checks beforehand.
"""
import logging
from rating import canonical_url
from rating import favicon
from rating import feeds
from rating import https
from rating import no_network_errors
from rating import no_script_errors
from rating import reachable
from rating import resolvable
from rating import response_duration
from rating import responsive_layout
from rating import use_specific_fonts
from rating import www_optional
def calculate_rating(results):
"""
Calculates ratings for a number of criteria.
Params:
results - Results dictionary from checks
"""
# The raters to execute.
rating_modules = {
'CANONICAL_URL': canonical_url,
'DNS_RESOLVABLE_IPV4': resolvable,
'FAVICON': favicon,
'FEEDS': feeds,
'HTTPS': https,
'HTTP_RESPONSE_DURATION': response_duration,
'NO_NETWORK_ERRORS': no_network_errors,
'NO_SCRIPT_ERRORS': no_script_errors,
'RESPONSIVE': responsive_layout,
'SITE_REACHABLE': reachable,
'USE_SPECIFIC_FONTS': use_specific_fonts,
'WWW_OPTIONAL': www_optional,
}
output = {}
for name in rating_modules:
rater = rating_modules[name].Rater(results)
output[name] = rater.rate()
return output
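Checks and ratings are meant to be chained; a condensed, standalone sketch of that flow (the URL is an example):

from checks import perform_checks
from rating import calculate_rating

check_results = perform_checks('https://www.example.com/')
rating = calculate_rating(check_results)

# every criterion reports 'type', 'value', 'score' and 'max_score'
total = sum(item['score'] for item in rating.values())
maximum = sum(item['max_score'] for item in rating.values())
print('%s / %s points' % (total, maximum))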

22 rating/abstract_rater.py Normal file

@ -0,0 +1,22 @@
class AbstractRater(object):
# String 'boolean' or 'number'
rating_type = None
# The default value to return if no rating given
default_value = None
max_score = 1
# Name of the checks this rater depends on
depends_on_checks = []
def __init__(self, check_results):
self.check_results = check_results
for item in self.depends_on_checks:
assert item in self.check_results
def rate(self):
raise NotImplementedError()

31 rating/canonical_url.py Normal file

@ -0,0 +1,31 @@
"""
This looks at remaining resolvable URLs after redirects
and gives a score if there is only one URL left.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['url_canonicalization']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
if len(self.check_results['url_canonicalization']) == 1:
value = True
score = self.max_score
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}

32 rating/favicon.py Normal file

@ -0,0 +1,32 @@
"""
This gives a score if the site has an icon.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['html_head']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
for url in self.check_results['html_head']:
if self.check_results['html_head'][url]['link_icon'] is not None:
value = True
score = self.max_score
break
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}

35 rating/feeds.py Normal file

@ -0,0 +1,35 @@
"""
This gives a score if the site has feeds.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['html_head']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
for url in self.check_results['html_head']:
if self.check_results['html_head'][url]['link_rss_atom'] is None:
continue
if self.check_results['html_head'][url]['link_rss_atom'] == []:
continue
value = True
score = self.max_score
break
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}

47 rating/https.py Normal file

@ -0,0 +1,47 @@
"""
This looks at all HTTPS URLs we checked for reachability.
If all of them were reachable without errors, we give full score.
If some or all had errors, or no HTTPS URL is reachable, we give zero.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['url_reachability']
# HTTPS is very important, so this counts double
max_score = 2
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
reachable_count = 0
unreachable_count = 0
for url in self.check_results['url_reachability']:
if not url.startswith('https://'):
continue
if self.check_results['url_reachability'][url]['exception'] is None:
reachable_count += 1
else:
unreachable_count += 1
if unreachable_count == 0 and reachable_count > 0:
value = True
score = self.max_score
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}


@ -0,0 +1,48 @@
"""
If all URLs could be loaded without severe network errors, this rater gives a score.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['load_in_browser']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
found_pageloads = 0
found_errors = 0
for url in self.check_results['load_in_browser']:
if (self.check_results['load_in_browser'][url]['logs'] == [] or
self.check_results['load_in_browser'][url]['logs'] is None):
continue
found_pageloads += 1
# scan log entries for script errors
for entry in self.check_results['load_in_browser'][url]['logs']:
if entry['source'] != 'network':
continue
if entry['level'] != 'SEVERE':
continue
found_errors += 1
if found_pageloads > 0 and found_errors == 0:
value = True
score = self.max_score
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}


@ -0,0 +1,42 @@
"""
If all URLs could be loaded without JavaScript errors, this rater gives a score.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['load_in_browser']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
found_pageloads = 0
found_errors = 0
for url in self.check_results['load_in_browser']:
if self.check_results['load_in_browser'][url]['logs'] == []:
found_pageloads += 1
continue
# scan log entries for script errors
for entry in self.check_results['load_in_browser'][url]['logs']:
if entry['source'] == 'javascript':
found_errors += 1
if found_pageloads > 0 and found_errors == 0:
value = True
score = self.max_score
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}

36 rating/reachable.py Normal file

@ -0,0 +1,36 @@
"""
This gives a score if one of the checked URL variations was reachable.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['url_reachability']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
count = 0
for url in self.check_results['url_reachability']:
if self.check_results['url_reachability'][url]['exception'] is not None:
continue
count += 1
if count > 0:
value = True
score = self.max_score
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}

35 rating/resolvable.py Normal file

@ -0,0 +1,35 @@
"""
This gives a score if at least one of the input URLs' hostnames was resolvable.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['dns_resolution']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
count = 0
for url in self.check_results['dns_resolution']:
if self.check_results['dns_resolution'][url]['resolvable']:
count += 1
if count > 0:
value = True
score = self.max_score
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}


@ -0,0 +1,46 @@
"""
This looks at the response duration(s) and scores based on the bucket
the value is in. Fast responses get one point, slower ones half a point,
and anything above one second gets nothing.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'number'
default_value = False
depends_on_checks = ['page_content']
max_score = 1.0
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
duration_sum = 0
duration_count = 0
for url in self.check_results['page_content']:
if self.check_results['page_content'][url]['exception'] is not None:
continue
duration_sum += self.check_results['page_content'][url]['duration']
duration_count += 1
if duration_count > 0:
value = round(duration_sum / duration_count)
# value is duration in milliseconds
if value < 100:
score = self.max_score
elif value < 1000:
score = self.max_score * 0.5
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}


@ -0,0 +1,35 @@
"""
This gives a score if the site's minimal document width during checks
was smaller than or equal to the minimal viewport size tested.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['load_in_browser']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
for url in self.check_results['load_in_browser']:
if (self.check_results['load_in_browser'][url]['min_document_width'] <=
self.check_results['load_in_browser'][url]['sizes'][0]['viewport_width']):
value = True
score = self.max_score
# we use the first URL found here
break
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}

View File

@ -0,0 +1,41 @@
"""
Checks whether the pages use the font 'Arvo'.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['load_in_browser']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
urls_with_font = 0
urls_without_font = 0
for url in self.check_results['load_in_browser']:
if self.check_results['load_in_browser'][url]['font_families'] is None:
urls_without_font += 1
continue
fonts = " ".join(self.check_results['load_in_browser'][url]['font_families'])
if 'arvo' in fonts:
urls_with_font += 1
if urls_with_font > 0 and urls_without_font == 0:
score = self.max_score
value = True
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}

44
rating/www_optional.py Normal file
View File

@ -0,0 +1,44 @@
"""
This looks at reachable URLs and checks whether (sub)domains
both with and without www. are reachable.
"""
from urllib.parse import urlparse
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
rating_type = 'boolean'
default_value = False
depends_on_checks = ['url_reachability']
max_score = 1
def __init__(self, check_results):
super().__init__(check_results)
def rate(self):
value = self.default_value
score = 0
hostnames = set()
for url in self.check_results['url_reachability']:
if self.check_results['url_reachability'][url]['exception'] is not None:
continue
parsed = urlparse(url)
hostnames.add(parsed.hostname)
# FIXME
# we simply check whether there is more than one hostname.
# this works with our current input URLs but might be too
# simplistic in the future.
if len(list(hostnames)) > 1:
value = True
score = self.max_score
return {
'type': self.rating_type,
'value': value,
'score': score,
'max_score': self.max_score,
}
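To illustrate the hostname counting above (and why the parsed hostname is added rather than the full URL): http and https variants of the same host collapse into a single hostname, so only the presence of both the www and the non-www variant earns the point. The URLs below are made up for illustration:
check_results = {
    'url_reachability': {
        'http://example.com/':      {'exception': None},
        'https://example.com/':     {'exception': None},  # same hostname as above
        'https://www.example.com/': {'exception': None},  # second hostname
    },
}
# hostnames collected: {'example.com', 'www.example.com'} -> value=True, score=1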

814
spider.py
View File

@ -1,814 +0,0 @@
"""
Provides the spider functionality (website checks).
"""
import argparse
import json
import logging
import os
import random
import re
import shutil
import statistics
import time
from datetime import datetime
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse
import requests
import yaml
import tenacity
from bs4 import BeautifulSoup
from git import Repo
from selenium import webdriver
from google.cloud import datastore
from google.api_core.exceptions import Aborted
from google.api_core.exceptions import InvalidArgument
# configuration
# connection timeout for website checks (seconds)
CONNECT_TIMEOUT = 5
# response timeout for website checks
READ_TIMEOUT = 10
# Git repo for our data
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
RESULT_PATH = '/out'
# IP address of the newthinking GCMS server
GCMS_IP = "91.102.13.20"
JOB_DATASTORE_KIND = 'spider-jobs'
RESULTS_DATASTORE_KIND = 'spider-results'
# end configuration
DATASTORE_CLIENT = None
def chunks(the_list, size):
"""
Yield successive n-sized chunks from list the_list
where n = size.
"""
for i in range(0, len(the_list), size):
yield the_list[i:i + size]
def create_jobs(url=None):
"""
Read all URLs from green directory and fill a job database
with one job per URL.
Alternatively, if the url argument is given, only the given URL
will be added as a spider job.
"""
# refresh our local clone of the green directory
logging.info("Refreshing green-directory clone")
get_green_directory()
# build the list of website URLs to run checks for
logging.info("Processing green-directory")
input_entries = []
count = 0
for entry in dir_entries():
if 'type' not in entry:
logging.error("Entry without type")
continue
if 'urls' not in entry:
logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
continue
website_url = None
for index in range(len(entry['urls'])):
try:
if entry['urls'][index]['type'] == "WEBSITE":
website_url = entry['urls'][index]['url']
if website_url:
if url is not None and website_url != url:
continue
input_entries.append({
"url": website_url,
"level": entry.get("level"),
"state": entry.get("state"),
"district": entry.get("district"),
"city": entry.get("city"),
})
count += 1
except NameError:
logging.error("Error in %s: 'url' key missing (%s)",
repr_entry(entry), entry['urls'][index])
# ensure the passed URL argument is really there, even if not part
# of the directory.
if url and count == 0:
logging.info("Adding job for URL %s which is not part of green-directory", url)
input_entries.append({
"url": url,
"level": None,
"state": None,
"district": None,
"city": None,
})
# randomize order, to distribute requests over servers
logging.debug("Shuffling input URLs")
random.seed()
random.shuffle(input_entries)
count = 0
logging.info("Writing jobs")
entities = []
for entry in input_entries:
key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
entity = datastore.Entity(key=key)
entity.update({
"created": datetime.utcnow(),
"level": entry["level"],
"state": entry["state"],
"district": entry["district"],
"city": entry["city"],
})
entities.append(entity)
# commit to DB
for chunk in chunks(entities, 300):
logging.debug("Writing jobs chunk of length %d", len(chunk))
DATASTORE_CLIENT.put_multi(chunk)
count += len(chunk)
logging.info("Writing jobs done, %s jobs added", count)
def get_green_directory():
"""
Clones the source of website URLs, the green directory,
into the local file system using git
"""
if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
def dir_entries():
"""
Iterator over all data files in the cloned green directory
"""
path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
for root, _, files in os.walk(path):
for fname in files:
filepath = os.path.join(root, fname)
if not filepath.endswith(".yaml"):
continue
with open(filepath, 'r', encoding='utf8') as yamlfile:
for doc in yaml.load_all(yamlfile):
yield doc
def repr_entry(entry):
"""
Return string representation of a directory entry,
for logging/debugging purposes
"""
ret = entry['type']
if 'level' in entry:
ret += "/" + entry['level']
if 'state' in entry:
ret += "/" + entry['state']
if 'district' in entry:
ret += "/" + entry['district']
return ret
def derive_test_hostnames(hostname):
"""
Derives the hostname variants to test for a given hostname.
From 'gruene-x.de' or 'www.gruene-x.de' it makes
['gruene-x.de', 'www.gruene-x.de']
which are both plausible web URLs to be used for a domain.
"""
hostnames = set()
hostnames.add(hostname)
if hostname.startswith('www.'):
hostnames.add(hostname[4:])
else:
hostnames.add('www.' + hostname)
return sorted(list(hostnames))
def reduce_urls(urllist):
"""
Reduce a list of urls with metadata by eliminating those
that either don't work or lead somewhere else
"""
targets = set()
for url in urllist:
if url['error'] is not None:
continue
if url['redirects_to'] is not None:
targets.add(url['redirects_to'])
else:
targets.add(url['url'])
return sorted(list(targets))
def normalize_title(title):
"""
Removes garbage from HTML page titles
"""
title = title.replace(u'\u00a0', ' ')
title = title.replace(' ', ' ')
title = title.strip()
return title
def check_responsiveness(url):
"""
Checks
- whether a page adapts to different viewport sizes
- whether a viewport meta tag exists
and returns details
"""
details = {
'document_width': {},
'viewport_meta_tag': None,
}
# sizes we check for (width, height)
sizes = (
(320, 480), # old smartphone
(768, 1024), # older tablet or newer smartphone
(1024, 768), # older desktop or horiz. tablet
(1920, 1080), # Full HD horizontal
)
# Our selenium user agent using Chrome headless as an engine
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-extensions')
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.set_page_load_timeout(60)
driver.set_window_size(sizes[0][0], sizes[0][1])
driver.get(url)
time.sleep(1)
for (width, height) in sizes:
driver.set_window_size(width, height)
key = "%sx%s" % (width, height)
width = driver.execute_script("return document.body.scrollWidth")
details['document_width'][key] = int(width)
try:
element = driver.find_element_by_xpath("//meta[@name='viewport']")
details['viewport_meta_tag'] = element.get_attribute('content')
except:
pass
return details
def check_content(req):
"""
Returns a dict with content details for the fetched page.
req: requests response object for the page
"""
result = {}
result['encoding'] = req.encoding.lower()
soup = BeautifulSoup(req.text, 'html.parser')
result['html'] = req.text
# page title
result['title'] = None
title = None
head = soup.find('head')
if head is not None:
title = head.find('title')
if title is not None:
result['title'] = normalize_title(title.get_text())
# canonical link
result['canonical_link'] = None
link = soup.find('link', rel='canonical')
if link:
result['canonical_link'] = urljoin(req.url, link.get('href'))
# icon
result['icon'] = None
link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
if link:
result['icon'] = urljoin(req.url, link.get('href'))
else:
link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
if link:
result['icon'] = urljoin(req.url, link.get('href'))
# feed links
result['feeds'] = []
rss_links = soup.find_all('link', type='application/rss+xml')
atom_links = soup.find_all('link', type='application/atom+xml')
if rss_links:
for link in rss_links:
result['feeds'].append(urljoin(req.url, link.get('href')))
if atom_links:
for link in atom_links:
result['feeds'].append(urljoin(req.url, link.get('href')))
# generator meta tag
result['generator'] = None
if head is not None:
generator = head.select('[name=generator]')
if generator:
result['generator'] = generator[0].get('content')
# opengraph meta tags
result['opengraph'] = None
opengraph = set()
if head is not None:
for item in head.find_all(property=re.compile('^og:')):
opengraph.add(item.get('property'))
for item in head.find_all(itemprop=re.compile('^og:')):
opengraph.add(item.get('itemprop'))
if opengraph:
result['opengraph'] = sorted(list(opengraph))
return result
def collect_ipv4_addresses(hostname_dict):
"""
Return list of unique IPv4 addresses
"""
ips = set()
for item in hostname_dict.values():
if 'ip_addresses' not in item:
continue
for ip_addr in item['ip_addresses']:
ips.add(ip_addr)
return sorted(list(ips))
def parse_generator(generator):
"""
Return well known CMS names from generator
"""
generator = generator.lower()
if 'typo3' in generator:
return "typo3"
if 'wordpress' in generator:
return "wordpress"
if 'drupal' in generator:
return "drupal"
if 'joomla' in generator:
return "joomla"
return generator
def check_site(entry):
"""
Performs our site check and returns results as a dict.
1. Normalize the input URL and derive the URLs to check for
2. HEAD the check urls
3. Determine the canonical URL
4. Run full check on canonical URL
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
'AppleWebKit/537.36 (KHTML, like Gecko) ' +
'Chrome/65.0.3325.181 green-spider/0.1'
}
# all the info we'll return for the site
result = {
# input_url: The URL we derived all checks from
'input_url': entry['url'],
# Meta: Regional and type metadata for the site
'meta': {
'level': entry.get('level'),
'state': entry.get('state'),
'district': entry.get('district'),
'city': entry.get('city'),
},
# Details: All details we collected about the site (which aren't directly
# related to the report criteria)
'details': {
'hostnames': {},
'ipv4_addresses': [],
'resolvable_urls': [],
'canonical_urls': [],
'urlchecks': [],
'icons': [],
'feeds': [],
'cms': None,
'responsive': None,
},
# The actual report criteria
'result': {
'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0},
'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0},
'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0},
'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0},
'FAVICON': {'type': 'boolean', 'value': False, 'score': 0},
'FEEDS': {'type': 'boolean', 'value': False, 'score': 0},
'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0},
'RESPONSIVE': {'type': 'boolean', 'value': False, 'score': 0},
},
'score': 0.0,
}
# derive hostnames to test (with/without www.)
parsed = urlparse(entry['url'])
hostnames = derive_test_hostnames(parsed.hostname)
# try to resolve hostnames
processed_hostnames = {}
for hostname in hostnames:
processed_hostnames[hostname] = {
'resolvable': False,
}
try:
hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
processed_hostnames[hostname]['resolvable'] = True
processed_hostnames[hostname]['resolved_hostname'] = hostname
processed_hostnames[hostname]['aliases'] = aliases
processed_hostnames[hostname]['ip_addresses'] = ip_addresses
except:
pass
result['details']['hostnames'] = processed_hostnames
result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)
# check basic HTTP(S) reachability
checked_urls = []
checked_urls_set = set()
for hostname in processed_hostnames.keys():
item = processed_hostnames[hostname]
if not item['resolvable']:
continue
for scheme in ('http', 'https'):
url = scheme + '://' + item['resolved_hostname'] + '/'
if url in checked_urls_set:
continue
checked_urls_set.add(url)
record = {
'url': url,
'error': None,
'redirects_to': None,
}
try:
req = requests.head(record['url'], headers=headers, allow_redirects=True)
if req.url == url:
logging.info("URL: %s - status %s", record['url'], req.status_code)
else:
logging.info("URL: %s - status %s - redirects to %s", record['url'],
req.status_code, req.url)
record['redirects_to'] = req.url
except Exception as exc:
record['error'] = {
'type': str(type(exc)),
'message': str(exc),
}
logging.info("URL %s: %s %s", url, str(type(exc)), exc)
checked_urls.append(record)
result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls))
# Deeper test for the remaining (canonical) URL(s)
for check_url in result['details']['canonical_urls']:
logging.info("Downloading URL %s", check_url)
check = {
'url': check_url,
'status_code': None,
'duration': None,
'error': None,
'content': None,
'responsive': None,
}
try:
req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
check['status_code'] = req.status_code
check['duration'] = round(req.elapsed.microseconds / 1000)
# Content checks
if req.status_code < 300:
check['content'] = check_content(req)
# Responsiveness check
try:
check['responsive'] = check_responsiveness(check_url)
except Exception as exc:
logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
except requests.exceptions.ConnectionError as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "connection"
except requests.exceptions.ReadTimeout as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "read_timeout"
except requests.exceptions.Timeout as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "connection_timeout"
except Exception as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "unknown"
result['details']['urlchecks'].append(check)
result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
key=lambda url: url['url'])
# collect icons
icons = set()
for c in result['details']['urlchecks']:
if 'content' not in c:
continue
if c['content'] is None:
logging.warning("No content for %s", entry['url'])
continue
if c['content']['icon'] is not None:
icons.add(c['content']['icon'])
result['details']['icons'] = sorted(list(icons))
# collect feeds
feeds = set()
for c in result['details']['urlchecks']:
if c['content'] is None:
logging.warning("No content for %s", entry['url'])
continue
if 'feeds' in c['content'] and len(c['content']['feeds']):
for feed in c['content']['feeds']:
feeds.add(feed)
result['details']['feeds'] = sorted(list(feeds))
# detect responsive
viewports = set()
min_width = 2000
for c in result['details']['urlchecks']:
if c['responsive'] is None:
continue
if c['responsive']['viewport_meta_tag'] is not None:
viewports.add(c['responsive']['viewport_meta_tag'])
widths = c['responsive']['document_width'].values()
if min(widths) < min_width:
min_width = min(widths)
result['details']['responsive'] = {
'viewport_meta_tag': list(viewports),
'min_width': min_width,
}
# detect CMS
for c in result['details']['urlchecks']:
if c['content'] is None:
continue
if 'generator' not in c['content']:
continue
if c['content']['generator'] != "" and c['content']['generator'] is not None:
result['details']['cms'] = parse_generator(c['content']['generator'])
# Qualify certain CMS flavours in more detail
if result['details']['cms'] == "typo3":
if GCMS_IP in result['details']['ipv4_addresses']:
result['details']['cms'] = "typo3-gcms"
elif 'typo3-gruene.de' in c['content']['html']:
result['details']['cms'] = "typo3-gruene"
elif result['details']['cms'] == "wordpress":
if 'Urwahl3000' in c['content']['html']:
result['details']['cms'] = "wordpress-urwahl"
else:
# No generator Tag. Use HTML content.
if 'Urwahl3000' in c['content']['html']:
result['details']['cms'] = "wordpress-urwahl"
elif ('josephknowsbest' in c['content']['html'] or
'Joseph-knows-best' in c['content']['html']):
result['details']['cms'] = "wordpress-josephknowsbest"
elif 'wordpress' in c['content']['html']:
result['details']['cms'] = "wordpress"
# we can stop here
break
### Derive criteria
# DNS_RESOLVABLE_IPV4
if result['details']['ipv4_addresses']:
result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
# SITE_REACHABLE
for item in result['details']['resolvable_urls']:
if item['error'] is None:
result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1}
break
# HTTPS
for item in result['details']['urlchecks']:
if item['error'] is None and item['url'].startswith('https://'):
result['result']['HTTPS'] = {'value': True, 'score': 2}
break
# WWW_OPTIONAL
num_hostnames = 0
for hostname in result['details']['hostnames'].keys():
item = result['details']['hostnames'][hostname]
if not item['resolvable']:
continue
num_hostnames += 1
if num_hostnames > 1:
result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1}
# CANONICAL_URL
# - either there is only one canonical URL (through redirects)
# - or several pages have identical rel=canonical links
if len(result['details']['canonical_urls']) == 1:
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
else:
links = set()
if result['details']['urlchecks'] is None:
logging.warning("No urlchecks for %s", entry['url'])
else:
for item in result['details']['urlchecks']:
if item['content'] is not None and item['content']['canonical_link'] is not None:
links.add(item['content']['canonical_link'])
if len(links) == 1:
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
# FAVICON
if result['details']['icons']:
result['result']['FAVICON'] = {'value': True, 'score': 1}
# FEEDS
if result['details']['feeds']:
result['result']['FEEDS'] = {'value': True, 'score': 1}
# HTTP_RESPONSE_DURATION
durations = []
for item in result['details']['urlchecks']:
if item['error'] is None:
durations.append(item['duration'])
if durations:
val = round(statistics.mean(durations))
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
if val < 100:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
elif val < 1000:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
# RESPONSIVE
if result['details']['responsive'] is not None:
if (result['details']['responsive']['min_width'] < 500 and
len(result['details']['responsive']['viewport_meta_tag']) > 0):
result['result']['RESPONSIVE']['value'] = True
result['result']['RESPONSIVE']['score'] = 1
# Overall score
for item in result['result'].keys():
result['score'] += result['result'][item]['score']
# clean up - remove full HTML
for item in result['details']['urlchecks']:
try:
del item['content']['html']
except:
pass
return result
@tenacity.retry(wait=tenacity.wait_exponential(),
retry=tenacity.retry_if_exception_type(Aborted))
def get_job_from_queue():
"""
Returns a URL from the queue
"""
out = None
with DATASTORE_CLIENT.transaction():
query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
for entity in query.fetch(limit=1):
logging.debug("Got job: %s", entity)
out = dict(entity)
out["url"] = entity.key.name
DATASTORE_CLIENT.delete(entity.key)
return out
def work_of_queue():
"""
Take jobs off the queue and process them until the queue is empty
"""
while True:
job = get_job_from_queue()
if job is None:
logging.info("No more jobs. Exiting.")
break
logging.info("Starting job %s", job["url"])
result = check_site(entry=job)
#logging.debug(result)
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])
key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
record = {
"created": datetime.utcnow(),
"results": result,
}
entity.update(record)
try:
DATASTORE_CLIENT.put(entity)
except InvalidArgument as ex:
logging.error("Could not write result: %s", ex)
except Exception as ex:
logging.error("Could not write result: %s", ex)
if __name__ == "__main__":
"""
Bringing it all together
"""
parser = argparse.ArgumentParser()
parser.add_argument('--credentials-path', dest='credentials_path',
help='Path to the service account credentials JSON file',
default='/secrets/service-account.json')
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
default='info')
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
subparsers.add_parser('spider', help='Take jobs off the queue and spider')
jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
jobs_parser.add_argument('--url', help='Add a job to spider a URL')
args = parser.parse_args()
loglevel = args.loglevel.lower()
if loglevel == 'error':
logging.basicConfig(level=logging.ERROR)
elif loglevel == 'warn':
logging.basicConfig(level=logging.WARN)
elif loglevel == 'debug':
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
loglevel = 'info'
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
logging.debug("Called command %s", args.command)
if args.command == 'jobs':
create_jobs(args.url)
else:
work_of_queue()

0
spider/__init__.py Normal file
View File

106
spider/spider.py Normal file
View File

@ -0,0 +1,106 @@
"""
Provides the spider functionality (website checks).
"""
import argparse
import json
import logging
import re
import statistics
import time
from datetime import datetime
from pprint import pprint
from google.api_core.exceptions import InvalidArgument
from google.cloud import datastore
import checks
import config
import jobs
import rating
def check_and_rate_site(entry):
"""
Performs our site checks for a single entry and returns the results as a dict.
1. Run all registered checks for the input URL (checks.perform_checks)
2. Calculate the rating from the check results (rating.calculate_rating)
3. Sum up the individual scores into the overall score
"""
# all the info we'll return for the site
result = {
# input_url: The URL we derived all checks from
'input_url': entry['url'],
# Meta: Regional and type metadata for the site
'meta': {
'type': entry.get('type'),
'level': entry.get('level'),
'state': entry.get('state'),
'district': entry.get('district'),
'city': entry.get('city'),
},
# checks: Results from our checks
'checks': {},
# The actual report scoring criteria
'rating': {},
# resulting score
'score': 0.0,
}
# Results from our next generation checkers
result['checks'] = checks.perform_checks(entry['url'])
result['rating'] = rating.calculate_rating(result['checks'])
# Overall score is the sum of the individual scores
for key in result['rating']:
result['score'] += result['rating'][key]['score']
# remove full HTML page content,
# as it's no longer needed
try:
for url in result['checks']['page_content']:
del result['checks']['page_content'][url]['content']
except:
pass
return result
def work_of_queue(datastore_client, entity_kind):
"""
Take jobs off the queue and process them until the queue is empty
"""
while True:
job = jobs.get_job_from_queue(datastore_client)
if job is None:
logging.info("No more jobs. Exiting.")
break
logging.info("Starting job %s", job["url"])
result = check_and_rate_site(entry=job)
logging.debug("Full JSON representation of returned result: %s", json.dumps(result))
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])
key = datastore_client.key(entity_kind, job["url"])
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
record = {
'created': datetime.utcnow(),
'meta': result['meta'],
'checks': result['checks'],
'rating': result['rating'],
'score': result['score'],
}
entity.update(record)
try:
datastore_client.put(entity)
except InvalidArgument as ex:
logging.error("Could not write result: %s", ex)
except Exception as ex:
logging.error("Could not write result: %s", ex)

26
spider/spider_test.py Normal file
View File

@ -0,0 +1,26 @@
import unittest
from spider.spider import check_and_rate_site
from pprint import pprint
class TestSpider(unittest.TestCase):
def test_url1(self):
entry = {
"url": "https://httpbin.org/html",
"type": "type",
"state": "state",
"level": "level",
"district": "district",
"city": "city",
}
url = "https://httpbin.org/html"
result = check_and_rate_site(entry)
self.assertEqual(result["input_url"], url)
if __name__ == '__main__':
unittest.main()

View File

@ -1,125 +0,0 @@
import unittest
import requests
import responses
import spider
class TestDeriveHostnames(unittest.TestCase):
def test_basic1(self):
hn = spider.derive_test_hostnames('www.my-domain.de')
expected = ['my-domain.de', 'www.my-domain.de']
self.assertEqual(hn, expected)
def test_basic2(self):
hn = spider.derive_test_hostnames('domain.de')
expected = ['domain.de', 'www.domain.de']
self.assertEqual(hn, expected)
class TestReduceURLs(unittest.TestCase):
def test_basic(self):
testdata = [
{'url': 'one', 'error': None, 'redirects_to': None},
{'url': 'two', 'error': 'Yes', 'redirects_to': None},
{'url': 'three', 'error': None, 'redirects_to': 'five'},
]
expected_result = ['five', 'one']
result = spider.reduce_urls(testdata)
self.assertEqual(result, expected_result)
class TestContentChecks(unittest.TestCase):
@responses.activate
def test_minimal(self):
url = 'http://my.url'
responses.add(responses.GET, url, status=200,
content_type='text/html',
body='<html></html>')
r = requests.get(url)
result = spider.check_content(r)
del result['html'] # don't want to have the messy HTML part in comparison
expected_result = {
'icon': None,
'title': None,
'generator': None,
'feeds': [],
'encoding': 'iso-8859-1',
'canonical_link': None,
'opengraph': None
}
self.assertDictEqual(result, expected_result)
@responses.activate
def test_basic(self):
url = 'http://my.url'
responses.add(responses.GET, url, status=200,
content_type='text/html; charset=UTF-8',
body='''
<!DOCTYPE html>
<html>
<head>
<title> The page's title </title>
<meta name="generator" content="some-cms/1.0">
<link rel="shortcut icon" href="http://foo.bar/image.png">
<link rel="alternate" type="application/rss+xml" href="http://example.com/feed">
<link rel="canonical" href="https://my.site.com/">
</head>
</html>
''')
r = requests.get(url)
result = spider.check_content(r)
del result['html'] # don't want to have the messy HTML part in comparison
expected_result = {
'icon': 'http://foo.bar/image.png',
'title': 'The page\'s title',
'generator': 'some-cms/1.0',
'feeds': [
'http://example.com/feed',
],
'encoding': 'utf-8',
'canonical_link': 'https://my.site.com/',
'opengraph': None
}
self.assertDictEqual(result, expected_result)
@responses.activate
def test_opengraph(self):
url = 'http://my.url'
responses.add(responses.GET, url, status=200,
content_type='text/html; charset=UTF-8',
body='''
<html>
<head>
<meta property="og:title" content="The Rock" />
<meta property="og:type" content="video.movie" />
<meta property="og:url" content="http://www.foor.bar" />
<meta property="og:image" content="http://www.foo.bar/foo.jpg" />
</head>
</html>
''')
r = requests.get(url)
result = spider.check_content(r)
del result['html'] # don't want to have the messy HTML part in comparison
expected_result = {
'icon': None,
'title': None,
'generator': None,
'feeds': [],
'encoding': 'utf-8',
'canonical_link': None,
'opengraph': ['og:image', 'og:title', 'og:type', 'og:url'],
}
self.assertDictEqual(result, expected_result)
if __name__ == '__main__':
unittest.main()