parent 7514aeb542
commit ae6a2e83e9

@@ -4,3 +4,4 @@ docs
secrets
temp
venv
/export-*

@@ -7,3 +7,4 @@ __pycache__
.vscode/settings.json
webapp/dist/bundle.js
dev-shm
/export-*
@@ -6,5 +7,12 @@ services:
notifications:
  email: false

language: python
python:
  - "3.6"

script:
  - pip install --upgrade pip
  - pip install --upgrade codecov
  - make test
  - codecov
Dockerfile
@@ -1,17 +1,20 @@
FROM python:3.6-alpine3.7
FROM python:3.6-alpine3.8

# Note: we pin selenium to 3.8.0 because of https://github.com/SeleniumHQ/selenium/issues/5296
RUN echo "http://dl-4.alpinelinux.org/alpine/v3.7/main" >> /etc/apk/repositories && \
    echo "http://dl-4.alpinelinux.org/alpine/v3.7/community" >> /etc/apk/repositories && \
    apk update && \
    apk --no-cache add chromium chromium-chromedriver python3-dev build-base git && \
    apk --no-cache add chromium chromium-chromedriver python3-dev build-base git py3-lxml libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev && \
    pip3 install --upgrade pip && \
    pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
    pip3 install selenium==3.8.0 GitPython PyYAML beautifulsoup4==4.6.0 html-similarity==0.3.2 httpretty==0.9.4 pyopenssl==18.0.0 requests==2.18.4 responses==0.9.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 && \
    apk del python3-dev build-base

ADD spider.py /
ADD spider_test.py /
ADD data_export.py /
ADD cli.py /
ADD config /config
ADD jobs /jobs
ADD checks /checks
ADD rating /rating
ADD spider /spider
ADD export /export

ENTRYPOINT ["python3"]
CMD ["/spider.py"]
ENTRYPOINT ["python3", "/cli.py"]
Makefile
@@ -1,18 +1,20 @@
IMAGE := quay.io/netzbegruenung/green-spider:latest

DB_ENTITY := spider-results

.PHONY: dockerimage

# Build docker image
dockerimage:
	docker build -t quay.io/netzbegruenung/green-spider:latest .
	docker build -t $(IMAGE) .

# Create spider job queue
spiderjobs: dockerimage
	docker run --rm -ti \
		-v $(PWD)/secrets:/secrets \
		quay.io/netzbegruenung/green-spider:latest spider.py \
		$(IMAGE) \
		--credentials-path /secrets/datastore-writer.json \
		--loglevel debug \
		--loglevel info \
		jobs

# Run spider in docker image

@@ -21,11 +23,26 @@ spider: dockerimage
		-v $(PWD)/dev-shm:/dev/shm \
		-v $(PWD)/webapp/dist/data:/out \
		-v $(PWD)/secrets:/secrets \
		quay.io/netzbegruenung/green-spider:latest spider.py \
		$(IMAGE) \
		--credentials-path /secrets/datastore-writer.json \
		--loglevel info \
		spider
		--loglevel debug \
		spider --kind $(DB_ENTITY)

export: dockerimage
	docker run --rm -ti \
		-v $(PWD)/export-json:/out \
		-v $(PWD)/secrets:/secrets \
		-v $(PWD)/export-siteicons:/icons \
		$(IMAGE) \
		--credentials-path /secrets/datastore-reader.json \
		--loglevel debug \
		export --kind $(DB_ENTITY)

# run spider tests
# FIXME
test: dockerimage
	docker run --rm -ti quay.io/netzbegruenung/green-spider:latest /spider_test.py
	docker run --rm -ti \
		--entrypoint "python3" \
		$(IMAGE) \
		-m unittest discover -p '*_test.py'
@@ -0,0 +1,64 @@
"""
The checks module contains the functionality to get information and test certain
functionality of a site or individual pages.
"""

import logging

from checks import charset
from checks import certificate
from checks import dns_resolution
from checks import duplicate_content
from checks import domain_variations
from checks import generator
from checks import html_head
from checks import http_and_https
from checks import page_content
from checks import load_in_browser
from checks import url_reachability
from checks import url_canonicalization

from checks.config import Config


def perform_checks(input_url):
    """
    Executes all our URL/site checks and returns a big-ass result dict.
    """

    # The sequence of checks to run. Order is important!
    # Checks which expand the URLs list must come first.
    # After that, dependencies (encoded in the checks) have to be fulfilled.
    check_modules = [
        ('domain_variations', domain_variations),
        ('http_and_https', http_and_https),
        ('dns_resolution', dns_resolution),
        ('url_reachability', url_reachability),
        ('certificate', certificate),
        ('url_canonicalization', url_canonicalization),
        ('page_content', page_content),
        ('duplicate_content', duplicate_content),
        ('charset', charset),
        ('html_head', html_head),
        ('generator', generator),
        ('load_in_browser', load_in_browser),
    ]

    results = {}

    config = Config(urls=[input_url],
                    user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
                               'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
                               'Safari/537.36 green-spider/0.2')

    for check_name, check in check_modules:
        checker = check.Checker(config=config,
                                previous_results=results)
        result = checker.run()
        results[check_name] = result

        # update config for the next check
        config = checker.config
        logging.debug("config after check %s: %r" % (check_name, config))

    return results
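
Note: for orientation, this is roughly how the new checks package is meant to be driven; the calling code and the example URL below are illustrative, not part of this commit:

    from checks import perform_checks

    results = perform_checks('https://www.gruene-example.de/')  # hypothetical input URL
    for check_name, check_result in results.items():
        print(check_name, type(check_result))
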
@@ -0,0 +1,23 @@
class AbstractChecker(object):
    """
    Our blueprint for checks
    """

    def __init__(self, config, previous_results=None):
        self._config = config

        # A dictionary of results from previous checkers.
        # Key is the name of the checker that has generated the result.
        self._previous_results = previous_results

    def run(self):
        """Executes the check routine, returns result dict"""
        raise NotImplementedError()

    @property
    def config(self):
        return self._config

    @property
    def previous_results(self):
        return self._previous_results
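
Note: a minimal sketch of what this contract expects from a concrete check (illustrative only, not part of the commit):

    from checks.abstract_checker import AbstractChecker

    class ExampleChecker(AbstractChecker):
        def run(self):
            # one result entry per URL currently held in the shared config
            return {url: {'checked': True} for url in self.config.urls}
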
@@ -0,0 +1,62 @@
"""
Gathers information on the TLS/SSL certificate used by a server
"""

from urllib.parse import urlparse
import logging
import ssl
from datetime import datetime
from datetime import timezone

from OpenSSL import crypto

from checks.abstract_checker import AbstractChecker

class Checker(AbstractChecker):
    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        results = {}

        for url in self.config.urls:
            if url.startswith('https://'):
                results[url] = self.get_certificate(url)

        return results

    def get_certificate(self, url):
        result = {
            'exception': None,
            'serial_number': None,
            'subject': None,
            'issuer': None,
            'not_before': None,
            'not_after': None
        }

        parsed = urlparse(url)
        try:
            cert = ssl.get_server_certificate((parsed.hostname, 443))
            x509 = crypto.load_certificate(crypto.FILETYPE_PEM, cert)
            result['serial_number'] = str(x509.get_serial_number())

            nb = x509.get_notBefore().decode('utf-8')
            na = x509.get_notAfter().decode('utf-8')

            # parse '2018 06 27 00 00 00Z'
            result['not_before'] = datetime(int(nb[0:4]), int(nb[4:6]), int(nb[6:8]), int(nb[8:10]), int(nb[10:12]), int(nb[12:14]), tzinfo=timezone.utc).isoformat()
            result['not_after'] = datetime(int(na[0:4]), int(na[4:6]), int(na[6:8]), int(na[8:10]), int(na[10:12]), int(na[12:14]), tzinfo=timezone.utc).isoformat()

            # decode and convert from bytes to unicode
            result['subject'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for tup in x509.get_subject().get_components()])
            result['issuer'] = dict([tuple(map(lambda x: x.decode('utf-8'), tup)) for tup in x509.get_issuer().get_components()])

        except Exception as e:
            result['exception'] = {
                'type': str(type(e)),
                'message': str(e),
            }
            logging.warning("Error when getting certificate for %s: %r" % (url, e))

        return result
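
Note: the slice-based date parsing above, applied to a sample pyOpenSSL timestamp (the value is illustrative):

    from datetime import datetime, timezone

    nb = '20180627000000Z'  # notBefore as returned by x509.get_notBefore(), decoded to str
    parsed = datetime(int(nb[0:4]), int(nb[4:6]), int(nb[6:8]),
                      int(nb[8:10]), int(nb[10:12]), int(nb[12:14]),
                      tzinfo=timezone.utc)
    print(parsed.isoformat())  # -> '2018-06-27T00:00:00+00:00'
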
@@ -0,0 +1,27 @@
from checks import certificate
from checks.config import Config
import unittest

class TestCertificateChecker(unittest.TestCase):

    def test_google(self):
        url = 'https://www.google.com/'
        config = Config(urls=[url])
        checker = certificate.Checker(config=config, previous_results={})
        result = checker.run()
        self.assertIn(url, result)
        self.assertIsNone(result[url]['exception'])
        self.assertEqual(result[url]['issuer']['O'], 'Google Trust Services')

    def test_kaarst(self):
        url = 'https://www.gruenekaarst.de/'
        config = Config(urls=[url])
        checker = certificate.Checker(config=config, previous_results={})
        result = checker.run()
        self.assertIn(url, result)
        self.assertIsNone(result[url]['exception'])
        self.assertEqual(result[url]['issuer']['O'], 'COMODO CA Limited')


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,77 @@
"""
Checks which character set a page has.

TODO: Check for http-equiv meta tags like
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1" />
"""

import logging

from bs4 import BeautifulSoup

from checks.abstract_checker import AbstractChecker

class Checker(AbstractChecker):
    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        assert 'page_content' in self.previous_results

        results = {}

        for url in self.config.urls:
            results[url] = self.get_charset(url)

        return results

    def get_charset(self, url):
        """
        Expects page_content_dict['content'] to carry the HTML content
        """
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content
        assert 'response_headers' in page_content
        logging.debug("%r", page_content['response_headers'])
        assert 'content-type' in page_content['response_headers']

        if page_content['content'] is None:
            return

        result = {
            'meta_charset_tag': None,
            'content_type_header_charset': None,
            'charset': 'iso-8859-1', # ISO-8859-1 is the default according to https://www.w3.org/International/articles/http-charset/index
            'valid': None,
            'exception': None,
        }

        soup = BeautifulSoup(page_content['content'], 'html.parser')

        # get response header charset
        if ('content-type' in page_content['response_headers']
                and 'charset=' in page_content['response_headers']['content-type']):
            parts = page_content['response_headers']['content-type'].split("charset=", 1)
            result['content_type_header_charset'] = parts[1].lower()
            result['charset'] = parts[1].lower()

        # get meta tag charset
        metatags = soup.find_all('meta')
        for tag in metatags:
            if 'charset' in tag.attrs:
                result['meta_charset_tag'] = tag['charset'].lower()
                # meta tag overrules any previous value
                result['charset'] = tag['charset'].lower()

        # check for charset plausibility (only for most common ones)
        if result['charset'] in ('iso-8859-1', 'utf-8'):
            try:
                _ = page_content['content'].encode(result['charset'])
            except UnicodeEncodeError as e:
                result['valid'] = False
                result['exception'] = str(e)
            else:
                result['valid'] = True


        return result
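
Note: the response-header handling above boils down to a split on 'charset=' plus lowercasing, e.g. (header value illustrative):

    header = 'text/html; charset=ISO-8859-1'
    print(header.split('charset=', 1)[1].lower())  # -> 'iso-8859-1'
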
@@ -0,0 +1,49 @@
import httpretty
from httpretty import httprettified
import unittest

from checks import charset
from checks import page_content
from checks.config import Config

@httprettified
class TestCharsetChecker(unittest.TestCase):

    def test_http_response(self):
        url = 'http://www.example.com/'
        httpretty.register_uri(httpretty.GET, url,
            body="""<html>
                <head>
                    <meta http-equiv="Content-type" value="text/html; charset=foo">
                    <meta charset="utf-8">
                    <title>Hello</title>
                </head>
            </html>""",
            adding_headers={
                "Content-Type": "text/html; charset=ISO-8859-1",
            })

        results = {}

        config = Config(urls=[url])
        page_content_checker = page_content.Checker(config=config, previous_results={})
        results['page_content'] = page_content_checker.run()

        self.assertIn(url, results['page_content'])
        self.assertIn('response_headers', results['page_content'][url])
        self.assertIn('content-type', results['page_content'][url]['response_headers'])

        charset_checker = charset.Checker(config=page_content_checker.config, previous_results=results)
        result = charset_checker.run()

        self.assertIn(url, result)
        self.assertEqual(result[url], {
            'meta_charset_tag': 'utf-8',
            'content_type_header_charset': 'iso-8859-1',
            'charset': 'utf-8',
            'valid': True,
            'exception': None,
        })

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,29 @@
class Config(object):
    """
    Our configuration to be passed to checks
    """

    def __init__(self, urls, user_agent='green-spider/1.0'):
        self._urls = set(urls)
        self._user_agent = user_agent

    def __repr__(self):
        return "Config(urls=%r)" % self._urls

    @property
    def urls(self):
        return list(self._urls)

    def add_url(self, url):
        self._urls.add(url)

    def remove_url(self, url):
        """Removes url from urls, if it was in there. Ignores errors."""
        try:
            self._urls.remove(url)
        except KeyError:
            pass

    @property
    def user_agent(self):
        return self._user_agent
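
Note: a short sketch of how the shared Config object behaves; the URLs are placeholders:

    from checks.config import Config

    config = Config(urls=['https://example.com/'])
    config.add_url('http://example.com/')      # checks like http_and_https expand the URL set
    config.remove_url('https://example.com/')  # checks like dns_resolution prune it
    print(config.urls)        # -> ['http://example.com/']
    print(config.user_agent)  # -> 'green-spider/1.0' (the default defined above)
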
@@ -0,0 +1,55 @@
"""
This check attempts to resolve all hostnames/domains in the input URLs.

URLs which are not resolvable are removed from the config.
"""

import logging
from socket import gethostbyname_ex
from urllib.parse import urlparse
from urllib.parse import urlunparse

from checks.abstract_checker import AbstractChecker


class Checker(AbstractChecker):
    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        """Executes the check routine, returns result dict"""

        results = {}

        urls = list(self.config.urls)
        for url in urls:
            parsed = urlparse(url)

            results[url] = self.resolve_hostname(parsed.hostname)

            # remove URL if non-resolvable
            if not results[url]['resolvable']:
                self.config.remove_url(url)

        return results

    def resolve_hostname(self, hostname):
        """
        Resolve one to IPv4 address(es)
        """
        result = {
            'hostname': hostname,
            'resolvable': False,
            'aliases': [],
            'ipv4_addresses': [],
        }

        try:
            hostname, aliases, ipv4_addresses = gethostbyname_ex(hostname)
            result['resolvable'] = True
            result['aliases'] = aliases
            result['ipv4_addresses'] = ipv4_addresses
        except Exception as e:
            logging.debug("Hostname %s not resolvable. Exception: %r" % (hostname, e))

        return result
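
Note: socket.gethostbyname_ex returns a (hostname, aliases, ipv4_addresses) tuple, which is exactly what resolve_hostname() unpacks above; the printed output is illustrative:

    from socket import gethostbyname_ex

    print(gethostbyname_ex('www.example.com'))
    # e.g. ('www.example.com', [], ['93.184.216.34'])
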
@@ -0,0 +1,44 @@
"""
This adds commonly tried variations of domains/subdomains to the URLs config.
"""

import logging

from urllib.parse import urlparse
from urllib.parse import urlunparse

from checks.abstract_checker import AbstractChecker


class Checker(AbstractChecker):
    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        urls = list(self.config.urls)
        for url in urls:
            parsed = urlparse(url)
            hostnames = self.expand_hostname(parsed.hostname)

            for hostname in hostnames:
                self.config.add_url(urlunparse((parsed.scheme, hostname,
                    parsed.path, parsed.params, parsed.query, parsed.fragment)))

        return None


    def expand_hostname(self, hostname):
        """
        Create variations of subdomains
        """
        hostnames = set()

        hostnames.add(hostname)
        if hostname.startswith('www.'):
            # remove 'www.' prefix
            hostnames.add(hostname[4:])
        else:
            # add 'www.' prefix
            hostnames.add('www.' + hostname)

        return sorted(list(hostnames))
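
Note: expand_hostname() simply toggles the 'www.' prefix; with a hypothetical domain:

    from checks import domain_variations
    from checks.config import Config

    checker = domain_variations.Checker(config=Config(urls=[]))
    print(checker.expand_hostname('gruene-example.de'))
    # -> ['gruene-example.de', 'www.gruene-example.de']
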
@@ -0,0 +1,107 @@
"""
This checker looks at the similarity between previously downloaded pages
and removes duplicates from the config URLs
"""

import logging

import html_similarity

from checks.abstract_checker import AbstractChecker


class Checker(AbstractChecker):

    # value above which we consider a page pair a duplicate
    similarity_threshold = 0.99999

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)


    def run(self):

        if len(self.config.urls) == 1:
            # nothing to do for us
            return

        urls = list(self.config.urls)

        # get content
        content = {}

        assert 'page_content' in self.previous_results

        for url in urls:
            page_content = self.previous_results['page_content'][url]

            if page_content['content'] is None:
                logging.warn("Content for URL %s is None" % url)

            content[url] = page_content['content']

        pairs = self.compare_pairwise(content)

        # remove duplicates
        for key in pairs:
            if pairs[key]['similarity'] is None:
                continue
            if pairs[key]['similarity'] > self.similarity_threshold:
                # this pair is a duplicate.
                # Decide which one to keep
                url1, url2 = key.split(" ", 1)
                reject = self.select_url_to_reject(url1, url2)
                self.config.remove_url(reject)

        return pairs


    def compare_pairwise(self, content):
        # compair pairwise
        pairs = {}

        for url1 in content:
            for url2 in content:

                if url1 == url2:
                    continue

                # avoid checking pairs twice
                pair_key = " ".join(sorted([url1, url2]))
                if pair_key in pairs:
                    continue

                try:
                    s = html_similarity.similarity(content[url1], content[url2])
                    logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s)
                    pairs[pair_key] = {
                        'similarity': s,
                        'exception': None,
                    }
                except (AttributeError, ValueError) as e:
                    logging.error("html_similarity.similarity thre exception for URL pair %s and %s: %s", url1, url2, e)
                    pairs[pair_key] = {
                        'similarity': None,
                        'exception': str(e),
                    }

        return pairs


    def select_url_to_reject(self, url1, url2):
        """Determine which of two URLs to keep, which to reject"""

        # HTTPS takes precedence
        if url1.startswith('https://') and not url2.startswith('https://'):
            return url2
        elif url2.startswith('https://') and not url1.startswith('https://'):
            return url1

        # Shorter URL wins
        if len(url1) < len(url2):
            return url2
        elif len(url1) > len(url2):
            return url1

        # default behaviour
        return url1
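
Note: html_similarity.similarity() returns a float between 0 and 1; the checker above treats anything over similarity_threshold (0.99999) as a duplicate pair. Minimal illustration with placeholder documents:

    import html_similarity

    a = '<html><body><p class="intro">Hello</p></body></html>'
    b = '<html><body><p class="intro">Servus</p></body></html>'
    print(html_similarity.similarity(a, b))  # a float in [0, 1]; high here, since only the text differs
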
@@ -0,0 +1,76 @@
"""
Checks the 'generator' meta tag and page content properties
to detect well-known content management systems, themes etc.
"""

import logging

from checks.abstract_checker import AbstractChecker

class Checker(AbstractChecker):

    # IP address of the newthinking GCMS server
    gcms_ip = "91.102.13.20"

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        assert 'page_content' in self.previous_results
        assert 'html_head' in self.previous_results

        results = {}

        for url in self.config.urls:
            results[url] = self.get_generator(url)

        return results


    def get_generator(self, url):
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content

        assert 'dns_resolution' in self.previous_results
        dns_resolution = self.previous_results['dns_resolution']

        head = self.previous_results['html_head'][url]

        generator = None

        if 'generator' in head and head['generator'] is not None:
            generator = head['generator'].lower()
            if 'typo3' in generator:
                generator = 'typo3'
            if 'wordpress' in generator:
                generator = 'wordpress'
            if 'drupal' in generator:
                generator = 'drupal'
            if 'joomla' in generator:
                generator = 'joomla'

        # Qualify certain CMS flavours in more detail
        if generator == "typo3":
            # Typo3-Gruene advertises in the page content
            if 'typo3-gruene.de' in page_content['content']:
                generator = "typo3-gruene"
            # newthinking GCMS in some page hrefs
            elif 'ntc_gcms' in page_content['content']:
                generator = "typo3-gcms"
            # check if one of the IPs matches the well-known GCMS Server IP
            elif url in dns_resolution:
                for addr in dns_resolution[url]['ipv4_addresses']:
                    if addr == self.gcms_ip:
                        generator = "typo3-gcms"

        elif 'Urwahl3000' in page_content['content']:
            generator = "wordpress-urwahl"

        elif ('josephknowsbest' in page_content['content'] or
              'Joseph-knows-best' in page_content['content']):
            generator = "wordpress-josephknowsbest"

        elif 'wordpress' in page_content['content']:
            generator = "wordpress"

        return generator
@ -0,0 +1,152 @@
|
|||
"""
|
||||
Extracts information from the html <head>, like existence and value
|
||||
of certain meta tags, link tags, title, etc.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
class Checker(AbstractChecker):
|
||||
def __init__(self, config, previous_results=None):
|
||||
super().__init__(config, previous_results)
|
||||
|
||||
def run(self):
|
||||
results = {}
|
||||
|
||||
for url in self.config.urls:
|
||||
results[url] = self.get_content(url)
|
||||
|
||||
return results
|
||||
|
||||
def get_content(self, url):
|
||||
"""
|
||||
Expects page_content_dict['content'] to carry the HTML content
|
||||
"""
|
||||
|
||||
page_content = self.previous_results['page_content'][url]
|
||||
assert 'content' in page_content
|
||||
assert 'response_headers' in page_content
|
||||
assert 'content-type' in page_content['response_headers']
|
||||
|
||||
if page_content['content'] is None:
|
||||
return
|
||||
|
||||
soup = BeautifulSoup(page_content['content'], 'html.parser')
|
||||
head = soup.find('head')
|
||||
|
||||
result = {
|
||||
'title': self.get_title(head),
|
||||
'link_canonical': self.get_link_canonical(head, url),
|
||||
'link_rss_atom': self.get_link_rss_atom(head, url),
|
||||
'link_icon': self.get_link_icon(head, url),
|
||||
'generator': self.get_generator(head),
|
||||
'opengraph': self.get_opengraph(head),
|
||||
'viewport': self.get_viewport(head),
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_title(self, head):
|
||||
"""Extract and clean up page title"""
|
||||
if head is None:
|
||||
return
|
||||
|
||||
title = None
|
||||
|
||||
tag = head.find('title')
|
||||
if tag is None:
|
||||
return
|
||||
|
||||
title = tag.get_text()
|
||||
|
||||
# clean up
|
||||
title = title.replace(u'\u00a0', ' ')
|
||||
title = title.replace(' ', ' ')
|
||||
title = title.strip()
|
||||
|
||||
return title
|
||||
|
||||
|
||||
def get_link_canonical(self, head, url):
|
||||
if head is None:
|
||||
return
|
||||
link = head.find('link', rel='canonical')
|
||||
if link:
|
||||
return urljoin(url, link.get('href'))
|
||||
|
||||
|
||||
def get_link_rss_atom(self, head, url):
|
||||
if head is None:
|
||||
return
|
||||
hrefs = []
|
||||
rss_links = head.find_all('link', type='application/rss+xml')
|
||||
atom_links = head.find_all('link', type='application/atom+xml')
|
||||
|
||||
if rss_links:
|
||||
for link in rss_links:
|
||||
hrefs.append(link.get('href'))
|
||||
if atom_links:
|
||||
for link in rss_links:
|
||||
hrefs.append(link.get('href'))
|
||||
|
||||
# make URLs absolute
|
||||
for i in range(len(hrefs)):
|
||||
parsed = urlparse(hrefs[i])
|
||||
if parsed.scheme == '':
|
||||
hrefs[i] = urljoin(url, hrefs[i])
|
||||
|
||||
return hrefs
|
||||
|
||||
|
||||
def get_link_icon(self, head, url):
|
||||
if head is None:
|
||||
return
|
||||
|
||||
tag = head.find('link', rel=lambda x: x and x.lower() == 'icon')
|
||||
if tag:
|
||||
return urljoin(url, tag.get('href'))
|
||||
tag = head.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
|
||||
if tag:
|
||||
return urljoin(url, tag.get('href'))
|
||||
|
||||
|
||||
def get_generator(self, head):
|
||||
if head is None:
|
||||
return
|
||||
|
||||
tags = head.select('[name=generator]')
|
||||
if tags:
|
||||
return tags[0].get('content')
|
||||
|
||||
|
||||
def get_opengraph(self, head):
|
||||
if head is None:
|
||||
return
|
||||
|
||||
# we find tags by matching this property/itemprop value regex
|
||||
property_re = re.compile('^og:')
|
||||
|
||||
opengraph = set()
|
||||
for tag in head.find_all(property=property_re):
|
||||
opengraph.add(tag.get('property'))
|
||||
for tag in head.find_all(itemprop=property_re):
|
||||
opengraph.add(tag.get('itemprop'))
|
||||
|
||||
opengraph = sorted(list(opengraph))
|
||||
if opengraph != []:
|
||||
return opengraph
|
||||
|
||||
|
||||
def get_viewport(self, head):
|
||||
if head is None:
|
||||
return
|
||||
tags = head.select('[name=viewport]')
|
||||
if tags:
|
||||
return tags[0].get('content')
|
|
@ -0,0 +1,27 @@
|
|||
"""
|
||||
This adds, for every HTTP URL, the HTTPS counterpart,
|
||||
and vice versa, to config.urls
|
||||
|
||||
So it doesn't actually perform tests. It only expands the
|
||||
URLs to test by other checks.
|
||||
"""
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
class Checker(AbstractChecker):
|
||||
def __init__(self, config, previous_results=None):
|
||||
super().__init__(config, previous_results)
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Adds URLs to config.urls, returns nothing
|
||||
"""
|
||||
|
||||
for url in self.config.urls:
|
||||
|
||||
if url.startswith('https://'):
|
||||
self.config.add_url('http://' + url[8:])
|
||||
elif url.startswith('http://'):
|
||||
self.config.add_url('https://' + url[7:])
|
||||
|
||||
return None
|
|
@ -0,0 +1,134 @@
|
|||
"""
|
||||
Collects information by loading pages in a browser.
|
||||
|
||||
Information includes:
|
||||
|
||||
- whether the document width adapts well to viewports as little as 360 pixels wide
|
||||
- whether javascript errors or errors from missing resources occur
|
||||
- collects CSS font-family properties in use
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import StaleElementReferenceException
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
import tenacity
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
|
||||
class Checker(AbstractChecker):
|
||||
|
||||
page_load_timeout = 20
|
||||
|
||||
# sizes we check for (width, height)
|
||||
sizes = (
|
||||
(360, 640), # rather old smartphone
|
||||
(768, 1024), # older tablet or newer smartphone
|
||||
(1024, 768), # older desktop or horiz. tablet
|
||||
(1920, 1080), # Full HD horizontal
|
||||
)
|
||||
|
||||
def __init__(self, config, previous_results=None):
|
||||
super().__init__(config, previous_results)
|
||||
|
||||
# Our selenium user agent using Chrome headless as an engine
|
||||
chrome_options = webdriver.ChromeOptions()
|
||||
chrome_options.add_argument('--headless')
|
||||
chrome_options.add_argument('--disable-gpu')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-extensions')
|
||||
self.driver = webdriver.Chrome(options=chrome_options)
|
||||
self.driver.set_page_load_timeout(self.page_load_timeout)
|
||||
|
||||
def run(self):
|
||||
|
||||
results = {}
|
||||
for url in self.config.urls:
|
||||
|
||||
results[url] = {
|
||||
'sizes': None,
|
||||
'min_document_width': None,
|
||||
'logs': None,
|
||||
'font_families': None,
|
||||
}
|
||||
|
||||
# responsive check
|
||||
try:
|
||||
sizes = self.check_responsiveness(url)
|
||||
results[url] = {
|
||||
'sizes': sizes,
|
||||
'min_document_width': min([s['document_width'] for s in sizes]),
|
||||
'logs': self.capture_log(),
|
||||
}
|
||||
except TimeoutException as e:
|
||||
logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
|
||||
pass
|
||||
except tenacity.RetryError as re:
|
||||
logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
|
||||
pass
|
||||
|
||||
# CSS collection
|
||||
font_families = None
|
||||
|
||||
try:
|
||||
elements = self.driver.find_elements_by_xpath("//*")
|
||||
font_families = set()
|
||||
for element in elements:
|
||||
try:
|
||||
font_family = element.value_of_css_property('font-family')
|
||||
if font_family is None:
|
||||
continue
|
||||
font_families.add(font_family.lower())
|
||||
except StaleElementReferenceException as e:
|
||||
logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
|
||||
continue
|
||||
|
||||
results[url]['font_families'] = sorted(list(font_families))
|
||||
|
||||
except TimeoutException as e:
|
||||
logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
|
||||
pass
|
||||
|
||||
self.driver.quit()
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@tenacity.retry(stop=tenacity.stop_after_attempt(3),
|
||||
retry=tenacity.retry_if_exception_type(TimeoutException))
|
||||
def check_responsiveness(self, url):
|
||||
result = []
|
||||
|
||||
# set window to the first size initially
|
||||
self.driver.set_window_size(self.sizes[0][0], self.sizes[0][1])
|
||||
self.driver.get(url)
|
||||
|
||||
# give the page some time to load
|
||||
time.sleep(10)
|
||||
|
||||
for (width, height) in self.sizes:
|
||||
self.driver.set_window_size(width, height)
|
||||
|
||||
# wait for re-render/re-flow
|
||||
time.sleep(1.0)
|
||||
doc_width = self.driver.execute_script("return document.body.scrollWidth")
|
||||
|
||||
result.append({
|
||||
'viewport_width': width,
|
||||
'document_width': int(doc_width),
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
def capture_log(self):
|
||||
"""
|
||||
Returns log elements with level "SEVERE"
|
||||
"""
|
||||
entries = []
|
||||
for entry in self.driver.get_log('browser'):
|
||||
if entry['level'] in ('WARNING', 'SEVERE'):
|
||||
entries.append(entry)
|
||||
return entries
|
|
@ -0,0 +1,94 @@
|
|||
"""
|
||||
This check downloads the HTML page for each URL
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import requests
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
|
||||
class Checker(AbstractChecker):
|
||||
|
||||
# connection timeout (seconds)
|
||||
CONNECT_TIMEOUT = 10
|
||||
|
||||
# response timeout (seconds)
|
||||
READ_TIMEOUT = 20
|
||||
|
||||
def __init__(self, config, previous_results=None):
|
||||
super().__init__(config, previous_results)
|
||||
|
||||
|
||||
def run(self):
|
||||
results = {}
|
||||
|
||||
self.headers = {
|
||||
"User-Agent": self.config.user_agent,
|
||||
}
|
||||
|
||||
# copy URLs, as we may be manipulating self.config.urls in the loop
|
||||
url = list(self.config.urls)
|
||||
|
||||
for url in self.config.urls:
|
||||
result = self.download_page(url)
|
||||
results[url] = result
|
||||
|
||||
# remove bad URLs from config, to avoid later checks using them
|
||||
if 'exception' in result and result['exception'] is not None:
|
||||
self.config.remove_url(url)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def download_page(self, url):
|
||||
result = {
|
||||
'url': url,
|
||||
'content': None,
|
||||
'content_type': None,
|
||||
'content_length': None,
|
||||
'status_code': None,
|
||||
'response_headers': None,
|
||||
'duration': None,
|
||||
'exception': None,
|
||||
}
|
||||
|
||||
try:
|
||||
r = requests.get(url,
|
||||
headers=self.headers,
|
||||
timeout=(self.CONNECT_TIMEOUT, self.READ_TIMEOUT))
|
||||
|
||||
result['url'] = r.url
|
||||
result['status_code'] = r.status_code
|
||||
result['content'] = r.text
|
||||
result['content_length'] = len(r.text)
|
||||
result['response_headers'] = self.get_headers(r.headers)
|
||||
result['duration'] = round(r.elapsed.total_seconds() * 1000)
|
||||
|
||||
if r.headers.get("content-type") is not None:
|
||||
result['content_type'] = r.headers.get("content-type").split(";")[0].strip()
|
||||
|
||||
except requests.exceptions.ConnectionError as exc:
|
||||
logging.error(str(exc) + " " + url)
|
||||
result['exception'] = "connection"
|
||||
except requests.exceptions.ReadTimeout as exc:
|
||||
logging.error(str(exc) + " " + url)
|
||||
result['exception'] = "read_timeout"
|
||||
except requests.exceptions.Timeout as exc:
|
||||
logging.error(str(exc) + " " + url)
|
||||
result['exception'] = "connection_timeout"
|
||||
except Exception as exc:
|
||||
logging.error(str(exc) + " " + url)
|
||||
result['exception'] = "%s %s" % (str(type(exc)), exc)
|
||||
|
||||
return result
|
||||
|
||||
def get_headers(self, headers):
|
||||
"""
|
||||
Transforms CaseInsensitiveDict into dict with lowercase keys
|
||||
"""
|
||||
out = {}
|
||||
for key in headers:
|
||||
out[key.lower()] = headers[key]
|
||||
return out
|
|
@ -0,0 +1,13 @@
|
|||
"""
|
||||
This check verifies whether there is a single URL
|
||||
or several variants left at this point.
|
||||
"""
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
class Checker(AbstractChecker):
|
||||
def __init__(self, config, previous_results=None):
|
||||
super().__init__(config, previous_results)
|
||||
|
||||
def run(self):
|
||||
return self.config.urls
|
|
@ -0,0 +1,104 @@
|
|||
"""
|
||||
This check verifies whether the urls in config are reachable.
|
||||
Some additional information regarding redirects and SSL problems
|
||||
are also recorded and returned as results.
|
||||
|
||||
Non-accessible URLs are removed from config.urls.
|
||||
|
||||
A redirect to facebook.com is not considered reachable, as that
|
||||
leads to a different website in the sense of this system.
|
||||
|
||||
TODO: Parallelize the work done in this test
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from urllib.parse import urlparse
|
||||
import requests
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
|
||||
class Checker(AbstractChecker):
|
||||
def __init__(self, config, previous_results=None):
|
||||
super().__init__(config, previous_results)
|
||||
|
||||
def run(self):
|
||||
headers = {
|
||||
"User-Agent": self.config.user_agent
|
||||
}
|
||||
|
||||
results = {}
|
||||
urls = list(self.config.urls)
|
||||
|
||||
for url in urls:
|
||||
logging.debug("Checking URL reachability for %s", url)
|
||||
|
||||
result = {
|
||||
"url": url,
|
||||
"redirect_history": [],
|
||||
"status": None,
|
||||
"exception": None,
|
||||
"duration": None,
|
||||
}
|
||||
|
||||
# Perform HEAD requests, recording redirect log
|
||||
try:
|
||||
r = requests.head(url, headers=headers, allow_redirects=True)
|
||||
result['status'] = r.status_code
|
||||
result['duration'] = round(r.elapsed.total_seconds() * 1000)
|
||||
|
||||
if len(r.history):
|
||||
result['redirect_history'] = self.expand_history(r.history)
|
||||
logging.debug("Redirects: %r", result['redirect_history'])
|
||||
|
||||
if r.url == url:
|
||||
logging.debug("URL: %s - status %s", url, r.status_code)
|
||||
else:
|
||||
logging.debug("URL: %s - status %s - redirects to %s", url,
|
||||
r.status_code, r.url)
|
||||
# remove source URL, add target URL to config.urls
|
||||
self.config.remove_url(url)
|
||||
self.config.add_url(r.url)
|
||||
|
||||
# remove 404 etc
|
||||
if r.status_code > 400:
|
||||
self.config.remove_url(url)
|
||||
|
||||
except Exception as exc:
|
||||
logging.info("Exception for URL %s: %s %s", url, str(type(exc)), exc)
|
||||
result['exception'] = {
|
||||
'type': str(type(exc)),
|
||||
'message': str(exc),
|
||||
}
|
||||
|
||||
# remove URL to prevent further checks on unreachable URL
|
||||
self.config.remove_url(url)
|
||||
|
||||
# if redirects end in www.facebook.com or www.denic.de, remove this URL again
|
||||
# remove if redirect target is facebook
|
||||
if result['exception'] is not None and result['redirect_history'] is not None and len(result['redirect_history']) > 0:
|
||||
parsed = urlparse(result['redirect_history'][-1]['redirect_to'])
|
||||
if parsed.hostname in ('www.facebook.com', 'www.denic.de', 'sedo.com'):
|
||||
result[url]['exception'] = {
|
||||
'type': 'Bad target domain',
|
||||
'message': 'The URL redirects to %s, which is unsupported by green-spider as it doesn\'t qualify as an owned website' % parsed.hostname,
|
||||
}
|
||||
self.config.remove_url(url)
|
||||
|
||||
results[url] = result
|
||||
|
||||
return results
|
||||
|
||||
def expand_history(self, history):
|
||||
"""Extracts primitives from a list of requests.Response objects"""
|
||||
items = []
|
||||
for h in history:
|
||||
item = {
|
||||
'status': h.status_code,
|
||||
'duration': round(h.elapsed.total_seconds() * 1000),
|
||||
'redirect_to': h.headers['location'],
|
||||
}
|
||||
items.append(item)
|
||||
|
||||
return items
|
|
@ -0,0 +1,71 @@
|
|||
import httpretty
|
||||
from httpretty import httprettified
|
||||
import unittest
|
||||
|
||||
from checks import url_reachability
|
||||
from checks.config import Config
|
||||
|
||||
@httprettified
|
||||
class TestCharsetChecker(unittest.TestCase):
|
||||
|
||||
def test_success(self):
|
||||
url = 'http://www.example.com/'
|
||||
httpretty.register_uri(httpretty.HEAD, url,
|
||||
status=200, body="<html></html>")
|
||||
|
||||
config = Config(urls=[url])
|
||||
checker = url_reachability.Checker(config=config, previous_results={})
|
||||
result = checker.run()
|
||||
|
||||
self.assertEqual(result[url]['url'], url)
|
||||
self.assertEqual(result[url]['redirect_history'], [])
|
||||
self.assertEqual(result[url]['status'], 200)
|
||||
self.assertIsNone(result[url]['exception'])
|
||||
self.assertTrue(0 < result[url]['duration'] < 100)
|
||||
|
||||
|
||||
def test_redirect(self):
|
||||
url = 'http://www.example.com/'
|
||||
url2 = 'http://www2.example.com/'
|
||||
httpretty.register_uri(httpretty.HEAD, url,
|
||||
status=302, body="",
|
||||
adding_headers={"Location": url2})
|
||||
httpretty.register_uri(httpretty.HEAD, url2,
|
||||
status=200, body="<html></html>")
|
||||
|
||||
config = Config(urls=[url])
|
||||
checker = url_reachability.Checker(config=config, previous_results={})
|
||||
result = checker.run()
|
||||
|
||||
self.assertIn(url, result)
|
||||
self.assertEqual(result[url]['url'], url)
|
||||
self.assertEqual(result[url]['status'], 200)
|
||||
self.assertIsNone(result[url]['exception'])
|
||||
self.assertTrue(0 < result[url]['duration'] < 100)
|
||||
self.assertEqual(len(result[url]['redirect_history']), 1)
|
||||
self.assertEqual(result[url]['redirect_history'][0]['status'], 302)
|
||||
self.assertEqual(result[url]['redirect_history'][0]['redirect_to'], url2)
|
||||
|
||||
|
||||
def test_notfound(self):
|
||||
url = 'http://www.example.com/'
|
||||
httpretty.register_uri(httpretty.HEAD, url,
|
||||
status=404, body="<html><body>Not found</body></html>")
|
||||
|
||||
config = Config(urls=[url])
|
||||
checker = url_reachability.Checker(config=config, previous_results={})
|
||||
result = checker.run()
|
||||
|
||||
self.assertEqual(result[url]['url'], url)
|
||||
self.assertEqual(result[url]['redirect_history'], [])
|
||||
self.assertEqual(result[url]['status'], 404)
|
||||
self.assertIsNone(result[url]['exception'])
|
||||
|
||||
newconfig = checker.config
|
||||
|
||||
self.assertEqual(len(newconfig.urls), 0)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
|
@ -0,0 +1,83 @@
|
|||
"""
|
||||
Command line utility for spider, export etc.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
|
||||
from google.cloud import datastore
|
||||
|
||||
def handle_sigint(signum, frame):
|
||||
"""
|
||||
Handles SIGINT, which occurs on Ctrl-C
|
||||
"""
|
||||
print("\nInterrupted by SIGINT\n")
|
||||
sys.exit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
signal.signal(signal.SIGINT,handle_sigint)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# global flags
|
||||
parser.add_argument('--credentials-path', dest='credentials_path',
|
||||
help='Path to the service account credentials JSON file',
|
||||
default='/secrets/service-account.json')
|
||||
|
||||
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
|
||||
default='info')
|
||||
|
||||
# subcommands
|
||||
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
|
||||
|
||||
# spider subcommand
|
||||
spider_parser = subparsers.add_parser('spider', help='Take jobs off the queue and spider')
|
||||
spider_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to write (default: spider-results)')
|
||||
|
||||
# jobs subcommand
|
||||
jobs_parser = subparsers.add_parser('jobs', help='Adds spider jobs to the queue. By default, all green-directory URLs are added.')
|
||||
jobs_parser.add_argument('--url', help='Add a job to spider a specific URL')
|
||||
|
||||
# export subcommand
|
||||
export_parser = subparsers.add_parser('export', help='Export JSON data')
|
||||
export_parser.add_argument('--kind', default='spider-results', help='Datastore entity kind to export (default: spider-results)')
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# set log level
|
||||
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
|
||||
|
||||
loglevel = args.loglevel.lower()
|
||||
if loglevel == 'error':
|
||||
logging.basicConfig(level=logging.ERROR)
|
||||
elif loglevel == 'warn':
|
||||
logging.basicConfig(level=logging.WARN)
|
||||
elif loglevel == 'debug':
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logging.getLogger("selenium").setLevel(logging.INFO)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
loglevel = 'info'
|
||||
|
||||
logging.debug("Called command %s", args.command)
|
||||
|
||||
datastore_client = datastore.Client.from_service_account_json(args.credentials_path)
|
||||
|
||||
if args.command == 'jobs':
|
||||
|
||||
import jobs
|
||||
jobs.create_jobs(datastore_client, args.url)
|
||||
|
||||
elif args.command == 'export':
|
||||
|
||||
import export
|
||||
export.export_screenshots(datastore_client)
|
||||
export.export_results(datastore_client, args.kind)
|
||||
|
||||
else:
|
||||
from spider import spider
|
||||
spider.work_of_queue(datastore_client, args.kind)
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
|
||||
# connection timeout for website checks (seconds)
|
||||
CONNECT_TIMEOUT = 5
|
||||
|
||||
# response timeout for website checks
|
||||
READ_TIMEOUT = 10
|
||||
|
||||
# Git repo for our data
|
||||
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
|
||||
|
||||
# folder in that repo that holds the data
|
||||
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
|
||||
|
||||
# folder we use locally to clone the repo
|
||||
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
|
||||
|
||||
# IP address of the newthinking GCMS server
|
||||
GCMS_IP = "91.102.13.20"
|
||||
|
||||
# kind name of the spider job key datastore entities
|
||||
JOB_DATASTORE_KIND = 'spider-jobs'
|
||||
|
|
@ -19,6 +19,8 @@
|
|||
# secrets/datastore-writer.json
|
||||
|
||||
|
||||
DOCKERIMAGE="quay.io/netzbegruenung/green-spider:dev"
|
||||
|
||||
API_TOKEN_SECRET="secrets/hetzner-api-token.sh"
|
||||
test -f $API_TOKEN_SECRET || { echo >&2 "File $API_TOKEN_SECRET does not exist."; exit 1; }
|
||||
source $API_TOKEN_SECRET
|
||||
|
@ -29,10 +31,14 @@ if [[ "$1" == "" ]]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
SERVERNAME="$1-$(date | md5 | cut -c1-3)"
|
||||
|
||||
# possible values: cx11 (1 core 2 GB), cx21 (2 cores, 4 GB), cx31 (2 cores, 8 GB)
|
||||
SERVERTYPE="cx21"
|
||||
|
||||
function create_server()
|
||||
{
|
||||
echo "Creating server $1"
|
||||
echo "Creating server $SERVERNAME"
|
||||
|
||||
# server_type 'cx11' is the smallest, cheapest category.
|
||||
# location 'nbg1' is Nürnberg/Nuremberg, Germany.
|
||||
|
@ -44,8 +50,8 @@ function create_server()
|
|||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $API_TOKEN" \
|
||||
-d "{
|
||||
\"name\": \"$1\",
|
||||
\"server_type\": \"cx11\",
|
||||
\"name\": \"$SERVERNAME\",
|
||||
\"server_type\": \"$SERVERTYPE\",
|
||||
\"location\": \"nbg1\",
|
||||
\"start_after_create\": true,
|
||||
\"image\": \"debian-9\",
|
||||
|
@ -61,7 +67,7 @@ function create_server()
|
|||
# Get IP:
|
||||
SERVER_IP=$(echo $CREATE_RESPONSE | jq -r .server.public_net.ipv4.ip)
|
||||
|
||||
echo "Created server with ID $SERVER_ID and IP $SERVER_IP"
|
||||
echo "Created server $SERVERNAME with ID $SERVER_ID and IP $SERVER_IP"
|
||||
}
|
||||
|
||||
|
||||
|
@ -142,22 +148,25 @@ else
|
|||
|
||||
# Run docker job
|
||||
echo "Starting Docker Job"
|
||||
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
|
||||
-v /root/secrets:/secrets \
|
||||
quay.io/netzbegruenung/green-spider spider.py \
|
||||
--credentials-path /secrets/datastore-writer.json \
|
||||
jobs
|
||||
#ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
|
||||
# -v /root/secrets:/secrets \
|
||||
# quay.io/netzbegruenung/green-spider spider.py \
|
||||
# --credentials-path /secrets/datastore-writer.json \
|
||||
# jobs
|
||||
|
||||
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP mkdir -p /dev-shm
|
||||
ssh -o StrictHostKeyChecking=no -q root@$SERVER_IP docker run -t \
|
||||
-v /dev-shm:/dev/shm \
|
||||
-v /root/secrets:/secrets \
|
||||
quay.io/netzbegruenung/green-spider spider.py \
|
||||
$DOCKERIMAGE \
|
||||
--credentials-path /secrets/datastore-writer.json \
|
||||
spider
|
||||
--loglevel info \
|
||||
spider --kind spider-results-dev
|
||||
|
||||
fi
|
||||
|
||||
# Delete the box
|
||||
echo "Deleting server $SERVER_ID"
|
||||
echo "Deleting server $SERVERNAME with ID $SERVER_ID"
|
||||
curl -s -X DELETE -H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer $API_TOKEN" \
|
||||
https://api.hetzner.cloud/v1/servers/$SERVER_ID
|
||||
|
|
|
@ -2,8 +2,7 @@
|
|||
Exports data from the database to JSON files for use in a static webapp
|
||||
"""
|
||||
|
||||
from google.cloud import datastore
|
||||
import hashlib
|
||||
from hashlib import md5
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
|
@ -14,44 +13,67 @@ import requests
|
|||
|
||||
SITEICONS_PATH = "/icons"
|
||||
|
||||
client = None
|
||||
|
||||
def export_results():
|
||||
def export_results(client, entity_kind):
|
||||
"""
|
||||
Export of the main results data
|
||||
"""
|
||||
out = []
|
||||
|
||||
query = client.query(kind='spider-results')
|
||||
# Load data from database
|
||||
query = client.query(kind=entity_kind)
|
||||
for entity in query.fetch():
|
||||
logging.debug(entity.key.name)
|
||||
record = dict(entity)
|
||||
record["results"]["created"] = record["created"].isoformat()
|
||||
out.append(record["results"])
|
||||
out.append({
|
||||
'input_url': entity.key.name,
|
||||
'resulting_urls': entity.get('checks').get('url_canonicalization'),
|
||||
'created': entity.get('created').isoformat(),
|
||||
'meta': entity.get('meta'),
|
||||
'checks': entity.get('checks'),
|
||||
'rating': entity.get('rating'),
|
||||
'score': entity.get('score'),
|
||||
'icons': [],
|
||||
})
|
||||
|
||||
# load icons, reformat icons details
|
||||
icons_downloaded = set()
|
||||
for index in range(len(out)):
|
||||
if "details" not in out[index]:
|
||||
continue
|
||||
if "icons" not in out[index]["details"]:
|
||||
continue
|
||||
urls = out[index]["details"]["icons"]
|
||||
out[index]["details"]["icons"] = {}
|
||||
for url in urls:
|
||||
if not (url.startswith("http://") or url.startswith("https://")):
|
||||
logging.debug("Skipping icon %s", url)
|
||||
continue
|
||||
logging.debug("Dowloading icon %s", url)
|
||||
filename = download_icon(url)
|
||||
assert "checks" in out[index]
|
||||
assert "html_head" in out[index]["checks"]
|
||||
|
||||
# collect icons urls
|
||||
icons = set()
|
||||
for url in out[index]['checks']['html_head']:
|
||||
assert 'link_icon' in out[index]['checks']['html_head'][url]
|
||||
if out[index]['checks']['html_head'][url]['link_icon'] is not None:
|
||||
iconurl = out[index]['checks']['html_head'][url]['link_icon']
|
||||
if iconurl.startswith("data:"):
|
||||
continue
|
||||
if iconurl in icons_downloaded:
|
||||
continue
|
||||
icons.add(iconurl)
|
||||
|
||||
out[index]["icons"] = {}
|
||||
for iconurl in list(icons):
|
||||
logging.debug("Dowloading icon %s", iconurl)
|
||||
icons_downloaded.add(iconurl)
|
||||
filename = download_icon(iconurl)
|
||||
if filename:
|
||||
out[index]["details"]["icons"][url] = filename
|
||||
out[index]["icons"][url] = filename
|
||||
|
||||
output_filename = "/out/spider_result.json"
|
||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
# compact version
|
||||
output_filename = "/out/spider_result_compact.json"
|
||||
for i in range(len(out)):
|
||||
out[i]['cms'] = list(out[i]['checks']['generator'].values())
|
||||
del out[i]['checks']
|
||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
|
||||
def export_screenshots():
|
||||
def export_screenshots(client):
|
||||
"""
|
||||
Export of screenshot meta data
|
||||
"""
|
||||
|
@ -78,10 +100,12 @@ def download_icon(icon_url):
|
|||
"""
|
||||
|
||||
default_endings = {
|
||||
"image/x-ico": "ico",
|
||||
"image/x-icon": "ico",
|
||||
"image/vnd.microsoft.icon": "ico",
|
||||
"image/png": "png",
|
||||
"image/jpeg": "jpg",
|
||||
"image/gif": "gif",
|
||||
}
|
||||
|
||||
# Download the icon
|
||||
|
@ -92,7 +116,7 @@ def download_icon(icon_url):
|
|||
if req.status_code >= 400:
|
||||
return None
|
||||
|
||||
content_hash = hashlib.md5(req.content).hexdigest()
|
||||
content_hash = md5(req.content).hexdigest()
|
||||
extension = ""
|
||||
|
||||
try:
|
||||
|
@ -109,6 +133,9 @@ def download_icon(icon_url):
|
|||
if extension == "":
|
||||
# derive from content type
|
||||
ctype = req.headers.get('content-type')
|
||||
if ctype is None:
|
||||
return
|
||||
|
||||
try:
|
||||
extension = default_endings[ctype]
|
||||
except KeyError:
|
||||
|
@ -122,17 +149,3 @@ def download_icon(icon_url):
|
|||
iconfile.write(req.content)
|
||||
|
||||
return filename
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
if len(sys.argv) == 1:
|
||||
print("Error: please provide path to Google Storage API system account JSON file as argument")
|
||||
sys.exit(1)
|
||||
|
||||
key_path = sys.argv[1]
|
||||
client = datastore.Client.from_service_account_json(key_path)
|
||||
|
||||
export_screenshots()
|
||||
export_results()
|
|
@@ -0,0 +1,180 @@
"""
The jobs module allows creating jobs for the queue and taking jobs off the queue.
"""

from datetime import datetime
import logging
import os
import random
import shutil

from git import Repo
import tenacity
import yaml
from google.api_core.exceptions import Aborted
from google.cloud import datastore

import config


def clone_data_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
    if os.path.exists(config.GREEN_DIRECTORY_LOCAL_PATH):
        shutil.rmtree(config.GREEN_DIRECTORY_LOCAL_PATH)
    Repo.clone_from(config.GREEN_DIRECTORY_REPO, config.GREEN_DIRECTORY_LOCAL_PATH)


def directory_entries():
    """
    Iterator over all data files in the cloned green directory
    """
    path = os.path.join(config.GREEN_DIRECTORY_LOCAL_PATH, config.GREEN_DIRECTORY_DATA_PATH)
    for root, _, files in os.walk(path):
        for fname in files:

            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue

            with open(filepath, 'r', encoding='utf8') as yamlfile:
                for doc in yaml.load_all(yamlfile):
                    yield doc


def chunks(the_list, size):
    """
    Yield successive n-sized chunks from list the_list
    where n = size.
    """
    for i in range(0, len(the_list), size):
        yield the_list[i:i + size]


def create_jobs(datastore_client, url=None):
    """
    Read all URLs from the green directory and fill a job database
    with one job per URL.

    Alternatively, if the url argument is given, only the given URL
    will be added as a spider job.
    """

    # refresh our local clone of the green directory
    logging.info("Refreshing green-directory clone")
    clone_data_directory()

    # build the list of website URLs to run checks for
    logging.info("Processing green-directory")
    input_entries = []

    count = 0

    random.seed()

    for entry in directory_entries():

        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
            continue

        website_url = None
        for index in range(len(entry['urls'])):
            try:
                if entry['urls'][index]['type'] == "WEBSITE":
                    website_url = entry['urls'][index]['url']
                    if website_url:
                        if url is not None and website_url != url:
                            continue
                        input_entries.append({
                            "url": website_url,
                            "type": entry.get("type"),
                            "level": entry.get("level"),
                            "state": entry.get("state"),
                            "district": entry.get("district"),
                            "city": entry.get("city"),
                        })
                        count += 1
            except NameError:
                logging.error("Error in %s: 'url' key missing (%s)",
                              repr_entry(entry), entry['urls'][index])

    # ensure the passed URL argument is really there, even if not part
    # of the directory.
    if url and count == 0:
        logging.info("Adding job for URL %s which is not part of green-directory", url)
        input_entries.append({
            "url": url,
            "type": None,
            "level": None,
            "state": None,
            "district": None,
            "city": None,
            "index": int(random.uniform(1000000, 9999999)),
        })

    count = 0
    logging.info("Writing jobs")

    entities = []

    for entry in input_entries:
        key = datastore_client.key(config.JOB_DATASTORE_KIND, entry["url"])
        entity = datastore.Entity(key=key)
        entity.update({
            "created": datetime.utcnow(),
            "type": entry["type"],
            "level": entry["level"],
            "state": entry["state"],
            "district": entry["district"],
            "city": entry["city"],
            "index": int(random.uniform(1000000, 9999999)),
        })
        entities.append(entity)

    # commit to DB
    for chunk in chunks(entities, 300):
        logging.debug("Writing jobs chunk of length %d", len(chunk))
        datastore_client.put_multi(chunk)
        count += len(chunk)

    logging.info("Writing jobs done, %s jobs added", count)


@tenacity.retry(wait=tenacity.wait_exponential(),
                retry=tenacity.retry_if_exception_type(Aborted))
def get_job_from_queue(datastore_client):
    """
    Returns a job (dict) from the queue, or None if the queue is empty
    """
    out = None

    with datastore_client.transaction():
        query = datastore_client.query(kind=config.JOB_DATASTORE_KIND,
                                       order=['index'])
        for entity in query.fetch(limit=1):
            logging.debug("Got job: %s", entity)
            out = dict(entity)
            out["url"] = entity.key.name
            datastore_client.delete(entity.key)

    return out


def repr_entry(entry):
    """
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
    ret = entry['type']
    if 'level' in entry:
        ret += "/" + entry['level']
    if 'state' in entry:
        ret += "/" + entry['state']
    if 'district' in entry:
        ret += "/" + entry['district']
    return ret

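A minimal sketch of how this module is meant to be driven, assuming a Google Cloud Datastore service account file and the project's config module with JOB_DATASTORE_KIND defined; the credentials path and the commented URL are illustrative, not taken from this change.

# Sketch: fill the job queue, then take one job off it again.
import logging

from google.cloud import datastore

import jobs

logging.basicConfig(level=logging.INFO)

client = datastore.Client.from_service_account_json('path/to/datastore-writer.json')

# One job per WEBSITE URL found in the green directory ...
jobs.create_jobs(client)

# ... or a single job for one URL only:
# jobs.create_jobs(client, url='https://example.com/')

# A worker then takes jobs off the queue until it is empty.
job = jobs.get_job_from_queue(client)
if job is not None:
    print(job['url'], job.get('state'), job.get('city'))
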
@@ -0,0 +1,53 @@
"""
The rating module contains the functionality to calculate scores for certain
criteria, based on the information gathered by the checks before.
"""

import logging

from rating import canonical_url
from rating import favicon
from rating import feeds
from rating import https
from rating import no_network_errors
from rating import no_script_errors
from rating import reachable
from rating import resolvable
from rating import response_duration
from rating import responsive_layout
from rating import use_specific_fonts
from rating import www_optional


def calculate_rating(results):
    """
    Calculates ratings for a number of criteria.

    Params:
    results - Results dictionary from checks
    """

    # The raters to execute.
    rating_modules = {
        'CANONICAL_URL': canonical_url,
        'DNS_RESOLVABLE_IPV4': resolvable,
        'FAVICON': favicon,
        'FEEDS': feeds,
        'HTTPS': https,
        'HTTP_RESPONSE_DURATION': response_duration,
        'NO_NETWORK_ERRORS': no_network_errors,
        'NO_SCRIPT_ERRORS': no_script_errors,
        'RESPONSIVE': responsive_layout,
        'SITE_REACHABLE': reachable,
        'USE_SPECIFIC_FONTS': use_specific_fonts,
        'WWW_OPTIONAL': www_optional,
    }

    output = {}

    for name in rating_modules:

        rater = rating_modules[name].Rater(results)
        output[name] = rater.rate()

    return output

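calculate_rating expects the full dictionary returned by the checks; every key a rater lists in depends_on_checks must be present, otherwise its assertion fails. A sketch with a hand-made, heavily trimmed results dict (all values are hypothetical, real results come from checks.perform_checks):

# Sketch: hypothetical check results covering every check the raters depend on.
import rating

fake_results = {
    'dns_resolution': {'example.com': {'resolvable': True}},
    'url_reachability': {'https://example.com/': {'exception': None}},
    'url_canonicalization': ['https://example.com/'],
    'html_head': {'https://example.com/': {'link_icon': '/favicon.ico',
                                           'link_rss_atom': ['/feed.xml']}},
    'page_content': {'https://example.com/': {'exception': None, 'duration': 80}},
    'load_in_browser': {'https://example.com/': {
        'logs': [],
        'font_families': ['arvo'],
        'min_document_width': 320,
        'sizes': [{'viewport_width': 320}],
    }},
}

scores = rating.calculate_rating(fake_results)
print(scores['CANONICAL_URL'])
# -> {'type': 'boolean', 'value': True, 'score': 1, 'max_score': 1}
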
@@ -0,0 +1,22 @@
class AbstractRater(object):

    # String 'boolean' or 'number'
    rating_type = None

    # The default value to return if no rating given
    default_value = None

    max_score = 1

    # Name of the checks this rater depends on
    depends_on_checks = []

    def __init__(self, check_results):
        self.check_results = check_results

        for item in self.depends_on_checks:
            assert item in self.check_results

    def rate(self):
        raise NotImplementedError()

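The assertion in the constructor makes a missing check fail loudly instead of silently rating zero. A short sketch with a hypothetical rater subclass (not part of this change):

# Sketch: missing dependencies are caught by the assert in __init__.
from rating.abstract_rater import AbstractRater


class DemoRater(AbstractRater):
    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['dns_resolution']

    def rate(self):
        return {'type': self.rating_type, 'value': True,
                'score': self.max_score, 'max_score': self.max_score}


DemoRater({'dns_resolution': {}})   # fine, dependency present
try:
    DemoRater({})                    # 'dns_resolution' missing
except AssertionError:
    print("rater refused to run without its check results")
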
@@ -0,0 +1,31 @@
"""
This looks at the remaining resolvable URLs after redirects
and gives a score if there is only one URL left.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_canonicalization']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        if len(self.check_results['url_canonicalization']) == 1:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

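The url_canonicalization check result is expected to be the list of URLs that remain after redirects have been followed. A short sketch with made-up data:

# Sketch: a single remaining URL earns the point, two or more do not.
from rating import canonical_url

good = {'url_canonicalization': ['https://example.com/']}
bad = {'url_canonicalization': ['http://example.com/', 'https://www.example.com/']}

print(canonical_url.Rater(good).rate()['score'])   # 1
print(canonical_url.Rater(bad).rate()['score'])    # 0
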
@@ -0,0 +1,32 @@
"""
This gives a score if the site has an icon.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['html_head']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        for url in self.check_results['html_head']:
            if self.check_results['html_head'][url]['link_icon'] is not None:
                value = True
                score = self.max_score
                break

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

@@ -0,0 +1,35 @@
"""
This gives a score if the site has feeds.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['html_head']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        for url in self.check_results['html_head']:
            if self.check_results['html_head'][url]['link_rss_atom'] is None:
                continue
            if self.check_results['html_head'][url]['link_rss_atom'] == []:
                continue
            value = True
            score = self.max_score
            break

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

@@ -0,0 +1,47 @@
"""
This looks at all HTTPS URLs we checked for reachability.

If all of them were reachable without errors, we give full score.
If some or all had errors, or no HTTPS URL is reachable, we give zero.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']

    # HTTPS is very important, so this counts double
    max_score = 2

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        reachable_count = 0
        unreachable_count = 0

        for url in self.check_results['url_reachability']:
            if not url.startswith('https://'):
                continue

            if self.check_results['url_reachability'][url]['exception'] is None:
                reachable_count += 1
            else:
                unreachable_count += 1

        if unreachable_count == 0 and reachable_count > 0:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

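Since max_score is 2, this criterion alone can contribute two points to the overall score. A small sketch of the url_reachability shape this rater reads (values are made up):

# Sketch: only https:// keys are considered; 'exception' decides reachability.
from rating import https

check_results = {
    'url_reachability': {
        'http://example.com/': {'exception': None},
        'https://example.com/': {'exception': None},
        'https://www.example.com/': {'exception': None},
    }
}

print(https.Rater(check_results).rate())
# -> {'type': 'boolean', 'value': True, 'score': 2, 'max_score': 2}
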
@@ -0,0 +1,48 @@
"""
If all URLs could be loaded without severe network errors, this rater gives a score.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['load_in_browser']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        found_pageloads = 0
        found_errors = 0
        for url in self.check_results['load_in_browser']:
            if (self.check_results['load_in_browser'][url]['logs'] == [] or
                    self.check_results['load_in_browser'][url]['logs'] is None):
                continue

            found_pageloads += 1

            # scan log entries for severe network errors
            for entry in self.check_results['load_in_browser'][url]['logs']:
                if entry['source'] != 'network':
                    continue
                if entry['level'] != 'SEVERE':
                    continue

                found_errors += 1

        if found_pageloads > 0 and found_errors == 0:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

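The rater reads the browser log collected by the load_in_browser check; only entries with source 'network' and level 'SEVERE' count against the site. A sketch with a single made-up log entry:

# Sketch: one severe network log entry is enough to withhold the score.
from rating import no_network_errors

check_results = {
    'load_in_browser': {
        'https://example.com/': {
            'logs': [{'source': 'network', 'level': 'SEVERE',
                      'message': 'failed to load /missing.css'}],
        }
    }
}

print(no_network_errors.Rater(check_results).rate())
# -> {'type': 'boolean', 'value': False, 'score': 0, 'max_score': 1}
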
@@ -0,0 +1,42 @@
"""
If all URLs could be loaded without JavaScript errors, this rater gives a score.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['load_in_browser']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        found_pageloads = 0
        found_errors = 0
        for url in self.check_results['load_in_browser']:
            if self.check_results['load_in_browser'][url]['logs'] == []:
                found_pageloads += 1
                continue

            # scan log entries for script errors
            for entry in self.check_results['load_in_browser'][url]['logs']:
                if entry['source'] == 'javascript':
                    found_errors += 1

        if found_pageloads > 0 and found_errors == 0:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

@@ -0,0 +1,36 @@
"""
This gives a score if one of the checked URL variations was reachable.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        count = 0
        for url in self.check_results['url_reachability']:
            if self.check_results['url_reachability'][url]['exception'] is not None:
                continue
            count += 1

        if count > 0:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

@@ -0,0 +1,35 @@
"""
This gives a score if one of the input URL's hostnames was resolvable.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['dns_resolution']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        count = 0
        for url in self.check_results['dns_resolution']:
            if self.check_results['dns_resolution'][url]['resolvable']:
                count += 1

        if count > 0:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

@@ -0,0 +1,46 @@
"""
This looks at the response duration(s) and scores based on the bucket
the value is in. Fast responses get one point, slower ones half a point,
more than a second gets nothing.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'number'
    default_value = False
    depends_on_checks = ['page_content']
    max_score = 1.0

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        duration_sum = 0
        duration_count = 0

        for url in self.check_results['page_content']:
            if self.check_results['page_content'][url]['exception'] is not None:
                continue
            duration_sum += self.check_results['page_content'][url]['duration']
            duration_count += 1

        if duration_count > 0:
            value = round(duration_sum / duration_count)

            # value is duration in milliseconds
            if value < 100:
                score = self.max_score
            elif value < 1000:
                score = self.max_score * 0.5

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

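The value is the average response duration in milliseconds over all URLs that answered without an exception; the 100 ms and 1000 ms thresholds are hard-coded. A short worked sketch (durations are made up):

# Sketch: an average of exactly 100 ms just misses the fast bucket and scores 0.5.
from rating import response_duration

check_results = {
    'page_content': {
        'https://example.com/': {'exception': None, 'duration': 90},
        'https://www.example.com/': {'exception': None, 'duration': 110},
    }
}

print(response_duration.Rater(check_results).rate())
# -> {'type': 'number', 'value': 100, 'score': 0.5, 'max_score': 1.0}
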
@@ -0,0 +1,35 @@
"""
This gives a score if the site's minimal document width during checks
was smaller than or equal to the minimal viewport size tested.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['load_in_browser']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        for url in self.check_results['load_in_browser']:
            if (self.check_results['load_in_browser'][url]['min_document_width'] <=
                    self.check_results['load_in_browser'][url]['sizes'][0]['viewport_width']):
                value = True
                score = self.max_score
            # we use the first URL found here
            break

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

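min_document_width is the narrowest document width observed in the browser, and sizes[0] is the smallest viewport tested; only the first URL in the results is considered. A sketch with made-up numbers:

# Sketch: a document no wider than the smallest tested viewport counts as responsive.
from rating import responsive_layout

check_results = {
    'load_in_browser': {
        'https://example.com/': {
            'min_document_width': 320,
            'sizes': [{'viewport_width': 320}, {'viewport_width': 768}],
        }
    }
}

print(responsive_layout.Rater(check_results).rate())
# -> {'type': 'boolean', 'value': True, 'score': 1, 'max_score': 1}
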
@@ -0,0 +1,41 @@
"""
Checks whether the pages use the font 'Arvo'.
"""

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['load_in_browser']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        urls_with_font = 0
        urls_without_font = 0
        for url in self.check_results['load_in_browser']:
            if self.check_results['load_in_browser'][url]['font_families'] is None:
                urls_without_font += 1
                continue

            fonts = " ".join(self.check_results['load_in_browser'][url]['font_families'])
            if 'arvo' in fonts:
                urls_with_font += 1

        if urls_with_font > 0 and urls_without_font == 0:
            score = self.max_score
            value = True

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

@@ -0,0 +1,44 @@
"""
This looks at reachable URLs and checks whether (sub)domains
both with and without www. are reachable.
"""

from urllib.parse import urlparse

from rating.abstract_rater import AbstractRater


class Rater(AbstractRater):

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        hostnames = set()
        for url in self.check_results['url_reachability']:
            if self.check_results['url_reachability'][url]['exception'] is not None:
                continue
            parsed = urlparse(url)
            hostnames.add(parsed.hostname)

        # FIXME
        # we simply check whether there is more than one hostname.
        # this works with our current input URLs but might be too
        # simplistic in the future.
        if len(list(hostnames)) > 1:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

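For the point to be given, url_reachability has to contain reachable URLs for more than one hostname variant, typically the www and the non-www form. A sketch with made-up data:

# Sketch: both host variants reachable -> more than one hostname -> full score.
from rating import www_optional

check_results = {
    'url_reachability': {
        'https://example.com/': {'exception': None},
        'https://www.example.com/': {'exception': None},
    }
}

print(www_optional.Rater(check_results).rate()['score'])   # 1
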
spider.py (deleted, 814 lines)

@@ -1,814 +0,0 @@
|
|||
"""
|
||||
Provides the spider functionality (website checks).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import shutil
|
||||
import statistics
|
||||
import time
|
||||
from datetime import datetime
|
||||
from socket import gethostbyname_ex
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
import tenacity
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from git import Repo
|
||||
from selenium import webdriver
|
||||
from google.cloud import datastore
|
||||
from google.api_core.exceptions import Aborted
|
||||
from google.api_core.exceptions import InvalidArgument
|
||||
|
||||
|
||||
# configuration
|
||||
|
||||
# connection timeout for website checks (seconds)
|
||||
CONNECT_TIMEOUT = 5
|
||||
|
||||
# response timeout for website checks
|
||||
READ_TIMEOUT = 10
|
||||
|
||||
# Git repo for our data
|
||||
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
|
||||
# folder in that repo that holds the data
|
||||
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
|
||||
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
|
||||
|
||||
RESULT_PATH = '/out'
|
||||
|
||||
# IP address of the newthinking GCMS server
|
||||
GCMS_IP = "91.102.13.20"
|
||||
|
||||
JOB_DATASTORE_KIND = 'spider-jobs'
|
||||
RESULTS_DATASTORE_KIND = 'spider-results'
|
||||
|
||||
# end configuration
|
||||
|
||||
DATASTORE_CLIENT = None
|
||||
|
||||
|
||||
def chunks(the_list, size):
|
||||
"""
|
||||
Yield successive n-sized chunks from list the_list
|
||||
where n = size.
|
||||
"""
|
||||
for i in range(0, len(the_list), size):
|
||||
yield the_list[i:i + size]
|
||||
|
||||
|
||||
def create_jobs(url=None):
|
||||
"""
|
||||
Read all URLs from green directory and fill a job database
|
||||
with one job per URL.
|
||||
|
||||
Alternatively, if the url argument is given, only the given URL
|
||||
will be added as a spider job.
|
||||
"""
|
||||
|
||||
# refresh our local clone of the green directory
|
||||
logging.info("Refreshing green-directory clone")
|
||||
get_green_directory()
|
||||
|
||||
# build the list of website URLs to run checks for
|
||||
logging.info("Processing green-directory")
|
||||
input_entries = []
|
||||
|
||||
count = 0
|
||||
|
||||
for entry in dir_entries():
|
||||
|
||||
if 'type' not in entry:
|
||||
logging.error("Entry without type")
|
||||
continue
|
||||
if 'urls' not in entry:
|
||||
logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
|
||||
continue
|
||||
|
||||
website_url = None
|
||||
for index in range(len(entry['urls'])):
|
||||
try:
|
||||
if entry['urls'][index]['type'] == "WEBSITE":
|
||||
website_url = entry['urls'][index]['url']
|
||||
if website_url:
|
||||
if url is not None and website_url != url:
|
||||
continue
|
||||
input_entries.append({
|
||||
"url": website_url,
|
||||
"level": entry.get("level"),
|
||||
"state": entry.get("state"),
|
||||
"district": entry.get("district"),
|
||||
"city": entry.get("city"),
|
||||
})
|
||||
count += 1
|
||||
except NameError:
|
||||
logging.error("Error in %s: 'url' key missing (%s)",
|
||||
repr_entry(entry), entry['urls'][index])
|
||||
|
||||
# ensure the passed URL argument is really there, even if not part
|
||||
# of the directory.
|
||||
if url and count == 0:
|
||||
logging.info("Adding job for URL %s which is not part of green-directory", url)
|
||||
input_entries.append({
|
||||
"url": url,
|
||||
"level": None,
|
||||
"state": None,
|
||||
"district": None,
|
||||
"city": None,
|
||||
})
|
||||
|
||||
# randomize order, to distribute requests over servers
|
||||
logging.debug("Shuffling input URLs")
|
||||
random.seed()
|
||||
random.shuffle(input_entries)
|
||||
|
||||
count = 0
|
||||
logging.info("Writing jobs")
|
||||
|
||||
entities = []
|
||||
|
||||
for entry in input_entries:
|
||||
key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
|
||||
entity = datastore.Entity(key=key)
|
||||
entity.update({
|
||||
"created": datetime.utcnow(),
|
||||
"level": entry["level"],
|
||||
"state": entry["state"],
|
||||
"district": entry["district"],
|
||||
"city": entry["city"],
|
||||
})
|
||||
entities.append(entity)
|
||||
|
||||
# commmit to DB
|
||||
for chunk in chunks(entities, 300):
|
||||
logging.debug("Writing jobs chunk of length %d", len(chunk))
|
||||
DATASTORE_CLIENT.put_multi(chunk)
|
||||
count += len(chunk)
|
||||
|
||||
logging.info("Writing jobs done, %s jobs added", count)
|
||||
|
||||
|
||||
def get_green_directory():
|
||||
"""
|
||||
Clones the source of website URLs, the green directory,
|
||||
into the local file system using git
|
||||
"""
|
||||
if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
|
||||
shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
|
||||
Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
|
||||
|
||||
|
||||
def dir_entries():
|
||||
"""
|
||||
Iterator over all data files in the cloned green directory
|
||||
"""
|
||||
path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
|
||||
for root, _, files in os.walk(path):
|
||||
for fname in files:
|
||||
|
||||
filepath = os.path.join(root, fname)
|
||||
if not filepath.endswith(".yaml"):
|
||||
continue
|
||||
|
||||
with open(filepath, 'r', encoding='utf8') as yamlfile:
|
||||
for doc in yaml.load_all(yamlfile):
|
||||
yield doc
|
||||
|
||||
|
||||
def repr_entry(entry):
|
||||
"""
|
||||
Return string representation of a directory entry,
|
||||
for logging/debugging purposes
|
||||
"""
|
||||
ret = entry['type']
|
||||
if 'level' in entry:
|
||||
ret += "/" + entry['level']
|
||||
if 'state' in entry:
|
||||
ret += "/" + entry['state']
|
||||
if 'district' in entry:
|
||||
ret += "/" + entry['district']
|
||||
return ret
|
||||
|
||||
|
||||
def derive_test_hostnames(hostname):
|
||||
"""
|
||||
Derives the hostnames variants to test for a given host name.
|
||||
From 'gruene-x.de' or 'www.gruene-x.de' it makes
|
||||
|
||||
['gruene-x.de', 'www.gruene-x.de']
|
||||
|
||||
which are both plausible web URLs to be used for a domain.
|
||||
"""
|
||||
|
||||
hostnames = set()
|
||||
|
||||
hostnames.add(hostname)
|
||||
if hostname.startswith('www.'):
|
||||
hostnames.add(hostname[4:])
|
||||
else:
|
||||
hostnames.add('www.' + hostname)
|
||||
|
||||
return sorted(list(hostnames))
|
||||
|
||||
|
||||
def reduce_urls(urllist):
|
||||
"""
|
||||
Reduce a list of urls with metadata by eliminating those
|
||||
that either don't work or lead somewhere else
|
||||
"""
|
||||
targets = set()
|
||||
for url in urllist:
|
||||
if url['error'] is not None:
|
||||
continue
|
||||
if url['redirects_to'] is not None:
|
||||
targets.add(url['redirects_to'])
|
||||
else:
|
||||
targets.add(url['url'])
|
||||
return sorted(list(targets))
|
||||
|
||||
|
||||
def normalize_title(title):
|
||||
"""
|
||||
Removes garbage from HTML page titles
|
||||
"""
|
||||
title = title.replace(u'\u00a0', ' ')
|
||||
title = title.replace(' ', ' ')
|
||||
title = title.strip()
|
||||
return title
|
||||
|
||||
|
||||
def check_responsiveness(url):
|
||||
"""
|
||||
Checks
|
||||
- whether a page adapts to different viewport sizes
|
||||
- whether a viewport meta tag exists
|
||||
and returns details
|
||||
"""
|
||||
details = {
|
||||
'document_width': {},
|
||||
'viewport_meta_tag': None,
|
||||
}
|
||||
|
||||
# sizes we check for (width, height)
|
||||
sizes = (
|
||||
(320, 480), # old smartphone
|
||||
(768, 1024), # older tablet or newer smartphone
|
||||
(1024, 768), # older desktop or horiz. tablet
|
||||
(1920, 1080), # Full HD horizontal
|
||||
)
|
||||
|
||||
# Our selenium user agent using Chrome headless as an engine
|
||||
chrome_options = webdriver.ChromeOptions()
|
||||
chrome_options.add_argument('--headless')
|
||||
chrome_options.add_argument('--disable-gpu')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
chrome_options.add_argument('--disable-extensions')
|
||||
driver = webdriver.Chrome(chrome_options=chrome_options)
|
||||
driver.set_page_load_timeout(60)
|
||||
driver.set_window_size(sizes[0][0], sizes[0][1])
|
||||
driver.get(url)
|
||||
time.sleep(1)
|
||||
|
||||
for (width, height) in sizes:
|
||||
driver.set_window_size(width, height)
|
||||
key = "%sx%s" % (width, height)
|
||||
width = driver.execute_script("return document.body.scrollWidth")
|
||||
details['document_width'][key] = int(width)
|
||||
|
||||
try:
|
||||
element = driver.find_element_by_xpath("//meta[@name='viewport']")
|
||||
details['viewport_meta_tag'] = element.get_attribute('content')
|
||||
except:
|
||||
pass
|
||||
|
||||
return details
|
||||
|
||||
|
||||
def check_content(req):
|
||||
"""
|
||||
Adds details to check regarding content of the page
|
||||
|
||||
check: the dict containing details for this URL
|
||||
r: requests request/response object
|
||||
"""
|
||||
result = {}
|
||||
|
||||
result['encoding'] = req.encoding.lower()
|
||||
soup = BeautifulSoup(req.text, 'html.parser')
|
||||
|
||||
result['html'] = req.text
|
||||
|
||||
# page title
|
||||
result['title'] = None
|
||||
title = None
|
||||
head = soup.find('head')
|
||||
if head is not None:
|
||||
title = head.find('title')
|
||||
if title is not None:
|
||||
result['title'] = normalize_title(title.get_text())
|
||||
|
||||
# canonical link
|
||||
result['canonical_link'] = None
|
||||
link = soup.find('link', rel='canonical')
|
||||
if link:
|
||||
result['canonical_link'] = urljoin(req.url, link.get('href'))
|
||||
|
||||
# icon
|
||||
result['icon'] = None
|
||||
link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
|
||||
if link:
|
||||
result['icon'] = urljoin(req.url, link.get('href'))
|
||||
else:
|
||||
link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
|
||||
if link:
|
||||
result['icon'] = urljoin(req.url, link.get('href'))
|
||||
|
||||
# feed links
|
||||
result['feeds'] = []
|
||||
rss_links = soup.find_all('link', type='application/rss+xml')
|
||||
atom_links = soup.find_all('link', type='application/atom+xml')
|
||||
|
||||
if rss_links:
|
||||
for link in rss_links:
|
||||
result['feeds'].append(urljoin(req.url, link.get('href')))
|
||||
if atom_links:
|
||||
for link in rss_links:
|
||||
result['feeds'].append(urljoin(req.url, link.get('href')))
|
||||
|
||||
# generator meta tag
|
||||
result['generator'] = None
|
||||
if head is not None:
|
||||
generator = head.select('[name=generator]')
|
||||
if generator:
|
||||
result['generator'] = generator[0].get('content')
|
||||
|
||||
# opengraph meta tags
|
||||
result['opengraph'] = None
|
||||
opengraph = set()
|
||||
if head is not None:
|
||||
for item in head.find_all(property=re.compile('^og:')):
|
||||
opengraph.add(item.get('property'))
|
||||
for item in head.find_all(itemprop=re.compile('^og:')):
|
||||
opengraph.add(item.get('itemprop'))
|
||||
if opengraph:
|
||||
result['opengraph'] = sorted(list(opengraph))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def collect_ipv4_addresses(hostname_dict):
|
||||
"""
|
||||
Return list of unique IPv4 addresses
|
||||
"""
|
||||
ips = set()
|
||||
for item in hostname_dict.values():
|
||||
if 'ip_addresses' not in item:
|
||||
continue
|
||||
for ip_addr in item['ip_addresses']:
|
||||
ips.add(ip_addr)
|
||||
return sorted(list(ips))
|
||||
|
||||
|
||||
def parse_generator(generator):
|
||||
"""
|
||||
Return well known CMS names from generator
|
||||
"""
|
||||
generator = generator.lower()
|
||||
if 'typo3' in generator:
|
||||
return "typo3"
|
||||
if 'wordpress' in generator:
|
||||
return "wordpress"
|
||||
if 'drupal' in generator:
|
||||
return "drupal"
|
||||
if 'joomla' in generator:
|
||||
return "joomla"
|
||||
return generator
|
||||
|
||||
def check_site(entry):
|
||||
"""
|
||||
Performs our site check and returns results as a dict.
|
||||
|
||||
1. Normalize the input URL and derive the URLs to check for
|
||||
2. HEAD the check urls
|
||||
3. Determine the canonical URL
|
||||
4. Run full check on canonical URL
|
||||
"""
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
|
||||
'AppleWebKit/537.36 (KHTML, like Gecko) ' +
|
||||
'Chrome/65.0.3325.181 green-spider/0.1'
|
||||
}
|
||||
|
||||
# all the info we'll return for the site
|
||||
result = {
|
||||
# input_url: The URL we derived all checks from
|
||||
'input_url': entry['url'],
|
||||
# Meta: Regional and type metadata for the site
|
||||
'meta': {
|
||||
'level': entry.get('level'),
|
||||
'state': entry.get('state'),
|
||||
'district': entry.get('district'),
|
||||
'city': entry.get('city'),
|
||||
},
|
||||
# Details: All details we collected about the site (which aren't directly
|
||||
# related to the report criteria)
|
||||
'details': {
|
||||
'hostnames': {},
|
||||
'ipv4_addresses': [],
|
||||
'resolvable_urls': [],
|
||||
'canonical_urls': [],
|
||||
'urlchecks': [],
|
||||
'icons': [],
|
||||
'feeds': [],
|
||||
'cms': None,
|
||||
'responsive': None,
|
||||
},
|
||||
# The actual report criteria
|
||||
'result': {
|
||||
'DNS_RESOLVABLE_IPV4': {'type': 'boolean', 'value': False, 'score': 0},
|
||||
'SITE_REACHABLE': {'type': 'boolean', 'value': False, 'score': 0},
|
||||
'HTTPS': {'type': 'boolean', 'value': False, 'score': 0},
|
||||
'WWW_OPTIONAL': {'type': 'boolean', 'value': False, 'score': 0},
|
||||
'CANONICAL_URL': {'type': 'boolean', 'value': False, 'score': 0},
|
||||
'FAVICON': {'type': 'boolean', 'value': False, 'score': 0},
|
||||
'FEEDS': {'type': 'boolean', 'value': False, 'score': 0},
|
||||
'HTTP_RESPONSE_DURATION': {'type': 'number', 'value': None, 'score': 0},
|
||||
'RESPONSIVE': {'type': 'boolean', 'value': False, 'score': 0},
|
||||
},
|
||||
'score': 0.0,
|
||||
}
|
||||
|
||||
# derive hostnames to test (with/without www.)
|
||||
parsed = urlparse(entry['url'])
|
||||
hostnames = derive_test_hostnames(parsed.hostname)
|
||||
|
||||
# try to resolve hostnames
|
||||
processed_hostnames = {}
|
||||
for hostname in hostnames:
|
||||
|
||||
processed_hostnames[hostname] = {
|
||||
'resolvable': False,
|
||||
}
|
||||
|
||||
try:
|
||||
hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
|
||||
processed_hostnames[hostname]['resolvable'] = True
|
||||
processed_hostnames[hostname]['resolved_hostname'] = hostname
|
||||
processed_hostnames[hostname]['aliases'] = aliases
|
||||
processed_hostnames[hostname]['ip_addresses'] = ip_addresses
|
||||
except:
|
||||
pass
|
||||
|
||||
result['details']['hostnames'] = processed_hostnames
|
||||
|
||||
result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)
|
||||
|
||||
# check basic HTTP(S) reachability
|
||||
checked_urls = []
|
||||
checked_urls_set = set()
|
||||
|
||||
for hostname in processed_hostnames.keys():
|
||||
|
||||
item = processed_hostnames[hostname]
|
||||
|
||||
if not item['resolvable']:
|
||||
continue
|
||||
|
||||
for scheme in ('http', 'https'):
|
||||
|
||||
url = scheme + '://' + item['resolved_hostname'] + '/'
|
||||
|
||||
if url in checked_urls_set:
|
||||
continue
|
||||
|
||||
checked_urls_set.add(url)
|
||||
|
||||
record = {
|
||||
'url': url,
|
||||
'error': None,
|
||||
'redirects_to': None,
|
||||
}
|
||||
|
||||
try:
|
||||
req = requests.head(record['url'], headers=headers, allow_redirects=True)
|
||||
if req.url == url:
|
||||
logging.info("URL: %s - status %s", record['url'], req.status_code)
|
||||
else:
|
||||
logging.info("URL: %s - status %s - redirects to %s", record['url'],
|
||||
req.status_code, req.url)
|
||||
record['redirects_to'] = req.url
|
||||
except Exception as exc:
|
||||
record['error'] = {
|
||||
'type': str(type(exc)),
|
||||
'message': str(exc),
|
||||
}
|
||||
logging.info("URL %s: %s %s", url, str(type(exc)), exc)
|
||||
|
||||
checked_urls.append(record)
|
||||
|
||||
result['details']['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
|
||||
result['details']['canonical_urls'] = sorted(reduce_urls(checked_urls))
|
||||
|
||||
# Deeper test for the remaining (canonical) URL(s)
|
||||
for check_url in result['details']['canonical_urls']:
|
||||
|
||||
logging.info("Downloading URL %s", check_url)
|
||||
|
||||
check = {
|
||||
'url': check_url,
|
||||
'status_code': None,
|
||||
'duration': None,
|
||||
'error': None,
|
||||
'content': None,
|
||||
'responsive': None,
|
||||
}
|
||||
|
||||
try:
|
||||
req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
|
||||
check['status_code'] = req.status_code
|
||||
check['duration'] = round(req.elapsed.microseconds / 1000)
|
||||
|
||||
# Content checks
|
||||
if req.status_code < 300:
|
||||
check['content'] = check_content(req)
|
||||
|
||||
# Responsiveness check
|
||||
try:
|
||||
check['responsive'] = check_responsiveness(check_url)
|
||||
except Exception as exc:
|
||||
logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
|
||||
|
||||
except requests.exceptions.ConnectionError as exc:
|
||||
logging.error(str(exc) + " " + check_url)
|
||||
check['error'] = "connection"
|
||||
except requests.exceptions.ReadTimeout as exc:
|
||||
logging.error(str(exc) + " " + check_url)
|
||||
check['error'] = "read_timeout"
|
||||
except requests.exceptions.Timeout as exc:
|
||||
logging.error(str(exc) + " " + check_url)
|
||||
check['error'] = "connection_timeout"
|
||||
except Exception as exc:
|
||||
logging.error(str(exc) + " " + check_url)
|
||||
check['error'] = "unknown"
|
||||
|
||||
result['details']['urlchecks'].append(check)
|
||||
|
||||
|
||||
result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
|
||||
key=lambda url: url['url'])
|
||||
|
||||
# collect icons
|
||||
icons = set()
|
||||
for c in result['details']['urlchecks']:
|
||||
if 'content' not in c:
|
||||
continue
|
||||
if c['content'] is None:
|
||||
logging.warning("No content for %s", entry['url'])
|
||||
continue
|
||||
if c['content']['icon'] is not None:
|
||||
icons.add(c['content']['icon'])
|
||||
result['details']['icons'] = sorted(list(icons))
|
||||
|
||||
# collect feeds
|
||||
feeds = set()
|
||||
for c in result['details']['urlchecks']:
|
||||
if c['content'] is None:
|
||||
logging.warning("No content for %s", entry['url'])
|
||||
continue
|
||||
if 'feeds' in c['content'] and len(c['content']['feeds']):
|
||||
for feed in c['content']['feeds']:
|
||||
feeds.add(feed)
|
||||
result['details']['feeds'] = sorted(list(feeds))
|
||||
|
||||
# detect responsive
|
||||
viewports = set()
|
||||
min_width = 2000
|
||||
for c in result['details']['urlchecks']:
|
||||
if c['responsive'] is None:
|
||||
continue
|
||||
if c['responsive']['viewport_meta_tag'] is not None:
|
||||
viewports.add(c['responsive']['viewport_meta_tag'])
|
||||
widths = c['responsive']['document_width'].values()
|
||||
if min(widths) < min_width:
|
||||
min_width = min(widths)
|
||||
result['details']['responsive'] = {
|
||||
'viewport_meta_tag': list(viewports),
|
||||
'min_width': min_width,
|
||||
}
|
||||
|
||||
# detect CMS
|
||||
for c in result['details']['urlchecks']:
|
||||
if c['content'] is None:
|
||||
continue
|
||||
if 'generator' not in c['content']:
|
||||
continue
|
||||
if c['content']['generator'] != "" and c['content']['generator'] is not None:
|
||||
|
||||
result['details']['cms'] = parse_generator(c['content']['generator'])
|
||||
# Qualify certain CMS flavours in more detail
|
||||
if result['details']['cms'] == "typo3":
|
||||
if GCMS_IP in result['details']['ipv4_addresses']:
|
||||
result['details']['cms'] = "typo3-gcms"
|
||||
elif 'typo3-gruene.de' in c['content']['html']:
|
||||
result['details']['cms'] = "typo3-gruene"
|
||||
elif result['details']['cms'] == "wordpress":
|
||||
if 'Urwahl3000' in c['content']['html']:
|
||||
result['details']['cms'] = "wordpress-urwahl"
|
||||
|
||||
else:
|
||||
# No generator Tag. Use HTML content.
|
||||
if 'Urwahl3000' in c['content']['html']:
|
||||
result['details']['cms'] = "wordpress-urwahl"
|
||||
elif ('josephknowsbest' in c['content']['html'] or
|
||||
'Joseph-knows-best' in c['content']['html']):
|
||||
result['details']['cms'] = "wordpress-josephknowsbest"
|
||||
elif 'wordpress' in c['content']['html']:
|
||||
result['details']['cms'] = "wordpress"
|
||||
|
||||
# we can stop here
|
||||
break
|
||||
|
||||
|
||||
### Derive criteria
|
||||
|
||||
# DNS_RESOLVABLE_IPV4
|
||||
if result['details']['ipv4_addresses']:
|
||||
result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
|
||||
|
||||
# SITE_REACHABLE
|
||||
for item in result['details']['resolvable_urls']:
|
||||
if item['error'] is None:
|
||||
result['result']['SITE_REACHABLE'] = {'value': True, 'score': 1}
|
||||
break
|
||||
|
||||
# HTTPS
|
||||
for item in result['details']['urlchecks']:
|
||||
if item['error'] is None and item['url'].startswith('https://'):
|
||||
result['result']['HTTPS'] = {'value': True, 'score': 2}
|
||||
break
|
||||
|
||||
# WWW_OPTIONAL
|
||||
num_hostnames = 0
|
||||
for hostname in result['details']['hostnames'].keys():
|
||||
item = result['details']['hostnames'][hostname]
|
||||
if not item['resolvable']:
|
||||
continue
|
||||
num_hostnames += 1
|
||||
if num_hostnames > 1:
|
||||
result['result']['WWW_OPTIONAL'] = {'value': True, 'score': 1}
|
||||
|
||||
# CANONICAL_URL
|
||||
# - either there is only one canonical URL (through redirects)
|
||||
# - or several pages have identical rel=canonical links
|
||||
if len(result['details']['canonical_urls']) == 1:
|
||||
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
|
||||
else:
|
||||
links = set()
|
||||
if result['details']['urlchecks'] is None:
|
||||
logging.warning("No urlchecks for %s", entry['url'])
|
||||
else:
|
||||
for item in result['details']['urlchecks']:
|
||||
if item['content'] is not None and item['content']['canonical_link'] is not None:
|
||||
links.add(item['content']['canonical_link'])
|
||||
if len(links) == 1:
|
||||
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
|
||||
|
||||
# FAVICON
|
||||
if result['details']['icons']:
|
||||
result['result']['FAVICON'] = {'value': True, 'score': 1}
|
||||
|
||||
# FEEDS
|
||||
if result['details']['feeds']:
|
||||
result['result']['FEEDS'] = {'value': True, 'score': 1}
|
||||
|
||||
# HTTP_RESPONSE_DURATION
|
||||
durations = []
|
||||
for item in result['details']['urlchecks']:
|
||||
if item['error'] is None:
|
||||
durations.append(item['duration'])
|
||||
if durations:
|
||||
val = round(statistics.mean(durations))
|
||||
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
|
||||
if val < 100:
|
||||
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
|
||||
elif val < 1000:
|
||||
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
|
||||
|
||||
# RESPONSIVE
|
||||
if result['details']['responsive'] is not None:
|
||||
if (result['details']['responsive']['min_width'] < 500 and
|
||||
len(result['details']['responsive']['viewport_meta_tag']) > 0):
|
||||
result['result']['RESPONSIVE']['value'] = True
|
||||
result['result']['RESPONSIVE']['score'] = 1
|
||||
|
||||
# Overall score
|
||||
for item in result['result'].keys():
|
||||
result['score'] += result['result'][item]['score']
|
||||
|
||||
# clean up - remove full HTML
|
||||
for item in result['details']['urlchecks']:
|
||||
try:
|
||||
del item['content']['html']
|
||||
except:
|
||||
pass
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@tenacity.retry(wait=tenacity.wait_exponential(),
|
||||
retry=tenacity.retry_if_exception_type(Aborted))
|
||||
def get_job_from_queue():
|
||||
"""
|
||||
Returns a URL from the queue
|
||||
"""
|
||||
out = None
|
||||
|
||||
with DATASTORE_CLIENT.transaction():
|
||||
query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
|
||||
for entity in query.fetch(limit=1):
|
||||
logging.debug("Got job: %s", entity)
|
||||
out = dict(entity)
|
||||
out["url"] = entity.key.name
|
||||
DATASTORE_CLIENT.delete(entity.key)
|
||||
|
||||
return out
|
||||
|
||||
def work_of_queue():
|
||||
"""
|
||||
Take job from queue and finish it until there are no more jobs
|
||||
"""
|
||||
while True:
|
||||
job = get_job_from_queue()
|
||||
if job is None:
|
||||
logging.info("No more jobs. Exiting.")
|
||||
break
|
||||
|
||||
logging.info("Starting job %s", job["url"])
|
||||
result = check_site(entry=job)
|
||||
#logging.debug(result)
|
||||
logging.info("Job %s finished checks", job["url"])
|
||||
logging.info("Job %s writing to DB", job["url"])
|
||||
|
||||
key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
|
||||
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
|
||||
record = {
|
||||
"created": datetime.utcnow(),
|
||||
"results": result,
|
||||
}
|
||||
entity.update(record)
|
||||
try:
|
||||
DATASTORE_CLIENT.put(entity)
|
||||
except InvalidArgument as ex:
|
||||
logging.error("Could not write result: %s", ex)
|
||||
except ex:
|
||||
logging.error("Could not write result: %s", ex)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
Bringing it all together
|
||||
"""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--credentials-path', dest='credentials_path',
|
||||
help='Path to the service account credentials JSON file',
|
||||
default='/secrets/service-account.json')
|
||||
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
|
||||
default='info')
|
||||
|
||||
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
|
||||
|
||||
subparsers.add_parser('spider', help='Take jobs off the queue and spider')
|
||||
|
||||
jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
|
||||
|
||||
jobs_parser.add_argument('--url', help='Add a job to spider a URL')
|
||||
args = parser.parse_args()
|
||||
|
||||
loglevel = args.loglevel.lower()
|
||||
if loglevel == 'error':
|
||||
logging.basicConfig(level=logging.ERROR)
|
||||
elif loglevel == 'warn':
|
||||
logging.basicConfig(level=logging.WARN)
|
||||
elif loglevel == 'debug':
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
loglevel = 'info'
|
||||
|
||||
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
|
||||
|
||||
DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
|
||||
|
||||
logging.debug("Called command %s", args.command)
|
||||
|
||||
if args.command == 'jobs':
|
||||
create_jobs(args.url)
|
||||
else:
|
||||
work_of_queue()
|
|
@@ -0,0 +1,106 @@
"""
Provides the spider functionality (website checks).
"""

import argparse
import json
import logging
import re
import statistics
import time
from datetime import datetime
from pprint import pprint

from google.api_core.exceptions import InvalidArgument
from google.cloud import datastore

import checks
import config
import jobs
import rating


def check_and_rate_site(entry):
    """
    Performs our site check and returns results as a dict.

    1. Normalize the input URL and derive the URLs to check for
    2. HEAD the check urls
    3. Determine the canonical URL
    4. Run full check on canonical URL
    """

    # all the info we'll return for the site
    result = {
        # input_url: The URL we derived all checks from
        'input_url': entry['url'],
        # Meta: Regional and type metadata for the site
        'meta': {
            'type': entry.get('type'),
            'level': entry.get('level'),
            'state': entry.get('state'),
            'district': entry.get('district'),
            'city': entry.get('city'),
        },
        # checks: Results from our checks
        'checks': {},
        # The actual report scoring criteria
        'rating': {},
        # resulting score
        'score': 0.0,
    }

    # Results from our next generation checkers
    result['checks'] = checks.perform_checks(entry['url'])

    result['rating'] = rating.calculate_rating(result['checks'])

    # Overall score is the sum of the individual scores
    for key in result['rating']:
        result['score'] += result['rating'][key]['score']

    # remove full HTML page content,
    # as it's no longer needed
    try:
        for url in result['checks']['page_content']:
            del result['checks']['page_content'][url]['content']
    except:
        pass

    return result


def work_of_queue(datastore_client, entity_kind):
    """
    Takes jobs off the queue and processes them until there are no more jobs
    """
    while True:
        job = jobs.get_job_from_queue(datastore_client)
        if job is None:
            logging.info("No more jobs. Exiting.")
            break

        logging.info("Starting job %s", job["url"])
        result = check_and_rate_site(entry=job)

        logging.debug("Full JSON representation of returned result: %s", json.dumps(result))

        logging.info("Job %s finished checks", job["url"])
        logging.info("Job %s writing to DB", job["url"])

        key = datastore_client.key(entity_kind, job["url"])
        entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
        record = {
            'created': datetime.utcnow(),
            'meta': result['meta'],
            'checks': result['checks'],
            'rating': result['rating'],
            'score': result['score'],
        }
        entity.update(record)
        try:
            datastore_client.put(entity)
        except InvalidArgument as ex:
            logging.error("Could not write result: %s", ex)
        except Exception as ex:
            logging.error("Could not write result: %s", ex)

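work_of_queue is the worker loop: it keeps taking jobs, runs checks and rating, and stores one Datastore entity per URL, keyed by the job's URL. A minimal sketch of starting a worker directly; the credentials path is illustrative, and 'spider-results' is the results kind used elsewhere in the project:

# Sketch: drive the worker loop directly (credentials path is illustrative).
import logging

from google.cloud import datastore

from spider.spider import work_of_queue

logging.basicConfig(level=logging.INFO)

client = datastore.Client.from_service_account_json('path/to/service-account.json')

# Processes jobs until the queue is empty; each result becomes one entity
# of the given kind, keyed by the job's URL.
work_of_queue(client, 'spider-results')
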
@@ -0,0 +1,26 @@
import unittest

from spider.spider import check_and_rate_site

from pprint import pprint


class TestSpider(unittest.TestCase):

    def test_url1(self):

        entry = {
            "url": "https://httpbin.org/html",
            "type": "type",
            "state": "state",
            "level": "level",
            "district": "district",
            "city": "city",
        }

        url = "https://httpbin.org/html"
        result = check_and_rate_site(entry)

        self.assertEqual(result["input_url"], url)


if __name__ == '__main__':
    unittest.main()

spider_test.py (deleted, 125 lines)

@@ -1,125 +0,0 @@
|
|||
import unittest
|
||||
import requests
|
||||
import responses
|
||||
import spider
|
||||
|
||||
|
||||
class TestDeriveHostnames(unittest.TestCase):
|
||||
|
||||
def test_basic1(self):
|
||||
hn = spider.derive_test_hostnames('www.my-domain.de')
|
||||
expected = ['my-domain.de', 'www.my-domain.de']
|
||||
self.assertEqual(hn, expected)
|
||||
|
||||
def test_basic2(self):
|
||||
hn = spider.derive_test_hostnames('domain.de')
|
||||
expected = ['domain.de', 'www.domain.de']
|
||||
self.assertEqual(hn, expected)
|
||||
|
||||
|
||||
class TestReduceURLs(unittest.TestCase):
|
||||
|
||||
def test_basic(self):
|
||||
testdata = [
|
||||
{'url': 'one', 'error': None, 'redirects_to': None},
|
||||
{'url': 'two', 'error': 'Yes', 'redirects_to': None},
|
||||
{'url': 'three', 'error': None, 'redirects_to': 'five'},
|
||||
]
|
||||
expected_result = ['five', 'one']
|
||||
result = spider.reduce_urls(testdata)
|
||||
self.assertEqual(result, expected_result)
|
||||
|
||||
|
||||
class TestContentChecks(unittest.TestCase):
|
||||
|
||||
@responses.activate
|
||||
def test_minimal(self):
|
||||
url = 'http://my.url'
|
||||
responses.add(responses.GET, url, status=200,
|
||||
content_type='text/html',
|
||||
body='<html></html>')
|
||||
r = requests.get(url)
|
||||
result = spider.check_content(r)
|
||||
|
||||
del result['html'] # don't want to have the messy HTML part in comparison
|
||||
|
||||
expected_result = {
|
||||
'icon': None,
|
||||
'title': None,
|
||||
'generator': None,
|
||||
'feeds': [],
|
||||
'encoding': 'iso-8859-1',
|
||||
'canonical_link': None,
|
||||
'opengraph': None
|
||||
}
|
||||
self.assertDictEqual(result, expected_result)
|
||||
|
||||
@responses.activate
|
||||
def test_basic(self):
|
||||
url = 'http://my.url'
|
||||
responses.add(responses.GET, url, status=200,
|
||||
content_type='text/html; charset=UTF-8',
|
||||
body='''
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title> The page's title </title>
|
||||
<meta name="generator" content="some-cms/1.0">
|
||||
<link rel="shortcut icon" href="http://foo.bar/image.png">
|
||||
<link rel="alternate" type="application/rss+xml" href="http://example.com/feed">
|
||||
<link rel="canonical" href="https://my.site.com/">
|
||||
</head>
|
||||
</html>
|
||||
''')
|
||||
r = requests.get(url)
|
||||
result = spider.check_content(r)
|
||||
|
||||
del result['html'] # don't want to have the messy HTML part in comparison
|
||||
|
||||
expected_result = {
|
||||
'icon': 'http://foo.bar/image.png',
|
||||
'title': 'The page\'s title',
|
||||
'generator': 'some-cms/1.0',
|
||||
'feeds': [
|
||||
'http://example.com/feed',
|
||||
],
|
||||
'encoding': 'utf-8',
|
||||
'canonical_link': 'https://my.site.com/',
|
||||
'opengraph': None
|
||||
}
|
||||
self.assertDictEqual(result, expected_result)
|
||||
|
||||
@responses.activate
|
||||
def test_opengraph(self):
|
||||
url = 'http://my.url'
|
||||
responses.add(responses.GET, url, status=200,
|
||||
content_type='text/html; charset=UTF-8',
|
||||
body='''
|
||||
<html>
|
||||
<head>
|
||||
<meta property="og:title" content="The Rock" />
|
||||
<meta property="og:type" content="video.movie" />
|
||||
<meta property="og:url" content="http://www.foor.bar" />
|
||||
<meta property="og:image" content="http://www.foo.bar/foo.jpg" />
|
||||
</head>
|
||||
</html>
|
||||
''')
|
||||
r = requests.get(url)
|
||||
result = spider.check_content(r)
|
||||
|
||||
del result['html'] # don't want to have the messy HTML part in comparison
|
||||
|
||||
expected_result = {
|
||||
'icon': None,
|
||||
'title': None,
|
||||
'generator': None,
|
||||
'feeds': [],
|
||||
'encoding': 'utf-8',
|
||||
'canonical_link': None,
|
||||
'opengraph': ['og:image', 'og:title', 'og:type', 'og:url'],
|
||||
}
|
||||
self.assertDictEqual(result, expected_result)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|