Start refactoring checks

This commit is contained in:
Marian Steinbach 2018-09-25 00:44:07 +02:00
parent 3e1c0ede73
commit a2287388dd
5 changed files with 168 additions and 53 deletions

35
checks/__init__.py Normal file
View file

@ -0,0 +1,35 @@
"""
The checks module contains the individual checks we perform with a page
"""
import logging
from checks import subdomain_variations
#from checks import home_url_canonicalization
#from checks import http_and_https
from checks.config import Config
def perform_checks(input_url):
    """
    Run all registered checks for input_url, one after the other.

    Every check module has to provide a ``Checker`` class which is constructed
    with a Config and offers ``run()`` plus a ``config`` property. The initial
    Config holds only input_url.

    Returns a dict mapping each check name to the value its run() returned.
    """
    # Order matters: each check is handed the config of the one before it.
    registry = (
        ('subdomain_variations', subdomain_variations),
        #("home_url_canonicalization", home_url_canonicalization),
        #("http_and_https", http_and_https),
    )

    outcomes = {}
    current_config = Config(urls=[input_url])

    for name, module in registry:
        instance = module.Checker(current_config)
        outcomes[name] = instance.run()

        # carry the (possibly extended) config over to the next check
        current_config = instance.config

    return outcomes

View file

@ -0,0 +1,15 @@
class AbstractChecker(object):
    """
    Base class all checks derive from.

    A checker is constructed with a config object, exposes it read-only
    through ``config`` and has to override ``run()``.
    """

    def __init__(self, config):
        # stored privately; read access goes through the config property
        self._config = config

    @property
    def config(self):
        """The config object this checker was constructed with."""
        return self._config

    def run(self):
        """Execute the check routine and return its result. Must be overridden."""
        raise NotImplementedError

17
checks/config.py Normal file
View file

@ -0,0 +1,17 @@
class Config(object):
    """
    The configuration object handed to each check.

    At this point it only carries a collection of unique URLs.
    """

    def __init__(self, urls):
        # a set, so that a URL given or added twice is only kept once
        self._urls = set()
        self._urls.update(urls)

    @property
    def urls(self):
        """A fresh list of the URLs held so far (in no particular order)."""
        return [*self._urls]

    def add_url(self, url):
        """Add one URL; adding an already known URL changes nothing."""
        self._urls.add(url)

    def __repr__(self):
        return "Config(urls=%r)" % (self._urls,)

View file

@ -0,0 +1,84 @@
"""
This check makes sure that commonly used variations of a (sub)domain are resolvable.
Example: input_url = 'http://example.com'
will check: ['example.com', 'www.example.com']
Resolvable subdomains are added to config.urls.
Details on the resolution are returned as the result of the run() method.
"""
import logging
from socket import gethostbyname_ex
from urllib.parse import urlparse
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
    """
    Checks whether the plain and the 'www.' variant of each configured
    hostname can be resolved.
    """

    def run(self):
        """
        Executes the check routine, returns a list of result dicts.

        Every resolvable hostname is added to the config as
        'http://<hostname>/' so that following checks can use it.

        Returns one dict per tested hostname, in the format produced by
        resolve_hostname().
        """
        logging.debug("subdomain_variations.Checker.run() called with Config: %r", self.config)

        hostnames = self.expand_hostnames()
        results = self.resolve_hostnames(hostnames)

        # pass resolvable hostnames on as URLs for further checks
        for item in results:
            if item['resolvable']:
                self.config.add_url('http://%s/' % item['hostname'])

        return results

    def expand_hostnames(self):
        """
        Create variations of subdomains

        For every URL in the config, the hostname itself is collected, plus
        the same name with the 'www.' prefix removed or added.

        Returns a sorted list of unique hostnames.
        """
        hostnames = set()

        for url in self.config.urls:
            hostname = urlparse(url).hostname

            if not hostname:
                # urlparse yields no hostname e.g. for 'example.com' given
                # without a scheme; there is nothing to derive variants from.
                logging.debug("No hostname found in URL %r, skipping", url)
                continue

            hostnames.add(hostname)

            if hostname.startswith('www.'):
                # remove 'www.' prefix, unless nothing would be left
                bare = hostname[4:]
                if bare:
                    hostnames.add(bare)
            else:
                # add 'www.' prefix
                hostnames.add('www.' + hostname)

        return sorted(hostnames)

    def resolve_hostname(self, hostname):
        """
        Resolve one hostname to its IPv4 address(es)

        Returns a dict with the keys 'hostname' (as passed in), 'resolvable',
        'aliases' and 'ipv4_addresses'. Resolution failures are logged and
        reported as resolvable=False, they do not raise.
        """
        result = {
            'hostname': hostname,
            'resolvable': False,
            'aliases': [],
            'ipv4_addresses': [],
        }

        try:
            # The canonical name is not used; binding it to a separate name
            # keeps the requested hostname intact for the log message below.
            _canonical_name, aliases, ipv4_addresses = gethostbyname_ex(hostname)
        except (OSError, ValueError) as exc:
            # OSError covers socket.gaierror/herror; ValueError covers
            # hostnames which cannot be encoded (UnicodeError, embedded NUL).
            logging.debug("Hostname %s not resolvable. Exception: %r", hostname, exc)
        else:
            result['resolvable'] = True
            result['aliases'] = aliases
            result['ipv4_addresses'] = ipv4_addresses

        return result

    def resolve_hostnames(self, hostnames):
        """
        Resolve several hostnames

        Returns a list with one resolve_hostname() result per given
        hostname, in the same order.
        """
        return [self.resolve_hostname(hostname) for hostname in hostnames]

View file

@ -21,30 +21,11 @@ from google.cloud import datastore
import jobs
import config
import checks
DATASTORE_CLIENT = None
def derive_test_hostnames(hostname):
    """
    Return the host name variants to test for the given host name.

    Both 'gruene-x.de' and 'www.gruene-x.de' result in
    ['gruene-x.de', 'www.gruene-x.de'], as either one is a plausible
    web address for a domain.
    """
    prefix = 'www.'

    # the counterpart is the same name with the prefix toggled
    if hostname.startswith(prefix):
        counterpart = hostname[len(prefix):]
    else:
        counterpart = prefix + hostname

    variants = list({hostname, counterpart})
    variants.sort()
    return variants
def reduce_urls(urllist):
"""
@ -191,16 +172,15 @@ def check_content(req):
return result
def collect_ipv4_addresses(hostname_results):
    """
    Return list of unique IPv4 addresses

    hostname_results is an iterable of dicts, one per hostname. Items
    without an 'ipv4_addresses' key are skipped.

    Returns the addresses of all items as a sorted list without duplicates.
    """
    ips = set()

    for item in hostname_results:
        if 'ipv4_addresses' not in item:
            continue
        ips.update(item['ipv4_addresses'])

    return sorted(ips)
@ -219,6 +199,7 @@ def parse_generator(generator):
return "joomla"
return generator
def check_site(entry):
"""
Performs our site check and returns results as a dict.
@ -273,45 +254,29 @@ def check_site(entry):
'score': 0.0,
}
# derive hostnames to test (with/without www.)
parsed = urlparse(entry['url'])
hostnames = derive_test_hostnames(parsed.hostname)
# Results from our next generation checkers
nextgen_results = checks.perform_checks(entry['url'])
# try to resolve hostnames
processed_hostnames = {}
for hostname in hostnames:
result['details']['hostnames'] = nextgen_results['subdomain_variations']
logging.debug("result[details][hostnames]: %r" % result['details']['hostnames'])
processed_hostnames[hostname] = {
'resolvable': False,
}
result['details']['ipv4_addresses'] = collect_ipv4_addresses(nextgen_results['subdomain_variations'])
logging.debug("result[details][ipv4_addresses]: %r" % result['details']['ipv4_addresses'])
try:
hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
processed_hostnames[hostname]['resolvable'] = True
processed_hostnames[hostname]['resolved_hostname'] = hostname
processed_hostnames[hostname]['aliases'] = aliases
processed_hostnames[hostname]['ip_addresses'] = ip_addresses
except:
pass
result['details']['hostnames'] = processed_hostnames
result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)
time.sleep(5)
# check basic HTTP(S) reachability
checked_urls = []
checked_urls_set = set()
for hostname in processed_hostnames.keys():
item = processed_hostnames[hostname]
for item in result['details']['hostnames']:
if not item['resolvable']:
continue
for scheme in ('http', 'https'):
url = scheme + '://' + item['resolved_hostname'] + '/'
url = scheme + '://' + item['hostname'] + '/'
if url in checked_urls_set:
continue
@ -484,8 +449,7 @@ def check_site(entry):
# WWW_OPTIONAL
num_hostnames = 0
for hostname in result['details']['hostnames'].keys():
item = result['details']['hostnames'][hostname]
for item in result['details']['hostnames']:
if not item['resolvable']:
continue
num_hostnames += 1
@ -562,7 +526,7 @@ def work_of_queue():
logging.info("Starting job %s", job["url"])
result = check_site(entry=job)
#logging.debug(result)
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])