mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-02 00:54:52 +02:00
Start refactoring checks
This commit is contained in:
parent
3e1c0ede73
commit
a2287388dd
35
checks/__init__.py
Normal file
35
checks/__init__.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
"""
|
||||
The checks module contains the individual checks we perform with a page
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from checks import subdomain_variations
|
||||
#from checks import home_url_canonicalization
|
||||
#from checks import http_and_https
|
||||
|
||||
from checks.config import Config
|
||||
|
||||
|
||||
def perform_checks(input_url):
    """
    Run all enabled checks in sequence and collect their results.

    Each check receives the configuration left behind by its
    predecessor, so earlier checks can feed newly discovered URLs
    to the ones that follow. Returns a dict mapping check name to
    that check's result.
    """
    # Ordered list of (result key, check module) pairs.
    enabled_checks = [
        ('subdomain_variations', subdomain_variations),
        #("home_url_canonicalization", home_url_canonicalization),
        #("http_and_https", http_and_https),
    ]

    config = Config(urls=[input_url])
    results = {}

    for name, module in enabled_checks:
        checker = module.Checker(config)
        results[name] = checker.run()
        # Hand the (possibly extended) config on to the next check.
        config = checker.config

    return results
|
15
checks/abstract_checker.py
Normal file
15
checks/abstract_checker.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
class AbstractChecker(object):
    """
    Base class every check implements.

    A concrete checker is constructed with a Config instance, does its
    work in run(), and exposes the (possibly modified) configuration
    via the config property afterwards.
    """

    def __init__(self, config):
        # Shared configuration, passed along the check pipeline.
        self._config = config

    @property
    def config(self):
        """The configuration this checker was created with."""
        return self._config

    def run(self):
        """Executes the check routine, returns result dict"""
        raise NotImplementedError()
|
17
checks/config.py
Normal file
17
checks/config.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
class Config(object):
    """
    Mutable configuration shared between checks.

    Holds a de-duplicated collection of URLs that checks may read
    and extend as they discover new ones.
    """

    def __init__(self, urls):
        # A set silently drops duplicate URLs.
        self._urls = set(urls)

    def __repr__(self):
        return "Config(urls=%r)" % self._urls

    def add_url(self, url):
        """Add a single URL; adding a known URL is a no-op."""
        self._urls.add(url)

    @property
    def urls(self):
        """All known URLs as a list (iteration order is unspecified)."""
        return list(self._urls)
|
84
checks/subdomain_variations.py
Normal file
84
checks/subdomain_variations.py
Normal file
|
@ -0,0 +1,84 @@
|
|||
"""
|
||||
This check makes sure that commonly used variations of a (sub)domain are resolvable.
|
||||
|
||||
Example: input_url = 'http://example.com'
|
||||
will check: ['example.com', 'www.example.com']
|
||||
|
||||
Resolvable subdomains are added to config.urls.
|
||||
|
||||
Details on the resolution are returned as a result from the run() method.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from socket import gethostbyname_ex
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
class Checker(AbstractChecker):
    """
    Checks whether common variations of a (sub)domain are resolvable.

    Resolvable hostnames are fed back into the shared config as URLs
    so later checks can pick them up.
    """

    def __init__(self, config):
        super().__init__(config)

    def run(self):
        """Executes the check routine, returns result dict"""
        # Lazy %-style args: the message is only formatted when DEBUG
        # logging is actually enabled.
        logging.debug("subdomain_variations.Checker.run() called with Config: %r", self.config)

        hostnames = self.expand_hostnames()

        results = self.resolve_hostnames(hostnames)

        # pass resolvable hostnames on as URLs for further checks
        for item in results:
            if item['resolvable']:
                self.config.add_url('http://%s/' % item['hostname'])

        return results

    def expand_hostnames(self):
        """
        Create variations of subdomains.

        For every configured URL, emits both the bare hostname and its
        'www.'-prefixed twin, de-duplicated and sorted.
        """
        hostnames = set()

        for url in self.config.urls:
            parsed = urlparse(url)
            # NOTE(review): parsed.hostname is None for URLs without a
            # netloc — assumes all configured URLs carry one; confirm.
            hostnames.add(parsed.hostname)
            if parsed.hostname.startswith('www.'):
                # remove 'www.' prefix
                hostnames.add(parsed.hostname[4:])
            else:
                # add 'www.' prefix
                hostnames.add('www.' + parsed.hostname)

        # sorted() accepts the set directly; no list() needed first.
        return sorted(hostnames)

    def resolve_hostname(self, hostname):
        """
        Resolve one hostname to IPv4 address(es).

        Returns a dict with keys 'hostname', 'resolvable', 'aliases'
        and 'ipv4_addresses'; 'resolvable' stays False when the DNS
        lookup fails.
        """
        result = {
            'hostname': hostname,
            'resolvable': False,
            'aliases': [],
            'ipv4_addresses': [],
        }

        try:
            _canonical, aliases, ipv4_addresses = gethostbyname_ex(hostname)
        except (OSError, UnicodeError) as e:
            # Narrowed from a bare Exception: gethostbyname_ex raises
            # OSError subclasses (socket.gaierror/herror) for lookup
            # failures and UnicodeError for malformed hostnames.
            logging.debug("Hostname %s not resolvable. Exception: %r", hostname, e)
        else:
            result['resolvable'] = True
            result['aliases'] = aliases
            result['ipv4_addresses'] = ipv4_addresses

        return result

    def resolve_hostnames(self, hostnames):
        """Resolve every hostname, preserving input order."""
        return [self.resolve_hostname(hostname) for hostname in hostnames]
|
70
spider.py
70
spider.py
|
@ -21,30 +21,11 @@ from google.cloud import datastore
|
|||
|
||||
import jobs
|
||||
import config
|
||||
import checks
|
||||
|
||||
DATASTORE_CLIENT = None
|
||||
|
||||
|
||||
def derive_test_hostnames(hostname):
|
||||
"""
|
||||
Derives the hostnames variants to test for a given host name.
|
||||
From 'gruene-x.de' or 'www.gruene-x.de' it makes
|
||||
|
||||
['gruene-x.de', 'www.gruene-x.de']
|
||||
|
||||
which are both plausible web URLs to be used for a domain.
|
||||
"""
|
||||
|
||||
hostnames = set()
|
||||
|
||||
hostnames.add(hostname)
|
||||
if hostname.startswith('www.'):
|
||||
hostnames.add(hostname[4:])
|
||||
else:
|
||||
hostnames.add('www.' + hostname)
|
||||
|
||||
return sorted(list(hostnames))
|
||||
|
||||
|
||||
def reduce_urls(urllist):
|
||||
"""
|
||||
|
@ -191,16 +172,15 @@ def check_content(req):
|
|||
return result
|
||||
|
||||
|
||||
def collect_ipv4_addresses(hostname_dict):
|
||||
def collect_ipv4_addresses(hostname_results):
|
||||
"""
|
||||
Return list of unique IPv4 addresses
|
||||
"""
|
||||
ips = set()
|
||||
for item in hostname_dict.values():
|
||||
if 'ip_addresses' not in item:
|
||||
for item in hostname_results:
|
||||
if 'ipv4_addresses' not in item:
|
||||
continue
|
||||
for ip_addr in item['ip_addresses']:
|
||||
ips.add(ip_addr)
|
||||
ips = ips | set(item['ipv4_addresses']) # union
|
||||
return sorted(list(ips))
|
||||
|
||||
|
||||
|
@ -219,6 +199,7 @@ def parse_generator(generator):
|
|||
return "joomla"
|
||||
return generator
|
||||
|
||||
|
||||
def check_site(entry):
|
||||
"""
|
||||
Performs our site check and returns results as a dict.
|
||||
|
@ -273,45 +254,29 @@ def check_site(entry):
|
|||
'score': 0.0,
|
||||
}
|
||||
|
||||
# derive hostnames to test (with/without www.)
|
||||
parsed = urlparse(entry['url'])
|
||||
hostnames = derive_test_hostnames(parsed.hostname)
|
||||
# Results from our next generation checkers
|
||||
nextgen_results = checks.perform_checks(entry['url'])
|
||||
|
||||
# try to resolve hostnames
|
||||
processed_hostnames = {}
|
||||
for hostname in hostnames:
|
||||
result['details']['hostnames'] = nextgen_results['subdomain_variations']
|
||||
logging.debug("result[details][hostnames]: %r" % result['details']['hostnames'])
|
||||
|
||||
processed_hostnames[hostname] = {
|
||||
'resolvable': False,
|
||||
}
|
||||
result['details']['ipv4_addresses'] = collect_ipv4_addresses(nextgen_results['subdomain_variations'])
|
||||
logging.debug("result[details][ipv4_addresses]: %r" % result['details']['ipv4_addresses'])
|
||||
|
||||
try:
|
||||
hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
|
||||
processed_hostnames[hostname]['resolvable'] = True
|
||||
processed_hostnames[hostname]['resolved_hostname'] = hostname
|
||||
processed_hostnames[hostname]['aliases'] = aliases
|
||||
processed_hostnames[hostname]['ip_addresses'] = ip_addresses
|
||||
except:
|
||||
pass
|
||||
|
||||
result['details']['hostnames'] = processed_hostnames
|
||||
|
||||
result['details']['ipv4_addresses'] = collect_ipv4_addresses(processed_hostnames)
|
||||
time.sleep(5)
|
||||
|
||||
# check basic HTTP(S) reachability
|
||||
checked_urls = []
|
||||
checked_urls_set = set()
|
||||
|
||||
for hostname in processed_hostnames.keys():
|
||||
|
||||
item = processed_hostnames[hostname]
|
||||
for item in result['details']['hostnames']:
|
||||
|
||||
if not item['resolvable']:
|
||||
continue
|
||||
|
||||
for scheme in ('http', 'https'):
|
||||
|
||||
url = scheme + '://' + item['resolved_hostname'] + '/'
|
||||
url = scheme + '://' + item['hostname'] + '/'
|
||||
|
||||
if url in checked_urls_set:
|
||||
continue
|
||||
|
@ -484,8 +449,7 @@ def check_site(entry):
|
|||
|
||||
# WWW_OPTIONAL
|
||||
num_hostnames = 0
|
||||
for hostname in result['details']['hostnames'].keys():
|
||||
item = result['details']['hostnames'][hostname]
|
||||
for item in result['details']['hostnames']:
|
||||
if not item['resolvable']:
|
||||
continue
|
||||
num_hostnames += 1
|
||||
|
@ -562,7 +526,7 @@ def work_of_queue():
|
|||
|
||||
logging.info("Starting job %s", job["url"])
|
||||
result = check_site(entry=job)
|
||||
#logging.debug(result)
|
||||
|
||||
logging.info("Job %s finished checks", job["url"])
|
||||
logging.info("Job %s writing to DB", job["url"])
|
||||
|
||||
|
|
Loading…
Reference in a new issue