green-spider/spider.py

# coding: utf8
from git import Repo
from multiprocessing import Pool
from socket import gethostbyname_ex
from urllib.parse import urlparse
import certifi
import json
import logging
import os
import random
import requests
import shutil
import sys
import yaml

# configuration

# number of parallel processes to use for crawling
concurrency = 4

# connection timeout for website checks (seconds)
connect_timeout = 5

# response timeout for website checks (seconds)
read_timeout = 10

# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'

# folder in that repo that holds the data
green_directory_data_path = 'data'

green_directory_local_path = './cache/green-directory'

result_path = './webapp/dist/data'

# end configuration


def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)
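
# Note: get_green_directory() removes any existing checkout in
# ./cache/green-directory and re-clones, so every run works on fresh data.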

def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r', encoding='utf8') as yamlfile:
                # use the safe loader so plain data files can't execute arbitrary tags
                for doc in yaml.safe_load_all(yamlfile):
                    yield doc
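
# Illustrative shape of one directory entry as yielded by dir_entries().
# The keys match what the code below expects; the values are made up:
#
#     {
#         'type': 'REGIONAL_CHAPTER',
#         'level': 'DE:KREISVERBAND',
#         'state': 'Bayern',
#         'district': 'Musterstadt',
#         'urls': [
#             {'type': 'WEBSITE', 'url': 'https://gruene-musterstadt.de/'},
#         ],
#     }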

def repr_entry(entry):
    """
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r
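
# For the illustrative entry sketched above, repr_entry() would return
# 'REGIONAL_CHAPTER/DE:KREISVERBAND/Bayern/Musterstadt'.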

def derive_test_hostnames(hostname):
    """
    Derives the hostname variants to test for a given host name.
    From 'gruene-x.de' or 'www.gruene-x.de' it makes
    ['gruene-x.de', 'www.gruene-x.de'],
    which are both plausible web URLs to be used for a domain.
    """
    hostnames = set()
    hostnames.add(hostname)
    if hostname.startswith('www.'):
        hostnames.add(hostname[4:])
    else:
        hostnames.add('www.' + hostname)
    return list(hostnames)
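
# Example (from the docstring above):
#
#     derive_test_hostnames('gruene-x.de')      # -> ['gruene-x.de', 'www.gruene-x.de']
#     derive_test_hostnames('www.gruene-x.de')  # same result; order is not guaranteed,
#                                               # since a set is used internally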

def reduce_urls(urllist):
    """
    Reduce a list of URLs with metadata by eliminating those
    that either don't work or lead somewhere else
    """
    targets = set()
    for u in urllist:
        if u['error'] is not None:
            continue
        if u['redirects_to'] is not None:
            targets.add(u['redirects_to'])
        else:
            targets.add(u['url'])
    return list(targets)
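
# Illustrative input/output for reduce_urls(), using records shaped like the
# ones produced in check_site() below (values made up):
#
#     reduce_urls([
#         {'url': 'http://gruene-x.de/', 'error': None, 'redirects_to': 'https://www.gruene-x.de/'},
#         {'url': 'https://www.gruene-x.de/', 'error': None, 'redirects_to': None},
#         {'url': 'https://gruene-x.de/', 'error': {'type': '...'}, 'redirects_to': None},
#     ])
#     # -> ['https://www.gruene-x.de/']  (duplicate targets collapse via the set)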

def check_site(url):
    """
    Performs our site check and returns results as a dict.

    1. Normalize the input URL and derive the URLs to check for
    2. HEAD the check urls
    3. Determine the canonical URL
    4. Run full check on canonical URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    result = {
        'input_url': url,
        'hostnames': [],
        'resolvable_urls': [],
        'canonical_urls': [],
        'urlchecks': [],
    }

    # derive hostnames to test
    parsed = urlparse(url)
    hostnames = derive_test_hostnames(parsed.hostname)

    processed_hostnames = []
    for hn in hostnames:
        record = {
            'input_hostname': hn,
            'resolvable': False,
        }
        try:
            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
            record['resolvable'] = True
            record['resolved_hostname'] = hostname
            record['aliases'] = aliases
            record['ip_addresses'] = ip_addresses
        except Exception:
            # resolution failed; keep the record with resolvable=False
            pass
        processed_hostnames.append(record)

    result['hostnames'] = sorted(processed_hostnames, key=lambda hn: hn['input_hostname'])

    # Check which of the derived URLs respond, and whether they redirect elsewhere
    checked_urls = []
    for item in processed_hostnames:
        if not item['resolvable']:
            continue
        for scheme in ('http', 'https'):
            record = {
                'url': scheme + '://' + item['resolved_hostname'] + '/',
                'error': None,
                'redirects_to': None,
            }
            try:
                r = requests.head(record['url'], headers=headers, allow_redirects=True,
                                  timeout=(connect_timeout, read_timeout))
                # compare against the URL we actually requested to detect redirects
                if r.url == record['url']:
                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
                else:
                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                    record['redirects_to'] = r.url
            except Exception as e:
                record['error'] = {
                    'type': str(type(e)),
                    'message': str(e),
                }
                logging.info("URL %s: %s %s" % (url, str(type(e)), e))
            checked_urls.append(record)

    result['resolvable_urls'] = sorted(checked_urls, key=lambda u: u['url'])
    result['canonical_urls'] = sorted(reduce_urls(checked_urls))

    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['canonical_urls']:
        logging.info("Checking URL %s" % check_url)
        check = {
            'url': check_url,
            'status_code': None,
            'duration': None,
            'error': None,
        }
        try:
            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
            check['status_code'] = r.status_code
            # response time in milliseconds
            check['duration'] = round(r.elapsed.total_seconds() * 1000)
        except requests.exceptions.ConnectionError as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection"
        # ReadTimeout is a subclass of Timeout, so it has to be caught first
        except requests.exceptions.ReadTimeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "read_timeout"
        except requests.exceptions.Timeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection_timeout"
        except Exception as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "unknown"
        result['urlchecks'].append(check)

    result['urlchecks'] = sorted(result['urlchecks'], key=lambda u: u['url'])

    return result
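
# Sketch of the dict returned by check_site() (field names as set above,
# values are illustrative):
#
#     {
#         'input_url': 'http://gruene-x.de/',
#         'hostnames': [{'input_hostname': 'gruene-x.de', 'resolvable': True, ...}, ...],
#         'resolvable_urls': [{'url': 'http://gruene-x.de/', 'error': None, 'redirects_to': ...}, ...],
#         'canonical_urls': ['https://www.gruene-x.de/'],
#         'urlchecks': [{'url': 'https://www.gruene-x.de/', 'status_code': 200, 'duration': 123, 'error': None}],
#     }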


def main():
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    get_green_directory()

    urls = []
    for entry in dir_entries():
        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
            continue
        website_url = None
        for n in range(len(entry['urls'])):
            # a missing 'type' or 'url' key raises KeyError, not NameError
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
            except KeyError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
        if website_url:
            urls.append(website_url)

    random.seed()
    random.shuffle(urls)

    results = {}

    if concurrency > 1:
        pool = Pool(concurrency)
        for url in urls:
            results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
    else:
        for url in urls:
            results[url] = check_site(url)
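
    # In the multiprocessing case, results maps each URL to an AsyncResult
    # handle; the actual result dicts are retrieved below via .get().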

    results2 = []
    done = set()

    # convert results from ApplyResult objects to plain dicts
    for url in sorted(results.keys()):
        if url not in done:
            if concurrency > 1:
                results2.append(results[url].get())
            else:
                # single-process mode already stored the plain result dict
                results2.append(results[url])
            done.add(url)

    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(results2, jsonfile, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()