# coding: utf8

from bs4 import BeautifulSoup
from git import Repo
from multiprocessing import Pool
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse

import certifi
import json
import logging
import os
import random
import re
import requests
import shutil
import sys
import yaml


# configuration

# number of parallel processes to use for crawling
concurrency = 4

# connection timeout for website checks (seconds)
connect_timeout = 5

# response timeout for website checks (seconds)
read_timeout = 10

# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'

# folder in that repo that holds the data
green_directory_data_path = 'data'

# local path the repo is cloned to
green_directory_local_path = './cache/green-directory'

# folder the spider result is written to
result_path = './webapp/dist/data'

# end configuration


def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)


def dir_entries():
    """
    Iterator over all data documents in the cloned green directory
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:

            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue

            with open(filepath, 'r') as yamlfile:
                # a single file may contain multiple YAML documents
                for doc in yaml.load_all(yamlfile):
                    yield doc


def repr_entry(entry):
    """
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r


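# Entry structure as used by this script: a required 'type' field, optional
# 'level', 'state' and 'district' fields (used above for logging), and
# optionally a 'urls' list of dicts with 'type' and 'url' keys, which main()
# filters for type "WEBSITE".

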
def derive_test_hostnames(hostname):
    """
    Derives the hostname variants to test for a given host name.
    From 'gruene-x.de' or 'www.gruene-x.de' it makes

    ['gruene-x.de', 'www.gruene-x.de']

    which are both plausible web URLs to be used for a domain.
    """

    hostnames = set()

    hostnames.add(hostname)
    if hostname.startswith('www.'):
        hostnames.add(hostname[4:])
    else:
        hostnames.add('www.' + hostname)

    return list(hostnames)


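# Example (using the hostname from the docstring above):
#
#   derive_test_hostnames('www.gruene-x.de')
#   # -> ['www.gruene-x.de', 'gruene-x.de']  (order may vary, since a set is used)

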
def reduce_urls(urllist):
    """
    Reduce a list of urls with metadata by eliminating those
    that either don't work or lead somewhere else
    """
    targets = set()
    for u in urllist:
        if u['error'] is not None:
            continue
        if u['redirects_to'] is not None:
            targets.add(u['redirects_to'])
        else:
            targets.add(u['url'])
    return list(targets)


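# Example with two hypothetical HEAD-check records as produced in check_site():
#
#   reduce_urls([
#       {'url': 'http://gruene-x.de/', 'error': None, 'redirects_to': 'https://gruene-x.de/'},
#       {'url': 'https://gruene-x.de/', 'error': None, 'redirects_to': None},
#   ])
#   # -> ['https://gruene-x.de/']

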
def normalize_title(s):
    """
    Removes garbage from HTML page titles
    """
    # replace non-breaking spaces, collapse double spaces, trim whitespace
    s = s.replace('\u00a0', ' ')
    s = s.replace('  ', ' ')
    s = s.strip()
    return s


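# Illustrative call (with a made-up title string):
#
#   normalize_title('Grüne Musterstadt\u00a0 ')
#   # -> 'Grüne Musterstadt'

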
def check_content(r):
    """
    Collects details about the content of the page

    r: requests response object for the page
    """
    result = {}

    result['encoding'] = r.encoding
    soup = BeautifulSoup(r.text, 'html.parser')

    # page title
    result['title'] = None
    title = soup.find('head').find('title')
    if title is not None:
        result['title'] = normalize_title(title.get_text())

    # canonical link
    result['canonical_link'] = None
    link = soup.find('link', rel='canonical')
    if link:
        result['canonical_link'] = urljoin(r.url, link.get('href'))

    # icon
    result['icon'] = None
    link = soup.find('link', rel='icon')
    if link:
        result['icon'] = urljoin(r.url, link.get('href'))
    else:
        link = soup.find('link', rel='shortcut icon')
        if link:
            result['icon'] = urljoin(r.url, link.get('href'))

    # feed links
    result['feeds'] = []
    rss_links = soup.find_all('link', type='application/rss+xml')
    atom_links = soup.find_all('link', type='application/atom+xml')

    if len(rss_links) > 0:
        for l in rss_links:
            result['feeds'].append(urljoin(r.url, l.get('href')))
    if len(atom_links) > 0:
        # iterate the Atom links, not the RSS links again
        for l in atom_links:
            result['feeds'].append(urljoin(r.url, l.get('href')))

    # generator meta tag
    result['generator'] = None
    generator = soup.head.select('[name=generator]')
    if len(generator):
        result['generator'] = generator[0].get('content')

    # opengraph meta tags
    result['opengraph'] = None
    og = set()
    for item in soup.head.find_all(property=re.compile('^og:')):
        og.add(item.get('property'))
    for item in soup.head.find_all(itemprop=re.compile('^og:')):
        og.add(item.get('itemprop'))
    if len(og):
        result['opengraph'] = list(og)

    return result


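# The dict returned by check_content() has the keys 'encoding', 'title',
# 'canonical_link', 'icon', 'feeds', 'generator' and 'opengraph'. Values that
# could not be determined are None ('feeds' defaults to an empty list).

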
def check_site(url):
    """
    Performs our site check and returns results as a dict.

    1. Normalize the input URL and derive the URLs to check
    2. HEAD the check URLs
    3. Determine the canonical URL(s)
    4. Run the full check on the canonical URL(s)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    result = {
        'input_url': url,
        'hostnames': [],
        'resolvable_urls': [],
        'canonical_urls': [],
        'urlchecks': [],
    }

    # derive hostnames to test
    parsed = urlparse(url)
    hostnames = derive_test_hostnames(parsed.hostname)

    processed_hostnames = []
    for hn in hostnames:

        record = {
            'input_hostname': hn,
            'resolvable': False,
        }

        try:
            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
            record['resolvable'] = True
            record['resolved_hostname'] = hostname
            record['aliases'] = aliases
            record['ip_addresses'] = ip_addresses
        except Exception:
            # hostname does not resolve
            pass

        processed_hostnames.append(record)

    result['hostnames'] = sorted(processed_hostnames, key=lambda hn: hn['input_hostname'])

    checked_urls = []
    for item in processed_hostnames:
        if not item['resolvable']:
            continue

        for scheme in ('http', 'https'):

            record = {
                'url': scheme + '://' + item['resolved_hostname'] + '/',
                'error': None,
                'redirects_to': None,
            }

            try:
                r = requests.head(record['url'], headers=headers, allow_redirects=True)
                # compare against the URL we requested to detect redirects
                if r.url == record['url']:
                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
                else:
                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                    record['redirects_to'] = r.url
            except Exception as e:
                record['error'] = {
                    'type': str(type(e)),
                    'message': str(e),
                }
                logging.info("URL %s: %s %s" % (url, str(type(e)), e))

            checked_urls.append(record)

    result['resolvable_urls'] = sorted(checked_urls, key=lambda url: url['url'])
    result['canonical_urls'] = sorted(reduce_urls(checked_urls))

    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['canonical_urls']:

        logging.info("Checking URL %s" % check_url)

        check = {
            'url': check_url,
            'status_code': None,
            'duration': None,
            'error': None,
            'content': None,
        }

        try:
            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
            check['status_code'] = r.status_code
            # response time in milliseconds
            check['duration'] = round(r.elapsed.total_seconds() * 1000)

            # Content checks
            if r.status_code < 300:
                check['content'] = check_content(r)

        # ReadTimeout is a subclass of Timeout and must be caught first
        except requests.exceptions.ReadTimeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "read_timeout"
        except requests.exceptions.ConnectionError as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection"
        except requests.exceptions.Timeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection_timeout"
        except Exception as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "unknown"

        result['urlchecks'].append(check)

    result['urlchecks'] = sorted(result['urlchecks'], key=lambda url: url['url'])

    return result


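# check_site() returns a dict with the keys 'input_url', 'hostnames',
# 'resolvable_urls', 'canonical_urls' and 'urlchecks'. main() below calls it
# once per website URL, either directly or via a multiprocessing pool.

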
def main():
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    get_green_directory()

    urls = []
    for entry in dir_entries():

        if 'type' not in entry:
            logging.error("Entry without type")
            continue

        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
            continue

        website_url = None
        for n in range(len(entry['urls'])):
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
            except KeyError:
                logging.error("Error in %s: 'type' or 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
        if website_url:
            urls.append(website_url)

    # randomize the order in which sites are checked
    random.seed()
    random.shuffle(urls)

    results = {}

    if concurrency > 1:
        pool = Pool(concurrency)
        for url in urls:
            results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
    else:
        for url in urls:
            results[url] = check_site(url)

    results2 = []
    done = set()

    # convert results from ApplyResult objects to plain dicts
    for url in sorted(results.keys()):
        if url not in done:
            if concurrency > 1:
                results2.append(results[url].get())
            else:
                results2.append(results[url])
            done.add(url)

    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(results2, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)


if __name__ == "__main__":
    main()