green-spider/spider.py

# coding: utf8
from git import Repo
from multiprocessing import Pool
from socket import gethostbyname_ex
from urllib.parse import urlparse
import certifi
import json
import logging
import os
import random
import requests
import shutil
import sys
import yaml

# configuration

# number of parallel processes to use for crawling
concurrency = 4

# connection timeout for website checks (seconds)
connect_timeout = 5

# response timeout for website checks (seconds)
read_timeout = 10

# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'

# folder in that repo that holds the data
green_directory_data_path = 'data'

green_directory_local_path = './cache/green-directory'

result_path = './webapp/dist/data'

# end configuration


def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)
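
# Note: get_green_directory() removes any existing checkout in
# ./cache/green-directory and re-clones, so every run works on fresh data.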

def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r', encoding='utf8') as yamlfile:
                # use the safe loader so plain data files can't execute arbitrary tags
                for doc in yaml.safe_load_all(yamlfile):
                    yield doc
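
# Illustrative shape of one directory entry as yielded by dir_entries().
# The keys match what the code below expects; the values are made up:
#
#     {
#         'type': 'REGIONAL_CHAPTER',
#         'level': 'DE:KREISVERBAND',
#         'state': 'Bayern',
#         'district': 'Musterstadt',
#         'urls': [
#             {'type': 'WEBSITE', 'url': 'https://gruene-musterstadt.de/'},
#         ],
#     }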

def repr_entry(entry):
    """
    Return string representation of a directory entry,
    for logging/debugging purposes
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r
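
# For the illustrative entry sketched above, repr_entry() would return
# 'REGIONAL_CHAPTER/DE:KREISVERBAND/Bayern/Musterstadt'.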

def derive_test_hostnames(hostname):
    """
    Derives the hostname variants to test for a given host name.
    From 'gruene-x.de' or 'www.gruene-x.de' it makes
    ['gruene-x.de', 'www.gruene-x.de'],
    which are both plausible web URLs to be used for a domain.
    """
    hostnames = set()
    hostnames.add(hostname)
    if hostname.startswith('www.'):
        hostnames.add(hostname[4:])
    else:
        hostnames.add('www.' + hostname)
    return list(hostnames)
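
# Example (from the docstring above):
#
#     derive_test_hostnames('gruene-x.de')      # -> ['gruene-x.de', 'www.gruene-x.de']
#     derive_test_hostnames('www.gruene-x.de')  # same result; order is not guaranteed,
#                                               # since a set is used internally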

def reduce_urls(urllist):
    """
    Reduce a list of URLs with metadata by eliminating those
    that either don't work or lead somewhere else
    """
    targets = set()
    for u in urllist:
        if u['error'] is not None:
            continue
        if u['redirects_to'] is not None:
            targets.add(u['redirects_to'])
        else:
            targets.add(u['url'])
    return list(targets)
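
# Illustrative input/output for reduce_urls(), using records shaped like the
# ones produced in check_site() below (values made up):
#
#     reduce_urls([
#         {'url': 'http://gruene-x.de/', 'error': None, 'redirects_to': 'https://www.gruene-x.de/'},
#         {'url': 'https://www.gruene-x.de/', 'error': None, 'redirects_to': None},
#         {'url': 'https://gruene-x.de/', 'error': {'type': '...'}, 'redirects_to': None},
#     ])
#     # -> ['https://www.gruene-x.de/']  (duplicate targets collapse via the set)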

def check_site(url):
    """
    Performs our site check and returns results as a dict.

    1. Normalize the input URL and derive the URLs to check for
    2. HEAD the check urls
    3. Determine the canonical URL
    4. Run full check on canonical URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    result = {
        'input_url': url,
        'hostnames': [],
        'resolvable_urls': [],
        'canonical_urls': [],
        'urlchecks': [],
    }

    # derive hostnames to test
    parsed = urlparse(url)
    hostnames = derive_test_hostnames(parsed.hostname)

    processed_hostnames = []
    for hn in hostnames:
        record = {
            'input_hostname': hn,
            'resolvable': False,
        }
        try:
            hostname, aliases, ip_addresses = gethostbyname_ex(hn)
            record['resolvable'] = True
            record['resolved_hostname'] = hostname
            record['aliases'] = aliases
            record['ip_addresses'] = ip_addresses
        except Exception:
            # resolution failed; keep the record with resolvable=False
            pass
        processed_hostnames.append(record)

    result['hostnames'] = sorted(processed_hostnames, key=lambda hn: hn['input_hostname'])

    # Check which of the derived URLs respond, and whether they redirect elsewhere
    checked_urls = []
    for item in processed_hostnames:
        if not item['resolvable']:
            continue
        for scheme in ('http', 'https'):
            record = {
                'url': scheme + '://' + item['resolved_hostname'] + '/',
                'error': None,
                'redirects_to': None,
            }
            try:
                r = requests.head(record['url'], headers=headers, allow_redirects=True,
                                  timeout=(connect_timeout, read_timeout))
                # compare against the URL we actually requested to detect redirects
                if r.url == record['url']:
                    logging.info("URL: %s - status %s - no redirect" % (record['url'], r.status_code))
                else:
                    logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
                    record['redirects_to'] = r.url
            except Exception as e:
                record['error'] = {
                    'type': str(type(e)),
                    'message': str(e),
                }
                logging.info("URL %s: %s %s" % (url, str(type(e)), e))
            checked_urls.append(record)

    result['resolvable_urls'] = sorted(checked_urls, key=lambda u: u['url'])
    result['canonical_urls'] = sorted(reduce_urls(checked_urls))

    # Deeper test for the remaining (canonical) URL(s)
    for check_url in result['canonical_urls']:
        logging.info("Checking URL %s" % check_url)
        check = {
            'url': check_url,
            'status_code': None,
            'duration': None,
            'error': None,
        }
        try:
            r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
            check['status_code'] = r.status_code
            # response time in milliseconds
            check['duration'] = round(r.elapsed.total_seconds() * 1000)
        except requests.exceptions.ConnectionError as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection"
        # ReadTimeout is a subclass of Timeout, so it has to be caught first
        except requests.exceptions.ReadTimeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "read_timeout"
        except requests.exceptions.Timeout as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "connection_timeout"
        except Exception as e:
            logging.error(str(e) + " " + check_url)
            check['error'] = "unknown"
        result['urlchecks'].append(check)

    result['urlchecks'] = sorted(result['urlchecks'], key=lambda u: u['url'])

    return result
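
# Sketch of the dict returned by check_site() (field names as set above,
# values are illustrative):
#
#     {
#         'input_url': 'http://gruene-x.de/',
#         'hostnames': [{'input_hostname': 'gruene-x.de', 'resolvable': True, ...}, ...],
#         'resolvable_urls': [{'url': 'http://gruene-x.de/', 'error': None, 'redirects_to': ...}, ...],
#         'canonical_urls': ['https://www.gruene-x.de/'],
#         'urlchecks': [{'url': 'https://www.gruene-x.de/', 'status_code': 200, 'duration': 123, 'error': None}],
#     }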


def main():
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("urllib3").setLevel(logging.CRITICAL)

    get_green_directory()

    urls = []
    for entry in dir_entries():
        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
            continue
        website_url = None
        for n in range(len(entry['urls'])):
            # a missing 'type' or 'url' key raises KeyError, not NameError
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
            except KeyError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
        if website_url:
            urls.append(website_url)

    random.seed()
    random.shuffle(urls)

    results = {}

    if concurrency > 1:
        pool = Pool(concurrency)
        for url in urls:
            results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
    else:
        for url in urls:
            results[url] = check_site(url)
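
    # In the multiprocessing case, results maps each URL to an AsyncResult
    # handle; the actual result dicts are retrieved below via .get().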

    results2 = []
    done = set()

    # convert results from ApplyResult objects to plain dicts
    for url in sorted(results.keys()):
        if url not in done:
            if concurrency > 1:
                results2.append(results[url].get())
            else:
                # single-process mode already stored the plain result dict
                results2.append(results[url])
            done.add(url)

    # Write result as JSON
    output_filename = os.path.join(result_path, "spider_result.json")
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(results2, jsonfile, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()