First working code and results

pull/1/head
Marian Steinbach 4 years ago
parent 47b468b2a0
commit 220a6ba629
1. .gitignore (2 lines changed)
2. README.md (23 lines changed)
3. requirements.txt (6 lines changed)
4. result.json (4510 lines changed)
5. spider.py (180 lines changed)

.gitignore (vendored)

@@ -0,0 +1,2 @@
venv
cache

README.md

@@ -1,2 +1,25 @@
# green-spider
Collects data on Green party websites and checks for things like SEO, performance, and TLS.

Written and tested in Python 3.

### Ideas

- If the URL does not start with `www.`, will entering `www.<url>` also work?
- If the URL is HTTP, is it possible to access the site via HTTPS (recommended)? See the sketch below this list.
- If the URL is HTTPS, is it possible to access the site via HTTP (recommended: redirect to HTTPS)?
- Check which cookies are set and with what settings (expiry, domain).
- Submit the URL to a service like Google PageSpeed and retrieve the score.
- Check against our own webpagetest.org instance.
- Detect which of the well-known CMSes is used.
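A minimal sketch of the HTTP-to-HTTPS idea above, independent of spider.py; the helper name `https_variant`, the timeout values, and the status-code threshold are illustrative assumptions, not existing code:

```python
import requests

def https_variant(url, timeout=(5, 10)):
    """Return the HTTPS variant of an HTTP URL if it is reachable, else None.

    Hypothetical helper illustrating the idea above; not part of spider.py.
    """
    if not url.startswith("http://"):
        return None
    candidate = "https://" + url[len("http://"):]
    try:
        r = requests.get(candidate, timeout=timeout)
    except requests.exceptions.RequestException:
        return None
    return candidate if r.status_code < 400 else None
```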
### Usage
```nohighlight
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python spider.py
```

requirements.txt

@@ -0,0 +1,6 @@
certifi==2018.1.18
chardet==3.0.4
idna==2.6
requests==2.18.4
urllib3==1.22
pyyaml==3.12

result.json

File diff suppressed because it is too large.

spider.py

@@ -0,0 +1,180 @@
# coding: utf8

from git import Repo  # provided by the GitPython package
from multiprocessing import Pool
from urllib.parse import urlparse
from socket import gethostbyname_ex
import json
import logging
import os
import random
import requests
import shutil
import sys
import yaml

# configuration

# number of parallel processes to use for crawling
concurrency = 6

# connection timeout for website checks (seconds)
connect_timeout = 5

# response timeout for website checks
read_timeout = 10

# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'

# folder in that repo that holds the data
green_direcory_data_path = 'data'

green_directory_local_path = './cache/green-directory'

# end configuration

def get_green_directory():
    """
    Clones the green directory into the local file system
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)

def dir_entries():
    path = os.path.join(green_directory_local_path, green_direcory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile):
                    yield doc

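# Note: the YAML documents yielded above are expected to provide the keys
# accessed further below: 'type', optionally 'level', 'state' and 'district',
# plus a list under 'urls' whose items carry 'type' and 'url' keys. This is
# inferred from the code in this file, not a formal schema.
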
def repr_entry(entry):
    """
    Return string representation of an entry
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r

def resolve_hostname(url):
    parsed = urlparse(url)
    hostname, aliaslist, ipaddrlist = gethostbyname_ex(parsed.hostname)
    return (parsed.scheme, hostname, aliaslist, ipaddrlist)

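# For reference: socket.gethostbyname_ex() returns a triple of
# (primary host name, list of alias names, list of IPv4 addresses);
# resolve_hostname() puts the URL scheme in front of that triple.
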
def check_site(url):
    """
    Performs our site check and returns results as a dict
    """
    result = {
        'status_code': 0,
        'error': None,
        'redirects': 0,
        'final_url': None,
        'hostname': None,
        'scheme': None,
        'aliases': None,
        'ip_addresses': None,
        'duration': 0,
    }

    try:
        (scheme, hostname, aliases, ip_addresses) = resolve_hostname(url)
        result['scheme'] = scheme
        result['hostname'] = hostname
        result['aliases'] = aliases
        result['ip_addresses'] = ip_addresses
    except Exception as e:
        logging.error(str(e) + " " + url)

    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    try:
        r = requests.get(url, headers=headers, timeout=(connect_timeout, read_timeout))
        result['status_code'] = r.status_code
        if len(r.history) > 0:
            result['redirects'] = len(r.history)
            result['final_url'] = r.url
        # total request duration in milliseconds
        result['duration'] = r.elapsed.total_seconds() * 1000
    except requests.exceptions.ConnectionError as e:
        logging.error(str(e) + " " + url)
        result['error'] = "connection"
    # ReadTimeout is a subclass of Timeout, so it has to be caught first
    except requests.exceptions.ReadTimeout as e:
        logging.error(str(e) + " " + url)
        result['error'] = "read_timeout"
    except requests.exceptions.Timeout as e:
        logging.error(str(e) + " " + url)
        result['error'] = "connection_timeout"
    except Exception as e:
        logging.error(str(e) + " " + url)
        result['error'] = "unknown"

    logging.info("%s done" % url)
    return result

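# Shape of a check_site() result (illustrative values only):
#
#   {
#       'status_code': 200, 'error': None, 'redirects': 1,
#       'final_url': 'https://www.example.org/', 'hostname': 'example.org',
#       'scheme': 'https', 'aliases': [], 'ip_addresses': ['203.0.113.10'],
#       'duration': 123.0,
#   }
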
def main():
    logging.basicConfig(level=logging.INFO)
    get_green_directory()

    urls = []
    for entry in dir_entries():
        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.info("Entry %s does not have any URLs." % repr_entry(entry))
            continue
        website_url = None
        for n in range(len(entry['urls'])):
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
            except KeyError:
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
        if website_url:
            urls.append(website_url)

    # randomize the processing order
    random.seed()
    random.shuffle(urls)

    results = {}
    if concurrency > 1:
        pool = Pool(concurrency)
        for url in urls:
            # apply_async returns an AsyncResult; the actual dict is
            # retrieved via .get() once the pool has finished.
            results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
    else:
        for url in urls:
            results[url] = check_site(url)

    results2 = {}
    for url in results.keys():
        if concurrency > 1:
            results2[url] = results[url].get()
        else:
            results2[url] = results[url]

    with open('result.json', 'w', encoding="utf8") as jsonfile:
        json.dump(results2, jsonfile, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()
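Since the result.json diff is suppressed above, here is a minimal sketch for inspecting the output after a run, assuming the URL-keyed structure that spider.py writes; filtering on the 'error' field is just one example of what to look at.

```python
import json

with open("result.json", encoding="utf8") as f:
    results = json.load(f)

print("%d URLs checked" % len(results))

# list URLs that could not be fetched, together with the error category
for url, result in sorted(results.items()):
    if result.get("error"):
        print(result["error"], url)
```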