First working code and results

Marian Steinbach 2018-04-03 23:15:28 +02:00
parent 47b468b2a0
commit 220a6ba629
5 changed files with 4721 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
venv
cache

23
README.md

@@ -1,2 +1,25 @@
# green-spider
Collects data on green websites and checks for things like SEO, performance, TLS.
Written and tested in Python 3.

### Ideas

- If the URL does not start with `www.`, will entering `www.<url>` also work?
- If the URL is HTTP, is it possible to access the site via HTTPS (recommended)? A rough sketch of this check follows below the list.
- If the URL is HTTPS, is it possible to access the site via HTTP? (Recommended: redirect to HTTPS)
- Check which cookies are set and with what settings (expiry, domain)
- Submit the URL to a service like Google Page Speed and retrieve the score
- Check against our own webpagetest.org instance
- Detect which of the well-known CMSes is used
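
A minimal sketch of the HTTP-to-HTTPS idea, using `requests` (already in `requirements.txt`); the helper name `https_works` and the timeout values are placeholders, not something the spider implements yet:

```python
import requests


def https_works(url, timeout=(5, 10)):
    """Hypothetical helper: if `url` uses http://, try the same URL via https://."""
    if not url.startswith("http://"):
        return None  # already https:// (or schemeless), nothing to check
    https_url = "https://" + url[len("http://"):]
    try:
        r = requests.get(https_url, timeout=timeout)
        return r.status_code < 400
    except requests.exceptions.RequestException:
        return False

# e.g. https_works("http://example.com/") -> True, False or None
```
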
### Usage
```nohighlight
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
python spider.py
```

6
requirements.txt Normal file

@@ -0,0 +1,6 @@
certifi==2018.1.18
chardet==3.0.4
idna==2.6
requests==2.18.4
urllib3==1.22
pyyaml==3.12
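
Note: `spider.py` also imports `git`, which comes from the GitPython package and is not pinned above. A local install (version deliberately left unpinned here, since the exact version is not recorded in this commit):

```nohighlight
pip install GitPython
```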

4510
result.json Normal file

File diff suppressed because it is too large

180
spider.py Normal file

@@ -0,0 +1,180 @@
# coding: utf8

from git import Repo  # provided by the GitPython package
from multiprocessing import Pool
from urllib.parse import urlparse
from socket import gethostbyname_ex

import json
import logging
import os
import random
import requests
import shutil
import sys
import yaml


# configuration

# number of parallel processes to use for crawling
concurrency = 6

# connection timeout for website checks (seconds)
connect_timeout = 5

# response timeout for website checks
read_timeout = 10

# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'

# folder in that repo that holds the data
green_directory_data_path = 'data'

# local clone target
green_directory_local_path = './cache/green-directory'

# end configuration
def get_green_directory():
    """
    Clones the green directory into the local file system
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path)
    Repo.clone_from(green_directory_repo, green_directory_local_path)


def dir_entries():
    """
    Iterates over all YAML documents in the green directory
    and yields them one by one
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile):
                    yield doc
def repr_entry(entry):
    """
    Return string representation of an entry
    """
    r = entry['type']
    if 'level' in entry:
        r += "/" + entry['level']
    if 'state' in entry:
        r += "/" + entry['state']
    if 'district' in entry:
        r += "/" + entry['district']
    return r


def resolve_hostname(url):
    """
    Resolves the URL's hostname and returns the scheme, the canonical
    hostname, its aliases and its IP addresses
    """
    parsed = urlparse(url)
    hostname, aliaslist, ipaddrlist = gethostbyname_ex(parsed.hostname)
    return (parsed.scheme, hostname, aliaslist, ipaddrlist)
def check_site(url):
    """
    Performs our site check and returns results as a dict
    """
    result = {
        'status_code': 0,
        'error': None,
        'redirects': 0,
        'final_url': None,
        'hostname': None,
        'scheme': None,
        'aliases': None,
        'ip_addresses': None,
        'duration': 0,
    }

    try:
        (scheme, hostname, aliases, ip_addresses) = resolve_hostname(url)
        result['scheme'] = scheme
        result['hostname'] = hostname
        result['aliases'] = aliases
        result['ip_addresses'] = ip_addresses
    except Exception as e:
        logging.error(str(e) + " " + url)

    # identify as a common browser, plus a green-spider marker
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
    }

    try:
        r = requests.get(url, headers=headers, timeout=(connect_timeout, read_timeout))
        result['status_code'] = r.status_code
        if len(r.history) > 0:
            result['redirects'] = len(r.history)
            result['final_url'] = r.url
        # duration of the final request in milliseconds
        result['duration'] = r.elapsed.total_seconds() * 1000
    except requests.exceptions.ConnectionError as e:
        logging.error(str(e) + " " + url)
        result['error'] = "connection"
    except requests.exceptions.ReadTimeout as e:
        # ReadTimeout is a subclass of Timeout, so it has to be caught first
        logging.error(str(e) + " " + url)
        result['error'] = "read_timeout"
    except requests.exceptions.Timeout as e:
        logging.error(str(e) + " " + url)
        result['error'] = "connection_timeout"
    except Exception as e:
        logging.error(str(e) + " " + url)
        result['error'] = "unknown"

    logging.info("%s done" % url)
    return result
def main():
    logging.basicConfig(level=logging.INFO)
    get_green_directory()

    # collect the WEBSITE URL of every entry that has one
    urls = []
    for entry in dir_entries():
        if 'type' not in entry:
            logging.error("Entry without type")
            continue
        if 'urls' not in entry:
            logging.info("Entry %s does not have any URLs." % repr_entry(entry))
            continue
        website_url = None
        for n in range(len(entry['urls'])):
            try:
                if entry['urls'][n]['type'] == "WEBSITE":
                    website_url = entry['urls'][n]['url']
            except KeyError:
                # the URL entry lacks a 'type' or 'url' key
                logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
        if website_url:
            urls.append(website_url)

    # process URLs in random order
    random.seed()
    random.shuffle(urls)

    results = {}

    if concurrency > 1:
        pool = Pool(concurrency)
        async_results = {}
        for url in urls:
            async_results[url] = pool.apply_async(check_site, kwds={"url": url})
        pool.close()
        pool.join()
        # collect the actual result dicts from the AsyncResult objects
        for url, async_result in async_results.items():
            results[url] = async_result.get()
    else:
        for url in urls:
            results[url] = check_site(url)

    with open('result.json', 'w', encoding="utf8") as jsonfile:
        json.dump(results, jsonfile, indent=2, sort_keys=True)


if __name__ == "__main__":
    main()
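
Once the spider has run, `result.json` maps each checked URL to the dict returned by `check_site()`. A small sketch for summarizing that file (the field names match the code above; everything else is illustrative):

```python
import json
from collections import Counter

with open('result.json', encoding='utf8') as jsonfile:
    results = json.load(jsonfile)

# tally error types; None means the request itself succeeded
print(Counter(r['error'] for r in results.values()))

# tally HTTP status codes; 0 means no response was received
print(Counter(r['status_code'] for r in results.values()))
```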