Merge pull request #58 from netzbegruenung/write-to-db

Use job queue, write to database
Marian Steinbach 2018-08-23 10:16:50 +02:00 committed by GitHub
commit bf23478265
6 changed files with 425 additions and 245 deletions
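
At a high level, this PR drops the in-process multiprocessing pool and the local `spider_result.json` output in favour of a Cloud Datastore backed job queue: the new `jobs` command writes one entity per URL into the `spider-jobs` kind (batched via `put_multi`), and the `spider` command repeatedly claims a job inside a transaction, runs the checks, and stores the result under `spider-results`. A minimal sketch of that pattern, not the exact code from the diff below (the credentials path is an example):

```python
# Sketch of the job-queue pattern introduced in this PR; see spider.py below
# for the real implementation. Kind names match those defined there.
from datetime import datetime
from google.cloud import datastore

client = datastore.Client.from_service_account_json('/secrets/datastore-writer.json')

def enqueue(urls):
    """Create one 'spider-jobs' entity per URL, keyed by the URL itself."""
    entities = []
    for url in urls:
        entity = datastore.Entity(key=client.key('spider-jobs', url))
        entity.update({'created': datetime.utcnow()})
        entities.append(entity)
    client.put_multi(entities)  # spider.py writes these in chunks of 300

def claim_job():
    """Take one job off the queue: fetch and delete it in a transaction."""
    with client.transaction():
        for entity in client.query(kind='spider-jobs').fetch(limit=1):
            client.delete(entity.key)
            return entity.key.name  # the URL to spider
    return None
```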

Dockerfile

@ -12,7 +12,7 @@ RUN apt-get update \
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
&& dpkg -i google-chrome-stable_current_amd64.deb \
&& rm google-chrome-stable_current_amd64.deb \
&& pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 certifi==2018.1.18 \
&& pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 \
&& wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
&& unzip chromedriver_linux64.zip \
&& rm chromedriver_linux64.zip \
@ -25,6 +25,7 @@ RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x
ADD spider.py /
ADD spider_test.py /
ADD data_export.py /
ENTRYPOINT ["python3"]
CMD ["/spider.py"]

Makefile

@ -7,24 +7,40 @@ dockerimage:
docker pull debian:stretch-slim
docker build -t spider .
# Create spider job queue
spiderjobs: dockerimage
docker run --rm -ti \
-v $(PWD)/secrets:/secrets \
spider spider.py \
--credentials-path /secrets/datastore-writer.json \
--loglevel debug \
jobs
# Run spider in docker image
spider: dockerimage
docker run --rm -ti \
-v $(PWD)/webapp/dist/data:/out \
-v $(PWD)/docs/siteicons:/icons \
-v $(PWD)/secrets:/secrets \
spider spider.py \
--credentials-path /secrets/datastore-writer.json \
--loglevel debug \
spider
test: dockerimage
docker run --rm -ti spider /spider_test.py
screenshots: venv
venv/bin/python ./screenshots.py secrets/screenshot-reader.json
export:
docker run --rm -ti \
-v $(PWD)/webapp/dist/data:/out \
-v $(PWD)/secrets:/secrets \
spider data_export.py /secrets/datastore-reader.json
webapp/node_modules:
cd webapp && npm install
# Build webapp
webapp: webapp/node_modules screenshots
webapp: webapp/node_modules
cd webapp && npx webpack --config webpack.config.js
cp -r webapp/dist/* ./docs/
cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/

README.md

@ -1,11 +1,15 @@
# Green Spider
Initiative and tools to promote a user-friendly web presence for Bündnis 90/Die Grünen
Initiative and tools to promote a user-friendly web presence for Bündnis 90/Die Grünen.
For the results, see: [https://green-spider.netzbegruenung.de/](https://green-spider.netzbegruenung.de/)
## Tools
- Spider: Collects information about websites of B90/GRÜNE chapters
- Screenshotter: Creates page screenshots. See [netzbegruenung/green-spider-screenshotter](https://github.com/netzbegruenung/green-spider-screenshotter/)
- Webapp: Displays the spider results at [green-spider.netzbegruenung.de](https://green-spider.netzbegruenung.de/)
## Activities
@ -22,21 +26,28 @@ Communication happens in the Chatbegrünung channel [#green-spider](https://chatbeg
### Running the spider
This checks all known WWW addresses from [netzbegruenung/green-directory](https://github.com/netzbegruenung/green-directory) and collects data about them.
Prerequisites:
- GNU make
- Python 3
- virtualenv
- Docker
- Key with write access to the results database
Start the process with this command:
To spider all sites from [netzbegruenung/green-directory](https://github.com/netzbegruenung/green-directory):
```nohighlight
make spiderjobs
make spider
```
The result is the file `webapp/dist/data/spider_result.json`. If you would like to contribute the new data to the repository, please open a pull request.
Alternatively, spidering a single URL can be triggered as shown in the following example. The URL does not have to be part of the `green-directory`.
```nohighlight
docker run --rm -ti \
-v $PWD/secrets:/secrets spider \
spider.py --credentials-path /secrets/datastore-writer.json \
jobs --url https://www.trittin.de/
make spider
```
### Creating screenshots
@ -44,16 +55,17 @@ See [green-spider-screenshotter](https://github.com/netzbegruenung/green-spide
### Updating the webapp
The webapp published at https://netzbegruenung.github.io/green-spider/ shows the content of the [docs](https://github.com/netzbegruenung/green-spider/tree/master/docs) directory for the `master` branch of this repository. It can be regenerated automatically.
Prerequisites:
- npm
- Service account JSON file for read access to the screenshot data
- Docker
- Key with read access to the screenshot and results databases
To update the content of the docs directory, the Makefile provides this command:
The following two commands generate the JSON exports of the spider results
and screenshots and update the webapp.
```nohighlight
make export
make webapp
```

data_export.py

@ -1,17 +1,35 @@
"""
Exports data from the database to JSON files for use in a static webapp
"""
from google.cloud import datastore
import json
import sys
import os
def main():
if len(sys.argv) == 1:
print("Error: please provide path to Google Storage API system account JSON file as argument")
sys.exit(1)
client = None
key_path = sys.argv[1]
client = datastore.Client.from_service_account_json(key_path)
def export_results():
"""
Export of the main results data
"""
out = []
query = client.query(kind='spider-results')
for entity in query.fetch():
print(entity.key.name)
out.append(dict(entity)["results"])
output_filename = "/out/spider_result.json"
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
def export_screenshots():
"""
Export of screenshot meta data
"""
out = {}
query = client.query(kind='webscreenshot')
@ -19,10 +37,18 @@ def main():
print(item['url'], os.path.basename(item['screenshot_url']))
out[item['url']] = os.path.basename(item['screenshot_url'])
output_filename = "./webapp/dist/data/screenshots.json"
output_filename = "/out/screenshots.json"
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
if __name__ == "__main__":
main()
if len(sys.argv) == 1:
print("Error: please provide path to Google Storage API system account JSON file as argument")
sys.exit(1)
key_path = sys.argv[1]
client = datastore.Client.from_service_account_json(key_path)
export_screenshots()
export_results()
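
data_export.py takes the service account key file as its only argument and writes both exports into `/out`, which the Makefile's `export` target mounts to `webapp/dist/data`. A quick sanity check after a run could look like this (an illustrative sketch; the paths assume that default volume mount):

```python
# Hypothetical check that the two exported files exist and parse as JSON.
import json

for name in ('spider_result.json', 'screenshots.json'):
    with open('webapp/dist/data/' + name, encoding='utf8') as jsonfile:
        data = json.load(jsonfile)
    print(name, '->', len(data), 'entries')
```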

spider.py

@ -1,68 +1,175 @@
# coding: utf8
"""
Provides the spider functionality (website checks).
"""
from bs4 import BeautifulSoup
from git import Repo
from multiprocessing import Pool
from selenium import webdriver
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse
import certifi
import argparse
import hashlib
import json
import logging
import os
import random
import re
import requests
import shutil
import statistics
import sys
from datetime import datetime
from socket import gethostbyname_ex
from urllib.parse import urljoin
from urllib.parse import urlparse
import requests
import yaml
from bs4 import BeautifulSoup
from git import Repo
from selenium import webdriver
from google.cloud import datastore
from google.api_core.exceptions import InvalidArgument
# configuration
# number of parallel processes to use for crawling
concurrency = 1
# connection timeout for website checks (seconds)
connect_timeout = 5
CONNECT_TIMEOUT = 5
# response timeout for website checks
read_timeout = 10
READ_TIMEOUT = 10
# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_direcory_data_path = 'data/countries/de'
green_directory_local_path = './cache/green-directory'
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
result_path = '/out'
RESULT_PATH = '/out'
siteicons_path = '/icons'
SITEICONS_PATH = '/icons'
# IP address of the newthinking GCMS server
gcms_ip = "91.102.13.20"
GCMS_IP = "91.102.13.20"
JOB_DATASTORE_KIND = 'spider-jobs'
RESULTS_DATASTORE_KIND = 'spider-results'
# end configuration
DATASTORE_CLIENT = None
def chunks(the_list, size):
"""
Yield successive n-sized chunks from list the_list
where n = size.
"""
for i in range(0, len(the_list), size):
yield the_list[i:i + size]
def create_jobs(url=None):
"""
Read all URLs from green directory and fill a job database
with one job per URL.
Alternatively, if the url argument is given, only the given URL
will be added as a spider job.
"""
# refresh our local clone of the green directory
logging.info("Refreshing green-directory clone")
get_green_directory()
# build the list of website URLs to run checks for
logging.info("Processing green-directory")
input_entries = []
count = 0
for entry in dir_entries():
if 'type' not in entry:
logging.error("Entry without type")
continue
if 'urls' not in entry:
logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
continue
website_url = None
for index in range(len(entry['urls'])):
try:
if entry['urls'][index]['type'] == "WEBSITE":
website_url = entry['urls'][index]['url']
if website_url:
if url is not None and website_url != url:
continue
input_entries.append({
"url": website_url,
"level": entry.get("level"),
"state": entry.get("state"),
"district": entry.get("district"),
"city": entry.get("city"),
})
count += 1
except NameError:
logging.error("Error in %s: 'url' key missing (%s)",
repr_entry(entry), entry['urls'][index])
# ensure the passed URL argument is really there, even if not part
# of the directory.
if url and count == 0:
logging.info("Adding job for URL %s which is not part of green-directory", url)
input_entries.append({
"url": url,
"level": None,
"state": None,
"district": None,
"city": None,
})
# randomize order, to distribute requests over servers
logging.debug("Shuffling input URLs")
random.seed()
random.shuffle(input_entries)
count = 0
logging.info("Writing jobs")
entities = []
for entry in input_entries:
key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
entity = datastore.Entity(key=key)
entity.update({
"created": datetime.utcnow(),
"level": entry["level"],
"state": entry["state"],
"district": entry["district"],
"city": entry["city"],
})
entities.append(entity)
# commit to DB
for chunk in chunks(entities, 300):
logging.debug("Writing jobs chunk of length %d", len(chunk))
DATASTORE_CLIENT.put_multi(chunk)
count += len(chunk)
logging.info("Writing jobs done, %s jobs added", count)
def get_green_directory():
"""
Clones the source of website URLs, the green directory,
into the local file system using git
"""
if os.path.exists(green_directory_local_path):
shutil.rmtree(green_directory_local_path)
Repo.clone_from(green_directory_repo, green_directory_local_path)
if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
def dir_entries():
"""
Iterator over all data files in the cloned green directory
"""
path = os.path.join(green_directory_local_path, green_direcory_data_path)
path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
for root, _, files in os.walk(path):
for fname in files:
@ -80,14 +187,14 @@ def repr_entry(entry):
Return string representation of a directory entry,
for logging/debugging purposes
"""
r = entry['type']
ret = entry['type']
if 'level' in entry:
r += "/" + entry['level']
ret += "/" + entry['level']
if 'state' in entry:
r += "/" + entry['state']
ret += "/" + entry['state']
if 'district' in entry:
r += "/" + entry['district']
return r
ret += "/" + entry['district']
return ret
def derive_test_hostnames(hostname):
@ -117,24 +224,25 @@ def reduce_urls(urllist):
that either don't work or lead somewhere else
"""
targets = set()
for u in urllist:
if u['error'] is not None:
for url in urllist:
if url['error'] is not None:
continue
if u['redirects_to'] is not None:
targets.add(u['redirects_to'])
if url['redirects_to'] is not None:
targets.add(url['redirects_to'])
else:
targets.add(u['url'])
targets.add(url['url'])
return sorted(list(targets))
def normalize_title(s):
def normalize_title(title):
"""
Removes garbage from HTML page titles
"""
s = s.replace('\u00a0', ' ')
s = s.replace(' ', ' ')
s = s.strip()
return s
title = title.replace(u'\u00a0', ' ')
title = title.replace(' ', ' ')
title = title.strip()
return title
def download_icon(icon_url):
"""
@ -150,10 +258,10 @@ def download_icon(icon_url):
}
# Download the icon
r = requests.get(icon_url)
r.raise_for_status()
req = requests.get(icon_url)
req.raise_for_status()
content_hash = hashlib.md5(r.content).hexdigest()
content_hash = hashlib.md5(req.content).hexdigest()
extension = ""
file_name = os.path.basename(icon_url)[-1]
@ -161,24 +269,25 @@ def download_icon(icon_url):
ext = file_name.split(".")[-1]
if ext != "":
extension = ext
if extension == "":
# derive from content type
t = r.headers.get('content-type')
ctype = req.headers.get('content-type')
try:
extension = default_endings[t]
extension = default_endings[ctype]
except KeyError:
logging.error("No file ending defined for icon type '%s'" % t)
logging.error("No file ending defined for icon type '%s'", ctype)
return None
filename = content_hash + "." + extension.lower()
path = siteicons_path + os.path.sep + filename
path = SITEICONS_PATH + os.path.sep + filename
with open(path, 'wb') as iconfile:
iconfile.write(r.content)
iconfile.write(req.content)
return filename
def check_responsiveness(url):
"""
Checks
@ -193,9 +302,9 @@ def check_responsiveness(url):
# sizes we check for (width, height)
sizes = (
(320,480), # old smartphone
(768,1024), # older tablet or newer smartphone
(1024,768), # older desktop or horiz. tablet
(320, 480), # old smartphone
(768, 1024), # older tablet or newer smartphone
(1024, 768), # older desktop or horiz. tablet
(1920, 1080), # Full HD horizontal
)
@ -218,7 +327,8 @@ def check_responsiveness(url):
return details
def check_content(r):
def check_content(req):
"""
Adds details to check regarding content of the page
@ -227,10 +337,10 @@ def check_content(r):
"""
result = {}
result['encoding'] = r.encoding.lower()
soup = BeautifulSoup(r.text, 'html.parser')
result['encoding'] = req.encoding.lower()
soup = BeautifulSoup(req.text, 'html.parser')
result['html'] = r.text
result['html'] = req.text
# page title
result['title'] = None
@ -245,47 +355,47 @@ def check_content(r):
result['canonical_link'] = None
link = soup.find('link', rel='canonical')
if link:
result['canonical_link'] = urljoin(r.url, link.get('href'))
result['canonical_link'] = urljoin(req.url, link.get('href'))
# icon
result['icon'] = None
link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
if link:
result['icon'] = urljoin(r.url, link.get('href'))
result['icon'] = urljoin(req.url, link.get('href'))
else:
link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
if link:
result['icon'] = urljoin(r.url, link.get('href'))
result['icon'] = urljoin(req.url, link.get('href'))
# feed links
result['feeds'] = []
rss_links = soup.find_all('link', type='application/rss+xml')
atom_links = soup.find_all('link', type='application/atom+xml')
if len(rss_links) > 0:
for l in rss_links:
result['feeds'].append(urljoin(r.url, l.get('href')))
if len(atom_links) > 0:
for l in rss_links:
result['feeds'].append(urljoin(r.url, l.get('href')))
if rss_links:
for link in rss_links:
result['feeds'].append(urljoin(req.url, link.get('href')))
if atom_links:
for link in atom_links:
result['feeds'].append(urljoin(req.url, link.get('href')))
# generator meta tag
result['generator'] = None
if head is not None:
generator = head.select('[name=generator]')
if len(generator):
if generator:
result['generator'] = generator[0].get('content')
# opengraph meta tags
result['opengraph'] = None
og = set()
opengraph = set()
if head is not None:
for item in head.find_all(property=re.compile('^og:')):
og.add(item.get('property'))
opengraph.add(item.get('property'))
for item in head.find_all(itemprop=re.compile('^og:')):
og.add(item.get('itemprop'))
if len(og):
result['opengraph'] = sorted(list(og))
opengraph.add(item.get('itemprop'))
if opengraph:
result['opengraph'] = sorted(list(opengraph))
return result
@ -298,8 +408,8 @@ def collect_ipv4_addresses(hostname_dict):
for item in hostname_dict.values():
if 'ip_addresses' not in item:
continue
for ip in item['ip_addresses']:
ips.add(ip)
for ip_addr in item['ip_addresses']:
ips.add(ip_addr)
return sorted(list(ips))
@ -310,11 +420,11 @@ def parse_generator(generator):
generator = generator.lower()
if 'typo3' in generator:
return "typo3"
elif 'wordpress' in generator:
if 'wordpress' in generator:
return "wordpress"
elif 'drupal' in generator:
if 'drupal' in generator:
return "drupal"
elif 'joomla' in generator:
if 'joomla' in generator:
return "joomla"
return generator
@ -328,7 +438,9 @@ def check_site(entry):
4. Run full check on canonical URL
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
'AppleWebKit/537.36 (KHTML, like Gecko) ' +
'Chrome/65.0.3325.181 green-spider/0.1'
}
# all the info we'll return for the site
@ -337,12 +449,13 @@ def check_site(entry):
'input_url': entry['url'],
# Meta: Regional and type metadata for the site
'meta': {
'level': entry['level'],
'state': entry['state'],
'district': entry['district'],
'city': entry['city'],
'level': entry.get('level'),
'state': entry.get('state'),
'district': entry.get('district'),
'city': entry.get('city'),
},
# Details: All details we collected about the site (which aren't directly related to the report criteria)
# Details: All details we collected about the site (which aren't directly
# related to the report criteria)
'details': {
'hostnames': {},
'ipv4_addresses': [],
@ -375,18 +488,18 @@ def check_site(entry):
# try to resolve hostnames
processed_hostnames = {}
for hn in hostnames:
for hostname in hostnames:
processed_hostnames[hn] = {
processed_hostnames[hostname] = {
'resolvable': False,
}
try:
hostname, aliases, ip_addresses = gethostbyname_ex(hn)
processed_hostnames[hn]['resolvable'] = True
processed_hostnames[hn]['resolved_hostname'] = hostname
processed_hostnames[hn]['aliases'] = aliases
processed_hostnames[hn]['ip_addresses'] = ip_addresses
resolved, aliases, ip_addresses = gethostbyname_ex(hostname)
processed_hostnames[hostname]['resolvable'] = True
processed_hostnames[hostname]['resolved_hostname'] = resolved
processed_hostnames[hostname]['aliases'] = aliases
processed_hostnames[hostname]['ip_addresses'] = ip_addresses
except:
pass
@ -398,9 +511,9 @@ def check_site(entry):
checked_urls = []
checked_urls_set = set()
for hn in processed_hostnames.keys():
for hostname in processed_hostnames.keys():
item = processed_hostnames[hn]
item = processed_hostnames[hostname]
if not item['resolvable']:
continue
@ -421,18 +534,19 @@ def check_site(entry):
}
try:
r = requests.head(record['url'], headers=headers, allow_redirects=True)
if r.url == url:
logging.info("URL: %s - status %s" % (record['url'], r.status_code))
req = requests.head(record['url'], headers=headers, allow_redirects=True)
if req.url == url:
logging.info("URL: %s - status %s", record['url'], req.status_code)
else:
logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
record['redirects_to'] = r.url
except Exception as e:
logging.info("URL: %s - status %s - redirects to %s", record['url'],
req.status_code, req.url)
record['redirects_to'] = req.url
except Exception as exc:
record['error'] = {
'type': str(type(e)),
'message': str(e),
'type': str(type(exc)),
'message': str(exc),
}
logging.info("URL %s: %s %s" % (url, str(type(e)), e))
logging.info("URL %s: %s %s", url, str(type(exc)), exc)
checked_urls.append(record)
@ -442,7 +556,7 @@ def check_site(entry):
# Deeper test for the remaining (canonical) URL(s)
for check_url in result['details']['canonical_urls']:
logging.info("Downloading URL %s" % check_url)
logging.info("Downloading URL %s", check_url)
check = {
'url': check_url,
@ -454,37 +568,38 @@ def check_site(entry):
}
try:
r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
check['status_code'] = r.status_code
check['duration'] = round(r.elapsed.microseconds / 1000)
req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
check['status_code'] = req.status_code
check['duration'] = round(req.elapsed.microseconds / 1000)
# Content checks
if r.status_code < 300:
check['content'] = check_content(r)
if req.status_code < 300:
check['content'] = check_content(req)
# Responsiveness check
try:
check['responsive'] = check_responsiveness(check_url)
except Exception as e:
logging.error("Error when checking responsiveness for '%s': %s" % (check_url, e))
except Exception as exc:
logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
except requests.exceptions.ConnectionError as e:
logging.error(str(e) + " " + check_url)
except requests.exceptions.ConnectionError as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "connection"
except requests.exceptions.ReadTimeout as e:
logging.error(str(e) + " " + check_url)
except requests.exceptions.ReadTimeout as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "read_timeout"
except requests.exceptions.Timeout as e:
logging.error(str(e) + " " + check_url)
except requests.exceptions.Timeout as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "connection_timeout"
except Exception as e:
logging.error(str(e) + " " + check_url)
except Exception as exc:
logging.error(str(exc) + " " + check_url)
check['error'] = "unknown"
result['details']['urlchecks'].append(check)
result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
key=lambda url: url['url'])
# collect icons
icons = set()
@ -492,24 +607,24 @@ def check_site(entry):
if 'content' not in c:
continue
if c['content'] is None:
logging.warning("No content for %s" % entry['url'])
logging.warning("No content for %s", entry['url'])
continue
if c['content']['icon'] is not None:
icons.add(c['content']['icon'])
downloaded_icons = set()
for icon_url in icons:
logging.info("Getting icon %s" % icon_url)
logging.info("Getting icon %s", icon_url)
try:
downloaded_icons.add(download_icon(icon_url))
except Exception as e:
logging.error("Could not download icon: %s" % e)
logging.error("Could not download icon: %s", e)
result['details']['icons'] = sorted(list(downloaded_icons))
# collect feeds
feeds = set()
for c in result['details']['urlchecks']:
if c['content'] is None:
logging.warning("No content for %s" % entry['url'])
logging.warning("No content for %s", entry['url'])
continue
if 'feeds' in c['content'] and len(c['content']['feeds']):
for feed in c['content']['feeds']:
@ -543,7 +658,7 @@ def check_site(entry):
result['details']['cms'] = parse_generator(c['content']['generator'])
# Qualify certain CMS flavours in more detail
if result['details']['cms'] == "typo3":
if gcms_ip in result['details']['ipv4_addresses']:
if GCMS_IP in result['details']['ipv4_addresses']:
result['details']['cms'] = "typo3-gcms"
elif 'typo3-gruene.de' in c['content']['html']:
result['details']['cms'] = "typo3-gruene"
@ -555,7 +670,8 @@ def check_site(entry):
# No generator Tag. Use HTML content.
if 'Urwahl3000' in c['content']['html']:
result['details']['cms'] = "wordpress-urwahl"
elif 'josephknowsbest' in c['content']['html'] or 'Joseph-knows-best' in c['content']['html']:
elif ('josephknowsbest' in c['content']['html'] or
'Joseph-knows-best' in c['content']['html']):
result['details']['cms'] = "wordpress-josephknowsbest"
elif 'wordpress' in c['content']['html']:
result['details']['cms'] = "wordpress"
@ -567,7 +683,7 @@ def check_site(entry):
### Derive criteria
# DNS_RESOLVABLE_IPV4
if len(result['details']['ipv4_addresses']):
if result['details']['ipv4_addresses']:
result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
# SITE_REACHABLE
@ -584,8 +700,8 @@ def check_site(entry):
# WWW_OPTIONAL
num_hostnames = 0
for hn in result['details']['hostnames'].keys():
item = result['details']['hostnames'][hn]
for hostname in result['details']['hostnames'].keys():
item = result['details']['hostnames'][hostname]
if not item['resolvable']:
continue
num_hostnames += 1
@ -600,20 +716,20 @@ def check_site(entry):
else:
links = set()
if result['details']['urlchecks'] is None:
logging.warning("No urlchecks for %s" % entry['url'])
logging.warning("No urlchecks for %s", entry['url'])
else:
for item in result['details']['urlchecks']:
if item['content']['canonical_link'] is not None:
if item['content'] is not None and item['content']['canonical_link'] is not None:
links.add(item['content']['canonical_link'])
if len(links) == 1:
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
# FAVICON
if len(result['details']['icons']):
if result['details']['icons']:
result['result']['FAVICON'] = {'value': True, 'score': 1}
# FEEDS
if len(result['details']['feeds']):
if result['details']['feeds']:
result['result']['FEEDS'] = {'value': True, 'score': 1}
# HTTP_RESPONSE_DURATION
@ -621,17 +737,18 @@ def check_site(entry):
for item in result['details']['urlchecks']:
if item['error'] is None:
durations.append(item['duration'])
val = round(statistics.mean(durations))
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
if val < 100:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
elif val < 1000:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
if durations:
val = round(statistics.mean(durations))
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
if val < 100:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
elif val < 1000:
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
# RESPONSIVE
if result['details']['responsive'] is not None:
if (result['details']['responsive']['min_width'] < 500 and
len(result['details']['responsive']['viewport_meta_tag']) > 0):
len(result['details']['responsive']['viewport_meta_tag']) > 0):
result['result']['RESPONSIVE']['value'] = True
result['result']['RESPONSIVE']['score'] = 1
@ -649,87 +766,91 @@ def check_site(entry):
return result
def main():
def get_job_from_queue():
"""
Bringing it all together
Returns a URL from the queue
"""
logging.basicConfig(level=logging.INFO)
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
out = None
# refresh our local clone of the green directory
get_green_directory()
with DATASTORE_CLIENT.transaction():
query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
for entity in query.fetch(limit=1):
logging.debug("Got job: %s", entity)
out = dict(entity)
out["url"] = entity.key.name
DATASTORE_CLIENT.delete(entity.key)
# build the list of website URLs to run checks for
logging.info("Processing green-directory")
input_entries = []
return out
for entry in dir_entries():
def work_of_queue():
"""
Take job from queue and finish it until there are no more jobs
"""
while True:
job = get_job_from_queue()
if job is None:
logging.info("No more jobs. Exiting.")
break
if 'type' not in entry:
logging.error("Entry without type")
continue
if 'urls' not in entry:
logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
continue
logging.info("Starting job %s", job["url"])
result = check_site(entry=job)
#logging.debug(result)
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])
website_url = None
for n in range(len(entry['urls'])):
try:
if entry['urls'][n]['type'] == "WEBSITE":
website_url = entry['urls'][n]['url']
if website_url:
input_entries.append({
"url": website_url,
"level": entry.get("level"),
"state": entry.get("state"),
"district": entry.get("district"),
"city": entry.get("city"),
})
except NameError:
logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
# randomize order, to distribute requests over servers
logging.info("Shuffling input URLs")
random.seed()
random.shuffle(input_entries)
# run checks
logging.info("Starting checks")
results = {}
pool = Pool(concurrency)
for ientry in input_entries:
logging.info("Submitting %s to job pool" % ientry['url'])
results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
pool.close()
pool.join()
logging.info("Checks are finished")
# Restructure result from dict of ApplyResult
# to list of dicts and sort in stable way
json_result = []
done = set()
logging.info("Restructuring results")
# convert results from ApplyResult to dict
for url in sorted(results.keys()):
if url not in done:
logging.info("Getting result for %s" % url)
try:
resultsitem = results[url].get()
json_result.append(resultsitem)
except Exception as e:
logging.error("Error getting result for '%s': %s" % (url, e))
done.add(url)
# Write result as JSON
output_filename = os.path.join(result_path, "spider_result.json")
with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
record = {
"created": datetime.utcnow(),
"results": result,
}
entity.update(record)
try:
DATASTORE_CLIENT.put(entity)
except InvalidArgument as ex:
logging.error("Could not write result: %s", ex)
except Exception as ex:
logging.error("Could not write result: %s", ex)
if __name__ == "__main__":
main()
"""
Bringing it all together
"""
parser = argparse.ArgumentParser()
parser.add_argument('--credentials-path', dest='credentials_path',
help='Path to the service account credentials JSON file',
default='/secrets/service-account.json')
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
default='info')
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
subparsers.add_parser('spider', help='Take jobs off the queue and spider')
jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
jobs_parser.add_argument('--url', help='Add a job to spider a URL')
args = parser.parse_args()
loglevel = args.loglevel.lower()
if loglevel == 'error':
logging.basicConfig(level=logging.ERROR)
elif loglevel == 'warn':
logging.basicConfig(level=logging.WARN)
elif loglevel == 'debug':
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
loglevel = 'info'
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
logging.debug("Called command %s", args.command)
if args.command == 'jobs':
create_jobs(args.url)
else:
work_of_queue()
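
A note on the result write above: the entity is created with `exclude_from_indexes=['results']` so that the large nested results blob is stored unindexed; Datastore limits indexed string values to 1,500 bytes, while unindexed properties may grow up to the overall entity size limit of roughly 1 MiB. `InvalidArgument` is caught for entities that still cannot be written. Condensed into a helper (an illustrative sketch; `store_result` is not a function in the diff):

```python
# Condensed sketch of the result write in work_of_queue().
import logging
from datetime import datetime
from google.api_core.exceptions import InvalidArgument
from google.cloud import datastore

def store_result(client, url, result):
    # 'results' is excluded from indexes, so the nested blob is not subject
    # to the size limit on indexed property values.
    entity = datastore.Entity(key=client.key('spider-results', url),
                              exclude_from_indexes=['results'])
    entity.update({'created': datetime.utcnow(), 'results': result})
    try:
        client.put(entity)
    except InvalidArgument as exc:
        logging.error("Could not write result for %s: %s", url, exc)
```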

webapp JavaScript

@ -55,7 +55,7 @@ $(function(){
// IPs
var ips = _.join(item.details.ipv4_addresses, ', ');
row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center" data-order="' + (ips === '' ? no : ips) + '"><span class="tt" title="IPv4-Adresse(n) des Servers bzw. der Server">' + (ips === '' ? no : ips) + '</span></td>');
row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center" data-order="' + ips + '"><span class="tt" title="IPv4-Adresse(n) des Servers bzw. der Server">' + (ips === '' ? no : ips) + '</span></td>');
// SITE_REACHABLE
var reachable = '<span class="tt" title="Die Site war beim Check erreichbar.">' + yes + '</span>';
@ -65,10 +65,14 @@ $(function(){
row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center" data-order="'+ (item.result.SITE_REACHABLE.value ? '1' : '0') +'">' + reachable + '</td>');
// HTTP_RESPONSE_DURATION
var durationClass = 'bad';
if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '"><span class="tt" title="Dauer, bis der Server die Seitenanfrage beantwortet. Unter 100 ms ist sehr gut. Unter 1 Sekunde ist okay.">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</span></td>');
if (!item.result.SITE_REACHABLE.value || item.result.HTTP_RESPONSE_DURATION.value === null) {
row.append('<td class="text bad text-center" data-order="99999999"><span class="tt" title="Nicht anwendbar">' + no + '</span></td>');
} else {
var durationClass = 'bad';
if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '"><span class="tt" title="Dauer, bis der Server die Seitenanfrage beantwortet. Unter 100 ms ist sehr gut. Unter 1 Sekunde ist okay.">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</span></td>');
}
// FAVICON
var icon = item.result.FAVICON.value && (item.details.icons[0] != null);
@ -103,7 +107,7 @@ $(function(){
// screenshots
var screenshot = false;
if (item.details.canonical_urls.length > 0) {
if (item.details.canonical_urls && item.details.canonical_urls.length > 0) {
if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];