mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-03 09:33:42 +02:00
Merge pull request #58 from netzbegruenung/write-to-db
Use job queue, write to database
This commit is contained in:
commit
bf23478265
|
@ -12,7 +12,7 @@ RUN apt-get update \
|
||||||
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
|
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
|
||||||
&& dpkg -i google-chrome-stable_current_amd64.deb \
|
&& dpkg -i google-chrome-stable_current_amd64.deb \
|
||||||
&& rm google-chrome-stable_current_amd64.deb \
|
&& rm google-chrome-stable_current_amd64.deb \
|
||||||
&& pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 certifi==2018.1.18 \
|
&& pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 \
|
||||||
&& wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
|
&& wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
|
||||||
&& unzip chromedriver_linux64.zip \
|
&& unzip chromedriver_linux64.zip \
|
||||||
&& rm chromedriver_linux64.zip \
|
&& rm chromedriver_linux64.zip \
|
||||||
|
@ -25,6 +25,7 @@ RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x
|
||||||
|
|
||||||
ADD spider.py /
|
ADD spider.py /
|
||||||
ADD spider_test.py /
|
ADD spider_test.py /
|
||||||
|
ADD data_export.py /
|
||||||
|
|
||||||
ENTRYPOINT ["python3"]
|
ENTRYPOINT ["python3"]
|
||||||
CMD ["/spider.py"]
|
CMD ["/spider.py"]
|
||||||
|
|
22
Makefile
22
Makefile
|
@ -7,24 +7,40 @@ dockerimage:
|
||||||
docker pull debian:stretch-slim
|
docker pull debian:stretch-slim
|
||||||
docker build -t spider .
|
docker build -t spider .
|
||||||
|
|
||||||
|
# Create spider job queue
|
||||||
|
spiderjobs: dockerimage
|
||||||
|
docker run --rm -ti \
|
||||||
|
-v $(PWD)/secrets:/secrets \
|
||||||
|
spider spider.py \
|
||||||
|
--credentials-path /secrets/datastore-writer.json \
|
||||||
|
--loglevel debug \
|
||||||
|
jobs
|
||||||
|
|
||||||
# Run spider in docker image
|
# Run spider in docker image
|
||||||
spider: dockerimage
|
spider: dockerimage
|
||||||
docker run --rm -ti \
|
docker run --rm -ti \
|
||||||
-v $(PWD)/webapp/dist/data:/out \
|
-v $(PWD)/webapp/dist/data:/out \
|
||||||
-v $(PWD)/docs/siteicons:/icons \
|
-v $(PWD)/docs/siteicons:/icons \
|
||||||
|
-v $(PWD)/secrets:/secrets \
|
||||||
|
spider spider.py \
|
||||||
|
--credentials-path /secrets/datastore-writer.json \
|
||||||
|
--loglevel debug \
|
||||||
spider
|
spider
|
||||||
|
|
||||||
test: dockerimage
|
test: dockerimage
|
||||||
docker run --rm -ti spider /spider_test.py
|
docker run --rm -ti spider /spider_test.py
|
||||||
|
|
||||||
screenshots: venv
|
export:
|
||||||
venv/bin/python ./screenshots.py secrets/screenshot-reader.json
|
docker run --rm -ti \
|
||||||
|
-v $(PWD)/webapp/dist/data:/out \
|
||||||
|
-v $(PWD)/secrets:/secrets \
|
||||||
|
spider data_export.py /secrets/datastore-reader.json
|
||||||
|
|
||||||
webapp/node_modules:
|
webapp/node_modules:
|
||||||
cd webapp && npm install
|
cd webapp && npm install
|
||||||
|
|
||||||
# Build webapp
|
# Build webapp
|
||||||
webapp: webapp/node_modules screenshots
|
webapp: webapp/node_modules
|
||||||
cd webapp && npx webpack --config webpack.config.js
|
cd webapp && npx webpack --config webpack.config.js
|
||||||
cp -r webapp/dist/* ./docs/
|
cp -r webapp/dist/* ./docs/
|
||||||
cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/
|
cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/
|
||||||
|
|
36
README.md
36
README.md
|
@ -1,11 +1,15 @@
|
||||||
# Green Spider
|
# Green Spider
|
||||||
|
|
||||||
Initiative und Tools zur Förderung eines benutzer*innenfreundlichen Auftritts von Bündnis 90/Die Grünen im Web
|
Initiative und Tools zur Förderung eines benutzer*innenfreundlichen Auftritts von Bündnis 90/Die Grünen im Web.
|
||||||
|
|
||||||
|
Zur Auswertung: [https://green-spider.netzbegruenung.de/](https://green-spider.netzbegruenung.de/)
|
||||||
|
|
||||||
## Tools
|
## Tools
|
||||||
|
|
||||||
- Spider: Sammelt Informationen über Websites von B90/GRÜNE Gliederungen
|
- Spider: Sammelt Informationen über Websites von B90/GRÜNE Gliederungen
|
||||||
|
|
||||||
|
- Screenshotter: Erstellt Seiten-Screenshots. Siehe [netzbegruenung/green-spider-screenshotter](https://github.com/netzbegruenung/green-spider-screenshotter/)
|
||||||
|
|
||||||
- Webapp: Darstellung der Spider-Ergebnisse unter [green-spider.netzbegruenung.de](https://green-spider.netzbegruenung.de/)
|
- Webapp: Darstellung der Spider-Ergebnisse unter [green-spider.netzbegruenung.de](https://green-spider.netzbegruenung.de/)
|
||||||
|
|
||||||
## Aktivitäten
|
## Aktivitäten
|
||||||
|
@ -22,21 +26,28 @@ Zur Kommunikation dient der Chatbegrünung-Kanal [#green-spider](https://chatbeg
|
||||||
|
|
||||||
### Spider ausführen
|
### Spider ausführen
|
||||||
|
|
||||||
Damit werden alle bekannten WWW-Adressen aus [netzbegruenung/green-directory](https://github.com/netzbegruenung/green-directory) geprüft und Daten dazu gesammelt.
|
|
||||||
|
|
||||||
Voraussetzungen:
|
Voraussetzungen:
|
||||||
|
|
||||||
- GNU make
|
- Docker
|
||||||
- Python 3
|
- Schlüssel mit Schreibrecht für die Ergebnis-Datenbank
|
||||||
- virtualenv
|
|
||||||
|
|
||||||
Starte den Vorgang mit diesem Befehl:
|
Um alle Sites aus [netzbegruenung/green-directory](https://github.com/netzbegruenung/green-directory) zu spidern:
|
||||||
|
|
||||||
```nohighlight
|
```nohighlight
|
||||||
|
make spiderjobs
|
||||||
make spider
|
make spider
|
||||||
```
|
```
|
||||||
|
|
||||||
Das Ergebnis ist die Datei `webapp/dist/data/spider_result.json`. Wenn Du die neuen Daten ins Repository einspielen möchtest, erstelle bitte einen Pull Request.
|
Alternativ kann, wie im nachfolgenden Beispiel gezeigt, das Spidern einer einzelnen URL angestoßen werden. Diese muss nicht zwingend Teil des `green-directory` sein.
|
||||||
|
|
||||||
|
```nohighlight
|
||||||
|
docker run --rm -ti \
|
||||||
|
-v $PWD/secrets:/secrets spider \
|
||||||
|
spider.py --credentials-path /secrets/datastore-writer.json \
|
||||||
|
jobs --url https://www.trittin.de/
|
||||||
|
|
||||||
|
make spider
|
||||||
|
```
|
||||||
|
|
||||||
### Screenshots erstellen
|
### Screenshots erstellen
|
||||||
|
|
||||||
|
@ -44,16 +55,17 @@ Siehe [green-spider-screenshotter](https://github.com/netzbegruenung/green-spide
|
||||||
|
|
||||||
### Webapp aktualisieren
|
### Webapp aktualisieren
|
||||||
|
|
||||||
Die unter https://netzbegruenung.github.io/green-spider/ veröffentlichte Webapp zeigt den Inhalt des [docs](https://github.com/netzbegruenung/green-spider/tree/master/docs) Verzeichnisses für den `master` Branch dieses repositories an. Dieser kann automatisch neu erzeugt werden.
|
|
||||||
|
|
||||||
Voraussetzungen:
|
Voraussetzungen:
|
||||||
|
|
||||||
- npm
|
- npm
|
||||||
- Service-Account JSON-Datei für den Lesezugriff auf Screenshot-Daten
|
- Docker
|
||||||
|
- Schlüssel mit Leserecht für Screenshot- und Ergebnis-Datenbank
|
||||||
|
|
||||||
Um den Inhalt des docs-Verzeichnisses zu aktualisieren, gibt es im Makefile dieses Kommando:
|
Die beiden nachfolgenden Kommandos erzeugen die JSON-Exporte der Spider-Ergebnisse
|
||||||
|
und Screenshots und aktualisieren die Webapp.
|
||||||
|
|
||||||
```nohighlight
|
```nohighlight
|
||||||
|
make export
|
||||||
make webapp
|
make webapp
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -1,17 +1,35 @@
|
||||||
|
"""
|
||||||
|
Exports data from the database to JSON files for use in a static webapp
|
||||||
|
"""
|
||||||
|
|
||||||
from google.cloud import datastore
|
from google.cloud import datastore
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
def main():
|
client = None
|
||||||
if len(sys.argv) == 1:
|
|
||||||
print("Error: please provide path to Google Storage API system account JSON file as argument")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
key_path = sys.argv[1]
|
def export_results():
|
||||||
client = datastore.Client.from_service_account_json(key_path)
|
"""
|
||||||
|
Export of the main results data
|
||||||
|
"""
|
||||||
|
out = []
|
||||||
|
|
||||||
|
query = client.query(kind='spider-results')
|
||||||
|
for entity in query.fetch():
|
||||||
|
print(entity.key.name)
|
||||||
|
out.append(dict(entity)["results"])
|
||||||
|
|
||||||
|
output_filename = "/out/spider_result.json"
|
||||||
|
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||||
|
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
def export_screenshots():
|
||||||
|
"""
|
||||||
|
Export of screenshot meta data
|
||||||
|
"""
|
||||||
out = {}
|
out = {}
|
||||||
|
|
||||||
query = client.query(kind='webscreenshot')
|
query = client.query(kind='webscreenshot')
|
||||||
|
@ -19,10 +37,18 @@ def main():
|
||||||
print(item['url'], os.path.basename(item['screenshot_url']))
|
print(item['url'], os.path.basename(item['screenshot_url']))
|
||||||
out[item['url']] = os.path.basename(item['screenshot_url'])
|
out[item['url']] = os.path.basename(item['screenshot_url'])
|
||||||
|
|
||||||
output_filename = "./webapp/dist/data/screenshots.json"
|
output_filename = "/out/screenshots.json"
|
||||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||||
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
if len(sys.argv) == 1:
|
||||||
|
print("Error: please provide path to Google Storage API system account JSON file as argument")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
key_path = sys.argv[1]
|
||||||
|
client = datastore.Client.from_service_account_json(key_path)
|
||||||
|
|
||||||
|
export_screenshots()
|
||||||
|
export_results()
|
551
spider.py
551
spider.py
|
@ -1,68 +1,175 @@
|
||||||
# coding: utf8
|
"""
|
||||||
|
Provides the spider functionality (website checks).
|
||||||
|
"""
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
import argparse
|
||||||
from git import Repo
|
|
||||||
from multiprocessing import Pool
|
|
||||||
from selenium import webdriver
|
|
||||||
from socket import gethostbyname_ex
|
|
||||||
from urllib.parse import urljoin
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
import certifi
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
import requests
|
|
||||||
import shutil
|
import shutil
|
||||||
import statistics
|
import statistics
|
||||||
import sys
|
from datetime import datetime
|
||||||
|
from socket import gethostbyname_ex
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import requests
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from git import Repo
|
||||||
|
from selenium import webdriver
|
||||||
|
from google.cloud import datastore
|
||||||
|
from google.api_core.exceptions import InvalidArgument
|
||||||
|
|
||||||
|
|
||||||
# configuration
|
# configuration
|
||||||
|
|
||||||
# number of parallel processes to use for crawling
|
|
||||||
concurrency = 1
|
|
||||||
|
|
||||||
# connection timeout for website checks (seconds)
|
# connection timeout for website checks (seconds)
|
||||||
connect_timeout = 5
|
CONNECT_TIMEOUT = 5
|
||||||
|
|
||||||
# response timeout for website checks
|
# response timeout for website checks
|
||||||
read_timeout = 10
|
READ_TIMEOUT = 10
|
||||||
|
|
||||||
# Git repo for our data
|
# Git repo for our data
|
||||||
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
|
GREEN_DIRECTORY_REPO = 'https://github.com/netzbegruenung/green-directory.git'
|
||||||
# folder in that repo that holds the data
|
# folder in that repo that holds the data
|
||||||
green_direcory_data_path = 'data/countries/de'
|
GREEN_DIRECTORY_DATA_PATH = 'data/countries/de'
|
||||||
green_directory_local_path = './cache/green-directory'
|
GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
|
||||||
|
|
||||||
result_path = '/out'
|
RESULT_PATH = '/out'
|
||||||
|
|
||||||
siteicons_path = '/icons'
|
SITEICONS_PATH = '/icons'
|
||||||
|
|
||||||
# IP address of the newthinking GCMS server
|
# IP address of the newthinking GCMS server
|
||||||
gcms_ip = "91.102.13.20"
|
GCMS_IP = "91.102.13.20"
|
||||||
|
|
||||||
|
JOB_DATASTORE_KIND = 'spider-jobs'
|
||||||
|
RESULTS_DATASTORE_KIND = 'spider-results'
|
||||||
|
|
||||||
# end configuration
|
# end configuration
|
||||||
|
|
||||||
|
DATASTORE_CLIENT = None
|
||||||
|
|
||||||
|
|
||||||
|
def chunks(the_list, size):
|
||||||
|
"""
|
||||||
|
Yield successive n-sized chunks from list the_list
|
||||||
|
where n = size.
|
||||||
|
"""
|
||||||
|
for i in range(0, len(the_list), size):
|
||||||
|
yield the_list[i:i + size]
|
||||||
|
|
||||||
|
|
||||||
|
def create_jobs(url=None):
|
||||||
|
"""
|
||||||
|
Read all URLs from green directory and fill a job database
|
||||||
|
with one job per URL.
|
||||||
|
|
||||||
|
Alternatively, if the url argument is given, only the given URL
|
||||||
|
will be added as a spider job.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# refresh our local clone of the green directory
|
||||||
|
logging.info("Refreshing green-directory clone")
|
||||||
|
get_green_directory()
|
||||||
|
|
||||||
|
# build the list of website URLs to run checks for
|
||||||
|
logging.info("Processing green-directory")
|
||||||
|
input_entries = []
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
for entry in dir_entries():
|
||||||
|
|
||||||
|
if 'type' not in entry:
|
||||||
|
logging.error("Entry without type")
|
||||||
|
continue
|
||||||
|
if 'urls' not in entry:
|
||||||
|
logging.debug("Entry %s does not have any URLs.", repr_entry(entry))
|
||||||
|
continue
|
||||||
|
|
||||||
|
website_url = None
|
||||||
|
for index in range(len(entry['urls'])):
|
||||||
|
try:
|
||||||
|
if entry['urls'][index]['type'] == "WEBSITE":
|
||||||
|
website_url = entry['urls'][index]['url']
|
||||||
|
if website_url:
|
||||||
|
if url is not None and website_url != url:
|
||||||
|
continue
|
||||||
|
input_entries.append({
|
||||||
|
"url": website_url,
|
||||||
|
"level": entry.get("level"),
|
||||||
|
"state": entry.get("state"),
|
||||||
|
"district": entry.get("district"),
|
||||||
|
"city": entry.get("city"),
|
||||||
|
})
|
||||||
|
count += 1
|
||||||
|
except NameError:
|
||||||
|
logging.error("Error in %s: 'url' key missing (%s)",
|
||||||
|
repr_entry(entry), entry['urls'][index])
|
||||||
|
|
||||||
|
# ensure the passed URL argument is really there, even if not part
|
||||||
|
# of the directory.
|
||||||
|
if url and count == 0:
|
||||||
|
logging.info("Adding job for URL %s which is not part of green-directory", url)
|
||||||
|
input_entries.append({
|
||||||
|
"url": url,
|
||||||
|
"level": None,
|
||||||
|
"state": None,
|
||||||
|
"district": None,
|
||||||
|
"city": None,
|
||||||
|
})
|
||||||
|
|
||||||
|
# randomize order, to distribute requests over servers
|
||||||
|
logging.debug("Shuffling input URLs")
|
||||||
|
random.seed()
|
||||||
|
random.shuffle(input_entries)
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
logging.info("Writing jobs")
|
||||||
|
|
||||||
|
entities = []
|
||||||
|
|
||||||
|
for entry in input_entries:
|
||||||
|
key = DATASTORE_CLIENT.key(JOB_DATASTORE_KIND, entry["url"])
|
||||||
|
entity = datastore.Entity(key=key)
|
||||||
|
entity.update({
|
||||||
|
"created": datetime.utcnow(),
|
||||||
|
"level": entry["level"],
|
||||||
|
"state": entry["state"],
|
||||||
|
"district": entry["district"],
|
||||||
|
"city": entry["city"],
|
||||||
|
})
|
||||||
|
entities.append(entity)
|
||||||
|
|
||||||
|
# commit to DB
|
||||||
|
for chunk in chunks(entities, 300):
|
||||||
|
logging.debug("Writing jobs chunk of length %d", len(chunk))
|
||||||
|
DATASTORE_CLIENT.put_multi(chunk)
|
||||||
|
count += len(chunk)
|
||||||
|
|
||||||
|
logging.info("Writing jobs done, %s jobs added", count)
|
||||||
|
|
||||||
|
|
||||||
def get_green_directory():
|
def get_green_directory():
|
||||||
"""
|
"""
|
||||||
Clones the source of website URLs, the green directory,
|
Clones the source of website URLs, the green directory,
|
||||||
into the local file system using git
|
into the local file system using git
|
||||||
"""
|
"""
|
||||||
if os.path.exists(green_directory_local_path):
|
if os.path.exists(GREEN_DIRECTORY_LOCAL_PATH):
|
||||||
shutil.rmtree(green_directory_local_path)
|
shutil.rmtree(GREEN_DIRECTORY_LOCAL_PATH)
|
||||||
Repo.clone_from(green_directory_repo, green_directory_local_path)
|
Repo.clone_from(GREEN_DIRECTORY_REPO, GREEN_DIRECTORY_LOCAL_PATH)
|
||||||
|
|
||||||
|
|
||||||
def dir_entries():
|
def dir_entries():
|
||||||
"""
|
"""
|
||||||
Iterator over all data files in the cloned green directory
|
Iterator over all data files in the cloned green directory
|
||||||
"""
|
"""
|
||||||
path = os.path.join(green_directory_local_path, green_direcory_data_path)
|
path = os.path.join(GREEN_DIRECTORY_LOCAL_PATH, GREEN_DIRECTORY_DATA_PATH)
|
||||||
for root, _, files in os.walk(path):
|
for root, _, files in os.walk(path):
|
||||||
for fname in files:
|
for fname in files:
|
||||||
|
|
||||||
|
@ -80,14 +187,14 @@ def repr_entry(entry):
|
||||||
Return string representation of a directory entry,
|
Return string representation of a directory entry,
|
||||||
for logging/debugging purposes
|
for logging/debugging purposes
|
||||||
"""
|
"""
|
||||||
r = entry['type']
|
ret = entry['type']
|
||||||
if 'level' in entry:
|
if 'level' in entry:
|
||||||
r += "/" + entry['level']
|
ret += "/" + entry['level']
|
||||||
if 'state' in entry:
|
if 'state' in entry:
|
||||||
r += "/" + entry['state']
|
ret += "/" + entry['state']
|
||||||
if 'district' in entry:
|
if 'district' in entry:
|
||||||
r += "/" + entry['district']
|
ret += "/" + entry['district']
|
||||||
return r
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def derive_test_hostnames(hostname):
|
def derive_test_hostnames(hostname):
|
||||||
|
@ -117,24 +224,25 @@ def reduce_urls(urllist):
|
||||||
that either don't work or lead somewhere else
|
that either don't work or lead somewhere else
|
||||||
"""
|
"""
|
||||||
targets = set()
|
targets = set()
|
||||||
for u in urllist:
|
for url in urllist:
|
||||||
if u['error'] is not None:
|
if url['error'] is not None:
|
||||||
continue
|
continue
|
||||||
if u['redirects_to'] is not None:
|
if url['redirects_to'] is not None:
|
||||||
targets.add(u['redirects_to'])
|
targets.add(url['redirects_to'])
|
||||||
else:
|
else:
|
||||||
targets.add(u['url'])
|
targets.add(url['url'])
|
||||||
return sorted(list(targets))
|
return sorted(list(targets))
|
||||||
|
|
||||||
|
|
||||||
def normalize_title(s):
|
def normalize_title(title):
|
||||||
"""
|
"""
|
||||||
Removes garbage from HTML page titles
|
Removes garbage from HTML page titles
|
||||||
"""
|
"""
|
||||||
s = s.replace('\u00a0', ' ')
|
title = title.replace(u'\u00a0', ' ')
|
||||||
s = s.replace(' ', ' ')
|
title = title.replace(' ', ' ')
|
||||||
s = s.strip()
|
title = title.strip()
|
||||||
return s
|
return title
|
||||||
|
|
||||||
|
|
||||||
def download_icon(icon_url):
|
def download_icon(icon_url):
|
||||||
"""
|
"""
|
||||||
|
@ -150,10 +258,10 @@ def download_icon(icon_url):
|
||||||
}
|
}
|
||||||
|
|
||||||
# Download the icon
|
# Download the icon
|
||||||
r = requests.get(icon_url)
|
req = requests.get(icon_url)
|
||||||
r.raise_for_status()
|
req.raise_for_status()
|
||||||
|
|
||||||
content_hash = hashlib.md5(r.content).hexdigest()
|
content_hash = hashlib.md5(req.content).hexdigest()
|
||||||
extension = ""
|
extension = ""
|
||||||
|
|
||||||
file_name = os.path.basename(icon_url)[-1]
|
file_name = os.path.basename(icon_url)[-1]
|
||||||
|
@ -161,24 +269,25 @@ def download_icon(icon_url):
|
||||||
ext = file_name.split(".")[-1]
|
ext = file_name.split(".")[-1]
|
||||||
if ext != "":
|
if ext != "":
|
||||||
extension = ext
|
extension = ext
|
||||||
|
|
||||||
if extension == "":
|
if extension == "":
|
||||||
# derive from content type
|
# derive from content type
|
||||||
t = r.headers.get('content-type')
|
ctype = req.headers.get('content-type')
|
||||||
try:
|
try:
|
||||||
extension = default_endings[t]
|
extension = default_endings[ctype]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
logging.error("No file ending defined for icon type '%s'" % t)
|
logging.error("No file ending defined for icon type '%s'", ctype)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
filename = content_hash + "." + extension.lower()
|
filename = content_hash + "." + extension.lower()
|
||||||
|
|
||||||
path = siteicons_path + os.path.sep + filename
|
path = SITEICONS_PATH + os.path.sep + filename
|
||||||
with open(path, 'wb') as iconfile:
|
with open(path, 'wb') as iconfile:
|
||||||
iconfile.write(r.content)
|
iconfile.write(req.content)
|
||||||
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
def check_responsiveness(url):
|
def check_responsiveness(url):
|
||||||
"""
|
"""
|
||||||
Checks
|
Checks
|
||||||
|
@ -193,9 +302,9 @@ def check_responsiveness(url):
|
||||||
|
|
||||||
# sizes we check for (width, height)
|
# sizes we check for (width, height)
|
||||||
sizes = (
|
sizes = (
|
||||||
(320,480), # old smartphone
|
(320, 480), # old smartphone
|
||||||
(768,1024), # older tablet or newer smartphone
|
(768, 1024), # older tablet or newer smartphone
|
||||||
(1024,768), # older desktop or horiz. tablet
|
(1024, 768), # older desktop or horiz. tablet
|
||||||
(1920, 1080), # Full HD horizontal
|
(1920, 1080), # Full HD horizontal
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -218,7 +327,8 @@ def check_responsiveness(url):
|
||||||
|
|
||||||
return details
|
return details
|
||||||
|
|
||||||
def check_content(r):
|
|
||||||
|
def check_content(req):
|
||||||
"""
|
"""
|
||||||
Adds details to check regarding content of the page
|
Adds details to check regarding content of the page
|
||||||
|
|
||||||
|
@ -227,10 +337,10 @@ def check_content(r):
|
||||||
"""
|
"""
|
||||||
result = {}
|
result = {}
|
||||||
|
|
||||||
result['encoding'] = r.encoding.lower()
|
result['encoding'] = req.encoding.lower()
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
soup = BeautifulSoup(req.text, 'html.parser')
|
||||||
|
|
||||||
result['html'] = r.text
|
result['html'] = req.text
|
||||||
|
|
||||||
# page title
|
# page title
|
||||||
result['title'] = None
|
result['title'] = None
|
||||||
|
@ -245,47 +355,47 @@ def check_content(r):
|
||||||
result['canonical_link'] = None
|
result['canonical_link'] = None
|
||||||
link = soup.find('link', rel='canonical')
|
link = soup.find('link', rel='canonical')
|
||||||
if link:
|
if link:
|
||||||
result['canonical_link'] = urljoin(r.url, link.get('href'))
|
result['canonical_link'] = urljoin(req.url, link.get('href'))
|
||||||
|
|
||||||
# icon
|
# icon
|
||||||
result['icon'] = None
|
result['icon'] = None
|
||||||
link = soup.find('link', rel=lambda x: x and x.lower()=='icon')
|
link = soup.find('link', rel=lambda x: x and x.lower() == 'icon')
|
||||||
if link:
|
if link:
|
||||||
result['icon'] = urljoin(r.url, link.get('href'))
|
result['icon'] = urljoin(req.url, link.get('href'))
|
||||||
else:
|
else:
|
||||||
link = soup.find('link', rel=lambda x: x and x.lower()=='shortcut icon')
|
link = soup.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
|
||||||
if link:
|
if link:
|
||||||
result['icon'] = urljoin(r.url, link.get('href'))
|
result['icon'] = urljoin(req.url, link.get('href'))
|
||||||
|
|
||||||
# feed links
|
# feed links
|
||||||
result['feeds'] = []
|
result['feeds'] = []
|
||||||
rss_links = soup.find_all('link', type='application/rss+xml')
|
rss_links = soup.find_all('link', type='application/rss+xml')
|
||||||
atom_links = soup.find_all('link', type='application/atom+xml')
|
atom_links = soup.find_all('link', type='application/atom+xml')
|
||||||
|
|
||||||
if len(rss_links) > 0:
|
if rss_links:
|
||||||
for l in rss_links:
|
for link in rss_links:
|
||||||
result['feeds'].append(urljoin(r.url, l.get('href')))
|
result['feeds'].append(urljoin(req.url, link.get('href')))
|
||||||
if len(atom_links) > 0:
|
if atom_links:
|
||||||
for l in rss_links:
|
for link in rss_links:
|
||||||
result['feeds'].append(urljoin(r.url, l.get('href')))
|
result['feeds'].append(urljoin(req.url, link.get('href')))
|
||||||
|
|
||||||
# generator meta tag
|
# generator meta tag
|
||||||
result['generator'] = None
|
result['generator'] = None
|
||||||
if head is not None:
|
if head is not None:
|
||||||
generator = head.select('[name=generator]')
|
generator = head.select('[name=generator]')
|
||||||
if len(generator):
|
if generator:
|
||||||
result['generator'] = generator[0].get('content')
|
result['generator'] = generator[0].get('content')
|
||||||
|
|
||||||
# opengraph meta tags
|
# opengraph meta tags
|
||||||
result['opengraph'] = None
|
result['opengraph'] = None
|
||||||
og = set()
|
opengraph = set()
|
||||||
if head is not None:
|
if head is not None:
|
||||||
for item in head.find_all(property=re.compile('^og:')):
|
for item in head.find_all(property=re.compile('^og:')):
|
||||||
og.add(item.get('property'))
|
opengraph.add(item.get('property'))
|
||||||
for item in head.find_all(itemprop=re.compile('^og:')):
|
for item in head.find_all(itemprop=re.compile('^og:')):
|
||||||
og.add(item.get('itemprop'))
|
opengraph.add(item.get('itemprop'))
|
||||||
if len(og):
|
if opengraph:
|
||||||
result['opengraph'] = sorted(list(og))
|
result['opengraph'] = sorted(list(opengraph))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@ -298,8 +408,8 @@ def collect_ipv4_addresses(hostname_dict):
|
||||||
for item in hostname_dict.values():
|
for item in hostname_dict.values():
|
||||||
if 'ip_addresses' not in item:
|
if 'ip_addresses' not in item:
|
||||||
continue
|
continue
|
||||||
for ip in item['ip_addresses']:
|
for ip_addr in item['ip_addresses']:
|
||||||
ips.add(ip)
|
ips.add(ip_addr)
|
||||||
return sorted(list(ips))
|
return sorted(list(ips))
|
||||||
|
|
||||||
|
|
||||||
|
@ -310,11 +420,11 @@ def parse_generator(generator):
|
||||||
generator = generator.lower()
|
generator = generator.lower()
|
||||||
if 'typo3' in generator:
|
if 'typo3' in generator:
|
||||||
return "typo3"
|
return "typo3"
|
||||||
elif 'wordpress' in generator:
|
if 'wordpress' in generator:
|
||||||
return "wordpress"
|
return "wordpress"
|
||||||
elif 'drupal' in generator:
|
if 'drupal' in generator:
|
||||||
return "drupal"
|
return "drupal"
|
||||||
elif 'joomla' in generator:
|
if 'joomla' in generator:
|
||||||
return "joomla"
|
return "joomla"
|
||||||
return generator
|
return generator
|
||||||
|
|
||||||
|
@ -328,7 +438,9 @@ def check_site(entry):
|
||||||
4. Run full check on canonical URL
|
4. Run full check on canonical URL
|
||||||
"""
|
"""
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 green-spider/0.1'
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) ' +
|
||||||
|
'AppleWebKit/537.36 (KHTML, like Gecko) ' +
|
||||||
|
'Chrome/65.0.3325.181 green-spider/0.1'
|
||||||
}
|
}
|
||||||
|
|
||||||
# all the info we'll return for the site
|
# all the info we'll return for the site
|
||||||
|
@ -337,12 +449,13 @@ def check_site(entry):
|
||||||
'input_url': entry['url'],
|
'input_url': entry['url'],
|
||||||
# Meta: Regional and type metadata for the site
|
# Meta: Regional and type metadata for the site
|
||||||
'meta': {
|
'meta': {
|
||||||
'level': entry['level'],
|
'level': entry.get('level'),
|
||||||
'state': entry['state'],
|
'state': entry.get('state'),
|
||||||
'district': entry['district'],
|
'district': entry.get('district'),
|
||||||
'city': entry['city'],
|
'city': entry.get('city'),
|
||||||
},
|
},
|
||||||
# Details: All details we collected about the site (which aren't directly related to the report criteria)
|
# Details: All details we collected about the site (which aren't directly
|
||||||
|
# related to the report criteria)
|
||||||
'details': {
|
'details': {
|
||||||
'hostnames': {},
|
'hostnames': {},
|
||||||
'ipv4_addresses': [],
|
'ipv4_addresses': [],
|
||||||
|
@ -375,18 +488,18 @@ def check_site(entry):
|
||||||
|
|
||||||
# try to resolve hostnames
|
# try to resolve hostnames
|
||||||
processed_hostnames = {}
|
processed_hostnames = {}
|
||||||
for hn in hostnames:
|
for hostname in hostnames:
|
||||||
|
|
||||||
processed_hostnames[hn] = {
|
processed_hostnames[hostname] = {
|
||||||
'resolvable': False,
|
'resolvable': False,
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
hostname, aliases, ip_addresses = gethostbyname_ex(hn)
|
hostname, aliases, ip_addresses = gethostbyname_ex(hostname)
|
||||||
processed_hostnames[hn]['resolvable'] = True
|
processed_hostnames[hostname]['resolvable'] = True
|
||||||
processed_hostnames[hn]['resolved_hostname'] = hostname
|
processed_hostnames[hostname]['resolved_hostname'] = hostname
|
||||||
processed_hostnames[hn]['aliases'] = aliases
|
processed_hostnames[hostname]['aliases'] = aliases
|
||||||
processed_hostnames[hn]['ip_addresses'] = ip_addresses
|
processed_hostnames[hostname]['ip_addresses'] = ip_addresses
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -398,9 +511,9 @@ def check_site(entry):
|
||||||
checked_urls = []
|
checked_urls = []
|
||||||
checked_urls_set = set()
|
checked_urls_set = set()
|
||||||
|
|
||||||
for hn in processed_hostnames.keys():
|
for hostname in processed_hostnames.keys():
|
||||||
|
|
||||||
item = processed_hostnames[hn]
|
item = processed_hostnames[hostname]
|
||||||
|
|
||||||
if not item['resolvable']:
|
if not item['resolvable']:
|
||||||
continue
|
continue
|
||||||
|
@ -421,18 +534,19 @@ def check_site(entry):
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = requests.head(record['url'], headers=headers, allow_redirects=True)
|
req = requests.head(record['url'], headers=headers, allow_redirects=True)
|
||||||
if r.url == url:
|
if req.url == url:
|
||||||
logging.info("URL: %s - status %s" % (record['url'], r.status_code))
|
logging.info("URL: %s - status %s", record['url'], req.status_code)
|
||||||
else:
|
else:
|
||||||
logging.info("URL: %s - status %s - redirects to %s" % (record['url'], r.status_code, r.url))
|
logging.info("URL: %s - status %s - redirects to %s", record['url'],
|
||||||
record['redirects_to'] = r.url
|
req.status_code, req.url)
|
||||||
except Exception as e:
|
record['redirects_to'] = req.url
|
||||||
|
except Exception as exc:
|
||||||
record['error'] = {
|
record['error'] = {
|
||||||
'type': str(type(e)),
|
'type': str(type(exc)),
|
||||||
'message': str(e),
|
'message': str(exc),
|
||||||
}
|
}
|
||||||
logging.info("URL %s: %s %s" % (url, str(type(e)), e))
|
logging.info("URL %s: %s %s", url, str(type(exc)), exc)
|
||||||
|
|
||||||
checked_urls.append(record)
|
checked_urls.append(record)
|
||||||
|
|
||||||
|
@ -442,7 +556,7 @@ def check_site(entry):
|
||||||
# Deeper test for the remaining (canonical) URL(s)
|
# Deeper test for the remaining (canonical) URL(s)
|
||||||
for check_url in result['details']['canonical_urls']:
|
for check_url in result['details']['canonical_urls']:
|
||||||
|
|
||||||
logging.info("Downloading URL %s" % check_url)
|
logging.info("Downloading URL %s", check_url)
|
||||||
|
|
||||||
check = {
|
check = {
|
||||||
'url': check_url,
|
'url': check_url,
|
||||||
|
@ -454,37 +568,38 @@ def check_site(entry):
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
|
req = requests.get(check_url, headers=headers, timeout=(CONNECT_TIMEOUT, READ_TIMEOUT))
|
||||||
check['status_code'] = r.status_code
|
check['status_code'] = req.status_code
|
||||||
check['duration'] = round(r.elapsed.microseconds / 1000)
|
check['duration'] = round(req.elapsed.microseconds / 1000)
|
||||||
|
|
||||||
# Content checks
|
# Content checks
|
||||||
if r.status_code < 300:
|
if req.status_code < 300:
|
||||||
check['content'] = check_content(r)
|
check['content'] = check_content(req)
|
||||||
|
|
||||||
# Responsiveness check
|
# Responsiveness check
|
||||||
try:
|
try:
|
||||||
check['responsive'] = check_responsiveness(check_url)
|
check['responsive'] = check_responsiveness(check_url)
|
||||||
except Exception as e:
|
except Exception as exc:
|
||||||
logging.error("Error when checking responsiveness for '%s': %s" % (check_url, e))
|
logging.error("Error when checking responsiveness for '%s': %s", check_url, exc)
|
||||||
|
|
||||||
except requests.exceptions.ConnectionError as e:
|
except requests.exceptions.ConnectionError as exc:
|
||||||
logging.error(str(e) + " " + check_url)
|
logging.error(str(exc) + " " + check_url)
|
||||||
check['error'] = "connection"
|
check['error'] = "connection"
|
||||||
except requests.exceptions.ReadTimeout as e:
|
except requests.exceptions.ReadTimeout as exc:
|
||||||
logging.error(str(e) + " " + check_url)
|
logging.error(str(exc) + " " + check_url)
|
||||||
check['error'] = "read_timeout"
|
check['error'] = "read_timeout"
|
||||||
except requests.exceptions.Timeout as e:
|
except requests.exceptions.Timeout as exc:
|
||||||
logging.error(str(e) + " " + check_url)
|
logging.error(str(exc) + " " + check_url)
|
||||||
check['error'] = "connection_timeout"
|
check['error'] = "connection_timeout"
|
||||||
except Exception as e:
|
except Exception as exc:
|
||||||
logging.error(str(e) + " " + check_url)
|
logging.error(str(exc) + " " + check_url)
|
||||||
check['error'] = "unknown"
|
check['error'] = "unknown"
|
||||||
|
|
||||||
result['details']['urlchecks'].append(check)
|
result['details']['urlchecks'].append(check)
|
||||||
|
|
||||||
|
|
||||||
result['details']['urlchecks'] = sorted(result['details']['urlchecks'], key=lambda url: url['url'])
|
result['details']['urlchecks'] = sorted(result['details']['urlchecks'],
|
||||||
|
key=lambda url: url['url'])
|
||||||
|
|
||||||
# collect icons
|
# collect icons
|
||||||
icons = set()
|
icons = set()
|
||||||
|
@ -492,24 +607,24 @@ def check_site(entry):
|
||||||
if 'content' not in c:
|
if 'content' not in c:
|
||||||
continue
|
continue
|
||||||
if c['content'] is None:
|
if c['content'] is None:
|
||||||
logging.warning("No content for %s" % entry['url'])
|
logging.warning("No content for %s", entry['url'])
|
||||||
continue
|
continue
|
||||||
if c['content']['icon'] is not None:
|
if c['content']['icon'] is not None:
|
||||||
icons.add(c['content']['icon'])
|
icons.add(c['content']['icon'])
|
||||||
downloaded_icons = set()
|
downloaded_icons = set()
|
||||||
for icon_url in icons:
|
for icon_url in icons:
|
||||||
logging.info("Getting icon %s" % icon_url)
|
logging.info("Getting icon %s", icon_url)
|
||||||
try:
|
try:
|
||||||
downloaded_icons.add(download_icon(icon_url))
|
downloaded_icons.add(download_icon(icon_url))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error("Could not download icon: %s" % e)
|
logging.error("Could not download icon: %s", e)
|
||||||
result['details']['icons'] = sorted(list(downloaded_icons))
|
result['details']['icons'] = sorted(list(downloaded_icons))
|
||||||
|
|
||||||
# collect feeds
|
# collect feeds
|
||||||
feeds = set()
|
feeds = set()
|
||||||
for c in result['details']['urlchecks']:
|
for c in result['details']['urlchecks']:
|
||||||
if c['content'] is None:
|
if c['content'] is None:
|
||||||
logging.warning("No content for %s" % entry['url'])
|
logging.warning("No content for %s", entry['url'])
|
||||||
continue
|
continue
|
||||||
if 'feeds' in c['content'] and len(c['content']['feeds']):
|
if 'feeds' in c['content'] and len(c['content']['feeds']):
|
||||||
for feed in c['content']['feeds']:
|
for feed in c['content']['feeds']:
|
||||||
|
@ -543,7 +658,7 @@ def check_site(entry):
|
||||||
result['details']['cms'] = parse_generator(c['content']['generator'])
|
result['details']['cms'] = parse_generator(c['content']['generator'])
|
||||||
# Qualify certain CMS flavours in more detail
|
# Qualify certain CMS flavours in more detail
|
||||||
if result['details']['cms'] == "typo3":
|
if result['details']['cms'] == "typo3":
|
||||||
if gcms_ip in result['details']['ipv4_addresses']:
|
if GCMS_IP in result['details']['ipv4_addresses']:
|
||||||
result['details']['cms'] = "typo3-gcms"
|
result['details']['cms'] = "typo3-gcms"
|
||||||
elif 'typo3-gruene.de' in c['content']['html']:
|
elif 'typo3-gruene.de' in c['content']['html']:
|
||||||
result['details']['cms'] = "typo3-gruene"
|
result['details']['cms'] = "typo3-gruene"
|
||||||
|
@ -555,7 +670,8 @@ def check_site(entry):
|
||||||
# No generator Tag. Use HTML content.
|
# No generator Tag. Use HTML content.
|
||||||
if 'Urwahl3000' in c['content']['html']:
|
if 'Urwahl3000' in c['content']['html']:
|
||||||
result['details']['cms'] = "wordpress-urwahl"
|
result['details']['cms'] = "wordpress-urwahl"
|
||||||
elif 'josephknowsbest' in c['content']['html'] or 'Joseph-knows-best' in c['content']['html']:
|
elif ('josephknowsbest' in c['content']['html'] or
|
||||||
|
'Joseph-knows-best' in c['content']['html']):
|
||||||
result['details']['cms'] = "wordpress-josephknowsbest"
|
result['details']['cms'] = "wordpress-josephknowsbest"
|
||||||
elif 'wordpress' in c['content']['html']:
|
elif 'wordpress' in c['content']['html']:
|
||||||
result['details']['cms'] = "wordpress"
|
result['details']['cms'] = "wordpress"
|
||||||
|
@ -567,7 +683,7 @@ def check_site(entry):
|
||||||
### Derive criteria
|
### Derive criteria
|
||||||
|
|
||||||
# DNS_RESOLVABLE_IPV4
|
# DNS_RESOLVABLE_IPV4
|
||||||
if len(result['details']['ipv4_addresses']):
|
if result['details']['ipv4_addresses']:
|
||||||
result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
|
result['result']['DNS_RESOLVABLE_IPV4'] = {'value': True, 'score': 1}
|
||||||
|
|
||||||
# SITE_REACHABLE
|
# SITE_REACHABLE
|
||||||
|
@ -584,8 +700,8 @@ def check_site(entry):
|
||||||
|
|
||||||
# WWW_OPTIONAL
|
# WWW_OPTIONAL
|
||||||
num_hostnames = 0
|
num_hostnames = 0
|
||||||
for hn in result['details']['hostnames'].keys():
|
for hostname in result['details']['hostnames'].keys():
|
||||||
item = result['details']['hostnames'][hn]
|
item = result['details']['hostnames'][hostname]
|
||||||
if not item['resolvable']:
|
if not item['resolvable']:
|
||||||
continue
|
continue
|
||||||
num_hostnames += 1
|
num_hostnames += 1
|
||||||
|
@ -600,20 +716,20 @@ def check_site(entry):
|
||||||
else:
|
else:
|
||||||
links = set()
|
links = set()
|
||||||
if result['details']['urlchecks'] is None:
|
if result['details']['urlchecks'] is None:
|
||||||
logging.warning("No urlchecks for %s" % entry['url'])
|
logging.warning("No urlchecks for %s", entry['url'])
|
||||||
else:
|
else:
|
||||||
for item in result['details']['urlchecks']:
|
for item in result['details']['urlchecks']:
|
||||||
if item['content']['canonical_link'] is not None:
|
if item['content'] is not None and item['content']['canonical_link'] is not None:
|
||||||
links.add(item['content']['canonical_link'])
|
links.add(item['content']['canonical_link'])
|
||||||
if len(links) == 1:
|
if len(links) == 1:
|
||||||
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
|
result['result']['CANONICAL_URL'] = {'value': True, 'score': 1}
|
||||||
|
|
||||||
# FAVICON
|
# FAVICON
|
||||||
if len(result['details']['icons']):
|
if result['details']['icons']:
|
||||||
result['result']['FAVICON'] = {'value': True, 'score': 1}
|
result['result']['FAVICON'] = {'value': True, 'score': 1}
|
||||||
|
|
||||||
# FEEDS
|
# FEEDS
|
||||||
if len(result['details']['feeds']):
|
if result['details']['feeds']:
|
||||||
result['result']['FEEDS'] = {'value': True, 'score': 1}
|
result['result']['FEEDS'] = {'value': True, 'score': 1}
|
||||||
|
|
||||||
# HTTP_RESPONSE_DURATION
|
# HTTP_RESPONSE_DURATION
|
||||||
|
@ -621,17 +737,18 @@ def check_site(entry):
|
||||||
for item in result['details']['urlchecks']:
|
for item in result['details']['urlchecks']:
|
||||||
if item['error'] is None:
|
if item['error'] is None:
|
||||||
durations.append(item['duration'])
|
durations.append(item['duration'])
|
||||||
val = round(statistics.mean(durations))
|
if durations:
|
||||||
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
|
val = round(statistics.mean(durations))
|
||||||
if val < 100:
|
result['result']['HTTP_RESPONSE_DURATION']['value'] = val
|
||||||
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
|
if val < 100:
|
||||||
elif val < 1000:
|
result['result']['HTTP_RESPONSE_DURATION']['score'] = 1
|
||||||
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
|
elif val < 1000:
|
||||||
|
result['result']['HTTP_RESPONSE_DURATION']['score'] = 0.5
|
||||||
|
|
||||||
# RESPONSIVE
|
# RESPONSIVE
|
||||||
if result['details']['responsive'] is not None:
|
if result['details']['responsive'] is not None:
|
||||||
if (result['details']['responsive']['min_width'] < 500 and
|
if (result['details']['responsive']['min_width'] < 500 and
|
||||||
len(result['details']['responsive']['viewport_meta_tag']) > 0):
|
len(result['details']['responsive']['viewport_meta_tag']) > 0):
|
||||||
result['result']['RESPONSIVE']['value'] = True
|
result['result']['RESPONSIVE']['value'] = True
|
||||||
result['result']['RESPONSIVE']['score'] = 1
|
result['result']['RESPONSIVE']['score'] = 1
|
||||||
|
|
||||||
|
@ -649,87 +766,91 @@ def check_site(entry):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def get_job_from_queue():
|
||||||
"""
|
"""
|
||||||
Bringing it all together
|
Returns a URL from the queue
|
||||||
"""
|
"""
|
||||||
logging.basicConfig(level=logging.INFO)
|
out = None
|
||||||
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
|
|
||||||
|
|
||||||
# refresh our local clone of the green directory
|
with DATASTORE_CLIENT.transaction():
|
||||||
get_green_directory()
|
query = DATASTORE_CLIENT.query(kind=JOB_DATASTORE_KIND)
|
||||||
|
for entity in query.fetch(limit=1):
|
||||||
|
logging.debug("Got job: %s", entity)
|
||||||
|
out = dict(entity)
|
||||||
|
out["url"] = entity.key.name
|
||||||
|
DATASTORE_CLIENT.delete(entity.key)
|
||||||
|
|
||||||
# build the list of website URLs to run checks for
|
return out
|
||||||
logging.info("Processing green-directory")
|
|
||||||
input_entries = []
|
|
||||||
|
|
||||||
for entry in dir_entries():
|
def work_of_queue():
|
||||||
|
"""
|
||||||
|
Take job from queue and finish it until there are no more jobs
|
||||||
|
"""
|
||||||
|
while True:
|
||||||
|
job = get_job_from_queue()
|
||||||
|
if job is None:
|
||||||
|
logging.info("No more jobs. Exiting.")
|
||||||
|
break
|
||||||
|
|
||||||
if 'type' not in entry:
|
logging.info("Starting job %s", job["url"])
|
||||||
logging.error("Entry without type")
|
result = check_site(entry=job)
|
||||||
continue
|
#logging.debug(result)
|
||||||
if 'urls' not in entry:
|
logging.info("Job %s finished checks", job["url"])
|
||||||
logging.debug("Entry %s does not have any URLs." % repr_entry(entry))
|
logging.info("Job %s writing to DB", job["url"])
|
||||||
continue
|
|
||||||
|
|
||||||
website_url = None
|
key = DATASTORE_CLIENT.key(RESULTS_DATASTORE_KIND, job["url"])
|
||||||
for n in range(len(entry['urls'])):
|
entity = datastore.Entity(key=key, exclude_from_indexes=['results'])
|
||||||
try:
|
record = {
|
||||||
if entry['urls'][n]['type'] == "WEBSITE":
|
"created": datetime.utcnow(),
|
||||||
website_url = entry['urls'][n]['url']
|
"results": result,
|
||||||
if website_url:
|
}
|
||||||
input_entries.append({
|
entity.update(record)
|
||||||
"url": website_url,
|
try:
|
||||||
"level": entry.get("level"),
|
DATASTORE_CLIENT.put(entity)
|
||||||
"state": entry.get("state"),
|
except InvalidArgument as ex:
|
||||||
"district": entry.get("district"),
|
logging.error("Could not write result: %s", ex)
|
||||||
"city": entry.get("city"),
|
except ex:
|
||||||
})
|
logging.error("Could not write result: %s", ex)
|
||||||
except NameError:
|
|
||||||
logging.error("Error in %s: 'url' key missing (%s)" % (repr_entry(entry), entry['urls'][n]))
|
|
||||||
|
|
||||||
|
|
||||||
# randomize order, to distribute requests over servers
|
|
||||||
logging.info("Shuffling input URLs")
|
|
||||||
random.seed()
|
|
||||||
random.shuffle(input_entries)
|
|
||||||
|
|
||||||
# run checks
|
|
||||||
logging.info("Starting checks")
|
|
||||||
results = {}
|
|
||||||
|
|
||||||
pool = Pool(concurrency)
|
|
||||||
for ientry in input_entries:
|
|
||||||
logging.info("Submitting %s to job pool" % ientry['url'])
|
|
||||||
results[ientry['url']] = pool.apply_async(check_site, kwds={'entry': ientry})
|
|
||||||
pool.close()
|
|
||||||
pool.join()
|
|
||||||
|
|
||||||
logging.info("Checks are finished")
|
|
||||||
|
|
||||||
# Restructure result from dict of ApplyResult
|
|
||||||
# to list of dicts and sort in stable way
|
|
||||||
json_result = []
|
|
||||||
done = set()
|
|
||||||
|
|
||||||
logging.info("Restructuring results")
|
|
||||||
|
|
||||||
# convert results from ApplyResult to dict
|
|
||||||
for url in sorted(results.keys()):
|
|
||||||
if url not in done:
|
|
||||||
logging.info("Getting result for %s" % url)
|
|
||||||
try:
|
|
||||||
resultsitem = results[url].get()
|
|
||||||
json_result.append(resultsitem)
|
|
||||||
except Exception as e:
|
|
||||||
logging.error("Error getting result for '%s': %s" % (url, e))
|
|
||||||
done.add(url)
|
|
||||||
|
|
||||||
# Write result as JSON
|
|
||||||
output_filename = os.path.join(result_path, "spider_result.json")
|
|
||||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
|
||||||
json.dump(json_result, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
"""
|
||||||
|
Bringing it all together
|
||||||
|
"""
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument('--credentials-path', dest='credentials_path',
|
||||||
|
help='Path to the service account credentials JSON file',
|
||||||
|
default='/secrets/service-account.json')
|
||||||
|
parser.add_argument('--loglevel', help="error, warn, info, or debug (default: info)",
|
||||||
|
default='info')
|
||||||
|
|
||||||
|
subparsers = parser.add_subparsers(help='sub-command help', dest='command')
|
||||||
|
|
||||||
|
subparsers.add_parser('spider', help='Take jobs off the queue and spider')
|
||||||
|
|
||||||
|
jobs_parser = subparsers.add_parser('jobs', help='Create jobs for the queue')
|
||||||
|
|
||||||
|
jobs_parser.add_argument('--url', help='Add a job to spider a URL')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
loglevel = args.loglevel.lower()
|
||||||
|
if loglevel == 'error':
|
||||||
|
logging.basicConfig(level=logging.ERROR)
|
||||||
|
elif loglevel == 'warn':
|
||||||
|
logging.basicConfig(level=logging.WARN)
|
||||||
|
elif loglevel == 'debug':
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
else:
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
loglevel = 'info'
|
||||||
|
|
||||||
|
logging.getLogger("urllib3").setLevel(logging.CRITICAL)
|
||||||
|
|
||||||
|
DATASTORE_CLIENT = datastore.Client.from_service_account_json(args.credentials_path)
|
||||||
|
|
||||||
|
logging.debug("Called command %s", args.command)
|
||||||
|
|
||||||
|
if args.command == 'jobs':
|
||||||
|
create_jobs(args.url)
|
||||||
|
else:
|
||||||
|
work_of_queue()
|
||||||
|
|
|
@ -55,7 +55,7 @@ $(function(){
|
||||||
|
|
||||||
// IPs
|
// IPs
|
||||||
var ips = _.join(item.details.ipv4_addresses, ', ');
|
var ips = _.join(item.details.ipv4_addresses, ', ');
|
||||||
row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center" data-order="' + (ips === '' ? no : ips) + '"><span class="tt" title="IPv4-Adresse(n) des Servers bzw. der Server">' + (ips === '' ? no : ips) + '</span></td>');
|
row.append('<td class="text '+ (ips === '' ? 'bad' : 'good') +' text-center" data-order="' + ips + '"><span class="tt" title="IPv4-Adresse(n) des Servers bzw. der Server">' + (ips === '' ? no : ips) + '</span></td>');
|
||||||
|
|
||||||
// SITE_REACHABLE
|
// SITE_REACHABLE
|
||||||
var reachable = '<span class="tt" title="Die Site war beim Check erreichbar.">' + yes + '</span>';
|
var reachable = '<span class="tt" title="Die Site war beim Check erreichbar.">' + yes + '</span>';
|
||||||
|
@ -65,10 +65,14 @@ $(function(){
|
||||||
row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center" data-order="'+ (item.result.SITE_REACHABLE.value ? '1' : '0') +'">' + reachable + '</td>');
|
row.append('<td class="'+ (item.result.SITE_REACHABLE.value ? 'good' : 'bad') +' text-center" data-order="'+ (item.result.SITE_REACHABLE.value ? '1' : '0') +'">' + reachable + '</td>');
|
||||||
|
|
||||||
// HTTP_RESPONSE_DURATION
|
// HTTP_RESPONSE_DURATION
|
||||||
var durationClass = 'bad';
|
if (!item.result.SITE_REACHABLE.value || item.result.HTTP_RESPONSE_DURATION.value === null) {
|
||||||
if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
|
row.append('<td class="text bad text-center" data-order="99999999"><span class="tt" title="Nicht anwendbar">' + no + '</span></td>');
|
||||||
if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
|
} else {
|
||||||
row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '"><span class="tt" title="Dauer, bis der Server die Seitenanfrage beantwortet. Unter 100 ms ist sehr gut. Unter 1 Sekunde ist okay.">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</span></td>');
|
var durationClass = 'bad';
|
||||||
|
if (item.result.HTTP_RESPONSE_DURATION.score > 0) { durationClass = 'medium'; }
|
||||||
|
if (item.result.HTTP_RESPONSE_DURATION.score > 0.5) { durationClass = 'good'; }
|
||||||
|
row.append('<td class="text '+ durationClass +' text-center" data-order="' + item.result.HTTP_RESPONSE_DURATION.value + '"><span class="tt" title="Dauer, bis der Server die Seitenanfrage beantwortet. Unter 100 ms ist sehr gut. Unter 1 Sekunde ist okay.">' + item.result.HTTP_RESPONSE_DURATION.value + ' ms</span></td>');
|
||||||
|
}
|
||||||
|
|
||||||
// FAVICON
|
// FAVICON
|
||||||
var icon = item.result.FAVICON.value && (item.details.icons[0] != null);
|
var icon = item.result.FAVICON.value && (item.details.icons[0] != null);
|
||||||
|
@ -103,7 +107,7 @@ $(function(){
|
||||||
|
|
||||||
// screenshots
|
// screenshots
|
||||||
var screenshot = false;
|
var screenshot = false;
|
||||||
if (item.details.canonical_urls.length > 0) {
|
if (item.details.canonical_urls && item.details.canonical_urls.length > 0) {
|
||||||
if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
|
if (typeof screenshots[item.details.canonical_urls[0]] !== 'undefined') {
|
||||||
var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
|
var surl = 'http://green-spider-screenshots.sendung.de/320x640/'+screenshots[item.details.canonical_urls[0]];
|
||||||
var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];
|
var lurl = 'http://green-spider-screenshots.sendung.de/1500x1500/'+screenshots[item.details.canonical_urls[0]];
|
||||||
|
|
Loading…
Reference in a new issue