green-spider/export/__init__.py

152 lines
4.6 KiB
Python
Raw Normal View History

2018-08-23 09:37:02 +02:00
"""
Exports data from the database to JSON files for use in a static webapp
"""
from hashlib import md5
2018-04-17 20:45:51 +02:00
import json
2018-08-27 23:39:00 +02:00
import logging
2018-04-17 20:45:51 +02:00
import sys
2018-08-15 22:02:20 +02:00
import os
2018-04-17 20:45:51 +02:00
2018-08-27 23:39:00 +02:00
import requests
# Directory that download_icon() writes the downloaded icon files to
SITEICONS_PATH = "/icons"
2018-04-17 20:45:51 +02:00
def export_results(client, entity_kind):
    """
    Export of the main results data

    Fetches all entities of kind ``entity_kind`` through ``client``,
    downloads the icons referenced in each entity's ``html_head`` check
    and writes two JSON files:

    - ``/out/spider_result.json``: the full records
    - ``/out/spider_result_compact.json``: the same records with ``checks``
      removed and a ``cms`` list (values of the ``generator`` check) added

    Each record's ``icons`` entry maps the page URL an icon was referenced
    from to the file name returned by ``download_icon``.
    """
    out = []

    # Load data from database
    query = client.query(kind=entity_kind)
    for entity in query.fetch():
        logging.debug(entity.key.name)
        out.append({
            'input_url': entity.key.name,
            'resulting_urls': entity.get('checks').get('url_canonicalization'),
            'created': entity.get('created').isoformat(),
            'meta': entity.get('meta'),
            'checks': entity.get('checks'),
            'rating': entity.get('rating'),
            'score': entity.get('score'),
            'icons': {},
        })

    # load icons, reformat icons details
    #
    # Maps icon URL -> stored file name (None when the download failed).
    # This makes sure every icon URL is requested at most once, while pages
    # sharing an icon still all get their entry.
    icon_files = {}

    for record in out:
        assert "checks" in record
        assert "html_head" in record["checks"]

        record["icons"] = {}
        for url, head in record['checks']['html_head'].items():
            assert 'link_icon' in head
            iconurl = head['link_icon']

            # data: URIs carry the image inline, there is nothing to download
            if iconurl is None or iconurl.startswith("data:"):
                continue

            if iconurl not in icon_files:
                logging.debug("Dowloading icon %s", iconurl)
                icon_files[iconurl] = download_icon(iconurl)

            filename = icon_files[iconurl]
            if filename:
                # Key by the page URL this icon belongs to. (Using a
                # variable left over from another loop here would make all
                # icons of a site overwrite the same key.)
                record["icons"][url] = filename

    output_filename = "/out/spider_result.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)

    # compact version
    # The records are modified in place, which is fine since the full
    # version has already been written above.
    output_filename = "/out/spider_result_compact.json"
    for record in out:
        record['cms'] = list(record['checks']['generator'].values())
        del record['checks']
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
2018-04-17 20:45:51 +02:00
def export_screenshots(client):
    """
    Export of screenshot meta data

    Queries all entities of kind 'webscreenshot' and writes a JSON object
    to /out/screenshots.json that maps each item's 'url' to the base name
    of its 'screenshot_url'.

    If any item has no 'screenshot_url', an error is logged and the
    function returns without writing the file.
    """
    screenshots = {}

    for entry in client.query(kind='webscreenshot').fetch():
        if 'screenshot_url' not in entry:
            logging.error("Export failed. No 'screenshot_url' attribute set in dataset. %s\n" % entry)
            return

        page_url = entry['url']
        screenshot_url = entry['screenshot_url']
        logging.debug("url: %s, screenshot_url: %s" % (page_url, screenshot_url))

        # Only the file name part of the screenshot URL is exported
        screenshots[page_url] = os.path.basename(screenshot_url)

    with open("/out/screenshots.json", 'w', encoding="utf8") as jsonfile:
        json.dump(screenshots, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
2018-04-17 20:45:51 +02:00
2018-08-27 23:39:00 +02:00
def download_icon(icon_url):
    """
    Download an icon from the given URL and store it with
    a file name of <hash>.<ending>

    <hash> is the MD5 hex digest of the downloaded content. <ending> is
    taken from the file name in the URL path if it has one, otherwise it
    is derived from the Content-Type response header.

    Returns the file name (without directory) of the icon stored in
    SITEICONS_PATH, or None if the icon could not be downloaded or no
    file ending could be determined.
    """
    # Local import, as this is only needed here
    from urllib.parse import urlsplit

    default_endings = {
        "image/x-ico": "ico",
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
        "image/gif": "gif",
    }

    # Download the icon
    try:
        req = requests.get(icon_url, timeout=10)
    except Exception as exc:
        # Best effort: a single broken icon URL must not abort the export.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # pass through.
        logging.warning("Could not download icon %s: %s", icon_url, exc)
        return None
    if req.status_code >= 400:
        return None

    content_hash = md5(req.content).hexdigest()

    # First choice: the ending of the file name in the URL path. Only the
    # path is looked at, so that neither a query string nor the host name
    # (for URLs without a path) is mistaken for a file name.
    try:
        file_name = urlsplit(icon_url).path.rsplit("/", 1)[-1]
    except ValueError:
        file_name = ""

    extension = ""
    if "." in file_name:
        ext = file_name.rsplit(".", 1)[-1]
        # The URL comes from a crawled page, so only accept plain
        # alphanumeric endings for use in a local file name
        if ext.isalnum():
            extension = ext

    if extension == "":
        # derive from content type
        ctype = req.headers.get('content-type')
        if ctype is None:
            return None
        # Ignore parameters such as "; charset=..." and letter case
        media_type = ctype.split(";", 1)[0].strip().lower()
        try:
            extension = default_endings[media_type]
        except KeyError:
            logging.error("No file ending defined for icon type %s. Not downloading.", ctype)
            return None

    filename = content_hash + "." + extension.lower()
    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)
    return filename