Add icon fetching to data export

This commit is contained in:
Marian Steinbach 2018-08-27 23:39:00 +02:00
parent 4abeb48cec
commit a463087b0a
2 changed files with 83 additions and 5 deletions

View File

@ -20,7 +20,6 @@ spiderjobs: dockerimage
spider: dockerimage spider: dockerimage
docker run --rm -ti \ docker run --rm -ti \
-v $(PWD)/webapp/dist/data:/out \ -v $(PWD)/webapp/dist/data:/out \
-v $(PWD)/docs/siteicons:/icons \
-v $(PWD)/secrets:/secrets \ -v $(PWD)/secrets:/secrets \
spider spider.py \ spider spider.py \
--credentials-path /secrets/datastore-writer.json \ --credentials-path /secrets/datastore-writer.json \
@ -32,10 +31,11 @@ test: dockerimage
docker run --rm -ti spider /spider_test.py docker run --rm -ti spider /spider_test.py
# export JSON data for the webapp # export JSON data for the webapp
export: export: dockerimage
docker run --rm -ti \ docker run --rm -ti \
-v $(PWD)/webapp/dist/data:/out \ -v $(PWD)/webapp/dist/data:/out \
-v $(PWD)/secrets:/secrets \ -v $(PWD)/secrets:/secrets \
-v $(PWD)/docs/siteicons:/icons \
spider data_export.py /secrets/datastore-reader.json spider data_export.py /secrets/datastore-reader.json
# NodeJS modules for the webapp creation # NodeJS modules for the webapp creation

View File

@ -3,10 +3,16 @@ Exports data from the database to JSON files for use in a static webapp
""" """
from google.cloud import datastore from google.cloud import datastore
import hashlib
import json import json
import logging
import sys import sys
import os import os
import requests
SITEICONS_PATH = "/icons"
client = None client = None
@ -18,9 +24,26 @@ def export_results():
query = client.query(kind='spider-results') query = client.query(kind='spider-results')
for entity in query.fetch(): for entity in query.fetch():
print(entity.key.name) logging.debug(entity.key.name)
out.append(dict(entity)["results"]) out.append(dict(entity)["results"])
# load icons, reformat icons details
for index in range(len(out)):
if "details" not in out[index]:
continue
if "icons" not in out[index]["details"]:
continue
urls = out[index]["details"]["icons"]
out[index]["details"]["icons"] = {}
for url in urls:
if not (url.startswith("http://") or url.startswith("https://")):
logging.debug("Skipping icon %s", url)
continue
logging.debug("Dowloading icon %s", url)
filename = download_icon(url)
if filename:
out[index]["details"]["icons"][url] = filename
output_filename = "/out/spider_result.json" output_filename = "/out/spider_result.json"
with open(output_filename, 'w', encoding="utf8") as jsonfile: with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
@ -34,7 +57,7 @@ def export_screenshots():
query = client.query(kind='webscreenshot') query = client.query(kind='webscreenshot')
for item in query.fetch(): for item in query.fetch():
print(item['url'], os.path.basename(item['screenshot_url'])) logging.debug(item['url'], os.path.basename(item['screenshot_url']))
out[item['url']] = os.path.basename(item['screenshot_url']) out[item['url']] = os.path.basename(item['screenshot_url'])
output_filename = "/out/screenshots.json" output_filename = "/out/screenshots.json"
@ -42,7 +65,62 @@ def export_screenshots():
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
def download_icon(icon_url):
    """
    Download an icon from the given URL and store it with
    a file name of <hash>.<ending>

    <hash> is the MD5 hex digest of the downloaded bytes, so identical
    icons served from different URLs end up in the same file.
    <ending> is taken from the file name in the URL path if it has one,
    otherwise it is derived from the response's Content-Type header.

    The file is written into the directory named by SITEICONS_PATH.

    Returns the file name (without directory) on success, or None if the
    download failed, the server answered with a status code >= 400, or no
    file ending could be determined.
    """
    # Imported locally so this function does not depend on a change to the
    # module's import section.
    from urllib.parse import urlparse

    default_endings = {
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
    }

    # Download the icon. A single broken icon must not abort the export, so
    # any error is logged and reported as "no icon". Unlike a bare `except:`,
    # this does not swallow KeyboardInterrupt or SystemExit.
    try:
        req = requests.get(icon_url, timeout=10)
    except Exception as exc:
        logging.warning("Could not download icon %s: %s", icon_url, exc)
        return None

    if req.status_code >= 400:
        return None

    # MD5 is used only to derive a content-based file name, not for security.
    content_hash = hashlib.md5(req.content).hexdigest()

    # Look for a file ending in the URL path only, so that a query string or
    # fragment (e.g. "favicon.ico?v=2") does not become part of the ending.
    file_name = os.path.basename(urlparse(icon_url).path)
    _, dot, ending = file_name.rpartition(".")
    extension = ending if dot else ""

    if extension == "":
        # derive from content type; ignore parameters such as "; charset=..."
        # and differences in letter case
        ctype = req.headers.get('content-type')
        media_type = (ctype or "").split(";")[0].strip().lower()
        extension = default_endings.get(media_type, "")
        if extension == "":
            logging.error("No file ending defined for icon type '%s'", ctype)
            return None

    filename = content_hash + "." + extension.lower()

    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)

    return filename
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
if len(sys.argv) == 1: if len(sys.argv) == 1:
print("Error: please provide path to Google Storage API system account JSON file as argument") print("Error: please provide path to Google Storage API system account JSON file as argument")
sys.exit(1) sys.exit(1)
@ -50,5 +128,5 @@ if __name__ == "__main__":
key_path = sys.argv[1] key_path = sys.argv[1]
client = datastore.Client.from_service_account_json(key_path) client = datastore.Client.from_service_account_json(key_path)
export_screenshots() #export_screenshots()
export_results() export_results()