Add icon fetching to data export
This commit is contained in:
parent
4abeb48cec
commit
a463087b0a
4
Makefile
4
Makefile
|
@ -20,7 +20,6 @@ spiderjobs: dockerimage
|
|||
spider: dockerimage
|
||||
docker run --rm -ti \
|
||||
-v $(PWD)/webapp/dist/data:/out \
|
||||
-v $(PWD)/docs/siteicons:/icons \
|
||||
-v $(PWD)/secrets:/secrets \
|
||||
spider spider.py \
|
||||
--credentials-path /secrets/datastore-writer.json \
|
||||
|
@ -32,10 +31,11 @@ test: dockerimage
|
|||
docker run --rm -ti spider /spider_test.py
|
||||
|
||||
# export JSON data for the webapp
|
||||
export:
|
||||
export: dockerimage
|
||||
docker run --rm -ti \
|
||||
-v $(PWD)/webapp/dist/data:/out \
|
||||
-v $(PWD)/secrets:/secrets \
|
||||
-v $(PWD)/docs/siteicons:/icons \
|
||||
spider data_export.py /secrets/datastore-reader.json
|
||||
|
||||
# NodeJS modules for the webapp creation
|
||||
|
|
|
@ -3,10 +3,16 @@ Exports data from the database to JSON files for use in a static webapp
|
|||
"""
|
||||
|
||||
from google.cloud import datastore
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
SITEICONS_PATH = "/icons"
|
||||
|
||||
client = None
|
||||
|
||||
|
@ -18,9 +24,26 @@ def export_results():
|
|||
|
||||
query = client.query(kind='spider-results')
|
||||
for entity in query.fetch():
|
||||
print(entity.key.name)
|
||||
logging.debug(entity.key.name)
|
||||
out.append(dict(entity)["results"])
|
||||
|
||||
# load icons, reformat icons details
|
||||
for index in range(len(out)):
|
||||
if "details" not in out[index]:
|
||||
continue
|
||||
if "icons" not in out[index]["details"]:
|
||||
continue
|
||||
urls = out[index]["details"]["icons"]
|
||||
out[index]["details"]["icons"] = {}
|
||||
for url in urls:
|
||||
if not (url.startswith("http://") or url.startswith("https://")):
|
||||
logging.debug("Skipping icon %s", url)
|
||||
continue
|
||||
logging.debug("Dowloading icon %s", url)
|
||||
filename = download_icon(url)
|
||||
if filename:
|
||||
out[index]["details"]["icons"][url] = filename
|
||||
|
||||
output_filename = "/out/spider_result.json"
|
||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||
|
@ -34,7 +57,7 @@ def export_screenshots():
|
|||
|
||||
query = client.query(kind='webscreenshot')
|
||||
for item in query.fetch():
|
||||
print(item['url'], os.path.basename(item['screenshot_url']))
|
||||
logging.debug(item['url'], os.path.basename(item['screenshot_url']))
|
||||
out[item['url']] = os.path.basename(item['screenshot_url'])
|
||||
|
||||
output_filename = "/out/screenshots.json"
|
||||
|
@ -42,7 +65,62 @@ def export_screenshots():
|
|||
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
|
||||
def download_icon(icon_url):
    """
    Download an icon from the given URL and store it in SITEICONS_PATH
    with a file name of <hash>.<ending>

    <hash> is the MD5 hex digest of the downloaded content, so identical
    icons served from different URLs end up in the same file. <ending> is
    taken from the file name in the URL path if it has one, otherwise it
    is derived from the Content-Type response header.

    Returns the file name (without directory) on success. Returns None if
    the download fails, the server answers with a status code >= 400, or
    no file ending can be determined.
    """
    # Imported locally so this function does not depend on a change to the
    # module's import block.
    from urllib.parse import urlparse

    default_endings = {
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
    }

    # Download the icon. Only request/URL errors are treated as "no icon";
    # KeyboardInterrupt and programming errors are no longer swallowed.
    try:
        req = requests.get(icon_url, timeout=10)
    except (requests.exceptions.RequestException, ValueError) as exc:
        logging.debug("Could not download icon %s: %s", icon_url, exc)
        return None
    if req.status_code >= 400:
        return None

    # MD5 is used as a content fingerprint for the file name only,
    # not for anything security relevant.
    content_hash = hashlib.md5(req.content).hexdigest()
    extension = ""

    # Look at the path component only, so that a query string or fragment
    # ("favicon.ico?v=2") does not leak into the file ending.
    file_name = os.path.basename(urlparse(icon_url).path)

    if "." in file_name:
        ext = file_name.rsplit(".", 1)[-1]
        # Ignore empty or odd endings (e.g. percent-encoded characters)
        # and fall back to the content type instead.
        if ext.isalnum():
            extension = ext

    if extension == "":
        # derive from content type; drop parameters such as "; charset=..."
        ctype = req.headers.get('content-type')
        media_type = (ctype or "").split(";")[0].strip().lower()
        try:
            extension = default_endings[media_type]
        except KeyError:
            logging.error("No file ending defined for icon type '%s'", ctype)
            return None

    filename = content_hash + "." + extension.lower()

    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)

    return filename
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
if len(sys.argv) == 1:
|
||||
print("Error: please provide path to Google Storage API system account JSON file as argument")
|
||||
sys.exit(1)
|
||||
|
@ -50,5 +128,5 @@ if __name__ == "__main__":
|
|||
key_path = sys.argv[1]
|
||||
client = datastore.Client.from_service_account_json(key_path)
|
||||
|
||||
export_screenshots()
|
||||
#export_screenshots()
|
||||
export_results()
|
||||
|
|
Loading…
Reference in New Issue