green-spider/data_export.py

"""
Exports data from the database to JSON files for use in a static webapp
"""

import hashlib
import json
import logging
import os
import sys
from urllib.parse import urlparse

import requests
from google.cloud import datastore


SITEICONS_PATH = "/icons"

client = None

def export_results():
    """
    Export of the main results data

    Fetches every entity of kind ``spider-results`` from the datastore and
    collects its ``results`` property. For each result that has a
    ``details.icons`` entry, the icons are downloaded and the entry is
    replaced by a dict mapping each icon URL to the local file name
    returned by ``download_icon``. URLs that are not http(s) or whose
    download fails are left out of that dict.

    The collected list is written to ``/out/spider_result.json``.
    """
    out = []

    query = client.query(kind='spider-results')
    for entity in query.fetch():
        logging.debug(entity.key.name)
        out.append(dict(entity)["results"])

    # load icons, reformat icons details
    for result in out:
        if "details" not in result or "icons" not in result["details"]:
            continue
        details = result["details"]

        # Tolerate an "icons" entry that is present but empty/None.
        urls = details["icons"] or []
        details["icons"] = {}
        for url in urls:
            if not url.startswith(("http://", "https://")):
                logging.debug("Skipping icon %s", url)
                continue
            logging.debug("Downloading icon %s", url)
            filename = download_icon(url)
            if filename:
                details["icons"][url] = filename

    output_filename = "/out/spider_result.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)


def export_screenshots():
    """
    Export of screenshot meta data

    Fetches every entity of kind ``webscreenshot`` from the datastore and
    writes a JSON object to ``/out/screenshots.json`` that maps each
    entity's ``url`` to the base name of its ``screenshot_url``.
    """
    out = {}

    query = client.query(kind='webscreenshot')
    for item in query.fetch():
        screenshot_name = os.path.basename(item['screenshot_url'])
        # Pass an explicit format string: the URL must never be used as the
        # logging format, otherwise the extra argument cannot be merged in.
        logging.debug("%s %s", item['url'], screenshot_name)
        out[item['url']] = screenshot_name

    output_filename = "/out/screenshots.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)


def download_icon(icon_url):
    """
    Download an icon from the given URL and store it with
    a file name of <hash>.<ending>

    The hash is the MD5 hex digest of the downloaded content. The ending is
    taken from the file name in the URL path when it has an alphanumeric
    one; otherwise it is derived from the response's content type.

    Returns the file name (without directory) of the icon written to
    SITEICONS_PATH, or None if the download failed, the server answered
    with a status code >= 400, or no file ending could be determined.
    """

    default_endings = {
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
    }

    # Download the icon
    try:
        req = requests.get(icon_url, timeout=10)
    except (requests.exceptions.RequestException, ValueError) as exc:
        # Best effort: a broken or unreachable URL only skips this icon.
        logging.warning("Could not download icon %s: %s", icon_url, exc)
        return None
    if req.status_code >= 400:
        return None

    content_hash = hashlib.md5(req.content).hexdigest()

    # Prefer the ending of the file name in the URL path. Only the path is
    # inspected so that "favicon.ico?v=2" yields "ico".
    try:
        url_path = urlparse(icon_url).path
    except ValueError:
        url_path = ""
    extension = os.path.splitext(os.path.basename(url_path))[1][1:]
    if not extension.isalnum():
        # Missing or odd-looking ending: fall back to the content type.
        extension = ""

    if extension == "":
        # derive from content type
        ctype = req.headers.get('content-type')
        # Drop parameters such as "; charset=binary" and normalize the case.
        media_type = (ctype or "").split(";")[0].strip().lower()
        try:
            extension = default_endings[media_type]
        except KeyError:
            logging.error("No file ending defined for icon type '%s'", ctype)
            return None

    filename = content_hash + "." + extension.lower()

    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)

    return filename


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    # The path to the service account JSON key file is the only argument.
    if len(sys.argv) < 2:
        # sys.exit() with a string writes it to stderr and exits with status 1
        sys.exit("Error: please provide path to Google Storage API system account JSON file as argument")

    key_path = sys.argv[1]
    # Module-level client that the export functions read.
    client = datastore.Client.from_service_account_json(key_path)

    # The screenshot export is currently disabled.
    #export_screenshots()
    export_results()
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`"""`
			`Exports data from the database to JSON files for use in a static webapp`
			`"""`

Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`from google.cloud import datastore`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`import hashlib`
Add site screenshots 2018-04-17 20:45:51 +02:00			`import json`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`import logging`
Add site screenshots 2018-04-17 20:45:51 +02:00			`import sys`
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`import os`
Add site screenshots 2018-04-17 20:45:51 +02:00
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`import requests`


			`SITEICONS_PATH = "/icons"`
Add site screenshots 2018-04-17 20:45:51 +02:00
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`client = None`

			`def export_results():`
			`"""`
			`Export of the main results data`
			`"""`
			`out = []`

			`query = client.query(kind='spider-results')`
			`for entity in query.fetch():`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`logging.debug(entity.key.name)`
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`out.append(dict(entity)["results"])`

Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`# load icons, reformat icons details`
			`for index in range(len(out)):`
			`if "details" not in out[index]:`
			`continue`
			`if "icons" not in out[index]["details"]:`
			`continue`
			`urls = out[index]["details"]["icons"]`
			`out[index]["details"]["icons"] = {}`
			`for url in urls:`
			`if not (url.startswith("http://") or url.startswith("https://")):`
			`logging.debug("Skipping icon %s", url)`
			`continue`
			`logging.debug("Dowloading icon %s", url)`
			`filename = download_icon(url)`
			`if filename:`
			`out[index]["details"]["icons"][url] = filename`

Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`output_filename = "/out/spider_result.json"`
			`with open(output_filename, 'w', encoding="utf8") as jsonfile:`
			`json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)`
Add site screenshots 2018-04-17 20:45:51 +02:00

Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`def export_screenshots():`
			`"""`
			`Export of screenshot meta data`
			`"""`
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`out = {}`
Add site screenshots 2018-04-17 20:45:51 +02:00
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`query = client.query(kind='webscreenshot')`
			`for item in query.fetch():`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`logging.debug(item['url'], os.path.basename(item['screenshot_url']))`
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`out[item['url']] = os.path.basename(item['screenshot_url'])`

Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`output_filename = "/out/screenshots.json"`
Add site screenshots 2018-04-17 20:45:51 +02:00			`with open(output_filename, 'w', encoding="utf8") as jsonfile:`
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)`
Add site screenshots 2018-04-17 20:45:51 +02:00

Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`def download_icon(icon_url):`
			`"""`
			`Download an icon from the given URL and store it with`
			`a file name of <hash>.<ending>`
			`"""`

			`default_endings = {`
			`"image/x-icon": "ico",`
			`"image/vnd.microsoft.icon": "ico",`
			`"image/png": "png",`
			`"image/jpeg": "jpg",`
			`}`

			`# Download the icon`
			`try:`
			`req = requests.get(icon_url, timeout=10)`
			`except:`
			`return None`
			`if req.status_code >= 400:`
			`return None`

			`content_hash = hashlib.md5(req.content).hexdigest()`
			`extension = ""`

			`try:`
			`file_name = os.path.basename(icon_url)[-1]`
			`except IndexError as exc:`
			`logging.error("Error in URL %s: %s", icon_url, exc)`
			`return None`

			`if file_name != "" and "." in file_name:`
			`ext = file_name.split(".")[-1]`
			`if ext != "":`
			`extension = ext`

			`if extension == "":`
			`# derive from content type`
			`ctype = req.headers.get('content-type')`
			`try:`
			`extension = default_endings[ctype]`
			`except KeyError:`
			`logging.error("No file ending defined for icon type '%s'", ctype)`
			`return None`

			`filename = content_hash + "." + extension.lower()`

			`path = SITEICONS_PATH + os.path.sep + filename`
			`with open(path, 'wb') as iconfile:`
			`iconfile.write(req.content)`

			`return filename`


Add site screenshots 2018-04-17 20:45:51 +02:00			`if __name__ == "__main__":`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`logging.basicConfig(level=logging.DEBUG)`

Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`if len(sys.argv) == 1:`
			`print("Error: please provide path to Google Storage API system account JSON file as argument")`
			`sys.exit(1)`

			`key_path = sys.argv[1]`
			`client = datastore.Client.from_service_account_json(key_path)`

Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`#export_screenshots()`
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`export_results()`