green-spider/export/__init__.py

"""
Exports data from the database to JSON files for use in a static webapp
"""

import json
import logging
import os
import sys
from hashlib import md5
from urllib.parse import urlparse

import requests


SITEICONS_PATH = "/icons"

def export_results(client, entity_kind):
    """
    Export of the main results data

    Loads every entity of kind ``entity_kind`` via ``client``, downloads the
    icons referenced by each entity's ``html_head`` check and writes two
    JSON files:

    - ``/out/spider_result.json``: the full records, including ``checks``
    - ``/out/spider_result_compact.json``: the same records with ``checks``
      removed and a ``cms`` list (the values of the ``generator`` check)
      added

    In each record, ``icons`` maps a page URL from ``html_head`` to the file
    name returned by ``download_icon`` for that page's ``link_icon``. Pages
    without an icon, with a ``data:`` icon or with a failed download have no
    entry.
    """
    out = []

    # Load data from database
    query = client.query(kind=entity_kind)
    for entity in query.fetch():
        logging.debug(entity.key.name)
        checks = entity.get('checks')
        out.append({
            'input_url': entity.key.name,
            'resulting_urls': checks.get('url_canonicalization'),
            'created': entity.get('created').isoformat(),
            'meta': entity.get('meta'),
            'checks': checks,
            'rating': entity.get('rating'),
            'score': entity.get('score'),
            'icons': {},
        })

    # load icons, reformat icons details
    #
    # Maps icon URL -> stored file name (None when the download failed), so
    # every icon is fetched at most once but can still be referenced by
    # every site and page that uses it.
    icon_files = {}
    for record in out:
        assert "checks" in record
        assert "html_head" in record["checks"]

        for page_url, head in record['checks']['html_head'].items():
            assert 'link_icon' in head
            iconurl = head['link_icon']
            # Inline data: URIs cannot be downloaded
            if iconurl is None or iconurl.startswith("data:"):
                continue

            if iconurl not in icon_files:
                logging.debug("Dowloading icon %s", iconurl)
                icon_files[iconurl] = download_icon(iconurl)

            filename = icon_files[iconurl]
            if filename:
                # Key by the page that declared the icon
                record["icons"][page_url] = filename

    output_filename = "/out/spider_result.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)

    # compact version: the records are modified in place, which is safe
    # because the full version has already been written above
    output_filename = "/out/spider_result_compact.json"
    for record in out:
        record['cms'] = list(record['checks']['generator'].values())
        del record['checks']
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)


def export_screenshots(client):
    """
    Export of screenshot meta data

    Reads all entities of kind 'webscreenshot' via ``client`` and writes
    ``/out/screenshots.json``, a mapping of each item's 'url' to the base
    file name of its 'screenshot_url'.

    If any item lacks a 'screenshot_url', an error is logged and the export
    is aborted without writing the output file.
    """
    out = {}

    query = client.query(kind='webscreenshot')
    for item in query.fetch():
        if 'screenshot_url' not in item:
            # Arguments are passed to the logger instead of being formatted
            # eagerly, matching the other logging calls in this module
            logging.error("Export failed. No 'screenshot_url' attribute set in dataset. %s\n", item)
            return None
        logging.debug("url: %s, screenshot_url: %s", item['url'], item['screenshot_url'])
        out[item['url']] = os.path.basename(item['screenshot_url'])

    output_filename = "/out/screenshots.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)


def download_icon(icon_url):
    """
    Download an icon from the given URL and store it with
    a file name of <hash>.<ending>

    <hash> is the MD5 hex digest of the downloaded content. <ending> is
    taken from the path of the URL if it is one of the known icon endings,
    otherwise it is derived from the content type of the response.

    Returns the file name (without directory) under which the icon was
    stored in SITEICONS_PATH, or None if the download failed, the server
    answered with a status code >= 400, or no file ending could be
    determined.
    """

    default_endings = {
        "image/x-ico": "ico",
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
        "image/gif": "gif",
    }
    # Endings accepted from the URL itself. The URL comes from crawled
    # pages, so an arbitrary ending must not end up in a file name on disk.
    known_endings = set(default_endings.values()) | {"jpeg"}

    # Download the icon. This is best effort: any failure for a single icon
    # must not abort the whole export, but KeyboardInterrupt/SystemExit are
    # deliberately not swallowed.
    try:
        req = requests.get(icon_url, timeout=10)
    except Exception as exc:
        logging.warning("Could not download icon %s. Details: %s", icon_url, exc)
        return None
    if req.status_code >= 400:
        return None

    content_hash = md5(req.content).hexdigest()

    # Try to get the file ending from the URL path; query string and
    # fragment are ignored (e.g. "favicon.ico?v=2" -> "ico")
    try:
        url_path = urlparse(icon_url).path
    except ValueError:
        url_path = ""
    extension = os.path.splitext(os.path.basename(url_path))[1].lstrip(".").lower()
    if extension not in known_endings:
        extension = ""

    if extension == "":
        # derive from content type
        ctype = req.headers.get('content-type')
        if ctype is None:
            return None

        # Strip parameters such as "; charset=utf-8" and normalize case
        ctype = ctype.split(";")[0].strip().lower()
        try:
            extension = default_endings[ctype]
        except KeyError:
            logging.error("No file ending defined for icon type %s. Not downloading.", ctype)
            return None

    filename = content_hash + "." + extension

    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)

    return filename
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`"""`
			`Exports data from the database to JSON files for use in a static webapp`
			`"""`

Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`from hashlib import md5`
Add site screenshots 2018-04-17 20:45:51 +02:00			`import json`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`import logging`
Add site screenshots 2018-04-17 20:45:51 +02:00			`import sys`
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`import os`
Add site screenshots 2018-04-17 20:45:51 +02:00
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`import requests`


			`SITEICONS_PATH = "/icons"`
Add site screenshots 2018-04-17 20:45:51 +02:00
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`def export_results(client, entity_kind):`
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`"""`
			`Export of the main results data`
			`"""`
			`out = []`

Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`# Load data from database`
			`query = client.query(kind=entity_kind)`
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`for entity in query.fetch():`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`logging.debug(entity.key.name)`
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`out.append({`
			`'input_url': entity.key.name,`
			`'resulting_urls': entity.get('checks').get('url_canonicalization'),`
			`'created': entity.get('created').isoformat(),`
			`'meta': entity.get('meta'),`
			`'checks': entity.get('checks'),`
			`'rating': entity.get('rating'),`
			`'score': entity.get('score'),`
			`'icons': [],`
			`})`
Add spider result export capabilities 2018-08-23 09:37:02 +02:00
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`# load icons, reformat icons details`
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`icons_downloaded = set()`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`for index in range(len(out)):`
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`assert "checks" in out[index]`
			`assert "html_head" in out[index]["checks"]`

			`# collect icons urls`
			`icons = set()`
			`for url in out[index]['checks']['html_head']:`
			`assert 'link_icon' in out[index]['checks']['html_head'][url]`
			`if out[index]['checks']['html_head'][url]['link_icon'] is not None:`
			`iconurl = out[index]['checks']['html_head'][url]['link_icon']`
			`if iconurl.startswith("data:"):`
			`continue`
			`if iconurl in icons_downloaded:`
			`continue`
			`icons.add(iconurl)`

			`out[index]["icons"] = {}`
			`for iconurl in list(icons):`
			`logging.debug("Dowloading icon %s", iconurl)`
			`icons_downloaded.add(iconurl)`
			`filename = download_icon(iconurl)`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`if filename:`
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`out[index]["icons"][url] = filename`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`output_filename = "/out/spider_result.json"`
			`with open(output_filename, 'w', encoding="utf8") as jsonfile:`
			`json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)`
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00
			`# compact version`
			`output_filename = "/out/spider_result_compact.json"`
			`for i in range(len(out)):`
			`out[i]['cms'] = list(out[i]['checks']['generator'].values())`
			`del out[i]['checks']`
			`with open(output_filename, 'w', encoding="utf8") as jsonfile:`
			`json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)`
Add site screenshots 2018-04-17 20:45:51 +02:00

Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`def export_screenshots(client):`
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`"""`
			`Export of screenshot meta data`
			`"""`
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`out = {}`
Add site screenshots 2018-04-17 20:45:51 +02:00
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`query = client.query(kind='webscreenshot')`
			`for item in query.fetch():`
Fix data export logging bug 2018-09-17 17:35:21 +02:00			`if 'screenshot_url' not in item:`
			`logging.error("Export failed. No 'screenshot_url' attribute set in dataset. %s\n" % item)`
			`return`
			`logging.debug("url: %s, screenshot_url: %s" % (item['url'], item['screenshot_url']))`
			`filename = os.path.basename(item['screenshot_url'])`
			`out[item['url']] = filename`
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00
Add spider result export capabilities 2018-08-23 09:37:02 +02:00			`output_filename = "/out/screenshots.json"`
Add site screenshots 2018-04-17 20:45:51 +02:00			`with open(output_filename, 'w', encoding="utf8") as jsonfile:`
Fetch screenshot data from database 2018-08-15 22:02:20 +02:00			`json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)`
Add site screenshots 2018-04-17 20:45:51 +02:00

Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`def download_icon(icon_url):`
			`"""`
			`Download an icon from the given URL and store it with`
			`a file name of <hash>.<ending>`
			`"""`

			`default_endings = {`
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`"image/x-ico": "ico",`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`"image/x-icon": "ico",`
			`"image/vnd.microsoft.icon": "ico",`
			`"image/png": "png",`
			`"image/jpeg": "jpg",`
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`"image/gif": "gif",`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`}`

			`# Download the icon`
			`try:`
			`req = requests.get(icon_url, timeout=10)`
			`except:`
			`return None`
			`if req.status_code >= 400:`
			`return None`

Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`content_hash = md5(req.content).hexdigest()`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`extension = ""`

			`try:`
			`file_name = os.path.basename(icon_url)[-1]`
			`except IndexError as exc:`
Improve error messages in export 2018-10-08 08:42:29 +02:00			`logging.error("Could not get file name from URL %s. Not downloading. Details: %s", icon_url, exc)`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`return None`

			`if file_name != "" and "." in file_name:`
			`ext = file_name.split(".")[-1]`
			`if ext != "":`
			`extension = ext`

			`if extension == "":`
			`# derive from content type`
			`ctype = req.headers.get('content-type')`
Refactor and modularize spider (#70) See PR description for details 2018-10-03 11:05:42 +02:00			`if ctype is None:`
			`return`

Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`try:`
			`extension = default_endings[ctype]`
			`except KeyError:`
Improve error messages in export 2018-10-08 08:42:29 +02:00			`logging.error("No file ending defined for icon type %s. Not downloading.", ctype)`
Add icon fetching to data export 2018-08-27 23:39:00 +02:00			`return None`

			`filename = content_hash + "." + extension.lower()`

			`path = SITEICONS_PATH + os.path.sep + filename`
			`with open(path, 'wb') as iconfile:`
			`iconfile.write(req.content)`

			`return filename`