Merge pull request #60 from netzbegruenung/download-icons

Remove downloading of icons
2018-08-28 20:51:05 +02:00 · 2018-08-28 20:51:05 +02:00 · 0210e03d5d
parent 89f5f4fbe5 e2077fe12b
commit 0210e03d5d
9 changed files with 24164 additions and 23084 deletions
--- a/2
+++ b/2
@ -12,7 +12,7 @@ RUN apt-get update \
  && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && dpkg -i google-chrome-stable_current_amd64.deb \
    && rm google-chrome-stable_current_amd64.deb \
-  && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 \
+  && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 \
  && wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
    && unzip chromedriver_linux64.zip \
    && rm chromedriver_linux64.zip \
--- a/8
+++ b/8
@ -20,22 +20,25 @@ spiderjobs: dockerimage
 spider: dockerimage
 	docker run --rm -ti \
 		-v $(PWD)/webapp/dist/data:/out \
 		-v $(PWD)/docs/siteicons:/icons \
 		-v $(PWD)/secrets:/secrets \
 		spider spider.py \
 		--credentials-path /secrets/datastore-writer.json \
 		--loglevel debug \
 		spider
 # run spider tests
 test: dockerimage
 	docker run --rm -ti spider /spider_test.py
-export:
+# export JSON data for the webapp
 export: dockerimage
 	docker run --rm -ti \
 		-v $(PWD)/webapp/dist/data:/out \
 		-v $(PWD)/secrets:/secrets \
 		-v $(PWD)/docs/siteicons:/icons \
 		spider data_export.py /secrets/datastore-reader.json
 # NodeJS modules for the webapp creation
 webapp/node_modules:
 	cd webapp && npm install
@ -46,5 +49,6 @@ webapp: webapp/node_modules
 	cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/
 	rm webapp/dist/bundle.js
 # Run a dev server for the webapp
 serve-webapp:
 	cd docs && ../venv/bin/python -m http.server
--- a/data_export.py
+++ b/data_export.py
@ -3,10 +3,16 @@ Exports data from the database to JSON files for use in a static webapp
 """
 from google.cloud import datastore
 import hashlib
 import json
 import logging
 import sys
 import os
 import requests
 SITEICONS_PATH = "/icons"
 client = None
@ -18,9 +24,26 @@ def export_results():
    query = client.query(kind='spider-results')
    for entity in query.fetch():
-        print(entity.key.name)
+        logging.debug(entity.key.name)
        out.append(dict(entity)["results"])
    # load icons, reformat icons details
    for index in range(len(out)):
        if "details" not in out[index]:
            continue
        if "icons" not in out[index]["details"]:
            continue
        urls = out[index]["details"]["icons"]
        out[index]["details"]["icons"] = {}
        for url in urls:
            if not (url.startswith("http://") or url.startswith("https://")):
                logging.debug("Skipping icon %s", url)
                continue
            logging.debug("Dowloading icon %s", url)
            filename = download_icon(url)
            if filename:
                out[index]["details"]["icons"][url] = filename
    output_filename = "/out/spider_result.json"
    with open(output_filename, 'w', encoding="utf8") as jsonfile:
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
@ -34,7 +57,7 @@ def export_screenshots():
    query = client.query(kind='webscreenshot')
    for item in query.fetch():
-        print(item['url'], os.path.basename(item['screenshot_url']))
+        logging.debug(item['url'], os.path.basename(item['screenshot_url']))
        out[item['url']] = os.path.basename(item['screenshot_url'])
    output_filename = "/out/screenshots.json"
@ -42,7 +65,62 @@ def export_screenshots():
        json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
 def download_icon(icon_url):
    """
    Download an icon from the given URL and store it in SITEICONS_PATH
    with a file name of <hash>.<ending>

    <hash> is the MD5 hex digest of the downloaded content. <ending> is
    taken from the file name in the URL path if that has an alphanumeric
    extension, otherwise it is derived from the response content type.

    Returns the file name (without directory) on success. Returns None
    if the download fails, the server responds with a status code >= 400,
    or no file ending can be determined.
    """
    # Local import so this function does not rely on a module-level
    # import that may not be present.
    from urllib.parse import urlparse

    default_endings = {
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
    }

    # Download the icon. This is best effort: a failure for a single icon
    # must not abort the whole export, so it is logged and skipped.
    # Catching Exception (not a bare except) keeps KeyboardInterrupt and
    # SystemExit working.
    try:
        req = requests.get(icon_url, timeout=10)
    except Exception as exc:
        logging.warning("Could not download icon %s: %s", icon_url, exc)
        return None
    if req.status_code >= 400:
        logging.warning("Got status %s for icon %s", req.status_code, icon_url)
        return None

    content_hash = hashlib.md5(req.content).hexdigest()

    # Only look at the URL path, so that a query string or fragment
    # (e.g. "favicon.ico?v=2") does not end up in the file ending.
    file_name = os.path.basename(urlparse(icon_url).path)
    extension = os.path.splitext(file_name)[1].lstrip(".")
    if not extension.isalnum():
        # Missing or odd-looking ending: fall back to the content type
        extension = ""

    if extension == "":
        # derive from content type, ignoring parameters like "; charset=..."
        ctype = req.headers.get('content-type')
        mime_type = (ctype or "").split(";")[0].strip().lower()
        extension = default_endings.get(mime_type, "")
        if extension == "":
            logging.error("No file ending defined for icon type '%s'", ctype)
            return None

    filename = content_hash + "." + extension.lower()
    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)
    return filename
 if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    if len(sys.argv) == 1:
        print("Error: please provide path to Google Storage API system account JSON file as argument")
        sys.exit(1)
@ -50,5 +128,5 @@ if __name__ == "__main__":
    key_path = sys.argv[1]
    client = datastore.Client.from_service_account_json(key_path)
-    export_screenshots()
+    #export_screenshots()
    export_results()
--- a/docs/bundle.js
+++ b/docs/bundle.js
--- a/docs/data/spider_result.json
+++ b/docs/data/spider_result.json
--- a/docs/siteicons/83d74cceb607c6b19336d72f877ef474.jpg
+++ b/docs/siteicons/83d74cceb607c6b19336d72f877ef474.jpg
--- a/spider.py
+++ b/spider.py
@ -3,7 +3,6 @@ Provides the spider functionality (website checks).
 """
 import argparse
 import hashlib
 import json
 import logging
 import os
@ -18,11 +17,13 @@ from urllib.parse import urlparse
 import requests
 import yaml
 import tenacity
 from bs4 import BeautifulSoup
 from git import Repo
 from selenium import webdriver
 from google.cloud import datastore
 from google.api_core.exceptions import Aborted
 from google.api_core.exceptions import InvalidArgument
@ -42,8 +43,6 @@ GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
 RESULT_PATH = '/out'
 SITEICONS_PATH = '/icons'
 # IP address of the newthinking GCMS server
 GCMS_IP = "91.102.13.20"
@ -244,50 +243,6 @@ def normalize_title(title):
    return title
 def download_icon(icon_url):
    """
    Download an icon from the given URL and store it in SITEICONS_PATH
    with a file name of <hash>.<ending>

    <hash> is the MD5 hex digest of the downloaded content. <ending> is
    taken from the file name in the URL path if that has an alphanumeric
    extension, otherwise it is derived from the response content type.

    Returns the file name (without directory), or None if no file ending
    can be determined. Raises a requests exception if the download fails,
    times out, or the server responds with an HTTP error status.
    """
    default_endings = {
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
    }

    # Download the icon. The timeout keeps one unresponsive server from
    # blocking the spider forever; errors propagate to the caller.
    req = requests.get(icon_url, timeout=10)
    req.raise_for_status()

    content_hash = hashlib.md5(req.content).hexdigest()

    # Only look at the URL path, so that a query string or fragment
    # (e.g. "favicon.ico?v=2") does not end up in the file ending.
    file_name = os.path.basename(urlparse(icon_url).path)
    extension = os.path.splitext(file_name)[1].lstrip(".")
    if not extension.isalnum():
        # Missing or odd-looking ending: fall back to the content type
        extension = ""

    if extension == "":
        # derive from content type, ignoring parameters like "; charset=..."
        ctype = req.headers.get('content-type')
        mime_type = (ctype or "").split(";")[0].strip().lower()
        extension = default_endings.get(mime_type, "")
        if extension == "":
            logging.error("No file ending defined for icon type '%s'", ctype)
            return None

    filename = content_hash + "." + extension.lower()
    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)
    return filename
 def check_responsiveness(url):
    """
    Checks
@ -611,14 +566,7 @@ def check_site(entry):
            continue
        if c['content']['icon'] is not None:
            icons.add(c['content']['icon'])
-    downloaded_icons = set()
+    result['details']['icons'] = sorted(list(icons))
    for icon_url in icons:
        logging.info("Getting icon %s", icon_url)
        try:
            downloaded_icons.add(download_icon(icon_url))
        except Exception as e:
            logging.error("Could not download icon: %s", e)
    result['details']['icons'] = sorted(list(downloaded_icons))
    # collect feeds
    feeds = set()
@ -766,6 +714,8 @@ def check_site(entry):
    return result
@tenacity.retry(wait=tenacity.wait_exponential(),
                retry=tenacity.retry_if_exception_type(Aborted))
 def get_job_from_queue():
    """
    Returns a URL from the queue
--- a/webapp/dist/data/spider_result.json
+++ b/webapp/dist/data/spider_result.json
--- a/webapp/src/index.js
+++ b/webapp/src/index.js
@ -75,8 +75,8 @@ $(function(){
        }
        // FAVICON
-        var icon = item.result.FAVICON.value && (item.details.icons[0] != null);
+        var icon = item.result.FAVICON.value && (Object.keys(item.details.icons).length > 0);
-        var iconFile = (icon && (item.details.icons[0] != null) ? item.details.icons[0] : '');
+        var iconFile = icon ? Object.values(item.details.icons)[0] : '';
        var noicon = '<span class="tt" title="Diese Site hat kein Icon.">'+ no +'</span>'
        var icontag = (icon ? ('<img src="/siteicons/' + iconFile + '" class="siteicon tt" title="Die Site verweist auf das gezeigte Icon.">') : noicon);
        row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center" data-order="'+ iconFile +'">' + icontag + '</td>');