Merge pull request #60 from netzbegruenung/download-icons

Remove downloading of icons
This commit is contained in:
Marian Steinbach 2018-08-28 20:51:05 +02:00 committed by GitHub
commit 0210e03d5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 24164 additions and 23084 deletions

View File

@ -12,7 +12,7 @@ RUN apt-get update \
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \ && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
&& dpkg -i google-chrome-stable_current_amd64.deb \ && dpkg -i google-chrome-stable_current_amd64.deb \
&& rm google-chrome-stable_current_amd64.deb \ && rm google-chrome-stable_current_amd64.deb \
&& pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 \ && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 \
&& wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \ && wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
&& unzip chromedriver_linux64.zip \ && unzip chromedriver_linux64.zip \
&& rm chromedriver_linux64.zip \ && rm chromedriver_linux64.zip \

View File

@ -20,22 +20,25 @@ spiderjobs: dockerimage
spider: dockerimage spider: dockerimage
docker run --rm -ti \ docker run --rm -ti \
-v $(PWD)/webapp/dist/data:/out \ -v $(PWD)/webapp/dist/data:/out \
-v $(PWD)/docs/siteicons:/icons \
-v $(PWD)/secrets:/secrets \ -v $(PWD)/secrets:/secrets \
spider spider.py \ spider spider.py \
--credentials-path /secrets/datastore-writer.json \ --credentials-path /secrets/datastore-writer.json \
--loglevel debug \ --loglevel debug \
spider spider
# run spider tests
test: dockerimage test: dockerimage
docker run --rm -ti spider /spider_test.py docker run --rm -ti spider /spider_test.py
export: # export JSON data for the webapp
export: dockerimage
docker run --rm -ti \ docker run --rm -ti \
-v $(PWD)/webapp/dist/data:/out \ -v $(PWD)/webapp/dist/data:/out \
-v $(PWD)/secrets:/secrets \ -v $(PWD)/secrets:/secrets \
-v $(PWD)/docs/siteicons:/icons \
spider data_export.py /secrets/datastore-reader.json spider data_export.py /secrets/datastore-reader.json
# NodeJS modules for the webapp creation
webapp/node_modules: webapp/node_modules:
cd webapp && npm install cd webapp && npm install
@ -46,5 +49,6 @@ webapp: webapp/node_modules
cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/ cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/
rm webapp/dist/bundle.js rm webapp/dist/bundle.js
# Run a dev server for the webapp
serve-webapp: serve-webapp:
cd docs && ../venv/bin/python -m http.server cd docs && ../venv/bin/python -m http.server

View File

@ -3,10 +3,16 @@ Exports data from the database to JSON files for use in a static webapp
""" """
from google.cloud import datastore from google.cloud import datastore
import hashlib
import json import json
import logging
import sys import sys
import os import os
import requests
SITEICONS_PATH = "/icons"
client = None client = None
@ -18,9 +24,26 @@ def export_results():
query = client.query(kind='spider-results') query = client.query(kind='spider-results')
for entity in query.fetch(): for entity in query.fetch():
print(entity.key.name) logging.debug(entity.key.name)
out.append(dict(entity)["results"]) out.append(dict(entity)["results"])
# load icons, reformat icons details
for index in range(len(out)):
if "details" not in out[index]:
continue
if "icons" not in out[index]["details"]:
continue
urls = out[index]["details"]["icons"]
out[index]["details"]["icons"] = {}
for url in urls:
if not (url.startswith("http://") or url.startswith("https://")):
logging.debug("Skipping icon %s", url)
continue
logging.debug("Dowloading icon %s", url)
filename = download_icon(url)
if filename:
out[index]["details"]["icons"][url] = filename
output_filename = "/out/spider_result.json" output_filename = "/out/spider_result.json"
with open(output_filename, 'w', encoding="utf8") as jsonfile: with open(output_filename, 'w', encoding="utf8") as jsonfile:
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
@ -34,7 +57,7 @@ def export_screenshots():
query = client.query(kind='webscreenshot') query = client.query(kind='webscreenshot')
for item in query.fetch(): for item in query.fetch():
print(item['url'], os.path.basename(item['screenshot_url'])) logging.debug(item['url'], os.path.basename(item['screenshot_url']))
out[item['url']] = os.path.basename(item['screenshot_url']) out[item['url']] = os.path.basename(item['screenshot_url'])
output_filename = "/out/screenshots.json" output_filename = "/out/screenshots.json"
@ -42,7 +65,62 @@ def export_screenshots():
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
def download_icon(icon_url):
    """
    Download an icon from the given URL and store it in SITEICONS_PATH
    with a file name of <hash>.<ending>

    <hash> is the MD5 hex digest of the downloaded content. <ending> is
    taken from the file name in the URL path if it has one; otherwise it
    is derived from the Content-Type response header.

    Returns the stored file name (without directory), or None if the
    download failed, the server answered with a status code >= 400, the
    URL could not be parsed, or no file ending could be determined.
    """
    # Function-scope import so this block does not depend on a change to
    # the module's import section.
    from urllib.parse import urlparse

    default_endings = {
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
    }

    # Download the icon. This is deliberately best-effort: one broken icon
    # URL must not abort the export. `except Exception` (instead of a bare
    # `except:`) lets KeyboardInterrupt and SystemExit propagate.
    try:
        req = requests.get(icon_url, timeout=10)
    except Exception as exc:
        logging.debug("Could not download icon %s: %s", icon_url, exc)
        return None
    if req.status_code >= 400:
        return None

    content_hash = hashlib.md5(req.content).hexdigest()

    # Only look at the path component, so that query strings and fragments
    # (e.g. "favicon.ico?v=2") do not end up in the file ending.
    try:
        file_name = os.path.basename(urlparse(icon_url).path)
    except ValueError as exc:
        logging.error("Error in URL %s: %s", icon_url, exc)
        return None

    # Everything after the last dot of the file name, if there is a dot.
    _, dot, ext = file_name.rpartition(".")
    extension = ext if dot else ""

    if extension == "":
        # derive from content type
        ctype = req.headers.get('content-type')
        # Drop parameters such as "; charset=utf-8" before the lookup.
        mime_type = (ctype or "").split(";")[0].strip().lower()
        try:
            extension = default_endings[mime_type]
        except KeyError:
            logging.error("No file ending defined for icon type '%s'", ctype)
            return None

    filename = content_hash + "." + extension.lower()

    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)

    return filename
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
if len(sys.argv) == 1: if len(sys.argv) == 1:
print("Error: please provide path to Google Storage API system account JSON file as argument") print("Error: please provide path to Google Storage API system account JSON file as argument")
sys.exit(1) sys.exit(1)
@ -50,5 +128,5 @@ if __name__ == "__main__":
key_path = sys.argv[1] key_path = sys.argv[1]
client = datastore.Client.from_service_account_json(key_path) client = datastore.Client.from_service_account_json(key_path)
export_screenshots() #export_screenshots()
export_results() export_results()

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

View File

@ -3,7 +3,6 @@ Provides the spider functionality (website checks).
""" """
import argparse import argparse
import hashlib
import json import json
import logging import logging
import os import os
@ -18,11 +17,13 @@ from urllib.parse import urlparse
import requests import requests
import yaml import yaml
import tenacity
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from git import Repo from git import Repo
from selenium import webdriver from selenium import webdriver
from google.cloud import datastore from google.cloud import datastore
from google.api_core.exceptions import Aborted
from google.api_core.exceptions import InvalidArgument from google.api_core.exceptions import InvalidArgument
@ -42,8 +43,6 @@ GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
RESULT_PATH = '/out' RESULT_PATH = '/out'
SITEICONS_PATH = '/icons'
# IP address of the newthinking GCMS server # IP address of the newthinking GCMS server
GCMS_IP = "91.102.13.20" GCMS_IP = "91.102.13.20"
@ -244,50 +243,6 @@ def normalize_title(title):
return title return title
def download_icon(icon_url):
    """
    Download an icon from the given URL and store it in SITEICONS_PATH
    with a file name of <hash>.<ending>

    <hash> is the MD5 hex digest of the downloaded content. <ending> is
    taken from the file name in the URL path if it has one; otherwise it
    is derived from the Content-Type response header.

    Returns the stored file name (without directory), or None if no file
    ending could be determined.

    Raises whatever requests.get() / raise_for_status() raise on
    connection problems, timeouts or HTTP error statuses; handling these
    is left to the caller.
    """
    # Function-scope import so this block does not depend on a change to
    # the module's import section.
    from urllib.parse import urlparse

    default_endings = {
        "image/x-icon": "ico",
        "image/vnd.microsoft.icon": "ico",
        "image/png": "png",
        "image/jpeg": "jpg",
    }

    # Download the icon. Without a timeout an unresponsive server could
    # block this call indefinitely.
    req = requests.get(icon_url, timeout=10)
    req.raise_for_status()

    content_hash = hashlib.md5(req.content).hexdigest()

    # Only look at the path component, so that query strings and fragments
    # (e.g. "favicon.ico?v=2") do not end up in the file ending.
    file_name = os.path.basename(urlparse(icon_url).path)

    # Everything after the last dot of the file name, if there is a dot.
    _, dot, ext = file_name.rpartition(".")
    extension = ext if dot else ""

    if extension == "":
        # derive from content type
        ctype = req.headers.get('content-type')
        # Drop parameters such as "; charset=utf-8" before the lookup.
        mime_type = (ctype or "").split(";")[0].strip().lower()
        try:
            extension = default_endings[mime_type]
        except KeyError:
            logging.error("No file ending defined for icon type '%s'", ctype)
            return None

    filename = content_hash + "." + extension.lower()

    path = os.path.join(SITEICONS_PATH, filename)
    with open(path, 'wb') as iconfile:
        iconfile.write(req.content)

    return filename
def check_responsiveness(url): def check_responsiveness(url):
""" """
Checks Checks
@ -611,14 +566,7 @@ def check_site(entry):
continue continue
if c['content']['icon'] is not None: if c['content']['icon'] is not None:
icons.add(c['content']['icon']) icons.add(c['content']['icon'])
downloaded_icons = set() result['details']['icons'] = sorted(list(icons))
for icon_url in icons:
logging.info("Getting icon %s", icon_url)
try:
downloaded_icons.add(download_icon(icon_url))
except Exception as e:
logging.error("Could not download icon: %s", e)
result['details']['icons'] = sorted(list(downloaded_icons))
# collect feeds # collect feeds
feeds = set() feeds = set()
@ -766,6 +714,8 @@ def check_site(entry):
return result return result
@tenacity.retry(wait=tenacity.wait_exponential(),
retry=tenacity.retry_if_exception_type(Aborted))
def get_job_from_queue(): def get_job_from_queue():
""" """
Returns a URL from the queue Returns a URL from the queue

File diff suppressed because it is too large Load Diff

View File

@ -75,8 +75,8 @@ $(function(){
} }
// FAVICON // FAVICON
var icon = item.result.FAVICON.value && (item.details.icons[0] != null); var icon = item.result.FAVICON.value && (Object.keys(item.details.icons).length > 0);
var iconFile = (icon && (item.details.icons[0] != null) ? item.details.icons[0] : ''); var iconFile = icon ? Object.values(item.details.icons)[0] : '';
var noicon = '<span class="tt" title="Diese Site hat kein Icon.">'+ no +'</span>' var noicon = '<span class="tt" title="Diese Site hat kein Icon.">'+ no +'</span>'
var icontag = (icon ? ('<img src="/siteicons/' + iconFile + '" class="siteicon tt" title="Die Site verweist auf das gezeigte Icon.">') : noicon); var icontag = (icon ? ('<img src="/siteicons/' + iconFile + '" class="siteicon tt" title="Die Site verweist auf das gezeigte Icon.">') : noicon);
row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center" data-order="'+ iconFile +'">' + icontag + '</td>'); row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center" data-order="'+ iconFile +'">' + icontag + '</td>');