Merge pull request #60 from netzbegruenung/download-icons

Remove downloading of icons

commit 0210e03d5d
@@ -12,7 +12,7 @@ RUN apt-get update \
     && wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
     && dpkg -i google-chrome-stable_current_amd64.deb \
     && rm google-chrome-stable_current_amd64.deb \
-    && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 \
+    && pip3 install GitPython idna PyYAML beautifulsoup4==4.6.0 requests==2.18.4 responses==0.9.0 selenium==3.11.0 smmap2==2.0.3 urllib3==1.22 google-cloud-datastore==1.7.0 tenacity==5.0.2 \
     && wget https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip \
     && unzip chromedriver_linux64.zip \
     && rm chromedriver_linux64.zip \

Makefile (8 changes)

@@ -20,22 +20,25 @@ spiderjobs: dockerimage
 spider: dockerimage
 	docker run --rm -ti \
 		-v $(PWD)/webapp/dist/data:/out \
-		-v $(PWD)/docs/siteicons:/icons \
 		-v $(PWD)/secrets:/secrets \
 		spider spider.py \
 		--credentials-path /secrets/datastore-writer.json \
 		--loglevel debug \
 		spider
 
+# run spider tests
 test: dockerimage
 	docker run --rm -ti spider /spider_test.py
 
-export:
+# export JSON data for the webapp
+export: dockerimage
 	docker run --rm -ti \
 		-v $(PWD)/webapp/dist/data:/out \
 		-v $(PWD)/secrets:/secrets \
+		-v $(PWD)/docs/siteicons:/icons \
 		spider data_export.py /secrets/datastore-reader.json
 
+# NodeJS modules for the webapp creation
 webapp/node_modules:
 	cd webapp && npm install
 

@@ -46,5 +49,6 @@ webapp: webapp/node_modules
 	cp webapp/node_modules/tooltipster/dist/css/tooltipster.bundle.min.css ./docs/css/
 	rm webapp/dist/bundle.js
 
+# Run a dev server for the webapp
 serve-webapp:
 	cd docs && ../venv/bin/python -m http.server

data_export.py

@@ -3,10 +3,16 @@ Exports data from the database to JSON files for use in a static webapp
 """
 
 from google.cloud import datastore
+import hashlib
 import json
+import logging
 import sys
 import os
 
+import requests
+
+
+SITEICONS_PATH = "/icons"
 
 client = None
 

@@ -18,9 +24,26 @@ def export_results():
 
     query = client.query(kind='spider-results')
     for entity in query.fetch():
-        print(entity.key.name)
+        logging.debug(entity.key.name)
         out.append(dict(entity)["results"])
 
+    # load icons, reformat icons details
+    for index in range(len(out)):
+        if "details" not in out[index]:
+            continue
+        if "icons" not in out[index]["details"]:
+            continue
+        urls = out[index]["details"]["icons"]
+        out[index]["details"]["icons"] = {}
+        for url in urls:
+            if not (url.startswith("http://") or url.startswith("https://")):
+                logging.debug("Skipping icon %s", url)
+                continue
+            logging.debug("Downloading icon %s", url)
+            filename = download_icon(url)
+            if filename:
+                out[index]["details"]["icons"][url] = filename
+
     output_filename = "/out/spider_result.json"
     with open(output_filename, 'w', encoding="utf8") as jsonfile:
         json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
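
Note: with this change, details["icons"] in the exported JSON is no longer a plain list of icon URLs (as written by spider.py) but a mapping from icon URL to the locally stored file name returned by download_icon(). A minimal sketch of the two shapes, with purely illustrative values:

    # before this change: a sorted list of icon URLs
    icons_before = ["https://example.org/favicon.ico"]

    # after export_results() has run: URL -> local file name (hypothetical hash)
    icons_after = {"https://example.org/favicon.ico": "0a1b2c3d4e5f66778899aabbccddeeff.ico"}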

@@ -34,7 +57,7 @@ def export_screenshots():
 
     query = client.query(kind='webscreenshot')
     for item in query.fetch():
-        print(item['url'], os.path.basename(item['screenshot_url']))
+        logging.debug("%s %s", item['url'], os.path.basename(item['screenshot_url']))
         out[item['url']] = os.path.basename(item['screenshot_url'])
 
     output_filename = "/out/screenshots.json"

@@ -42,7 +65,62 @@ def export_screenshots():
         json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
 
 
+def download_icon(icon_url):
+    """
+    Download an icon from the given URL and store it with
+    a file name of <hash>.<ending>
+    """
+
+    default_endings = {
+        "image/x-icon": "ico",
+        "image/vnd.microsoft.icon": "ico",
+        "image/png": "png",
+        "image/jpeg": "jpg",
+    }
+
+    # Download the icon
+    try:
+        req = requests.get(icon_url, timeout=10)
+    except:
+        return None
+    if req.status_code >= 400:
+        return None
+
+    content_hash = hashlib.md5(req.content).hexdigest()
+    extension = ""
+
+    try:
+        file_name = os.path.basename(icon_url)[-1]
+    except IndexError as exc:
+        logging.error("Error in URL %s: %s", icon_url, exc)
+        return None
+
+    if file_name != "" and "." in file_name:
+        ext = file_name.split(".")[-1]
+        if ext != "":
+            extension = ext
+
+    if extension == "":
+        # derive from content type
+        ctype = req.headers.get('content-type')
+        try:
+            extension = default_endings[ctype]
+        except KeyError:
+            logging.error("No file ending defined for icon type '%s'", ctype)
+            return None
+
+    filename = content_hash + "." + extension.lower()
+
+    path = SITEICONS_PATH + os.path.sep + filename
+    with open(path, 'wb') as iconfile:
+        iconfile.write(req.content)
+
+    return filename
+
+
 if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+
     if len(sys.argv) == 1:
         print("Error: please provide path to Google Storage API system account JSON file as argument")
         sys.exit(1)
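
The new download_icon() helper stores each icon in SITEICONS_PATH under an MD5-based file name and returns that name, or None if the request fails, the response is an error, or no file extension can be derived. A minimal usage sketch, assuming /icons exists and is writable and using a hypothetical URL:

    name = download_icon("https://example.org/favicon.ico")
    if name:
        # e.g. "d41d8cd98f00b204e9800998ecf8427e.ico"
        print("icon stored as", name)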

@@ -50,5 +128,5 @@ if __name__ == "__main__":
     key_path = sys.argv[1]
     client = datastore.Client.from_service_account_json(key_path)
 
-    export_screenshots()
+    #export_screenshots()
     export_results()

[Diffs for two further files are suppressed: one because its lines are too long, one because it is too large. A new binary file (2.1 KiB) is not shown.]

spider.py (60 changes)

@@ -3,7 +3,6 @@ Provides the spider functionality (website checks).
 """
 
 import argparse
-import hashlib
 import json
 import logging
 import os

@@ -18,11 +17,13 @@ from urllib.parse import urlparse
 
 import requests
 import yaml
+import tenacity
 
 from bs4 import BeautifulSoup
 from git import Repo
 from selenium import webdriver
 from google.cloud import datastore
+from google.api_core.exceptions import Aborted
 from google.api_core.exceptions import InvalidArgument
 
 

@@ -42,8 +43,6 @@ GREEN_DIRECTORY_LOCAL_PATH = './cache/green-directory'
 
 RESULT_PATH = '/out'
 
-SITEICONS_PATH = '/icons'
-
 # IP address of the newthinking GCMS server
 GCMS_IP = "91.102.13.20"
 

@@ -244,50 +243,6 @@ def normalize_title(title):
     return title
 
 
-def download_icon(icon_url):
-    """
-    Download an icon from the given URL and store it with
-    a file name of <hash>.<ending>
-    """
-
-    default_endings = {
-        "image/x-icon": "ico",
-        "image/vnd.microsoft.icon": "ico",
-        "image/png": "png",
-        "image/jpeg": "jpg",
-    }
-
-    # Download the icon
-    req = requests.get(icon_url)
-    req.raise_for_status()
-
-    content_hash = hashlib.md5(req.content).hexdigest()
-    extension = ""
-
-    file_name = os.path.basename(icon_url)[-1]
-    if file_name != "" and "." in file_name:
-        ext = file_name.split(".")[-1]
-        if ext != "":
-            extension = ext
-
-    if extension == "":
-        # derive from content type
-        ctype = req.headers.get('content-type')
-        try:
-            extension = default_endings[ctype]
-        except KeyError:
-            logging.error("No file ending defined for icon type '%s'", ctype)
-            return None
-
-    filename = content_hash + "." + extension.lower()
-
-    path = SITEICONS_PATH + os.path.sep + filename
-    with open(path, 'wb') as iconfile:
-        iconfile.write(req.content)
-
-    return filename
-
-
 def check_responsiveness(url):
     """
     Checks

@@ -611,14 +566,7 @@ def check_site(entry):
                 continue
             if c['content']['icon'] is not None:
                 icons.add(c['content']['icon'])
-    downloaded_icons = set()
-    for icon_url in icons:
-        logging.info("Getting icon %s", icon_url)
-        try:
-            downloaded_icons.add(download_icon(icon_url))
-        except Exception as e:
-            logging.error("Could not download icon: %s", e)
-    result['details']['icons'] = sorted(list(downloaded_icons))
+    result['details']['icons'] = sorted(list(icons))
 
     # collect feeds
     feeds = set()

@@ -766,6 +714,8 @@ def check_site(entry):
     return result
 
 
+@tenacity.retry(wait=tenacity.wait_exponential(),
+                retry=tenacity.retry_if_exception_type(Aborted))
 def get_job_from_queue():
     """
     Returns a URL from the queue
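
The decorator added above makes get_job_from_queue() retry with exponential back-off whenever Datastore raises google.api_core.exceptions.Aborted (for example on a contended transaction), using the tenacity package added to the image's pip3 install line. A minimal standalone sketch of the same pattern, with a hypothetical function body:

    import tenacity
    from google.api_core.exceptions import Aborted

    @tenacity.retry(wait=tenacity.wait_exponential(),
                    retry=tenacity.retry_if_exception_type(Aborted))
    def fetch_job():
        # hypothetical body; raising Aborted here would trigger another attempt
        ...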

[A further file diff is suppressed because it is too large.]

@@ -75,8 +75,8 @@ $(function(){
       }
 
       // FAVICON
-      var icon = item.result.FAVICON.value && (item.details.icons[0] != null);
-      var iconFile = (icon && (item.details.icons[0] != null) ? item.details.icons[0] : '');
+      var icon = item.result.FAVICON.value && (Object.keys(item.details.icons).length > 0);
+      var iconFile = icon ? Object.values(item.details.icons)[0] : '';
       var noicon = '<span class="tt" title="Diese Site hat kein Icon.">'+ no +'</span>'
       var icontag = (icon ? ('<img src="/siteicons/' + iconFile + '" class="siteicon tt" title="Die Site verweist auf das gezeigte Icon.">') : noicon);
       row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center" data-order="'+ iconFile +'">' + icontag + '</td>');