# Mirror of https://github.com/netzbegruenung/green-spider.git
# Synced 2024-05-04 10:03:40 +02:00
# 80 lines, 2.4 KiB, Python
from google.cloud import storage
|
|
import hashlib
|
|
import json
|
|
import subprocess
|
|
import os
|
|
import sys
|
|
|
|
# Input JSON file that main() reads the site entries from.
json_file = 'webapp/dist/data/spider_result.json'

# Name of the Google Cloud Storage bucket main() fetches for the uploads.
bucket_name = "green-spider-screenshots.sendung.de"

# The key file path is the only (mandatory) command line argument.
# "< 2" rather than "== 1" so that an empty argv is rejected with the same
# message instead of raising IndexError on sys.argv[1] below. The message is
# a diagnostic, so it goes to stderr.
if len(sys.argv) < 2:
    print(
        "Error: please provide path to Google Storage API system account JSON file as argument",
        file=sys.stderr,
    )
    sys.exit(1)

key_path = sys.argv[1]

# Storage client and bucket handle; both are assigned by main().
client = None
bucket = None

# result dict. key: url, value: file name
urls_done = {}
|
|
|
|
def main():
    """
    Takes a screenshot of the first canonical URL of every entry in the
    input JSON file and writes the resulting url -> file name mapping
    to ./webapp/dist/data/screenshots.json
    """
    global client, bucket

    client = storage.Client.from_service_account_json(key_path)
    bucket = client.get_bucket(bucket_name)

    with open(json_file, 'r', encoding="utf8") as infile:
        entries = json.load(infile)

    for item in entries:
        candidates = item['details'].get('canonical_urls')
        # Only entries with at least one canonical URL get a screenshot.
        if candidates is not None and len(candidates) > 0:
            target = candidates[0]
            # Each URL is handled once, even if several entries share it.
            if target not in urls_done:
                urls_done[target] = make_screenshots(target)

    with open("./webapp/dist/data/screenshots.json", 'w', encoding="utf8") as outfile:
        json.dump(urls_done, outfile, indent=2, sort_keys=True, ensure_ascii=False)
|
|
|
|
def make_screenshots(url):
    """
    Creates screenshots in various sizes, uploads them to
    Google Cloud Storage and returns the output filename

    The file name is the MD5 hex digest of the URL plus ".png" and is the
    same for every size; the sizes are told apart by their subfolder
    ("<width>x<height>"). The name is returned even when a screenshot
    could not be created; that case is only reported on stdout.
    """
    sizes = ([320, 640], [1500, 1500])

    # The file name depends on the URL only, so it is computed once.
    filename = hashlib.md5(url.encode('utf-8')).hexdigest() + ".png"

    # Use the real working directory rather than $PWD: the variable may be
    # unset (which made the string concatenation raise TypeError), and the
    # relative local path below is resolved against the working directory,
    # so mount source and lookup path must agree.
    workdir = os.getcwd()

    for size in sizes:
        print("Screenshotting size %s for %s" % (size, url))
        sizeargument = "%spx*%spx" % (size[0], size[1])
        subfolder = "%sx%s" % (size[0], size[1])
        command = [
            "docker", "run", "--rm", "-v",
            "%s/temp/%s:/srv" % (workdir, subfolder),
            "netzbegruenung/green-spider-screenshotter:latest",
            url, filename, sizeargument,
        ]
        # The exit status is not checked; a failed run is detected through
        # the missing output file below.
        subprocess.run(command)

        local_path = './temp/%s/%s' % (subfolder, filename)
        if not os.path.exists(local_path):
            print("Error: No screenshot created: size=%s, url='%s'" % (size, url))
            continue

        blob = bucket.blob('%s/%s' % (subfolder, filename))
        with open(local_path, 'rb') as my_file:
            blob.upload_from_file(my_file, content_type="image/png")
        blob.make_public()

    return filename
|
|
|
|
# Run the screenshot job only when this file is executed as a script.
if __name__ == "__main__":
    main()
|