parent
d6f3f93244
commit
989ed53acf
4 changed files with 17 additions and 106 deletions
@ -1,79 +1,28 @@ |
||||
from google.cloud import storage |
||||
import hashlib |
||||
from google.cloud import datastore |
||||
import json |
||||
import subprocess |
||||
import os |
||||
import sys |
||||
import os |
||||
|
||||
# JSON file read by main(); each entry is expected to carry
# details.canonical_urls.
json_file = 'webapp/dist/data/spider_result.json'

# Name of the Google Cloud Storage bucket looked up in main().
bucket_name = "green-spider-screenshots.sendung.de"

# The single required command-line argument is the path to the service
# account key file. Without it the script stops with exit status 1.
if len(sys.argv) == 1:
    print("Error: please provide path to Google Storage API system account JSON file as argument")
    sys.exit(1)

key_path = sys.argv[1]

# Placeholders only: main() rebinds both names through `global`.
client = bucket = None

# Result mapping. Key: URL, value: screenshot file name.
urls_done = dict()
||||
|
||||
# NOTE(review): indentation is lost in this span and it looks like a rendered
# commit diff, with statements from two revisions of the script interleaved.
# The comments below describe individual statements only; confirm the real
# structure against the repository before relying on it.
def main(): |
||||
global client |
||||
global bucket |
||||
|
||||
# Create the Cloud Storage client from the service-account key file and look
# up the bucket named by bucket_name.
client = storage.Client.from_service_account_json(key_path) |
||||
bucket = client.get_bucket(bucket_name) |
||||
|
||||
# Load the JSON document named by json_file.
with open(json_file, 'r', encoding="utf8") as jsonfile: |
||||
data = json.load(jsonfile) |
||||
for entry in data: |
||||
# Use the first canonical URL of the entry; entries without one are skipped.
urls = entry['details'].get('canonical_urls') |
||||
if urls is None or len(urls) == 0: |
||||
continue |
||||
url = urls[0] |
||||
|
||||
# Skip URLs that already have a result recorded in urls_done.
if url in urls_done: |
||||
continue |
||||
# NOTE(review): this argument check repeats the module-level one and
# presumably belongs to the other revision of the script - confirm.
if len(sys.argv) == 1: |
||||
print("Error: please provide path to Google Storage API system account JSON file as argument") |
||||
sys.exit(1) |
||||
|
||||
# Take the screenshots for this URL; the returned file name is stored in
# urls_done a few lines further down.
filename = make_screenshots(url) |
||||
# NOTE(review): the next two statements rebind key_path and replace the client
# with a Datastore client; presumably part of the other revision - confirm.
key_path = sys.argv[1] |
||||
client = datastore.Client.from_service_account_json(key_path) |
||||
|
||||
urls_done[url] = filename |
||||
# Collect url -> basename of screenshot_url for every entity returned by the
# 'webscreenshot' query, printing each pair as it is read.
out = {} |
||||
|
||||
query = client.query(kind='webscreenshot') |
||||
for item in query.fetch(): |
||||
print(item['url'], os.path.basename(item['screenshot_url'])) |
||||
out[item['url']] = os.path.basename(item['screenshot_url']) |
||||
|
||||
# Write the result as indented, key-sorted JSON without ASCII escaping.
output_filename = "./webapp/dist/data/screenshots.json" |
||||
with open(output_filename, 'w', encoding="utf8") as jsonfile: |
||||
# NOTE(review): two dumps into the same handle would not produce one valid
# JSON document; presumably one line per revision (urls_done vs. out) - verify.
json.dump(urls_done, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) |
||||
json.dump(out, jsonfile, indent=2, sort_keys=True, ensure_ascii=False) |
||||
|
||||
def make_screenshots(url):
    """
    Create screenshots of ``url`` in several sizes and upload them.

    For every configured size the screenshotter Docker image is run with
    ``<working dir>/temp/<width>x<height>`` mounted at ``/srv``. If the
    expected PNG exists afterwards it is uploaded to the module-level
    ``bucket`` as ``<width>x<height>/<file name>`` and made public;
    otherwise an error is printed and the next size is processed.

    Args:
        url: Address of the page to capture.

    Returns:
        The PNG file name: the MD5 hex digest of the UTF-8 encoded URL
        plus ``.png``. The same name is used for every size.
    """
    sizes = ([320, 640], [1500, 1500])

    # The name depends only on the URL, so compute it once rather than per
    # size. This also keeps the return value defined if `sizes` is empty.
    filename = hashlib.md5(url.encode('utf-8')).hexdigest() + ".png"

    # The bind mount needs an absolute host path. $PWD is maintained by the
    # shell and is not guaranteed to be set, in which case os.getenv returns
    # None and the old string concatenation raised TypeError.
    workdir = os.getenv("PWD") or os.getcwd()

    for size in sizes:
        print("Screenshotting size %s for %s" % (size, url))
        sizeargument = "%spx*%spx" % (size[0], size[1])
        subfolder = "%sx%s" % (size[0], size[1])
        command = [
            "docker", "run", "--rm", "-v",
            "%s/temp/%s:/srv" % (workdir, subfolder),
            "netzbegruenung/green-spider-screenshotter:latest",
            url, filename, sizeargument,
        ]
        result = subprocess.run(command)
        if result.returncode != 0:
            # Only report the failure. Whether an upload happens is still
            # decided by the file check below, as before.
            print("Error: screenshot command exited with code %s: size=%s, url='%s'"
                  % (result.returncode, size, url))

        blob = bucket.blob('%s/%s' % (subfolder, filename))
        local_path = './temp/%s/%s' % (subfolder, filename)
        if os.path.exists(local_path):
            with open(local_path, 'rb') as my_file:
                blob.upload_from_file(my_file, content_type="image/png")
                blob.make_public()
        else:
            print("Error: No screenshot created: size=%s, url='%s'" % (size, url))

    return filename
||||
|
||||
# Call main() only when the file is executed as a script, not when imported.
if __name__ == "__main__": |
||||
main() |
||||
|
Loading…
Reference in new issue