mirror of
https://github.com/netzbegruenung/green-spider-api.git
synced 2024-05-03 17:23:40 +02:00
Add table data export
This commit is contained in:
parent
ae40915185
commit
f243c32f40
49
main.py
49
main.py
|
@ -1,6 +1,7 @@
|
||||||
import collections
|
import collections
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from os import getenv
|
from os import getenv
|
||||||
|
import sys
|
||||||
from wsgiref import simple_server
|
from wsgiref import simple_server
|
||||||
|
|
||||||
import falcon
|
import falcon
|
||||||
|
@ -9,7 +10,6 @@ import jsonhandler
|
||||||
|
|
||||||
from google.cloud import datastore
|
from google.cloud import datastore
|
||||||
|
|
||||||
|
|
||||||
credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
|
credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
|
||||||
datastore_client = datastore.Client.from_service_account_json(credentials_path)
|
datastore_client = datastore.Client.from_service_account_json(credentials_path)
|
||||||
|
|
||||||
|
@ -62,7 +62,40 @@ def get_compact_results(client):
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def get_full_results(client):
|
def simplify_rating(d):
    """
    Removes some keys from a flattened rating dict.

    Every key ending in ".type" or ".max_score" is deleted from ``d``.
    The dict is modified in place and the same object is returned, so
    the call can be nested directly inside ``record.update(...)``.
    """
    # Collect the keys first: deleting entries while iterating over the
    # dict itself would raise a RuntimeError.
    keys_to_delete = [
        key for key in d
        if key.endswith((".type", ".max_score"))
    ]

    for key in keys_to_delete:
        del d[key]

    return d
|
||||||
|
|
||||||
|
|
||||||
|
def tablelize_checks(d):
    """
    Returns a dict with the check details we want to be contained
    in a table export.

    ``d`` is the checks dict of a spider result. It may be None or lack
    either key; the corresponding output field is then an empty string.

    The returned dict always has exactly two string entries:

    - 'generator': the distinct non-None values of ``d['generator']``,
      sorted and joined by a single space.
    - 'resulting_urls': the non-None items of
      ``d['url_canonicalization']``, in their original order, joined by
      a single space.
    """
    # The caller passes entity.get('checks'), which is None when the
    # entity has no checks stored.
    checks = d or {}
    out = {}

    # CMS names separated by space. Sorted so that the exported value is
    # stable between runs (plain set iteration order is arbitrary).
    generators = checks.get('generator') or {}
    out['generator'] = " ".join(
        sorted({name for name in generators.values() if name is not None})
    )

    # List of actual URLs crawled
    urls = checks.get('url_canonicalization') or []
    out['resulting_urls'] = " ".join(url for url in urls if url is not None)

    return out
|
||||||
|
|
||||||
|
|
||||||
|
def get_table_result(client):
|
||||||
query = client.query(kind=spider_results_kind)
|
query = client.query(kind=spider_results_kind)
|
||||||
|
|
||||||
out = []
|
out = []
|
||||||
|
@ -74,8 +107,11 @@ def get_full_results(client):
|
||||||
'created': created.isoformat(),
|
'created': created.isoformat(),
|
||||||
'score': entity.get('score'),
|
'score': entity.get('score'),
|
||||||
}
|
}
|
||||||
|
|
||||||
record.update(flatten(entity.get('meta'), parent_key='meta'))
|
record.update(flatten(entity.get('meta'), parent_key='meta'))
|
||||||
record.update(flatten(entity.get('rating'), parent_key='rating'))
|
record.update(simplify_rating(flatten(entity.get('rating'), parent_key='rating')))
|
||||||
|
record.update(tablelize_checks(entity.get('checks')))
|
||||||
|
|
||||||
out.append(record)
|
out.append(record)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
@ -113,13 +149,13 @@ class CompactResults(object):
|
||||||
resp.media = out
|
resp.media = out
|
||||||
|
|
||||||
|
|
||||||
class BigResults(object):
|
class TableResults(object):
|
||||||
|
|
||||||
def on_get(self, req, resp):
|
def on_get(self, req, resp):
|
||||||
"""
|
"""
|
||||||
Returns big sites results
|
Returns big sites results
|
||||||
"""
|
"""
|
||||||
out = get_full_results(datastore_client)
|
out = get_table_result(datastore_client)
|
||||||
|
|
||||||
maxage = 48 * 60 * 60 # two days
|
maxage = 48 * 60 * 60 # two days
|
||||||
resp.cache_control = ["max_age=%d" % maxage]
|
resp.cache_control = ["max_age=%d" % maxage]
|
||||||
|
@ -201,12 +237,13 @@ app.resp_options.media_handlers = handlers
|
||||||
|
|
||||||
app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
|
app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
|
||||||
app.add_route('/api/v1/spider-results/compact/', CompactResults())
|
app.add_route('/api/v1/spider-results/compact/', CompactResults())
|
||||||
app.add_route('/api/v1/spider-results/big/', BigResults())
|
app.add_route('/api/v1/spider-results/table/', TableResults())
|
||||||
app.add_route('/api/v1/spider-results/site', SiteDetails())
|
app.add_route('/api/v1/spider-results/site', SiteDetails())
|
||||||
app.add_route('/api/v1/screenshots/site', SiteScreenshots())
|
app.add_route('/api/v1/screenshots/site', SiteScreenshots())
|
||||||
app.add_route('/', Index())
|
app.add_route('/', Index())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
httpd = simple_server.make_server('127.0.0.1', 5000, app)
|
httpd = simple_server.make_server('127.0.0.1', 5000, app)
|
||||||
httpd.serve_forever()
|
httpd.serve_forever()
|
||||||
|
|
Loading…
Reference in a new issue