Add big result export

This commit is contained in:
Marian Steinbach 2018-11-26 09:35:52 +01:00
parent 61c555f45e
commit ae40915185
2 changed files with 111 additions and 16 deletions

88
main.py
View File

@ -1,3 +1,4 @@
import collections
from datetime import datetime
from os import getenv
from wsgiref import simple_server
@ -16,6 +17,32 @@ spider_results_kind = 'spider-results'
# Kind name for screenshot records. NOTE(review): presumably a Datastore kind
# used the same way as spider_results_kind -- confirm against the queries that use it.
webscreenshots_kind = 'webscreenshot'
def convert_datastore_datetime(field):
    """
    Normalize a Datastore timestamp value to a datetime.

    The client library may return the value as a datetime.datetime, an int,
    or a str containing an integer. Numeric values are interpreted as
    microseconds since the Unix epoch.

    Returns:
        - the value itself if it is a datetime (subclasses included),
        - a naive datetime expressed in UTC for int / str input,
        - an empty string for any other type (e.g. None).

    Raises:
        ValueError: if a str value is not an integer literal.
    """
    # Imported locally so the block does not depend on a change to the
    # module-level import list.
    from datetime import timezone

    # isinstance() rather than an exact type comparison, so that datetime
    # subclasses are passed through instead of being turned into ''.
    if isinstance(field, datetime):
        return field

    # bool is a subclass of int but is never a timestamp.
    if isinstance(field, bool) or not isinstance(field, (int, str)):
        return ''

    seconds = int(field) / 1000000
    # Same naive-UTC result as the deprecated datetime.utcfromtimestamp().
    return datetime.fromtimestamp(seconds, tz=timezone.utc).replace(tzinfo=None)
def flatten(d, parent_key='', sep='.'):
    """
    Flatten a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent's key with `sep`,
    e.g. {'a': {'b': 1}} becomes {'a.b': 1}.

    Args:
        d: the mapping to flatten; keys are expected to be strings.
        parent_key: prefix put in front of every resulting key (joined with
            `sep`); an empty prefix leaves top-level keys unchanged.
        sep: separator placed between key segments.

    Returns:
        A new dict containing only non-mapping values.
    """
    # The collections.MutableMapping alias was removed in Python 3.10; the
    # ABC lives in collections.abc.
    from collections.abc import MutableMapping

    flat = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat
def get_compact_results(client):
query = client.query(kind=spider_results_kind,
order=['-created'],
@ -24,27 +51,35 @@ def get_compact_results(client):
out = []
for entity in query.fetch(eventual=True):
# handle creation date in different ways, depending on whether the lib returns
# a str, int, or datetime.datetime
created = entity.get('created')
dt = ''
if type(created) == datetime:
dt = created
elif type(created) == int:
dt = datetime.utcfromtimestamp(created / 1000000)
elif type(created) == str:
dt = datetime.utcfromtimestamp(int(created) / 1000000)
created = convert_datastore_datetime(entity.get('created'))
out.append({
'input_url': entity.key.name,
'created': dt.isoformat(),
'created': created.isoformat(),
'meta': entity.get('meta'),
'score': entity.get('score'),
})
return out
def get_full_results(client):
    """
    Return all spider results as a list of flat records.

    Each record holds 'input_url' (the entity key name), 'created' (ISO 8601
    string, or None when the stored value cannot be converted) and 'score',
    plus the entity's 'meta' and 'rating' properties flattened into
    'meta.*' and 'rating.*' keys.

    Args:
        client: Datastore client used to run the query.
    """
    query = client.query(kind=spider_results_kind)

    out = []
    for entity in query.fetch(eventual=True):
        created = convert_datastore_datetime(entity.get('created'))

        record = {
            'input_url': entity.key.name,
            # convert_datastore_datetime() returns '' for values it cannot
            # handle; that must not abort the whole export.
            'created': created.isoformat() if isinstance(created, datetime) else None,
            'score': entity.get('score'),
        }

        for section in ('meta', 'rating'):
            nested = entity.get(section)
            # A missing property comes back as None, which flatten() cannot
            # iterate; an empty mapping contributes nothing either way.
            if nested:
                record.update(flatten(nested, parent_key=section))

        out.append(record)

    return out
class LastUpdated(object):
def on_get(self, req, resp):
@ -78,6 +113,19 @@ class CompactResults(object):
resp.media = out
class BigResults(object):
    """Resource that serves the full, flattened spider results."""

    def on_get(self, req, resp):
        """
        Returns big sites results
        """
        out = get_full_results(datastore_client)

        maxage = 48 * 60 * 60  # two days
        # The HTTP Cache-Control directive is spelled "max-age" (hyphen);
        # "max_age" is not a known directive and caches would ignore it.
        resp.cache_control = ["max-age=%d" % maxage]
        resp.media = out
class SiteDetails(object):
def on_get(self, req, resp):
@ -128,6 +176,20 @@ class SiteScreenshots(object):
resp.media = entities
class Index(object):
    """Root resource: a short self-description of the API."""

    def on_get(self, req, resp):
        """Set the response media to the service name, project URL and endpoint list."""
        endpoint_paths = [
            "/api/v1/spider-results/last-updated/",
            "/api/v1/spider-results/big/",
            "/api/v1/spider-results/compact/",
            "/api/v1/spider-results/site",
            "/api/v1/screenshots/site",
        ]
        description = dict(
            message="This is green-spider-api",
            url="https://github.com/netzbegruenung/green-spider-api",
            endpoints=endpoint_paths,
        )
        resp.media = description
# Media handler registry: maps the 'application/json' content type to the
# JSON handler. No other content type is registered here.
handlers = media.Handlers({
'application/json': jsonhandler.JSONHandler(),
})
@ -139,8 +201,10 @@ app.resp_options.media_handlers = handlers
# Route table: each URL path is bound to one resource instance.
# '/' serves the Index resource, which lists the API endpoints below.
app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
app.add_route('/api/v1/spider-results/compact/', CompactResults())
app.add_route('/api/v1/spider-results/big/', BigResults())
app.add_route('/api/v1/spider-results/site', SiteDetails())
app.add_route('/api/v1/screenshots/site', SiteScreenshots())
app.add_route('/', Index())
if __name__ == '__main__':

31
main_test.py Normal file
View File

@ -0,0 +1,31 @@
import unittest
from main import flatten
class TestFlattenDict(unittest.TestCase):
    """Unit tests for main.flatten."""

    def test_flatten(self):
        """Nested dicts are collapsed into dot-separated keys."""
        # Named `nested` rather than `input` to avoid shadowing the builtin.
        nested = {
            "foo": {
                "bar": {
                    "one": 1,
                    "two": 2,
                }
            },
            "bar": {
                "one": 1,
                "two": 2,
            }
        }

        expected = {
            "foo.bar.one": 1,
            "foo.bar.two": 2,
            "bar.one": 1,
            "bar.two": 2,
        }

        out = flatten(nested)
        self.assertEqual(out, expected)

    def test_flatten_with_parent_key(self):
        """A parent_key is prefixed to every resulting key."""
        out = flatten({"a": {"b": 1}, "c": 2}, parent_key="meta")
        self.assertEqual(out, {"meta.a.b": 1, "meta.c": 2})

    def test_flatten_custom_separator(self):
        """The separator between key segments is configurable."""
        out = flatten({"a": {"b": 1}}, sep="/")
        self.assertEqual(out, {"a/b": 1})

    def test_flatten_empty(self):
        """An empty mapping flattens to an empty dict."""
        self.assertEqual(flatten({}), {})


if __name__ == '__main__':
    unittest.main()