From 569747ae4ccfb065302da0c9d8468e8d4433a118 Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Fri, 12 Apr 2019 09:13:54 +0200
Subject: [PATCH 1/8] Fix endpoint index

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 0860cff..ff300ee 100644
--- a/main.py
+++ b/main.py
@@ -219,7 +219,7 @@ class Index(object):
             "url": "https://github.com/netzbegruenung/green-spider-api",
             "endpoints": [
                 "/api/v1/spider-results/last-updated/",
-                "/api/v1/spider-results/big/",
+                "/api/v1/spider-results/table/",
                 "/api/v1/spider-results/compact/",
                 "/api/v1/spider-results/site",
                 "/api/v1/screenshots/site",

From 0600480db3ca08f4d30f45f2c35d42b9b9163d5e Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Fri, 12 Apr 2019 09:14:56 +0200
Subject: [PATCH 2/8] Adapt SiteDetails to use ES

---
 main.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index ff300ee..fe12b82 100644
--- a/main.py
+++ b/main.py
@@ -175,16 +175,18 @@ class SiteDetails(object):
                                    'Bad request',
                                    'The parameter url must not be empty')
-        key = datastore_client.key(spider_results_kind, req.get_param('url'))
-        entity = datastore_client.get(key)
+        entity = es.get(index=es_index_name, doc_type=es_doc_type, id=url)
 
         if entity is None:
             raise falcon.HTTPError(falcon.HTTP_404,
                                    'Not found',
                                    'A site with this URL does not exist')
 
-        maxage = 24 * 60 * 60  # 24 hours in seconds
+        if 'url' in entity['_source']:
+            del entity['_source']['url']
+
+        maxage = 5 * 60  # 5 minutes in seconds
         resp.cache_control = ["max_age=%d" % maxage]
-        resp.media = dict(entity)
+        resp.media = entity['_source']
 
 
 class SiteScreenshots(object):

From 110a3c0ac03d7873013f85f71aec1345aa50362b Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Fri, 12 Apr 2019 09:15:14 +0200
Subject: [PATCH 3/8] Adapt LastUpdated to use ES

---
 main.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/main.py b/main.py
index fe12b82..fb81f99 100644
--- a/main.py
+++ b/main.py
@@ -122,17 +122,14 @@ class LastUpdated(object):
         """
         Informs about the most recent update to the spider results data
         """
-        query = datastore_client.query(kind=spider_results_kind,
-                                       order=['-created'],
-                                       projection=['created'])
-        items = list(query.fetch(limit=1, eventual=True))
-        ts = int(items[0].get('created')) / 1000000
-        dt = datetime.utcfromtimestamp(ts).isoformat()
+        res = es.search(index=es_index_name,
+                        filter_path=['hits.hits._source.created'],
+                        body={"query": {"match_all": {}}},
+                        sort='created:desc',
+                        size=1)
 
-        maxage = 60 * 60  # one hour in seconds
-        resp.cache_control = ["max_age=%d" % maxage]
         resp.media = {
-            "last_updated": dt
+            "last_updated": res['hits']['hits'][0]['_source']['created']
         }
 
 

From 9ea662c58288ab2d305e7f1e6151c35b60a0ccb1 Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Fri, 12 Apr 2019 09:15:35 +0200
Subject: [PATCH 4/8] Adapt CompactResults to use ES

---
 main.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index fb81f99..ac5fb38 100644
--- a/main.py
+++ b/main.py
@@ -9,13 +9,18 @@ from falcon import media
 import jsonhandler
 
 from google.cloud import datastore
+from elasticsearch import Elasticsearch
 
 credentials_path = getenv('GCLOUD_DATASTORE_CREDENTIALS_PATH')
 
 datastore_client = datastore.Client.from_service_account_json(credentials_path)
 
+es = Elasticsearch([{'host': 'elasticsearch', 'port': 9200}])
+
+es_doc_type = 'result'
 spider_results_kind = 'spider-results'
 webscreenshots_kind = 'webscreenshot'
+es_index_name = spider_results_kind
 
 def convert_datastore_datetime(field):
     """
@@ -43,10 +48,9 @@ def flatten(d, parent_key='', sep='.'):
     return dict(items)
 
 
-def get_compact_results(client):
+def get_compact_results():
     query = client.query(kind=spider_results_kind,
                          order=['-created'],
-                         #projection=['created', 'meta', 'score'],
                          )
 
     out = []
@@ -139,7 +143,7 @@ class CompactResults(object):
         """
         Returns compact sites overview and score
         """
-        out = get_compact_results(datastore_client)
+        out = get_compact_results()
 
         maxage = 6 * 60 * 60  # six hours in seconds
         resp.cache_control = ["max_age=%d" % maxage]

From 614c696878d372245968166d75931583bdf0fa99 Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Fri, 12 Apr 2019 09:16:02 +0200
Subject: [PATCH 5/8] Add elasticsearch dependency

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ead59c1..218555f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ astroid==2.0.4
 cachetools==2.1.0
 certifi==2018.10.15
 chardet==3.0.4
+elasticsearch==6.3.1
 falcon==1.4.1
 google-api-core==1.5.1
 google-auth==1.5.1
@@ -15,8 +16,8 @@ isort==4.3.4
 lazy-object-proxy==1.3.1
 mccabe==0.6.1
 protobuf==3.6.1
-pyasn1==0.4.4
 pyasn1-modules==0.2.2
+pyasn1==0.4.4
 pylint==2.1.1
 python-mimeparse==1.6.0
 pytz==2018.7

From d344060d3f31b9a49adedc84dc1810164e1c7ea0 Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Mon, 15 Apr 2019 21:46:20 +0200
Subject: [PATCH 6/8] Add upgrading of pip

---
 Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 67ed079..8f3681c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,8 @@
 FROM python:3.6.7-slim-jessie
+
 ADD requirements.txt /
+RUN pip install --upgrade pip
 RUN pip install --no-cache-dir -r requirements.txt
 
 ADD jsonhandler.py /
 

From afc65ec3e6f5c0a013c293d9df85cabf4c502ddd Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Mon, 15 Apr 2019 21:46:57 +0200
Subject: [PATCH 7/8] Remove /api/v1/spider-results/compact/ endpoint

---
 main.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/main.py b/main.py
index ac5fb38..d227898 100644
--- a/main.py
+++ b/main.py
@@ -48,24 +48,6 @@ def flatten(d, parent_key='', sep='.'):
     return dict(items)
 
 
-def get_compact_results():
-    query = client.query(kind=spider_results_kind,
-                         order=['-created'],
-                         )
-
-    out = []
-    for entity in query.fetch(eventual=True):
-        created = convert_datastore_datetime(entity.get('created'))
-
-        out.append({
-            'input_url': entity.key.name,
-            'created': created.isoformat(),
-            'meta': entity.get('meta'),
-            'score': entity.get('score'),
-        })
-    return out
-
-
 def simplify_rating(d):
     """
     Removes some keys from a flattened rating dict
@@ -223,7 +205,6 @@ class Index(object):
             "endpoints": [
                 "/api/v1/spider-results/last-updated/",
                 "/api/v1/spider-results/table/",
-                "/api/v1/spider-results/compact/",
                 "/api/v1/spider-results/site",
                 "/api/v1/screenshots/site",
             ]

From 378838f6a99f326d02dead224b0231ec7d862987 Mon Sep 17 00:00:00 2001
From: Marian Steinbach
Date: Mon, 15 Apr 2019 21:47:18 +0200
Subject: [PATCH 8/8] Various changes and additions

---
 main.py | 86 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 69 insertions(+), 17 deletions(-)

diff --git a/main.py b/main.py
index d227898..ab2e7bc 100644
--- a/main.py
+++ b/main.py
@@ -109,7 +109,7 @@ class LastUpdated(object):
         Informs about the most recent update to the spider results data
         """
         res = es.search(index=es_index_name,
-                        filter_path=['hits.hits._source.created'],
+                        _source_include=['created'],
                         body={"query": {"match_all": {}}},
                         sort='created:desc',
                         size=1)
@@ -119,19 +119,6 @@ class LastUpdated(object):
         resp.media = {
             "last_updated": res['hits']['hits'][0]['_source']['created']
         }
 
 
-class CompactResults(object):
-
-    def on_get(self, req, resp):
-        """
-        Returns compact sites overview and score
-        """
-        out = get_compact_results()
-
-        maxage = 6 * 60 * 60  # six hours in seconds
-        resp.cache_control = ["max_age=%d" % maxage]
-        resp.media = out
-
-
 class TableResults(object):
@@ -145,6 +132,70 @@ class TableResults(object):
         resp.media = out
 
 
+class SpiderResultsQuery(object):
+
+    def on_get(self, req, resp):
+        """
+        Queries the ES index for sites matching a term
+        """
+        query_term = req.get_param('q', default='')
+        from_num = req.get_param('from', default='0')
+
+        try:
+            from_num = int(from_num)
+        except Exception:
+            raise falcon.HTTPError(falcon.HTTP_400,
+                                   'Bad request',
+                                   'The parameter "from" must be an integer.')
+
+        res = es.search(index=es_index_name,
+                        _source_include=['created', 'meta', 'rating', 'score', 'url'],
+                        body={
+                            "query": {
+                                "query_string": {
+                                    "query": query_term,
+                                    "default_operator": "AND",
+                                }
+                            }
+                        },
+                        from_=from_num,
+                        size=20,
+                        sort='score:desc')
+        resp.media = {
+            "hits": res['hits']
+        }
+
+
+class SpiderResultsCount(object):
+
+    def on_get(self, req, resp):
+        """
+        Returns the number of items in the spider-results ES index
+        """
+        query_term = req.get_param('q')
+        body = {"query": {"match_all" : {}}}
+        if query_term is not None:
+            body = {
+                "query": {
+                    "bool" : {
+                        "must" : {
+                            "query_string" : {
+                                "query" : query_term
+                            }
+                        }
+                    }
+                }
+            }
+
+        res = es.search(index=es_index_name, body=body, size=0)
+
+        maxage = 5 * 60  # 5 minutes in seconds
+        resp.cache_control = ["max_age=%d" % maxage]
+        resp.media = {
+            "count": res['hits']['total']
+        }
+
+
 class SiteDetails(object):
 
     def on_get(self, req, resp):
@@ -164,8 +215,7 @@ class SiteDetails(object):
                                    'Not found',
                                    'A site with this URL does not exist')
 
-        if 'url' in entity['_source']:
-            del entity['_source']['url']
+        entity['_source']['url'] = entity['_id']
 
         maxage = 5 * 60  # 5 minutes in seconds
         resp.cache_control = ["max_age=%d" % maxage]
@@ -202,9 +252,10 @@ class Index(object):
         resp.media = {
             "message": "This is green-spider-api",
             "url": "https://github.com/netzbegruenung/green-spider-api",
             "endpoints": [
+                "/api/v1/spider-results/count/",
                 "/api/v1/spider-results/last-updated/",
                 "/api/v1/spider-results/table/",
                 "/api/v1/spider-results/site",
                 "/api/v1/screenshots/site",
             ]
@@ -220,8 +271,9 @@ app.req_options.media_handlers = handlers
 app.resp_options.media_handlers = handlers
 
 app.add_route('/api/v1/spider-results/last-updated/', LastUpdated())
-app.add_route('/api/v1/spider-results/compact/', CompactResults())
 app.add_route('/api/v1/spider-results/table/', TableResults())
+app.add_route('/api/v1/spider-results/query/', SpiderResultsQuery())
+app.add_route('/api/v1/spider-results/count/', SpiderResultsCount())
app.add_route('/api/v1/spider-results/site', SiteDetails())
 app.add_route('/api/v1/screenshots/site', SiteScreenshots())
 app.add_route('/', Index())
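
Usage sketch for the endpoints added in PATCH 8, once the service is running.
This is a minimal example using only the Python standard library; the base URL
(http://localhost:8000), the search term, and the api_get helper are
illustrative assumptions — the paths and the q/from parameters come from the
patches above.

    import json
    from urllib.parse import urlencode
    from urllib.request import urlopen

    BASE = 'http://localhost:8000'  # assumption: local dev instance

    def api_get(path, **params):
        """GET a path of the API and decode the JSON response."""
        url = BASE + path
        if params:
            url += '?' + urlencode(params)
        with urlopen(url) as resp:
            return json.load(resp)

    # SpiderResultsCount: number of documents matching a query_string term.
    print(api_get('/api/v1/spider-results/count/', q='example')['count'])

    # SpiderResultsQuery: first page (20 hits, sorted by score descending).
    # 'from' is a Python keyword, hence the dict expansion.
    page = api_get('/api/v1/spider-results/query/', **{'q': 'example', 'from': 0})
    for hit in page['hits']['hits']:
        print(hit['_id'], hit['_source'].get('score'))

Note that SpiderResultsQuery returns the raw Elasticsearch hits object, so the
hit list sits one level down at page['hits']['hits'].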