Introduce rating modules

This commit is contained in:
Marian Steinbach 2018-09-28 22:49:07 +02:00
parent 849aa1ad2a
commit d4b5695ae9
13 changed files with 412 additions and 1 deletions

View file

@ -15,6 +15,7 @@ ADD data_export.py /
ADD config /config
ADD jobs /jobs
ADD checks /checks
ADD rating /rating
ENTRYPOINT ["python3"]
CMD ["/spider.py"]

View file

@ -22,4 +22,5 @@ GCMS_IP = "91.102.13.20"
JOB_DATASTORE_KIND = 'spider-jobs'
# kind name of the spider results datastore entities
RESULTS_DATASTORE_KIND = 'spider-results'
# TODO: change back to 'spider-results'
RESULTS_DATASTORE_KIND = 'spider-results-dev'

50
rating/__init__.py Normal file
View file

@ -0,0 +1,50 @@
"""
The rating module contains the functionality to calculate scores for certain
criteria based on information gathered by the checks beforehand.
"""
import logging
from rating import canonical_url
from rating import favicon
from rating import feeds
from rating import https
from rating import reachable
from rating import resolvable
from rating import response_duration
from rating import responsive_layout
from rating import www_optional
def calculate_rating(results):
    """
    Runs every rating module against the check results.

    Params:
    results - Results dictionary from checks

    Returns a dictionary which maps each criterion name to whatever the
    ``rate()`` method of the corresponding module's ``Rater`` returned.
    """
    # Pairs of (criterion name, rating module). The raters are executed
    # in this order and their results are stored in the output dictionary
    # under the criterion name.
    criteria = (
        ('DNS_RESOLVABLE_IPV4', resolvable),
        ('SITE_REACHABLE', reachable),
        ('HTTPS', https),
        ('WWW_OPTIONAL', www_optional),
        ('CANONICAL_URL', canonical_url),
        ('HTTP_RESPONSE_DURATION', response_duration),
        ('FAVICON', favicon),
        ('FEEDS', feeds),
        ('RESPONSIVE', responsive_layout),
    )

    return {
        criterion: module.Rater(results).rate()
        for criterion, module in criteria
    }

22
rating/abstract_rater.py Normal file
View file

@ -0,0 +1,22 @@
class AbstractRater(object):
    """
    Base class for all raters.

    A rater is constructed with the results dictionary produced by the
    checks and turns it into a rating via ``rate()``. Subclasses set the
    class attributes below and must implement ``rate()``.
    """

    # String 'boolean' or 'number'
    rating_type = None

    # The default value to return if no rating given
    default_value = None

    max_score = 1

    # Name of the checks this rater depends on.
    # Subclasses override this attribute; it must not be mutated in place,
    # since this list object is shared by all classes that don't override it.
    depends_on_checks = []

    def __init__(self, check_results):
        """
        Stores the check results and verifies that all required checks
        are present.

        Params:
        check_results - Results dictionary from checks

        Raises:
        AssertionError - if an entry of depends_on_checks is not contained
                         in check_results
        """
        self.check_results = check_results

        missing = [name for name in self.depends_on_checks
                   if name not in self.check_results]
        if missing:
            # Raised explicitly instead of using an `assert` statement, so
            # the validation is still performed when Python runs with -O,
            # and so the error tells which check results are missing.
            raise AssertionError(
                "Missing results of checks required by rater: %r" % (missing,))

    def rate(self):
        """
        Calculates the rating. To be implemented by subclasses.
        """
        raise NotImplementedError()

31
rating/canonical_url.py Normal file
View file

@ -0,0 +1,31 @@
"""
This looks at remaining resolvable URLs after redirects
and gives a score if there is only one URL left.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Boolean rating which is positive if exactly one entry is left in the
    results of the 'url_canonicalization' check.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_canonicalization']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        value = self.default_value
        score = 0

        # Read the results of the check this rater declares as its
        # dependency. Reading 'url_reachability' here would rate the wrong
        # data and could fail with a KeyError, as that key is not
        # guaranteed to exist by depends_on_checks.
        if len(self.check_results['url_canonicalization']) == 1:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

32
rating/favicon.py Normal file
View file

@ -0,0 +1,32 @@
"""
This gives a score if the site has an icon.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Boolean rating which is positive as soon as one of the URLs in the
    'html_head' check results has a 'link_icon' value other than None.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['html_head']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        head_results = self.check_results['html_head']

        # any() stops at the first URL that has an icon
        icon_found = any(
            head_results[url]['link_icon'] is not None
            for url in head_results
        )

        return {
            'type': self.rating_type,
            'value': True if icon_found else self.default_value,
            'score': self.max_score if icon_found else 0,
            'max_score': self.max_score,
        }

32
rating/feeds.py Normal file
View file

@ -0,0 +1,32 @@
"""
This gives a score if the site has feeds.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Boolean rating which is positive as soon as one of the URLs in the
    'html_head' check results has a 'link_rss_atom' value other than None.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['html_head']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def _has_feed(self):
        """
        Tells whether any URL's head information contains a feed link.
        """
        head_results = self.check_results['html_head']
        for url in head_results:
            if head_results[url]['link_rss_atom'] is not None:
                # one hit is enough, remaining URLs are not inspected
                return True
        return False

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        if self._has_feed():
            value, score = True, self.max_score
        else:
            value, score = self.default_value, 0

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

47
rating/https.py Normal file
View file

@ -0,0 +1,47 @@
"""
This looks at all HTTPS URLs we checked for reachability.
If all of them were reachable without errors, we give full score.
If some or all had errors, or no HTTPS URL is reachable, we give zero.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Boolean rating which is positive if at least one 'https://' URL is
    contained in the 'url_reachability' check results and none of the
    'https://' URLs has an exception recorded.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']

    # HTTPS is very important, so this counts double
    max_score = 2

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        reachability = self.check_results['url_reachability']

        # The recorded exception (or None) of every HTTPS URL.
        # Non-HTTPS URLs play no role for this rating.
        https_exceptions = [
            reachability[url]['exception']
            for url in reachability
            if url.startswith('https://')
        ]

        # An empty list must not pass: without any HTTPS URL there is
        # nothing to give a score for.
        all_fine = (len(https_exceptions) > 0 and
                    all(exc is None for exc in https_exceptions))

        result = {
            'type': self.rating_type,
            'value': self.default_value,
            'score': 0,
            'max_score': self.max_score,
        }
        if all_fine:
            result['value'] = True
            result['score'] = self.max_score
        return result

36
rating/reachable.py Normal file
View file

@ -0,0 +1,36 @@
"""
This gives a score if one of the checked URL variations was reachable.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Boolean rating which is positive if at least one URL in the
    'url_reachability' check results has no exception recorded.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        reachability = self.check_results['url_reachability']

        # URLs for which the reachability check recorded no exception
        reached = [
            url for url in reachability
            if reachability[url]['exception'] is None
        ]

        if reached:
            value, score = True, self.max_score
        else:
            value, score = self.default_value, 0

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

35
rating/resolvable.py Normal file
View file

@ -0,0 +1,35 @@
"""
This gives a score if one of the input URL's hostnames was resolvable
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Boolean rating which is positive if at least one entry of the
    'dns_resolution' check results has a truthy 'resolvable' value.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['dns_resolution']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        dns_results = self.check_results['dns_resolution']

        # All entries are inspected, the number of resolvable ones decides
        num_resolvable = sum(
            1 for key in dns_results if dns_results[key]['resolvable']
        )
        is_resolvable = num_resolvable > 0

        return {
            'type': self.rating_type,
            'value': True if is_resolvable else self.default_value,
            'score': self.max_score if is_resolvable else 0,
            'max_score': self.max_score,
        }

View file

@ -0,0 +1,45 @@
"""
This looks at the response duration(s) and scores based on the bucket
the value is in. Fast responses get one point, slower half a point,
more than a second gets nothing.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Number rating based on the mean 'duration' of all entries in the
    'page_content' check results which have no exception recorded.
    """

    rating_type = 'number'
    default_value = False
    depends_on_checks = ['page_content']
    max_score = 1.0

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.

        'value' is the rounded mean duration. If no page was fetched
        without an exception, 'value' is the default value and 'score'
        is 0.
        """
        value = self.default_value
        score = 0

        page_content = self.check_results['page_content']

        # Only successful requests contribute to the mean
        durations = [
            page_content[url]['duration']
            for url in page_content
            if page_content[url]['exception'] is None
        ]

        # Without this guard, a site where every request failed (or with
        # no page content results at all) would cause a ZeroDivisionError.
        if durations:
            value = round(sum(durations) / len(durations))

            # value is duration in milliseconds
            if value < 100:
                score = self.max_score
            elif value < 1000:
                score = self.max_score * 0.5

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

View file

@ -0,0 +1,35 @@
"""
This gives a score if the site's minimal document width during checks
was smaller than or equal to the minimal viewport size tested.
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Boolean rating which is positive as soon as one URL in the
    'responsive_layout' check results has a 'min_document_width' which
    does not exceed the 'viewport_width' of the first entry in 'sizes'.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['responsive_layout']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        layout_results = self.check_results['responsive_layout']

        # The first URL that satisfies the condition decides the rating,
        # any() does not look at the remaining URLs after that.
        fits_viewport = any(
            layout_results[url]['min_document_width'] <=
            layout_results[url]['sizes'][0]['viewport_width']
            for url in layout_results
        )

        if fits_viewport:
            value, score = True, self.max_score
        else:
            value, score = self.default_value, 0

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

44
rating/www_optional.py Normal file
View file

@ -0,0 +1,44 @@
"""
This looks at reachable URLs and checks whether (sub)domains
both with and without www. are reachable.
"""
from urllib.parse import urlparse
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Boolean rating which is positive if the URLs in the 'url_reachability'
    check results which have no exception recorded cover more than one
    distinct hostname.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        value = self.default_value
        score = 0

        reachability = self.check_results['url_reachability']

        hostnames = set()
        for url in reachability:
            if reachability[url]['exception'] is not None:
                continue
            # Collect the hostname only. Adding the entire parse result
            # would treat e.g. the http:// and https:// variant of the
            # same host as two different hostnames.
            hostnames.add(urlparse(url).hostname)

        # FIXME
        # we simply check whether there is more than one hostname.
        # this works with our current input URLs but might be too
        # simplistic in the future.
        if len(hostnames) > 1:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }