mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-01 16:44:51 +02:00
Introduce rating modules
This commit is contained in:
parent
849aa1ad2a
commit
d4b5695ae9
|
@ -15,6 +15,7 @@ ADD data_export.py /
|
|||
ADD config /config
|
||||
ADD jobs /jobs
|
||||
ADD checks /checks
|
||||
ADD rating /rating
|
||||
|
||||
ENTRYPOINT ["python3"]
|
||||
CMD ["/spider.py"]
|
||||
|
|
|
@ -22,4 +22,5 @@ GCMS_IP = "91.102.13.20"
|
|||
JOB_DATASTORE_KIND = 'spider-jobs'
|
||||
|
||||
# kind name of the spider results datastore entities
|
||||
RESULTS_DATASTORE_KIND = 'spider-results'
|
||||
# TODO: change back to 'spider-results'
|
||||
RESULTS_DATASTORE_KIND = 'spider-results-dev'
|
||||
|
|
50
rating/__init__.py
Normal file
50
rating/__init__.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
"""
|
||||
The rating module contains the functionality to calculate scores for certain
|
||||
criteria based on information gathered by checks before.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from rating import canonical_url
|
||||
from rating import favicon
|
||||
from rating import feeds
|
||||
from rating import https
|
||||
from rating import reachable
|
||||
from rating import resolvable
|
||||
from rating import response_duration
|
||||
from rating import responsive_layout
|
||||
from rating import www_optional
|
||||
|
||||
|
||||
def calculate_rating(results):
    """
    Runs every rater against the check results and collects the ratings.

    Params:
    results - Results dictionary from checks

    Returns a dictionary mapping each criterion name to the value returned
    by the rate() method of the corresponding rater.
    """

    # Criterion name paired with the module providing its Rater class.
    # Every rater only receives the check results (never the ratings of
    # other raters); the sequence here determines the order in which the
    # raters run and the key order of the returned dictionary.
    criteria = (
        ('DNS_RESOLVABLE_IPV4', resolvable),
        ('SITE_REACHABLE', reachable),
        ('HTTPS', https),
        ('WWW_OPTIONAL', www_optional),
        ('CANONICAL_URL', canonical_url),
        ('HTTP_RESPONSE_DURATION', response_duration),
        ('FAVICON', favicon),
        ('FEEDS', feeds),
        ('RESPONSIVE', responsive_layout),
    )

    return {
        criterion: module.Rater(results).rate()
        for criterion, module in criteria
    }
|
22
rating/abstract_rater.py
Normal file
22
rating/abstract_rater.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
class AbstractRater(object):
    """
    Base class for raters.

    A rater receives the dictionary of check results and turns it into a
    rating via rate(). Subclasses override the class attributes below and
    implement rate().
    """

    # String 'boolean' or 'number'
    rating_type = None

    # The default value to return if no rating given
    default_value = None

    max_score = 1

    # Name of the checks this rater depends on
    depends_on_checks = []

    def __init__(self, check_results):
        """
        Stores the check results and verifies that all required checks
        are present.

        Params:
        check_results - Results dictionary from checks

        Raises AssertionError if a check listed in depends_on_checks is
        missing from check_results.
        """
        self.check_results = check_results

        missing = [item for item in self.depends_on_checks
                   if item not in self.check_results]
        if missing:
            # Raised explicitly instead of using an assert statement, so the
            # validation is not stripped when Python runs with -O and the
            # error names the checks that are missing.
            raise AssertionError(
                "%s requires results of check(s) not present: %s" % (
                    type(self).__name__, ", ".join(map(str, missing))))

    def rate(self):
        """
        Calculates the rating. Must be implemented by subclasses.
        """
        raise NotImplementedError()
|
||||
|
31
rating/canonical_url.py
Normal file
31
rating/canonical_url.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
"""
|
||||
This looks at remaining resolvable URLs after redirects
|
||||
and gives score if there is only one URL left.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether exactly one URL is left in the url_canonicalization
    check result.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_canonicalization']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'. Full score is given if the url_canonicalization
        result holds exactly one entry.
        """
        value = self.default_value
        score = 0

        # Look at the check this rater declares as its dependency. Counting
        # the url_reachability entries instead would rate the number of URLs
        # tested for reachability, not the number of canonical URLs, and
        # would rely on a check result whose presence is not verified.
        if len(self.check_results['url_canonicalization']) == 1:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
32
rating/favicon.py
Normal file
32
rating/favicon.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
"""
|
||||
This gives a score if the site has an icon.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether at least one html_head result carries a 'link_icon'
    entry that is not None.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['html_head']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'. Full score is given as soon as one URL with an
        icon link is found.
        """
        head_results = self.check_results['html_head']

        # any() stops at the first hit, so later URLs are not inspected.
        has_icon = any(
            details['link_icon'] is not None
            for details in head_results.values()
        )

        return {
            'type': self.rating_type,
            'value': True if has_icon else self.default_value,
            'score': self.max_score if has_icon else 0,
            'max_score': self.max_score,
        }
|
32
rating/feeds.py
Normal file
32
rating/feeds.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
"""
|
||||
This gives a score if the site has feeds.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether at least one html_head result carries a 'link_rss_atom'
    entry that is not None.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['html_head']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'. Full score is given as soon as one URL with a
        feed link is found.
        """
        head_results = self.check_results['html_head']

        # any() stops at the first hit, so later URLs are not inspected.
        has_feed = any(
            details['link_rss_atom'] is not None
            for details in head_results.values()
        )

        return {
            'type': self.rating_type,
            'value': True if has_feed else self.default_value,
            'score': self.max_score if has_feed else 0,
            'max_score': self.max_score,
        }
|
47
rating/https.py
Normal file
47
rating/https.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
"""
|
||||
This looks at all HTTPS URLs we checked for reachability.
|
||||
|
||||
If all of them were reachable without errors, we give full score.
|
||||
If some or all had errors, or no HTTPS URL is reachable, we give zero.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether all HTTPS URLs in the url_reachability result were
    reachable without an exception.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']

    # HTTPS is very important, so this counts double
    max_score = 2

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'. Full score requires at least one HTTPS URL and
        no HTTPS URL with an exception.
        """
        reachability = self.check_results['url_reachability']

        # Exception entries of the HTTPS URLs only; other URLs are ignored.
        https_exceptions = [
            outcome['exception']
            for url, outcome in reachability.items()
            if url.startswith('https://')
        ]

        failures = [exc for exc in https_exceptions if exc is not None]
        all_fine = len(https_exceptions) > 0 and not failures

        if all_fine:
            value = True
            score = self.max_score
        else:
            value = self.default_value
            score = 0

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
36
rating/reachable.py
Normal file
36
rating/reachable.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
"""
|
||||
This gives a score if one of the checked URL variations was reachable.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether at least one URL in the url_reachability result was
    reached without an exception.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        reachability = self.check_results['url_reachability']

        # Number of URLs without an exception. All entries are visited.
        reachable_total = sum(
            1 for outcome in reachability.values()
            if outcome['exception'] is None
        )

        if reachable_total > 0:
            value, score = True, self.max_score
        else:
            value, score = self.default_value, 0

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
35
rating/resolvable.py
Normal file
35
rating/resolvable.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
"""
|
||||
This gives a score if one of the input URL's hostnames was resolvable
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether at least one entry of the dns_resolution result is
    flagged as resolvable.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['dns_resolution']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        resolution = self.check_results['dns_resolution']

        # Number of entries with a truthy 'resolvable' flag. All entries
        # are visited.
        resolvable_total = sum(
            1 for entry in resolution.values() if entry['resolvable']
        )

        if resolvable_total > 0:
            value, score = True, self.max_score
        else:
            value, score = self.default_value, 0

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
45
rating/response_duration.py
Normal file
45
rating/response_duration.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
"""
|
||||
This looks at the response duration(s) and scores based on the bucket
|
||||
the value is in. Fast responses get one point, slower half a point,
|
||||
more than a second gets nothing.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates the mean response duration of all page_content results that
    have no exception.
    """

    rating_type = 'number'
    default_value = False
    depends_on_checks = ['page_content']
    max_score = 1.0

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.

        'value' is the rounded mean of the 'duration' entries of all pages
        without an exception. If there is no such page, 'value' stays at
        default_value and the score is 0.
        """
        value = self.default_value
        score = 0

        durations = [
            result['duration']
            for result in self.check_results['page_content'].values()
            if result['exception'] is None
        ]

        # Without a single successfully fetched page there is nothing to
        # average; dividing by the zero count would raise ZeroDivisionError.
        if durations:
            value = round(sum(durations) / len(durations))

            # value is duration in milliseconds
            if value < 100:
                score = self.max_score
            elif value < 1000:
                score = self.max_score * 0.5

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
35
rating/responsive_layout.py
Normal file
35
rating/responsive_layout.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
"""
|
||||
This gives a score if the site's minimal document width during checks
|
||||
was smaller than or equal to the minimal viewport size tested.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether, for some URL of the responsive_layout result, the
    'min_document_width' does not exceed the 'viewport_width' of the first
    entry in 'sizes'.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['responsive_layout']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'.
        """
        layout_results = self.check_results['responsive_layout']

        # The first URL fulfilling the condition decides; any() stops there
        # and does not inspect the remaining URLs.
        fits_viewport = any(
            entry['min_document_width'] <= entry['sizes'][0]['viewport_width']
            for entry in layout_results.values()
        )

        return {
            'type': self.rating_type,
            'value': True if fits_viewport else self.default_value,
            'score': self.max_score if fits_viewport else 0,
            'max_score': self.max_score,
        }
|
44
rating/www_optional.py
Normal file
44
rating/www_optional.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
"""
|
||||
This looks at reachable URLs and checks whether (sub)domains
|
||||
both with and without www. are reachable.
|
||||
"""
|
||||
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether the URLs reached without an exception cover more than
    one distinct hostname.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['url_reachability']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a rating dictionary with the keys 'type', 'value', 'score'
        and 'max_score'. Full score is given if more than one distinct
        hostname was reachable.
        """
        value = self.default_value
        score = 0

        hostnames = set()
        for url, outcome in self.check_results['url_reachability'].items():
            if outcome['exception'] is not None:
                continue

            # Collect the hostname only. Adding the whole parse result would
            # treat http:// and https:// variants (or different paths) of
            # the same host as different "hostnames".
            hostname = urlparse(url).hostname
            if hostname is not None:
                hostnames.add(hostname)

        # FIXME
        # we simply check whether there is more than one hostname.
        # this works with our current input URLs but might be too
        # simplistic in the future.
        if len(hostnames) > 1:
            value = True
            score = self.max_score

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
Loading…
Reference in a new issue