Introduce rating modules

2024-05-01 16:44:51 +02:00 · 2018-09-28 22:49:07 +02:00 · 2018-09-28 22:49:07 +02:00 · d4b5695ae9
parent 849aa1ad2a
commit d4b5695ae9
13 changed files with 412 additions and 1 deletions
--- a/1
+++ b/1
@ -15,6 +15,7 @@ ADD data_export.py /
 ADD config /config
 ADD jobs /jobs
 ADD checks /checks
+ADD rating /rating

 ENTRYPOINT ["python3"]
 CMD ["/spider.py"]
--- a/config/init.py
+++ b/config/init.py
@ -22,4 +22,5 @@ GCMS_IP = "91.102.13.20"
 JOB_DATASTORE_KIND = 'spider-jobs'

 # kind name of the spider results datastore entities
-RESULTS_DATASTORE_KIND = 'spider-results'
+# TODO: change back to 'spider-results'
+RESULTS_DATASTORE_KIND = 'spider-results-dev'
--- a/rating/init.py
+++ b/rating/init.py
@ -0,0 +1,50 @@
+"""
+The rating module contains the functionality to calculate scores for certain
+criteria based on information gathered by checks before.
+"""
+
+import logging
+
+from rating import canonical_url
+from rating import favicon
+from rating import feeds
+from rating import https
+from rating import reachable
+from rating import resolvable
+from rating import response_duration
+from rating import responsive_layout
+from rating import www_optional
+
+
+def calculate_rating(results):
+    """
+    Calculates ratings for a number of criteria.
+
+    Params:
+    results - Results dictionary from checks
+    """
+
+    # The sequence of checks to run. Order is important!
+    # Checks which expand the URLs list must come first.
+    # After that, dependencies (encoded in the checks) have to be fulfilled.
+    rating_modules = [
+        ('DNS_RESOLVABLE_IPV4', resolvable),
+        ('SITE_REACHABLE', reachable),
+        ('HTTPS', https),
+        ('WWW_OPTIONAL', www_optional),
+        ('CANONICAL_URL', canonical_url),
+        ('HTTP_RESPONSE_DURATION', response_duration),
+        ('FAVICON', favicon),
+        ('FEEDS', feeds),
+        ('RESPONSIVE', responsive_layout),
+    ]
+
+    output = {}
+
+    for name, mod in rating_modules:
+
+        rater = mod.Rater(results)
+        output[name] = rater.rate()
+
+    
+    return output
--- a/rating/abstract_rater.py
+++ b/rating/abstract_rater.py
@ -0,0 +1,22 @@
+class AbstractRater(object):
+
+    # String 'boolean' or 'number'
+    rating_type = None
+
+    # The default value to return if no rating given
+    default_value = None
+    
+    max_score = 1
+
+    # Name of the checks this rater depends on
+    depends_on_checks = []
+
+    def __init__(self, check_results):
+        self.check_results = check_results
+
+        for item in self.depends_on_checks:
+            assert item in self.check_results
+
+    def rate(self):
+        raise NotImplementedError()
+    
--- a/rating/canonical_url.py
+++ b/rating/canonical_url.py
@ -0,0 +1,31 @@
+"""
+This looks at remaining resolvable URLs after redirects
+and gives score if there is only one URL left.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'boolean'
+    default_value = False
+    depends_on_checks = ['url_canonicalization']
+    max_score = 1
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        if len(self.check_results['url_reachability']) == 1:
+            value = True
+            score = self.max_score
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
--- a/rating/favicon.py
+++ b/rating/favicon.py
@ -0,0 +1,32 @@
+"""
+This gives a score if the site has an icon.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'boolean'
+    default_value = False
+    depends_on_checks = ['html_head']
+    max_score = 1
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        for url in self.check_results['html_head']:
+            if self.check_results['html_head'][url]['link_icon'] is not None:
+                value = True
+                score = self.max_score
+                break
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
--- a/rating/feeds.py
+++ b/rating/feeds.py
@ -0,0 +1,32 @@
+"""
+This gives a score if the site has feeds.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'boolean'
+    default_value = False
+    depends_on_checks = ['html_head']
+    max_score = 1
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        for url in self.check_results['html_head']:
+            if self.check_results['html_head'][url]['link_rss_atom'] is not None:
+                value = True
+                score = self.max_score
+                break
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
--- a/rating/https.py
+++ b/rating/https.py
@ -0,0 +1,47 @@
+"""
+This looks at all HTTPS URLs we checked for reachability.
+
+If all of them were reachable without errors, we give full score.
+If some or all had errors, or no HTTPS URL is reachable, we give zero.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'boolean'
+    default_value = False
+    depends_on_checks = ['url_reachability']
+
+    # HTTPS is very important, so this counts double
+    max_score = 2
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        reachable_count = 0
+        unreachable_count = 0
+
+        for url in self.check_results['url_reachability']:
+            if not url.startswith('https://'):
+                continue
+
+            if self.check_results['url_reachability'][url]['exception'] is None:
+                reachable_count += 1
+            else:
+                unreachable_count += 1
+        
+        if unreachable_count == 0 and reachable_count > 0:
+            value = True
+            score = self.max_score
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
--- a/rating/reachable.py
+++ b/rating/reachable.py
@ -0,0 +1,36 @@
+"""
+This gives a score if one of the checked URL variations was reachable.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'boolean'
+    default_value = False
+    depends_on_checks = ['url_reachability']
+    max_score = 1
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        count = 0
+        for url in self.check_results['url_reachability']:
+            if self.check_results['url_reachability'][url]['exception'] is not None:
+                continue
+            count += 1
+        
+        if count > 0:
+            value = True
+            score = self.max_score
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
--- a/rating/resolvable.py
+++ b/rating/resolvable.py
@ -0,0 +1,35 @@
+"""
+This gives a score if one of the input URL's hostnames was resolvable
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'boolean'
+    default_value = False
+    depends_on_checks = ['dns_resolution']
+    max_score = 1
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        count = 0
+        for url in self.check_results['dns_resolution']:
+            if self.check_results['dns_resolution'][url]['resolvable']:
+                count += 1
+        
+        if count > 0:
+            value = True
+            score = self.max_score
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
--- a/rating/response_duration.py
+++ b/rating/response_duration.py
@ -0,0 +1,45 @@
+"""
+This looks at the response duration(s) and scores based on the bucket
+the value is in. Fast responses get one point, slower half a point,
+more than a second gets nothing.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'number'
+    default_value = False
+    depends_on_checks = ['page_content']
+    max_score = 1.0
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        duration_sum = 0
+        duration_count = 0
+
+        for url in self.check_results['page_content']:
+            if self.check_results['page_content'][url]['exception'] is not None:
+                continue
+            duration_sum += self.check_results['page_content'][url]['duration']
+            duration_count += 1
+        
+        value = round(duration_sum / duration_count)
+        
+        # value is duration in milliseconds
+        if value < 100:
+            score = self.max_score
+        elif value < 1000:
+            score = self.max_score * 0.5
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
--- a/rating/responsive_layout.py
+++ b/rating/responsive_layout.py
@ -0,0 +1,35 @@
+"""
+This gives a score if the site's minimal document width during checks
+was smaller than or equal to the minimal viewport size tested.
+"""
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'boolean'
+    default_value = False
+    depends_on_checks = ['responsive_layout']
+    max_score = 1
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        for url in self.check_results['responsive_layout']:
+            if (self.check_results['responsive_layout'][url]['min_document_width'] <=
+                self.check_results['responsive_layout'][url]['sizes'][0]['viewport_width']):
+                value = True
+                score = self.max_score
+                # we use the first URL found here
+                break
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }
--- a/rating/www_optional.py
+++ b/rating/www_optional.py
@ -0,0 +1,44 @@
+"""
+This looks at reachable URLs and checks whether (sub)domains
+both with and without www. are reachable.
+"""
+
+from urllib.parse import urlparse
+
+from rating.abstract_rater import AbstractRater
+
+class Rater(AbstractRater):
+
+    rating_type = 'boolean'
+    default_value = False
+    depends_on_checks = ['url_reachability']
+    max_score = 1
+
+    def __init__(self, check_results):
+        super().__init__(check_results)
+    
+    def rate(self):
+        value = self.default_value
+        score = 0
+
+        hostnames = set()
+        for url in self.check_results['url_reachability']:
+            if self.check_results['url_reachability'][url]['exception'] is not None:
+                continue
+            parsed = urlparse(url)
+            hostnames.add(parsed)
+        
+        # FIXME
+        # we simply check whether there is more than one hostname.
+        # this works with our current input URls but might be too
+        # simplistic in the future.
+        if len(list(hostnames)) > 1:
+            value = True
+            score = self.max_score
+
+        return {
+            'type': self.rating_type,
+            'value': value,
+            'score': score,
+            'max_score': self.max_score,
+        }