mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-04-27 14:54:52 +02:00
Add criteria: social media links, contact link (#90)
* Add hyperlink checker * Add rating for contact and social media links * Update a comment * Remove hyperlinks details from final payload
This commit is contained in:
parent
4524cb5714
commit
3ba6940e94
|
@ -13,6 +13,7 @@ from checks import domain_variations
|
|||
from checks import generator
|
||||
from checks import html_head
|
||||
from checks import http_and_https
|
||||
from checks import hyperlinks
|
||||
from checks import page_content
|
||||
from checks import load_in_browser
|
||||
from checks import url_reachability
|
||||
|
@ -40,6 +41,7 @@ def perform_checks(input_url):
|
|||
('duplicate_content', duplicate_content),
|
||||
('charset', charset),
|
||||
('html_head', html_head),
|
||||
('hyperlinks', hyperlinks),
|
||||
('generator', generator),
|
||||
('load_in_browser', load_in_browser),
|
||||
]
|
||||
|
|
48
checks/hyperlinks.py
Normal file
48
checks/hyperlinks.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
Collects information on hyperlinks on the page.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
class Checker(AbstractChecker):
    """Collects the hyperlinks (href plus visible text) found on each page."""

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        # The page_content check must have run before us, since we parse
        # the HTML it downloaded.
        assert 'page_content' in self.previous_results

        return {url: self.get_links(url) for url in self.config.urls}

    def get_links(self, url):
        """
        Extract all anchor tags from the page's HTML.

        Expects page_content_dict['content'] to carry the HTML content.
        Returns None when no content is available for this URL.
        """
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content

        if page_content['content'] is None:
            return

        result = {
            'links': [],
            'exception': None,
        }

        doc = BeautifulSoup(page_content['content'], 'html.parser')
        for anchor in doc.find_all("a"):
            entry = {
                'href': anchor.get('href'),
                'text': anchor.text.strip(),
            }
            result['links'].append(entry)

        return result
|
61
checks/hyperlinks_test.py
Normal file
61
checks/hyperlinks_test.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
import httpretty
|
||||
from httpretty import httprettified
|
||||
import unittest
|
||||
|
||||
from checks import hyperlinks
|
||||
from checks import page_content
|
||||
from checks.config import Config
|
||||
|
||||
@httprettified
class TestHyperlinks(unittest.TestCase):
    """End-to-end test: fetch a page, run the hyperlinks checker on it."""

    def test_links(self):
        self.maxDiff = 2000

        page_body = """
            <html>
                <head>
                    <title>Title</title>
                </head>
                <body>
                    <a href="/">Home</a>
                    <a href="/sub/">Sub page</a>
                    <a href="/"> Spaces </a>
                    <a href="https://www.google.com/">External</a>
                    <a href="/" style="display: hidden">Hidden</a>
                    <a href="/" style="display: none">Hidden</a>
                </body>
            </html>
        """

        url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, url, body=page_body)

        # The hyperlinks checker consumes the page_content checker's output,
        # so run that one first and hand its results over.
        config = Config(urls=[url])
        content_checker = page_content.Checker(config=config, previous_results={})
        previous = {}
        previous['page_content'] = content_checker.run()

        checker = hyperlinks.Checker(config=content_checker.config,
                                     previous_results=previous)
        result = checker.run()
        urls_after = checker.config.urls

        expected_links = [
            {'href': '/', 'text': 'Home'},
            {'href': '/sub/', 'text': 'Sub page'},
            {'href': '/', 'text': 'Spaces'},
            {'href': 'https://www.google.com/', 'text': 'External'},
            {'href': '/', 'text': 'Hidden'},
            {'href': '/', 'text': 'Hidden'},
        ]
        self.assertEqual(result, {
            'http://example.com/': {
                'links': expected_links,
                'exception': None,
            }
        })
        # The checker must not add or drop URLs from the config.
        self.assertEqual(urls_after, ['http://example.com/'])
|
||||
|
||||
|
||||
# Allow running this test module directly via `python hyperlinks_test.py`.
if __name__ == '__main__':
    unittest.main()
|
|
@ -6,6 +6,7 @@ criteria based on information gather by checks before.
|
|||
import logging
|
||||
|
||||
from rating import canonical_url
|
||||
from rating import contact_link
|
||||
from rating import favicon
|
||||
from rating import feeds
|
||||
from rating import https
|
||||
|
@ -15,6 +16,7 @@ from rating import reachable
|
|||
from rating import resolvable
|
||||
from rating import response_duration
|
||||
from rating import responsive_layout
|
||||
from rating import social_media_links
|
||||
from rating import use_specific_fonts
|
||||
from rating import www_optional
|
||||
|
||||
|
@ -30,6 +32,7 @@ def calculate_rating(results):
|
|||
# The raters to execute.
|
||||
rating_modules = {
|
||||
'CANONICAL_URL': canonical_url,
|
||||
'CONTACT_LINK': contact_link,
|
||||
'DNS_RESOLVABLE_IPV4': resolvable,
|
||||
'FAVICON': favicon,
|
||||
'FEEDS': feeds,
|
||||
|
@ -39,6 +42,7 @@ def calculate_rating(results):
|
|||
'NO_SCRIPT_ERRORS': no_script_errors,
|
||||
'RESPONSIVE': responsive_layout,
|
||||
'SITE_REACHABLE': reachable,
|
||||
'SOCIAL_MEDIA_LINKS': social_media_links,
|
||||
'USE_SPECIFIC_FONTS': use_specific_fonts,
|
||||
'WWW_OPTIONAL': www_optional,
|
||||
}
|
||||
|
|
41
rating/contact_link.py
Normal file
41
rating/contact_link.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
"""
Checks whether the page has a link "Kontakt".
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether every checked page carries a hyperlink labelled "Kontakt".

    Awards max_score only if all URLs (and at least one URL) have such a link.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['hyperlinks']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dict with keys 'type', 'value', 'score' and 'max_score'.
        """
        value = self.default_value
        score = 0

        urls = 0
        urls_with_contact_link = 0

        for url in self.check_results['hyperlinks']:

            urls += 1

            details = self.check_results['hyperlinks'][url]
            # The hyperlinks check stores None for a URL when no HTML
            # content was available; guard against a TypeError here.
            if details is None:
                continue

            for link in details['links']:
                if link['text'].lower() == 'kontakt':
                    urls_with_contact_link += 1

                    # Count each URL at most once, even if it has several
                    # contact links; otherwise the equality test below
                    # could wrongly fail for pages with duplicate links.
                    break

        # Require at least one URL so an empty result set does not
        # vacuously earn the score.
        if urls > 0 and urls_with_contact_link == urls:
            score = self.max_score
            value = True

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
59
rating/social_media_links.py
Normal file
59
rating/social_media_links.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
"""
|
||||
Checks whether the pages have a link to social media profiles.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
from urllib.parse import urlparse
|
||||
import logging
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether every checked page links to at least one social media
    profile (Facebook, Twitter, Instagram or Google+).

    Awards max_score only if all URLs (and at least one URL) have such a link.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['hyperlinks']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dict with keys 'type', 'value', 'score' and 'max_score'.
        """
        value = self.default_value
        score = 0

        urls = 0
        urls_with_social_media_links = 0

        for url in self.check_results['hyperlinks']:

            urls += 1

            details = self.check_results['hyperlinks'][url]
            # The hyperlinks check stores None for a URL when no HTML
            # content was available; guard against a TypeError here.
            if details is None:
                continue

            for link in details['links']:

                if link['href'] is None:
                    continue

                # only process absolute links
                if not (link['href'].startswith('http:') or link['href'].startswith('https:')):
                    continue

                parsed = urlparse(link['href'])
                # Malformed absolute URLs (e.g. 'http:foo') yield no
                # hostname; `in None` would raise a TypeError.
                if parsed.hostname is None:
                    continue

                # NOTE(review): substring matching also covers subdomains
                # like www.facebook.com, but would match unrelated hosts
                # merely containing these names — consider a suffix check.
                if ("facebook.com" in parsed.hostname or
                        "twitter.com" in parsed.hostname or
                        "instagram.com" in parsed.hostname or
                        "plus.google.com" in parsed.hostname):
                    logging.debug("Found social media link on %s: %s" % (url, link['href']))
                    urls_with_social_media_links += 1

                    # make sure we only count 1 for this url
                    break

        # Require at least one URL so an empty result set does not
        # vacuously earn the score.
        if urls > 0 and urls_with_social_media_links == urls:
            score = self.max_score
            value = True

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
|
@ -21,12 +21,8 @@ import rating
|
|||
|
||||
def check_and_rate_site(entry):
|
||||
"""
|
||||
Performs our site check and returns results as a dict.
|
||||
|
||||
1. Normalize the input URL and derive the URLs to check for
|
||||
2. HEAD the check urls
|
||||
3. Determine the canonical URL
|
||||
4. Run full check on canonical URL
|
||||
Performs our site checks, calculates the score
|
||||
and returns results as a dict.
|
||||
"""
|
||||
|
||||
# all the info we'll return for the site
|
||||
|
@ -58,11 +54,11 @@ def check_and_rate_site(entry):
|
|||
for key in result['rating']:
|
||||
result['score'] += result['rating'][key]['score']
|
||||
|
||||
# remove full HTML page content,
|
||||
# as it's no longer needed
|
||||
# remove full HTML page content and hyperlinks to save some storage
|
||||
try:
|
||||
for url in result['checks']['page_content']:
|
||||
del result['checks']['page_content'][url]['content']
|
||||
del result['checks']['hyperlinks']
|
||||
except:
|
||||
pass
|
||||
|
||||
|
|
Loading…
Reference in a new issue