Add criteria: social media links, contact link (#90)

* Add hyperlink checker

* Add rating for contact and social media links

* Update a comment

* Remove hyperlinks details from final payload
This commit is contained in:
Marian Steinbach 2018-11-20 22:47:34 +01:00 committed by GitHub
parent 4524cb5714
commit 3ba6940e94
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 219 additions and 8 deletions

View file

@ -13,6 +13,7 @@ from checks import domain_variations
from checks import generator
from checks import html_head
from checks import http_and_https
from checks import hyperlinks
from checks import page_content
from checks import load_in_browser
from checks import url_reachability
@ -40,6 +41,7 @@ def perform_checks(input_url):
('duplicate_content', duplicate_content),
('charset', charset),
('html_head', html_head),
('hyperlinks', hyperlinks),
('generator', generator),
('load_in_browser', load_in_browser),
]

48
checks/hyperlinks.py Normal file
View file

@ -0,0 +1,48 @@
"""
Collects information on hyperlinks on the page.
"""
import logging
from bs4 import BeautifulSoup
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
    """Collects every anchor tag from the previously fetched page content."""

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        # The page_content check must have been executed before this one.
        assert 'page_content' in self.previous_results

        return {url: self.get_links(url) for url in self.config.urls}

    def get_links(self, url):
        """
        Expects page_content_dict['content'] to carry the HTML content
        """
        content_result = self.previous_results['page_content'][url]
        assert 'content' in content_result

        html = content_result['content']
        if html is None:
            # No content was fetched for this URL.
            return

        collected = {
            'links': [],
            'exception': None,
        }

        document = BeautifulSoup(html, 'html.parser')
        for anchor in document.find_all("a"):
            entry = {
                'href': anchor.get('href'),
                'text': anchor.text.strip(),
            }
            collected['links'].append(entry)

        return collected

61
checks/hyperlinks_test.py Normal file
View file

@ -0,0 +1,61 @@
import httpretty
from httpretty import httprettified
import unittest
from checks import hyperlinks
from checks import page_content
from checks.config import Config
@httprettified
class TestHyperlinks(unittest.TestCase):
    """Tests for the hyperlinks checker."""

    def test_links(self):
        # Show the full dict diff on assertion failure.
        self.maxDiff = 2000

        body = """
            <html>
                <head>
                    <title>Title</title>
                </head>
                <body>
                    <a href="/">Home</a>
                    <a href="/sub/">Sub page</a>
                    <a href="/"> Spaces </a>
                    <a href="https://www.google.com/">External</a>
                    <a href="/" style="display: hidden">Hidden</a>
                    <a href="/" style="display: none">Hidden</a>
                </body>
            </html>
        """

        test_url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, test_url, body=body)

        config = Config(urls=[test_url])
        content_checker = page_content.Checker(config=config, previous_results={})
        previous = {'page_content': content_checker.run()}

        link_checker = hyperlinks.Checker(config=content_checker.config,
                                          previous_results=previous)
        result = link_checker.run()

        expected = {
            'http://example.com/': {
                'links': [
                    {'href': '/', 'text': 'Home'},
                    {'href': '/sub/', 'text': 'Sub page'},
                    {'href': '/', 'text': 'Spaces'},
                    {'href': 'https://www.google.com/', 'text': 'External'},
                    {'href': '/', 'text': 'Hidden'},
                    {'href': '/', 'text': 'Hidden'},
                ],
                'exception': None,
            }
        }
        self.assertEqual(result, expected)

        # The checker must leave the configured URL list untouched.
        self.assertEqual(link_checker.config.urls, ['http://example.com/'])


if __name__ == '__main__':
    unittest.main()

View file

@ -6,6 +6,7 @@ criteria based on information gather by checks before.
import logging
from rating import canonical_url
from rating import contact_link
from rating import favicon
from rating import feeds
from rating import https
@ -15,6 +16,7 @@ from rating import reachable
from rating import resolvable
from rating import response_duration
from rating import responsive_layout
from rating import social_media_links
from rating import use_specific_fonts
from rating import www_optional
@ -30,6 +32,7 @@ def calculate_rating(results):
# The raters to execute.
rating_modules = {
'CANONICAL_URL': canonical_url,
'CONTACT_LINK': contact_link,
'DNS_RESOLVABLE_IPV4': resolvable,
'FAVICON': favicon,
'FEEDS': feeds,
@ -39,6 +42,7 @@ def calculate_rating(results):
'NO_SCRIPT_ERRORS': no_script_errors,
'RESPONSIVE': responsive_layout,
'SITE_REACHABLE': reachable,
'SOCIAL_MEDIA_LINKS': social_media_links,
'USE_SPECIFIC_FONTS': use_specific_fonts,
'WWW_OPTIONAL': www_optional,
}

41
rating/contact_link.py Normal file
View file

@ -0,0 +1,41 @@
"""
Checks whether the pages have a link "Kontakt"
"""
from rating.abstract_rater import AbstractRater
class Rater(AbstractRater):
    """
    Rates True (full score) only if every checked URL has at least one
    link whose text is exactly "Kontakt" (case-insensitive).
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['hyperlinks']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        urls = 0
        urls_with_contact_link = 0

        for url in self.check_results['hyperlinks']:
            urls += 1
            link_info = self.check_results['hyperlinks'][url]
            # The hyperlinks check returns None for URLs without content;
            # subscripting None would raise a TypeError here.
            if link_info is None:
                continue
            for link in link_info['links']:
                if link['text'].lower() == 'kontakt':
                    urls_with_contact_link += 1
                    # Count each URL at most once. Without this break, a
                    # single page with several contact links could push the
                    # counter up to `urls` even though other pages have none.
                    break

        # Require at least one URL so an empty result set does not rate
        # True vacuously (0 == 0).
        if urls > 0 and urls_with_contact_link == urls:
            score = self.max_score
            value = True

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

View file

@ -0,0 +1,59 @@
"""
Checks whether the pages have a link to social media profiles.
"""
from rating.abstract_rater import AbstractRater
from urllib.parse import urlparse
import logging
class Rater(AbstractRater):
    """
    Rates True (full score) only if every checked URL links to at least
    one of the known social media sites via an absolute http(s) URL.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['hyperlinks']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        value = self.default_value
        score = 0

        urls = 0
        urls_with_social_media_links = 0

        for url in self.check_results['hyperlinks']:
            urls += 1
            link_info = self.check_results['hyperlinks'][url]
            # The hyperlinks check returns None for URLs without content;
            # subscripting None would raise a TypeError here.
            if link_info is None:
                continue
            for link in link_info['links']:
                if link['href'] is None:
                    continue
                # only process absolute links
                if not link['href'].startswith(('http:', 'https:')):
                    continue
                parsed = urlparse(link['href'])
                # hostname is None for malformed URLs such as "http:foo",
                # which pass the startswith filter above; `"x" in None`
                # would raise a TypeError.
                if parsed.hostname is None:
                    continue
                # NOTE: substring matching also accepts subdomains like
                # m.facebook.com, but equally any hostname that merely
                # contains one of these strings.
                if ("facebook.com" in parsed.hostname or
                        "twitter.com" in parsed.hostname or
                        "instagram.com" in parsed.hostname or
                        "plus.google.com" in parsed.hostname):
                    logging.debug("Found social media link on %s: %s" % (url, link['href']))
                    urls_with_social_media_links += 1
                    # make sure we only count 1 for this url
                    break

        # Require at least one URL so an empty result set does not rate
        # True vacuously (0 == 0).
        if urls > 0 and urls_with_social_media_links == urls:
            score = self.max_score
            value = True

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }

View file

@ -21,12 +21,8 @@ import rating
def check_and_rate_site(entry):
"""
Performs our site check and returns results as a dict.
1. Normalize the input URL and derive the URLs to check for
2. HEAD the check urls
3. Determine the canonical URL
4. Run full check on canonical URL
Performs our site checks, calculates the score
and returns results as a dict.
"""
# all the info we'll return for the site
@ -58,11 +54,11 @@ def check_and_rate_site(entry):
for key in result['rating']:
result['score'] += result['rating'][key]['score']
# remove full HTML page content,
# as it's no longer needed
# remove full HTML page content and hyperlinks to save some storage
try:
for url in result['checks']['page_content']:
del result['checks']['page_content'][url]['content']
del result['checks']['hyperlinks']
except:
pass