Add hyperlink checker

This commit is contained in:
Marian Steinbach 2018-11-20 22:38:14 +01:00
parent 4524cb5714
commit f555781735
3 changed files with 111 additions and 0 deletions

View file

@@ -13,6 +13,7 @@ from checks import domain_variations
from checks import generator
from checks import html_head
from checks import http_and_https
from checks import hyperlinks
from checks import page_content
from checks import load_in_browser
from checks import url_reachability
@@ -40,6 +41,7 @@ def perform_checks(input_url):
('duplicate_content', duplicate_content),
('charset', charset),
('html_head', html_head),
('hyperlinks', hyperlinks),
('generator', generator),
('load_in_browser', load_in_browser),
]

48
checks/hyperlinks.py Normal file
View file

@@ -0,0 +1,48 @@
"""
Collects information on hyperlinks on the page.
"""
import logging
from bs4 import BeautifulSoup
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
    """
    Lists the hyperlinks (``<a>`` elements) found in the HTML of every
    configured URL, using the output of the earlier ``page_content`` check.
    """

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        """
        Returns a dict that maps each URL in ``self.config.urls`` to
        whatever ``get_links`` yields for that URL.
        """
        assert 'page_content' in self.previous_results

        return {url: self.get_links(url) for url in self.config.urls}

    def get_links(self, url):
        """
        Reads the HTML stored under
        ``previous_results['page_content'][url]['content']`` and returns
        a dict with two keys: ``links`` (one ``{'href', 'text'}`` dict per
        ``<a>`` element, text stripped of surrounding whitespace) and
        ``exception`` (always None here).

        Returns None when no content is stored for the URL.
        """
        entry = self.previous_results['page_content'][url]
        assert 'content' in entry

        html = entry['content']
        if html is None:
            # Nothing was stored for this page, so there is nothing to parse.
            return None

        document = BeautifulSoup(html, 'html.parser')
        anchors = [
            {
                'href': anchor.get('href'),
                'text': anchor.text.strip(),
            }
            for anchor in document.find_all("a")
        ]

        return {
            'links': anchors,
            'exception': None,
        }

61
checks/hyperlinks_test.py Normal file
View file

@@ -0,0 +1,61 @@
import httpretty
from httpretty import httprettified
import unittest
from checks import hyperlinks
from checks import page_content
from checks.config import Config
@httprettified
class TestHyperlinks(unittest.TestCase):
    """Runs hyperlinks.Checker against a page served by httpretty."""

    def test_links(self):
        """
        Feeds a small HTML page through the page_content checker and then
        the hyperlinks checker, and compares the reported links (href plus
        stripped text) with the anchors in the page.
        """
        self.maxDiff = 2000

        url = 'http://example.com/'
        page_body = """
<html>
<head>
<title>Title</title>
</head>
<body>
<a href="/">Home</a>
<a href="/sub/">Sub page</a>
<a href="/"> Spaces </a>
<a href="https://www.google.com/">External</a>
<a href="/" style="display: hidden">Hidden</a>
<a href="/" style="display: none">Hidden</a>
</body>
</html>
"""
        httpretty.register_uri(httpretty.GET, url, body=page_body)

        # The hyperlinks checker consumes the page_content checker's output,
        # so that check runs first.
        content_checker = page_content.Checker(
            config=Config(urls=[url]),
            previous_results={},
        )
        previous = {'page_content': content_checker.run()}

        link_checker = hyperlinks.Checker(
            config=content_checker.config,
            previous_results=previous,
        )
        actual = link_checker.run()
        urls_afterwards = link_checker.config.urls

        expected_links = [
            {'href': '/', 'text': 'Home'},
            {'href': '/sub/', 'text': 'Sub page'},
            {'href': '/', 'text': 'Spaces'},
            {'href': 'https://www.google.com/', 'text': 'External'},
            {'href': '/', 'text': 'Hidden'},
            {'href': '/', 'text': 'Hidden'},
        ]
        expected = {
            'http://example.com/': {
                'links': expected_links,
                'exception': None,
            }
        }

        self.assertEqual(actual, expected)
        # The configured URL list should be the same after the run.
        self.assertEqual(urls_afterwards, ['http://example.com/'])
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()