Mirror of https://github.com/netzbegruenung/green-spider.git, synced 2024-05-01 16:44:51 +02:00.
Commit: Add hyperlink checker
This commit is contained in:
parent
4524cb5714
commit
f555781735
|
@ -13,6 +13,7 @@ from checks import domain_variations
|
|||
from checks import generator
|
||||
from checks import html_head
|
||||
from checks import http_and_https
|
||||
from checks import hyperlinks
|
||||
from checks import page_content
|
||||
from checks import load_in_browser
|
||||
from checks import url_reachability
|
||||
|
@ -40,6 +41,7 @@ def perform_checks(input_url):
|
|||
('duplicate_content', duplicate_content),
|
||||
('charset', charset),
|
||||
('html_head', html_head),
|
||||
('hyperlinks', hyperlinks),
|
||||
('generator', generator),
|
||||
('load_in_browser', load_in_browser),
|
||||
]
|
||||
|
|
48
checks/hyperlinks.py
Normal file
48
checks/hyperlinks.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
Collects information on hyperlinks on the page.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
class Checker(AbstractChecker):
    """Collects the hyperlinks found on each checked page.

    Requires the 'page_content' checker to have run beforehand; its
    fetched HTML is read from previous_results.
    """

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        # The page HTML must already have been fetched by page_content.
        assert 'page_content' in self.previous_results

        return {url: self.get_links(url) for url in self.config.urls}

    def get_links(self, url):
        """
        Expects page_content_dict['content'] to carry the HTML content
        """
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content

        # No HTML was fetched for this URL — nothing to extract.
        if page_content['content'] is None:
            return

        collected = {
            'links': [],
            'exception': None,
        }

        document = BeautifulSoup(page_content['content'], 'html.parser')
        for anchor in document.find_all("a"):
            collected['links'].append({
                'href': anchor.get('href'),
                'text': anchor.text.strip(),
            })

        return collected
61
checks/hyperlinks_test.py
Normal file
61
checks/hyperlinks_test.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
import httpretty
|
||||
from httpretty import httprettified
|
||||
import unittest
|
||||
|
||||
from checks import hyperlinks
|
||||
from checks import page_content
|
||||
from checks.config import Config
|
||||
|
||||
@httprettified
class TestHyperlinks(unittest.TestCase):
    """End-to-end test: fetch a mocked page, then extract its anchors."""

    def test_links(self):
        self.maxDiff = 2000
        page_body = """
            <html>
            <head>
            <title>Title</title>
            </head>
            <body>
            <a href="/">Home</a>
            <a href="/sub/">Sub page</a>
            <a href="/"> Spaces </a>
            <a href="https://www.google.com/">External</a>
            <a href="/" style="display: hidden">Hidden</a>
            <a href="/" style="display: none">Hidden</a>
            </body>
            </html>
        """

        url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, url, body=page_body)

        previous = {}

        # Run the page_content checker first so its HTML is available.
        config = Config(urls=[url])
        content_checker = page_content.Checker(config=config, previous_results={})
        previous['page_content'] = content_checker.run()

        link_checker = hyperlinks.Checker(config=content_checker.config,
                                          previous_results=previous)
        outcome = link_checker.run()
        urls_after = link_checker.config.urls

        # Text is stripped; hidden links are still collected.
        expected_links = [
            {'href': '/', 'text': 'Home'},
            {'href': '/sub/', 'text': 'Sub page'},
            {'href': '/', 'text': 'Spaces'},
            {'href': 'https://www.google.com/', 'text': 'External'},
            {'href': '/', 'text': 'Hidden'},
            {'href': '/', 'text': 'Hidden'},
        ]
        self.assertEqual(outcome, {
            'http://example.com/': {
                'links': expected_links,
                'exception': None,
            }
        })
        self.assertEqual(urls_after, ['http://example.com/'])
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
Loading…
Reference in a new issue