green-spider/checks/hyperlinks.py
Marian Steinbach d0e3a4210f
Fix link raters (social media links, contact link) (#95)
* Fix rating for contact_link and social_media_link

* Skip checks when dependencies not met
2018-11-28 23:46:40 +01:00

52 lines
1.2 KiB
Python

"""
Collects information on hyperlinks on the page.
"""
import logging
from bs4 import BeautifulSoup
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
    """
    Check that gathers the hyperlinks (``<a>`` elements) found in the
    page content of every configured URL.
    """

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def depends_on_results(self):
        """
        Return the names of the previous results this check reads.
        """
        return ['page_content']

    def run(self):
        """
        Collect the links for each URL in ``self.config.urls``.

        Returns a dict mapping each URL to whatever ``get_links``
        yields for it.
        """
        assert 'page_content' in self.previous_results

        return {url: self.get_links(url) for url in self.config.urls}

    def get_links(self, url):
        """
        Extract all anchor elements from the stored HTML of ``url``.

        The HTML is read from
        ``self.previous_results['page_content'][url]['content']``.

        Returns ``None`` when that content is ``None``. Otherwise returns
        a dict with two keys: ``'links'``, a list with one
        ``{'href': ..., 'text': ...}`` entry per ``<a>`` element, and
        ``'exception'``, which is always ``None`` here.
        """
        page_entry = self.previous_results['page_content'][url]
        assert 'content' in page_entry

        html = page_entry['content']
        if html is None:
            # No content stored for this URL, so there is nothing to parse.
            return

        document = BeautifulSoup(html, 'html.parser')
        anchors = [
            {
                'href': anchor.get('href'),
                'text': anchor.text.strip(),
            }
            for anchor in document.find_all("a")
        ]

        return {
            'links': anchors,
            'exception': None,
        }