"""
|
|
Collects information on hyperlinks on the page.
|
|
"""
|
|
|
|
import logging
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from checks.abstract_checker import AbstractChecker
|
|
|
|
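
# Expected input shape, inferred from the code below rather than from a
# documented contract: previous_results['page_content'] maps each URL to a
# dict with at least a 'content' key holding the page HTML as a string, or
# None when fetching failed, e.g.:
#
#     {'https://example.com/': {'content': '<html>...</html>'}}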


class Checker(AbstractChecker):

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def depends_on_results(self):
        # This check requires output from the 'page_content' check;
        # it is skipped when that dependency has not been met.
        return ['page_content']

    def run(self):
        assert 'page_content' in self.previous_results

        results = {}

        # Collect the links found on each configured URL.
        for url in self.config.urls:
            results[url] = self.get_links(url)

        return results

    def get_links(self, url):
        """
        Expects the 'page_content' result for this URL to carry the
        page's HTML in its 'content' entry.
        """
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content

        if page_content['content'] is None:
            # No HTML was retrieved for this URL, so there is nothing
            # to parse; the caller receives None instead of a result.
            return

        result = {
            'links': [],
            'exception': None,
        }

        soup = BeautifulSoup(page_content['content'], 'html.parser')

        # Record the target and visible text of every anchor element.
        for link in soup.find_all("a"):
            result['links'].append({
                'href': link.get('href'),
                'text': link.text.strip(),
            })

        return result
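

# Usage sketch, not part of the original module: drives the checker with a
# stub config and a hand-made 'page_content' result, assuming that
# AbstractChecker stores its arguments as self.config and
# self.previous_results, as the method bodies above imply. The stub class
# and sample HTML are illustrative only, and running this requires the
# repo's checks package to be importable.
if __name__ == '__main__':

    class StubConfig:
        urls = ['https://example.com/']

    previous_results = {
        'page_content': {
            'https://example.com/': {
                'content': '<a href="/impressum">Impressum</a>',
            },
        },
    }

    checker = Checker(StubConfig(), previous_results)
    print(checker.run())
    # -> {'https://example.com/': {'links': [{'href': '/impressum',
    #    'text': 'Impressum'}], 'exception': None}}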