mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-04-27 14:54:52 +02:00
Add criteria: social media links, contact link (#90)
* Add hyperlink checker * Add rating for contact and social media links * Update a comment * Remove hyperlinks details from final payload
This commit is contained in:
parent
4524cb5714
commit
3ba6940e94
|
@ -13,6 +13,7 @@ from checks import domain_variations
|
|||
from checks import generator
|
||||
from checks import html_head
|
||||
from checks import http_and_https
|
||||
from checks import hyperlinks
|
||||
from checks import page_content
|
||||
from checks import load_in_browser
|
||||
from checks import url_reachability
|
||||
|
@ -40,6 +41,7 @@ def perform_checks(input_url):
|
|||
('duplicate_content', duplicate_content),
|
||||
('charset', charset),
|
||||
('html_head', html_head),
|
||||
('hyperlinks', hyperlinks),
|
||||
('generator', generator),
|
||||
('load_in_browser', load_in_browser),
|
||||
]
|
||||
|
|
48
checks/hyperlinks.py
Normal file
48
checks/hyperlinks.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
Collects information on hyperlinks on the page.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from checks.abstract_checker import AbstractChecker
|
||||
|
||||
class Checker(AbstractChecker):
    """Collects the hyperlinks (href plus visible text) found on each page."""

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        # The page_content check must have run before us, since we parse
        # the HTML it downloaded.
        assert 'page_content' in self.previous_results

        return {url: self.get_links(url) for url in self.config.urls}

    def get_links(self, url):
        """
        Extract all anchor tags from the page's HTML.

        Expects page_content_dict['content'] to carry the HTML content.
        Returns None when no content is available for this URL.
        """
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content

        if page_content['content'] is None:
            return

        result = {
            'links': [],
            'exception': None,
        }

        doc = BeautifulSoup(page_content['content'], 'html.parser')
        for anchor in doc.find_all("a"):
            entry = {
                'href': anchor.get('href'),
                'text': anchor.text.strip(),
            }
            result['links'].append(entry)

        return result
|
61
checks/hyperlinks_test.py
Normal file
61
checks/hyperlinks_test.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
import httpretty
|
||||
from httpretty import httprettified
|
||||
import unittest
|
||||
|
||||
from checks import hyperlinks
|
||||
from checks import page_content
|
||||
from checks.config import Config
|
||||
|
||||
@httprettified
class TestHyperlinks(unittest.TestCase):
    """End-to-end test: fetch a page, run the hyperlinks checker on it."""

    def test_links(self):
        self.maxDiff = 2000

        page_body = """
            <html>
                <head>
                    <title>Title</title>
                </head>
                <body>
                    <a href="/">Home</a>
                    <a href="/sub/">Sub page</a>
                    <a href="/"> Spaces </a>
                    <a href="https://www.google.com/">External</a>
                    <a href="/" style="display: hidden">Hidden</a>
                    <a href="/" style="display: none">Hidden</a>
                </body>
            </html>
        """

        url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, url, body=page_body)

        # The hyperlinks checker consumes the page_content checker's output,
        # so run that one first and hand its results over.
        config = Config(urls=[url])
        content_checker = page_content.Checker(config=config, previous_results={})
        previous = {}
        previous['page_content'] = content_checker.run()

        checker = hyperlinks.Checker(config=content_checker.config,
                                     previous_results=previous)
        result = checker.run()
        urls_after = checker.config.urls

        expected_links = [
            {'href': '/', 'text': 'Home'},
            {'href': '/sub/', 'text': 'Sub page'},
            {'href': '/', 'text': 'Spaces'},
            {'href': 'https://www.google.com/', 'text': 'External'},
            {'href': '/', 'text': 'Hidden'},
            {'href': '/', 'text': 'Hidden'},
        ]
        self.assertEqual(result, {
            'http://example.com/': {
                'links': expected_links,
                'exception': None,
            }
        })
        # The checker must not add or drop URLs from the config.
        self.assertEqual(urls_after, ['http://example.com/'])
|
||||
|
||||
|
||||
# Allow running this test module directly via `python hyperlinks_test.py`.
if __name__ == '__main__':
    unittest.main()
|
|
@ -6,6 +6,7 @@ criteria based on information gather by checks before.
|
|||
import logging
|
||||
|
||||
from rating import canonical_url
|
||||
from rating import contact_link
|
||||
from rating import favicon
|
||||
from rating import feeds
|
||||
from rating import https
|
||||
|
@ -15,6 +16,7 @@ from rating import reachable
|
|||
from rating import resolvable
|
||||
from rating import response_duration
|
||||
from rating import responsive_layout
|
||||
from rating import social_media_links
|
||||
from rating import use_specific_fonts
|
||||
from rating import www_optional
|
||||
|
||||
|
@ -30,6 +32,7 @@ def calculate_rating(results):
|
|||
# The raters to execute.
|
||||
rating_modules = {
|
||||
'CANONICAL_URL': canonical_url,
|
||||
'CONTACT_LINK': contact_link,
|
||||
'DNS_RESOLVABLE_IPV4': resolvable,
|
||||
'FAVICON': favicon,
|
||||
'FEEDS': feeds,
|
||||
|
@ -39,6 +42,7 @@ def calculate_rating(results):
|
|||
'NO_SCRIPT_ERRORS': no_script_errors,
|
||||
'RESPONSIVE': responsive_layout,
|
||||
'SITE_REACHABLE': reachable,
|
||||
'SOCIAL_MEDIA_LINKS': social_media_links,
|
||||
'USE_SPECIFIC_FONTS': use_specific_fonts,
|
||||
'WWW_OPTIONAL': www_optional,
|
||||
}
|
||||
|
|
41
rating/contact_link.py
Normal file
41
rating/contact_link.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
"""
Checks whether the page has a link "Kontakt".
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether every checked page carries a hyperlink labelled "Kontakt".

    Awards max_score only if all URLs (and at least one URL) have such a link.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['hyperlinks']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dict with keys 'type', 'value', 'score' and 'max_score'.
        """
        value = self.default_value
        score = 0

        urls = 0
        urls_with_contact_link = 0

        for url in self.check_results['hyperlinks']:

            urls += 1

            details = self.check_results['hyperlinks'][url]
            # The hyperlinks check stores None for a URL when no HTML
            # content was available; guard against a TypeError here.
            if details is None:
                continue

            for link in details['links']:
                if link['text'].lower() == 'kontakt':
                    urls_with_contact_link += 1

                    # Count each URL at most once, even if it has several
                    # contact links; otherwise the equality test below
                    # could wrongly fail for pages with duplicate links.
                    break

        # Require at least one URL so an empty result set does not
        # vacuously earn the score.
        if urls > 0 and urls_with_contact_link == urls:
            score = self.max_score
            value = True

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
59
rating/social_media_links.py
Normal file
59
rating/social_media_links.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
"""
|
||||
Checks whether the pages have a link to social media profiles.
|
||||
"""
|
||||
|
||||
from rating.abstract_rater import AbstractRater
|
||||
from urllib.parse import urlparse
|
||||
import logging
|
||||
|
||||
class Rater(AbstractRater):
    """
    Rates whether every checked page links to at least one social media
    profile (Facebook, Twitter, Instagram or Google+).

    Awards max_score only if all URLs (and at least one URL) have such a link.
    """

    rating_type = 'boolean'
    default_value = False
    depends_on_checks = ['hyperlinks']
    max_score = 1

    def __init__(self, check_results):
        super().__init__(check_results)

    def rate(self):
        """
        Returns a dict with keys 'type', 'value', 'score' and 'max_score'.
        """
        value = self.default_value
        score = 0

        urls = 0
        urls_with_social_media_links = 0

        for url in self.check_results['hyperlinks']:

            urls += 1

            details = self.check_results['hyperlinks'][url]
            # The hyperlinks check stores None for a URL when no HTML
            # content was available; guard against a TypeError here.
            if details is None:
                continue

            for link in details['links']:

                if link['href'] is None:
                    continue

                # only process absolute links
                if not (link['href'].startswith('http:') or link['href'].startswith('https:')):
                    continue

                parsed = urlparse(link['href'])
                # Malformed absolute URLs (e.g. 'http:foo') yield no
                # hostname; `in None` would raise a TypeError.
                if parsed.hostname is None:
                    continue

                # NOTE(review): substring matching also covers subdomains
                # like www.facebook.com, but would match unrelated hosts
                # merely containing these names — consider a suffix check.
                if ("facebook.com" in parsed.hostname or
                        "twitter.com" in parsed.hostname or
                        "instagram.com" in parsed.hostname or
                        "plus.google.com" in parsed.hostname):
                    logging.debug("Found social media link on %s: %s" % (url, link['href']))
                    urls_with_social_media_links += 1

                    # make sure we only count 1 for this url
                    break

        # Require at least one URL so an empty result set does not
        # vacuously earn the score.
        if urls > 0 and urls_with_social_media_links == urls:
            score = self.max_score
            value = True

        return {
            'type': self.rating_type,
            'value': value,
            'score': score,
            'max_score': self.max_score,
        }
|
|
@ -21,12 +21,8 @@ import rating
|
|||
|
||||
def check_and_rate_site(entry):
|
||||
"""
|
||||
Performs our site check and returns results as a dict.
|
||||
|
||||
1. Normalize the input URL and derive the URLs to check for
|
||||
2. HEAD the check urls
|
||||
3. Determine the canonical URL
|
||||
4. Run full check on canonical URL
|
||||
Performs our site checks, calculates the score
|
||||
and returns results as a dict.
|
||||
"""
|
||||
|
||||
# all the info we'll return for the site
|
||||
|
@ -58,11 +54,11 @@ def check_and_rate_site(entry):
|
|||
for key in result['rating']:
|
||||
result['score'] += result['rating'][key]['score']
|
||||
|
||||
# remove full HTML page content,
|
||||
# as it's no longer needed
|
||||
# remove full HTML page content and hyperlinks to save some storage
|
||||
try:
|
||||
for url in result['checks']['page_content']:
|
||||
del result['checks']['page_content'][url]['content']
|
||||
del result['checks']['hyperlinks']
|
||||
except:
|
||||
pass
|
||||
|
||||
|
|
Loading…
Reference in a new issue