# Mirror of https://github.com/netzbegruenung/green-spider.git
# Synced 2024-05-11 13:06:08 +02:00, commit ae6a2e83e9 (see PR description for details)
"""
|
|
Extracts information from the html <head>, like existence and value
|
|
of certain meta tags, link tags, title, etc.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from urllib.parse import urljoin
|
|
from urllib.parse import urlparse
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from checks.abstract_checker import AbstractChecker
|
|
|
|
class Checker(AbstractChecker):
    """Extracts metadata from the HTML <head> of previously fetched pages.

    Reads the raw HTML from the 'page_content' results of an earlier check
    and reports title, canonical link, feed links, icon, generator,
    OpenGraph usage and viewport settings per URL.
    """

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        """Collect <head> metadata for every URL in the config.

        Returns:
            dict mapping each URL to its metadata dict, or to None when
            no HTML content was available for that URL.
        """
        results = {}

        for url in self.config.urls:
            results[url] = self.get_content(url)

        return results

    def get_content(self, url):
        """
        Expects page_content_dict['content'] to carry the HTML content.

        Returns a dict with the extracted <head> details, or None if the
        page had no content.
        """
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content
        assert 'response_headers' in page_content
        assert 'content-type' in page_content['response_headers']

        if page_content['content'] is None:
            return

        soup = BeautifulSoup(page_content['content'], 'html.parser')
        # head may be None for malformed HTML; the helpers below guard for that.
        head = soup.find('head')

        result = {
            'title': self.get_title(head),
            'link_canonical': self.get_link_canonical(head, url),
            'link_rss_atom': self.get_link_rss_atom(head, url),
            'link_icon': self.get_link_icon(head, url),
            'generator': self.get_generator(head),
            'opengraph': self.get_opengraph(head),
            'viewport': self.get_viewport(head),
        }

        return result

    def get_title(self, head):
        """Extract and clean up page title. Returns None if absent."""
        if head is None:
            return

        tag = head.find('title')
        if tag is None:
            return

        title = tag.get_text()

        # clean up: non-breaking spaces, doubled spaces, outer whitespace.
        # NOTE(fix): the double-space replacement was a no-op
        # (replace(' ', ' ')); restored to collapse '  ' into ' '.
        title = title.replace(u'\u00a0', ' ')
        title = title.replace('  ', ' ')
        title = title.strip()

        return title

    def get_link_canonical(self, head, url):
        """Return the absolute canonical URL from <link rel='canonical'>, if any."""
        if head is None:
            return
        link = head.find('link', rel='canonical')
        if link:
            return urljoin(url, link.get('href'))

    def get_link_rss_atom(self, head, url):
        """Return a list of RSS/Atom feed URLs linked from <head>.

        Relative feed URLs are made absolute against the page URL.
        """
        if head is None:
            return

        # BUGFIX: the second loop previously iterated rss_links again,
        # duplicating RSS entries and dropping all Atom feeds.
        feed_links = (head.find_all('link', type='application/rss+xml')
                      + head.find_all('link', type='application/atom+xml'))

        hrefs = []
        for link in feed_links:
            href = link.get('href')
            # skip malformed <link> tags without an href
            # (urlparse(None) would raise otherwise)
            if href is not None:
                hrefs.append(href)

        # make URLs absolute
        for i, href in enumerate(hrefs):
            parsed = urlparse(href)
            if parsed.scheme == '':
                hrefs[i] = urljoin(url, href)

        return hrefs

    def get_link_icon(self, head, url):
        """Return the absolute favicon URL from rel='icon'/'shortcut icon', if any."""
        if head is None:
            return

        # 'icon' takes precedence over the legacy 'shortcut icon'.
        tag = head.find('link', rel=lambda x: x and x.lower() == 'icon')
        if tag:
            return urljoin(url, tag.get('href'))
        tag = head.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
        if tag:
            return urljoin(url, tag.get('href'))

    def get_generator(self, head):
        """Return the content of the first <meta name='generator'> tag, if any."""
        if head is None:
            return

        tags = head.select('[name=generator]')
        if tags:
            return tags[0].get('content')

    def get_opengraph(self, head):
        """Return a sorted list of og:* property/itemprop names used, or None."""
        if head is None:
            return

        # we find tags by matching this property/itemprop value regex
        property_re = re.compile('^og:')

        opengraph = set()
        for tag in head.find_all(property=property_re):
            opengraph.add(tag.get('property'))
        for tag in head.find_all(itemprop=property_re):
            opengraph.add(tag.get('itemprop'))

        opengraph = sorted(opengraph)
        if opengraph != []:
            return opengraph

    def get_viewport(self, head):
        """Return the content of the first <meta name='viewport'> tag, if any."""
        if head is None:
            return
        tags = head.select('[name=viewport]')
        if tags:
            return tags[0].get('content')