green-spider/checks/html_head.py

153 lines
4.1 KiB
Python

"""
Extracts information from the html <head>, like existence and value
of certain meta tags, link tags, title, etc.
"""
import logging
import re
from urllib.parse import urljoin
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
def __init__(self, config, previous_results=None):
super().__init__(config, previous_results)
def run(self):
results = {}
for url in self.config.urls:
results[url] = self.get_content(url)
return results
def get_content(self, url):
"""
Expects page_content_dict['content'] to carry the HTML content
"""
page_content = self.previous_results['page_content'][url]
assert 'content' in page_content
assert 'response_headers' in page_content
assert 'content-type' in page_content['response_headers']
if page_content['content'] is None:
return
soup = BeautifulSoup(page_content['content'], 'html.parser')
head = soup.find('head')
result = {
'title': self.get_title(head),
'link_canonical': self.get_link_canonical(head, url),
'link_rss_atom': self.get_link_rss_atom(head, url),
'link_icon': self.get_link_icon(head, url),
'generator': self.get_generator(head),
'opengraph': self.get_opengraph(head),
'viewport': self.get_viewport(head),
}
return result
def get_title(self, head):
"""Extract and clean up page title"""
if head is None:
return
title = None
tag = head.find('title')
if tag is None:
return
title = tag.get_text()
# clean up
title = title.replace(u'\u00a0', ' ')
title = title.replace(' ', ' ')
title = title.strip()
return title
def get_link_canonical(self, head, url):
if head is None:
return
link = head.find('link', rel='canonical')
if link:
return urljoin(url, link.get('href'))
def get_link_rss_atom(self, head, url):
if head is None:
return
hrefs = []
rss_links = head.find_all('link', type='application/rss+xml')
atom_links = head.find_all('link', type='application/atom+xml')
if rss_links:
for link in rss_links:
hrefs.append(link.get('href'))
if atom_links:
for link in rss_links:
hrefs.append(link.get('href'))
# make URLs absolute
for i in range(len(hrefs)):
parsed = urlparse(hrefs[i])
if parsed.scheme == '':
hrefs[i] = urljoin(url, hrefs[i])
return hrefs
def get_link_icon(self, head, url):
if head is None:
return
tag = head.find('link', rel=lambda x: x and x.lower() == 'icon')
if tag:
return urljoin(url, tag.get('href'))
tag = head.find('link', rel=lambda x: x and x.lower() == 'shortcut icon')
if tag:
return urljoin(url, tag.get('href'))
def get_generator(self, head):
if head is None:
return
tags = head.select('[name=generator]')
if tags:
return tags[0].get('content')
def get_opengraph(self, head):
if head is None:
return
# we find tags by matching this property/itemprop value regex
property_re = re.compile('^og:')
opengraph = set()
for tag in head.find_all(property=property_re):
opengraph.add(tag.get('property'))
for tag in head.find_all(itemprop=property_re):
opengraph.add(tag.get('itemprop'))
opengraph = sorted(list(opengraph))
if opengraph != []:
return opengraph
def get_viewport(self, head):
if head is None:
return
tags = head.select('[name=viewport]')
if tags:
return tags[0].get('content')