Add content checks

Marian Steinbach 2018-04-09 22:39:53 +02:00
parent 0ecf7e306e
commit 48ee1ab8a1


@@ -1,5 +1,6 @@
 # coding: utf8
 
+from bs4 import BeautifulSoup
 from git import Repo
 from multiprocessing import Pool
 from socket import gethostbyname_ex
@@ -9,6 +10,7 @@ import json
 import logging
 import os
 import random
+import re
 import requests
 import shutil
 import sys
@@ -115,6 +117,69 @@ def reduce_urls(urllist):
     return list(targets)
 
 
+def normalize_title(s):
+    """
+    Removes garbage from HTML page titles
+    """
+    s = s.replace('\u00a0', ' ')
+    s = s.replace('  ', ' ')
+    s = s.strip()
+    return s
+
+
+def check_content(r):
+    """
+    Returns a dict with details about the content of the page
+
+    r: requests response object
+    """
+    result = {}
+
+    result['encoding'] = r.encoding
+    soup = BeautifulSoup(r.text, 'html.parser')
+
+    # page title
+    result['title'] = None
+    title = soup.find('head').find('title')
+    if title is not None:
+        result['title'] = normalize_title(title.get_text())
+
+    # canonical link
+    result['canonical_link'] = None
+    link = soup.find('link', rel='canonical')
+    if link:
+        result['canonical_link'] = link.get('href')
+
+    # feed links
+    result['feeds'] = []
+    rss_links = soup.find_all('link', type='application/rss+xml')
+    atom_links = soup.find_all('link', type='application/atom+xml')
+    if len(rss_links) > 0:
+        for l in rss_links:
+            result['feeds'].append(l.get('href'))
+    if len(atom_links) > 0:
+        for l in atom_links:
+            result['feeds'].append(l.get('href'))
+
+    # generator meta tag
+    result['generator'] = None
+    generator = soup.head.select('[name=generator]')
+    if len(generator):
+        result['generator'] = generator[0].get('content')
+
+    # opengraph meta tags
+    result['opengraph'] = None
+    og = set()
+    for item in soup.head.find_all(property=re.compile('^og:')):
+        og.add(item.get('property'))
+    for item in soup.head.find_all(itemprop=re.compile('^og:')):
+        og.add(item.get('itemprop'))
+    if len(og):
+        result['opengraph'] = list(og)
+
+    return result
+
+
 def check_site(url):
     """
     Performs our site check and returns results as a dict.
@@ -204,12 +269,18 @@ def check_site(url):
         'status_code': None,
         'duration': None,
         'error': None,
+        'content': None,
     }
 
     try:
         r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
         check['status_code'] = r.status_code
         check['duration'] = round(r.elapsed.total_seconds() * 1000)
+
+        # Content checks
+        if r.status_code < 300:
+            check['content'] = check_content(r)
+
    except requests.exceptions.ConnectionError as e:
        logging.error(str(e) + " " + check_url)
        check['error'] = "connection"
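
For reference, a minimal standalone sketch of how the new check_content() helper can be exercised outside of check_site(). This is not part of the commit: the URL and timeout values are placeholders, and the sketch assumes check_content() and normalize_title() from the diff above are defined in the same module.

# Usage sketch (assumption: check_content() and normalize_title()
# from the diff above are in scope; URL and timeouts are placeholders).
import requests

r = requests.get('https://www.example.com/', timeout=(5, 10))
if r.status_code < 300:
    content = check_content(r)
    print(content['title'])           # normalized <title> text, or None
    print(content['canonical_link'])  # href of <link rel="canonical">, or None
    print(content['feeds'])           # RSS/Atom feed URLs, possibly empty
    print(content['generator'])       # <meta name="generator"> content, or None
    print(content['opengraph'])       # list of og:* property names, or None

Note that for OpenGraph the helper records which og:* properties are present rather than their values, so the result indicates whether a site exposes OpenGraph metadata at all, not what it contains.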