green-spider/checks/page_content.py

95 lines
2.8 KiB
Python

"""
This check downloads the HTML page for each URL
"""
import logging
import requests
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
    """
    Check that downloads the page behind every URL in the configuration.

    For each URL a result dict is recorded (see ``download_page``). URLs whose
    download raised an exception are removed from the configuration via
    ``config.remove_url`` so that later checks do not use them.
    """

    # connection timeout (seconds)
    CONNECT_TIMEOUT = 10

    # response timeout (seconds)
    READ_TIMEOUT = 20

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        """
        Download all configured URLs.

        Returns a dict mapping each requested URL to the result dict
        produced by ``download_page``.
        """
        results = {}

        self.headers = {
            "User-Agent": self.config.user_agent,
        }

        # Iterate over a snapshot: self.config.remove_url() below mutates
        # self.config.urls, and iterating the live collection while removing
        # from it would skip entries.
        urls = list(self.config.urls)

        for url in urls:
            result = self.download_page(url)
            results[url] = result

            # remove bad URLs from config, to avoid later checks using them
            if result.get('exception') is not None:
                self.config.remove_url(url)

        return results

    def download_page(self, url):
        """
        Fetch a single URL with a GET request and describe the outcome.

        Returns a dict with the keys 'url', 'content', 'content_type',
        'content_length', 'status_code', 'response_headers', 'duration'
        (milliseconds) and 'exception'. On success 'exception' stays None;
        on failure it holds a short label ("connection", "read_timeout",
        "connection_timeout") or, for any other error, the exception type
        and message. Fields that were not reached before the failure keep
        their None default. This method does not raise; errors are logged.
        """
        result = {
            'url': url,
            'content': None,
            'content_type': None,
            'content_length': None,
            'status_code': None,
            'response_headers': None,
            'duration': None,
            'exception': None,
        }

        try:
            r = requests.get(url,
                             headers=self.headers,
                             timeout=(self.CONNECT_TIMEOUT, self.READ_TIMEOUT))

            # Read the decoded body once; every access to r.text decodes again.
            text = r.text

            # URL as reported by the response, which may differ from the
            # requested one.
            result['url'] = r.url
            result['status_code'] = r.status_code
            result['content'] = text
            # length of the decoded text (characters, not bytes)
            result['content_length'] = len(text)
            result['response_headers'] = self.get_headers(r.headers)
            result['duration'] = round(r.elapsed.total_seconds() * 1000)

            content_type = r.headers.get("content-type")
            if content_type is not None:
                # keep only the media type, drop parameters such as charset
                result['content_type'] = content_type.split(";")[0].strip()

        except requests.exceptions.ConnectTimeout as exc:
            # ConnectTimeout derives from both ConnectionError and Timeout,
            # so it has to be handled before ConnectionError to be labelled
            # as a timeout.
            logging.error("%s %s", exc, url)
            result['exception'] = "connection_timeout"
        except requests.exceptions.ConnectionError as exc:
            logging.error("%s %s", exc, url)
            result['exception'] = "connection"
        except requests.exceptions.ReadTimeout as exc:
            logging.error("%s %s", exc, url)
            result['exception'] = "read_timeout"
        except requests.exceptions.Timeout as exc:
            logging.error("%s %s", exc, url)
            result['exception'] = "connection_timeout"
        except Exception as exc:
            # Deliberate catch-all: one failing URL must not abort the check.
            logging.error("%s %s", exc, url)
            result['exception'] = "%s %s" % (str(type(exc)), exc)

        return result

    def get_headers(self, headers):
        """
        Transforms CaseInsensitiveDict into dict with lowercase keys
        """
        return {key.lower(): value for key, value in headers.items()}