95 lines
2.8 KiB
Python
95 lines
2.8 KiB
Python
"""
|
|
This check downloads the HTML page for each URL
|
|
"""
|
|
|
|
import logging
|
|
|
|
import requests
|
|
|
|
from checks.abstract_checker import AbstractChecker
|
|
|
|
|
|
class Checker(AbstractChecker):
    """
    Downloads the page behind every configured URL and records the outcome.

    For each URL in ``self.config.urls`` a GET request is issued. The result
    (content, status, headers, timing or an error label) is collected in a
    dict keyed by the originally requested URL. URLs that could not be
    downloaded are removed from the config via ``self.config.remove_url``.
    """

    # connection timeout (seconds)
    CONNECT_TIMEOUT = 10

    # response timeout (seconds)
    READ_TIMEOUT = 20

    def __init__(self, config, previous_results=None):
        """Pass ``config`` and ``previous_results`` on to the base checker."""
        super().__init__(config, previous_results)

    def run(self):
        """
        Download all configured URLs.

        Sets ``self.headers`` (the request headers used by
        ``download_page``) from ``self.config.user_agent``.

        Returns:
            dict mapping each requested URL to the result dict produced by
            ``download_page`` for it.
        """
        results = {}

        self.headers = {
            "User-Agent": self.config.user_agent,
        }

        # Iterate over a copy: remove_url() below may modify
        # self.config.urls, and mutating a collection while iterating it
        # would silently skip the entry following each removed URL.
        urls = list(self.config.urls)

        for url in urls:
            result = self.download_page(url)
            results[url] = result

            # remove bad URLs from config, to avoid later checks using them
            if result.get('exception') is not None:
                self.config.remove_url(url)

        return results

    def download_page(self, url):
        """
        Fetch a single URL with a GET request and describe the outcome.

        Relies on ``self.headers`` having been set (``run`` does this).

        Args:
            url: the URL to request.

        Returns:
            dict with the keys:
              - 'url': final URL reported by the response (the requested
                URL if the request failed)
              - 'content': decoded response body text
              - 'content_type': Content-Type header value without
                parameters (text before the first ';'), or None
              - 'content_length': length of the decoded text
                (characters, not bytes)
              - 'status_code': HTTP status code
              - 'response_headers': response headers with lowercase keys
              - 'duration': ``response.elapsed`` in milliseconds, rounded
              - 'exception': None on success, otherwise a short error label
            All keys except 'url' stay None when the request fails.
        """
        result = {
            'url': url,
            'content': None,
            'content_type': None,
            'content_length': None,
            'status_code': None,
            'response_headers': None,
            'duration': None,
            'exception': None,
        }

        try:
            r = requests.get(url,
                             headers=self.headers,
                             timeout=(self.CONNECT_TIMEOUT, self.READ_TIMEOUT))

            result['url'] = r.url
            result['status_code'] = r.status_code
            result['content'] = r.text
            result['content_length'] = len(r.text)
            result['response_headers'] = self.get_headers(r.headers)
            result['duration'] = round(r.elapsed.total_seconds() * 1000)

            content_type = r.headers.get("content-type")
            if content_type is not None:
                # keep only the media type, drop parameters such as charset
                result['content_type'] = content_type.split(";")[0].strip()

        # NOTE(review): requests' ConnectTimeout derives from both
        # ConnectionError and Timeout, so with this handler order connect
        # timeouts are reported as "connection". Kept as-is so the labels
        # seen by later consumers do not change - confirm this is intended.
        except requests.exceptions.ConnectionError as exc:
            logging.error("%s %s", exc, url)
            result['exception'] = "connection"
        except requests.exceptions.ReadTimeout as exc:
            logging.error("%s %s", exc, url)
            result['exception'] = "read_timeout"
        except requests.exceptions.Timeout as exc:
            logging.error("%s %s", exc, url)
            result['exception'] = "connection_timeout"
        except Exception as exc:
            # Deliberate catch-all: one failing URL must not abort the
            # whole run; the error is logged and recorded in the result.
            logging.error("%s %s", exc, url)
            result['exception'] = "%s %s" % (str(type(exc)), exc)

        return result

    def get_headers(self, headers):
        """
        Transforms CaseInsensitiveDict into dict with lowercase keys
        """
        return {key.lower(): headers[key] for key in headers}