green-spider/checks/load_feeds.py


"""
Loads feeds linked from pages and collects information on the contained content
"""
import logging
from time import mktime
from datetime import datetime
import feedparser
from checks.abstract_checker import AbstractChecker
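

# The check runs in two phases: collect_feeds() first gathers the feed URLs
# referenced in the 'html_head' results for all of the configured URLs,
# storing each feed URL only once; analyse_feed() then fetches and
# evaluates every collected feed.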
class Checker(AbstractChecker):

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)
        self.feeds = {}

    def depends_on_results(self):
        return ['html_head']

    def run(self):
        assert 'html_head' in self.previous_results

        for url in self.config.urls:
            self.collect_feeds(url)

        for feed_url in self.feeds:
            self.feeds[feed_url] = self.analyse_feed(feed_url)

        return self.feeds
    def collect_feeds(self, url):
        """
        Collects the feed URLs referenced by the given page URL into
        self.feeds. The assumption is that in most cases the various page
        URLs will reference the same feeds, so every feed is recorded
        only once.
        """
        head = self.previous_results['html_head'][url]

        if 'link_rss_atom' not in head:
            return
        if not isinstance(head['link_rss_atom'], list):
            return

        for feed_url in head['link_rss_atom']:
            if feed_url not in self.feeds:
                self.feeds[feed_url] = {}
    def analyse_feed(self, feed_url):
        result = {
            'exception': None,
            'title': None,
            'latest_entry': None,
            'first_entry': None,
            'average_interval': None,
            'num_entries': None,
        }

        logging.debug("Loading feed %s", feed_url)
        data = feedparser.parse(feed_url)
        if 'bozo_exception' in data:
            result['exception'] = str(data['bozo_exception'])

        if 'headers' not in data:
            # No HTTP headers at all means the feed could not be fetched.
            return result

        # feedparser reports the HTTP status code as an integer in
        # data['status'], not as a response header.
        if data.get('status') not in (200, 301, 302):
            result['exception'] = 'Server responded with status %s' % data.get('status')
        if 'feed' in data:
            result['title'] = data['feed'].get('title')

        if 'entries' in data:
            result['num_entries'] = len(data['entries'])
            result['latest_entry'] = self.find_latest_entry(data['entries'])
            result['first_entry'] = self.find_first_entry(data['entries'])

            # Mean number of seconds between two consecutive entries, e.g.
            # 11 entries spanning 10 days give an average interval of 86400.
            if (result['num_entries'] > 1 and
                    result['first_entry'] is not None and
                    result['latest_entry'] is not None and
                    result['first_entry'] < result['latest_entry']):
                result['average_interval'] = round(
                    (result['latest_entry'] - result['first_entry']).total_seconds()
                    / (result['num_entries'] - 1))

        return result
    def find_latest_entry(self, entries):
        max_date = None

        for entry in entries:
            published_parsed = entry.get('published_parsed')
            if published_parsed is None:
                # Skip entries without a parsed publication date.
                continue
            timestamp = mktime(published_parsed)
            if max_date is None or timestamp > max_date:
                max_date = timestamp

        if max_date is not None:
            return datetime.fromtimestamp(max_date)
    def find_first_entry(self, entries):
        min_date = None

        for entry in entries:
            published_parsed = entry.get('published_parsed')
            if published_parsed is None:
                # Skip entries without a parsed publication date.
                continue
            timestamp = mktime(published_parsed)
            if min_date is None or timestamp < min_date:
                min_date = timestamp

        if min_date is not None:
            return datetime.fromtimestamp(min_date)
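

# A minimal usage sketch, assuming AbstractChecker stores its arguments as
# self.config and self.previous_results (the methods above rely on both).
# DummyConfig, the page URL and the feed URL are hypothetical stand-ins;
# in the real pipeline the crawler supplies the config and the results of
# the 'html_head' check.
if __name__ == '__main__':
    class DummyConfig:
        """Hypothetical stand-in exposing only the 'urls' attribute."""
        def __init__(self, urls):
            self.urls = urls

    url = 'https://example.com/'
    previous_results = {
        'html_head': {
            url: {'link_rss_atom': ['https://example.com/feed.xml']},
        },
    }

    checker = Checker(DummyConfig([url]), previous_results)
    print(checker.run())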