green-spider/checks/load_feeds.py

"""
Loads feeds linked from pages and collects information on the contained content
"""

import logging
from time import mktime
from datetime import datetime

import feedparser

from checks.abstract_checker import AbstractChecker

class Checker(AbstractChecker):
    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)
        self.feeds = {}

    def depends_on_results(self):
        return ['html_head']

    def run(self):
        assert 'html_head' in self.previous_results

        for url in self.config.urls:
            self.collect_feeds(url)

        for feed_url in self.feeds:
            self.feeds[feed_url] = self.analyse_feed(feed_url)

        return self.feeds
    
    def collect_feeds(self, url):
        """
        This collects the feeds from all urls.
        The assumption is that in most cases the urls will reference the same
        feeds.
        """
        head = self.previous_results['html_head'][url]
        if 'link_rss_atom' not in head:
            return
        if not isinstance(head['link_rss_atom'], list):
            return
        
        for feed_url in head['link_rss_atom']:
            if feed_url not in self.feeds:
                self.feeds[feed_url] = {}

        result = {
            'feeds': [],
            'exception': None,
        }

        return result
    

    def analyse_feed(self, feed_url):
        result = {
            'exception': None,
            'title': None,
            'latest_entry': None,
            'first_entry': None,
            'average_interval': None,
            'num_entries': None,
        }

        logging.debug("Loading feed %s" % feed_url)
        data = feedparser.parse(feed_url)

        if 'bozo_exception' in data:
            result['exception'] = str(data['bozo_exception'])

        if 'headers' not in data:
            return result

        if data['headers'].get('status') not in ('200', '301', '302'):
            result['exception'] = 'Server responded with status %s' % data['headers'].get('status')
        
        if 'feed' in data:
            result['title'] = data['feed'].get('title')
        if 'entries' in data:
            result['num_entries'] = len(data['entries'])
            result['latest_entry'] = self.find_latest_entry(data['entries'])
            result['first_entry'] = self.find_first_entry(data['entries'])
            if (result['num_entries'] > 1 and 
                result['first_entry'] is not None and
                result['latest_entry'] is not None and
                result['first_entry'] < result['latest_entry']):
                result['average_interval'] = round((result['latest_entry'] - result['first_entry']).total_seconds() / (result['num_entries'] - 1))
        
        return result


    def find_latest_entry(self, entries):
        max_date = None

        for entry in entries:
            published_parsed = entry.get('published_parsed')
            if published_parsed is None:
                return
            timestamp = mktime(published_parsed)
            if max_date is None or timestamp > max_date:
                max_date = timestamp
        
        if max_date is not None:
            return datetime.fromtimestamp(max_date)


    def find_first_entry(self, entries):
        min_date = None

        for entry in entries:
            published_parsed = entry.get('published_parsed')
            if published_parsed is None:
                return
            timestamp = mktime(published_parsed)
            if min_date is None or timestamp < min_date:
                min_date = timestamp
        
        if min_date is not None:
            return datetime.fromtimestamp(min_date)
Load feeds and gather info (#103) 2018-12-07 16:32:42 +01:00			`"""`
			`Loads feeds linked from pages and collects information on the contained content`
			`"""`

			`import logging`
			`from time import mktime`
			`from datetime import datetime`

			`import feedparser`

			`from checks.abstract_checker import AbstractChecker`

			`class Checker(AbstractChecker):`
			`def __init__(self, config, previous_results=None):`
			`super().__init__(config, previous_results)`
			`self.feeds = {}`

			`def depends_on_results(self):`
			`return ['html_head']`

			`def run(self):`
			`assert 'html_head' in self.previous_results`

			`for url in self.config.urls:`
			`self.collect_feeds(url)`

			`for feed_url in self.feeds:`
			`self.feeds[feed_url] = self.analyse_feed(feed_url)`

			`return self.feeds`

			`def collect_feeds(self, url):`
			`"""`
			`This collects the feeds from all urls.`
			`The assumption is that in most cases the urls will reference the same`
			`feeds.`
			`"""`
			`head = self.previous_results['html_head'][url]`
Several fixes for edge cases 2018-12-17 23:54:09 +01:00			`if 'link_rss_atom' not in head:`
			`return`
			`if not isinstance(head['link_rss_atom'], list):`
			`return`
Load feeds and gather info (#103) 2018-12-07 16:32:42 +01:00
			`for feed_url in head['link_rss_atom']:`
			`if feed_url not in self.feeds:`
			`self.feeds[feed_url] = {}`

			`result = {`
			`'feeds': [],`
			`'exception': None,`
			`}`

			`return result`


			`def analyse_feed(self, feed_url):`
			`result = {`
			`'exception': None,`
			`'title': None,`
			`'latest_entry': None,`
			`'first_entry': None,`
			`'average_interval': None,`
			`'num_entries': None,`
			`}`

			`logging.debug("Loading feed %s" % feed_url)`
			`data = feedparser.parse(feed_url)`

			`if 'bozo_exception' in data:`
Problembehebung mit Exception-Daten im Ergbebnis, die nicht geschrieben werden können, und Spidern einzelner Sites (#132) * WIP commit for single job execution * Convert exception to string * Pass more arguments * Move python modules lsit into requirements.txt * Document single site spidering * Remove debugging 2019-11-22 23:13:57 +01:00			`result['exception'] = str(data['bozo_exception'])`
Load feeds and gather info (#103) 2018-12-07 16:32:42 +01:00
Several fixes for edge cases 2018-12-17 23:54:09 +01:00			`if 'headers' not in data:`
			`return result`

Load feeds and gather info (#103) 2018-12-07 16:32:42 +01:00			`if data['headers'].get('status') not in ('200', '301', '302'):`
			`result['exception'] = 'Server responded with status %s' % data['headers'].get('status')`

			`if 'feed' in data:`
			`result['title'] = data['feed'].get('title')`
			`if 'entries' in data:`
			`result['num_entries'] = len(data['entries'])`
			`result['latest_entry'] = self.find_latest_entry(data['entries'])`
			`result['first_entry'] = self.find_first_entry(data['entries'])`
Several fixes for edge cases 2018-12-17 23:54:09 +01:00			`if (result['num_entries'] > 1 and`
			`result['first_entry'] is not None and`
			`result['latest_entry'] is not None and`
			`result['first_entry'] < result['latest_entry']):`
Load feeds and gather info (#103) 2018-12-07 16:32:42 +01:00			`result['average_interval'] = round((result['latest_entry'] - result['first_entry']).total_seconds() / (result['num_entries'] - 1))`

			`return result`


			`def find_latest_entry(self, entries):`
			`max_date = None`

			`for entry in entries:`
Several fixes for edge cases 2018-12-17 23:54:09 +01:00			`published_parsed = entry.get('published_parsed')`
			`if published_parsed is None:`
			`return`
			`timestamp = mktime(published_parsed)`
Load feeds and gather info (#103) 2018-12-07 16:32:42 +01:00			`if max_date is None or timestamp > max_date:`
			`max_date = timestamp`

Fixing several bugs in spider code 2018-12-17 17:31:09 +01:00			`if max_date is not None:`
			`return datetime.fromtimestamp(max_date)`
Load feeds and gather info (#103) 2018-12-07 16:32:42 +01:00

			`def find_first_entry(self, entries):`
			`min_date = None`

			`for entry in entries:`
Several fixes for edge cases 2018-12-17 23:54:09 +01:00			`published_parsed = entry.get('published_parsed')`
			`if published_parsed is None:`
			`return`
			`timestamp = mktime(published_parsed)`
Load feeds and gather info (#103) 2018-12-07 16:32:42 +01:00			`if min_date is None or timestamp < min_date:`
			`min_date = timestamp`

Fixing several bugs in spider code 2018-12-17 17:31:09 +01:00			`if min_date is not None:`
			`return datetime.fromtimestamp(min_date)`