Fixing several bugs in spider code

This commit is contained in:
Marian Steinbach 2018-12-17 17:31:09 +01:00
parent 3b9ead330d
commit 3b8328d804
3 changed files with 53 additions and 3 deletions

View File

@ -90,7 +90,8 @@ class Checker(AbstractChecker):
if max_date is None or timestamp > max_date:
max_date = timestamp
return datetime.fromtimestamp(max_date)
if max_date is not None:
return datetime.fromtimestamp(max_date)
def find_first_entry(self, entries):
@ -101,4 +102,5 @@ class Checker(AbstractChecker):
if min_date is None or timestamp < min_date:
min_date = timestamp
return datetime.fromtimestamp(min_date)
if min_date is not None:
return datetime.fromtimestamp(min_date)

View File

@ -70,5 +70,53 @@ class TestFeed(unittest.TestCase):
})
def test_empty_feed_rss2(self):
    """
    Checks that an RSS 2.0 feed without any items is handled gracefully.

    The mocked feed has a channel title but no <item> elements and an
    empty <pubDate>. The checker result is expected to contain no
    exception, zero entries, and None for latest_entry, first_entry
    and average_interval.
    """
    # NOTE(review): httpretty only intercepts requests while it is enabled
    # (e.g. via @httpretty.activate) -- confirm that is set up outside
    # this method, otherwise a real request to example.com is attempted.

    # The XML lines are kept flush-left so the literal stays unchanged.
    feed = """<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Empty Feed</title>
<link>http://example.com/</link>
<pubDate></pubDate>
</channel>
</rss>
"""
    feed_url = 'http://example.com/feed.xml'
    httpretty.register_uri(httpretty.GET, feed_url,
                           body=feed,
                           adding_headers={
                               "Content-type": "application/rss+xml",
                           })

    # mocking a previous result from some page
    results = {
        'html_head': {
            'http://example.com/': {
                'link_rss_atom': ['http://example.com/feed.xml']
            }
        }
    }
    config = Config(urls=['http://example.com/'])
    checker = load_feeds.Checker(config=config, previous_results=results)
    result = checker.run()

    # An empty feed must yield a complete result record, not an error.
    self.assertEqual(result, {
        'http://example.com/feed.xml': {
            'exception': None,
            'title': 'Empty Feed',
            'latest_entry': None,
            'first_entry': None,
            'average_interval': None,
            'num_entries': 0,
        }
    })
if __name__ == '__main__':
    # Run the tests in this module when the file is executed directly.
    unittest.main()

View File

@ -78,7 +78,7 @@ def work_of_queue(datastore_client, entity_kind):
logging.info("Starting job %s", job["url"])
result = check_and_rate_site(entry=job)
logging.debug("Full JSON representation of returned result: %s", json.dumps(result))
logging.debug("Full JSON representation of returned result: %s", json.dumps(result, default=str))
logging.info("Job %s finished checks", job["url"])
logging.info("Job %s writing to DB", job["url"])