# green-spider/checks/duplicate_content.py
#
# 108 lines
# 3.1 KiB
# Python

"""
This checker looks at the similarity between previously downloaded pages
and removes duplicates from the config URLs
"""
import itertools
import logging

import html_similarity

from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
    """Detect duplicate pages and drop the redundant URLs from the config.

    Page bodies are read from the ``page_content`` entry of
    ``previous_results`` and compared pairwise with
    ``html_similarity.similarity``. For every pair whose similarity exceeds
    ``similarity_threshold``, one of the two URLs is removed through
    ``self.config.remove_url``.
    """

    # value above which we consider a page pair a duplicate
    similarity_threshold = 0.99999

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        """Compare all configured URLs pairwise and remove duplicates.

        Returns:
            The dict produced by ``compare_pairwise``, or ``None`` when the
            config holds exactly one URL (there is nothing to compare).

        Raises:
            AssertionError: if ``previous_results`` has no ``page_content``
                entry.
        """
        if len(self.config.urls) == 1:
            # nothing to do for us
            return None

        # Snapshot the URLs: the config is mutated further down.
        urls = list(self.config.urls)

        # Explicit raise instead of ``assert`` so the check is not stripped
        # when Python runs with -O. The exception type is unchanged.
        if 'page_content' not in self.previous_results:
            raise AssertionError(
                "'page_content' is missing from previous_results")

        # get content
        content = {}
        for url in urls:
            body = self.previous_results['page_content'][url]['content']
            if body is None:
                logging.warning("Content for URL %s is None", url)
            content[url] = body

        pairs = self.compare_pairwise(content)

        # remove duplicates
        rejected = set()
        for pair_key, result in pairs.items():
            similarity = result['similarity']
            if similarity is None:
                # the comparison failed for this pair, nothing to decide
                continue

            if similarity > self.similarity_threshold:
                # this pair is a duplicate.
                # Decide which one to keep
                url1, url2 = pair_key.split(" ", 1)
                reject = self.select_url_to_reject(url1, url2)

                # A URL can lose in several pairs; remove it only once.
                if reject not in rejected:
                    rejected.add(reject)
                    self.config.remove_url(reject)

        return pairs

    def compare_pairwise(self, content):
        """Compute the similarity of every unordered pair of pages.

        Args:
            content: dict mapping URL to the page content handed to
                ``html_similarity.similarity``.

        Returns:
            A dict keyed by the two URLs of a pair, sorted and joined with a
            single space. Each value is a dict with the keys ``similarity``
            (the computed value, or ``None`` if the comparison failed) and
            ``exception`` (``None``, or the error message as a string).
        """
        pairs = {}

        # combinations() yields every unordered pair exactly once, so no
        # pair is checked twice.
        for url1, url2 in itertools.combinations(content, 2):
            pair_key = " ".join(sorted([url1, url2]))

            try:
                s = html_similarity.similarity(content[url1], content[url2])
            except (AttributeError, ValueError) as e:
                # Only these two exception types are treated as a failed
                # comparison; anything else propagates to the caller.
                logging.error("html_similarity.similarity thre exception for URL pair %s and %s: %s", url1, url2, e)
                pairs[pair_key] = {
                    'similarity': None,
                    'exception': str(e),
                }
            else:
                logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s)
                pairs[pair_key] = {
                    'similarity': s,
                    'exception': None,
                }

        return pairs

    def select_url_to_reject(self, url1, url2):
        """Determine which of two URLs to keep, which to reject.

        Rules, in order of precedence:

        1. An ``https://`` URL is kept over a non-HTTPS one.
        2. The shorter URL is kept.
        3. Otherwise ``url1`` is rejected.

        Returns:
            The URL that should be rejected.
        """
        url1_is_https = url1.startswith('https://')
        url2_is_https = url2.startswith('https://')

        # HTTPS takes precedence
        if url1_is_https and not url2_is_https:
            return url2
        if url2_is_https and not url1_is_https:
            return url1

        # Shorter URL wins
        if len(url1) < len(url2):
            return url2

        # url1 is longer, or both have equal length (default behaviour)
        return url1