# green-spider/checks/duplicate_content.py
"""
This checker looks at the similarity between previously downloaded pages
and removes duplicates from the config URLs
"""
import logging
import html_similarity
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
    """Compares previously downloaded pages pairwise and removes
    near-duplicate URLs from the config, keeping the preferable one."""

    # Similarity score above which a page pair is considered a duplicate.
    similarity_threshold = 0.99999

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        """Compare all URL pairs and drop duplicates from the config.

        Returns a dict mapping "url1 url2" (URLs sorted, space-joined) to
        {'similarity': float or None, 'exception': str or None}, or None
        when there are fewer than two URLs to compare.
        """
        # Fewer than two URLs: nothing to compare.
        # (Original checked == 1 and would do useless work for zero URLs.)
        if len(self.config.urls) < 2:
            return

        urls = list(self.config.urls)

        # Collect previously downloaded page content for every URL.
        content = {}
        assert 'page_content' in self.previous_results
        for url in urls:
            page_content = self.previous_results['page_content'][url]
            if page_content['content'] is None:
                # logging.warn is deprecated; logging.warning is the API.
                logging.warning("Content for URL %s is None", url)
            content[url] = page_content['content']

        pairs = self.compare_pairwise(content)

        # Remove one URL of every pair that exceeds the threshold.
        for key in pairs:
            if pairs[key]['similarity'] is None:
                continue
            if pairs[key]['similarity'] > self.similarity_threshold:
                # This pair is a duplicate. Decide which one to keep.
                url1, url2 = key.split(" ", 1)
                reject = self.select_url_to_reject(url1, url2)
                self.config.remove_url(reject)

        return pairs

    def compare_pairwise(self, content):
        """Compute similarity for all distinct URL pairs in `content`.

        Returns a dict keyed by the sorted, space-joined URL pair; each
        value holds the similarity score or, on failure, the exception text.
        """
        # Compare pairwise.
        pairs = {}
        for url1 in content:
            for url2 in content:
                if url1 == url2:
                    continue
                # Canonical key avoids checking each pair twice.
                pair_key = " ".join(sorted([url1, url2]))
                if pair_key in pairs:
                    continue
                try:
                    s = html_similarity.similarity(content[url1], content[url2])
                    logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s)
                    pairs[pair_key] = {
                        'similarity': s,
                        'exception': None,
                    }
                except (AttributeError, ValueError, TypeError) as e:
                    # TypeError added: run() tolerates None content, which makes
                    # html_similarity raise TypeError rather than the two caught here.
                    logging.error("html_similarity.similarity threw exception for URL pair %s and %s: %s", url1, url2, e)
                    pairs[pair_key] = {
                        'similarity': None,
                        'exception': str(e),
                    }
        return pairs

    def select_url_to_reject(self, url1, url2):
        """Determine which of two URLs to keep, which to reject."""
        # HTTPS takes precedence
        if url1.startswith('https://') and not url2.startswith('https://'):
            return url2
        elif url2.startswith('https://') and not url1.startswith('https://'):
            return url1
        # Shorter URL wins
        if len(url1) < len(url2):
            return url2
        elif len(url1) > len(url2):
            return url1
        # Same scheme and length: arbitrarily reject the first.
        return url1