108 lines
3.1 KiB
Python
108 lines
3.1 KiB
Python
"""
|
|
This checker looks at the similarity between previously downloaded pages
and removes duplicates from the config URLs.
|
|
"""
|
|
|
|
import logging
|
|
|
|
import html_similarity
|
|
|
|
from checks.abstract_checker import AbstractChecker
|
|
|
|
|
|
class Checker(AbstractChecker):
    """Detect duplicate pages among the configured URLs and drop the duplicates.

    Page contents are read from the ``page_content`` entry of the previous
    results and compared pairwise with ``html_similarity.similarity``. For
    every pair scoring above ``similarity_threshold`` one of the two URLs is
    removed from the config.
    """

    # value above which we consider a page pair a duplicate
    similarity_threshold = 0.99999

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def run(self):
        """Compare all configured pages and remove duplicate URLs from the config.

        Returns:
            The dict produced by ``compare_pairwise`` (one entry per URL
            pair), or ``None`` when exactly one URL is configured.

        Raises:
            AssertionError: if ``previous_results`` has no ``page_content``
                entry (only while assertions are enabled).
        """
        if len(self.config.urls) == 1:
            # nothing to do for us
            return

        # Work on a snapshot; self.config is modified further down.
        urls = list(self.config.urls)

        assert 'page_content' in self.previous_results

        # get content
        content = {}
        for url in urls:
            page_content = self.previous_results['page_content'][url]

            if page_content['content'] is None:
                logging.warning("Content for URL %s is None", url)

            # A None content is still handed on to the comparison; what
            # happens to it there depends on html_similarity.
            content[url] = page_content['content']

        pairs = self.compare_pairwise(content)

        # remove duplicates
        rejected = set()
        for key, result in pairs.items():
            similarity = result['similarity']
            if similarity is None:
                # the comparison failed for this pair
                continue

            if similarity > self.similarity_threshold:
                # this pair is a duplicate.
                # Decide which one to keep
                url1, url2 = key.split(" ", 1)
                reject = self.select_url_to_reject(url1, url2)

                # The same URL can lose in several pairs. Remove it only
                # once, so we do not rely on remove_url tolerating a URL
                # that is already gone.
                if reject in rejected:
                    continue
                rejected.add(reject)
                self.config.remove_url(reject)

        return pairs

    def compare_pairwise(self, content):
        """Compute the similarity of every unordered pair of pages once.

        Args:
            content: mapping of URL to the page content that is passed to
                ``html_similarity.similarity``.

        Returns:
            A dict keyed by the two URLs of a pair, sorted and joined with a
            single space. Each value is a dict with the keys ``similarity``
            (the score, or ``None`` if the comparison raised) and
            ``exception`` (``None``, or the exception text).
        """
        pairs = {}
        urls = list(content)

        for index, url1 in enumerate(urls):
            # Pair url1 only with the URLs after it: every unordered pair is
            # visited exactly once and a URL is never compared with itself.
            for url2 in urls[index + 1:]:
                pair_key = " ".join(sorted([url1, url2]))

                try:
                    s = html_similarity.similarity(content[url1], content[url2])
                except (AttributeError, ValueError) as e:
                    logging.error("html_similarity.similarity threw exception for URL pair %s and %s: %s", url1, url2, e)
                    pairs[pair_key] = {
                        'similarity': None,
                        'exception': str(e),
                    }
                else:
                    logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s)
                    pairs[pair_key] = {
                        'similarity': s,
                        'exception': None,
                    }

        return pairs

    def select_url_to_reject(self, url1, url2):
        """Determine which of two duplicate URLs to reject.

        Args:
            url1: first URL of the pair.
            url2: second URL of the pair.

        Returns:
            The URL that should be removed; the other one is kept.
        """
        # HTTPS takes precedence
        if url1.startswith('https://') and not url2.startswith('https://'):
            return url2
        elif url2.startswith('https://') and not url1.startswith('https://'):
            return url1

        # Shorter URL wins
        if len(url1) < len(url2):
            return url2
        elif len(url1) > len(url2):
            return url1

        # default behaviour: same scheme class and same length, reject url1
        return url1
|