mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-04 10:03:40 +02:00
108 lines
3.1 KiB
Python
108 lines
3.1 KiB
Python
"""
This checker looks at the similarity between previously downloaded pages
and removes duplicates from the config URLs.
"""
|
||
|
|
||
|
import itertools
import logging

import html_similarity

from checks.abstract_checker import AbstractChecker
|
||
|
|
||
|
|
||
|
class Checker(AbstractChecker):
    """Compare previously downloaded pages and drop duplicate URLs.

    Every pair of configured URLs is compared with
    ``html_similarity.similarity``. When the score of a pair exceeds
    ``similarity_threshold``, one URL of the pair is removed from the
    config (see ``select_url_to_reject`` for the tie-breaking rules).

    NOTE(review): ``self.config`` and ``self.previous_results`` are
    presumably set by ``AbstractChecker.__init__`` -- confirm against the
    base class.
    """

    # value above which we consider a page pair a duplicate
    similarity_threshold = 0.99999

    def __init__(self, config, previous_results=None):
        """Forward ``config`` and ``previous_results`` to the base class."""
        super().__init__(config, previous_results)

    def run(self):
        """Compare all configured pages pairwise and remove duplicates.

        Returns:
            ``None`` when exactly one URL is configured (nothing to
            compare). Otherwise the dict produced by ``compare_pairwise``,
            keyed by ``"<url_a> <url_b>"`` (the two URLs in sorted order).

        Raises:
            AssertionError: if ``previous_results`` has no
                ``'page_content'`` entry.
        """
        if len(self.config.urls) == 1:
            # nothing to do for us
            return

        # Snapshot the URLs: the config is modified further down.
        urls = list(self.config.urls)

        assert 'page_content' in self.previous_results

        # Collect the page body for every URL.
        content = {}
        for url in urls:
            page_content = self.previous_results['page_content'][url]

            if page_content['content'] is None:
                # logging.warn is a deprecated alias; use warning() with
                # lazy %-style arguments instead.
                logging.warning("Content for URL %s is None", url)

            # A None body is still recorded; the resulting comparison
            # failure is captured per pair in compare_pairwise.
            content[url] = page_content['content']

        pairs = self.compare_pairwise(content)

        # remove duplicates
        for key, result in pairs.items():
            similarity = result['similarity']
            if similarity is None:
                # The comparison failed for this pair; nothing to decide.
                continue
            if similarity > self.similarity_threshold:
                # This pair is a duplicate. Decide which one to keep.
                url1, url2 = key.split(" ", 1)
                reject = self.select_url_to_reject(url1, url2)
                self.config.remove_url(reject)

        return pairs

    def compare_pairwise(self, content):
        """Compute the similarity of every unordered pair of pages.

        Args:
            content: mapping of URL to page content.

        Returns:
            A dict keyed by ``"<url_a> <url_b>"`` (the two URLs sorted and
            joined by a single space). Each value is a dict with the keys
            ``'similarity'`` (the score, or ``None`` if the comparison
            raised) and ``'exception'`` (``None``, or the error message).
        """
        pairs = {}

        # combinations() yields each unordered pair exactly once, in the
        # same order in which the pairs would first be met by a nested
        # loop over the mapping.
        for url1, url2 in itertools.combinations(content, 2):
            pair_key = " ".join(sorted([url1, url2]))

            try:
                s = html_similarity.similarity(content[url1], content[url2])
            except (AttributeError, ValueError) as e:
                logging.error(
                    "html_similarity.similarity threw an exception for URL pair %s and %s: %s",
                    url1, url2, e)
                pairs[pair_key] = {
                    'similarity': None,
                    'exception': str(e),
                }
                continue

            logging.debug(
                "Comparing pages for URLs %s and %s: similarity=%s",
                url1, url2, s)
            pairs[pair_key] = {
                'similarity': s,
                'exception': None,
            }

        return pairs

    def select_url_to_reject(self, url1, url2):
        """Return the URL of a duplicate pair that should be dropped.

        Rules, in order of precedence:

        1. If only one URL starts with ``https://``, the other is rejected.
        2. Otherwise the longer URL is rejected (the shorter one wins).
        3. If both have the same length, ``url1`` is rejected.
        """
        secure1 = url1.startswith('https://')
        secure2 = url2.startswith('https://')

        # HTTPS takes precedence
        if secure1 and not secure2:
            return url2
        if secure2 and not secure1:
            return url1

        # Shorter URL wins
        if len(url1) < len(url2):
            return url2

        # url1 is longer, or both are equally long (default behaviour)
        return url1
|