mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-01 16:44:51 +02:00
Fix in duplicate_content checker
This commit is contained in:
parent
1c0c7539e2
commit
3e42e342a1
|
@ -44,6 +44,8 @@ class Checker(AbstractChecker):
|
|||
|
||||
# remove duplicates
|
||||
for key in pairs:
|
||||
if pairs[key]['similarity'] is None:
|
||||
continue
|
||||
if pairs[key]['similarity'] > self.similarity_threshold:
|
||||
# this pair is a duplicate.
|
||||
# Decide which one to keep
|
||||
|
@ -69,12 +71,19 @@ class Checker(AbstractChecker):
|
|||
if pair_key in pairs:
|
||||
continue
|
||||
|
||||
s = html_similarity.similarity(content[url1], content[url2])
|
||||
logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s)
|
||||
|
||||
pairs[pair_key] = {
|
||||
'similarity': s,
|
||||
}
|
||||
try:
|
||||
s = html_similarity.similarity(content[url1], content[url2])
|
||||
logging.debug("Comparing pages for URLs %s and %s: similarity=%s", url1, url2, s)
|
||||
pairs[pair_key] = {
|
||||
'similarity': s,
|
||||
'exception': None,
|
||||
}
|
||||
except (AttributeError, ValueError) as e:
|
||||
logging.error("html_similarity.similarity thre exception for URL pair %s and %s: %s", url1, url2, e)
|
||||
pairs[pair_key] = {
|
||||
'similarity': None,
|
||||
'exception': str(e),
|
||||
}
|
||||
|
||||
return pairs
|
||||
|
||||
|
|
Loading…
Reference in a new issue