green-spider/checks/duplicate_content_test.py

57 lines
1.7 KiB
Python

import httpretty
from httpretty import httprettified
import unittest
from checks import duplicate_content
from checks import page_content
from checks.config import Config
@httprettified
class TestDuplicateContent(unittest.TestCase):
def test_identical(self):
page_body = """
<html>
<head>
<title>Title</title>
</head>
<body>
<h1 class="title">Headline</h1>
<p class="intro">Second paragraph with <strong>strong words</strong></p>
<p class="text">Third paragraph</p>
<ul class="somelist">
<li>A list item</li>
</ul>
</body>
</html>
"""
url1 = 'http://example.com/'
httpretty.register_uri(httpretty.GET, url1, body=page_body)
url2 = 'http://www.example.com/'
httpretty.register_uri(httpretty.GET, url2, body=page_body)
results = {}
config = Config(urls=[url1, url2])
page_content_checker = page_content.Checker(config=config, previous_results={})
results['page_content'] = page_content_checker.run()
checker = duplicate_content.Checker(config=page_content_checker.config,
previous_results=results)
result = checker.run()
urls_after = checker.config.urls
self.assertEqual(result, {
'http://example.com/ http://www.example.com/': {
'exception': None,
'similarity': 1.0
}
})
self.assertEqual(urls_after, ['http://example.com/'])
if __name__ == '__main__':
unittest.main()