diff --git a/Dockerfile b/Dockerfile index 70b2403..0be8584 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x && rm -rf phantomjs-2.1.1-linux-x86_64 ADD spider.py / -ADD test.py / +ADD spider_test.py / ENTRYPOINT ["python3"] CMD ["/spider.py"] diff --git a/Makefile b/Makefile index 6277d99..8a14663 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ spider: dockerimage docker run --rm -ti -v $(PWD)/webapp/dist/data:/out spider test: dockerimage - docker run --rm -ti spider /test.py + docker run --rm -ti spider /spider_test.py screenshots: venv docker pull netzbegruenung/green-spider-screenshotter:latest diff --git a/spider.py b/spider.py index 6866c4e..3cba123 100644 --- a/spider.py +++ b/spider.py @@ -181,14 +181,17 @@ def check_content(r): """ result = {} - result['encoding'] = r.encoding + result['encoding'] = r.encoding.lower() soup = BeautifulSoup(r.text, 'html.parser') result['html'] = r.text # page title result['title'] = None - title = soup.find('head').find('title') + title = None + head = soup.find('head') + if head is not None: + title = head.find('title') if title is not None: result['title'] = normalize_title(title.get_text()) @@ -222,19 +225,21 @@ def check_content(r): # generator meta tag result['generator'] = None - generator = soup.head.select('[name=generator]') - if len(generator): - result['generator'] = generator[0].get('content') + if head is not None: + generator = head.select('[name=generator]') + if len(generator): + result['generator'] = generator[0].get('content') # opengraph meta tags result['opengraph'] = None og = set() - for item in soup.head.find_all(property=re.compile('^og:')): - og.add(item.get('property')) - for item in soup.head.find_all(itemprop=re.compile('^og:')): - og.add(item.get('itemprop')) - if len(og): - result['opengraph'] = list(og) + if head is not None: + for item in head.find_all(property=re.compile('^og:')): + og.add(item.get('property')) + for item in head.find_all(itemprop=re.compile('^og:')): + og.add(item.get('itemprop')) + if len(og): + result['opengraph'] = sorted(list(og)) return result diff --git a/spider_test.py b/spider_test.py new file mode 100644 index 0000000..a617147 --- /dev/null +++ b/spider_test.py @@ -0,0 +1,125 @@ +import unittest +import requests +import responses +import spider + + +class TestDeriveHostnames(unittest.TestCase): + + def test_basic1(self): + hn = spider.derive_test_hostnames('www.my-domain.de') + expected = ['my-domain.de', 'www.my-domain.de'] + self.assertEqual(hn, expected) + + def test_basic2(self): + hn = spider.derive_test_hostnames('domain.de') + expected = ['domain.de', 'www.domain.de'] + self.assertEqual(hn, expected) + + +class TestReduceURLs(unittest.TestCase): + + def test_basic(self): + testdata = [ + {'url': 'one', 'error': None, 'redirects_to': None}, + {'url': 'two', 'error': 'Yes', 'redirects_to': None}, + {'url': 'three', 'error': None, 'redirects_to': 'five'}, + ] + expected_result = ['five', 'one'] + result = spider.reduce_urls(testdata) + self.assertEqual(result, expected_result) + + +class TestContentChecks(unittest.TestCase): + + @responses.activate + def test_minimal(self): + url = 'http://my.url' + responses.add(responses.GET, url, status=200, + content_type='text/html', + body='') + r = requests.get(url) + result = spider.check_content(r) + + del result['html'] # don't want to have the messy HTML part in comparison + + expected_result = { + 'icon': None, + 'title': None, + 'generator': None, + 'feeds': [], + 'encoding': 'iso-8859-1', + 'canonical_link': None, + 'opengraph': None + } + self.assertDictEqual(result, expected_result) + + @responses.activate + def test_basic(self): + url = 'http://my.url' + responses.add(responses.GET, url, status=200, + content_type='text/html; charset=UTF-8', + body=''' + + + + The page's title + + + + + + + ''') + r = requests.get(url) + result = spider.check_content(r) + + del result['html'] # don't want to have the messy HTML part in comparison + + expected_result = { + 'icon': 'http://foo.bar/image.png', + 'title': 'The page\'s title', + 'generator': 'some-cms/1.0', + 'feeds': [ + 'http://example.com/feed', + ], + 'encoding': 'utf-8', + 'canonical_link': 'https://my.site.com/', + 'opengraph': None + } + self.assertDictEqual(result, expected_result) + + @responses.activate + def test_opengraph(self): + url = 'http://my.url' + responses.add(responses.GET, url, status=200, + content_type='text/html; charset=UTF-8', + body=''' + + + + + + + + + ''') + r = requests.get(url) + result = spider.check_content(r) + + del result['html'] # don't want to have the messy HTML part in comparison + + expected_result = { + 'icon': None, + 'title': None, + 'generator': None, + 'feeds': [], + 'encoding': 'utf-8', + 'canonical_link': None, + 'opengraph': ['og:image', 'og:title', 'og:type', 'og:url'], + } + self.assertDictEqual(result, expected_result) + + +if __name__ == '__main__': + unittest.main() diff --git a/test.py b/test.py deleted file mode 100644 index e165a45..0000000 --- a/test.py +++ /dev/null @@ -1,64 +0,0 @@ -import unittest -import requests -import responses -import spider - -class TestSpider(unittest.TestCase): - - def test_derive_test_hostnames(self): - # case 1 - hn = spider.derive_test_hostnames('www.my-domain.de') - expected = ['my-domain.de', 'www.my-domain.de'] - self.assertEqual(hn, expected) - # case 2 - hn = spider.derive_test_hostnames('domain.de') - expected = ['domain.de', 'www.domain.de'] - self.assertEqual(hn, expected) - - def test_reduce_urls(self): - # This is our testdata - testdata = [ - {'url': 'one', 'error': None, 'redirects_to': None}, - {'url': 'two', 'error': 'Yes', 'redirects_to': None}, - {'url': 'three', 'error': None, 'redirects_to': 'five'}, - ] - expected_result = ['five', 'one'] - result = spider.reduce_urls(testdata) - self.assertEqual(result, expected_result) - - @responses.activate - def test_check_content1(self): - """ - Very basic test of our content analysis function - """ - url = 'http://my.url' - responses.add(responses.GET, url, status=200, - content_type='text/html', - body=''' - - - - - The title - - - ''') - r = requests.get(url) - result = spider.check_content(r) - - del result['html'] # don't want to have the messy HTML part in comparison - - expected_result = { - 'icon': None, - 'title': 'The title', - 'generator': None, - 'feeds': [], - 'encoding': 'ISO-8859-1', - 'canonical_link': None, - 'opengraph': None - } - self.assertDictEqual(result, expected_result) - - -if __name__ == '__main__': - unittest.main()