Browse Source

More tests

pull/36/head
Marian Steinbach 4 years ago
parent
commit
997519df35
  1. 2
      Dockerfile
  2. 2
      Makefile
  3. 27
      spider.py
  4. 125
      spider_test.py
  5. 64
      test.py

2
Dockerfile

@ -24,7 +24,7 @@ RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x
&& rm -rf phantomjs-2.1.1-linux-x86_64
ADD spider.py /
ADD test.py /
ADD spider_test.py /
ENTRYPOINT ["python3"]
CMD ["/spider.py"]

2
Makefile

@ -12,7 +12,7 @@ spider: dockerimage
docker run --rm -ti -v $(PWD)/webapp/dist/data:/out spider
test: dockerimage
docker run --rm -ti spider /test.py
docker run --rm -ti spider /spider_test.py
screenshots: venv
docker pull netzbegruenung/green-spider-screenshotter:latest

27
spider.py

@ -181,14 +181,17 @@ def check_content(r):
"""
result = {}
result['encoding'] = r.encoding
result['encoding'] = r.encoding.lower()
soup = BeautifulSoup(r.text, 'html.parser')
result['html'] = r.text
# page title
result['title'] = None
title = soup.find('head').find('title')
title = None
head = soup.find('head')
if head is not None:
title = head.find('title')
if title is not None:
result['title'] = normalize_title(title.get_text())
@ -222,19 +225,21 @@ def check_content(r):
# generator meta tag
result['generator'] = None
generator = soup.head.select('[name=generator]')
if len(generator):
result['generator'] = generator[0].get('content')
if head is not None:
generator = head.select('[name=generator]')
if len(generator):
result['generator'] = generator[0].get('content')
# opengraph meta tags
result['opengraph'] = None
og = set()
for item in soup.head.find_all(property=re.compile('^og:')):
og.add(item.get('property'))
for item in soup.head.find_all(itemprop=re.compile('^og:')):
og.add(item.get('itemprop'))
if len(og):
result['opengraph'] = list(og)
if head is not None:
for item in head.find_all(property=re.compile('^og:')):
og.add(item.get('property'))
for item in head.find_all(itemprop=re.compile('^og:')):
og.add(item.get('itemprop'))
if len(og):
result['opengraph'] = sorted(list(og))
return result

125
spider_test.py

@ -0,0 +1,125 @@
import unittest
import requests
import responses
import spider
class TestDeriveHostnames(unittest.TestCase):
    """Checks for ``spider.derive_test_hostnames``."""

    def test_basic1(self):
        # A "www." input is expected to yield both the bare and the
        # "www." variant, bare name first.
        self.assertEqual(
            spider.derive_test_hostnames('www.my-domain.de'),
            ['my-domain.de', 'www.my-domain.de'],
        )

    def test_basic2(self):
        # A bare input is expected to yield the same pair of variants.
        self.assertEqual(
            spider.derive_test_hostnames('domain.de'),
            ['domain.de', 'www.domain.de'],
        )
class TestReduceURLs(unittest.TestCase):
    """Checks for ``spider.reduce_urls``."""

    def test_basic(self):
        # One plain record, one record with an error, one redirecting record.
        plain = {'url': 'one', 'error': None, 'redirects_to': None}
        failed = {'url': 'two', 'error': 'Yes', 'redirects_to': None}
        redirected = {'url': 'three', 'error': None, 'redirects_to': 'five'}

        reduced = spider.reduce_urls([plain, failed, redirected])

        # The test expects the errored record to be absent and the
        # redirect target to appear in place of the redirecting URL.
        self.assertEqual(reduced, ['five', 'one'])
class TestContentChecks(unittest.TestCase):
    """Checks for ``spider.check_content`` using mocked HTTP responses."""

    def _analyze(self, content_type, body):
        # Register a mocked GET response, fetch it and run the content
        # analysis on it. The calling tests are decorated with
        # ``responses.activate`` so that the request is answered by the mock.
        url = 'http://my.url'
        responses.add(responses.GET, url, status=200,
                      content_type=content_type,
                      body=body)
        analysis = spider.check_content(requests.get(url))
        del analysis['html']  # don't want to have the messy HTML part in comparison
        return analysis

    @responses.activate
    def test_minimal(self):
        # Document without a <head> element and without a charset in the
        # content type.
        analysis = self._analyze('text/html', '<html></html>')
        self.assertDictEqual(analysis, {
            'icon': None,
            'title': None,
            'generator': None,
            'feeds': [],
            'encoding': 'iso-8859-1',
            'canonical_link': None,
            'opengraph': None
        })

    @responses.activate
    def test_basic(self):
        # Document carrying title, generator, icon, feed and canonical link.
        page = '''
<!DOCTYPE html>
<html>
<head>
<title> The page's title </title>
<meta name="generator" content="some-cms/1.0">
<link rel="shortcut icon" href="http://foo.bar/image.png">
<link rel="alternate" type="application/rss+xml" href="http://example.com/feed">
<link rel="canonical" href="https://my.site.com/">
</head>
</html>
'''
        analysis = self._analyze('text/html; charset=UTF-8', page)
        self.assertDictEqual(analysis, {
            'icon': 'http://foo.bar/image.png',
            'title': 'The page\'s title',
            'generator': 'some-cms/1.0',
            'feeds': [
                'http://example.com/feed',
            ],
            'encoding': 'utf-8',
            'canonical_link': 'https://my.site.com/',
            'opengraph': None
        })

    @responses.activate
    def test_opengraph(self):
        # Document whose head only carries OpenGraph <meta property> tags;
        # the property names are expected back in sorted order.
        page = '''
<html>
<head>
<meta property="og:title" content="The Rock" />
<meta property="og:type" content="video.movie" />
<meta property="og:url" content="http://www.foor.bar" />
<meta property="og:image" content="http://www.foo.bar/foo.jpg" />
</head>
</html>
'''
        analysis = self._analyze('text/html; charset=UTF-8', page)
        self.assertDictEqual(analysis, {
            'icon': None,
            'title': None,
            'generator': None,
            'feeds': [],
            'encoding': 'utf-8',
            'canonical_link': None,
            'opengraph': ['og:image', 'og:title', 'og:type', 'og:url'],
        })
# Run the test suite when this module is executed as a script
# (e.g. `python3 /spider_test.py`).
if __name__ == '__main__':
    unittest.main()

64
test.py

@ -1,64 +0,0 @@
import unittest
import requests
import responses
import spider
class TestSpider(unittest.TestCase):
    """Checks for the helper functions of the ``spider`` module."""

    def test_derive_test_hostnames(self):
        # (input hostname, expected variants) pairs, checked in order.
        cases = (
            # case 1
            ('www.my-domain.de', ['my-domain.de', 'www.my-domain.de']),
            # case 2
            ('domain.de', ['domain.de', 'www.domain.de']),
        )
        for hostname, expected in cases:
            self.assertEqual(spider.derive_test_hostnames(hostname), expected)

    def test_reduce_urls(self):
        # This is our testdata: a plain record, an errored record and a
        # redirecting record.
        plain = {'url': 'one', 'error': None, 'redirects_to': None}
        failed = {'url': 'two', 'error': 'Yes', 'redirects_to': None}
        redirected = {'url': 'three', 'error': None, 'redirects_to': 'five'}

        reduced = spider.reduce_urls([plain, failed, redirected])

        self.assertEqual(reduced, ['five', 'one'])

    @responses.activate
    def test_check_content1(self):
        """
        Very basic test of our content analysis function
        """
        url = 'http://my.url'
        page = '''
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>The title</title>
</head>
</html>
'''
        responses.add(responses.GET, url, status=200,
                      content_type='text/html',
                      body=page)

        analysis = spider.check_content(requests.get(url))
        del analysis['html']  # don't want to have the messy HTML part in comparison

        self.assertDictEqual(analysis, {
            'icon': None,
            'title': 'The title',
            'generator': None,
            'feeds': [],
            'encoding': 'ISO-8859-1',
            'canonical_link': None,
            'opengraph': None
        })
# Run the test suite when this module is executed as a script
# (e.g. `python3 /test.py`).
if __name__ == '__main__':
    unittest.main()
Loading…
Cancel
Save