mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-02 17:14:51 +02:00
More tests
This commit is contained in:
parent
e09bf7a4d4
commit
997519df35
|
@ -24,7 +24,7 @@ RUN wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x
|
||||||
&& rm -rf phantomjs-2.1.1-linux-x86_64
|
&& rm -rf phantomjs-2.1.1-linux-x86_64
|
||||||
|
|
||||||
ADD spider.py /
|
ADD spider.py /
|
||||||
ADD test.py /
|
ADD spider_test.py /
|
||||||
|
|
||||||
ENTRYPOINT ["python3"]
|
ENTRYPOINT ["python3"]
|
||||||
CMD ["/spider.py"]
|
CMD ["/spider.py"]
|
||||||
|
|
2
Makefile
2
Makefile
|
@ -12,7 +12,7 @@ spider: dockerimage
|
||||||
docker run --rm -ti -v $(PWD)/webapp/dist/data:/out spider
|
docker run --rm -ti -v $(PWD)/webapp/dist/data:/out spider
|
||||||
|
|
||||||
test: dockerimage
|
test: dockerimage
|
||||||
docker run --rm -ti spider /test.py
|
docker run --rm -ti spider /spider_test.py
|
||||||
|
|
||||||
screenshots: venv
|
screenshots: venv
|
||||||
docker pull netzbegruenung/green-spider-screenshotter:latest
|
docker pull netzbegruenung/green-spider-screenshotter:latest
|
||||||
|
|
27
spider.py
27
spider.py
|
@ -181,14 +181,17 @@ def check_content(r):
|
||||||
"""
|
"""
|
||||||
result = {}
|
result = {}
|
||||||
|
|
||||||
result['encoding'] = r.encoding
|
result['encoding'] = r.encoding.lower()
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
soup = BeautifulSoup(r.text, 'html.parser')
|
||||||
|
|
||||||
result['html'] = r.text
|
result['html'] = r.text
|
||||||
|
|
||||||
# page title
|
# page title
|
||||||
result['title'] = None
|
result['title'] = None
|
||||||
title = soup.find('head').find('title')
|
title = None
|
||||||
|
head = soup.find('head')
|
||||||
|
if head is not None:
|
||||||
|
title = head.find('title')
|
||||||
if title is not None:
|
if title is not None:
|
||||||
result['title'] = normalize_title(title.get_text())
|
result['title'] = normalize_title(title.get_text())
|
||||||
|
|
||||||
|
@ -222,19 +225,21 @@ def check_content(r):
|
||||||
|
|
||||||
# generator meta tag
|
# generator meta tag
|
||||||
result['generator'] = None
|
result['generator'] = None
|
||||||
generator = soup.head.select('[name=generator]')
|
if head is not None:
|
||||||
if len(generator):
|
generator = head.select('[name=generator]')
|
||||||
result['generator'] = generator[0].get('content')
|
if len(generator):
|
||||||
|
result['generator'] = generator[0].get('content')
|
||||||
|
|
||||||
# opengraph meta tags
|
# opengraph meta tags
|
||||||
result['opengraph'] = None
|
result['opengraph'] = None
|
||||||
og = set()
|
og = set()
|
||||||
for item in soup.head.find_all(property=re.compile('^og:')):
|
if head is not None:
|
||||||
og.add(item.get('property'))
|
for item in head.find_all(property=re.compile('^og:')):
|
||||||
for item in soup.head.find_all(itemprop=re.compile('^og:')):
|
og.add(item.get('property'))
|
||||||
og.add(item.get('itemprop'))
|
for item in head.find_all(itemprop=re.compile('^og:')):
|
||||||
if len(og):
|
og.add(item.get('itemprop'))
|
||||||
result['opengraph'] = list(og)
|
if len(og):
|
||||||
|
result['opengraph'] = sorted(list(og))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
125
spider_test.py
Normal file
125
spider_test.py
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
import unittest
|
||||||
|
import requests
|
||||||
|
import responses
|
||||||
|
import spider
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeriveHostnames(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_basic1(self):
|
||||||
|
hn = spider.derive_test_hostnames('www.my-domain.de')
|
||||||
|
expected = ['my-domain.de', 'www.my-domain.de']
|
||||||
|
self.assertEqual(hn, expected)
|
||||||
|
|
||||||
|
def test_basic2(self):
|
||||||
|
hn = spider.derive_test_hostnames('domain.de')
|
||||||
|
expected = ['domain.de', 'www.domain.de']
|
||||||
|
self.assertEqual(hn, expected)
|
||||||
|
|
||||||
|
|
||||||
|
class TestReduceURLs(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_basic(self):
|
||||||
|
testdata = [
|
||||||
|
{'url': 'one', 'error': None, 'redirects_to': None},
|
||||||
|
{'url': 'two', 'error': 'Yes', 'redirects_to': None},
|
||||||
|
{'url': 'three', 'error': None, 'redirects_to': 'five'},
|
||||||
|
]
|
||||||
|
expected_result = ['five', 'one']
|
||||||
|
result = spider.reduce_urls(testdata)
|
||||||
|
self.assertEqual(result, expected_result)
|
||||||
|
|
||||||
|
|
||||||
|
class TestContentChecks(unittest.TestCase):
|
||||||
|
|
||||||
|
@responses.activate
|
||||||
|
def test_minimal(self):
|
||||||
|
url = 'http://my.url'
|
||||||
|
responses.add(responses.GET, url, status=200,
|
||||||
|
content_type='text/html',
|
||||||
|
body='<html></html>')
|
||||||
|
r = requests.get(url)
|
||||||
|
result = spider.check_content(r)
|
||||||
|
|
||||||
|
del result['html'] # don't want to have the messy HTML part in comparison
|
||||||
|
|
||||||
|
expected_result = {
|
||||||
|
'icon': None,
|
||||||
|
'title': None,
|
||||||
|
'generator': None,
|
||||||
|
'feeds': [],
|
||||||
|
'encoding': 'iso-8859-1',
|
||||||
|
'canonical_link': None,
|
||||||
|
'opengraph': None
|
||||||
|
}
|
||||||
|
self.assertDictEqual(result, expected_result)
|
||||||
|
|
||||||
|
@responses.activate
|
||||||
|
def test_basic(self):
|
||||||
|
url = 'http://my.url'
|
||||||
|
responses.add(responses.GET, url, status=200,
|
||||||
|
content_type='text/html; charset=UTF-8',
|
||||||
|
body='''
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title> The page's title </title>
|
||||||
|
<meta name="generator" content="some-cms/1.0">
|
||||||
|
<link rel="shortcut icon" href="http://foo.bar/image.png">
|
||||||
|
<link rel="alternate" type="application/rss+xml" href="http://example.com/feed">
|
||||||
|
<link rel="canonical" href="https://my.site.com/">
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
''')
|
||||||
|
r = requests.get(url)
|
||||||
|
result = spider.check_content(r)
|
||||||
|
|
||||||
|
del result['html'] # don't want to have the messy HTML part in comparison
|
||||||
|
|
||||||
|
expected_result = {
|
||||||
|
'icon': 'http://foo.bar/image.png',
|
||||||
|
'title': 'The page\'s title',
|
||||||
|
'generator': 'some-cms/1.0',
|
||||||
|
'feeds': [
|
||||||
|
'http://example.com/feed',
|
||||||
|
],
|
||||||
|
'encoding': 'utf-8',
|
||||||
|
'canonical_link': 'https://my.site.com/',
|
||||||
|
'opengraph': None
|
||||||
|
}
|
||||||
|
self.assertDictEqual(result, expected_result)
|
||||||
|
|
||||||
|
@responses.activate
|
||||||
|
def test_opengraph(self):
|
||||||
|
url = 'http://my.url'
|
||||||
|
responses.add(responses.GET, url, status=200,
|
||||||
|
content_type='text/html; charset=UTF-8',
|
||||||
|
body='''
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta property="og:title" content="The Rock" />
|
||||||
|
<meta property="og:type" content="video.movie" />
|
||||||
|
<meta property="og:url" content="http://www.foor.bar" />
|
||||||
|
<meta property="og:image" content="http://www.foo.bar/foo.jpg" />
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
''')
|
||||||
|
r = requests.get(url)
|
||||||
|
result = spider.check_content(r)
|
||||||
|
|
||||||
|
del result['html'] # don't want to have the messy HTML part in comparison
|
||||||
|
|
||||||
|
expected_result = {
|
||||||
|
'icon': None,
|
||||||
|
'title': None,
|
||||||
|
'generator': None,
|
||||||
|
'feeds': [],
|
||||||
|
'encoding': 'utf-8',
|
||||||
|
'canonical_link': None,
|
||||||
|
'opengraph': ['og:image', 'og:title', 'og:type', 'og:url'],
|
||||||
|
}
|
||||||
|
self.assertDictEqual(result, expected_result)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
64
test.py
64
test.py
|
@ -1,64 +0,0 @@
|
||||||
import unittest
|
|
||||||
import requests
|
|
||||||
import responses
|
|
||||||
import spider
|
|
||||||
|
|
||||||
class TestSpider(unittest.TestCase):
|
|
||||||
|
|
||||||
def test_derive_test_hostnames(self):
|
|
||||||
# case 1
|
|
||||||
hn = spider.derive_test_hostnames('www.my-domain.de')
|
|
||||||
expected = ['my-domain.de', 'www.my-domain.de']
|
|
||||||
self.assertEqual(hn, expected)
|
|
||||||
# case 2
|
|
||||||
hn = spider.derive_test_hostnames('domain.de')
|
|
||||||
expected = ['domain.de', 'www.domain.de']
|
|
||||||
self.assertEqual(hn, expected)
|
|
||||||
|
|
||||||
def test_reduce_urls(self):
|
|
||||||
# This is our testdata
|
|
||||||
testdata = [
|
|
||||||
{'url': 'one', 'error': None, 'redirects_to': None},
|
|
||||||
{'url': 'two', 'error': 'Yes', 'redirects_to': None},
|
|
||||||
{'url': 'three', 'error': None, 'redirects_to': 'five'},
|
|
||||||
]
|
|
||||||
expected_result = ['five', 'one']
|
|
||||||
result = spider.reduce_urls(testdata)
|
|
||||||
self.assertEqual(result, expected_result)
|
|
||||||
|
|
||||||
@responses.activate
|
|
||||||
def test_check_content1(self):
|
|
||||||
"""
|
|
||||||
Very basic test of our content analysis function
|
|
||||||
"""
|
|
||||||
url = 'http://my.url'
|
|
||||||
responses.add(responses.GET, url, status=200,
|
|
||||||
content_type='text/html',
|
|
||||||
body='''
|
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<meta charset="UTF-8">
|
|
||||||
<title>The title</title>
|
|
||||||
</head>
|
|
||||||
</html>
|
|
||||||
''')
|
|
||||||
r = requests.get(url)
|
|
||||||
result = spider.check_content(r)
|
|
||||||
|
|
||||||
del result['html'] # don't want to have the messy HTML part in comparison
|
|
||||||
|
|
||||||
expected_result = {
|
|
||||||
'icon': None,
|
|
||||||
'title': 'The title',
|
|
||||||
'generator': None,
|
|
||||||
'feeds': [],
|
|
||||||
'encoding': 'ISO-8859-1',
|
|
||||||
'canonical_link': None,
|
|
||||||
'opengraph': None
|
|
||||||
}
|
|
||||||
self.assertDictEqual(result, expected_result)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
unittest.main()
|
|
Loading…
Reference in a new issue