Detect frameset (#102)

* Add frameset checker

* Remove unused variable (unrelated)
This commit is contained in:
Marian Steinbach 2018-12-07 16:31:56 +01:00 committed by GitHub
parent deff95306b
commit 3063a4488d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 140 additions and 1 deletions

View File

@ -10,6 +10,7 @@ from checks import certificate
from checks import dns_resolution
from checks import duplicate_content
from checks import domain_variations
from checks import frameset
from checks import generator
from checks import html_head
from checks import http_and_https
@ -41,6 +42,7 @@ def perform_checks(input_url):
('duplicate_content', duplicate_content),
('charset', charset),
('html_head', html_head),
('frameset', frameset),
('hyperlinks', hyperlinks),
('generator', generator),
('load_in_browser', load_in_browser),

53
checks/frameset.py Normal file
View File

@ -0,0 +1,53 @@
"""
Collects information on usage of the frameset tag
"""
import logging
from bs4 import BeautifulSoup
from checks.abstract_checker import AbstractChecker
class Checker(AbstractChecker):
    """
    Reports, for every configured URL, whether the fetched page contains
    a frameset tag.

    Reads the results of the 'page_content' check (see depends_on_results).
    """

    def __init__(self, config, previous_results=None):
        super().__init__(config, previous_results)

    def depends_on_results(self):
        """
        Returns the names of the checks whose results this checker reads.
        """
        return ['page_content']

    def run(self):
        """
        Returns a dict mapping every URL in self.config.urls to the value
        get_framesets() returns for that URL.
        """
        assert 'page_content' in self.previous_results

        return {url: self.get_framesets(url) for url in self.config.urls}

    def get_framesets(self, url):
        """
        Expects self.previous_results['page_content'][url]['content'] to
        carry the HTML content of the page.

        Returns None when that content is None. Otherwise returns a dict
        with the single key 'frameset', which is True if the document
        contains at least one frameset tag and False if it contains none.
        """
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content

        if page_content['content'] is None:
            return None

        soup = BeautifulSoup(page_content['content'], 'html.parser')

        # Only the existence of the tag matters, so the first match is
        # enough; find() returns None when the document has no such tag.
        return {'frameset': soup.find('frameset') is not None}

85
checks/frameset_test.py Normal file
View File

@ -0,0 +1,85 @@
import httpretty
from httpretty import httprettified
import unittest
from checks import frameset
from checks import page_content
from checks.config import Config
@httprettified
class TestFrameset(unittest.TestCase):
    """
    Runs the frameset checker against pages served through httpretty.
    """

    def _check_page(self, markup):
        """
        Registers markup as the GET response for http://example.com/, runs
        the page_content checker followed by the frameset checker, and
        returns a tuple of (frameset checker result, URLs held by the
        frameset checker's config after the run).
        """
        url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, url, body=markup)

        config = Config(urls=[url])
        content_checker = page_content.Checker(config=config, previous_results={})
        previous = {'page_content': content_checker.run()}

        frameset_checker = frameset.Checker(config=content_checker.config,
                                            previous_results=previous)
        outcome = frameset_checker.run()
        return outcome, frameset_checker.config.urls

    def test_frameset_positive(self):
        """A document containing a frameset tag is reported as True."""
        markup = """
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<title>A frameset page</title>
</head>
<frameset framespacing="0" border="false" frameborder="0" rows="30,*">
<frame name="top" src="top.htm" scrolling="no">
<frame name="base" src="titel.htm" target="_top">
<noframes>
<body>
<p>Here we have some body content</p>
</body>
</noframes>
</frameset>
</html>
"""
        outcome, urls_after = self._check_page(markup)

        self.assertEqual(outcome, {
            'http://example.com/': {'frameset': True}
        })
        self.assertEqual(urls_after, ['http://example.com/'])

    def test_frameset_negative(self):
        """A document without a frameset tag is reported as False."""
        markup = """
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head>
<title>A frameset page</title>
</head>
<body>
<p>Here we have some body content</p>
</body>
</html>
"""
        outcome, urls_after = self._check_page(markup)

        self.assertEqual(outcome, {
            'http://example.com/': {'frameset': False}
        })
        self.assertEqual(urls_after, ['http://example.com/'])
# Run the test cases of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -10,7 +10,6 @@ from checks.config import Config
class TestHyperlinks(unittest.TestCase):
def test_links(self):
self.maxDiff = 2000
page_body = """
<html>
<head>