mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-04-27 23:04:52 +02:00
Marian Steinbach
618e29d763
* CLI: remove 'jobs' command, add 'manager' * Add job definition * Move jobs to manage folder * Rename jobs to manager * Add rq and redis dependencies * Add docker-compose YAML * Downgrade to alpine 3.8 * Adjust paths in Dockerfile, remove entrypoint * Rename 'make spiderjobs' to 'make jobs' * Fix docker exectution * Adapt 'make jobs' * Fix metadata scheme * Add docker dependency * Rendomize queue (a bit) * Use latest image, remove debug output * Make docker-compose file downwards-compatible * Use latest instead of dev image tag * Update docker-compose.yaml * Adapt job start script * Fix redis connection in manager * Add support for increasing timeout via environment variable * Adapt load_in_browser to cookies table schema change * Fix execution * Mitigate yaml warning * Bump some dependency versions * Report resource usage stats for each job * checks/load_in_browser: Return DOM size, prevent multiple page loads * Update .dockerignore * Code update * Script update * Update README.md * WIP * WIP commit * Update Dockerfile to alpine:edge and chromium v90 * Update TestCertificateChecker * Set defaults for __init__ function * Detect sunflower theme * Update unit test for new datetime (zero-basing) * Set logging prefs from Chromium in a new way * Move datastore client instantiation As it is not needed for all commands * Change green-directory repository URL * Add git settings for cloning green-directory * Pin alpine version 3.14, fix py3-cryptography * Use plain docker build progress output * Add volumes to 'make test' docker run command * Fix bug * Update example command in README * Update dependencies * Add creation of Kubernetes jobs
96 lines
3.2 KiB
Python
96 lines
3.2 KiB
Python
"""
|
|
Checks the 'generator' meta tag and page content properties
|
|
to detect well-known content management systems, themes etc.
|
|
"""
|
|
|
|
from checks.abstract_checker import AbstractChecker
|
|
|
|
class Checker(AbstractChecker):
    """
    Detects the content management system (and, where recognizable, the
    theme or hosting flavour) behind every configured URL.

    The detection combines three inputs produced by earlier checks:
    the 'generator' meta tag ('html_head'), marker strings in the page
    body ('page_content') and the resolved IPv4 addresses
    ('dns_resolution').
    """

    # IP address of the newthinking GCMS server
    gcms_ip = "91.102.13.20"

    # CMS names searched for in the lower-cased 'generator' meta tag value.
    # The first name found replaces the raw tag value.
    _generator_names = ('typo3', 'wordpress', 'drupal', 'joomla')

    # Ordered (result, markers) pairs applied to the page body whenever the
    # meta tag did not identify Typo3. The first entry with any marker
    # present in the body wins, so specific themes must precede the
    # generic 'wordpress' entry. Markers are matched case-sensitively.
    _content_markers = (
        ('wordpress-blumomatic', ('blum-o-matic',)),
        ('wordpress-gruenes-internet', ('gruenes-internet.de',)),
        ('wordpress-urwahl', ('Urwahl3000', '/themes/urwahl3000')),
        ('wordpress-sunflower', ('/themes/sunflower',)),
        ('wordpress-josephknowsbest', ('josephknowsbest',
                                       'Joseph-knows-best')),
        ('wordpress', ('wordpress',)),
        ('jimdo', ('jimdo',)),
    )

    def __init__(self, config, previous_results=None):
        """Pass config and previous results on to the base checker."""
        super().__init__(config, previous_results)

    def depends_on_results(self):
        """Return the names of the earlier results this check reads."""
        return ['page_content', 'html_head', 'dns_resolution']

    def run(self):
        """
        Determine the generator for every URL in self.config.urls.

        Returns a dict mapping each URL to the value returned by
        get_generator() for it.
        """
        assert 'page_content' in self.previous_results
        assert 'html_head' in self.previous_results
        assert 'dns_resolution' in self.previous_results

        return {url: self.get_generator(url) for url in self.config.urls}

    def get_generator(self, url):
        """
        Return the detected generator for a single URL.

        The result is one of the well-known identifiers (e.g. 'typo3',
        'typo3-gruene', 'typo3-gcms', 'wordpress', 'wordpress-urwahl',
        'drupal', 'joomla', 'jimdo'), the lower-cased raw value of the
        'generator' meta tag if it names none of the known systems and
        no body marker matches, or None if nothing was detected.

        Raises KeyError if there is no 'page_content' or 'html_head'
        result for the URL.
        """
        page_content = self.previous_results['page_content'][url]
        assert 'content' in page_content

        # A missing (None) body is treated as empty: it cannot contain
        # any marker, and must not abort the check with a TypeError.
        content = page_content['content'] or ''

        dns_resolution = self.previous_results['dns_resolution']

        # Likewise, a missing head result simply yields no meta tag.
        head = self.previous_results['html_head'][url] or {}

        generator = None

        if head.get('generator') is not None:
            generator = head['generator'].lower()
            for name in self._generator_names:
                if name in generator:
                    generator = name
                    break

        # Qualify certain CMS flavours in more detail
        if generator == 'typo3':
            return self._qualify_typo3(url, content, dns_resolution)

        # Body markers take precedence over any non-Typo3 meta tag value
        for flavour, markers in self._content_markers:
            if any(marker in content for marker in markers):
                return flavour

        return generator

    def _qualify_typo3(self, url, content, dns_resolution):
        """Return the Typo3 flavour, or plain 'typo3' if none is recognized."""
        # Typo3-Gruene advertises in the page content
        if 'typo3-gruene.de' in content:
            return 'typo3-gruene'

        # newthinking GCMS in some page hrefs
        if 'ntc_gcms' in content:
            return 'typo3-gcms'

        # check if one of the IPs matches the well-known GCMS Server IP
        if url in dns_resolution:
            # An absent or None address list counts as "no addresses"
            addresses = dns_resolution[url].get('ipv4_addresses') or ()
            if self.gcms_ip in addresses:
                return 'typo3-gcms'

        return 'typo3'
|