mirror of
https://github.com/netzbegruenung/green-spider.git
synced 2024-05-05 18:33:40 +02:00
Merge pull request #6 from netzbegruenung/content-checks
Neue checks basierend auf Inhalten
This commit is contained in:
commit
12e2770ffb
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load diff
|
@ -21,6 +21,11 @@
|
|||
background-color: #cfeaa8;
|
||||
font-size: 1rem;
|
||||
}
|
||||
|
||||
.icon {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
@ -34,9 +39,11 @@
|
|||
<tr>
|
||||
<th scope="col">URL</th>
|
||||
<th scope="col">IP-Adresse</th>
|
||||
<th scope="col">Icon</th>
|
||||
<th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
|
||||
<th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
|
||||
<th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
|
||||
<th scope="col">Feed</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
beautifulsoup4==4.6.0
|
||||
certifi==2018.1.18
|
||||
chardet==3.0.4
|
||||
gitdb2==2.0.3
|
||||
GitPython==2.1.9
|
||||
idna==2.6
|
||||
PyYAML==3.12
|
||||
requests==2.18.4
|
||||
smmap2==2.0.3
|
||||
urllib3==1.22
|
||||
pyyaml==3.12
|
||||
|
|
85
spider.py
85
spider.py
|
@ -1,14 +1,17 @@
|
|||
# coding: utf8
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from git import Repo
|
||||
from multiprocessing import Pool
|
||||
from socket import gethostbyname_ex
|
||||
from urllib.parse import urljoin
|
||||
from urllib.parse import urlparse
|
||||
import certifi
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import requests
|
||||
import shutil
|
||||
import sys
|
||||
|
@ -115,6 +118,80 @@ def reduce_urls(urllist):
|
|||
return list(targets)
|
||||
|
||||
|
||||
def normalize_title(s):
    """
    Return a cleaned-up HTML page title.

    Non-breaking spaces are turned into regular spaces, doubled
    spaces are collapsed once, and surrounding whitespace is removed.
    """
    for needle, replacement in (('\u00a0', ' '), ('  ', ' ')):
        s = s.replace(needle, replacement)
    return s.strip()
|
||||
|
||||
def check_content(r):
    """
    Adds details to check regarding content of the page

    r: requests request/response object

    Returns a dict with the page encoding, title, canonical link,
    icon URL, feed URLs, generator meta tag and OpenGraph properties.
    Relative link targets are resolved against the final response URL.
    """
    result = {}

    result['encoding'] = r.encoding
    soup = BeautifulSoup(r.text, 'html.parser')

    # page title
    result['title'] = None
    title = soup.find('head').find('title')
    if title is not None:
        result['title'] = normalize_title(title.get_text())

    # canonical link
    result['canonical_link'] = None
    link = soup.find('link', rel='canonical')
    if link:
        result['canonical_link'] = urljoin(r.url, link.get('href'))

    # icon ("icon" preferred, legacy "shortcut icon" as fallback)
    result['icon'] = None
    link = soup.find('link', rel='icon')
    if link:
        result['icon'] = urljoin(r.url, link.get('href'))
    else:
        link = soup.find('link', rel='shortcut icon')
        if link:
            result['icon'] = urljoin(r.url, link.get('href'))

    # feed links
    result['feeds'] = []
    rss_links = soup.find_all('link', type='application/rss+xml')
    atom_links = soup.find_all('link', type='application/atom+xml')

    for link in rss_links:
        result['feeds'].append(urljoin(r.url, link.get('href')))
    # Bug fix: this loop previously iterated rss_links again, so Atom
    # feeds were never collected and RSS feeds were duplicated whenever
    # both feed types were present.
    for link in atom_links:
        result['feeds'].append(urljoin(r.url, link.get('href')))

    # generator meta tag
    result['generator'] = None
    generator = soup.head.select('[name=generator]')
    if len(generator):
        result['generator'] = generator[0].get('content')

    # opengraph meta tags (collected both via property= and itemprop=)
    result['opengraph'] = None
    og = set()
    for item in soup.head.find_all(property=re.compile('^og:')):
        og.add(item.get('property'))
    for item in soup.head.find_all(itemprop=re.compile('^og:')):
        og.add(item.get('itemprop'))
    if len(og):
        result['opengraph'] = list(og)

    return result
|
||||
|
||||
|
||||
def check_site(url):
|
||||
"""
|
||||
Performs our site check and returns results as a dict.
|
||||
|
@ -204,12 +281,18 @@ def check_site(url):
|
|||
'status_code': None,
|
||||
'duration': None,
|
||||
'error': None,
|
||||
'content': None,
|
||||
}
|
||||
|
||||
try:
|
||||
r = requests.get(check_url, headers=headers, timeout=(connect_timeout, read_timeout))
|
||||
check['status_code'] = r.status_code
|
||||
check['duration'] = round(r.elapsed.microseconds / 1000)
|
||||
|
||||
# Content checks
|
||||
if r.status_code < 300:
|
||||
check['content'] = check_content(r)
|
||||
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
logging.error(str(e) + " " + check_url)
|
||||
check['error'] = "connection"
|
||||
|
@ -285,7 +368,7 @@ def main():
|
|||
# Write result as JSON
|
||||
output_filename = os.path.join(result_path, "spider_result.json")
|
||||
with open(output_filename, 'w', encoding="utf8") as jsonfile:
|
||||
json.dump(results2, jsonfile, indent=2, sort_keys=True)
|
||||
json.dump(results2, jsonfile, indent=2, sort_keys=True, ensure_ascii=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
26
webapp/dist/bundle.js
vendored
26
webapp/dist/bundle.js
vendored
File diff suppressed because one or more lines are too long
13462
webapp/dist/data/spider_result.json
vendored
13462
webapp/dist/data/spider_result.json
vendored
File diff suppressed because it is too large
Load diff
7
webapp/dist/index.html
vendored
7
webapp/dist/index.html
vendored
|
@ -21,6 +21,11 @@
|
|||
background-color: #cfeaa8;
|
||||
font-size: 1rem;
|
||||
}
|
||||
|
||||
.icon {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
@ -34,9 +39,11 @@
|
|||
<tr>
|
||||
<th scope="col">URL</th>
|
||||
<th scope="col">IP-Adresse</th>
|
||||
<th scope="col">Icon</th>
|
||||
<th scope="col"><abbr title="Site ist sowohl mit www. als auch ohne www. in URL erreichbar">www. optional</abbr></th>
|
||||
<th scope="col"><abbr title="URL-Varianten leiten auf eine einzige Startseiten-URL weiter">Kanonische URL</abbr></th>
|
||||
<th scope="col"><abbr title="Site nutzt HTTP-Verschlüsselung">HTTPS</abbr></th>
|
||||
<th scope="col">Feed</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
|
|
|
@ -20,6 +20,15 @@ $(function(){
|
|||
var ips = _.join(_.uniq(_.flatten(_.map(item.hostnames, 'ip_addresses'))), ', ');
|
||||
row.append('<td class="'+ (ips === '' ? 'bad' : 'good') +' text-center">' + (ips === '' ? '❌ Keine' : ips) + '</td>');
|
||||
|
||||
// icon
|
||||
var icons = [];
|
||||
var icon = false;
|
||||
icons = _.uniq(_.map(item.urlchecks, 'content.icon'));
|
||||
if (icons.length > 0 && icons[0]) {
|
||||
icon = icons[0];
|
||||
}
|
||||
row.append('<td class="' + (icon ? 'good' : 'bad') + ' text-center">' + (icon ? ('<img src="' + icon + '" class="icon"/>') : '❌') + '</td>');
|
||||
|
||||
// hostnames
|
||||
var twoHostnames = false;
|
||||
if (_.filter(item.hostnames, {'resolvable': true}).length === 2) {
|
||||
|
@ -30,6 +39,8 @@ $(function(){
|
|||
// one canonical URL
|
||||
var canonical = false;
|
||||
if (item.canonical_urls.length === 1 ) canonical = true;
|
||||
var canonical_links = _.uniq(_.map(item.urlchecks, 'content.canonical_link'));
|
||||
if (canonical_links.length === 1) canonical = true;
|
||||
row.append('<td class="'+ (canonical ? 'good' : 'bad') +' text-center">' + (canonical ? '✅' : '❌') + '</td>');
|
||||
|
||||
// https
|
||||
|
@ -39,6 +50,11 @@ $(function(){
|
|||
});
|
||||
row.append('<td class="'+ (hasHTTPS ? 'good' : 'bad') +' text-center">' + (hasHTTPS ? '✅' : '❌') + '</td>');
|
||||
|
||||
// feeds
|
||||
var feeds = false;
|
||||
feeds = _.uniq(_.flatten(_.map(item.urlchecks, 'content.feeds')));
|
||||
row.append('<td class="'+ (feeds.length ? 'good' : 'bad') +' text-center">' + (feeds.length ? '✅' : '❌') + '</td>');
|
||||
|
||||
tbody.append(row);
|
||||
});
|
||||
|
||||
|
|
Loading…
Reference in a new issue