From 9672e41dbac6bf0932bcbd3389cf174b300ad9e3 Mon Sep 17 00:00:00 2001 From: Marian Steinbach Date: Mon, 9 Apr 2018 23:02:12 +0200 Subject: [PATCH] Add icon retrieval --- spider.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/spider.py b/spider.py index 8929043..a540493 100644 --- a/spider.py +++ b/spider.py @@ -4,6 +4,7 @@ from bs4 import BeautifulSoup from git import Repo from multiprocessing import Pool from socket import gethostbyname_ex +from urllib.parse import urljoin from urllib.parse import urlparse import certifi import json @@ -148,7 +149,17 @@ def check_content(r): result['canonical_link'] = None link = soup.find('link', rel='canonical') if link: - result['canonical_link'] = link.get('href') + result['canonical_link'] = urljoin(r.url, link.get('href')) + + # icon + result['icon'] = None + link = soup.find('link', rel='icon') + if link: + result['icon'] = urljoin(r.url, link.get('href')) + else: + link = soup.find('link', rel='shortcut icon') + if link: + result['icon'] = urljoin(r.url, link.get('href')) # feed links result['feeds'] = [] @@ -157,10 +168,10 @@ def check_content(r): if len(rss_links) > 0: for l in rss_links: - result['feeds'].append(l.get('href')) + result['feeds'].append(urljoin(r.url, l.get('href'))) if len(atom_links) > 0: for l in rss_links: - result['feeds'].append(l.get('href')) + result['feeds'].append(urljoin(r.url, l.get('href'))) # generator meta tag result['generator'] = None @@ -180,6 +191,7 @@ def check_content(r): return result + def check_site(url): """ Performs our site check and returns results as a dict.