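"""
Collects social media statistics for the entries of the green directory:
Facebook likes, Twitter and Instagram follower counts, and verification
status. The aggregated result is written to docs/result.json.
"""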

from git import Repo
import os
import shutil
from ruamel import yaml
from pprint import pprint
import sys
import re
import json
from scraper import scrapeFacebookData, scrapeInstagramData, scrapeTwitterData
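# Each scrape*Data helper is expected to return a (count, verified) tuple.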
from time import sleep
# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_directory_data_path = 'data/countries/de'
green_directory_local_path = './cache/green-directory'


def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git.
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path, onerror=onerror)
    Repo.clone_from(green_directory_repo, green_directory_local_path)


def dir_entries():
    """
    Iterator over all data files in the cloned green directory.
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile, Loader=yaml.Loader):
                    yield doc


def onerror(func, path, _):
    """
    Error handler for ``shutil.rmtree``.
    If the error is due to an access error (read only file)
    it attempts to add write permission and then retries.
    If the error is for another reason it re-raises the error.
    Usage: ``shutil.rmtree(path, onerror=onerror)``
    """
    import stat
    if not os.access(path, os.W_OK):
        # Is the error an access error?
        os.chmod(path, stat.S_IWUSR)
        func(path)
    else:
        raise


def getFacebookName(url):
    """
    Extracts the Facebook page name (or numeric page ID) from a profile URL.
    Returns None for group URLs and when a trailing numeric ID has fewer
    than 10 digits.
    """
    if "/groups/" in url:
        return None
    match = re.match(r".+-(\d+)", url)
    if match:
        result = match.group(1)
        if len(result) < 10:
            print(url, "--", result, file=sys.stderr)
            return None
        return result

    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]


def getTwitterName(url):
    """
    Returns the last non-empty path segment of the profile URL (the handle).
    """
    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]


def getInstagramName(url):
    """
    Returns the last non-empty path segment of the profile URL (the account name).
    """
    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]


def main():
    get_green_directory()

    result = {}
    idx = 0

    fbcount = 0
    twtcount = 0
    instacount = 0

    for entry in dir_entries():
        fbname = "--"
        fbLikes = 0
        fbVerified = False
        twtname = "--"
        twtFollower = 0
        twtVerified = False
        instaName = "--"
        instaFollower = 0
        instaVerified = False

        if not entry.get("urls"):
            continue

        for url in entry["urls"]:
            if url["type"] == "FACEBOOK":
                fbname = getFacebookName(url["url"])
                if fbname:
                    try:
                        fbLikes, fbVerified = scrapeFacebookData(fbname)
                        sleep(0.1)
                    except Exception as e:
                        print("FACEBOOK ERROR for", url["url"], "--", fbname, file=sys.stderr)
                        print(e, file=sys.stderr)
                        continue
                    print(" FB", fbname, fbLikes, fbVerified)
                    fbcount += 1
            elif url["type"] == "TWITTER":
                twtname = getTwitterName(url["url"])
                try:
                    twtFollower, twtVerified = scrapeTwitterData(twtname)
                    sleep(0.1)
                except Exception as e:
                    print("TWITTER ERROR for", url["url"], "--", twtname, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
                twtcount += 1
                print(" TWITTER", twtname, twtFollower, twtVerified)
            elif url["type"] == "INSTAGRAM":
                instaName = getInstagramName(url["url"])
                try:
                    instaFollower, instaVerified = scrapeInstagramData(instaName)
                    sleep(0.1)
                except Exception as e:
                    print("INSTAGRAM ERROR for", url["url"], "--", instaName, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
                instacount += 1
                print(" INSTA", instaName, instaFollower, instaVerified)

        typ = entry.get("type")
        level = entry.get("level", "")
        land = entry.get("state", "")
        kreis = entry.get("district", "")
        stadt = entry.get("city", "")

        if fbname is None:
            fbname = "--"

        if fbLikes + twtFollower + instaFollower > 0:
            key = "//".join([typ, level, land, kreis, stadt])
            result.update({key: [typ, level, land, kreis, stadt,
                                 fbname, fbLikes, fbVerified,
                                 twtname, twtFollower, twtVerified,
                                 instaName, instaFollower, instaVerified]})
            idx += 1

    with open("docs/result.json", "w") as f:
        json.dump(result, f)

    print("facebook:", fbcount, "twitter:", twtcount, "instagram:", instacount)


if __name__ == "__main__":
    main()