# social-spider/spider.py

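"""
Scrapes follower/like counts and verification status for the Facebook,
Twitter and Instagram profiles listed in the green-directory and writes
the aggregated numbers to docs/result.json.
"""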

import json
import os
import re
import shutil
import stat
import sys
from time import sleep

from git import Repo
from ruamel import yaml

from scraper import scrapeFacebookData, scrapeInstagramData, scrapeTwitterData
# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
# Folder in that repo that holds the data
green_directory_data_path = 'data/countries/de'
# Local path the repo is cloned into
green_directory_local_path = './cache/green-directory'

def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path, onerror=onerror)
    Repo.clone_from(green_directory_repo, green_directory_local_path)

def dir_entries():
    """
    Iterator over all data files in the cloned green directory
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile, Loader=yaml.Loader):
                    yield doc

def onerror(func, path, _):
    """
    Error handler for ``shutil.rmtree``.
    If the error is due to an access error (read only file)
    it attempts to add write permission and then retries.
    If the error is for another reason it re-raises the error.
    Usage : ``shutil.rmtree(path, onerror=onerror)``
    """
    if not os.access(path, os.W_OK):
        # Is the error an access error?
        os.chmod(path, stat.S_IWUSR)
        func(path)
    else:
        raise
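
# Derive the page name from a Facebook URL: group URLs are skipped, URLs of
# the form .../Some-Page-123456789 yield the trailing numeric page ID
# (suspiciously short IDs are logged and dropped), and all other URLs fall
# back to the last non-empty path segment.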
def getFacebookName(url):
    if "/groups/" in url:
        return None
    match = re.match(r".+-(\d+)", url)
    if match:
        result = match.group(1)
        if len(result) < 10:
            print(url, "--", result, file=sys.stderr)
            return None
        return result
    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]
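
# Twitter and Instagram profile URLs carry the handle as the last
# non-empty path segment.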
def getTwitterName(url):
    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]


def getInstagramName(url):
    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]

def main():
    get_green_directory()
    result = {}
    idx = 0
    fbcount = 0
    twtcount = 0
    instacount = 0
    for entry in dir_entries():
        fbname = "--"
        fbLikes = 0
        fbVerified = False
        twtname = "--"
        twtFollower = 0
        twtVerified = False
        instaName = "--"
        instaFollower = 0
        instaVerified = False
        if not entry.get("urls"):
            continue
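        # Scrape each linked profile; on errors, log to stderr and move on,
        # keeping the default values for that network.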
        for url in entry["urls"]:
            if url["type"] == "FACEBOOK":
                fbname = getFacebookName(url["url"])
                if fbname:
                    try:
                        fbLikes, fbVerified = scrapeFacebookData(fbname)
                        sleep(0.1)
                    except Exception as e:
                        print("FACEBOOK ERROR for", url["url"], "--", fbname, file=sys.stderr)
                        print(e, file=sys.stderr)
                        continue
                    print(" FB", fbname, fbLikes, fbVerified)
                    fbcount += 1
            elif url["type"] == "TWITTER":
                twtname = getTwitterName(url["url"])
                try:
                    twtFollower, twtVerified = scrapeTwitterData(twtname)
                    sleep(0.1)
                except Exception as e:
                    print("TWITTER ERROR for", url["url"], "--", twtname, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
                twtcount += 1
                print(" TWITTER", twtname, twtFollower, twtVerified)
            elif url["type"] == "INSTAGRAM":
                instaName = getInstagramName(url["url"])
                try:
                    instaFollower, instaVerified = scrapeInstagramData(instaName)
                    sleep(0.1)
                except Exception as e:
                    print("INSTAGRAM ERROR for", url["url"], "--", instaName, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
                instacount += 1
                print(" INSTA", instaName, instaFollower, instaVerified)
        typ = entry.get("type", "")
        level = entry.get("level", "")
        land = entry.get("state", "")
        kreis = entry.get("district", "")
        stadt = entry.get("city", "")
        if fbname is None:
            fbname = "--"
        if fbLikes + twtFollower + instaFollower > 0:
            key = "//".join([typ, level, land, kreis, stadt])
            result[key] = [typ, level, land, kreis, stadt,
                           fbname, fbLikes, fbVerified,
                           twtname, twtFollower, twtVerified,
                           instaName, instaFollower, instaVerified]
            idx += 1
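
    # Persist the aggregated stats once all entries have been processed.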
    with open("docs/result.json", "w") as f:
        json.dump(result, f)
    print("facebook:", fbcount, "twitter:", twtcount, "instagram:", instacount)


if __name__ == "__main__":
    main()