# Mirror of https://github.com/netzbegruenung/social-spider.git
# (synced 2024-05-04 11:43:40 +02:00; 180 lines, 5.3 KiB, Python)
from git import Repo
|
|
import os
|
|
import shutil
|
|
from ruamel import yaml
|
|
from pprint import pprint
|
|
import sys
|
|
import re
|
|
import json
|
|
from scraper import scrapeFacebookData, scrapeInstagramData, scrapeTwitterData
|
|
from time import sleep
|
|
|
|
# Where the data comes from: the green directory git repository
green_directory_repo = "https://github.com/netzbegruenung/green-directory.git"

# Sub-folder of that repository containing the data files.
# NOTE: the identifier is misspelled ("direcory"); it is kept unchanged
# because other code in this file refers to it by this name.
green_direcory_data_path = "data/countries/de"

# Local folder the repository gets cloned into
green_directory_local_path = "./cache/green-directory"
|
|
|
|
|
|
def get_green_directory():
    """
    Fetches a fresh copy of the green directory (the source of
    website URLs) by cloning its git repository into the local
    file system.

    A directory tree already present at the local path is removed
    before cloning.
    """
    target = green_directory_local_path
    already_present = os.path.exists(target)
    if already_present:
        # remove the previous checkout; failures are passed to `onerror`
        shutil.rmtree(target, onerror=onerror)
    Repo.clone_from(green_directory_repo, target)
|
|
|
|
|
|
def dir_entries():
    """
    Iterator over all data files in the cloned green directory

    Walks the data folder of the local clone recursively, opens every
    file whose path ends with ".yaml" and yields each YAML document
    found in it.
    """
    path = os.path.join(green_directory_local_path, green_direcory_data_path)
    for root, _dirs, files in os.walk(path):
        for fname in files:

            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue

            # Decode explicitly instead of relying on the platform default
            # encoding, which differs between systems.
            # (assumes the data files are UTF-8 - TODO confirm)
            with open(filepath, 'r', encoding='utf-8') as yamlfile:
                # NOTE(review): yaml.Loader is not a safe loader. The input
                # is whatever the cloned repository contains, so consider a
                # safe loader if that content is not fully trusted.
                yield from yaml.load_all(yamlfile, Loader=yaml.Loader)
|
|
|
|
|
|
def onerror(func, path, _):
    """
    Error handler for ``shutil.rmtree``.

    If the error is due to an access error (read only file)
    it attempts to add write permission and then retries.

    If the error is for another reason it re-raises the error.

    Usage : ``shutil.rmtree(path, onerror=onerror)``

    :param func: the function that failed; it is called again with ``path``
        after write permission has been added
    :param path: the path ``func`` failed on
    :param _: error information, either an ``(type, value, traceback)``
        tuple or an exception instance
    """
    import stat

    if not os.access(path, os.W_OK):
        # Is the error an access error ?
        os.chmod(path, stat.S_IWUSR)
        func(path)
        return

    # Not a permission problem: propagate the error we were handed rather
    # than depending on being called from inside an ``except`` block.
    error = _[1] if isinstance(_, tuple) and len(_) > 1 else _
    if isinstance(error, BaseException):
        raise error

    # No usable error information was passed in; fall back to re-raising
    # the exception that is currently being handled.
    raise
|
|
|
|
|
|
def getFacebookName(url):
    """
    Extracts the Facebook page identifier from a page URL.

    - URLs containing "/groups/" yield None.
    - If the URL contains a hyphen followed by digits, the digits after
      the last such hyphen are used as the identifier. Digit runs shorter
      than 10 characters are reported on stderr and None is returned.
    - Otherwise the last path segment is returned, or the one before it
      when the URL ends with a slash.

    Returns None if no identifier could be determined.
    """
    if "/groups/" in url:
        return None

    numeric = re.match(r".+-(\d+)", url)
    if numeric:
        result = numeric.group(1)
        if len(result) < 10:
            print(url, "--", result, file=sys.stderr)
            return None
        return result

    # Look at the last segment first, then at the one before it (URL with
    # trailing slash). Slicing copes with URLs that contain no "/" at all.
    for segment in reversed(url.split("/")[-2:]):
        if segment:
            return segment
    return None
|
|
|
|
|
|
def getTwitterName(url):
    """
    Extracts the account name from a Twitter profile URL.

    Returns the last path segment of the URL, or the one before it when
    the URL ends with a slash. Returns None if both are empty or the URL
    is empty.
    """
    # Slicing (instead of indexing [-2]) copes with URLs without any "/".
    for segment in reversed(url.split("/")[-2:]):
        if segment:
            return segment
    return None
|
|
|
|
|
|
def getInstagramName(url):
    """
    Extracts the account name from an Instagram profile URL.

    Returns the last path segment of the URL, or the one before it when
    the URL ends with a slash. Returns None if both are empty or the URL
    is empty.
    """
    # Slicing (instead of indexing [-2]) copes with URLs without any "/".
    for segment in reversed(url.split("/")[-2:]):
        if segment:
            return segment
    return None
|
|
|
|
|
|
def main():
    """
    Collects social media figures for the entries of the green directory.

    Clones the green directory, then for every entry that has ``urls``
    looks up its Facebook, Twitter and Instagram accounts through the
    scraper functions. Entries for which at least one figure is greater
    than zero are stored under the key
    "type//level//state//district//city". The collected data is written
    to ``docs/result.json`` and the number of successfully scraped
    accounts per network is printed at the end.

    Scraper failures are reported on stderr and do not abort the run.
    """
    get_green_directory()

    result = {}
    idx = 0
    fbcount = 0
    twtcount = 0
    instacount = 0

    for entry in dir_entries():
        # per-entry defaults; "--" is the placeholder for "no account"
        fbname = "--"
        fbLikes = 0
        fbVerified = False
        twtname = "--"
        twtFollower = 0
        twtVerified = False
        instaName = "--"
        instaFollower = 0
        instaVerified = False

        if not entry.get("urls"):
            continue
        for url in entry["urls"]:
            if url["type"] == "FACEBOOK":
                fbname = getFacebookName(url["url"])
                if fbname:
                    try:
                        fbLikes, fbVerified = scrapeFacebookData(fbname)
                        sleep(0.1)
                    except Exception as e:
                        print("FACEBOOK ERROR for", url["url"], "--", fbname, file=sys.stderr)
                        print(e, file=sys.stderr)
                        continue
                    print(" FB", fbname, fbLikes, fbVerified)
                    fbcount += 1

            elif url["type"] == "TWITTER":
                twtname = getTwitterName(url["url"])
                try:
                    twtFollower, twtVerified = scrapeTwitterData(twtname)
                    sleep(0.1)
                except Exception as e:
                    print("TWITTER ERROR for", url["url"], "--", twtname, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
                twtcount += 1
                print(" TWITTER", twtname, twtFollower, twtVerified)

            elif url["type"] == "INSTAGRAM":
                instaName = getInstagramName(url["url"])
                try:
                    instaFollower, instaVerified = scrapeInstagramData(instaName)
                    sleep(0.1)
                except Exception as e:
                    print("INSTAGRAM ERROR for", url["url"], "--", instaName, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
                instacount += 1
                print(" INSTA", instaName, instaFollower, instaVerified)

        # `or ""` also covers keys that are absent or explicitly null, which
        # would otherwise make the "//".join below fail with a TypeError.
        typ = entry.get("type") or ""
        level = entry.get("level") or ""
        land = entry.get("state") or ""
        kreis = entry.get("district") or ""
        stadt = entry.get("city") or ""

        # the name helpers may return None; store the placeholder instead
        if fbname is None:
            fbname = "--"
        if twtname is None:
            twtname = "--"
        if instaName is None:
            instaName = "--"

        if fbLikes + twtFollower + instaFollower > 0:
            key = "//".join([typ, level, land, kreis, stadt])
            result[key] = [typ, level, land, kreis, stadt, fbname, fbLikes, fbVerified, twtname, twtFollower, twtVerified, instaName, instaFollower, instaVerified]
            # NOTE(review): the nesting of this counter was reconstructed;
            # it is assumed to count recorded entries - confirm.
            idx += 1
        # stop once the counter reaches 50
        if idx == 50:
            break

    # make sure the output folder exists so the results are not lost
    os.makedirs("docs", exist_ok=True)
    with open("docs/result.json", "w") as f:
        json.dump(result, f)

    print("facebook:", fbcount, "twitter:", twtcount, "instagram:", instacount)
|
|
|
|
|
|
# Script entry point: start the spider only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    main()
|