# social-spider/spider.py

from git import Repo
import os
import shutil
from ruamel import yaml
from pprint import pprint
import sys
import re
import json
from scraper import scrapeFacebookData, scrapeInstagramData, scrapeTwitterData
from time import sleep

# Remote Git repository that is cloned by get_green_directory() and serves
# as the data source for this spider.
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'

# Folder inside that repository that holds the data files; dir_entries()
# walks it recursively looking for ``.yaml`` files.
# NOTE: the name is misspelled ("direcory") but is referenced under exactly
# this name elsewhere in the file, so it must not be renamed in isolation.
green_direcory_data_path = 'data/countries/de'
# Local checkout location. get_green_directory() deletes this folder (if it
# exists) and re-clones the repository into it.
green_directory_local_path = './cache/green-directory'


def get_green_directory():
    """
    Fetch a fresh copy of the green directory via git.

    Any checkout left over from a previous run is removed first, then the
    repository is cloned into the local cache folder.
    """
    checkout_dir = green_directory_local_path

    # Always start from a clean clone; ``onerror`` deals with read-only
    # files that would otherwise make the removal fail.
    stale_checkout = os.path.exists(checkout_dir)
    if stale_checkout:
        shutil.rmtree(checkout_dir, onerror=onerror)

    Repo.clone_from(green_directory_repo, checkout_dir)


def dir_entries():
    """
    Iterator over all data entries in the cloned green directory.

    Walks the data folder of the local clone recursively, opens every file
    whose name ends in ``.yaml`` and yields each YAML document found in it
    (a single file may contain several documents).
    """
    path = os.path.join(green_directory_local_path, green_direcory_data_path)
    for root, _dirs, files in os.walk(path):
        for fname in files:
            if not fname.endswith(".yaml"):
                continue

            filepath = os.path.join(root, fname)

            # Decode explicitly as UTF-8. Without an encoding, open() falls
            # back to the locale's default, which mis-decodes (or fails on)
            # non-ASCII text on platforms whose default is not UTF-8.
            with open(filepath, 'r', encoding='utf-8') as yamlfile:
                # NOTE(security): yaml.Loader is the full, unsafe loader and
                # can construct arbitrary Python objects. The input comes
                # from a cloned remote repository; consider switching to a
                # safe loader if the documents only use plain YAML types.
                yield from yaml.load_all(yamlfile, Loader=yaml.Loader)


def onerror(func, path, _):
    """
    Recovery callback to be passed to ``shutil.rmtree``.

    When ``path`` is not writable, the failure is treated as a permission
    problem: write permission is granted and the failed operation ``func``
    is retried once on ``path``.

    When ``path`` is already writable, the failure has some other cause and
    the exception currently being handled is re-raised unchanged.

    The third argument (exception info supplied by ``rmtree``) is unused.

    Usage : ``shutil.rmtree(path, onerror=onerror)``
    """
    import stat

    path_is_writable = os.access(path, os.W_OK)
    if path_is_writable:
        # Not a read-only problem: propagate the active exception.
        raise

    # Read-only entry: make it writable, then retry the operation.
    os.chmod(path, stat.S_IWUSR)
    func(path)


def getFacebookName(url):
    """
    Extract the Facebook page identifier from a profile URL.

    Rules, applied in order:

    * Empty URLs and group URLs (containing ``/groups/``) yield ``None``.
    * If the URL contains ``-<digits>``, the digits after the last such
      dash are taken as the numeric page id. Ids shorter than 10 digits
      are reported on stderr and rejected with ``None``.
    * Otherwise the last non-empty path segment is returned, so a single
      trailing slash is tolerated.

    Returns the identifier as a string, or ``None`` if none was found.
    """
    # An empty URL used to raise IndexError further down; treat it as
    # "no name" instead.
    if not url or "/groups/" in url:
        return None

    # Match once and reuse the result (greedy ``.+`` picks the last
    # "-<digits>" occurrence).
    id_match = re.match(r".+-(\d+)", url)
    if id_match:
        page_id = id_match.group(1)
        if len(page_id) < 10:
            # Presumably too short to be a real page id; log it for review.
            print(url, "--", page_id, file=sys.stderr)
            return None
        return page_id

    # Prefer the last segment; fall back to the one before it when the URL
    # ends with a slash.
    for segment in reversed(url.split("/")[-2:]):
        if segment:
            return segment
    return None


def getTwitterName(url):
    """
    Extract the Twitter account name from a profile URL.

    Returns the last non-empty path segment of ``url``, tolerating a single
    trailing slash. Returns ``None`` when the URL is empty or when neither
    of its last two segments contains any text.
    """
    # An empty URL used to raise IndexError on ``split("/")[-2]``.
    if not url:
        return None

    # Prefer the last segment; fall back to the one before it when the URL
    # ends with a slash.
    for segment in reversed(url.split("/")[-2:]):
        if segment:
            return segment
    return None


def getInstagramName(url):
    """
    Extract the Instagram account name from a profile URL.

    Returns the last non-empty path segment of ``url``, tolerating a single
    trailing slash. Returns ``None`` when the URL is empty or when neither
    of its last two segments contains any text.
    """
    # An empty URL used to raise IndexError on ``split("/")[-2]``.
    if not url:
        return None

    # Prefer the last segment; fall back to the one before it when the URL
    # ends with a slash.
    for segment in reversed(url.split("/")[-2:]):
        if segment:
            return segment
    return None


def _scrape_account(platform, tag, scrape, link, name):
    """
    Scrape a single social media account and report the outcome.

    ``scrape`` is called with ``name`` and is expected to return a
    ``(count, verified)`` pair. On success the pair is printed to stdout
    prefixed with ``tag`` and returned. Any exception is reported on stderr
    (prefixed with ``platform`` and mentioning ``link``) and ``None`` is
    returned, so that one failing profile does not abort the whole run.
    """
    try:
        count, verified = scrape(name)
        # Short pause after every successful request.
        sleep(0.1)
    except Exception as e:
        print(platform, "ERROR for", link, "--", name, file=sys.stderr)
        print(e, file=sys.stderr)
        return None
    print(tag, name, count, verified)
    return count, verified


def main(limit=50):
    """
    Clone the green directory, scrape the social media accounts listed in
    its entries and write the collected numbers to ``docs/result.json``.

    For every entry that has URLs, the Facebook, Twitter and Instagram
    links are scraped. Entries with at least one like/follower are stored
    in the result under the key ``type//level//state//district//city``.
    A per-platform summary of successful scrapes is printed at the end.

    :param limit: Maximum number of entries with URLs to process before
        stopping (positive integer). Defaults to 50, the value that was
        previously hard-coded. Pass ``None`` to process all entries.
    """
    get_green_directory()

    result = {}
    processed = 0
    fbcount = 0
    twtcount = 0
    instacount = 0

    for entry in dir_entries():
        # Skip empty YAML documents and entries without any URLs.
        if not entry or not entry.get("urls"):
            continue

        fbname = "--"
        fbLikes = 0
        fbVerified = False
        twtname = "--"
        twtFollower = 0
        twtVerified = False
        instaName = "--"
        instaFollower = 0
        instaVerified = False

        for url in entry["urls"]:
            if url["type"] == "FACEBOOK":
                fbname = getFacebookName(url["url"])
                # Groups and unparsable links yield no name; nothing to scrape.
                if fbname:
                    scraped = _scrape_account(
                        "FACEBOOK", " FB", scrapeFacebookData, url["url"], fbname)
                    if scraped is not None:
                        fbLikes, fbVerified = scraped
                        fbcount += 1

            elif url["type"] == "TWITTER":
                twtname = getTwitterName(url["url"])
                scraped = _scrape_account(
                    "TWITTER", " TWITTER", scrapeTwitterData, url["url"], twtname)
                if scraped is not None:
                    twtFollower, twtVerified = scraped
                    twtcount += 1

            elif url["type"] == "INSTAGRAM":
                instaName = getInstagramName(url["url"])
                scraped = _scrape_account(
                    "INSTAGRAM", " INSTA", scrapeInstagramData, url["url"], instaName)
                if scraped is not None:
                    instaFollower, instaVerified = scraped
                    instacount += 1

        # Missing or null fields become empty strings so that building the
        # key below cannot fail with a TypeError.
        typ = entry.get("type") or ""
        level = entry.get("level") or ""
        land = entry.get("state") or ""
        kreis = entry.get("district") or ""
        stadt = entry.get("city") or ""
        if fbname is None:
            fbname = "--"

        # Only keep entries for which at least one number was scraped.
        if fbLikes + twtFollower + instaFollower > 0:
            key = "//".join([typ, level, land, kreis, stadt])
            result[key] = [
                typ, level, land, kreis, stadt,
                fbname, fbLikes, fbVerified,
                twtname, twtFollower, twtVerified,
                instaName, instaFollower, instaVerified,
            ]

        processed += 1
        if limit is not None and processed >= limit:
            break

    # Make sure the output folder exists before writing the result.
    os.makedirs("docs", exist_ok=True)
    with open("docs/result.json", "w") as f:
        json.dump(result, f)

    print("facebook:", fbcount, "twitter:", twtcount, "instagram:", instacount)


# Run the spider only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()