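"""
Collects social media statistics for the entries of the green directory:
Facebook likes, Twitter and Instagram follower counts, and verification
status. The aggregated result is written to docs/result.json.
"""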

from git import Repo
import os
import shutil
from ruamel import yaml
from pprint import pprint
import sys
import re
import json
from scraper import scrapeFacebookData, scrapeInstagramData, scrapeTwitterData
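# Each scrape*Data helper is expected to return a (count, verified) tuple.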
from time import sleep
# Git repo for our data
green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
# folder in that repo that holds the data
green_directory_data_path = 'data/countries/de'
green_directory_local_path = './cache/green-directory'


def get_green_directory():
    """
    Clones the source of website URLs, the green directory,
    into the local file system using git.
    """
    if os.path.exists(green_directory_local_path):
        shutil.rmtree(green_directory_local_path, onerror=onerror)
    Repo.clone_from(green_directory_repo, green_directory_local_path)


def dir_entries():
    """
    Iterator over all data files in the cloned green directory.
    """
    path = os.path.join(green_directory_local_path, green_directory_data_path)
    for root, dirs, files in os.walk(path):
        for fname in files:
            filepath = os.path.join(root, fname)
            if not filepath.endswith(".yaml"):
                continue
            with open(filepath, 'r') as yamlfile:
                for doc in yaml.load_all(yamlfile, Loader=yaml.Loader):
                    yield doc


def onerror(func, path, _):
    """
    Error handler for ``shutil.rmtree``.
    If the error is due to an access error (read only file)
    it attempts to add write permission and then retries.
    If the error is for another reason it re-raises the error.
    Usage: ``shutil.rmtree(path, onerror=onerror)``
    """
    import stat
    if not os.access(path, os.W_OK):
        # Is the error an access error?
        os.chmod(path, stat.S_IWUSR)
        func(path)
    else:
        raise


def getFacebookName(url):
    """
    Extracts the Facebook page name (or numeric page ID) from a profile URL.
    Returns None for group URLs and when a trailing numeric ID has fewer
    than 10 digits.
    """
    if "/groups/" in url:
        return None
    match = re.match(r".+-(\d+)", url)
    if match:
        result = match.group(1)
        if len(result) < 10:
            print(url, "--", result, file=sys.stderr)
            return None
        return result

    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]


def getTwitterName(url):
    """
    Returns the last non-empty path segment of the profile URL (the handle).
    """
    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]


def getInstagramName(url):
    """
    Returns the last non-empty path segment of the profile URL (the account name).
    """
    if url.split("/")[-1]:
        return url.split("/")[-1]
    elif url.split("/")[-2]:
        return url.split("/")[-2]


def main():
    get_green_directory()

    result = {}
    idx = 0

    fbcount = 0
    twtcount = 0
    instacount = 0

    for entry in dir_entries():
        fbname = "--"
        fbLikes = 0
        fbVerified = False
        twtname = "--"
        twtFollower = 0
        twtVerified = False
        instaName = "--"
        instaFollower = 0
        instaVerified = False

        if not entry.get("urls"):
            continue

        for url in entry["urls"]:
            if url["type"] == "FACEBOOK":
                fbname = getFacebookName(url["url"])
                if fbname:
                    try:
                        fbLikes, fbVerified = scrapeFacebookData(fbname)
                        sleep(0.1)
                    except Exception as e:
                        print("FACEBOOK ERROR for", url["url"], "--", fbname, file=sys.stderr)
                        print(e, file=sys.stderr)
                        continue
                    print(" FB", fbname, fbLikes, fbVerified)
                    fbcount += 1
            elif url["type"] == "TWITTER":
                twtname = getTwitterName(url["url"])
                try:
                    twtFollower, twtVerified = scrapeTwitterData(twtname)
                    sleep(0.1)
                except Exception as e:
                    print("TWITTER ERROR for", url["url"], "--", twtname, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
                twtcount += 1
                print(" TWITTER", twtname, twtFollower, twtVerified)
            elif url["type"] == "INSTAGRAM":
                instaName = getInstagramName(url["url"])
                try:
                    instaFollower, instaVerified = scrapeInstagramData(instaName)
                    sleep(0.1)
                except Exception as e:
                    print("INSTAGRAM ERROR for", url["url"], "--", instaName, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
                instacount += 1
                print(" INSTA", instaName, instaFollower, instaVerified)

        typ = entry.get("type")
        level = entry.get("level", "")
        land = entry.get("state", "")
        kreis = entry.get("district", "")
        stadt = entry.get("city", "")

        if fbname is None:
            fbname = "--"

        if fbLikes + twtFollower + instaFollower > 0:
            key = "//".join([typ, level, land, kreis, stadt])
            result.update({key: [typ, level, land, kreis, stadt,
                                 fbname, fbLikes, fbVerified,
                                 twtname, twtFollower, twtVerified,
                                 instaName, instaFollower, instaVerified]})
            idx += 1

    with open("docs/result.json", "w") as f:
        json.dump(result, f)

    print("facebook:", fbcount, "twitter:", twtcount, "instagram:", instacount)


if __name__ == "__main__":
    main()