social-spider/scraper.py
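"""Scrape follower/like counts and verification status from the public
Instagram, Facebook, and Twitter profile pages of a given account."""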

import requests
import json
from pprint import pprint
import re
import sys

def decode(s):
    # Undo the extra backslash escaping introduced by taking str() of the response
    # bytes, so the embedded JSON can be handed to json.loads().
    return s.replace("\\'", "'").replace("\\\\\"", "\\\"")

def scrapeInstagramData(username):
    url = "https://www.instagram.com/" + username
    r = requests.get(url)

    # The profile data is embedded as JSON in the window._sharedData script tag.
    s = str(r.content)
    part1 = """<script type="text/javascript">window._sharedData = """
    part2 = """;</script>"""
    pattern = part1 + "(.*?)" + part2
    result = re.search(pattern, s)
    if result:
        decoded = decode(result[1])
        data = json.loads(decoded)
        user = data["entry_data"]["ProfilePage"][0]["graphql"]["user"]
        # Blank out the bulky timeline media edges; only the profile metadata matters here.
        user["edge_owner_to_timeline_media"]["edges"] = "----"
        return user["edge_followed_by"]["count"], user["is_verified"]
    else:
        print("No data found for", username, file=sys.stderr)
        return 0, False

def scrapeFacebookData(username):
    url = "https://www.facebook.com/" + username
    r = requests.get(url)
    s = str(r.content)

    # The page is served in German: "Das blaue Verifizierungsabzeichen" ("the blue
    # verification badge") only shows up on verified pages, and the like count is
    # rendered as "Gef&#xe4;llt 1.234.567 Mal" ("liked 1,234,567 times").
    verified = "Das blaue Verifizierungsabzeichen" in s
    pattern = r"Gef&#xe4;llt ([\d\.]+) Mal"
    result = re.search(pattern, s)
    if result:
        # German numbers use "." as the thousands separator.
        return int(result[1].replace(".", "")), verified
    else:
        print("No data found for", username, file=sys.stderr)
        return 0, verified

def scrapeTwitterData(username):
    url = "https://www.twitter.com/" + username
    r = requests.get(url)
    s = str(r.content)

    # Verified profiles carry a badge inside the ProfileHeaderCard-badges element;
    # the follower count sits in a German title attribute such as 'title="12.345 Follower"'.
    verified = "ProfileHeaderCard-badges" in s
    pattern = r' title="([\d\.]+) Follower"'
    result = re.search(pattern, s)
    if result:
        return int(result[1].replace(".", "")), verified
    else:
        print("No data found for", username, file=sys.stderr)
        return 0, verified

if __name__ == '__main__':
    print(scrapeFacebookData("B90DieGruenen"))
    print(scrapeTwitterData("Die_Gruenen"))
    print(scrapeInstagramData("die_gruenen"))