social-spider/scraper.py
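"""Scrape follower/like counts and verification status from the public
Instagram, Facebook, and Twitter profile pages of a given account."""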

import requests
import json
from pprint import pprint
import re
import sys

def decode(s):
    # Undo the extra backslash escaping introduced by taking str() of the response
    # bytes, so the embedded JSON can be handed to json.loads().
    return s.replace("\\'", "'").replace("\\\\\"", "\\\"")

def scrapeInstagramData(username):
    url = "https://www.instagram.com/" + username
    r = requests.get(url)

    # The profile data is embedded as JSON in the window._sharedData script tag.
    s = str(r.content)
    part1 = """<script type="text/javascript">window._sharedData = """
    part2 = """;</script>"""
    pattern = part1 + "(.*?)" + part2
    result = re.search(pattern, s)
    if result:
        decoded = decode(result[1])
        data = json.loads(decoded)
        user = data["entry_data"]["ProfilePage"][0]["graphql"]["user"]
        # Blank out the bulky timeline media edges; only the profile metadata matters here.
        user["edge_owner_to_timeline_media"]["edges"] = "----"
        return user["edge_followed_by"]["count"], user["is_verified"]
    else:
        print("No data found for", username, file=sys.stderr)
        return 0, False

def scrapeFacebookData(username):
    url = "https://www.facebook.com/" + username
    r = requests.get(url)
    s = str(r.content)

    # The page is served in German: "Das blaue Verifizierungsabzeichen" ("the blue
    # verification badge") only shows up on verified pages, and the like count is
    # rendered as "Gef&#xe4;llt 1.234.567 Mal" ("liked 1,234,567 times").
    verified = "Das blaue Verifizierungsabzeichen" in s
    pattern = r"Gef&#xe4;llt ([\d\.]+) Mal"
    result = re.search(pattern, s)
    if result:
        # German numbers use "." as the thousands separator.
        return int(result[1].replace(".", "")), verified
    else:
        print("No data found for", username, file=sys.stderr)
        return 0, verified

def scrapeTwitterData(username):
    url = "https://www.twitter.com/" + username
    r = requests.get(url)
    s = str(r.content)

    # Verified profiles carry a badge inside the ProfileHeaderCard-badges element;
    # the follower count sits in a German title attribute such as 'title="12.345 Follower"'.
    verified = "ProfileHeaderCard-badges" in s
    pattern = r' title="([\d\.]+) Follower"'
    result = re.search(pattern, s)
    if result:
        return int(result[1].replace(".", "")), verified
    else:
        print("No data found for", username, file=sys.stderr)
        return 0, verified

if __name__ == '__main__':
    print(scrapeFacebookData("B90DieGruenen"))
    print(scrapeTwitterData("Die_Gruenen"))
    print(scrapeInstagramData("die_gruenen"))