code update: scrape also twitter, cleanup, improve html table

Lukas Mehl 2019-06-14 20:24:33 +02:00
parent c052384426
commit ba0b7e189b
4 changed files with 63 additions and 47 deletions

View file

@@ -4,12 +4,29 @@
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <title>Social Spider</title>
  <link rel="stylesheet" href="https://netzbegruenung.github.io/webfonts/style.css">
+ <style type="text/css">
+   h1 {
+     font-family: 'Arvo Gruen', sans-serif;
+     font-weight: bold;
+     color: #46962b;
+     text-transform: uppercase;
+   }
+   table {
+     font-family: 'PT Sans';
+   }
+   .dataTables_filter {
+     font-family: 'PT Sans';
+   }
+ </style>
</head>
<body>
  <div>
-   <h5>Social Spider</h5>
-   <table id="datatable">
+   <div class="header">
+     <h1>Social Spider</h1>
+   </div>
+   <table id="datatable" class="display">
    <thead>
    <tr>
      <th>Typ</th>
@@ -27,21 +44,24 @@
  </table>
  </div>
  <script src="jquery-3.3.1.min.js"></script>
+ <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.19/css/jquery.dataTables.min.css">
+ <script type="text/javascript" charset="utf8" src="https://cdn.datatables.net/1.10.19/js/jquery.dataTables.js"></script>
  <script type="text/javascript">
    $.getJSON("result.json" , function(data) {
      var tbl_body = document.createElement("tbody");
-     var odd_even = false;
      $.each(data, function() {
        var tbl_row = tbl_body.insertRow();
-       tbl_row.className = odd_even ? "odd" : "even";
        $.each(this, function(k , v) {
          var cell = tbl_row.insertCell();
          cell.appendChild(document.createTextNode(v.toString()));
        })
-       odd_even = !odd_even;
      })
-   })
      $("#datatable").append(tbl_body);
+     $('#datatable').DataTable( {
+       paging: false,
+       "order" : [[ 5, "desc"]]
+     } );
+   });
  </script>
</body>
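
Note: with paging disabled the whole list renders at once, and order: [[ 5, "desc"]] presumably sorts it by column index 5, the Facebook likes field, given the row layout [typ, land, kreis, stadt, fbname, fbLikes, twtname, twtFollower, instaName, instaFollower] assembled by the spider below.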

File diff suppressed because one or more lines are too long

View file

@@ -10,7 +10,7 @@ def decode(s):
def scrapeInstagramData(username):
    url = "https://www.instagram.com/" + username
    r = requests.get(url)
    s = str(r.content)
    part1 = """<script type="text/javascript">window._sharedData = """
    part2 = """;</script>"""
@@ -27,9 +27,8 @@ def scrapeInstagramData(username):
def scrapeFacebookLikes(username):
    url = "https://www.facebook.com/" + username
    r = requests.get(url)
    s = str(r.content)
    pattern = "Gef&#xe4;llt ([\d\.]+) Mal"
    result = re.search(pattern, s)
    if result:
@@ -38,6 +37,20 @@ def scrapeFacebookLikes(username):
        print("No data found for", username, file=sys.stderr)
        return 0

+def scrapeTwitterFollowers(username):
+    url = "https://www.twitter.com/" + username
+    r = requests.get(url)
+    s = str(r.content)
+    pattern = r' title="([\d\.]+) Follower"'
+    result = re.search(pattern, s)
+    if result:
+        return int(result[1].replace(".", ""))
+    else:
+        print("No data found for", username, file=sys.stderr)
+        return 0

if __name__ == '__main__':
-    pprint(scrapeInstagramData("die_gruenen"))
    print(scrapeFacebookLikes("B90DieGruenen"))
+    print(scrapeTwitterFollowers("Die_Gruenen"))
+    print(scrapeInstagramData("die_gruenen")["edge_followed_by"]["count"])
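
All three scrapers share the same fetch-then-regex shape. A minimal sketch of that common pattern (scrape_count is a hypothetical helper, not part of the commit):

import re
import sys
import requests

def scrape_count(url, pattern):
    # Fetch the page and regex over the undecoded payload,
    # as the scrapers above do with str(r.content).
    r = requests.get(url)
    s = str(r.content)
    result = re.search(pattern, s)
    if result:
        # Counts use dots as thousands separators ("12.345"),
        # so strip them before converting.
        return int(result[1].replace(".", ""))
    print("No data found for", url, file=sys.stderr)
    return 0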

View file

@@ -7,7 +7,7 @@ import sys
import re
-import twitter
import json
-from scraper import scrapeFacebookLikes, scrapeInstagramData
+from scraper import scrapeFacebookLikes, scrapeInstagramData, scrapeTwitterFollowers
+from time import sleep
# Git repo for our data
@@ -17,12 +17,6 @@ green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
green_direcory_data_path = 'data/countries/de'
green_directory_local_path = './cache/green-directory'

# facebook_access_token = os.getenv("secret_facebook_access_token")
-twitter_consumer_key = os.getenv("twitter_consumer_key")
-twitter_consumer_secret = os.getenv("twitter_consumer_secret")
-twitter_access_token_key = os.getenv("twitter_access_token_key")
-twitter_access_token_secret = os.getenv("twitter_access_token_secret")

def get_green_directory():
    """
@@ -50,7 +44,7 @@ def dir_entries():
            for doc in yaml.load_all(yamlfile, Loader=yaml.Loader):
                yield doc

def onerror(func, path, _):
    """
    Error handler for ``shutil.rmtree``.
@@ -80,10 +74,10 @@ def getFacebookName(url):
        print(url, "--", result, file=sys.stderr)
        return
-    return result
+    if url.split("/")[-1]:
+        return url.split("/")[-1]
+    elif url.split("/")[-2]:
+        return url.split("/")[-2]
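
For the record, the url.split("/") construction picks the last non-empty path segment, so page URLs with and without a trailing slash both resolve to the page name; a quick illustration (the URL is invented):

url = "https://www.facebook.com/B90DieGruenen/"
url.split("/")[-1]   # "" for a trailing slash, which is falsy
url.split("/")[-2]   # "B90DieGruenen"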
@@ -104,20 +98,13 @@ def getInstagramName(url):
def main():
    get_green_directory()
-    twitterAPI = twitter.Api(consumer_key=twitter_consumer_key,
-                             consumer_secret=twitter_consumer_secret,
-                             access_token_key=twitter_access_token_key,
-                             access_token_secret=twitter_access_token_secret)
-    # facebookGraphAPI = facebook.GraphAPI(access_token=facebook_access_token)

    result = {}
    idx = 0
    fbcount = 0
    twtcount = 0
    instacount = 0

    for entry in dir_entries():
        fbname = "--"
        fbLikes = 0
@@ -133,54 +120,50 @@ def main():
                fbname = getFacebookName(url["url"])
                if fbname:
                    try:
-                        # fbdata = facebookGraphAPI.get_object(fbname, fields="fan_count,username,verification_status,website")
                        fbLikes = scrapeFacebookLikes(fbname)
+                        sleep(0.1)
                    except Exception as e:
                        print("FACEBOOK ERROR for", url["url"], "--", fbname, file=sys.stderr)
                        print(e, file=sys.stderr)
                        continue
-                    print(fbname)
+                    print("FB", fbname, fbLikes)
                    fbcount += 1
            elif url["type"] == "TWITTER":
                twtname = getTwitterName(url["url"])
-                twtcount += 1
                try:
-                    user = twitterAPI.GetUser(screen_name=twtname)
-                    twtData = user.AsDict()
+                    twtFollower = scrapeTwitterFollowers(twtname)
+                    sleep(0.1)
                except Exception as e:
                    print("TWITTER ERROR for", url["url"], "--", twtname, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
-                twtFollower = twtData["followers_count"]
-                print(twtname)
+                twtcount += 1
+                print("TWITTER", twtname, twtFollower)
            elif url["type"] == "INSTAGRAM":
                instaName = getInstagramName(url["url"])
                try:
                    instaData = scrapeInstagramData(instaName)
-                    if instaData:
-                        instaFollower = instaData["edge_followed_by"]["count"]
+                    sleep(0.1)
                except Exception as e:
                    print("INSTAGRAM ERROR for", url["url"], "--", instaName, file=sys.stderr)
                    print(e, file=sys.stderr)
                    continue
-                print(instaName, instaFollower)
-                instacount += 1
+                if instaData:
+                    instaFollower = instaData["edge_followed_by"]["count"]
+                    instacount += 1
+                print("INSTA", instaName, instaFollower)

-            typ = entry.get("level").split(":")[1].replace("KREISVERBAND", "KV").replace("ORTSVERBAND", "OV").replace("LANDESVERBAND", "LV").replace("BUNDESVERBAND", "BV")
+            typ = entry.get("level").split(":")[1]
            land = entry.get("state", "")
            kreis = entry.get("district", "")
            stadt = entry.get("city", "")
            if fbname is None:
                fbname = "--"
-            result.update({str(idx): [typ, land, kreis, stadt, fbname, fbLikes, twtname, twtFollower, instaName, instaFollower]})
+            if fbLikes + twtFollower + instaFollower > 0:
+                result.update({str(idx): [typ, land, kreis, stadt, fbname, fbLikes, twtname, twtFollower, instaName, instaFollower]})
            idx += 1
            #if idx == 200:
            #break

    with open("docs/result.json", "w") as f:
        json.dump(result, f)
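
For reference, docs/result.json ends up as a flat object keyed by a running index, each value one row in the column order of the HTML table above. A minimal sketch with invented values:

{
    "0": ["KREISVERBAND", "Bayern", "Musterkreis", "Musterstadt",
          "fb-page-name", 1234, "twitter_handle", 567, "insta_name", 89]
}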