mirror of
https://github.com/netzbegruenung/social-spider.git
synced 2024-05-04 19:53:40 +02:00
code update: scrape also twitter, cleanup, improve html table
This commit is contained in:
parent
c052384426
commit
ba0b7e189b
|
@ -4,12 +4,29 @@
|
|||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
<title>Social Spider</title>
|
||||
<link rel="stylesheet" href="https://netzbegruenung.github.io/webfonts/style.css">
|
||||
<style type="text/css">
|
||||
h1 {
|
||||
font-family: 'Arvo Gruen', sans-serif;
|
||||
font-weight: bold;
|
||||
color: #46962b;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
table {
|
||||
font-family: 'PT Sans';
|
||||
}
|
||||
.dataTables_filter {
|
||||
font-family: 'PT Sans';
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div>
|
||||
<h5>Social Spider</h5>
|
||||
<table id="datatable">
|
||||
<div class="header">
|
||||
<h1>Social Spider</h1>
|
||||
</div>
|
||||
<table id="datatable" class="display">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Typ</th>
|
||||
|
@ -27,21 +44,24 @@
|
|||
</table>
|
||||
</div>
|
||||
<script src="jquery-3.3.1.min.js"></script>
|
||||
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.19/css/jquery.dataTables.min.css">
|
||||
<script type="text/javascript" charset="utf8" src="https://cdn.datatables.net/1.10.19/js/jquery.dataTables.js"></script>
|
||||
<script type="text/javascript">
|
||||
$.getJSON("result.json" , function(data) {
|
||||
var tbl_body = document.createElement("tbody");
|
||||
var odd_even = false;
|
||||
$.each(data, function() {
|
||||
var tbl_row = tbl_body.insertRow();
|
||||
tbl_row.className = odd_even ? "odd" : "even";
|
||||
$.each(this, function(k , v) {
|
||||
var cell = tbl_row.insertCell();
|
||||
cell.appendChild(document.createTextNode(v.toString()));
|
||||
})
|
||||
odd_even = !odd_even;
|
||||
})
|
||||
})
|
||||
$("#datatable").append(tbl_body);
|
||||
});
|
||||
$('#datatable').DataTable( {
|
||||
paging: false,
|
||||
"order" : [[ 5, "desc"]]
|
||||
} );
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
21
scraper.py
21
scraper.py
|
@ -10,7 +10,7 @@ def decode(s):
|
|||
def scrapeInstagramData(username):
|
||||
url = "https://www.instagram.com/" + username
|
||||
r = requests.get(url)
|
||||
|
||||
|
||||
s = str(r.content)
|
||||
part1 = """<script type="text/javascript">window._sharedData = """
|
||||
part2 = """;</script>"""
|
||||
|
@ -27,9 +27,8 @@ def scrapeInstagramData(username):
|
|||
def scrapeFacebookLikes(username):
|
||||
url = "https://www.facebook.com/" + username
|
||||
r = requests.get(url)
|
||||
|
||||
|
||||
s = str(r.content)
|
||||
|
||||
pattern = "Gefällt ([\d\.]+) Mal"
|
||||
result = re.search(pattern, s)
|
||||
if result:
|
||||
|
@ -38,6 +37,20 @@ def scrapeFacebookLikes(username):
|
|||
print("No data found for", username, file=sys.stderr)
|
||||
return 0
|
||||
|
||||
def scrapeTwitterFollowers(username):
    """Return the follower count scraped from a Twitter profile page.

    Fetches https://www.twitter.com/<username> and extracts the number from
    the ``title="1.234 Follower"`` attribute.  Returns 0 (and logs a notice
    to stderr) when the pattern is not found in the page.

    NOTE(review): the pattern is locale-dependent — it assumes Twitter
    serves a German-language page ("Follower", "." thousands separator).
    """
    url = "https://www.twitter.com/" + username
    # timeout added: without it requests.get can block forever on a
    # stalled connection and hang the whole spider run
    r = requests.get(url, timeout=30)

    # r.text decodes the body with the response's charset; the previous
    # str(r.content) produced the bytes repr ("b'...'"), leaving non-ASCII
    # characters backslash-escaped in the string being searched
    s = r.text
    pattern = r' title="([\d\.]+) Follower"'
    result = re.search(pattern, s)
    if result:
        # "1.234.567" -> 1234567 (strip German thousands separators)
        return int(result[1].replace(".", ""))
    else:
        print("No data found for", username, file=sys.stderr)
        return 0
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: exercise each scraper once against a known account.
    print(scrapeFacebookLikes("B90DieGruenen"))
    print(scrapeTwitterFollowers("Die_Gruenen"))
    # Single Instagram fetch reused for the follower count — the previous
    # version scraped the same profile twice (pprint of the full blob plus
    # a second request just for the count).
    instaData = scrapeInstagramData("die_gruenen")
    print(instaData["edge_followed_by"]["count"])
|
||||
|
|
53
spider.py
53
spider.py
|
@ -7,7 +7,7 @@ import sys
|
|||
import re
|
||||
import twitter
|
||||
import json
|
||||
from scraper import scrapeFacebookLikes, scrapeInstagramData
|
||||
from scraper import scrapeFacebookLikes, scrapeInstagramData, scrapeTwitterFollowers
|
||||
from time import sleep
|
||||
|
||||
# Git repo for our data
|
||||
|
@ -17,12 +17,6 @@ green_directory_repo = 'https://github.com/netzbegruenung/green-directory.git'
|
|||
green_direcory_data_path = 'data/countries/de'
|
||||
green_directory_local_path = './cache/green-directory'
|
||||
|
||||
# facebook_access_token = os.getenv("secret_facebook_access_token")
|
||||
twitter_consumer_key = os.getenv("twitter_consumer_key")
|
||||
twitter_consumer_secret = os.getenv("twitter_consumer_secret")
|
||||
twitter_access_token_key = os.getenv("twitter_access_token_key")
|
||||
twitter_access_token_secret = os.getenv("twitter_access_token_secret")
|
||||
|
||||
|
||||
def get_green_directory():
|
||||
"""
|
||||
|
@ -50,7 +44,7 @@ def dir_entries():
|
|||
for doc in yaml.load_all(yamlfile, Loader=yaml.Loader):
|
||||
yield doc
|
||||
|
||||
|
||||
|
||||
def onerror(func, path, _):
|
||||
"""
|
||||
Error handler for ``shutil.rmtree``.
|
||||
|
@ -80,10 +74,10 @@ def getFacebookName(url):
|
|||
print(url, "--", result, file=sys.stderr)
|
||||
return
|
||||
return result
|
||||
|
||||
|
||||
if url.split("/")[-1]:
|
||||
return url.split("/")[-1]
|
||||
|
||||
|
||||
elif url.split("/")[-2]:
|
||||
return url.split("/")[-2]
|
||||
|
||||
|
@ -104,20 +98,13 @@ def getInstagramName(url):
|
|||
|
||||
def main():
|
||||
get_green_directory()
|
||||
|
||||
twitterAPI = twitter.Api(consumer_key=twitter_consumer_key,
|
||||
consumer_secret=twitter_consumer_secret,
|
||||
access_token_key=twitter_access_token_key,
|
||||
access_token_secret=twitter_access_token_secret)
|
||||
|
||||
# facebookGraphAPI = facebook.GraphAPI(access_token=facebook_access_token)
|
||||
|
||||
result = {}
|
||||
idx = 0
|
||||
fbcount = 0
|
||||
twtcount = 0
|
||||
instacount = 0
|
||||
|
||||
|
||||
for entry in dir_entries():
|
||||
fbname = "--"
|
||||
fbLikes = 0
|
||||
|
@ -133,54 +120,50 @@ def main():
|
|||
fbname = getFacebookName(url["url"])
|
||||
if fbname:
|
||||
try:
|
||||
# fbdata = facebookGraphAPI.get_object(fbname, fields="fan_count,username,verification_status,website")
|
||||
fbLikes = scrapeFacebookLikes(fbname)
|
||||
sleep(0.1)
|
||||
except Exception as e:
|
||||
print("FACEBOOK ERROR for", url["url"], "--", fbname, file=sys.stderr)
|
||||
print(e, file=sys.stderr)
|
||||
continue
|
||||
|
||||
print(fbname)
|
||||
print("FB", fbname, fbLikes)
|
||||
fbcount += 1
|
||||
|
||||
elif url["type"] == "TWITTER":
|
||||
twtname = getTwitterName(url["url"])
|
||||
twtcount += 1
|
||||
try:
|
||||
user = twitterAPI.GetUser(screen_name=twtname)
|
||||
twtData = user.AsDict()
|
||||
twtFollower = scrapeTwitterFollowers(twtname)
|
||||
sleep(0.1)
|
||||
except Exception as e:
|
||||
print("TWITTER ERROR for", url["url"], "--", twtname, file=sys.stderr)
|
||||
print(e, file=sys.stderr)
|
||||
continue
|
||||
twtFollower = twtData["followers_count"]
|
||||
print(twtname)
|
||||
twtcount += 1
|
||||
print("TWITTER", twtname, twtFollower)
|
||||
|
||||
elif url["type"] == "INSTAGRAM":
|
||||
instaName = getInstagramName(url["url"])
|
||||
try:
|
||||
instaData = scrapeInstagramData(instaName)
|
||||
if instaData:
|
||||
instaFollower = instaData["edge_followed_by"]["count"]
|
||||
sleep(0.1)
|
||||
except Exception as e:
|
||||
print("INSTAGRAM ERROR for", url["url"], "--", instaName, file=sys.stderr)
|
||||
print(e, file=sys.stderr)
|
||||
continue
|
||||
|
||||
if instaData:
|
||||
instaFollower = instaData["edge_followed_by"]["count"]
|
||||
print(instaName, instaFollower)
|
||||
instacount += 1
|
||||
instacount += 1
|
||||
print("INSTA", instaName, instaFollower)
|
||||
|
||||
typ = entry.get("level").split(":")[1].replace("KREISVERBAND", "KV").replace("ORTSVERBAND", "OV").replace("LANDESVERBAND", "LV").replace("BUNDESVERBAND", "BV")
|
||||
typ = entry.get("level").split(":")[1]
|
||||
land = entry.get("state", "")
|
||||
kreis = entry.get("district", "")
|
||||
stadt = entry.get("city", "")
|
||||
if fbname is None:
|
||||
fbname = "--"
|
||||
result.update({str(idx): [typ, land, kreis, stadt, fbname, fbLikes, twtname, twtFollower, instaName, instaFollower]})
|
||||
if fbLikes + twtFollower + instaFollower > 0:
|
||||
result.update({str(idx): [typ, land, kreis, stadt, fbname, fbLikes, twtname, twtFollower, instaName, instaFollower]})
|
||||
idx += 1
|
||||
#if idx == 200:
|
||||
#break
|
||||
|
||||
with open("docs/result.json", "w") as f:
|
||||
json.dump(result, f)
|
||||
|
|
Loading…
Reference in a new issue