import requests
import json

# XXX: GitHub removed username/password (Basic) authentication for the
# /authorizations endpoint in November 2020. If this request fails, create a
# personal access token at https://github.com/settings/tokens instead.
username = ''  # Your GitHub username
password = ''  # Your GitHub password

# Note that credentials will be transmitted over a secure SSL connection
url = 'https://api.github.com/authorizations'
note = 'Mining the Social Web - Mining Github'
post_data = {'scopes': ['repo'], 'note': note}

response = requests.post(
    url,
    auth=(username, password),
    data=json.dumps(post_data),
)

print("API response:", response.text)
print()

# Fail with a clear message instead of an opaque KeyError when the request
# was rejected (bad credentials, 2FA required, deprecated endpoint, ...).
token = response.json().get('token')
if token is None:
    raise RuntimeError('Token request failed: %s' % response.text)
print("Your OAuth token is", token)
# Go to https://github.com/settings/tokens to revoke this token
import json
import requests

# An unauthenticated request that doesn't contain an ?access_token=xxx query string
url = "https://api.github.com/repos/ptwobrussell/Mining-the-Social-Web/stargazers"
response = requests.get(url)

# Display one stargazer
print(json.dumps(response.json()[0], indent=1))
print()

# Display headers
for header_name, header_value in response.headers.items():
    print(header_name, "=>", header_value)
from github import Github # pip install pygithub

# XXX: Specify your own access token here
ACCESS_TOKEN = ''

# Specify a username and repository of interest for that user.
USER = 'ptwobrussell'
REPO = 'Mining-the-Social-Web'
#REPO = 'Mining-the-Social-Web-2nd-Edition'

client = Github(ACCESS_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)

# Get a list of people who have bookmarked the repo. The API hands back a
# lazy paginated iterator, so materialize it to learn the total number of
# stargazers. list() is the idiomatic spelling of the identity
# comprehension [s for s in ...].
stargazers = list(repo.get_stargazers())
print("Number of stargazers", len(stargazers))
import networkx as nx # pip install networkx
# Quick tour of the NetworkX API with a tiny directed graph.
# NOTE(review): the g.node accessor and nx.info used below are NetworkX 1.x
# API (g.node became g.nodes in 2.x; nx.info was removed in 3.x) — the rest
# of this file uses the same 1.x API, so pin networkx accordingly.
# Create a directed graph
g = nx.DiGraph()
# Add an edge to the directed graph from X to Y
# (both endpoints are created implicitly).
g.add_edge('X', 'Y')
# Print some statistics about the graph
print(nx.info(g))
# Get the nodes and edges from the graph
print("Nodes:", g.nodes())
print("Edges:", g.edges())
print()
# Get node properties (per-node attribute dictionaries, empty so far)
print("X props:", g.node['X'])
print("Y props:", g.node['Y'])
print()
# Get edge properties (the attribute dict attached to the X->Y edge)
print("X=>Y props:", g['X']['Y'])
print()
# Update a node property
g.node['X'].update({'prop1' : 'value1'})
print("X props:", g.node['X'])
print()
# Update an edge property
g['X']['Y'].update({'label' : 'label1'})
print("X=>Y props:", g['X']['Y'])
# Expand the initial graph with (interest) edges pointing each direction for
# additional people interested. Take care to ensure that user and repo nodes
# do not collide by appending their type.
g = nx.DiGraph()
g.add_node(repo.name + '(repo)', type='repo', lang=repo.language, owner=user.login)

for sg in stargazers:
    g.add_node(sg.login + '(user)', type='user')
    g.add_edge(sg.login + '(user)', repo.name + '(repo)', type='gazes')

# Poke around in the current graph to get a better feel for how NetworkX works.
# (The line above was a stray Markdown cell in the notebook export; as raw
# text it is a SyntaxError in a .py file, so it is now a comment.)
print(nx.info(g))
print(g.node['Mining-the-Social-Web(repo)'])
print(g.node['ptwobrussell(user)'])
print(g['ptwobrussell(user)']['Mining-the-Social-Web(repo)'])
print(g['ptwobrussell(user)'])
print(g['Mining-the-Social-Web(repo)'])
print(g.in_edges(['ptwobrussell(user)']))
print(g.out_edges(['ptwobrussell(user)']))
print(g.in_edges(['Mining-the-Social-Web(repo)']))
print(g.out_edges(['Mining-the-Social-Web(repo)']))
from operator import itemgetter
from IPython.display import HTML
from IPython.core.display import display

display(HTML('<img src="resources/ch08-github/kite-graph.png" width="400px">'))

# The classic Krackhardt kite graph
kkg = nx.generators.small.krackhardt_kite_graph()

def _show_ranked(title, scores, trailing_blank=True):
    # Print a centrality dict as (node, score) pairs, highest score first.
    print(title)
    print(sorted(scores.items(), key=itemgetter(1), reverse=True))
    if trailing_blank:
        print()

_show_ranked("Degree Centrality", nx.degree_centrality(kkg))
_show_ranked("Betweenness Centrality", nx.betweenness_centrality(kkg))
_show_ranked("Closeness Centrality", nx.closeness_centrality(kkg),
             trailing_blank=False)
# Add (social) edges from the stargazers' followers. This can take a while
# because of all of the potential API calls to GitHub. The approximate number
# of requests for followers for each iteration of this loop can be calculated as
# math.ceil(sg.get_followers() / 100.0) per the API returning up to 100 items
# at a time.
import sys

for i, sg in enumerate(stargazers):
    # Wire up "follows" edges between stargazers already in the graph.
    try:
        for f in sg.get_followers():
            if f.login + '(user)' in g:
                g.add_edge(f.login + '(user)', sg.login + '(user)',
                           type='follows')
    except Exception as e:  # ssl.SSLError
        print("Encountered an error fetching followers for", sg.login,
              "Skipping.", file=sys.stderr)
        print(e, file=sys.stderr)

    print("Processed", i+1, " stargazers. Num nodes/edges in graph",
          g.number_of_nodes(), "/", g.number_of_edges())
    print("Rate limit remaining", client.rate_limiting)
from operator import itemgetter
from collections import Counter

# How many social edges did the previous step add?
print(nx.info(g))

# The number of "follows" edges is the difference
print(sum(1 for _, _, d in g.edges_iter(data=True) if d['type'] == 'follows'))

# The repository owner is possibly one of the more popular users in this graph.
print(sum(1 for _, dst, d in g.edges_iter(data=True)
          if d['type'] == 'follows' and dst == 'ptwobrussell(user)'))

# Examine the number of adjacent edges to each node
print(sorted([n for n in g.degree_iter()], key=itemgetter(1), reverse=True)[:10])

# Consider the ratio of incoming and outgoing edges for a couple of users with
# high node degrees...
print(len(g.out_edges('angusshire(user)')))
print(len(g.in_edges('angusshire(user)')))

# A user who is followed by many but does not follow back.
print(len(g.out_edges('ptwobrussell(user)')))
print(len(g.in_edges('ptwobrussell(user)')))

# Count incoming "follows" edges per user and keep users followed by >1 person.
c = Counter(dst for _, dst, d in g.edges_iter(data=True) if d['type'] == 'follows')
popular_users = [(u, f) for (u, f) in c.most_common() if f > 1]
print("Number of popular users", len(popular_users))
print("Top 10 popular users:", popular_users[:10])
# Save your work by serializing out (pickling) the graph
# NOTE(review): nx.write_gpickle is NetworkX 1.x/2.x API (removed in 3.x),
# consistent with the rest of this file.
nx.write_gpickle(g, "data/github.gpickle.1")
# How to restore the graph...
# import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.1")
from operator import itemgetter

# Work on a copy so we can iteratively mutate it for experimentation
# without disturbing the real graph.
h = g.copy()

# Remove the seed of the interest graph, which is a supernode, in order
# to get a better idea of the network dynamics
h.remove_node('Mining-the-Social-Web(repo)')

# XXX: Remove any other nodes that appear to be supernodes.
# Filter any other nodes that you can by threshold
# criteria or heuristics from inspection.

def _ranked(scores):
    # (node, score) pairs sorted by score, highest first.
    return sorted(scores.items(), key=itemgetter(1), reverse=True)

# Display the centrality measures for the top 10 nodes
dc = _ranked(nx.degree_centrality(h))
print("Degree Centrality")
print(dc[:10])
print()

bc = _ranked(nx.betweenness_centrality(h))
print("Betweenness Centrality")
print(bc[:10])
print()

print("Closeness Centrality")
cc = _ranked(nx.closeness_centrality(h))
print(cc[:10])
# Let's add each stargazer's additional starred repos and add edges
# to find additional interests.
MAX_REPOS = 500

for i, sg in enumerate(stargazers):
    print(sg.login)
    try:
        for starred in sg.get_starred()[:MAX_REPOS]:  # Slice to avoid supernodes
            g.add_node(starred.name + '(repo)', type='repo',
                       lang=starred.language, owner=starred.owner.login)
            g.add_edge(sg.login + '(user)', starred.name + '(repo)', type='gazes')
    except Exception as e:  # ssl.SSLError:
        print("Encountered an error fetching starred repos for", sg.login, "Skipping.")
        # Surface the actual error instead of silently discarding the bound
        # exception, matching the follower-fetch loop's error handling.
        print(e, file=sys.stderr)

# Final summary after the whole traversal (assumes the three prints below
# sat after the loop in the original notebook — the export lost indentation).
print("Processed", i+1, "stargazers' starred repos")
print("Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges())
print("Rate limit", client.rate_limiting)
# NOTE: Given that the above example is potentially a very time-consuming
# example to run, be sure to snapshot your work.
# (The sentence above was a stray Markdown cell in the notebook export; as
# raw text it is a SyntaxError in a .py file, so it is now a comment.)

# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "data/github.gpickle.2")
# How to restore the graph...
#import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.2")
# Poke around: how to get users/repos
from operator import itemgetter

print(nx.info(g))
print()

# Get a list of repositories from the graph.
repos = [n for n in g.nodes_iter() if g.node[n]['type'] == 'repo']

# Most popular repos
print("Popular repositories")
repo_in_degrees = [(n, d) for (n, d) in g.in_degree_iter()
                   if g.node[n]['type'] == 'repo']
print(sorted(repo_in_degrees, key=itemgetter(1), reverse=True)[:10])
print()

# Projects gazed at by a user
print("Respositories that ptwobrussell has bookmarked")
bookmarked = [(n, g.node[n]['lang'])
              for n in g['ptwobrussell(user)']
              if g['ptwobrussell(user)'][n]['type'] == 'gazes']
print(bookmarked)
print()

# Programming languages for each user
print("Programming languages ptwobrussell is interested in")
print(list({g.node[n]['lang']
            for n in g['ptwobrussell(user)']
            if g['ptwobrussell(user)'][n]['type'] == 'gazes'}))
print()

# Find supernodes in the graph by approximating with a high number of
# outgoing edges
print("Supernode candidates")
candidates = [(n, len(g.out_edges(n)))
              for n in g.nodes_iter()
              if g.node[n]['type'] == 'user' and len(g.out_edges(n)) > 500]
print(sorted(candidates, key=itemgetter(1), reverse=True))
# Iterate over all of the repos, and add edges for programming languages
# for each person in the graph. We'll also add edges back to repos so that
# we have a good point to "pivot" upon.
repos = [n
         for n in g.nodes_iter()
         if g.node[n]['type'] == 'repo']

for repo in repos:
    lang = (g.node[repo]['lang'] or "") + "(lang)"
    stargazers = [u
                  for (u, r, d) in g.in_edges_iter(repo, data=True)
                  if d['type'] == 'gazes']
    if stargazers:
        # Hoisted out of the per-stargazer loop: both calls are
        # loop-invariant and idempotent, so doing them once per repo (only
        # when at least one stargazer exists, as before) is equivalent.
        g.add_node(lang, type='lang')
        g.add_edge(lang, repo, type='implements')
    for sg in stargazers:
        g.add_edge(sg, lang, type='programs')
# Some stats
print(nx.info(g))
print()

# What languages exist in the graph?
print([n for n in g.nodes_iter() if g.node[n]['type'] == 'lang'])
print()

# What languages do users program with?
print([n for n in g['ptwobrussell(user)']
       if g['ptwobrussell(user)'][n]['type'] == 'programs'])
print()

# What is the most popular programming language?
print("Most popular languages")
lang_popularity = [(n, g.in_degree(n))
                   for n in g.nodes_iter()
                   if g.node[n]['type'] == 'lang']
print(sorted(lang_popularity, key=itemgetter(1), reverse=True)[:10])
print()

# How many users program in a particular language?
python_programmers = [u
                      for (u, l) in g.in_edges_iter('Python(lang)')
                      if g.node[u]['type'] == 'user']
print("Number of Python programmers:", len(python_programmers))
print()

javascript_programmers = [u
                          for (u, l) in g.in_edges_iter('JavaScript(lang)')
                          if g.node[u]['type'] == 'user']
print("Number of JavaScript programmers:", len(javascript_programmers))
print()

# What users program in both Python and JavaScript?
print("Number of programmers who use JavaScript and Python")
print(len(set(python_programmers) & set(javascript_programmers)))

# Programmers who use JavaScript but not Python
print("Number of programmers who use JavaScript but not Python")
print(len(set(javascript_programmers) - set(python_programmers)))

# XXX: Can you determine who is the most polyglot programmer?
# NOTE: Optionally, snapshot the final graph.
# (The sentence above was a stray Markdown cell in the notebook export; as
# raw text it is a SyntaxError in a .py file, so it is now a comment.)

# Save your work by serializing out another snapshot of the graph
nx.write_gpickle(g, "data/github.gpickle.3")
#import networkx as nx
# g = nx.read_gpickle("data/github.gpickle.3")
print("Stats on the full graph")
print(nx.info(g))
print()

# Create a subgraph induced by a collection of nodes — here, every user
# node from the original interest graph.
mtsw_users = [node for node in g if g.node[node]['type'] == 'user']
h = g.subgraph(mtsw_users)

print("Stats on the extracted subgraph")
print(nx.info(h))
import os
import json
from IPython.display import IFrame
from IPython.core.display import display
from networkx.readwrite import json_graph

# Visualize the social network of all people from the original interest graph.
d = json_graph.node_link_data(h)

# Use a context manager so the file is flushed and closed deterministically;
# the original json.dump(d, open(...)) leaked the handle, so force.json
# could remain incompletely written until interpreter exit.
with open('force.json', 'w') as f:
    json.dump(d, f)

# IPython Notebook can serve files and display them into
# inline frames. Prepend the path with the 'files' prefix.
# A D3 template for displaying the graph data.
viz_file = 'force.html'

# Display the D3 visualization.
display(IFrame(viz_file, '100%', '500px'))
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# %matplotlib inline
# (IPython line magic: valid only inside a Jupyter/IPython session; as raw
# text in a .py file it is a SyntaxError, so it is commented out here.
# Uncomment when running this cell in a notebook.)

fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111)

# Strip the '(user)' suffix from node names for the display labels.
labels = dict([(n, n.split('(user)')[0]) for n in h.nodes_iter()])

nx.draw(h, pos=nx.spring_layout(h),
        arrows=False, ax=ax, node_size=50,
        edge_color='#aaaaaa',
        alpha=0.8, labels=labels, font_size=8)