Github Exploration.ipynb
Explore your GitHub data!
Visualize the most commonly used words in your commit messages
See the commit activity per repo per year. A trip down memory lane!
Check the busiest times and days of the week. If a repo is busiest on the weekend, could it be a side project?
Approximate the duration of coding per repo, by looking at times between timestamps. This will tend to underestimate the total time, since the time between starting coding and the first commit is not captured.
import sys
!{sys.executable} -m pip install wordcloud==1.5.0
import io
import json
import os
import pickle
import tempfile
import urllib.request
from datetime import datetime
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
# Load the commit data from the Open Humans API.
# Requires a valid access token in the OH_ACCESS_TOKEN environment variable.
token = os.environ.get('OH_ACCESS_TOKEN')
response = requests.get(
    "https://www.openhumans.org/api/direct-sharing/project/exchange-member/?access_token={}".format(token))
user = response.json()

# Find the lowest-id dataset tagged as Github commit data and download it.
commit_data = None
for dset in sorted(user['data'], key=lambda x: x['id']):
    if 'Github' in dset['metadata']['tags'] and 'commits' in dset['metadata']['tags']:
        commit_data = requests.get(dset['download_url']).json()
        break
# Fail loudly if no matching dataset exists, instead of a confusing
# NameError on commit_data further down.
if commit_data is None:
    raise RuntimeError("No Github commit dataset found in your Open Humans account.")
# Flatten the nested per-repo commit structure into three parallel lists:
# lower-cased messages, committer timestamps, and the owning repo name.
messages = []
timestamps = []
repos = []
for repo_name, repo_data in commit_data['repo_data'].items():
    for commit in repo_data['commits']:
        info = commit['commit']
        messages.append(info['message'].lower())
        timestamps.append(
            datetime.strptime(info['committer']['date'], '%Y-%m-%dT%H:%M:%SZ'))
        repos.append(repo_name)
# Turn the parallel lists into a tidy DataFrame, one row per commit.
# Building the frame directly from the data avoids assigning length-N
# columns onto an empty (0-row) frame, which raises a length-mismatch
# ValueError in modern pandas.
df = pd.DataFrame({'repo': repos, 'message': messages, 'datetime': timestamps})
df.head()
Visualize the most commonly used words in your commit messages
# Fetch the GitHub-logo image used as the word-cloud mask.
# Streaming the bytes through an in-memory buffer avoids the original
# NamedTemporaryFile(delete=False), which leaked an unclosed temp file
# that was never removed from disk.
mask_url = 'http://oh-github.herokuapp.com/static/github.png'
mask_bytes = requests.get(mask_url).content
mask = np.array(Image.open(io.BytesIO(mask_bytes)))
def transform_format(val):
    """Map one RGBA pixel to a binary mask value.

    Fully opaque pixels (alpha channel == 255) become 255 (area the
    word cloud leaves blank); all other pixels become 0 (drawable).
    """
    return 255 if val[3] == 255 else 0
# Transform the RGBA mask into a 2-D binary mask in one vectorized pass:
# fully opaque pixels (alpha == 255) -> 255, everything else -> 0.
# Equivalent to mapping transform_format over every pixel, but runs at
# C speed and avoids filling an uninitialized np.ndarray row by row.
transformed_mask = np.where(mask[:, :, 3] == 255, 255, 0).astype(np.int32)
# Build one big text blob from every commit message and generate the
# word cloud shaped by the mask.
text = ' '.join(df.message.values)
cloud_factory = WordCloud(
    mask=transformed_mask,
    background_color='white',
    width=len(mask[0]),
    height=len(mask),
)
wordcloud = cloud_factory.generate(text)

# Render it large, without axes.
plt.rcParams['figure.figsize'] = [20, 10]
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
See the commit activity per repo per year. A trip down memory lane!
import random
import matplotlib
from matplotlib import cm

# One figure per year: daily commit counts, one scatter series per repo.
df['date'] = df.datetime.apply(lambda x: x.date())
df['year'] = df.datetime.apply(lambda x: x.year)

# Repos with fewer total commits than this in a year are left off the plot.
MIN_COMMITS_PER_YEAR = 10

for year in sorted(df.year.unique()):
    df_curr = df[df.year == year]
    # plt.subplots() already creates a fresh figure; the old plt.clf()
    # before it just spawned a stray blank figure each iteration.
    fig, ax = plt.subplots()
    have_data = False
    # Group by the column name itself (not a one-element list): with a
    # list, pandas >= 2.0 yields 1-tuple group keys, so the legend
    # labels would render as "('repo',)".
    for key, grp in df_curr.groupby('repo'):
        df_temp = grp.groupby("date")['message'].count().reset_index(name='num_commits')
        if df_temp.num_commits.sum() < MIN_COMMITS_PER_YEAR:
            continue
        have_data = True
        ax.plot_date(x=df_temp['date'].values, y=df_temp['num_commits'].values,
                     label=key, markersize=14)
    # Skip empty years entirely rather than showing a bare plot.
    if not have_data:
        continue
    plt.xlabel("Time")
    plt.ylabel("Number of commits")
    plt.title("Repo activity in {}".format(year))
    plt.legend(loc='best')
    plt.show()
Checking the busiest times and days of the week. If a repo is busiest on the weekend, could it be a side project?
def get_part_of_day(hour):
    """Bucket an hour of the day (0-23) into a coarse label.

    6-11 -> "morning", 12-17 -> "afternoon", 18-22 -> "evening",
    anything else (23 and 0-5) -> "night".
    """
    if 6 <= hour <= 11:
        return "morning"
    if 12 <= hour <= 17:
        return "afternoon"
    if 18 <= hour <= 22:
        return "evening"
    return "night"
# Derive day-of-week and time-of-day features from each commit timestamp.
df['day_of_week'] = df.datetime.apply(lambda ts: ts.strftime('%A'))
df['hour_of_day'] = df.datetime.apply(lambda ts: ts.hour)
df['part_of_day'] = df.hour_of_day.apply(get_part_of_day)
df['day_and_hour'] = df.day_of_week + " " + df.part_of_day
# Report commit frequency by day, time of day, and their combination.
print("Busiest days")
print(df['day_of_week'].value_counts())
print('\n')
print("Busiest times")
print(df['part_of_day'].value_counts())
print('\n')
print("Busiest days + times")
print(df['day_and_hour'].value_counts())
print('\n')
print("Side Project detection")
# For each repo: (most common commit day, total commit count).
repos_with_dow = df.groupby('repo')['day_of_week'].agg(
    lambda x: (x.value_counts().index[0], len(x)))
# Series.iteritems() was removed in pandas 2.0; items() is the
# forward-compatible spelling with identical behavior.
for repo, (day, commit_count) in repos_with_dow.items():
    if commit_count >= 5 and day in ("Saturday", "Sunday"):
        print("Project {} has most commits on {}s.".format(repo, day))
Approximate the duration of coding per repo, by looking at times between timestamps. This will tend to underestimate the total time, since the time between starting coding and the first commit is not captured.
from datetime import timedelta

# Sort commits by repo then time, and line each row up with the commit
# that immediately follows it (shift(-1)) so time gaps can be measured.
df_sorted = df.sort_values(['repo', 'datetime'])
shifted = df_sorted.shift(-1)
df_sorted['next_datetime'] = shifted.datetime
df_sorted['next_repo'] = shifted.repo
df_sorted['next_part_of_day'] = shifted.part_of_day

# Gaps longer than this count as separate coding sessions.
# Set this to a value that makes sense with how often you commit.
MAX_TIME_BETWEEN_COMMITS = timedelta(hours=8)
def determine_duration(dt1, dt2, repo1, repo2, pod1, pod2, max_gap=None):
    """Estimate coding time between two consecutive commits.

    Returns dt2 - dt1, except the gap counts as zero whenever the pair
    does not belong to the same coding session: the repo changed, the
    next commit falls in a new "morning" (a night passed in between),
    or the gap exceeds the session cutoff.

    Args:
        dt1, dt2: timestamps of this commit and the next one.
        repo1, repo2: repo names of this commit and the next one.
        pod1, pod2: part-of-day labels (see get_part_of_day).
        max_gap: session cutoff as a timedelta; when None, falls back to
            the module-level MAX_TIME_BETWEEN_COMMITS.

    Returns:
        A datetime.timedelta of estimated coding time (possibly zero).
    """
    if repo1 != repo2 or (pod1 != "morning" and pod2 == "morning"):
        # Commit at dt1 was the last one of its session.
        return timedelta(0)
    if max_gap is None:
        max_gap = MAX_TIME_BETWEEN_COMMITS
    if dt2 - dt1 > max_gap:
        return timedelta(0)
    return dt2 - dt1
# Estimate the coding time attached to each commit, then total per repo
# and show the 20 repos with the most estimated coding time.
df_sorted['duration'] = df_sorted.apply(
    lambda row: determine_duration(
        row['datetime'], row['next_datetime'],
        row['repo'], row['next_repo'],
        row['part_of_day'], row['next_part_of_day'],
    ),
    axis=1,
)
df_sorted.groupby('repo')['duration'].sum().sort_values(ascending=False).head(20)