Daylio Analysis.ipynb
Notebook for reading and processing a Daylio-exported CSV into a friendlier dataset and doing a quick visualization of it.
In this notebook, I plot a resampled view of the mood score and visualize the correlation matrix between the different activities obtained from a multilinear fit on the data.
But the best visualization is the result of the multilinear fit on the activities: it summarizes the relations between the activities and the mood.
Please log in to comment. Right now this notebook isn't tied into the Open Humans API; instead, it relies on the presence of a `daylio_export.csv` file in the same folder as the notebook. It would be cool to get this tied in with the API, though!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import scipy.optimize as opt
from pandas.tools.plotting import table
import statsmodels.api as sm
import scipy.fftpack as fft
# User configuration.
#
# Map every (custom) Daylio mood name to an integer score.
# Several mood names may share one score; add your own names to the
# matching line.
mood_score = {
    # score 0 (lowest)
    "awful": 0, "melancholic": 0, "aaa": 0,
    # score 1
    "bad": 1, "shitty": 1, "grr": 1,
    # score 2
    "aff": 2, "zzz": 2, "meh": 2,
    # score 3
    "good": 3, "satisfied": 3, "gogo": 3,
    # score 4 (highest)
    "rad": 4, "wonderful": 4, "hihi": 4,
}

# Visualizations and data fitting only consider activities that occur in
# more than this many entries.
activities_N_minimum = 19
# Prepare the mood data.
# The first four CSV columns are parsed together into a single datetime
# column, which becomes the index of the frame.
raw_mood_data = pd.read_csv(
    "daylio_export.csv",
    parse_dates=[[0, 1, 2, 3]],
    index_col="year_date_weekday_time",
)
# Translate the mood names into their numeric scores, then blank out
# missing values so that text columns can be handled as plain strings.
raw_mood_data = raw_mood_data.replace({"mood": mood_score}).copy()
raw_mood_data = raw_mood_data.replace(np.nan, "")

# Work on a copy so the raw (un-normalized) data stays available.
mood_data = raw_mood_data.copy()
# Normalize the mood score to a domain between 0 and 100
# (4 is the highest score assigned in mood_score).
mood_data["mood"] = mood_data["mood"] * 100 / 4
# Separate the activities on each row.
# Each entry's activities value is treated as one string of activity names
# separated by " | "; joining every row and splitting again yields the flat
# list of all activity occurrences.
activities_string = " | ".join(mood_data.activities.values)
activities_string = activities_string.split(" | ")
unique_activities = np.unique(activities_string)
# Entries without any activity contribute an empty name; drop it.
unique_activities = unique_activities[unique_activities != ""]
# Turn each row's activities string into a list of names. The pattern is a
# raw string: "\|" inside a normal string literal is an invalid escape
# sequence (a warning today, an error in a future Python), while the raw
# form has the same value and still matches a literal " | ".
mood_data.activities = mood_data.activities.str.split(r" \| ")
# Generate one boolean column per activity: True where the entry's list
# contains that activity.
for activity in unique_activities:
    contains_activity = mood_data.activities.apply(lambda x: activity in x)
    mood_data[activity] = contains_activity
# Plot the mean mood score over fixed time windows.
resample_rule = "7d"
# Select the mood column *before* aggregating. Taking the mean of the whole
# frame also tries to average the list-valued `activities` column and any
# text columns, which recent pandas versions reject instead of silently
# dropping. The resulting series is the same either way.
daily_mood = mood_data.mood.resample(resample_rule).mean()
plt.figure(figsize=(14, 7))
plt.grid()
plt.plot(daily_mood, "--o")
# Example: overlay the individual entries tagged with one activity
# (here an activity named "sick"):
#plt.plot(mood_data[mood_data.sick == True].mood, 'o')
plt.ylim((0, 100))
plt.title("Mood score along time, resmapling rule: {}".format(resample_rule))
plt.ylabel("Resampled mood score (%)")
plt.xlabel("Time")
plt.show()
# Keep only the activities that occur in more than activities_N_minimum
# entries (user-defined threshold); rarer ones are left out of the fit.
unique_activities_2 = [
    name
    for name in unique_activities
    if mood_data[name].sum() > activities_N_minimum
]
# Do a multilinear (ordinary least squares) fit of the mood score on the
# retained activities.
N_samples = len(mood_data)
N_features = len(unique_activities_2)
Y = np.array(mood_data.mood.values)

# Design matrix: one 0/1 indicator column per retained activity.
features = []
X = np.zeros((N_samples, N_features))
for column, feature in enumerate(unique_activities_2):
    features.append(feature)
    X[:, column] = mood_data[feature].astype(int)
# Add the constant (intercept) term to the design matrix.
# NOTE(review): if an activity is present in every entry its column is
# already constant -- confirm add_constant still adds the intercept then,
# otherwise the labels used further down will be off by one.
X = sm.add_constant(X)

est = sm.OLS(Y, X).fit()
summary = est.summary()

# Convert the covariance matrix of the fitted coefficients into a
# correlation matrix: r = D * cov * D with D = diag(1 / sqrt(variance)).
cov_mtr = est.cov_params()  # covariance matrix
diag = np.diag(np.diag(cov_mtr))
D = np.linalg.inv(np.sqrt(diag))
cov_times_D = np.matmul(cov_mtr, D)
r = np.matmul(D, cov_times_D)  # correlation matrix
# Plot the correlation matrix of the multilinear fit as a heat map.
# One tick per matrix row/column; the first label, "CL", stands for the
# leading (non-activity) term of the fit -- presumably the constant.
tick_positions = np.arange(r.shape[0])
tick_labels = ["CL"] + unique_activities_2

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
# Fix the colour scale to the full [-1, 1] correlation range.
mtr = ax.matshow(r, vmin=-1, vmax=1, cmap="PiYG")
plt.colorbar(mtr)

ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation="vertical")
ax.set_yticks(tick_positions)
ax.set_yticklabels(tick_labels, rotation="horizontal")
plt.show()
# Make the summary table more readable: replace the row labels of the
# coefficient table with the activity names. The first two rows of that
# table are skipped, so activity number k lands on row k + 2.
for i, activity in enumerate(unique_activities_2):
    coefficient_rows = summary.tables[1][2:]
    coefficient_rows[i][0].data = activity
# Show the summary of the multilinear fit.
# Rough (and technically wrong) meanings of the columns:
#   [coef]    how much (+/-, in points of the 0-100 mood score) the activity
#             influences the mood score, on average
#   [std err] the dispersion (uncertainty) of the [coef] estimate
#   [t]       measure of the reliability of the [coef] estimate.
#             Rule of thumb: |t| = 1 is roughly 68% confidence, |t| = 2 roughly
#             95%, and |t| above about 2.6 roughly 99%.
# The bare expression below makes the notebook render the table as the cell output.
summary