import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
def read(path="./datasets/train.csv"):
    """Load the data set from a CSV file.

    Args:
        path: Location of the CSV file. The file must contain a ``Date``
            column.

    Returns:
        A DataFrame indexed by the parsed ``Date`` column.
    """
    # parse_dates takes a list of column names; pandas' argument validation
    # rejects a bare string.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])
# ----------------------------------------------------------------------
# Load the data set once at import time. The resulting frame is bound as
# the default ``data`` argument of cv() and dataDiff().
# ----------------------------------------------------------------------
data = read()
def cv(data=data, n_folds=10):
    """Split the data into ``n_folds`` train/test parts for cross validation.

    Only rows with a known ``Weight`` can become test rows. For every fold the
    weights of its test rows are blanked out in a copy of the full data set.

    Args:
        data: DataFrame with ``ID`` and ``Weight`` columns. It is not
            modified; every fold works on its own copy.
        n_folds: Number of folds.

    Returns:
        A tuple ``(data_train, data_test)`` of two lists with one entry per
        fold. ``data_train[i]`` is a copy of ``data`` in which ``Weight`` is
        NaN for every row whose ``ID`` belongs to a test row of fold ``i``.
        ``data_test[i]`` is a list holding one single-column DataFrame per
        test row (the row's fields form the index).
    """
    cleanData = data[pd.notnull(data['Weight'])]
    kf = KFold(len(cleanData), shuffle=True, n_folds=n_folds)

    data_train = []
    data_test = []
    for _, test_positions in kf:
        data_test.append(
            [pd.DataFrame(cleanData.iloc[j]) for j in test_positions]
        )
        # Blank all held-out weights with one label-based assignment instead
        # of copying, editing and writing back one row at a time.
        held_out_ids = cleanData['ID'].iloc[test_positions]
        fold_train = data.copy()
        is_held_out = fold_train['ID'].isin(held_out_ids)
        fold_train.loc[is_held_out, 'Weight'] = float('NaN')
        data_train.append(fold_train)
    return (data_train, data_test)
# ----------------------------------------------------------------------
# Build the cross-validation folds once at import time; the evaluation
# loop at the end of the file consumes them.
# ----------------------------------------------------------------------
data_train, data_test = cv()
def evaluate(predictions, data_test, predictedWeight='predWeight'):
    """Compute the RMSE of the predicted weights on the test records.

    Args:
        predictions: DataFrame containing the column ``predictedWeight``. The
            prediction for the record with ID ``k`` is read from row position
            ``k - 1``.
        data_test: Sequence of test records. Each record must support
            ``.loc['Weight']`` and ``.loc['ID']``; both a Series and a
            single-column DataFrame (as built by ``cv``) are accepted.
        predictedWeight: Name of the prediction column.

    Returns:
        The root mean squared error as a scalar, or NaN when ``data_test`` is
        empty.
    """
    n = len(data_test)
    if n == 0:
        # Nothing to compare against; avoid dividing by zero.
        return np.nan

    predicted = predictions[predictedWeight]
    squared_error = 0.0
    for record in data_test:
        # A record may yield a scalar or a one-element Series; flatten so
        # both cases produce a plain number.
        test_value = float(np.ravel(np.asarray(record.loc['Weight']))[0])
        record_id = int(np.ravel(np.asarray(record.loc['ID']))[0])
        pred_value = predicted.iloc[record_id - 1]
        squared_error += (test_value - pred_value) ** 2
    return np.sqrt(squared_error / n)
######################### Approaches ##########################
def interpol(data):
    """Fill the gaps of the ``Weight`` column by plain interpolation.

    The input frame is left untouched; a new Series is returned.
    """
    weight_column = data['Weight']
    filled = weight_column.interpolate()
    return filled
def calorieBased(data, n=5):
    """Fill missing weights from the previous weight and recent calories.

    Missing ``Calories`` are first replaced by independent draws from a
    normal distribution with the column's mean and standard deviation. Each
    missing weight is then set to the previous weight plus one tenth of the
    difference between the mean calories of the preceding ``n`` rows and the
    overall calorie mean.

    Args:
        data: DataFrame with ``Calories`` and ``Weight`` columns. Both
            columns are overwritten in place with their filled versions.
        n: Number of preceding rows whose calories are averaged (at least 1).

    Returns:
        The filled ``Weight`` column of ``data``.
    """
    calMean = data['Calories'].mean()
    calSTD = data['Calories'].std()

    # One independent draw per missing entry (a scalar passed to fillna
    # would reuse a single random value for every gap).
    calories = np.array(data['Calories'], dtype=float)
    missing = np.isnan(calories)
    if missing.any():
        calories[missing] = np.random.normal(
            loc=calMean, scale=calSTD, size=int(missing.sum())
        )
    data['Calories'] = calories

    weights = np.array(data['Weight'], dtype=float)
    # The recursion needs a start value: seed a missing first weight with
    # the mean, but keep a measured one.
    if len(weights) and np.isnan(weights[0]):
        weights[0] = data['Weight'].mean()
    for i in range(1, len(weights)):
        if np.isnan(weights[i]):
            # Clamp the window start so the first rows do not produce an
            # empty slice.
            recent = calories[max(0, i - n):i]
            weights[i] = weights[i - 1] + (recent.mean() - calMean) / 10
    data['Weight'] = weights
    return data['Weight']
def fillCalories(data):
    """Replace missing calorie values by random draws.

    Every missing ``Calories`` entry gets its own draw from a normal
    distribution with the mean and standard deviation of the known values.

    Args:
        data: DataFrame with a ``Calories`` column; the column is overwritten
            in place.

    Returns:
        The same ``data`` object, with ``Calories`` free of NaN values.
    """
    calMean = data['Calories'].mean()
    calSTD = data['Calories'].std()

    calories = np.array(data['Calories'], dtype=float)
    missing = np.isnan(calories)
    # Draws are assigned to the gaps in row order.
    calories[missing] = calSTD * np.random.randn(int(missing.sum())) + calMean
    data['Calories'] = calories
    return data
def procRegression(data, n=5):
    """Fit a linear model for the day-to-day weight change.

    A training sample is built for every row ``i >= n`` whose last ``n``
    calorie values (rows ``i-n+1 .. i``) are all known and whose weight and
    preceding weight are both known. The features are those ``n`` calorie
    values, mean-centred and ordered from most recent to oldest, followed by
    the row's ``ID``. The target is ``weight[i] - weight[i-1]``.

    Args:
        data: DataFrame with ``Weight``, ``Calories`` and ``ID`` columns.
        n: Number of calorie values used as features.

    Returns:
        The fitted ``LinearRegression`` model.

    Raises:
        ValueError: If no row qualifies as a training sample.
    """
    weights = np.array(data['Weight'], dtype=float)
    calories = np.array(data['Calories'], dtype=float)
    ids = np.array(data['ID'])
    calMean = data['Calories'].mean()

    trainX = []
    trainY = []
    for i in range(n, len(weights)):
        window = calories[i - n + 1:i + 1]
        # The last n calorie values must not contain NaN.
        if np.isnan(window).any():
            continue
        if np.isnan(weights[i]) or np.isnan(weights[i - 1]):
            continue
        # Reversed (most recent first) and mean-centred; the calories could
        # additionally be weighted.
        trainX.append(np.append((window - calMean)[::-1], ids[i]))
        trainY.append([weights[i] - weights[i - 1]])

    if not trainX:
        raise ValueError("procRegression: no usable training rows")

    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(trainX, trainY)
    return lmlinear
def applyRegression(data, n=5):
    """Back-fill missing weights with the calorie regression model.

    Missing calories are filled by ``fillCalories`` (which modifies ``data``
    in place), the model is fitted by ``procRegression``, and missing weights
    are then filled from the end of the series towards the start.

    Args:
        data: DataFrame with ``Weight``, ``Calories`` and ``ID`` columns.
        n: Number of calorie values used as features.

    Returns:
        A NumPy array of weights. Rows with position below ``n`` and the last
        row are never filled.
    """
    data = fillCalories(data)
    lmlinear = procRegression(data, n)
    weights = np.array(data['Weight'], dtype=float)
    calories = np.array(data['Calories'], dtype=float)
    ids = np.array(data['ID'])
    calMean = data['Calories'].mean()

    # The last element has no successor to back-fill from; it must carry a
    # weight (as in the real data), so the walk starts one row before it.
    for i in range(len(weights) - 2, n - 1, -1):
        if not np.isnan(weights[i]):
            continue
        # The model is trained on weight[k] - weight[k-1] from the features
        # of row k, so stepping back from row i+1 to row i needs the change
        # predicted for row i+1.
        nxt = i + 1
        window = calories[nxt - n + 1:nxt + 1]
        features = np.append((window - calMean)[::-1], ids[nxt])
        # predict() expects a 2-D (samples, features) array and, because the
        # targets were fitted as a column, returns a 2-D result.
        pred = lmlinear.predict(features.reshape(1, -1))
        weights[i] = weights[nxt] - float(np.ravel(pred)[0])
    return weights
# giving up on a nice solution
def regOnInterpol(data):
    """Interpolate the weights and add a linear residual correction.

    Every second known weight is held out and the remaining known weights are
    interpolated. The residual (true minus interpolated weight) at the
    held-out rows is regressed on ``ID``, and the fitted residual is added to
    the interpolated series.

    Args:
        data: DataFrame with ``ID`` and ``Weight`` columns. It is not
            modified.

    Returns:
        A Series with the index of ``data`` holding the interpolated weights,
        corrected at the rows whose weight is known in ``data``.

    Raises:
        ValueError: If ``data`` contains fewer than two known weights.
    """
    ids = np.array(data['ID'])
    true_weights = np.array(data['Weight'], dtype=float)
    known_pos = np.flatnonzero(~np.isnan(true_weights))
    # Take every second known value as test data.
    held_out_pos = known_pos[1::2]
    if len(held_out_pos) == 0:
        raise ValueError("regOnInterpol needs at least two known weights")

    # Blank the held-out values on a copy: the true values are still needed
    # for the residuals and the caller's frame must stay intact.
    reduced = data['Weight'].astype(float).copy()
    reduced.iloc[held_out_pos] = np.nan
    intWeights = reduced.interpolate()
    interpolated = np.array(intWeights, dtype=float)

    # Residuals are taken at the held-out rows themselves.
    trainX = ids[held_out_pos].reshape(-1, 1)
    residuals = true_weights[held_out_pos] - interpolated[held_out_pos]
    trainY = residuals.reshape(-1, 1)
    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(trainX, trainY)

    # NOTE(review): as in the original, the correction is applied only at
    # rows whose weight is known; rows with a missing weight keep the plain
    # interpolation. Confirm that this is the intended set of rows.
    correction = np.ravel(lmlinear.predict(ids[known_pos].reshape(-1, 1)))
    intWeights.iloc[known_pos] = interpolated[known_pos] + correction
    return intWeights
def regOnInterpol2(data):
    """Interpolate the weights and add a regression of ``diff`` on ``ID``.

    A linear model is fitted on the rows whose ``diff`` value is known; its
    prediction for every row is added to the interpolated weights.

    Args:
        data: DataFrame with ``ID``, ``Weight`` and ``diff`` columns. The
            ``diff`` column is overwritten with the model's predictions and
            an ``inter`` column with the interpolated weights is added.

    Returns:
        The Series ``data['inter'] + data['diff']``.
    """
    diff = np.array(data['diff'], dtype=float)
    ids = np.array(data['ID'])
    known = ~np.isnan(diff)

    lmlinear = linear_model.LinearRegression()
    trainX = ids[known].reshape(-1, 1)
    trainY = diff[known]
    lmlinear.fit(trainX, trainY)

    data['diff'] = lmlinear.predict(ids.reshape(-1, 1))
    data['inter'] = data['Weight'].interpolate()
    return data['inter'] + data['diff']
def dataDiff(data=data):
    """Store the leave-one-out interpolation error of every known weight.

    For each row with a known ``Weight`` the value is blanked out on a copy,
    the copy is interpolated, and the true weight minus the interpolated
    value is recorded. Rows without a weight get NaN.

    Args:
        data: DataFrame with a ``Weight`` column. A ``diff`` column is added
            (or overwritten) in place.

    Returns:
        None.
    """
    weights = data['Weight'].astype(float)
    values = np.array(weights, dtype=float)
    diffs = np.full(len(values), np.nan)
    for pos in np.flatnonzero(~np.isnan(values)):
        held_out = weights.copy()
        held_out.iloc[pos] = np.nan
        diffs[pos] = values[pos] - held_out.interpolate().iloc[pos]
    # Assign the whole column at once; element-wise chained assignment may
    # write to a temporary copy instead of the frame.
    data['diff'] = diffs
# Evaluate one approach on every cross-validation fold and report the RMSE
# per fold plus the mean over all folds with a defined RMSE.
rmse = []
rmse_sum = 0  # named so that the builtin sum() is not shadowed
n = 0
for i, (fold_train, fold_test) in enumerate(zip(data_train, data_test)):
    fold_train['predWeight'] = interpol(fold_train)  # change this line
    fold_rmse = evaluate(fold_train, fold_test)
    rmse.append(fold_rmse)
    if not np.isnan(fold_rmse):
        n += 1
        rmse_sum += fold_rmse
    print("RMSE(", i, "):", fold_rmse)
# Without any valid fold there is no mean to report.
print("Mean RSME:", rmse_sum / n if n else float('nan'))