# Training-data loading, cross-validation (cv) fold construction, RMSE evaluation and an interpolation example
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
def read(path="./datasets/train.csv"):
    """Load the training CSV into a DataFrame indexed by its parsed 'Date' column.

    Args:
        path: Location of the CSV file (anything pandas.read_csv accepts).
            The file must contain a 'Date' column.

    Returns:
        The file contents as a DataFrame whose index is the 'Date' column
        parsed as datetimes.
    """
    # parse_dates has to be a bool or a list of columns; a bare column name
    # string is rejected by current pandas.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])
# Load the training set at import time; cv() below binds this frame as its default `data` argument.
data=read()
def cv(data=data, n_folds=10, verbose=True):
    """Split data into n_folds train/test parts for cross validation.

    Only rows with a known 'Weight' can be held out. For every fold the
    training frame is a full copy of ``data`` in which the 'Weight' of each
    row sharing an 'ID' with a held-out row is set to NaN.

    Args:
        data: DataFrame with at least 'ID' and 'Weight' columns. Defaults to
            the module-level ``data`` frame bound when this function is
            defined.
        n_folds: Number of folds to build.
        verbose: When true (the default), print the blanked 'Weight' entries
            for every held-out row, as this function always did. Pass False
            to skip that per-row scan.

    Returns:
        A tuple ``(data_train, data_test)`` of two lists with one entry per
        fold: ``data_train[i]`` is the masked copy of ``data`` and
        ``data_test[i]`` is a list of one-column DataFrames, one per
        held-out row.
    """
    cleanData = data[pd.notnull(data['Weight'])]
    # Legacy sklearn.cross_validation API: built from the row count and
    # iterated directly to obtain positional (train, test) index arrays.
    kf = KFold(len(cleanData), shuffle=True, n_folds=n_folds)
    data_train = []
    data_test = []
    for _, test in kf:
        held_out_ids = cleanData['ID'].iloc[test]
        fold_train = data.copy()
        # Hide the held-out weights with one .loc assignment instead of
        # copying, editing and writing back the matching rows one ID at a time.
        fold_train.loc[fold_train['ID'].isin(held_out_ids), 'Weight'] = np.nan
        if verbose:
            for held_id in held_out_ids:
                print(fold_train.loc[fold_train['ID'] == held_id, 'Weight'])
        data_train.append(fold_train)
        # One single-column frame per held-out row; evaluate() reads
        # .loc['Weight'] and .loc['ID'] from each of them.
        data_test.append([pd.DataFrame(cleanData.iloc[j]) for j in test])
    return (data_train, data_test)
# Build the default cross-validation folds at import time; the example loop at the end of the file uses them.
data_train, data_test=cv()
def _single_value(cell):
    """Return the one value held by a scalar or a single-element container."""
    values = np.asarray(cell).ravel()
    if values.size != 1:
        raise TypeError("expected exactly one value, got %d" % values.size)
    return values[0]


def evaluate(predictions, data_test, predictedWeight='predWeight', verbose=True):
    """Compute the RMSE of predicted weights against held-out test rows.

    Args:
        predictions: DataFrame holding the predicted weights. Rows are looked
            up by position ``ID - 1``, so IDs are assumed to be the 1-based
            row positions of this frame.
        data_test: Sequence of held-out rows. Each item must support
            ``.loc['Weight']`` and ``.loc['ID']`` and yield exactly one value
            for each (a row Series or a one-column DataFrame).
        predictedWeight: Name of the column in ``predictions`` to score.
        verbose: When true (the default), print every test/predicted pair and
            the running squared error. Pass False to silence the output.

    Returns:
        The root mean squared error as a scalar, or NaN when ``data_test``
        is empty.

    Raises:
        TypeError: If a 'Weight' or 'ID' lookup yields anything other than
            exactly one value.
    """
    n = len(data_test)
    if n == 0:
        # The RMSE of no samples is undefined; report NaN instead of failing on 0/0.
        return np.nan
    predicted = predictions[predictedWeight]
    error = 0
    for row in data_test:
        # Extract the scalars explicitly rather than relying on the implicit
        # conversion of single-element Series.
        test_value = np.float64(_single_value(row.loc['Weight']))
        # No better lookup is available: the ID doubles as a 1-based row position.
        position = int(_single_value(row.loc['ID'])) - 1
        pred_value = predicted.iloc[position]
        if verbose:
            print("test:",test_value)
            print("pred:",pred_value)
        error += (test_value - pred_value)**2
        if verbose:
            print(error)
    return np.sqrt(error/n)
#1st example: fill the blanked weights with pandas' default interpolation and score each fold
# Iterate over the folds that were actually built instead of a hard-coded count of 10.
for i, (fold_train, fold_test) in enumerate(zip(data_train, data_test)):
    # fold_train is the same object as data_train[i], so the new column is stored in place
    fold_train['WeightInter'] = fold_train['Weight'].interpolate()
    print(fold_train)
    print("RMSE(",i,"):",evaluate(fold_train, fold_test,'WeightInter'))