diff --git a/is/kaggle/cv.py b/is/kaggle/cv.py
index 3d11bf3..acb1d68 100644
--- a/is/kaggle/cv.py
+++ b/is/kaggle/cv.py
@@ -7,26 +7,54 @@
     return(pd.read_csv(path, index_col='Date', parse_dates='Date'))
 
 data=read()
 
-cleanData=data[pd.notnull(data['Weight'])]
-def cv(data=cleanData, n_folds=10):
+def cv(data=data, n_folds=10):
     """split data in n_folds parts for cross validation
     """
-    kf=KFold(len(data), shuffle=True, n_folds=n_folds)
+
+    cleanData=data[pd.notnull(data['Weight'])]
+
+    kf=KFold(len(cleanData), shuffle=True, n_folds=n_folds)
     trainid=[]
     testid=[]
     for train, test in kf:
         trainid.append(train)
         testid.append(test)
 
-    data_train=[]
-    data_test=[]
+    data_test=[]
+    data_train=[]
     for i in range(n_folds):
-        data_train.append(pd.DataFrame(data.iloc[j] for j in trainid[i]))
-        data_test.append(pd.DataFrame(data.iloc[j] for j in testid[i]))
+        data_train.append(data.copy())
+        data_test.append([])
+        for j in testid[i]:
+            data_test[i].append(pd.DataFrame(cleanData.iloc[j]))
+            #crazy hack, necessary ...
+            train=data_train[i][data_train[i]['ID']==cleanData.iloc[j]['ID']]
+            train['Weight']=float('NaN')
+            data_train[i][data_train[i]['ID']==cleanData.iloc[j]['ID']]=train
+            print(data_train[i][data_train[i]['ID']==cleanData.iloc[j]['ID']]['Weight'])
 
     return (data_train,data_test)
 
 data_train, data_test=cv()
 
-#print(data_test[0])
+def evaluate(predictions, data_test, predictedWeight='predWeight'):
+    """calcs the rmse on the testdata"""
+    n=len(data_test)
+    error=0
+    for i in range(n):
+        test_value=np.float64(data_test[i].loc['Weight'])
+        #no better idea...
+        pred_value=predictions.iloc[int(data_test[i].loc['ID'])-1][predictedWeight]
+        print("test:",test_value)
+        print("pred:",pred_value)
+        error+= (test_value - pred_value)**2
+    print(error)
+    return(np.sqrt(error/n))
+
+#1st example
+for i in range(10):
+    data_train[i]['WeightInter'] = data_train[i]['Weight'].interpolate()
+    print(data_train[i])
+    print("RMSE(",i,"):",evaluate(data_train[i], data_test[i],'WeightInter'))
+