"""Cross-validation harness for weight-imputation strategies (including cv).

The script hides a share of the known ``Weight`` values, lets a strategy
(``interpol`` or ``calorieBased``) predict them, and reports the RMSE per fold.
"""
import numpy as np
import pandas as pd


def read(path="./datasets/train.csv"):
    """Load the training CSV, indexed by its parsed ``Date`` column.

    Args:
        path: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        A DataFrame whose index is the parsed ``Date`` column.
    """
    # ``parse_dates`` must be a list of column names; a bare string is rejected.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])


def _kfold_test_indices(n_samples, n_folds):
    """Return one array of shuffled test positions per fold."""
    # scikit-learn is imported lazily: only ``cv`` needs it, and the import
    # location differs between old and current releases.
    try:
        from sklearn.model_selection import KFold
    except ImportError:  # scikit-learn < 0.18 only ships the legacy module
        from sklearn.cross_validation import KFold
        legacy = KFold(n_samples, shuffle=True, n_folds=n_folds)
        return [test for _, test in legacy]
    splitter = KFold(n_splits=n_folds, shuffle=True)
    return [test for _, test in splitter.split(np.arange(n_samples))]


def cv(data=None, n_folds=10):
    """Split data in n_folds parts for cross validation.

    Only rows with a known ``Weight`` can be held out. For every fold the
    training frame is a copy of ``data`` in which the held-out rows (matched by
    their ``ID``) have ``Weight`` set to NaN.

    Args:
        data: Frame with at least ``ID`` and ``Weight`` columns. When omitted,
            the default training file is loaded with ``read()``.
        n_folds: Number of folds.

    Returns:
        ``(data_train, data_test)``: ``data_train[i]`` is the masked training
        frame of fold ``i``; ``data_test[i]`` is a list with one single-column
        DataFrame per held-out row (row fields are the index labels).
    """
    if data is None:
        data = read()

    clean = data[pd.notnull(data['Weight'])]

    data_train = []
    data_test = []
    for test_positions in _kfold_test_indices(len(clean), n_folds):
        held_out = clean.iloc[test_positions]

        fold_train = data.copy()
        # One vectorised mask instead of rewriting the frame once per row.
        hidden = fold_train['ID'].isin(held_out['ID'])
        fold_train.loc[hidden, 'Weight'] = float('NaN')

        data_train.append(fold_train)
        data_test.append(
            [pd.DataFrame(held_out.iloc[k]) for k in range(len(held_out))]
        )
    return (data_train, data_test)


def _field(row, key):
    """Read one scalar field from a held-out row (Series or 1-column frame)."""
    value = row.loc[key]
    if isinstance(value, pd.Series):
        value = value.iloc[0]
    return value


def evaluate(predictions, data_test, predictedWeight='predWeight'):
    """Calculate the RMSE of the predictions on the held-out rows.

    Args:
        predictions: Frame holding the predicted values in column
            ``predictedWeight``.
        data_test: Held-out rows of one fold, as produced by ``cv``.
        predictedWeight: Name of the prediction column.

    Returns:
        The root mean squared error as a float; NaN when ``data_test`` is
        empty or any involved prediction is NaN.
    """
    n = len(data_test)
    if n == 0:
        return float('nan')

    if 'ID' in predictions.columns:
        # Match on ID, the same key ``cv`` uses to hide rows, so the result
        # does not depend on the row order of ``predictions``.
        by_id = (
            predictions.drop_duplicates(subset='ID')
            .set_index('ID')[predictedWeight]
        )

        def lookup(row_id):
            return by_id.loc[row_id]
    else:
        # Without an ID column fall back to treating IDs as 1-based positions.
        def lookup(row_id):
            return predictions.iloc[row_id - 1][predictedWeight]

    error = 0.0
    for row in data_test:
        test_value = float(_field(row, 'Weight'))
        pred_value = float(lookup(int(_field(row, 'ID'))))
        error += (test_value - pred_value) ** 2
    return float(np.sqrt(error / n))


# 1st example
def interpol(data):
    """Predict missing weights by interpolating between known ones."""
    return data['Weight'].interpolate()


def calorieBased(data):
    """Predict missing weights from the calorie intake of the previous days.

    Missing ``Calories`` are replaced by random draws from a normal
    distribution with the column's mean and standard deviation. Each missing
    weight is then the previous weight plus a tenth of the difference between
    the mean calories of up to five preceding rows and the overall mean.

    ``data`` is modified in place: both ``Calories`` and ``Weight`` are
    overwritten with their filled versions.

    Args:
        data: Frame with ``Calories`` and ``Weight`` columns.

    Returns:
        The filled ``Weight`` column of ``data``.
    """
    calories = data['Calories']
    cal_mean = calories.mean()
    cal_std = calories.std()
    if np.isnan(cal_std):
        # Fewer than two observations: no spread can be estimated.
        cal_std = 0.0

    # Generate random calorie values and use them in place of the NaN values.
    missing = calories.isna()
    n_missing = int(missing.sum())
    filled = calories.astype(float)
    if n_missing:
        filled.loc[missing] = cal_std * np.random.randn(n_missing) + cal_mean
    data['Calories'] = filled

    cal_values = filled.to_numpy(dtype=float)
    weights = data['Weight'].to_numpy(dtype=float, copy=True)

    # The recurrence needs a starting point; an observed first weight is kept.
    if len(weights) and np.isnan(weights[0]):
        weights[0] = data['Weight'].mean()

    for i in range(1, len(weights)):
        if np.isnan(weights[i]):
            # Clamp the window start: a negative start would select nothing.
            window = cal_values[max(0, i - 5):i]
            weights[i] = weights[i - 1] + (window.mean() - cal_mean) / 10

    data['Weight'] = weights
    return data['Weight']


def main():
    """Run the interpolation baseline on every fold and print the RMSEs."""
    data_train, data_test = cv()

    rmse = []
    total = 0
    valid = 0
    for i, (train, test) in enumerate(zip(data_train, data_test)):
        train['predWeight'] = interpol(train)
        rmse.append(evaluate(train, test))
        # Folds whose held-out rows cannot be predicted yield NaN; skip them.
        if not np.isnan(rmse[i]):
            valid += 1
            total += rmse[i]
        print("RMSE(", i, "):", rmse[i])

    print("Mean RSME:", total / valid if valid else float('nan'))
    # print(data_train[1])
    calorieBased(data_train[1])


if __name__ == "__main__":
    main()