# Newer
# Older
# abgabensammlungSS15 / is / kaggle / cv.py
# @Jan-Peter Hohloch Jan-Peter Hohloch on 6 Jun 2015 1 KB IS: add mean to cv
#including cv
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold

def read(path="./datasets/train.csv"):
    """Load a dataset CSV into a DataFrame indexed by its parsed 'Date' column.

    Args:
        path: file path (or any other source ``pandas.read_csv`` accepts).

    Returns:
        DataFrame whose index is the 'Date' column parsed as datetimes.
    """
    # parse_dates has to be a bool or a list of columns; a bare string is
    # rejected by current pandas versions.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])

# Load the training data once at import time; cv() binds this frame as the
# default value of its `data` parameter.
data=read()

def cv(data=data, n_folds=10):
    """Split data into n_folds train/test pairs for cross validation.

    Only rows with a non-null 'Weight' are distributed over the folds. For
    every fold the training frame is a full copy of ``data`` in which the
    'Weight' of the held-out rows (matched through their 'ID') is replaced
    by NaN.

    Args:
        data: DataFrame with at least the columns 'ID' and 'Weight'.
            Defaults to the module-level ``data`` as bound at definition time.
        n_folds: number of folds to create.

    Returns:
        Tuple ``(data_train, data_test)`` of two lists with one entry per
        fold. ``data_train[i]`` is the masked copy of ``data``;
        ``data_test[i]`` is a list with one single-column DataFrame per
        held-out row (the row's fields form the index).
    """
    cleanData = data[pd.notnull(data['Weight'])]

    # Legacy KFold API as imported by this file: the first argument is the
    # number of samples and iterating yields (train, test) position arrays.
    # No random_state is passed, so the shuffled folds differ between runs.
    kf = KFold(len(cleanData), shuffle=True, n_folds=n_folds)

    data_train = []
    data_test = []
    for _, test_positions in kf:
        # One single-column frame per held-out row, as expected by evaluate().
        data_test.append(
            [pd.DataFrame(cleanData.iloc[j]) for j in test_positions]
        )

        # Blank out the weights of all held-out rows in one vectorized step
        # instead of re-assigning the matching rows one ID at a time.
        held_out_ids = cleanData.iloc[test_positions]['ID']
        fold_train = data.copy()
        fold_train.loc[fold_train['ID'].isin(held_out_ids), 'Weight'] = np.nan
        data_train.append(fold_train)
    return (data_train, data_test)

# Build the folds once at import time using cv()'s defaults (10 folds on the
# module-level data).
data_train, data_test=cv()

def _first_scalar(value):
    """Return value as a float, unwrapping a one-element Series/array if needed."""
    return float(np.asarray(value, dtype=float).ravel()[0])


def evaluate(predictions, data_test, predictedWeight='predWeight'):
    """Compute the root-mean-square error of predictions on held-out rows.

    Args:
        predictions: DataFrame holding the predicted weights in the column
            named by ``predictedWeight``. The prediction for a held-out row
            is read at row position ``ID - 1``, i.e. this assumes IDs are
            1-based and follow the row order of ``predictions``.
        data_test: list of held-out rows; each entry must support
            ``.loc['Weight']`` and ``.loc['ID']`` (a Series or a
            single-column DataFrame).
        predictedWeight: name of the column containing the predictions.

    Returns:
        The RMSE as a float. NaN if ``data_test`` is empty or if any
        involved value is NaN.
    """
    n = len(data_test)
    if n == 0:
        # Nothing to compare against; avoid dividing by zero.
        return float('nan')

    # Select the prediction column once instead of once per test row.
    predicted = predictions[predictedWeight]
    squared_error = 0.0
    for row in data_test:
        # Extract plain scalars explicitly rather than relying on implicit
        # conversion of one-element Series.
        test_value = _first_scalar(row.loc['Weight'])
        row_position = int(_first_scalar(row.loc['ID'])) - 1
        pred_value = predicted.iloc[row_position]
        squared_error += (test_value - pred_value) ** 2
    return float(np.sqrt(squared_error / n))

#1st example: fill the blanked-out weights with pandas' default interpolation
# and report the RMSE per fold plus the mean over all folds with a valid score.
rmse=[]
rmse_sum=0  # not named `sum`, which would shadow the builtin
n=0
for i, (fold_train, fold_test) in enumerate(zip(data_train, data_test)):
    fold_train['WeightInter'] = fold_train['Weight'].interpolate()
    rmse.append(evaluate(fold_train, fold_test, 'WeightInter'))
    # Folds whose score is NaN are left out of the mean.
    if not np.isnan(rmse[i]):
        n+=1
        rmse_sum+=rmse[i]
    print("RMSE(",i,"):",rmse[i])

# Guard against the case that no fold produced a valid score.
print("Mean RMSE:",rmse_sum/n if n else float('nan'))