# Source: abgabensammlungSS15 / is / kaggle / present.py
# Commit: "IS: add code for presenting" by Jan-Peter Hohloch, 15 Jun 2015 (6 KB)
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

def read(path="./datasets/train.csv"):
    """Load a CSV file into a DataFrame indexed by its parsed ``Date`` column.

    Args:
        path: location of the CSV file; it must contain a ``Date`` column.

    Returns:
        DataFrame whose index is the ``Date`` column parsed as dates.
    """
    # ``parse_dates`` expects a bool, list or dict; a bare column-name string
    # is rejected by current pandas, so the name is passed as a list.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])

# Load the training data once at import time; cv() and dataDiff() below use
# this frame as their default ``data`` argument.
data = read()

def cv(data=data, n_folds=10):
    """Split ``data`` into ``n_folds`` train/test pairs for cross validation.

    Only rows with a known ``Weight`` can be held out. For every fold the
    training frame is a full copy of ``data`` in which the ``Weight`` of each
    row whose ``ID`` equals the ``ID`` of a held-out row is set to NaN.

    Args:
        data: DataFrame with at least the columns ``ID`` and ``Weight``.
        n_folds: number of folds to create.

    Returns:
        Tuple ``(data_train, data_test)`` of two lists with one entry per
        fold. ``data_train[i]`` is the masked copy of ``data``;
        ``data_test[i]`` is a list holding one single-column DataFrame per
        held-out row.
    """
    clean_data = data[pd.notnull(data['Weight'])]

    # KFold is used with the signature of the sklearn.cross_validation module
    # imported by this file: iterating it yields (train, test) arrays of row
    # positions into clean_data. Only the test positions are needed here.
    kf = KFold(len(clean_data), shuffle=True, n_folds=n_folds)
    test_positions = [test for _, test in kf]

    data_train = []
    data_test = []
    for i in range(n_folds):
        fold_train = data.copy()
        fold_test = []
        for j in test_positions[i]:
            held_out = clean_data.iloc[j]
            fold_test.append(pd.DataFrame(held_out))
            # Blank the weight directly through .loc instead of copying the
            # matching rows, editing the copy and writing the rows back.
            fold_train.loc[fold_train['ID'] == held_out['ID'], 'Weight'] = float('NaN')
        data_train.append(fold_train)
        data_test.append(fold_test)
    return (data_train, data_test)

# Build the cross-validation folds once at import time, using cv()'s defaults.
data_train, data_test = cv()

def evaluate(predictions, data_test, predictedWeight='predWeight'):
    """Compute the root mean squared error (RMSE) on the held-out rows.

    Args:
        predictions: DataFrame holding the predicted weights in the column
            named by ``predictedWeight``.
        data_test: sequence of held-out rows; every entry must support
            ``.loc['Weight']`` and ``.loc['ID']``.
        predictedWeight: name of the prediction column in ``predictions``.

    Returns:
        ``sqrt(sum((true - predicted) ** 2) / len(data_test))``, or NaN when
        ``data_test`` is empty.
    """
    n = len(data_test)
    if n == 0:
        # Without held-out rows the RMSE is undefined; report NaN instead of
        # failing with a ZeroDivisionError.
        return np.nan
    error = 0
    for i in range(n):
        row = data_test[i]
        test_value = np.float64(row.loc['Weight'])
        # The prediction is looked up at row position ID - 1.
        # NOTE(review): assumes IDs are 1-based and follow the row order of
        # ``predictions`` - confirm against the data set.
        pred_value = predictions.iloc[int(row.loc['ID']) - 1][predictedWeight]
        error += (test_value - pred_value) ** 2
    return np.sqrt(error / n)

######################### Approaches ##########################
def interpol(data):
    """Fill the gaps of the ``Weight`` column by interpolation alone.

    Uses the default settings of ``Series.interpolate`` and returns the
    filled series; ``data`` itself is not modified.
    """
    weight_column = data['Weight']
    filled = weight_column.interpolate()
    return filled

def calorieBased(data):
    """Fill missing weights from the previous weight and recent calories.

    A missing weight at position ``i`` becomes the previous weight plus one
    tenth of the difference between the mean calories of the (up to) five
    preceding rows and the overall calorie mean. Both the ``Calories`` and
    the ``Weight`` column of ``data`` are overwritten in place.

    Args:
        data: DataFrame with numeric ``Calories`` and ``Weight`` columns.

    Returns:
        The updated ``Weight`` column of ``data``.
    """
    if len(data) == 0:
        return data['Weight']

    cal_mean = data['Calories'].mean()
    cal_std = data['Calories'].std()
    # Missing calories are all replaced by the same single random draw.
    data['Calories'] = data['Calories'].fillna(
        np.random.normal(loc=cal_mean, scale=cal_std))

    # Work on positional arrays so the result does not depend on the index
    # type of ``data`` and every value is written to the cell it belongs to.
    calories = np.array(data['Calories'], dtype=float)
    weights = np.array(data['Weight'], dtype=float)

    # The series is seeded with the mean weight so that every later row has
    # a predecessor to build on.
    # NOTE(review): this overwrites the first weight even when it was
    # measured, exactly as before - confirm that this is intended.
    weights[0] = data['Weight'].mean()
    for i in range(1, len(weights)):
        if np.isnan(weights[i]):
            # Clamp the window start so the first rows use the calories that
            # exist instead of an empty slice.
            window = calories[max(0, i - 5):i]
            weights[i] = weights[i - 1] + (window.mean() - cal_mean) / 10

    data['Weight'] = weights
    return data['Weight']

def fillCalories(data):
    """Replace missing ``Calories`` values with random draws.

    Every missing entry receives its own sample from a normal distribution
    with the mean and standard deviation of the observed calories. The
    ``Calories`` column of ``data`` is overwritten in place.

    Args:
        data: DataFrame with a numeric ``Calories`` column.

    Returns:
        The same ``data`` object with the ``Calories`` column filled.
    """
    cal_mean = data['Calories'].mean()
    cal_std = data['Calories'].std()
    # Operate on a positional array: looking values up with ``series[i]``
    # would depend on the index type of ``data``.
    calories = np.array(data['Calories'], dtype=float)
    missing = np.isnan(calories)
    # One draw per gap, assigned in row order.
    calories[missing] = cal_std * np.random.randn(int(missing.sum())) + cal_mean
    data['Calories'] = calories
    return data

def procRegression(data,n=5):
    """Fit a linear model for the day-to-day weight change.

    A training sample is built for every position ``i >= n`` at which the
    last ``n`` calorie values (positions ``i-n+1 .. i``) and both weights
    ``i`` and ``i-1`` are known. Its target is ``weight[i] - weight[i-1]``;
    its features are those ``n`` calorie values, newest first and with the
    calorie mean subtracted, followed by the ``ID`` of row ``i``.

    Args:
        data: DataFrame with ``Weight``, ``Calories`` and ``ID`` columns.
        n: number of calorie values used per sample.

    Returns:
        The fitted ``LinearRegression`` model.
    """
    weight_arr = np.array(data['Weight'])
    calorie_arr = np.array(data['Calories'])
    id_arr = np.array(data['ID'])
    mean_calories = data['Calories'].mean()

    features = []
    targets = []
    for pos in range(n, len(weight_arr)):
        window = calorie_arr[pos - n + 1:pos + 1]
        # the last n calorie values must not be NaN
        if np.isnan(window).any():
            continue
        if np.isnan(weight_arr[pos]) or np.isnan(weight_arr[pos - 1]):
            continue
        targets.append([weight_arr[pos] - weight_arr[pos - 1]])
        # reversed and mean-centred; maybe weight the calories
        features.append(np.append(window[::-1] - mean_calories, id_arr[pos]))

    model = linear_model.LinearRegression()
    model.fit(features, targets)
    return model

def applyRegression(data,n=5):
    """Fill missing weights backwards with the calorie regression model.

    Missing calories are filled first (``fillCalories``), then the model from
    ``procRegression`` is fitted on the result. Walking from the end of the
    series towards position ``n``, every missing weight becomes the weight of
    the following row minus the model's prediction.

    Args:
        data: DataFrame with ``Weight``, ``Calories`` and ``ID`` columns.
        n: number of calorie values used per prediction.

    Returns:
        NumPy array of weights. Positions below ``n`` are never filled, and
        a missing last weight stays NaN because it has no successor.
    """
    data = fillCalories(data)
    model = procRegression(data, n)
    weights = np.array(data['Weight'])
    calories = np.array(data['Calories'])
    ids = np.array(data['ID'])
    cal_mean = data['Calories'].mean()
    # The last element must have a weight (as in the real data). The loop
    # therefore starts at the second-to-last row, so weights[i + 1] always
    # exists.
    # NOTE(review): procRegression trains on weight[i] - weight[i-1] with the
    # calorie window ending at i, while the window ending at i is used here
    # to step from i + 1 to i - confirm the intended alignment.
    for i in range(len(weights) - 2, n - 1, -1):
        if np.isnan(weights[i]):
            features = np.append(calories[i - n + 1:i + 1][::-1] - cal_mean, ids[i])
            # predict() takes a 2-D (samples, features) array and returns an
            # array; pull out the single predicted value.
            pred = np.ravel(model.predict(features.reshape(1, -1)))[0]
            weights[i] = weights[i + 1] - pred
    return weights

# giving up on a nice solution
def regOnInterpol(data):
    """Interpolate the weights and add a linear correction fitted on ``ID``.

    Steps, in order:
      1. every second known weight is set to NaN,
      2. the remaining weights are interpolated,
      3. a LinearRegression on ``ID`` is fitted to the difference between
         every second remaining known weight and the interpolation,
      4. the regression output is added to the interpolated value at every
         position whose weight is still known.

    NOTE(review): step 1 writes through ``data['Weight']``; whether ``data``
    itself changes (and therefore which rows steps 3 and 4 see) depends on
    pandas' copy semantics - confirm the intended behaviour.
    NOTE(review): integer keys such as ``weights[i]`` are used as row
    positions and ``intWeights[trainX[i]]`` uses an ID as key; this
    presumably assumes an integer-like index - verify against the caller.

    Args:
        data: DataFrame with ``ID`` and ``Weight`` columns.

    Returns:
        Series of interpolated weights with the correction added.
    """
    ids = data['ID']
    weights = data['Weight']
    counter = 0
    for i in range(len(weights)):
        if not np.isnan(weights[i]):  # blank every second non-NaN value
            counter += 1
            if counter % 2 == 0:
                weights[i] = np.nan
    intWeights = weights.interpolate()

    regdata = data[data['Weight'].apply(lambda x: not np.isnan(x))]
    trainY = regdata['Weight'][1::2]
    trainX = regdata['ID'][1::2]
    for i in range(len(trainX)):
        trainY[i] = trainY[i] - intWeights[trainX[i]]
    # Column vectors of shape (n_samples, 1); converted to plain arrays first
    # so the reshape does not depend on pandas' Series handling.
    trainY = np.asarray(trainY).reshape(len(trainY), 1)
    trainX = np.asarray(trainX).reshape(len(trainX), 1)
    # A plain linear model is used. The earlier polynomial pipeline line
    # referenced names this file never imports and was overwritten at once.
    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(trainX, trainY)
    weights2 = data['Weight']
    for i in range(len(intWeights)):
        if not np.isnan(weights2[i]):
            # predict() takes a 2-D array and returns an array; add the
            # single predicted value.
            correction = lmlinear.predict(np.reshape(ids[i], (1, 1)))
            intWeights[i] += np.ravel(correction)[0]
    return intWeights

def regOnInterpol2(data):
    """Interpolate the weights and add a linear model of ``diff`` over ``ID``.

    A LinearRegression is fitted on the rows whose ``diff`` value is known
    (``ID`` as the only feature, ``diff`` as target). ``data['diff']`` is then
    overwritten with the model's prediction for every row, and a new column
    ``data['inter']`` receives the interpolated weights.

    Args:
        data: DataFrame with ``ID``, ``Weight`` and ``diff`` columns.

    Returns:
        Series ``data['inter'] + data['diff']``.
    """
    known = np.array(data['diff'].notnull())
    all_ids = np.array(data['ID'])
    # sklearn expects a 2-D (samples, features) matrix for the inputs.
    trainX = all_ids[known].reshape(-1, 1)
    trainY = np.array(data['diff'])[known]

    # A plain linear model is used. The earlier polynomial pipeline line
    # referenced names this file never imports and was overwritten at once.
    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(trainX, trainY)
    data['diff'] = lmlinear.predict(all_ids.reshape(-1, 1))
    data['inter'] = data['Weight'].interpolate()
    return data['inter'] + data['diff']

def dataDiff(data=data):
    """Store the leave-one-out interpolation error of each known weight.

    For every row with a known weight, that weight is hidden, the series is
    interpolated, and the difference between the true and the interpolated
    value is recorded. Rows without a weight get NaN. The result is written
    to ``data['diff']`` in place; nothing is returned.

    Args:
        data: DataFrame with a numeric ``Weight`` column.
    """
    weights = np.array(data['Weight'], dtype=float)
    diffs = np.full(len(weights), np.nan)
    for i in range(len(weights)):
        if np.isnan(weights[i]):
            continue
        # Hide the value under test and interpolate across the gap.
        masked = weights.copy()
        masked[i] = np.nan
        diffs[i] = weights[i] - pd.Series(masked).interpolate().iloc[i]
    # Assign the whole column at once instead of writing single cells
    # through a chained ``data['diff'][i] = ...`` lookup.
    data['diff'] = diffs

# Run the selected approach on every cross-validation fold and report the
# per-fold RMSE plus the mean over all folds with a defined (non-NaN) RMSE.
rmse = []
rmse_total = 0
valid_folds = 0
for i in range(len(data_train)):
    data_train[i]['predWeight'] = interpol(data_train[i])  # change this line
    rmse.append(evaluate(data_train[i], data_test[i]))
    if not np.isnan(rmse[i]):
        valid_folds += 1
        rmse_total += rmse[i]
    print("RMSE(", i, "):", rmse[i])

# Avoid a ZeroDivisionError when no fold produced a usable RMSE.
print("Mean RMSE:", rmse_total / valid_folds if valid_folds else float('nan'))