# Newer
# Older
# abgabensammlungSS15 / is / kaggle / cvMaxRegressionWorking.py
# @MaxXximus92 MaxXximus92 on 8 Jun 2015 4 KB ea is
#including cv
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn import linear_model
from collections import Counter

def read(path="./datasets/train.csv"):
    """Load the CSV at *path* into a DataFrame indexed by its 'Date' column.

    Args:
        path: location of the CSV file; it must contain a 'Date' column.

    Returns:
        A DataFrame whose index is the 'Date' column parsed as datetimes.
    """
    # parse_dates takes a bool or a list of columns; a bare column name
    # string is not an accepted value.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])

# Load the training set, then discard the columns at positions 1, 4, 5, 6 and 7.
data = read()
data.drop(labels=data.columns[[1, 4, 5, 6, 7]], axis=1, inplace=True)


def cv(data=data, n_folds=2):
    """Split *data* into ``n_folds`` train/test folds for cross validation.

    Only rows with a known 'Weight' can be held out. For every fold the
    training frame is a full copy of *data* in which the 'Weight' of each
    row sharing an 'ID' with a held-out row is set to NaN.

    Args:
        data: DataFrame with at least 'Weight' and 'ID' columns. The default
            is the module-level frame as bound when this function was defined.
        n_folds: number of folds handed to ``KFold``.

    Returns:
        A tuple ``(data_train, data_test)`` of two lists with one entry per
        fold. ``data_train[i]`` is the masked copy of *data*;
        ``data_test[i]`` is a list holding one single-column DataFrame per
        held-out row (the row's fields are on that frame's index).
    """
    cleanData = data[pd.notnull(data['Weight'])]

    kf = KFold(len(cleanData), shuffle=True, n_folds=n_folds)

    data_train = []
    data_test = []
    for _, test_positions in kf:
        # Keep every held-out row as its own one-column frame; evaluate()
        # reads the fields through ``.loc['Weight']`` / ``.loc['ID']``.
        data_test.append(
            [pd.DataFrame(cleanData.iloc[j]) for j in test_positions]
        )

        # Hide the held-out weights with one mask per fold rather than
        # filtering, editing and writing back a copy for each single row.
        held_out_ids = cleanData.iloc[test_positions]['ID'].dropna()
        fold_train = data.copy()
        hidden = fold_train['ID'].isin(held_out_ids)
        fold_train.loc[hidden, 'Weight'] = float('NaN')
        data_train.append(fold_train)
    return (data_train, data_test)

# Build the cross-validation folds once at import time, using cv's defaults
# (the module-level `data` and n_folds=2).
data_train, data_test=cv()

def evaluate(predictions, data_test, predictedWeight='predWeight'):
    """Compute the root-mean-square error of the predictions on held-out rows.

    Args:
        predictions: DataFrame holding the predicted weights in column
            ``predictedWeight``. The row for a held-out entry is looked up
            by position ``ID - 1``, i.e. IDs are treated as 1-based row
            numbers of ``predictions``.
        data_test: sequence of held-out rows. Each entry must expose 'Weight'
            and 'ID' through ``.loc`` (for example the single-column frames
            built by ``cv``).
        predictedWeight: name of the prediction column.

    Returns:
        The RMSE as a float, or NaN when ``data_test`` is empty.
    """
    n = len(data_test)
    if n == 0:
        # Nothing to score; avoid dividing by zero below.
        return float('nan')

    def _scalar(value):
        # ``.loc[label]`` on a one-column frame yields a one-element Series;
        # pull out its single value explicitly instead of relying on an
        # implicit Series -> float/int conversion.
        return np.asarray(value).ravel()[0]

    squared_error = 0.0
    for row in data_test:
        test_value = np.float64(_scalar(row.loc['Weight']))
        position = int(_scalar(row.loc['ID'])) - 1
        pred_value = predictions.iloc[position][predictedWeight]
        squared_error += (test_value - pred_value) ** 2
    return np.sqrt(squared_error / n)

#1st example
def interpol(data):
    """Return the 'Weight' column of *data* with its gaps interpolated.

    Uses the default settings of ``Series.interpolate``; *data* itself is
    not modified.
    """
    weight_series = data['Weight']
    return weight_series.interpolate()

def calorieBased(data):
    """Fill missing 'Calories' values with random draws and return *data*.

    Every NaN in ``data['Calories']`` is replaced by a sample from a normal
    distribution whose mean and standard deviation are those of the observed
    calories. Samples come from NumPy's global random state and are assigned
    to the gaps in row order. The frame is modified in place.

    Args:
        data: DataFrame with a numeric 'Calories' column.

    Returns:
        The same DataFrame object, with 'Calories' free of gaps (unless the
        standard deviation itself is undefined).
    """
    calMean = data['Calories'].mean()
    calSTD = data['Calories'].std()

    # Work on a positional array so the frame's index type does not matter.
    filled = np.array(data['Calories'], dtype=float)
    missing = np.isnan(filled)
    if not missing.any():
        # Nothing to impute; leave the column (and its dtype) untouched.
        return data

    nans = int(missing.sum())
    filled[missing] = calSTD * np.random.randn(nans) + calMean
    data['Calories'] = filled
    return data

   # for i in range(len(data)):
    #    if i==0:
    #        data['Weight'][0]=data['Weight'].mean()
    #    elif np.isnan(data['Weight'][i]):
    #        data['Weight',i]=data['Weight'][i-1]+(np.mean(data['Calories'][i-5:i])-calMean)/10

    #return(data['Weight'])
def applyRegression(data,lmlinear,n):
    """Fill missing weights backwards in time with a fitted regression model.

    Walking from the end of the series towards its start, every NaN weight
    at position ``i >= n`` is set to ``weights[i + 1] - prediction``. The
    prediction comes from ``lmlinear`` fed with the mean-centred calories of
    rows ``i-n+1 .. i`` (newest first) followed by the row's ID. Afterwards
    the frame is printed and written to ``data.csv``.

    Args:
        data: DataFrame with 'Weight', 'Calories' and 'ID' columns; its
            'Weight' column is overwritten with the filled values.
        lmlinear: fitted model exposing ``predict``.
        n: number of calorie values per feature vector.
    """
    weights = np.array(data['Weight'])
    calories = np.array(data['Calories'])
    ids = np.array(data['ID'])
    centered = calories - data['Calories'].mean()

    # The last row has no successor to anchor on, so it can never be filled
    # here (the real data always ends with a known weight). Starting one row
    # earlier also avoids indexing past the end when it is NaN.
    for i in range(len(weights) - 2, n - 1, -1):
        if not np.isnan(weights[i]):
            continue
        features = np.append(centered[i - n + 1:i + 1][::-1], ids[i])
        # predict() takes a 2-D (samples, features) array and may hand back
        # a nested array; reduce the single prediction to a scalar.
        pred = np.ravel(lmlinear.predict(features.reshape(1, -1)))[0]
        weights[i] = weights[i + 1] - pred
    data['Weight'] = weights
    print(data)

    data.to_csv(path_or_buf='data.csv')
    
    
def procRegression(data,n):
    """Fit a linear model predicting the day-to-day weight change.

    A training sample is built for every position ``i >= n`` where the last
    ``n`` calorie values (rows ``i-n+1 .. i``) and both ``weights[i]`` and
    ``weights[i-1]`` are known. Its features are those calories, centred on
    the column mean and ordered newest first, followed by the row's ID; its
    target is ``weights[i] - weights[i-1]``.

    Args:
        data: DataFrame with 'Weight', 'Calories' and 'ID' columns.
        n: number of calorie values per feature vector.

    Returns:
        The fitted ``linear_model.LinearRegression`` instance.
    """
    weights = np.array(data['Weight'])
    calories = np.array(data['Calories'])
    ids = np.array(data['ID'])
    calMean = data['Calories'].mean()
    center = np.vectorize(lambda x: x - calMean)

    trainX = []
    trainY = []
    for i in range(n, len(weights)):
        window = calories[i - n + 1:i + 1]
        # Skip positions whose calorie window has a gap.
        if np.isnan(window).any():
            continue
        # Both ends of the weight difference must be known.
        if np.isnan(weights[i]) or np.isnan(weights[i - 1]):
            continue
        trainY.append([weights[i] - weights[i - 1]])
        # Newest calorie value first, mean removed, ID appended as last feature.
        trainX.append(np.append(center(window[::-1]), ids[i]))

    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(trainX, trainY)
    return lmlinear
            
# Accumulators used only by the disabled cross-validation loop below.
# NOTE: `sum` shadows the built-in of the same name for the rest of the module.
rmse=[]
sum=0
n=0
# Disabled: score the interpolation baseline on each fold and collect the RMSEs.
#for i in range(10):
#    data_train[i]['predWeight'] = interpol(data_train[i])
#    rmse.append(evaluate(data_train[i], data_test[i]))
#    if(~np.isnan(rmse[i])):
#        n+=1
#        sum+=rmse[i]
#    print("RMSE(",i,"):",rmse[i])
nn=2# number of calorie values per feature vector
# Order matters: the model is fitted on the data while calorie gaps are still
# NaN; the gaps are filled with random values only afterwards, before the
# model is applied to fill in the missing weights.
lmlinear =procRegression(data, nn)
data=calorieBased(data)
applyRegression(data, lmlinear,nn)

#print("Mean RSME:",sum/n)
#print(data_train[1])