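# Page residue from the repository web view ("Newer" / "Older" navigation links).
# File: abgabensammlungSS15 / is / kaggle / interpolateOnRegression.py
# Commit: @MaxXximus92 MaxXximus92 on 14 Jun 2015, 9 KB, "is ub 5 interpol regression"
# Including cross-validation (CV).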
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn import linear_model
from collections import Counter
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

def read(path="./datasets/train.csv"):
    """Load a CSV file into a DataFrame indexed by its parsed ``Date`` column.

    path: location of the CSV file; it must contain a ``Date`` column.

    Returns the file contents as a DataFrame whose index is the ``Date``
    column converted to datetimes.
    """
    # ``parse_dates`` must be a bool, a list or a dict: current pandas rejects
    # the bare string 'Date' with a TypeError, so the column is named in a list.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])

# Load the training data once when the module is executed. The resulting
# module-level frame is the default argument of cv() and is used again by the
# script section at the bottom of the file.
data=read()
# Disabled experiments: dropping some columns / the first 180 rows, debug print.
#data.drop(data.columns[[1,4,5,6,7]],axis=1,inplace=True)
#data.drop(data.index[range(0,180)])
#print(data)

def cv(data=data, n_folds=10):
    """Split the labelled rows of ``data`` into ``n_folds`` cross-validation folds.

    Only rows with a non-null ``Weight`` can become test rows. For every fold
    the held-out rows stay in the training frame, but their ``Weight`` is
    blanked (set to NaN) so that a model has to predict it.

    data: DataFrame with at least the columns ``ID`` and ``Weight``. Defaults
        to the module-level ``data`` (bound when ``cv`` is defined).
    n_folds: number of folds.

    Returns a tuple ``(data_train, data_test)`` of two lists with one entry
    per fold: ``data_train[i]`` is a copy of ``data`` with the fold's test
    weights blanked, ``data_test[i]`` is a list holding one single-column
    DataFrame per held-out row.
    """
    cleanData = data[pd.notnull(data['Weight'])]

    kf = KFold(len(cleanData), shuffle=True, n_folds=n_folds)

    data_train = []
    data_test = []
    # Only the test positions of each split are needed; they index cleanData.
    for _, test_positions in kf:
        # Keep the layout evaluate() reads: one single-column DataFrame per row.
        data_test.append([pd.DataFrame(cleanData.iloc[j]) for j in test_positions])

        # Blank all held-out weights of this fold in one assignment instead of
        # copying, editing and writing back the matching rows one ID at a time.
        fold_train = data.copy()
        held_out_ids = cleanData['ID'].iloc[test_positions]
        is_held_out = fold_train['ID'].isin(held_out_ids).values
        fold_train.loc[is_held_out, 'Weight'] = float('NaN')
        data_train.append(fold_train)
    return (data_train, data_test)



def _first_value(cell):
    """Return a scalar, or the first element of a one-element container, as float."""
    return float(np.asarray(cell, dtype=float).ravel()[0])


def evaluate(predictions, data_test, predictedWeight='predWeight'):
    """Return the root-mean-square error of ``predictions`` on the held-out rows.

    predictions: DataFrame holding the predicted weights in the column named
        by ``predictedWeight``. The prediction for a test row is read at
        position ``ID - 1``, so the frame must be ordered by ID starting at 1.
    data_test: sequence of held-out rows; each is a Series or a single-column
        DataFrame labelled with at least ``ID`` and ``Weight``.
    predictedWeight: name of the prediction column.

    Returns the RMSE as a scalar float, or NaN when ``data_test`` is empty
    (the original raised ZeroDivisionError in that case).
    """
    n = len(data_test)
    if n == 0:
        return np.float64('nan')

    predicted = predictions[predictedWeight]
    error = 0.0
    for row in data_test:
        # Rows produced by cv() are single-column DataFrames, so ``.loc``
        # yields one-element Series; unwrap them explicitly instead of
        # relying on the deprecated implicit Series -> scalar conversion.
        test_value = _first_value(row.loc['Weight'])
        row_id = int(_first_value(row.loc['ID']))
        pred_value = predicted.iloc[row_id - 1]
        error += (test_value - pred_value) ** 2
    return np.sqrt(error / n)

#1st example
#===============================================================================
# def interpol(data):
#     return data['Weight'].interpolate()
#===============================================================================

def regOnInterpol(data):
    """Interpolate the ``Weight`` column and add a linear residual correction.

    Every second observed weight (the 2nd, 4th, ... non-NaN value) is held
    out and the remaining ones are joined with pandas' default linear
    interpolation. A linear regression of the hold-out residuals (observed
    minus interpolated weight) on ``ID`` is fitted, and its prediction is
    added to every row that has an observed weight.

    data: DataFrame with the columns ``ID`` and ``Weight``. It is not
        modified (the original blanked the held-out weights in place, which
        also removed them from the regression's training set).

    Returns a Series of weights that shares the index of ``data``. When fewer
    than two weights are observed there is nothing to hold out and the plain
    interpolation is returned.
    """
    ids = np.asarray(data['ID'], dtype=float)
    observed = np.array(data['Weight'], dtype=float)  # private copy of the column

    known_pos = np.flatnonzero(~np.isnan(observed))
    held_out_pos = known_pos[1::2]  # every second observed weight
    if len(held_out_pos) == 0:
        return data['Weight'].astype(float).interpolate()

    # Interpolate through the observed weights that were not held out.
    thinned = data['Weight'].astype(float)
    thinned.iloc[held_out_pos] = np.nan
    intWeights = thinned.interpolate()

    # Residual of each held-out weight against the interpolation at the very
    # same row (looked up by position, so it does not depend on how IDs are
    # numbered).
    residuals = observed[held_out_pos] - intWeights.values[held_out_pos]
    lmlinear = linear_model.LinearRegression()
    # scikit-learn expects 2-D inputs: one row per sample.
    lmlinear.fit(ids[held_out_pos].reshape(-1, 1), residuals.reshape(-1, 1))

    # NOTE(review): as in the original condition, the correction is added only
    # to rows that have an observed weight; rows that are pure interpolation
    # are returned uncorrected. Confirm that the condition was not meant to be
    # the other way round.
    correction = lmlinear.predict(ids[known_pos].reshape(-1, 1)).ravel()
    intWeights.iloc[known_pos] = intWeights.values[known_pos] + correction
    return intWeights
        
    
    

#===============================================================================
# def calorieBased(data):
#     calMean=data['Proteins'].mean()
#     calSTD=data['Proteins'].std()
#     #fill with random data for nan-values
#     #data['Calories']=data['Calories'].fillna(np.random.normal(loc=calMean,scale=calSTD,size=len(data['Calories'])-data['Calories'].count()))
#     nans=len(data['Proteins'])-data['Proteins'].count()
#     dfrand = calSTD*np.random.randn(nans)+calMean
#     
#     #data['Proteins',np.isnan(data['Proteins'])]= dfrand[np.isnan(data['Proteins'])]; #Erzeuge zufaellige kalorienwerte, ersetze sie durch nan werte.
#     a=[]
#     c=0
#     for i in range(len(data['Proteins'])):
#         if np.isnan(data['Proteins'][i]):
#         
#             a.append(dfrand[c])
#             c+=1       
#         else:
#             a.append(data['Proteins'][i])     
#     data['Proteins']=a
# 
#     
#     return data
#===============================================================================

   # for i in range(len(data)):
    #    if i==0:
    #        data['Weight'][0]=data['Weight'].mean()
    #    elif np.isnan(data['Weight'][i]):
    #        data['Weight',i]=data['Weight'][i-1]+(np.mean(data['Calories'][i-5:i])-calMean)/10

    #return(data['Weight'])
#===============================================================================
# def applyRegression(data,lmlinear,n):  
#     weights= np.array(data['Weight']);
#     calories=np.array(data['Calories']);
#     ids = np.array(data['ID'])
#     calMean=data['Calories'].mean()
#     f = np.vectorize(lambda x:x-calMean)
#     if np.isnan(weights[-1]):
#         weights[-1]=weights.mean();
#     
#     for i in range(len(weights)-2,-1+n,-1): #letztes elemente muss ein gewicht haben (wie in den echten daten)
#             if np.isnan(weights[i]):
#                     pred=lmlinear.predict(np.append(f((calories[i-n+1:i+1])[::-1]),ids[i]))
#                     weights[i]= weights[i+1]-pred
#     data['Weight']= weights
#===============================================================================
    #print(weights)
   # print(data)
    
    #data.to_csv(path_or_buf='data.csv')
    #return weights
    
    
#===============================================================================
# def procRegression(data,n):
#     weights= np.array(data['Weight']);
#     calories=np.array(data['Calories']);
#     ids = np.array(data['ID'])
#     counter =0
#     trainX=[]
#     trainY=[]
#     calMean=data['Calories'].mean()
#     f = np.vectorize(lambda x:x-calMean)
#     for i in range(n,len(weights)):
#         for j in range(0,n): # letzten n calorien nicht nan
#             if(np.isnan(calories[i-j])):
#                 counter =0
#                 break
#             else:
#                 counter +=1                  
#         if((counter == n) & (not np.isnan(weights[i]))&(not np.isnan(weights[i-1]))):
#            
#             trainY.append([weights[i]-weights[i-1]])
#             
#             tr= np.append(f((calories[i-n+1:i+1])[::-1]),ids[i])
#             trainX.append(tr)#reverse f, mittelwertbefreit, vllt calorien gewichten
#             #print((calories[i-n+1:i+1])[::-1])
#         counter =0
#     #print(trainX)    
#     lmlinear = model = make_pipeline(PolynomialFeatures(100), Ridge())
#     lmlinear.fit(trainX,trainY)     
#     return lmlinear     
#===============================================================================

#===============================================================================
# def applyRegressionWeightsOnly(data):
#     regdata=data[data['Weight'].apply(lambda x: not(np.isnan(x)))]
#     trainY= regdata['Weight']
#     trainY= np.reshape(trainY,(len(trainY),1))
#     trainX= regdata['ID']
#     trainX= np.reshape(trainX,(len(trainX),1))
#     lmlinear = make_pipeline(PolynomialFeatures(1000), Ridge())
#     #lmlinear = LogisticRegression()
#   #  print(lmlinear.score(trainX, trainY))# fehler bei train daten (soll eig nicht verwendet werden)
#     
#     
#     lmlinear.fit(trainX,trainY) 
#     weights= np.array(data['Weight']);
#     ids=np.array(data['ID']);
#     for i in range(len(weights)):
#         if(np.isnan(weights[i])):
#             weights[i]= lmlinear.predict(ids[i])
#     #data['Weight']=weights
#     #print(data)        
#     #data.to_csv(path_or_buf='dataW.csv')   
#     return weights  
# 
# def procweightAndCalories(data):
#     data=data[data['Weight'].apply(lambda x: not(np.isnan(x)))&data['Proteins'].apply(lambda x: not(np.isnan(x)))]
#     weights= np.array(data['Weight'])
#     calories=np.array(data['Proteins'])
#     ids = np.array(data['ID'])
#     ids= np.delete(ids,len(ids)-1,0)
#     weights=np.delete(weights,len(weights)-1,0)
#     calMean=data['Proteins'].mean()
#     f = np.vectorize(lambda x:x-calMean)
#     calories=np.delete(calories,0,0)
#     calories= f(calories)
#     trainX= np.vstack((ids,calories))
#     trainX= trainX.T #np.reshape(trainX,(len(trainX),1))
#     trainY=weights
#     np.reshape(weights,(len(weights),1))
#     #print(trainX)    
#     lmlinear  = make_pipeline(PolynomialFeatures(2), Ridge())
#    # lmlinear = linear_model.LinearRegression()
#     lmlinear.fit(trainX,trainY)    
#     return lmlinear   
# def applyRegressionWeightCalories(data,lmlinear):  
#     weights= np.array(data['Weight']);
#     calories=np.array(data['Proteins']);
#     ids = np.array(data['ID'])
#     calMean=data['Proteins'].mean()
#     f = np.vectorize(lambda x:x-calMean)
#     calories=f(calories)
#     if np.isnan(weights[-1]):
#         weights[-1]=weights.mean();
#     for i in range(len(weights)-2,0,-1): #letztes elemente muss ein gewicht haben (wie in den echten daten)
#             if np.isnan(weights[i]):
#                     pred=lmlinear.predict([ids[i],calories[i-1]])
#                     weights[i]= pred
#     data['Weight']= weights
#     #print(weights)
#    # print(data)
#     
#     #data.to_csv(path_or_buf='data.csv')
#     return weights
#===============================================================================
    
    
# Cross-validate regOnInterpol: predict the blanked weights of every fold and
# report the RMSE per fold as well as the mean over all folds.
data_train, data_test = cv()
print("CV abgeschlossen")
#data_train.to_csv(path_or_buf='data_train.csv')
#data_test.to_csv(path_or_buf='data_test.csv')
#print("a")

rmse = []
rmse_total = 0  # not named ``sum``: that would shadow the builtin
n = 0  # number of folds with a usable (non-NaN) RMSE

# Iterate over the folds cv() actually returned instead of a hard-coded 10.
for i in range(len(data_train)):
    data_train[i]['predWeight'] = regOnInterpol(data_train[i])
    rmse.append(evaluate(data_train[i], data_test[i]))
    # Folds whose RMSE is NaN are reported but left out of the mean.
    if not np.isnan(rmse[i]):
        n += 1
        rmse_total += rmse[i]
    print("RMSE(",i,"):",rmse[i])
# Without a single usable fold the mean is undefined: report NaN instead of
# dividing by zero.
mean_rmse = rmse_total / n if n else float('nan')
print("Mean RSME:", mean_rmse)


# Replace the Weight column of the full training frame with the output of
# regOnInterpol, then make ID the index so the frame can be joined by ID.
data['Weight'] = regOnInterpol(data)
data = data.set_index('ID')

# Look up the weight for every row of the test file (join on its ID column)
# and write the two submission columns without the frame's index.
test = pd.read_csv("./datasets/test.csv")
predictions = test.join(data, on='ID')

submission_columns = ['ID', 'Weight']
predictions[submission_columns].to_csv(
    'sampleSubmission.csv',
    header=submission_columns,
    index_label=False,
    index=False,
)

#nn=2#menger der kalorien elemente
#lmlinear =procRegression(data, nn)
#data=calorieBased(data)
#applyRegression(data, lmlinear,nn)


#print(data_train[1])