diff --git a/is/kaggle/interpolateOnRegression.py b/is/kaggle/interpolateOnRegression.py
new file mode 100644
index 0000000..03a779d
--- /dev/null
+++ b/is/kaggle/interpolateOnRegression.py
@@ -0,0 +1,287 @@
+# including cross validation
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import KFold
+from sklearn import linear_model
+from collections import Counter
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.pipeline import make_pipeline
+from sklearn.linear_model import LogisticRegression
+
+
+def read(path="./datasets/train.csv"):
+    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])
+
+
+data = read()
+#data.drop(data.columns[[1,4,5,6,7]], axis=1, inplace=True)
+#data.drop(data.index[range(0,180)])
+#print(data)
+
+
+def cv(data=data, n_folds=10):
+    """Split the data into n_folds parts for cross validation."""
+    cleanData = data[pd.notnull(data['Weight'])]
+
+    kf = KFold(n_splits=n_folds, shuffle=True)
+
+    trainid = []
+    testid = []
+    for train, test in kf.split(cleanData):
+        trainid.append(train)
+        testid.append(test)
+
+    data_test = []
+    data_train = []
+    for i in range(n_folds):
+        data_train.append(data.copy())
+        data_test.append([])
+        for j in testid[i]:
+            data_test[i].append(pd.DataFrame(cleanData.iloc[j]))
+            # blank out the held-out weight in the training copy
+            data_train[i].loc[data_train[i]['ID'] == cleanData.iloc[j]['ID'], 'Weight'] = np.nan
+    return (data_train, data_test)
+
+
+def evaluate(predictions, data_test, predictedWeight='predWeight'):
+    """Calculate the RMSE on the test data."""
+    n = len(data_test)
+    error = 0
+    for i in range(n):
+        test_value = float(data_test[i].loc['Weight'].iloc[0])
+        # positional row lookup via the ID column; no better idea...
+        pred_value = predictions.iloc[int(data_test[i].loc['ID'].iloc[0]) - 1][predictedWeight]
+        error += (test_value - pred_value) ** 2
+    return np.sqrt(error / n)
+
+
+# 1st example
+#===============================================================================
+# def interpol(data):
+#     return data['Weight'].interpolate()
+#===============================================================================
+
+
+def regOnInterpol(data):
+    ids = data['ID']
+    weights = data['Weight']
+    counter = 0
+    for i in range(len(weights)):
+        if not np.isnan(weights.iloc[i]):  # hold out every second non-NaN value as pseudo test data
+            counter += 1
+            if counter % 2 == 0:
+                weights.iloc[i] = np.nan
+    data['Weight'] = weights  # write the held-out NaNs back explicitly
+    intWeights = weights.interpolate()
+
+    regdata = data[data['Weight'].apply(lambda x: not np.isnan(x))]
+    trainY = regdata['Weight'][1::2]
+    trainX = regdata['ID'][1::2]
+    for i in range(len(trainX)):
+        # residual between the observed weight and the interpolated value;
+        # the ID is used as a positional index into the interpolated series
+        trainY.iloc[i] = trainY.iloc[i] - intWeights.iloc[int(trainX.iloc[i])]
+    trainY = np.reshape(trainY, (len(trainY), 1))
+    trainX = np.reshape(trainX, (len(trainX), 1))
+    #lmlinear = make_pipeline(PolynomialFeatures(5), LinearRegression())  # overridden by the plain model below
+    lmlinear = linear_model.LinearRegression()
+    lmlinear.fit(trainX, trainY)
+    weights2 = data['Weight']
+    for i in range(len(intWeights)):
+        if not np.isnan(weights2.iloc[i]):
+            intWeights.iloc[i] += lmlinear.predict([[ids.iloc[i]]])[0, 0]
+    return intWeights
+
+
+#===============================================================================
+# def calorieBased(data):
+#     calMean = data['Proteins'].mean()
+#     calSTD = data['Proteins'].std()
+#     # fill NaN values with random data
+#     #data['Calories'] = data['Calories'].fillna(np.random.normal(loc=calMean, scale=calSTD, size=len(data['Calories'])-data['Calories'].count()))
+#     nans = len(data['Proteins']) - data['Proteins'].count()
+#     dfrand = calSTD*np.random.randn(nans) + calMean
+#
+#     #data['Proteins', np.isnan(data['Proteins'])] = dfrand[np.isnan(data['Proteins'])]  # generate random calorie values to replace the NaN values
+#     a = []
+#     c = 0
+#     for i in range(len(data['Proteins'])):
+#         if np.isnan(data['Proteins'][i]):
+#             a.append(dfrand[c])
+#             c += 1
+#         else:
+#             a.append(data['Proteins'][i])
+#     data['Proteins'] = a
+#
+#     return data
+#===============================================================================
+
+    # for i in range(len(data)):
+    #     if i == 0:
+    #         data['Weight'][0] = data['Weight'].mean()
+    #     elif np.isnan(data['Weight'][i]):
+    #         data['Weight', i] = data['Weight'][i-1] + (np.mean(data['Calories'][i-5:i]) - calMean)/10
+
+    #return(data['Weight'])
+#===============================================================================
+# def applyRegression(data, lmlinear, n):
+#     weights = np.array(data['Weight'])
+#     calories = np.array(data['Calories'])
+#     ids = np.array(data['ID'])
+#     calMean = data['Calories'].mean()
+#     f = np.vectorize(lambda x: x - calMean)
+#     if np.isnan(weights[-1]):
+#         weights[-1] = weights.mean()
+#
+#     for i in range(len(weights)-2, -1+n, -1):  # the last element must have a weight (as in the real data)
+#         if np.isnan(weights[i]):
+#             pred = lmlinear.predict(np.append(f((calories[i-n+1:i+1])[::-1]), ids[i]))
+#             weights[i] = weights[i+1] - pred
+#     data['Weight'] = weights
+#===============================================================================
+    #print(weights)
+    #print(data)
+
+    #data.to_csv(path_or_buf='data.csv')
+    #return weights
+
+
+#===============================================================================
+# def procRegression(data, n):
+#     weights = np.array(data['Weight'])
+#     calories = np.array(data['Calories'])
+#     ids = np.array(data['ID'])
+#     counter = 0
+#     trainX = []
+#     trainY = []
+#     calMean = data['Calories'].mean()
+#     f = np.vectorize(lambda x: x - calMean)
+#     for i in range(n, len(weights)):
+#         for j in range(0, n):  # the last n calorie values must not be NaN
+#             if np.isnan(calories[i-j]):
+#                 counter = 0
+#                 break
+#             else:
+#                 counter += 1
+#         if (counter == n) & (not np.isnan(weights[i])) & (not np.isnan(weights[i-1])):
+#             trainY.append([weights[i] - weights[i-1]])
+#             tr = np.append(f((calories[i-n+1:i+1])[::-1]), ids[i])
+#             trainX.append(tr)  # reversed and mean-centered; maybe weight the calories
+#             #print((calories[i-n+1:i+1])[::-1])
+#             counter = 0
+#     #print(trainX)
+#     lmlinear = model = make_pipeline(PolynomialFeatures(100), Ridge())
+#     lmlinear.fit(trainX, trainY)
+#     return lmlinear
+#===============================================================================
+
+#===============================================================================
+# def applyRegressionWeightsOnly(data):
+#     regdata = data[data['Weight'].apply(lambda x: not np.isnan(x))]
+#     trainY = regdata['Weight']
+#     trainY = np.reshape(trainY, (len(trainY), 1))
+#     trainX = regdata['ID']
+#     trainX = np.reshape(trainX, (len(trainX), 1))
+#     lmlinear = make_pipeline(PolynomialFeatures(1000), Ridge())
+#     #lmlinear = LogisticRegression()
+#     #print(lmlinear.score(trainX, trainY))  # error on the training data (not really meant to be used)
+#
+#     lmlinear.fit(trainX, trainY)
+#     weights = np.array(data['Weight'])
+#     ids = np.array(data['ID'])
+#     for i in range(len(weights)):
+#         if np.isnan(weights[i]):
+#             weights[i] = lmlinear.predict(ids[i])
+#     #data['Weight'] = weights
+#     #print(data)
+#     #data.to_csv(path_or_buf='dataW.csv')
+#     return weights
+#
+# def procweightAndCalories(data):
+#     data = data[data['Weight'].apply(lambda x: not np.isnan(x)) & data['Proteins'].apply(lambda x: not np.isnan(x))]
+#     weights = np.array(data['Weight'])
+#     calories = np.array(data['Proteins'])
+#     ids = np.array(data['ID'])
+#     ids = np.delete(ids, len(ids)-1, 0)
+#     weights = np.delete(weights, len(weights)-1, 0)
+#     calMean = data['Proteins'].mean()
+#     f = np.vectorize(lambda x: x - calMean)
+#     calories = np.delete(calories, 0, 0)
+#     calories = f(calories)
+#     trainX = np.vstack((ids, calories))
+#     trainX = trainX.T  #np.reshape(trainX, (len(trainX), 1))
+#     trainY = weights
+#     np.reshape(weights, (len(weights), 1))
+#     #print(trainX)
+#     lmlinear = make_pipeline(PolynomialFeatures(2), Ridge())
+#     #lmlinear = linear_model.LinearRegression()
+#     lmlinear.fit(trainX, trainY)
+#     return lmlinear
+#
+# def applyRegressionWeightCalories(data, lmlinear):
+#     weights = np.array(data['Weight'])
+#     calories = np.array(data['Proteins'])
+#     ids = np.array(data['ID'])
+#     calMean = data['Proteins'].mean()
+#     f = np.vectorize(lambda x: x - calMean)
+#     calories = f(calories)
+#     if np.isnan(weights[-1]):
+#         weights[-1] = weights.mean()
+#     for i in range(len(weights)-2, 0, -1):  # the last element must have a weight (as in the real data)
+#         if np.isnan(weights[i]):
+#             pred = lmlinear.predict([ids[i], calories[i-1]])
+#             weights[i] = pred
+#     data['Weight'] = weights
+#     #print(weights)
+#     #print(data)
+#
+#     #data.to_csv(path_or_buf='data.csv')
+#     return weights
+#===============================================================================
+
+
+data_train, data_test = cv()
+print("CV finished")
+#data_train.to_csv(path_or_buf='data_train.csv')
+#data_test.to_csv(path_or_buf='data_test.csv')
+#print("a")
+
+rmse = []
+total = 0
+n = 0
+lowest = 10000000
+s = ''
+
+for i in range(10):
+    nn = 1  # number of calorie elements
+    data2 = data_train[i]
+    data_train[i]['predWeight'] = regOnInterpol(data2)
+    rmse.append(evaluate(data_train[i], data_test[i]))
+    if not np.isnan(rmse[i]):
+        n += 1
+        total += rmse[i]
+    print("RMSE(", i, "):", rmse[i])
+print("Mean RMSE:", total/n)
+
+
+data['Weight'] = regOnInterpol(data)
+data = data.set_index('ID')
+test = pd.read_csv("./datasets/test.csv")
+predictions = test.join(data, on='ID')
+
+predictions[['ID', 'Weight']].to_csv('sampleSubmission.csv', header=['ID', 'Weight'], index_label=False, index=False)
+
+#nn = 2  # number of calorie elements
+#lmlinear = procRegression(data, nn)
+#data = calorieBased(data)
+#applyRegression(data, lmlinear, nn)
+
+
+#print(data_train[1])
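
For reference, the core idea in regOnInterpol (interpolate the sparse weight series, hold out every second observed value, regress the held-out residuals on position, and add the predicted correction back) can be seen in isolation in the sketch below. It is a minimal sketch on synthetic data, not the script's actual pipeline: the names (true_weight, baseline, held_out, corrected) and the data-generating process are illustrative assumptions.

# Minimal sketch of interpolate-then-regress on a synthetic weight series.
# All names and data here are illustrative, not part of the script above.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
n = 200
true_weight = 80 - 0.02 * np.arange(n) + rng.normal(0, 0.3, size=n)

# Sparse observations, as in train.csv: most weights are missing.
weight = pd.Series(true_weight).where(rng.random(n) < 0.3)
weight.iloc[0] = true_weight[0]    # pin the endpoints so interpolate()
weight.iloc[-1] = true_weight[-1]  # covers the full range

# Step 1: hold out every second observed value as pseudo test data,
# then interpolate linearly through the remaining observations.
held_out = weight.dropna().iloc[1::2]
train = weight.copy()
train[held_out.index] = np.nan
baseline = train.interpolate()

# Step 2: regress the held-out residuals on the position and
# add the predicted correction back onto the interpolation.
resid = (held_out - baseline[held_out.index]).to_numpy()
model = LinearRegression().fit(held_out.index.to_numpy().reshape(-1, 1), resid)
corrected = baseline + model.predict(np.arange(n).reshape(-1, 1))

rmse = np.sqrt(np.mean((corrected - true_weight) ** 2))
print("RMSE of corrected interpolation:", rmse)

The correction step only pays off when the interpolation has a systematic bias (here the downward trend between sparse observations); on unbiased noise the fitted residual model is close to zero and the sketch reduces to plain interpolation.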