"""Weight-prediction experiments on a daily weight/calorie log.

The script reads a CSV with the columns ``Date``, ``ID``, ``Weight`` and
``Calories``, hides part of the known weights in ``n_folds`` cross-validation
folds, fills the gaps with one of the approaches defined below and prints the
RMSE of every fold together with the mean over all folds.

NOTE(review): several functions map an ``ID`` to a row position via
``ID - 1`` (see ``evaluate``); this presumes IDs are 1-based and contiguous --
confirm against the dataset.
"""
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import KFold
    _MODERN_KFOLD = True
except ImportError:
    # legacy scikit-learn releases that still ship ``cross_validation``
    from sklearn.cross_validation import KFold
    _MODERN_KFOLD = False


def read(path="./datasets/train.csv"):
    """Load the dataset at ``path``, indexed by its parsed ``Date`` column."""
    # ``parse_dates`` has to be a bool, list or dict; a bare string is not a
    # valid value, so the column name is wrapped in a list.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])


# Module-level dataset; it is also the default argument of cv() and dataDiff().
data = read()


def _fold_indices(n_samples, n_folds):
    """Return shuffled ``(train, test)`` index pairs from either KFold API."""
    if _MODERN_KFOLD:
        splitter = KFold(n_splits=n_folds, shuffle=True)
        return list(splitter.split(np.arange(n_samples)))
    return list(KFold(n_samples, shuffle=True, n_folds=n_folds))


def cv(data=data, n_folds=10):
    """Split the rows with a known weight into ``n_folds`` folds.

    Returns a tuple ``(data_train, data_test)``:

    * ``data_train[k]`` is a copy of ``data`` in which ``Weight`` is NaN for
      every row whose ``ID`` belongs to the test part of fold ``k``.
    * ``data_test[k]`` is a list with one single-column DataFrame per
      held-out row (the row's fields form the index of that frame).
    """
    cleanData = data[pd.notnull(data['Weight'])]

    data_train = []
    data_test = []
    for _, test_idx in _fold_indices(len(cleanData), n_folds):
        data_test.append([pd.DataFrame(cleanData.iloc[j]) for j in test_idx])

        fold_train = data.copy()
        held_out_ids = cleanData.iloc[test_idx]['ID']
        # Plain boolean array so the write cannot trip over index alignment.
        hidden = fold_train['ID'].isin(held_out_ids).values
        fold_train.loc[hidden, 'Weight'] = float('NaN')
        data_train.append(fold_train)
    return (data_train, data_test)


# Cross-validation folds used by the evaluation loop at the end of the file.
data_train, data_test = cv()


def evaluate(predictions, data_test, predictedWeight='predWeight'):
    """Return the RMSE of ``predictions`` on the held-out rows.

    ``data_test`` is one fold as produced by ``cv``; ``predictions`` is a
    DataFrame holding the predicted weights in column ``predictedWeight``.
    A held-out row is matched to its prediction by position ``ID - 1``.
    An empty ``data_test`` yields NaN instead of a division by zero.
    """
    n = len(data_test)
    if n == 0:
        return np.nan
    predicted = predictions[predictedWeight]
    error = 0
    for entry in data_test:
        # ``entry.loc[...]`` is a length-1 Series for the frames built by cv();
        # np.ravel also copes with a plain scalar.
        test_value = float(np.ravel(entry.loc['Weight'])[0])
        row = int(np.ravel(entry.loc['ID'])[0]) - 1
        error += (test_value - predicted.iloc[row]) ** 2
    return np.sqrt(error / n)


######################### Approaches ##########################
def interpol(data):
    """Interpolation only."""
    return data['Weight'].interpolate()


def calorieBased(data):
    """Fill missing weights from the previous weight and recent calories.

    A missing weight becomes the previous weight plus a tenth of the
    difference between the mean of (up to) the five preceding calorie values
    and the overall calorie mean.  Missing calories are filled first by
    ``fillCalories``.  ``data`` is modified in place; the filled ``Weight``
    column is returned.
    """
    calMean = data['Calories'].mean()
    # One independent random draw per missing calorie value.
    fillCalories(data)

    weights = np.array(data['Weight'], dtype=float)
    calories = np.array(data['Calories'], dtype=float)
    # Seed the recursion only when the first weight is unknown, so that a
    # measured first value is not overwritten.
    if len(weights) and np.isnan(weights[0]):
        weights[0] = data['Weight'].mean()
    for i in range(1, len(weights)):
        if np.isnan(weights[i]):
            # Clamp the window start: a negative start would yield an empty
            # slice (and therefore NaN) for the first rows.
            window = calories[max(0, i - 5):i]
            weights[i] = weights[i - 1] + (window.mean() - calMean) / 10
    data['Weight'] = weights
    return data['Weight']


def fillCalories(data):
    """Replace missing calories by draws from a fitted normal distribution.

    Mean and standard deviation are taken from the known calorie values.
    ``data`` is modified in place and returned.
    """
    calMean = data['Calories'].mean()
    calSTD = data['Calories'].std()
    filled = np.array(data['Calories'], dtype=float)
    missing = np.isnan(filled)
    filled[missing] = calSTD * np.random.randn(int(missing.sum())) + calMean
    data['Calories'] = filled
    return data


def procRegression(data, n=5):
    """Fit a linear model for the day-to-day weight change.

    A training sample is built for every row ``i >= n`` whose last ``n``
    calorie values as well as the weights of rows ``i`` and ``i - 1`` are
    known.  Features are the mean-centred calories of the last ``n`` days
    (most recent first) followed by the row's ``ID``; the target is
    ``weight[i] - weight[i - 1]``.

    Raises ValueError when no row qualifies as a training sample.
    """
    weights = np.array(data['Weight'], dtype=float)
    calories = np.array(data['Calories'], dtype=float)
    ids = np.array(data['ID'])
    centred = calories - data['Calories'].mean()

    trainX = []
    trainY = []
    for i in range(n, len(weights)):
        window = centred[i - n + 1:i + 1]
        # the last n calorie values must not be NaN
        if np.isnan(window).any():
            continue
        if np.isnan(weights[i]) or np.isnan(weights[i - 1]):
            continue
        # reversed, mean-centred; maybe weight the calories
        trainX.append(np.append(window[::-1], ids[i]))
        trainY.append([weights[i] - weights[i - 1]])
    if not trainX:
        raise ValueError("no complete training sample for n=%d" % n)

    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(trainX, trainY)
    return lmlinear


def applyRegression(data, n=5):
    """Fill missing weights backwards using the model of ``procRegression``.

    Missing calories in ``data`` are filled in place first.  Rows are visited
    from the last one down to row ``n``; a missing weight becomes the weight
    of the following row minus the predicted change.  Returns the weights as
    a NumPy array; the ``Weight`` column of ``data`` is left untouched.

    Raises IndexError when the last weight is missing, because that row has
    no successor to anchor on.
    """
    data = fillCalories(data)
    lmlinear = procRegression(data, n)
    weights = np.array(data['Weight'], dtype=float)
    calories = np.array(data['Calories'], dtype=float)
    ids = np.array(data['ID'])
    centred = calories - data['Calories'].mean()
    # the last element must have a weight (as in the real data)
    for i in range(len(weights) - 1, n - 1, -1):
        if np.isnan(weights[i]):
            features = np.append(centred[i - n + 1:i + 1][::-1], ids[i])
            # predict() expects a 2-D array holding one sample per row.
            pred = lmlinear.predict(features.reshape(1, -1))
            weights[i] = weights[i + 1] - float(np.ravel(pred)[0])
    return weights


# giving up on a nice solution
def regOnInterpol(data):
    """Interpolate the weights and add a regression-based correction.

    Every second known weight is held out, the remaining weights are
    interpolated, and a linear model is fitted that maps ``ID`` to the
    interpolation error at the held-out rows.  ``data`` itself is not
    modified.  Returns the corrected, interpolated Series.

    Raises ValueError when fewer than two weights are known.
    """
    ids = np.array(data['ID'], dtype=float)
    original = data['Weight']
    known = np.flatnonzero(np.array(pd.notnull(original)))
    # take every second non-NaN value as test data
    held_out = known[1::2]
    if len(held_out) == 0:
        raise ValueError("need at least two known weights")

    # Work on a copy so the caller's weights are not blanked out.
    thinned = original.copy()
    thinned.iloc[held_out] = np.nan
    intWeights = thinned.interpolate()

    # Residuals are taken at the held-out rows themselves (by position).
    residuals = (np.array(original, dtype=float)[held_out]
                 - np.array(intWeights, dtype=float)[held_out])
    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(ids[held_out].reshape(-1, 1), residuals)

    # NOTE(review): as in the original, the correction is added to the rows
    # that have a known weight in ``data``; confirm whether it was meant for
    # the rows with a missing weight instead.
    correction = np.ravel(lmlinear.predict(ids[known].reshape(-1, 1)))
    intWeights.iloc[known] = intWeights.iloc[known].values + correction
    return intWeights


def regOnInterpol2(data):
    """Interpolate the weights and add a regressed ``diff`` correction.

    Expects a ``diff`` column in ``data`` (``dataDiff`` writes one).  A
    linear model maps ``ID`` to the known ``diff`` values; its predictions
    overwrite ``data['diff']``, the interpolated weights are stored in
    ``data['inter']`` and the sum of both columns is returned.
    """
    diffs = np.array(data['diff'], dtype=float)
    ids = np.array(data['ID'], dtype=float)
    known = ~np.isnan(diffs)

    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(ids[known].reshape(-1, 1), diffs[known])
    data['diff'] = lmlinear.predict(ids.reshape(-1, 1))
    data['inter'] = data['Weight'].interpolate()
    return data['inter'] + data['diff']


def dataDiff(data=data):
    """calculates diff between interpolated and true weight for each point

    For every known weight, that single value is hidden, the series is
    interpolated and ``true - interpolated`` is stored.  The result is
    written to ``data['diff']`` (NaN where the weight is unknown).
    """
    weights = data['Weight']
    values = np.array(weights, dtype=float)
    diffs = np.full(len(values), np.nan)
    for pos in np.flatnonzero(~np.isnan(values)):
        held = weights.copy()
        held.iloc[pos] = np.nan
        diffs[pos] = values[pos] - held.interpolate().iloc[pos]
    # Single column assignment; element-wise chained writes are not reliable.
    data['diff'] = diffs


rmse = []
rmse_sum = 0
n = 0
for i in range(len(data_train)):
    data_train[i]['predWeight'] = interpol(data_train[i])  # change this line
    rmse.append(evaluate(data_train[i], data_test[i]))
    if not np.isnan(rmse[i]):
        n += 1
        rmse_sum += rmse[i]
    print("RMSE(",i,"):",rmse[i])

# Guard against every fold being NaN.
print("Mean RSME:",rmse_sum/n if n else float('nan'))