diff --git a/is/kaggle/interpolateOnRegression.py b/is/kaggle/interpolateOnRegression.py index 03a779d..18aca09 100644 --- a/is/kaggle/interpolateOnRegression.py +++ b/is/kaggle/interpolateOnRegression.py @@ -15,7 +15,7 @@ data=read() #data.drop(data.columns[[1,4,5,6,7]],axis=1,inplace=True) #data.drop(data.index[range(0,180)]) -#print(data) +print(data) def cv(data=data, n_folds=10): """split data in n_folds parts for cross validation @@ -64,7 +64,7 @@ #=============================================================================== def regOnInterpol(data): - + ids=data['ID'] weights = data['Weight'] counter =0; @@ -74,25 +74,54 @@ if(counter %2 ==0): weights[i]=np.nan intWeights= weights.interpolate() - + regdata=data[data['Weight'].apply(lambda x: not(np.isnan(x)))] trainY= regdata['Weight'][1::2] trainX= regdata['ID'][1::2] for i in range(len(trainX)): - trainY[i]= trainY[i]-intWeights[trainX[i]] + trainY[i]= trainY[i]-intWeights[trainX[i]] trainY= np.reshape(trainY,(len(trainY),1)) trainX= np.reshape(trainX,(len(trainX),1)) lmlinear = make_pipeline(PolynomialFeatures(5), LinearRegression()) lmlinear = linear_model.LinearRegression() - lmlinear.fit(trainX,trainY) + lmlinear.fit(trainX,trainY) weights2 = data['Weight'] for i in range(len(intWeights)): if (not(np.isnan(weights2[i]))): intWeights[i]+= lmlinear.predict(ids[i]); - return intWeights - - - + return intWeights + + +def regOnInterpol2(data): + diff=[] + lipids=[] + ids=[] + for i in range(len(data['diff'])): + if not(np.isnan(data['diff'][i])): + diff.append(data['diff'][i]) + ids.append(data['ID'][i]) + + lmlinear = make_pipeline(PolynomialFeatures(5), LinearRegression()) + lmlinear = linear_model.LinearRegression() + trainX=np.expand_dims(ids,axis=1) + trainY=diff + lmlinear.fit(trainX,trainY) + data['diff']=lmlinear.predict(np.expand_dims(data['ID'],axis=1)) + data['inter']=data['Weight'].interpolate() + return data['inter']+data['diff'] + +def dataDiff(data=data): + """calculates diff between interpolated and true weight for each point""" + + data['diff']=np.NaN + for i in range(len(data['Weight'])): + if(not(np.isnan(data['Weight'][i]))): + d=data['Weight'].copy() + d[i]=np.NaN + diff=data['Weight'][i]-d.interpolate()[i] + data['diff'][i]=diff + else : + data['diff'][i]=np.NaN #=============================================================================== # def calorieBased(data): @@ -102,20 +131,20 @@ # #data['Calories']=data['Calories'].fillna(np.random.normal(loc=calMean,scale=calSTD,size=len(data['Calories'])-data['Calories'].count())) # nans=len(data['Proteins'])-data['Proteins'].count() # dfrand = calSTD*np.random.randn(nans)+calMean -# +# # #data['Proteins',np.isnan(data['Proteins'])]= dfrand[np.isnan(data['Proteins'])]; #Erzeuge zufaellige kalorienwerte, ersetze sie durch nan werte. # a=[] # c=0 # for i in range(len(data['Proteins'])): # if np.isnan(data['Proteins'][i]): -# +# # a.append(dfrand[c]) -# c+=1 +# c+=1 # else: -# a.append(data['Proteins'][i]) +# a.append(data['Proteins'][i]) # data['Proteins']=a -# -# +# +# # return data #=============================================================================== @@ -127,7 +156,7 @@ #return(data['Weight']) #=============================================================================== -# def applyRegression(data,lmlinear,n): +# def applyRegression(data,lmlinear,n): # weights= np.array(data['Weight']); # calories=np.array(data['Calories']); # ids = np.array(data['ID']) @@ -135,7 +164,7 @@ # f = np.vectorize(lambda x:x-calMean) # if np.isnan(weights[-1]): # weights[-1]=weights.mean(); -# + # for i in range(len(weights)-2,-1+n,-1): #letztes elemente muss ein gewicht haben (wie in den echten daten) # if np.isnan(weights[i]): # pred=lmlinear.predict(np.append(f((calories[i-n+1:i+1])[::-1]),ids[i])) @@ -144,11 +173,11 @@ #=============================================================================== #print(weights) # print(data) - + #data.to_csv(path_or_buf='data.csv') #return weights - - + + #=============================================================================== # def procRegression(data,n): # weights= np.array(data['Weight']); @@ -165,19 +194,19 @@ # counter =0 # break # else: -# counter +=1 +# counter +=1 # if((counter == n) & (not np.isnan(weights[i]))&(not np.isnan(weights[i-1]))): -# +# # trainY.append([weights[i]-weights[i-1]]) -# +# # tr= np.append(f((calories[i-n+1:i+1])[::-1]),ids[i]) # trainX.append(tr)#reverse f, mittelwertbefreit, vllt calorien gewichten # #print((calories[i-n+1:i+1])[::-1]) # counter =0 -# #print(trainX) +# #print(trainX) # lmlinear = model = make_pipeline(PolynomialFeatures(100), Ridge()) -# lmlinear.fit(trainX,trainY) -# return lmlinear +# lmlinear.fit(trainX,trainY) +# return lmlinear #=============================================================================== #=============================================================================== @@ -190,19 +219,19 @@ # lmlinear = make_pipeline(PolynomialFeatures(1000), Ridge()) # #lmlinear = LogisticRegression() # # print(lmlinear.score(trainX, trainY))# fehler bei train daten (soll eig nicht verwendet werden) -# -# -# lmlinear.fit(trainX,trainY) +# +# +# lmlinear.fit(trainX,trainY) # weights= np.array(data['Weight']); # ids=np.array(data['ID']); # for i in range(len(weights)): # if(np.isnan(weights[i])): # weights[i]= lmlinear.predict(ids[i]) # #data['Weight']=weights -# #print(data) -# #data.to_csv(path_or_buf='dataW.csv') -# return weights -# +# #print(data) +# #data.to_csv(path_or_buf='dataW.csv') +# return weights +# # def procweightAndCalories(data): # data=data[data['Weight'].apply(lambda x: not(np.isnan(x)))&data['Proteins'].apply(lambda x: not(np.isnan(x)))] # weights= np.array(data['Weight']) @@ -218,12 +247,12 @@ # trainX= trainX.T #np.reshape(trainX,(len(trainX),1)) # trainY=weights # np.reshape(weights,(len(weights),1)) -# #print(trainX) +# #print(trainX) # lmlinear = make_pipeline(PolynomialFeatures(2), Ridge()) # # lmlinear = linear_model.LinearRegression() -# lmlinear.fit(trainX,trainY) -# return lmlinear -# def applyRegressionWeightCalories(data,lmlinear): +# lmlinear.fit(trainX,trainY) +# return lmlinear +# def applyRegressionWeightCalories(data,lmlinear): # weights= np.array(data['Weight']); # calories=np.array(data['Proteins']); # ids = np.array(data['ID']) @@ -239,18 +268,23 @@ # data['Weight']= weights # #print(weights) # # print(data) -# +# # #data.to_csv(path_or_buf='data.csv') # return weights #=============================================================================== - - -data_train, data_test=cv() -print("CV abgeschlossen") + + +data_train, data_test=cv() +print("CV abgeschlossen") #data_train.to_csv(path_or_buf='data_train.csv') #data_test.to_csv(path_or_buf='data_test.csv') #print("a") +#print(data.describe()) +#print(data) +#print(data.describe()) +#print(data.corr()) + rmse=[] sum=0 n=0 @@ -260,21 +294,22 @@ for i in range(10): nn=1#menge der kalorien elemente data2=data_train[i] - data_train[i]['predWeight'] = regOnInterpol(data2) + dataDiff(data2) + data_train[i]['predWeight'] = regOnInterpol2(data2) rmse.append(evaluate(data_train[i], data_test[i])) if(~np.isnan(rmse[i])): n+=1 sum+=rmse[i] print("RMSE(",i,"):",rmse[i]) -print("Mean RSME:",sum/n) +print("Mean RSME:",sum/n) -data['Weight'] = regOnInterpol(data) -data = data.set_index('ID') -test = pd.read_csv("./datasets/test.csv") -predictions = test.join(data,on='ID') +# data['Weight'] = regOnInterpol(data) +# data = data.set_index('ID') +# test = pd.read_csv("./datasets/test.csv") +# predictions = test.join(data,on='ID') -predictions[['ID','Weight']].to_csv('sampleSubmission.csv', header = ['ID','Weight'],index_label=False,index=False) +# predictions[['ID','Weight']].to_csv('sampleSubmission.csv', header = ['ID','Weight'],index_label=False,index=False) #nn=2#menger der kalorien elemente #lmlinear =procRegression(data, nn) diff --git a/is/kaggle/regOnInterpol2.csv b/is/kaggle/regOnInterpol2.csv new file mode 100644 index 0000000..05c5a04 --- /dev/null +++ b/is/kaggle/regOnInterpol2.csv @@ -0,0 +1,101 @@ +ID,Weight +183,92.09226429596886 +184,91.9755699349035 +185,91.85887557383816 +186,91.74218121277283 +187,91.62548685170749 +192,91.54201504638078 +197,91.5835432410541 +203,91.32837707466206 +209,90.85821090827001 +210,90.83318321387135 +226,91.45774010349257 +247,92.50715852112046 +257,92.80688157713372 +268,91.70657693874831 +271,92.0064938555523 +288,90.15602305077488 +292,90.43924560651352 +317,91.70521991321336 +341,90.63788858097855 +365,93.10389058207706 +422,92.00231200135272 +441,94.51011914111126 +494,95.35031800464827 +535,95.03251586763602 +581,95.99790859196375 +593,95.05639978859145 +625,95.49669003842214 +638,95.7763300112394 +639,95.83630231684073 +649,94.546025372854 +657,96.34580381766462 +670,95.46211045714854 +685,96.5093140887875 +690,96.5806041882227 +700,95.14461295852169 +708,93.1943914033323 +724,94.59394829295354 +740,94.69350518257478 +743,94.96008876604543 +758,94.49300668339866 +765,94.2261461559413 +789,94.23920698056922 +809,97.24159426906635 +816,95.24140040827565 +835,95.19087421470087 +844,95.44062496511279 +861,93.99015416033536 +863,94.24009877153803 +873,94.08982182755129 +892,96.28929563397651 +127,92.31048184896119 +160,92.30956793380498 +191,91.4753760741128 +195,91.73359862985143 +196,91.65857093545277 +212,91.008127825074 +222,92.00785088108728 +229,91.70765702029655 +233,93.00754624270188 +250,92.77850400935301 +254,93.4641075174726 +260,91.90679849393771 +265,91.75666002194434 +270,91.90652154995098 +272,92.10646616115363 +278,90.5062999947616 +286,90.55607843957222 +293,90.47255124544817 +296,90.46580149558548 +299,91.84571841238947 +373,93.00366902688768 +408,92.25269972293414 +489,95.46712314330831 +490,95.0337621155763 +537,95.34912714550535 +542,95.53232200684532 +543,95.46562764577997 +544,95.39893328471464 +545,95.3322389236493 +546,95.26554456258395 +550,93.9987671183226 +561,95.93179581327054 +562,95.5651014522052 +567,94.39829631354517 +611,96.11707776000357 +618,96.96355056587953 +619,96.43018953814752 +634,95.4964407888341 +646,95.84610845605002 +683,96.19508376329912 +689,96.56634616833566 +720,94.14405907054824 +751,93.39320054418938 +812,96.49151118587034 +837,94.84081882590351 +857,93.34026493793006 +859,93.39020954913272 +877,94.43971104995661 +1048,98.09245194329758 +1050,98.1540787974909