# Weight prediction by interpolation plus a regression correction, including cross-validation (cv)
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn import linear_model
from collections import Counter
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
def read(path="./datasets/train.csv"):
    """Load a CSV file into a DataFrame indexed by its parsed ``Date`` column.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``Date`` as the index.
    """
    # parse_dates is given as a list: pandas only accepts a bool, a list or a
    # dict here and rejects a bare column-name string.
    return pd.read_csv(path, index_col='Date', parse_dates=['Date'])
# Load the training data once, at import time, from read()'s default path.
# cv() binds this frame as its default argument, and the script at the end of
# the file overwrites its 'Weight' column with the final predictions.
data=read()
#data.drop(data.columns[[1,4,5,6,7]],axis=1,inplace=True)
#data.drop(data.index[range(0,180)])
#print(data)
def cv(data=data, n_folds=10):
    """Split ``data`` into ``n_folds`` train/test pairs for cross validation.

    Only rows whose ``Weight`` is known take part in the split. For every fold
    the held-out rows are returned as test data, and a full copy of ``data`` in
    which the ``Weight`` of those rows has been set to NaN is returned as
    training data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``ID`` and ``Weight``. Defaults to the
        module-level ``data`` as it was when this function was defined.
    n_folds : int
        Number of folds.

    Returns
    -------
    tuple
        ``(data_train, data_test)``. ``data_train[i]`` is the masked copy of
        ``data`` for fold ``i``; ``data_test[i]`` is a list holding one
        single-column DataFrame (built from the row Series) per held-out row.
    """
    cleanData = data[pd.notnull(data['Weight'])]
    # KFold is the legacy sklearn.cross_validation class imported at the top of
    # the file: it takes the number of samples and, when iterated, yields
    # (train, test) arrays of row positions into cleanData.
    kf = KFold(len(cleanData), shuffle=True, n_folds=n_folds)

    data_train = []
    data_test = []
    for _, test_positions in kf:
        # evaluate() reads these rows with .loc['Weight'] / .loc['ID'], so each
        # row stays wrapped exactly as before.
        data_test.append([pd.DataFrame(cleanData.iloc[j]) for j in test_positions])

        # Hide the held-out weights in this fold's training copy. Rows are
        # matched by ID; a single .loc assignment replaces the former
        # copy / mutate / write-back round trip done once per held-out row.
        held_out_ids = cleanData['ID'].iloc[test_positions]
        fold_train = data.copy()
        fold_train.loc[fold_train['ID'].isin(held_out_ids), 'Weight'] = float('NaN')
        data_train.append(fold_train)
    return (data_train, data_test)
def evaluate(predictions, data_test, predictedWeight='predWeight'):
    """Calculate the RMSE of ``predictions`` on the held-out test rows.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Frame holding the predicted weights in column ``predictedWeight``.
        The prediction for a test row is read from row position ``ID - 1``.
    data_test : sequence
        Test rows; each element must support ``.loc['Weight']`` and
        ``.loc['ID']`` (a row Series, or the single-column DataFrame that
        ``cv`` builds from one).
    predictedWeight : str
        Name of the prediction column in ``predictions``.

    Returns
    -------
    The root of the mean squared difference between true and predicted
    weights, or NaN when ``data_test`` is empty.
    """
    n = len(data_test)
    if n == 0:
        # Nothing to score: report NaN rather than dividing by zero.
        return float('nan')

    error = 0
    for row in data_test:
        # Kept as np.float64(...) so the result has the same type callers
        # already receive.
        test_value = np.float64(row.loc['Weight'])
        # .loc['ID'] is a scalar for a Series row and a one-element Series for
        # a single-column DataFrame row; ravel handles both without calling
        # int() on a Series.
        row_id = int(np.ravel(row.loc['ID'])[0])
        # The row is looked up by position, with ID - 1 used as the position.
        pred_value = predictions.iloc[row_id - 1][predictedWeight]
        error += (test_value - pred_value) ** 2
    return np.sqrt(error / n)
#1st example
#===============================================================================
# def interpol(data):
# return data['Weight'].interpolate()
#===============================================================================
def regOnInterpol(data):
    """Interpolate ``data['Weight']`` and add a linear, ID-based correction.

    Steps:

    1. Every second known weight (the 2nd, 4th, ...) is blanked out and the
       resulting series is filled with ``Series.interpolate()``.
    2. A ``LinearRegression`` is fitted on ``ID -> weight - interpolated
       value`` using every second row (starting with the second) of the rows
       whose weight is known at that point.
    3. The regression output is added to the interpolated series on the rows
       whose weight is known.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``ID`` and ``Weight``. ``ID`` values are used
        as integer row positions into the interpolated series.

    Returns
    -------
    pandas.Series
        The interpolated and corrected weights, indexed like ``data``.
    """
    ids = data['ID']
    weights = data['Weight']

    # Blank out every second known weight so it is not used as a support point
    # for the interpolation. Positional access is explicit (.iloc) instead of
    # relying on integer keys falling back to positions.
    # NOTE(review): ``weights`` is taken straight from ``data``; whether this
    # assignment also changes ``data`` (and with it ``regdata`` and the mask
    # used for the correction below) depends on pandas' view/copy behaviour.
    # The statement order is kept unchanged -- confirm which is intended.
    known_positions = np.flatnonzero(weights.notnull().values)
    weights.iloc[known_positions[1::2]] = np.nan
    intWeights = weights.interpolate()

    # Regression targets: known weight minus the interpolated value.
    regdata = data[data['Weight'].notnull()]
    train_ids = regdata['ID'].values[1::2]
    # NOTE(review): the interpolated value is read at position == ID, whereas
    # evaluate() treats ID - 1 as the row position -- TODO confirm.
    residuals = (regdata['Weight'].values[1::2]
                 - intWeights.values[train_ids.astype(int)])

    lmlinear = linear_model.LinearRegression()
    lmlinear.fit(train_ids.reshape(-1, 1), residuals.reshape(-1, 1))

    # Add the predicted correction on the rows whose weight is known; predict()
    # gets a 2-D column and is only called when there is at least one such row.
    known_mask = data['Weight'].notnull().values
    corrected = intWeights.values.copy()
    if known_mask.any():
        known_ids = ids.values[known_mask].reshape(-1, 1)
        corrected[known_mask] += lmlinear.predict(known_ids).ravel()
    return pd.Series(corrected, index=intWeights.index, name=intWeights.name)
#===============================================================================
# def calorieBased(data):
# calMean=data['Proteins'].mean()
# calSTD=data['Proteins'].std()
# #fill with random data for nan-values
# #data['Calories']=data['Calories'].fillna(np.random.normal(loc=calMean,scale=calSTD,size=len(data['Calories'])-data['Calories'].count()))
# nans=len(data['Proteins'])-data['Proteins'].count()
# dfrand = calSTD*np.random.randn(nans)+calMean
#
# #data['Proteins',np.isnan(data['Proteins'])]= dfrand[np.isnan(data['Proteins'])]; #Erzeuge zufaellige kalorienwerte, ersetze sie durch nan werte.
# a=[]
# c=0
# for i in range(len(data['Proteins'])):
# if np.isnan(data['Proteins'][i]):
#
# a.append(dfrand[c])
# c+=1
# else:
# a.append(data['Proteins'][i])
# data['Proteins']=a
#
#
# return data
#===============================================================================
# for i in range(len(data)):
# if i==0:
# data['Weight'][0]=data['Weight'].mean()
# elif np.isnan(data['Weight'][i]):
# data['Weight',i]=data['Weight'][i-1]+(np.mean(data['Calories'][i-5:i])-calMean)/10
#return(data['Weight'])
#===============================================================================
# def applyRegression(data,lmlinear,n):
# weights= np.array(data['Weight']);
# calories=np.array(data['Calories']);
# ids = np.array(data['ID'])
# calMean=data['Calories'].mean()
# f = np.vectorize(lambda x:x-calMean)
# if np.isnan(weights[-1]):
# weights[-1]=weights.mean();
#
# for i in range(len(weights)-2,-1+n,-1): #letztes elemente muss ein gewicht haben (wie in den echten daten)
# if np.isnan(weights[i]):
# pred=lmlinear.predict(np.append(f((calories[i-n+1:i+1])[::-1]),ids[i]))
# weights[i]= weights[i+1]-pred
# data['Weight']= weights
#===============================================================================
#print(weights)
# print(data)
#data.to_csv(path_or_buf='data.csv')
#return weights
#===============================================================================
# def procRegression(data,n):
# weights= np.array(data['Weight']);
# calories=np.array(data['Calories']);
# ids = np.array(data['ID'])
# counter =0
# trainX=[]
# trainY=[]
# calMean=data['Calories'].mean()
# f = np.vectorize(lambda x:x-calMean)
# for i in range(n,len(weights)):
# for j in range(0,n): # letzten n calorien nicht nan
# if(np.isnan(calories[i-j])):
# counter =0
# break
# else:
# counter +=1
# if((counter == n) & (not np.isnan(weights[i]))&(not np.isnan(weights[i-1]))):
#
# trainY.append([weights[i]-weights[i-1]])
#
# tr= np.append(f((calories[i-n+1:i+1])[::-1]),ids[i])
# trainX.append(tr)#reverse f, mittelwertbefreit, vllt calorien gewichten
# #print((calories[i-n+1:i+1])[::-1])
# counter =0
# #print(trainX)
# lmlinear = model = make_pipeline(PolynomialFeatures(100), Ridge())
# lmlinear.fit(trainX,trainY)
# return lmlinear
#===============================================================================
#===============================================================================
# def applyRegressionWeightsOnly(data):
# regdata=data[data['Weight'].apply(lambda x: not(np.isnan(x)))]
# trainY= regdata['Weight']
# trainY= np.reshape(trainY,(len(trainY),1))
# trainX= regdata['ID']
# trainX= np.reshape(trainX,(len(trainX),1))
# lmlinear = make_pipeline(PolynomialFeatures(1000), Ridge())
# #lmlinear = LogisticRegression()
# # print(lmlinear.score(trainX, trainY))# fehler bei train daten (soll eig nicht verwendet werden)
#
#
# lmlinear.fit(trainX,trainY)
# weights= np.array(data['Weight']);
# ids=np.array(data['ID']);
# for i in range(len(weights)):
# if(np.isnan(weights[i])):
# weights[i]= lmlinear.predict(ids[i])
# #data['Weight']=weights
# #print(data)
# #data.to_csv(path_or_buf='dataW.csv')
# return weights
#
# def procweightAndCalories(data):
# data=data[data['Weight'].apply(lambda x: not(np.isnan(x)))&data['Proteins'].apply(lambda x: not(np.isnan(x)))]
# weights= np.array(data['Weight'])
# calories=np.array(data['Proteins'])
# ids = np.array(data['ID'])
# ids= np.delete(ids,len(ids)-1,0)
# weights=np.delete(weights,len(weights)-1,0)
# calMean=data['Proteins'].mean()
# f = np.vectorize(lambda x:x-calMean)
# calories=np.delete(calories,0,0)
# calories= f(calories)
# trainX= np.vstack((ids,calories))
# trainX= trainX.T #np.reshape(trainX,(len(trainX),1))
# trainY=weights
# np.reshape(weights,(len(weights),1))
# #print(trainX)
# lmlinear = make_pipeline(PolynomialFeatures(2), Ridge())
# # lmlinear = linear_model.LinearRegression()
# lmlinear.fit(trainX,trainY)
# return lmlinear
# def applyRegressionWeightCalories(data,lmlinear):
# weights= np.array(data['Weight']);
# calories=np.array(data['Proteins']);
# ids = np.array(data['ID'])
# calMean=data['Proteins'].mean()
# f = np.vectorize(lambda x:x-calMean)
# calories=f(calories)
# if np.isnan(weights[-1]):
# weights[-1]=weights.mean();
# for i in range(len(weights)-2,0,-1): #letztes elemente muss ein gewicht haben (wie in den echten daten)
# if np.isnan(weights[i]):
# pred=lmlinear.predict([ids[i],calories[i-1]])
# weights[i]= pred
# data['Weight']= weights
# #print(weights)
# # print(data)
#
# #data.to_csv(path_or_buf='data.csv')
# return weights
#===============================================================================
# --- Cross-validate the interpolation + regression model --------------------
data_train, data_test = cv()
print("CV abgeschlossen")

rmse = []
rmse_total = 0   # sum of the per-fold RMSE values that are not NaN
valid_folds = 0  # number of folds contributing to rmse_total
# Iterate over the folds cv() actually returned instead of a fixed count.
for fold, (train_fold, test_fold) in enumerate(zip(data_train, data_test)):
    train_fold['predWeight'] = regOnInterpol(train_fold)
    fold_rmse = evaluate(train_fold, test_fold)
    rmse.append(fold_rmse)
    # A NaN fold would turn the whole mean into NaN, so it is left out.
    if not np.isnan(fold_rmse):
        valid_folds += 1
        rmse_total += fold_rmse
    print("RMSE(", fold, "):", fold_rmse)
# Guard against every fold being NaN (or there being no folds at all).
print("Mean RSME:", rmse_total / valid_folds if valid_folds else float('nan'))

# --- Predict on the full training data and write the submission file --------
data['Weight'] = regOnInterpol(data)
data = data.set_index('ID')
test = pd.read_csv("./datasets/test.csv")
# Look up the predicted weight of every requested ID.
predictions = test.join(data, on='ID')
predictions[['ID', 'Weight']].to_csv('sampleSubmission.csv', header=['ID', 'Weight'], index_label=False, index=False)
#nn=2#menger der kalorien elemente
#lmlinear =procRegression(data, nn)
#data=calorieBased(data)
#applyRegression(data, lmlinear,nn)
#print(data_train[1])