diff --git a/ea/UB6/ea6.pdf b/ea/UB6/ea6.pdf index d767d7d..ec937c7 100644 --- a/ea/UB6/ea6.pdf +++ b/ea/UB6/ea6.pdf Binary files differ diff --git a/ea/UB6/ea6.tex b/ea/UB6/ea6.tex index dc9f792..4c61eb5 100644 --- a/ea/UB6/ea6.tex +++ b/ea/UB6/ea6.tex @@ -84,39 +84,35 @@ \vspace{0.5cm} \Aufgabe{Crossover-Operatoren}{6}\\ \begin{enumerate}[(a)] \item - A=(1010110000)\\ - B=(0011100011)\\ + $ A=(1010110000)\\ + B=(0011100011)$\\ \begin{enumerate}[(i)] - \item 1-Punkt Crossover an 4 : A'=(1010|\textbf{100011}) B'=(0011|\textbf{110000}) - \item 2-Punkt Crossover an 3,7 : A'=(101|\textbf{1100}|000) B'=(001|\textbf{0110}|000) - \item Uniformes Crossover 2,4,6,8: A'=(1\textbf{0}1\textbf{1}1\textbf{0}0\textbf{0}00) B'=(0\textbf{0}1\textbf{0}1\textbf{1}0\textbf{0}11) + \item 1-Punkt Crossover an 4 : $A'=(1010|\textbf{100011})$ $B'=(0011|\textbf{110000})$ + \item 2-Punkt Crossover an 3,7 :$ A'=(101|\textbf{1100}|000) B'=(001|\textbf{0110}|000)$ + \item Uniformes Crossover 2,4,6,8: $A'=(1\textbf{0}1\textbf{1}1\textbf{0}0\textbf{0}00) B'=(0\textbf{0}1\textbf{0}1\textbf{1}0\textbf{0}11)$ \end{enumerate} \item - C=(9,4,6,7,1,2,10,8,5,3)\\ - D=(7,3,8,10,5,4,2,9,6,1)\\ + $C=(9,4,6,7,1,2,10,8,5,3)\\ + D=(7,3,8,10,5,4,2,9,6,1)$\\ \begin{enumerate}[(i)] - \item Partially matched Xover: 3,7 C'=(9,\textbf{2},6|\textbf{10,5,4,2}|8,\textbf{1},3) D'=(\textbf{2},3,8|\textbf{7,1,2,10}|9,6,\textbf{5}) - \item order xover 1,4 C'=(7|3,8,10|1,2,5,9,4,6) D'=(10|4,6,7|5,2,9,1,3,8) - \item Cycle Xover in C, 5 . 
C'=C=(9,4,6,7,1,2,10,8,5,3) D'=D=(7,3,8,10,5,4,2,9,6,1) + \item Partially matched Xover: 3,7 $C'=(9,\textbf{7},6|\textbf{10,5,4,2}|8,\textbf{1},3) D'=(\textbf{4},3,8|\textbf{7,1,2,10}|9,6,\textbf{5})$ + \item Order Crossover 1,4 $C'=(7|3,8,10|1,2,5,9,4,6) D'=(10|4,6,7|5,2,9,1,3,8)$ + \item Cycle Crossover in C, 5 .$ C'=C=(9,4,6,7,1,2,10,8,5,3) D'=D=(7,3,8,10,5,4,2,9,6,1)$ \end{enumerate} \end{enumerate} \Aufgabe{Selektionswahrscheinlichkeit}{6}\\ $P_{Sorted}=(110110,110101,001011,010011,010100)$\\ \begin{enumerate}[(a)] - \item Lineares Ranking\\$p_i=\frac{1}{\lambda}\left(\mu^+-(\mu^+-\mu^-)\dfrac{i-1}{\lambda-1}\right)$\\ - $\mu^+=1\\ - \mu^-=1 + \item Lineares Ranking\\$p_i=\frac{1}{\lambda}\left(\eta^+-(\eta^+-\eta^-)\dfrac{i-1}{\lambda-1}\right)$\\ + $\eta^+=1.2\\ + \eta^-=0.8\\ \lambda = 5\\ - p_i=\frac{1}{5}$ für alle i\\ - $ - \mu^+=2\\ - \mu^-=0\\ - p_i=\dfrac{1}{5}(2-\dfrac{2(i-1)}{4})= \dfrac{2}{5}-\dfrac{1}{10}(i-1)=\dfrac{5-i}{10}\\ - p_1=0.4\\ - p_2=0.3\\ + p_i=\dfrac{1}{5}(1.2-\dfrac{0.4(i-1)}{4})= \dfrac{6}{25}-\dfrac{1}{50}(i-1)=\dfrac{13-i}{50}\\ + p_1=0.24\\ + p_2=0.22\\ p_3=0.2\\ - p_4=0.1\\ - p_5=0\\ + p_4=0.18\\ + p_5=0.16\\ $ \item nicht lineares ranking\\ $ p_i=c(1-c)^{i-1} @@ -127,7 +123,7 @@ p_3=0.04*(0.96)^2= 0.036864\cdot s=0.2\\ p_4=0.04*(0.96)^3= 0.03538944\cdot s=0.192\\ p_5=0.04*(0.96)^4= 0.0339738624\cdot s=0.183\\ - Saklierungsfaktor=s = 5.41632 + Skalierungsfaktor=s = 5.41632 \rightarrow \sum_{i=1}^5p_i =1 $ \item tournament selektion \\ $ @@ -144,14 +140,14 @@ \end{enumerate} \Aufgabe{ Population-based Incremental Learning -}{8} \begin{enumerate}[(a)] - \item $f_{twin}=\left| \sum_{i=1}^{\frac{l}{2}}a_i- \sum_{i=1+\frac{l}{2}}^{l}a_i\right| = f_{twin}=\left| \sum_{i=1}^{3}a_i- \sum_{i=4}^{6}a_i\right|\\ + \item $f_{twin}=\left| \sum_{i=1}^{\frac{l}{2}}a_i- \sum_{i=1+\frac{l}{2}}^{l}a_i\right| \\ f_{twin}=\left| \sum_{i=1}^{3}a_i- \sum_{i=4}^{6}a_i\right|\\ \mu = 2\\ \eta = \frac{1}{5}\\ l = 6\\ P_i=P_i*(1-\eta)+a_{ij}\eta = P_i 
+\frac{1}{5}\cdot(a_{ij}-P_i)\\ f(1)=-2 ,f(2)=0,f(3)=0,f(4)=2,f(5)=1,f(6)=-1\\ $ - Beste 2 Kinder:$ 4:(111010),5:(101010)$\\ + Die besten 2 Kinder: $ 4:(111010),5:(101010)$\\ P mit Kind 4:$ (\frac{3}{5},\frac{3}{5},\frac{3}{5},\frac{2}{5},\frac{3}{5},\frac{2}{5})$\\ $P_{twin}(1)$ mit Kind 4 und 5: $(\frac{17}{25},\frac{12}{25},\frac{17}{25},\frac{8}{25},\frac{17}{25},\frac{8}{25})$ \item $ @@ -169,9 +165,9 @@ f_{bmax}(P(0))= 3 \\ f_{bmax}(P(1))= \frac{84}{25}= 3.36 \\ $ - Die Fitness von $P_{pmax}$ ist 2.64 und die von $P_{tmin}$ 2.58 von ihrem Maximum entfernt. Durch die verschiedenen Fitnessfunktionen sind andere Kinder Maxima in der aktuellen Population, wodurch sich die ermittelten Wahrscheinlichkeiten verändern und es zu einem unterschiedlichen Fortschritt kommt. %TODO - Auf Gundlage der Änderung zwischen Schritt 0 und 1 wird PBIL auf $f_{twin}$ schneller konvergieren da es die höhere Änderungsrate hat \TODO - \item Todo keine ahnung, kann hier nicht denken %TODO + Die Fitness von $P_{bmax}$ ist 2.64 und die von $P_{twin}$ 2.58 von ihrem Maximum entfernt. Durch die verschiedenen Fitnessfunktionen sind verschiedene Kinder Maxima in der aktuellen Population, wodurch sich die ermittelten Wahrscheinlichkeiten verändern und es zu einem unterschiedlichen Fortschritt kommt. %TODO + Da die Fitness von $P(1)_{twin}$ näher am Maximum liegt als die Fitness von $P(1)_{bmax}$, wird die nächste Population wahrscheinlich bessere Individuen enthalten. Dementsprechend wird diese wahrscheinlich schneller konvergieren. \\ Grundsätzlich jedoch brauchen beide Optimierungen ungefähr gleich lang, da beide 6 Bits optimieren müssen. + \item Da wir weiterhin das beste Drittel der Individuen auswählen, ist die durchschnittliche Güte der gewählten Individuen gleich gut wie zuvor. Jedoch wird der Wahrscheinlichkeitsvektor öfter aktualisiert, wodurch sich dieser schneller verbessert. Wir benötigen also weniger Iterationen. Das gilt sowohl für $f_{bmax}$ als auch für $f_{twin}$. 
\end{enumerate} \end{document} diff --git a/is/UB4/Ex4.ipynb b/is/UB4/Ex4.ipynb index d5d51ae..b09c83e 100644 --- a/is/UB4/Ex4.ipynb +++ b/is/UB4/Ex4.ipynb @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -464,7 +464,46 @@ "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 0.068299\n", + "1 0.017395\n", + "2 -0.050700\n", + "3 0.032200\n", + "4 0.061997\n", + "5 0.089905\n", + "6 0.107902\n", + "7 -0.054100\n", + "8 -0.127800\n", + "9 -0.080800\n", + "10 0.266001\n", + "11 0.046798\n", + "12 0.144595\n", + "13 -0.042900\n", + "14 -0.084100\n", + "...\n", + "125 -0.010501\n", + "126 -0.122201\n", + "127 -0.017400\n", + "128 0.096505\n", + "129 -0.051401\n", + "130 0.141001\n", + "131 0.027494\n", + "132 0.031099\n", + "133 -0.017400\n", + "134 -0.061400\n", + "135 0.080098\n", + "136 0.115100\n", + "137 -0.048000\n", + "138 0.002206\n", + "139 0.063503\n", + "Name: y, Length: 140, dtype: float64\n" + ] + } + ], "source": [ "### CHANGE THE PATHS ###\n", "a=4\n", @@ -482,7 +521,8 @@ "Xtrain = Xtrain.drop(Xtrain.columns[[0]],axis = 1)\n", "ytrain = squeeze(ytrain.drop(ytrain.columns[[0]],axis = 1))\n", "Xtest = Xtest.drop(Xtest.columns[[0]],axis = 1)\n", - "ytest = squeeze(ytest.drop(ytest.columns[[0]],axis = 1))" + "ytest = squeeze(ytest.drop(ytest.columns[[0]],axis = 1))\n", + "print ytrain" ] }, { @@ -1786,21 +1826,21 @@ }, "celltoolbar": "Edit Metadata", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.4.3" + "pygments_lexer": "ipython2", + "version": "2.7.9" } }, 
"nbformat": 4, diff --git a/is/kaggle/cvMaxRegressionWorking.py b/is/kaggle/cvMaxRegressionWorking.py new file mode 100644 index 0000000..c0da0d5 --- /dev/null +++ b/is/kaggle/cvMaxRegressionWorking.py @@ -0,0 +1,153 @@ +#including cv +import pandas as pd +import numpy as np +from sklearn.cross_validation import KFold +from sklearn import linear_model +from collections import Counter + +def read(path="./datasets/train.csv"): + return(pd.read_csv(path, index_col='Date', parse_dates='Date')) + +data=read() +data.drop(data.columns[[1,4,5,6,7]],axis=1,inplace=True) + + +def cv(data=data, n_folds=2): + """split data in n_folds parts for cross validation + """ + + cleanData=data[pd.notnull(data['Weight'])] + + kf=KFold(len(cleanData), shuffle=True, n_folds=n_folds) + + trainid=[] + testid=[] + for train, test in kf: + trainid.append(train) + testid.append(test) + + data_test=[] + data_train=[] + for i in range(n_folds): + data_train.append(data.copy()) + data_test.append([]) + for j in testid[i]: + data_test[i].append(pd.DataFrame(cleanData.iloc[j])) + #crazy hack, necessary ... + train=data_train[i][data_train[i]['ID']==cleanData.iloc[j]['ID']] + train['Weight']=float('NaN') + data_train[i][data_train[i]['ID']==cleanData.iloc[j]['ID']]=train + return (data_train,data_test) + +data_train, data_test=cv() + +def evaluate(predictions, data_test, predictedWeight='predWeight'): + """calcs the rmse on the testdata""" + n=len(data_test) + error=0 + for i in range(n): + test_value=np.float64(data_test[i].loc['Weight']) + #no better idea... 
+ pred_value=predictions.iloc[int(data_test[i].loc['ID'])-1][predictedWeight] + error+= (test_value - pred_value)**2 + return(np.sqrt(error/n)) + +#1st example +def interpol(data): + return data['Weight'].interpolate() + +def calorieBased(data): + calMean=data['Calories'].mean() + calSTD=data['Calories'].std() + #fill with random data for nan-values + #data['Calories']=data['Calories'].fillna(np.random.normal(loc=calMean,scale=calSTD,size=len(data['Calories'])-data['Calories'].count())) + nans=len(data['Calories'])-data['Calories'].count() + dfrand = calSTD*np.random.randn(nans)+calMean + + #data['Calories',np.isnan(data['Calories'])]= dfrand[np.isnan(data['Calories'])]; #Erzeuge zufaellige kalorienwerte, ersetze sie durch nan werte. + a=[] + c=0 + for i in range(len(data['Calories'])): + if np.isnan(data['Calories'][i]): + + a.append(dfrand[c]) + c+=1 + else: + a.append(data['Calories'][i]) + data['Calories']=a + + + return data + + # for i in range(len(data)): + # if i==0: + # data['Weight'][0]=data['Weight'].mean() + # elif np.isnan(data['Weight'][i]): + # data['Weight',i]=data['Weight'][i-1]+(np.mean(data['Calories'][i-5:i])-calMean)/10 + + #return(data['Weight']) +def applyRegression(data,lmlinear,n): + weights= np.array(data['Weight']); + calories=np.array(data['Calories']); + ids = np.array(data['ID']) + calMean=data['Calories'].mean() + f = np.vectorize(lambda x:x-calMean) + for i in range(len(weights)-1,-1+n,-1): #letztes elemente muss ein gewicht haben (wie in den echten daten) + if np.isnan(weights[i]): + pred=lmlinear.predict(np.append(f((calories[i-n+1:i+1])[::-1]),ids[i])) + weights[i]= weights[i+1]-pred + data['Weight']= weights + #print(weights) + print(data) + + data.to_csv(path_or_buf='data.csv') + + +def procRegression(data,n): + weights= np.array(data['Weight']); + calories=np.array(data['Calories']); + ids = np.array(data['ID']) + counter =0 + trainX=[] + trainY=[] + calMean=data['Calories'].mean() + f = np.vectorize(lambda x:x-calMean) + for i 
in range(n,len(weights)): + for j in range(0,n): # letzten n calorien nicht nan + if(np.isnan(calories[i-j])): + counter =0 + break + else: + counter +=1 + if((counter == n) & (not np.isnan(weights[i]))&(not np.isnan(weights[i-1]))): + + trainY.append([weights[i]-weights[i-1]]) + + tr= np.append(f((calories[i-n+1:i+1])[::-1]),ids[i]) + trainX.append(tr)#reverse f, mittelwertbefreit, vllt calorien gewichten + #print((calories[i-n+1:i+1])[::-1]) + counter =0 + #print(trainX) + lmlinear = linear_model.LinearRegression() + lmlinear.fit(trainX,trainY) + return lmlinear + +rmse=[] +sum=0 +n=0 +#for i in range(10): +# data_train[i]['predWeight'] = interpol(data_train[i]) +# rmse.append(evaluate(data_train[i], data_test[i])) +# if(~np.isnan(rmse[i])): +# n+=1 +# sum+=rmse[i] +# print("RMSE(",i,"):",rmse[i]) +nn=2#menger der kalorien elemente +lmlinear =procRegression(data, nn) +data=calorieBased(data) +applyRegression(data, lmlinear,nn) + +#print("Mean RSME:",sum/n) +#print(data_train[1]) + + diff --git a/mr/ub6/UB6.pdf b/mr/ub6/UB6.pdf index bb0f376..666d256 100644 --- a/mr/ub6/UB6.pdf +++ b/mr/ub6/UB6.pdf Binary files differ diff --git a/mr/ub6/UB6.tex b/mr/ub6/UB6.tex index 78fe9d7..71f9323 100644 --- a/mr/ub6/UB6.tex +++ b/mr/ub6/UB6.tex @@ -108,13 +108,13 @@ 0.01730&0.09848&0.00151\\ \end{pmatrix} $ - \item match if $n^T_R\cdot E\cdot n_L=0\\ - n_L= \begin{pmatrix} - 1.33,&0.54&1.0 + \item match if $p^T_L\cdot E\cdot p_R=0\\ + p_L= \begin{pmatrix} + 0.46&0.27&1.00 \end{pmatrix} $\begin{itemize} - \item $n_{R,1}\rightarrow \begin{pmatrix} - 0.3&0.2&1.0 + \item $p_{R,1}\rightarrow \begin{pmatrix} + 0.46&0.27&1.00 \end{pmatrix} \begin{pmatrix} 0&0&0\\ @@ -122,28 +122,20 @@ 0.01730&0.09848&0.00151\\ \end{pmatrix} \begin{pmatrix} - 1.33,\\0.54\\1.0 - \end{pmatrix}\\ =\begin{pmatrix} - 0.3&0.2&1.0 - \end{pmatrix} \begin{pmatrix} - 0\\-0.0880224\\0.0776982 + 0.30\\0.20\\1.00 \end{pmatrix} - $\\ - \item $n_{R,2}\rightarrow\begin{pmatrix} - -0.39&0.32&1.0 - \end{pmatrix} + 
=0.0020492\rightarrow$ match\\ + \item $p_{R,2}\rightarrow p_L^T\cdot E + \begin{pmatrix} - 0\\-0.0880224\\0.0776982 - \end{pmatrix} - $\\ - \item $n_{R,3}\rightarrow\begin{pmatrix} - -0.25&0.1&1.0 - \end{pmatrix} - \begin{pmatrix} - 0\\-0.0880224\\0.0776982 - \end{pmatrix}$ - \end{itemize} - werte falsch ... finde rechenfehler nicht. + 0.26\\0.21\\1.0 + \end{pmatrix}=0.00040354 + \rightarrow$ match\\ + \item $p_{R,3}\rightarrow p_L^T\cdot E + \begin{pmatrix} + -0.25\\-0.10\\1.00 + \end{pmatrix}=-0.040149 + \rightarrow$ no match\end{itemize} + \item $(K^{-1} v_R)^T\cdot E\cdot K^{-1}v_L = v_R^{T}\cdot K^{T^{-1}}\cdot E\cdot K^{-1} \cdot v_L\\ E= K^T\cdot F\cdot K \\