\documentclass[a4paper,12pt]{scrartcl}
\usepackage[ngerman]{babel}
\usepackage{graphicx} %include images
\usepackage{amsmath} %extended math symbols
\usepackage{amsfonts} %additional math fonts
\usepackage[utf8]{inputenc} %umlauts etc.
\usepackage{xcolor} %colored text for \textcorr and the corr environment
\usepackage{hyperref} %hyperlinks
\usepackage{ifthen} %ifthenelse
\usepackage{enumerate}
\usepackage{listings}
\lstset{language=Python}
\usepackage{algpseudocode} %Pseudocode
\usepackage{dsfont} %nice number-set symbols (\mathds)
\usepackage{amssymb, amsthm} %further extended math symbols
\usepackage{tikz} %TikZ is not a drawing program
\usetikzlibrary{trees,automata,arrows,shapes}
\pagestyle{empty}
\topmargin-50pt
\newcounter{aufgabe}
\def\tand{&}
\newcommand{\makeTableLine}[2][0]{%
\setcounter{aufgabe}{1}%
\whiledo{\value{aufgabe} < #1}%
{%
#2\tand\stepcounter{aufgabe}%
}
}
\newcommand{\aufgTable}[1]{
\def\spalten{\numexpr #1 + 1 \relax}
\begin{tabular}{|*{\spalten}{p{1cm}|}}\hline
\makeTableLine[\spalten]{A\theaufgabe}$\Sigma$~~\\ \hline
\rule{0pt}{15pt}\makeTableLine[\spalten]{}\\ \hline
\end{tabular}
}
\def\header#1#2#3#4#5#6#7{\pagestyle{empty}
\begin{minipage}[t]{0.47\textwidth}
\begin{flushleft}
{\bf #4}\\
#5
\end{flushleft}
\end{minipage}
\begin{minipage}[t]{0.5\textwidth}
\begin{flushright}
#6 \vspace{0.5cm}\\
% Number of Columns Definition of Columns second empty line
% \begin{tabular}{|*{5}{C{1cm}|}}\hline A1&A2&A3&A4&$\Sigma$\\\hline&&&&\\\hline\end{tabular}\\\vspace*{0.1cm}
\aufgTable{#7}
\end{flushright}
\end{minipage}
\vspace{1cm}
\begin{center}
{\Large\bf Sheet #1}
{(Hand in #3)}
\end{center}
}
%counts the exercise number
\newcounter{n}
%command for exercises
%\Aufgabe{ExerciseTitle}{Points}
\newcommand{\Aufgabe}[2]{\stepcounter{n}
\textbf{Exercise \arabic{n}: #1} (#2 Points)\\}
\newcommand{\norm}[1]{\left \| #1 \right \|}
\DeclareMathOperator{\minimize}{minimize}
\DeclareMathOperator{\maximize}{maximize}
\newcommand{\real}{\mathbb{R}}
\newcommand{\blasso}{\beta^{\mathrm{LASSO}}}
\newcommand{\bzero}{\beta^0}
\newcommand{\bLS}{\hat{\beta}^{\mathrm{LS}}}
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand{\textcorr}[1]{\textcolor{red}{#1}}
\newenvironment{corr}{\color{red}}{\color{black}\newline}
\newcommand{\ok}{\begin{corr}
$\checkmark$
\end{corr}}
\begin{document}
%\header{SheetNo}{Tutor}{HandInDate}{LectureName}{Names}{Semester}{NumberOfExercises}
\header{4}{}{2015-05-21}{Intelligent Systems I}{\textit{Maximus Mutschler}\\ \textit{Jan-Peter Hohloch}
}{SS 15}{2}
\vspace{1cm}
\Aufgabe{LASSO \& $l_0$}{30}
\begin{enumerate}
\item \begin{enumerate}
\item \includegraphics[width=.4\textwidth]{ghard.png}\textcorr{\checkmark}
\item \begin{align*}
\hat{\beta}^{LS}&=\left(X^TX\right)^{-1}X^T\mathbf{y}\\
&=\left(nI_{n\times n} \right)^{-1}X^T\mathbf{y}\\
&=\frac{1}{n}\cdot I_{n\times n}X^T\mathbf{y}\\
&=\frac{1}{n}\cdot X^T\mathbf{y}\textcorr{\checkmark}
\end{align*}
\item \begin{math}
\bzero(\lambda) := \argmin_{\beta} \frac{1}{n} \norm{Y - X \beta}_2^2 + \lambda \norm{\beta}_0 \\
=\argmin_{\beta} \underbrace{\frac{1}{n} Y^{T}Y}_{\text{constant}}- \frac{2}{n}Y^{T}X\beta+\norm{\beta}^2+\lambda \norm{\beta}_0 \quad\text{(using $X^TX=nI$)}\\
=\argmin_{\beta} -\frac{2}{n}Y^{T}X\beta+\norm{\beta}^2+\lambda \norm{\beta}_0 \\
=\textcorr{\textbf{arg }}\mathrm{min}_{\beta} \sum^p_{i=1}-\frac{2}{n}Y^{T}_iX_i\beta_i+\beta_i^2+\lambda \cdot \begin{cases}
0, & \beta_i =0\\
1, & \beta_i \neq 0\\
\end{cases} %0 or 1
\\\text{The objective decomposes coordinate-wise,}\\ \text{so it suffices to minimize a single representative summand $L_i$:}\\
L_i=\begin{cases}
0, & \beta_i =0\\
-\frac{2}{n}Y^{T}_iX_i\beta_i+\beta_i^2+\lambda, & \beta_i \neq 0\\
\end{cases} \\
\\\frac{d L_i }{d \beta_i}= -\frac{2}{n} X_i^TY_i +2 \beta_i \overset{!}{=} 0\\
\beta_i= \frac{1}{n} X^T_iY_i\\
\text{Substitute this $\beta_i$ back into $L_i$ (case $\beta_i \neq 0$) and determine for which $\lambda$ it is no better than $\beta_i =0$:}\\
0= -2\beta_i^2+\beta_i^2 +\lambda\\
\lambda=|\beta_i|^2\\
\lambda = \left|\frac{1}{n} X^T_iY_i\right|^2 \\
\sqrt{\lambda} = \left|\frac{1}{n} X^T_iY_i\right| \\
\bzero_i(\lambda) = \frac{1}{n} X^T_iY_i \cdot
\begin{cases}
0, & \left|\frac{1}{n}X^T_iY_i\right| < \sqrt{\lambda}\\
1, & \left|\frac{1}{n}X^T_iY_i\right| \geq \sqrt{\lambda}\\
\end{cases}
\end{math}
This is obviously not the correct solution, but perhaps the right line of attack. (A small numerical check of the corrected closed form below is sketched at the end of this part.)
\begin{corr}
\begin{align*}
\beta^0(\lambda)&=\argmin_\beta \mathcal{L}(\beta)\\
&= \argmin_\beta \frac{1}{n}\left(Y-X\beta\right)^T\left(Y-X\beta\right)+\sum\limits_{i=1}^p\lambda \mathds{1}_{\{\beta_i\not=0\}}\\
&= \argmin_\beta \frac{1}{n}\left(-2Y^TX\beta +n\beta^T\beta\right)+\sum\limits_{i=1}^p\lambda \mathds{1}_{\{\beta_i\not=0\}}\\
&=\argmin_\beta \sum\limits_{i=1}^p \underbrace{-\frac{2}{n}\left(Y^TX\right)_i\beta_i+|\beta_i|^2+\lambda\mathds{1}_{\{\beta_i\not=0\}}}_{=:\mathcal{L}_i\left(\beta_i\right)}\\
\beta_i^0&=\argmin_{\beta_i} \mathcal{L}_i(\beta_i)\\
-\frac{2}{n}\left(Y^TX\right)_i\beta_i+|\beta_i|^2&= \beta_i\left(\beta_i-\frac{2}{n}\left(Y^TX\right)_i\right)\\
\text{a parabola with }& \text{minimum at } \tilde{\beta}_i=\frac{1}{n}\left(Y^TX\right)_i=:z_i\\
\Rightarrow \mathcal{L}_i\left(\tilde{\beta}_i\right) &= \lambda \mathds{1}_{\{\beta_i\not=0\}} - \left(\frac{1}{n}\left(Y^TX\right)_i\right)^2\\
\text{(assuming }\beta_i\not=0\text{)} &= \lambda -\left(\frac{1}{n}\left(Y^TX\right)_i\right)^2\\
\min_{\beta_i}\mathcal{L}_i\left(\beta_i\right)&=\min\left(\underbrace{\min_{\beta_i\not=0}\mathcal{L}_i\left(\beta_i\right)}_{\lambda-z_i^2},\underbrace{\mathcal{L}_i(0)}_{=0}\right)\\
\lambda-z_i^2\leq 0 &\Leftrightarrow \sqrt{\lambda}\leq |z_i|\\
\Rightarrow \beta_i^0(\lambda) &= z_i\mathds{1}_{\{|z_i|\geq\sqrt{\lambda}\}}=g_{hard,\lambda}(z_i)
\end{align*}
\end{corr}
\end{enumerate}
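The corrected closed form can be checked numerically. The following minimal sketch (the helper names \texttt{g\_hard} and \texttt{brute\_force\_l0} and the test values are made up purely for illustration) minimizes the per-coordinate objective $-2z_i\beta_i+\beta_i^2+\lambda\mathds{1}_{\{\beta_i\neq 0\}}$ over a fine grid and compares the minimizer with $z_i\mathds{1}_{\{|z_i|\geq\sqrt{\lambda}\}}$:
\begin{lstlisting}
import numpy as np

def g_hard(z, lam):
    # hard thresholding: keep z if |z| >= sqrt(lambda), otherwise return 0
    return z * (abs(z) >= np.sqrt(lam))

def brute_force_l0(z, lam):
    # grid search over the per-coordinate objective -2*z*b + b^2 + lam*1{b != 0}
    grid = np.linspace(-5, 5, 200001)
    loss = -2 * z * grid + grid**2 + lam * (grid != 0)
    return grid[np.argmin(loss)]

lam = 1.0
for z in [-2.0, -0.3, 0.0, 0.4, 1.5]:
    # both columns give the same minimizer (up to the grid resolution)
    print(z, g_hard(z, lam), brute_force_l0(z, lam))
\end{lstlisting}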
\item \begin{enumerate}
\item \includegraphics[width=0.4\textwidth]{gsoft.png}
\item \begin{math}\blasso(\lambda) := \argmin_{\beta} \frac{1}{n} \norm{Y - X \beta}_2^2 + \lambda \norm{\beta}_1 \\
=\argmin_{\beta} \underbrace{\frac{1}{n} Y^{T}Y}_{\text{constant}}- \frac{2}{n}Y^{T}X\beta+\norm{\beta}^2+\lambda \norm{\beta}_1 \\
=\argmin_{\beta} -\frac{2}{n}Y^{T}X\beta+\norm{\beta}^2+\lambda \norm{\beta}_1 \\
=\argmin_{\beta} \sum^p_{i=1}-\frac{2}{n}Y^{T}_iX_i\beta_i+\beta_i^2+\lambda |\beta_i|\\
\text{The objective again decomposes into a sum over coordinates,}\\\text{so each summand can be minimized separately for fixed $i$:}\\
L_i=-\frac{2}{n}Y^{T}_iX_i\beta_i+\beta_i^2+\lambda |\beta_i|\\
\text{At the minimum, $Y^{T}_iX_i >0$ forces $\beta_i \geq 0$,}\\\text{and conversely $Y^{T}_iX_i \leq 0$ forces $\beta_i \leq 0$.}\\\\
\text{Case 1: $Y^{T}_iX_i >0$ and $\beta_i \geq 0$}\\
\frac{dL_i}{d\beta_i} =-\frac{2}{n}Y^{T}_iX_i+2\beta_i+\lambda \overset{!}{=} 0\\
\beta_i = \frac{1}{n}Y^{T}_iX_i-\frac{\lambda}{2}\\
\text{This is only feasible if the right-hand side is nonnegative:}\\
\blasso_i(\lambda) =\left(\frac{1}{n}Y^{T}_iX_i-\frac{\lambda}{2}\right)^+
\\= \operatorname{sgn}\left(\frac{1}{n}Y^{T}_iX_i\right)% at first we did not understand this step
\cdot\left(\left|\frac{1}{n}Y^{T}_iX_i\right|-\frac{\lambda}{2}\right)^+\\
\text{(here $\frac{1}{n}Y^{T}_iX_i>0$, so taking the sign and the absolute value changes nothing)}\\
\text{Case 2: $Y^{T}_iX_i \leq 0$ and $\beta_i \leq 0$}\\
\text{leads by symmetry to the same expression, which completes the argument.}
% source: http://stats.stackexchange.com/questions/17781/derivation-of-closed-form-lasso-solution
\end{math}\\
We needed more than 12 hours to solve these equations. Please be aware that we are computer scientists with only a basic mathematical education, not mathematics students! (A small numerical check of the soft-thresholding formula is sketched after this exercise.)
\begin{corr}
\begin{align*}
\blasso(\lambda)&=\argmin_\beta \mathcal{L}(\beta)\\
&= \argmin_\beta \frac{1}{n}\left|\left|Y-X\beta\right|\right|^2+\lambda ||\beta||_1\\
&= \argmin_\beta \frac{1}{n}\left(-2Y^TX\beta +n\beta^T\beta\right)+\lambda \sum\limits_{i=1}^p|\beta_i|\\
&=\argmin_\beta \sum\limits_{i=1}^p \underbrace{-2z_i\beta_i+|\beta_i|^2+\lambda|\beta_i|}_{=:\mathcal{L}_i}
\end{align*}
\begin{align*}
\beta_i\geq 0: & \mathcal{L}_i(\beta_i)&=\beta_i\left(\beta_i-2z_i+\lambda\right)\\
&\tilde{\beta}_i&=\begin{cases}
z_i-\frac{\lambda}{2} & \text{if } z_i\geq \frac{\lambda}{2}\\
0 & \text{else}
\end{cases}\\
\beta_i\leq 0: & \mathcal{L}_i(\beta_i)&=\beta_i\left(\beta_i-2z_i-\lambda\right)\\
&\tilde{\beta}_i&=\begin{cases}
z_i+\frac{\lambda}{2} & \text{if } z_i\leq -\frac{\lambda}{2}\\
0 & \text{else}
\end{cases}\\
\Rightarrow & \beta_i^{\mathrm{LASSO}}&=g_{soft,\frac{\lambda}{2}}\left(z_i\right)
\end{align*}
\end{corr}
\end{enumerate}
\end{enumerate}
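Analogously, the soft-thresholding result of the correction above can be sanity-checked numerically; the helper names and test values below are again purely illustrative. The sketch brute-forces the per-coordinate objective $-2z_i\beta_i+\beta_i^2+\lambda|\beta_i|$ over a grid and compares the minimizer with $g_{soft,\frac{\lambda}{2}}(z_i)=\operatorname{sgn}(z_i)\left(|z_i|-\frac{\lambda}{2}\right)^+$:
\begin{lstlisting}
import numpy as np

def g_soft(z, thresh):
    # soft thresholding: shrink z towards 0 by thresh, truncated at 0
    return np.sign(z) * max(abs(z) - thresh, 0.0)

def brute_force_l1(z, lam):
    # grid search over the per-coordinate LASSO objective -2*z*b + b^2 + lam*|b|
    grid = np.linspace(-5, 5, 200001)
    loss = -2 * z * grid + grid**2 + lam * np.abs(grid)
    return grid[np.argmin(loss)]

lam = 1.0
for z in [-2.0, -0.3, 0.0, 0.4, 1.5]:
    # both columns give the same minimizer (up to the grid resolution)
    print(z, g_soft(z, lam / 2), brute_force_l1(z, lam))
\end{lstlisting}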
\Aufgabe{LASSO}{70}
\begin{lstlisting}
import matplotlib.pyplot as plt  # needed for the plots below

def printOutput(lm_, Xtest_, ytest_):  # lm_ = instance of linear_model.xxx(xxx)
    yhat = lm_.predict(Xtest_)
    plt.figure(1)
    plt.title("Regression Weights")
    plt.plot(lm_.coef_.T)
    plt.figure(2)
    plt.title('yhat vs ytest')
    plt.plot(ytest_, yhat, 'ro')
    plt.show()
    print('R2 :', lm_.score(Xtest_, ytest_))
    print('Gene with Strongest Coefficient :', abs(lm_.coef_).argmax())
    if hasattr(lm_, 'alpha'): print('Used Lambda :', lm_.alpha)
    if hasattr(lm_, 'alpha_'): print('Lambda_ResultOfCV :', lm_.alpha_)
\end{lstlisting}
\begin{enumerate}[a)]
\item What should happen if you tried to apply linear regression without any regularizer on this data set?
\begin{itemize}
\item many non-zero weights
\item possibly large weights
\item \textcorr{the solution is not unique}
\end{itemize}
\item Does this happen with the function \texttt{linear\_model.LinearRegression()}? (See the fitting sketch after this list.)
\begin{itemize}
\item yes (only one weight is exactly zero for 4710 and 3290)
\item no (largest weight for 4710 $\approx 0.0227$, for 3290 $\approx 0.0477$)
\end{itemize}
\end{enumerate}
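For completeness, a minimal sketch of how the unregularized fit referred to in a) and b) could be produced; it reuses \texttt{printOutput} from above and assumes the notebook's \texttt{Xtrain}, \texttt{ytrain}, \texttt{Xtest} and \texttt{ytest} arrays:
\begin{lstlisting}
from sklearn import linear_model

# ordinary least squares without any regularizer (questions a) and b))
lm = linear_model.LinearRegression()
lm.fit(Xtrain, ytrain)
printOutput(lm, Xtest, ytest)
\end{lstlisting}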
\includegraphics[width=0.49\textwidth]{outputLinear.png}
\includegraphics[width=0.49\textwidth]{outputRidge.png}\\
\includegraphics[width=\textwidth]{outputLasso.png}\\
These results were produced with the following code:
\begin{lstlisting}
from sklearn import linear_model  # model classes used below

lm = linear_model.RidgeCV(cv=10) ### CHANGE THIS LINE ###
lm.n_jobs=-1
lm.fit(Xtrain, ytrain);
#-------------------------------------------------------------
lm = linear_model.LassoCV(cv=10) ### CHANGE THIS LINE ###
lm.n_jobs=-1
lm.max_iter=1000
lm.fit(Xtrain, ytrain)
printOutput(lm, Xtest, ytest) ### PROVIDE THE OUTPUT ###
print('Regression Coefficient of Gene 5954 : ', lm.coef_[5954])
\end{lstlisting}
\textcorr{has to be 5953}\\
\textcorr{\textbf{other gene missing}}
\end{document}