\documentclass[a4paper,12pt]{scrartcl}
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{dsfont}

\newcommand{\iid}{\overset{\text{iid}}{\sim}}

\begin{document}
    \section{Summary} % (fold)
    \label{sec:summary}
        \subsection{Probability Theory \& Statistics} % (fold)
        \label{sub:probTheo}
            \begin{itemize}
                \item random variables, distributions, expectations, variance, Bayes' rule, law of total prob.
                \item $X_1,\dots,X_N\iid p_\theta$
                \item MLE $\hat\theta$ is chosen to maximize $\prod\limits_{i=1}^N p_\theta(x_i)$ (worked example after this list)
            \end{itemize}
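            As a worked example (a standard computation, not specific to this summary): for $X_1,\dots,X_N\iid\mathcal{N}(\mu,\sigma^2)$ with $\sigma^2$ known, the log-likelihood is
            \begin{align*}
                \log\prod_{i=1}^N p_\mu(x_i)=-\frac{N}{2}\log\left(2\pi\sigma^2\right)-\frac{1}{2\sigma^2}\sum_{i=1}^N\left(x_i-\mu\right)^2,
            \end{align*}
            and setting its derivative in $\mu$ to zero gives the MLE $\hat\mu=\frac{1}{N}\sum\limits_{i=1}^N x_i$, the sample mean.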
        % subsection probTheo (end)
        \subsection{Classification} % (fold)
        \label{sub:classification}
            \begin{itemize}
                \item[given:] $(X_1,Y_1),\dots, (X_N,Y_N)\iid \mathds{P}$
                \item[goal:] find $f:\mathds{R}^m\rightarrow \{-1,1\}$ (classifier)
                \item[criterion:] 0-1 loss $l:\{-1,1\}^2\rightarrow \{0,1\}: (y_1,y_2)\mapsto \mathds{1}_{y_1\neq y_2}$
                \item SVM: consider $f_{w,b}:\mathds{R}^m\rightarrow\{-1,1\}:x\mapsto \operatorname{sgn}(w^Tx+b)$
                \begin{itemize}
                    \item e.g. hard margin SVM (primal; dual sketched after this list)
                    \item $O_1:\ \min\limits_{w,b}\ \frac{1}{2}||w||^2\quad\text{s.t.}\quad y_i(w^Tx_i+b)\geq 1\ \forall i$
                \end{itemize}
                \end{itemize}
            \end{itemize}
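            For reference, the Lagrange dual of $O_1$ (textbook derivation, sketched here; the summary itself only lists the primal):
            \begin{align*}
                \max_{\alpha\in\mathds{R}^N}\ \sum_{i=1}^N\alpha_i-\frac{1}{2}\sum_{i,j=1}^N\alpha_i\alpha_jy_iy_j\,x_i^Tx_j
                \quad\text{s.t.}\quad \alpha_i\geq 0\ \forall i,\ \sum_{i=1}^N\alpha_iy_i=0,
            \end{align*}
            with $w=\sum\limits_{i=1}^N\alpha_iy_ix_i$ and $\alpha_i>0$ only for support vectors. The data enter only through the inner products $x_i^Tx_j$, which is what makes the kernel trick of Section~\ref{sub:kernels} applicable.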
        % subsection classification (end)
        \subsection{Regression} % (fold)
        \label{sub:regression}
        \begin{itemize}
            \item[given:] $(X_1,Y_1),\dots, (X_N,Y_N)\iid \mathds{P}$
            \item[goal:] find $f:\mathds{R}^m\rightarrow \mathds{R}$
            \item e.g. linear functions $x\mapsto w^Tx$
        \end{itemize}
        \begin{enumerate}
            \item e.g. squared loss $l:(y_1,y_2)\mapsto (y_1-y_2)^2$\\
            $\leadsto$ OLS $w_{OLS}:=\arg\min\limits_w \frac{1}{N}\sum\limits_{i=1}^N\left(w^Tx_i-y_i\right)^2$ (closed form after this list)
            \item same thing as:\\
            assume $Y=w^TX+\epsilon,\ \epsilon\sim\mathcal{N}(0,\sigma^2)$\\
            find MLE for $(w,\sigma^2)$
            \item penalized regression
            \begin{itemize}
                \item e.g. ridge regression $w_{Ridge}:=\arg\min\limits_w\frac{1}{N}\sum\limits_{i=1}^N\left(w^Tx_i-y_i\right)^2+\lambda ||w||_2^2$\\
                $w_{Ridge}=\left(X^TX+\lambda \mathds{1}\right)^{-1}X^Ty$ (with $\lambda$ absorbing the factor $N$)
                \item LASSO: $w_{LASSO}:=\arg\min\limits_w\frac{1}{N}\sum\limits_{i=1}^N \left(w^Tx_i-y_i\right)^2+\lambda||w||_1$
            \end{itemize}
            \item Put a prior $p(w)$ on $w$ and compute the posterior mean\\
            $w_{Bayes}=E\left[w\mid X_1,Y_1,\dots,X_N,Y_N\right]$.\\
            A Gaussian prior corresponds to ridge regression (the posterior mean equals $w_{Ridge}$ for a suitable $\lambda$).
        \end{enumerate}
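        Two standard facts behind items 1 and 2, spelled out for reference (with $X\in\mathds{R}^{N\times m}$ the matrix of stacked rows $x_i^T$, $y\in\mathds{R}^N$, and $X^TX$ assumed invertible):
        \begin{align*}
            w_{OLS}&=\left(X^TX\right)^{-1}X^Ty \quad\text{(normal equations)},\\
            \log p(y\mid X,w,\sigma^2)&=-\frac{N}{2}\log\left(2\pi\sigma^2\right)-\frac{1}{2\sigma^2}\sum_{i=1}^N\left(y_i-w^Tx_i\right)^2,
        \end{align*}
        so maximizing the likelihood over $w$ is exactly minimizing the squared loss, and the MLE of the noise variance is $\hat\sigma^2=\frac{1}{N}\sum\limits_{i=1}^N\left(y_i-w_{OLS}^Tx_i\right)^2$.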
        % subsection regression (end)
        \subsection{Cross Validation} % (fold)
        \label{sub:cross_validation}
            \begin{itemize}
                \item for choosing hyperparameters such as $\lambda$
                \item e.g. 10-fold CV (procedure sketched below)
            \end{itemize}
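            The standard procedure, as a generic sketch: split the training data into 10 disjoint folds; for each candidate $\lambda$ and each fold, fit on the other 9 folds and record the loss on the held-out fold; pick the $\lambda$ with the smallest average held-out loss and refit on the full data.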
        % subsection cross_validation (end)
        \subsection{Features} % (fold)
        \label{sub:features}
            \begin{itemize}
                \item Use $\phi(x_1),\dots,\phi(x_N)$ instead of $x_1,\dots,x_N$ (example below)
            \end{itemize}
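            For example (an illustrative feature map, not from the summary): for $x=(x_1,x_2)\in\mathds{R}^2$, the quadratic features $\phi(x)=\left(x_1,x_2,x_1^2,x_2^2,x_1x_2\right)$ let a linear method $w^T\phi(x)$ represent quadratic regression functions and decision boundaries in the original coordinates.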
        % subsection features (end)
        \subsection{Kernels} % (fold)
        \label{sub:kernels}
            \begin{itemize}
                \item $k: X\times X\rightarrow \mathds{R}$
                \item strictly positive definite / positive semi-definite
                \item for each positive semi-definite $k$ $\exists\text{ RKHS }\mathcal{H}\subseteq\{f:X\rightarrow \mathds{R}\}$ with
                \begin{itemize}
                    \item $k(x,\cdot)\in\mathcal{H}$
                    \item reproducing property: $\left< k(x,\cdot),k(\tilde{x},\cdot)\right>=k(x,\tilde{x})$
                    \item linearity: $\left< k(x,\cdot) + k(\tilde{x},\cdot),f\right>=\left< k(x,\cdot),f\right>+\left< k(\tilde{x},\cdot),f\right>$ for all $f\in\mathcal{H}$
                \end{itemize}
                \item e.g. kernel ridge regression (spelled out after this list)
                \item e.g. kernel mean embedding of a distribution: $\mu:\mathds{P}\mapsto \mu(\mathds{P}):=E_{X\sim\mathds{P}}\,k(X,\cdot)$
                \begin{itemize}
                    \item Example:\\
                        $\mathds{P}(\{x_1\})=\mathds{P}(\{x_2\})=\frac{1}{2} \Rightarrow \mu(\mathds{P})=\frac{1}{2}\left(k(x_1,\cdot)+k(x_2,\cdot)\right)$
                \end{itemize}
            \end{itemize}
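            Kernel ridge regression spelled out (standard result; the Gram matrix notation $K_{ij}:=k(x_i,x_j)$ is introduced here): minimizing $\sum\limits_{i=1}^N\left(f(x_i)-y_i\right)^2+\lambda||f||^2_{\mathcal{H}}$ over $f\in\mathcal{H}$ gives, by the representer theorem,
            \begin{align*}
                \hat f(x)=\sum_{i=1}^N\alpha_i k(x_i,x),\qquad \alpha=\left(K+\lambda\mathds{1}\right)^{-1}y,
            \end{align*}
            the kernelized analogue of the ridge formula above.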
        % subsection kernels (end)
        \subsection{PCA} % (fold)
        \label{sub:pca}
            \begin{itemize}
                \item[goal:] find principal components (directions of highest variance)
                \item[idea:] eigenvalue decomposition of the covariance (data assumed centered) $\Sigma=\frac{1}{N}X^TX=W\Lambda W^T$, $\Lambda$ diagonal, $W$ orthogonal
                \item $\tilde{X}:=XW\ \left(cov(\tilde{X})=\frac{1}{N}W^TX^TXW=\Lambda\right)$
                \item the columns $(\tilde{X}_1,\dots,\tilde{X}_p)$ of $\tilde{X}=XW$ (keeping the first $p\leq m$) are the principal components
            \end{itemize}
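            Equivalently, as a variance maximization (standard formulation, stated for reference): the first principal direction is
            \begin{align*}
                w_1=\arg\max_{||w||_2=1}\frac{1}{N}||Xw||_2^2=\arg\max_{||w||_2=1}w^T\Sigma w,
            \end{align*}
            solved by the eigenvector of $\Sigma$ with the largest eigenvalue; each further direction maximizes the same objective orthogonally to the previous ones, and keeping only the first $p<m$ columns of $W$ gives dimensionality reduction.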
        % subsection pca (end)
        \subsection{Causality} % (fold)
        \label{sub:causality}
            \begin{itemize}
                \item SEM (structural equation model)
                \item causal graph $G$
                \item observational distribution $P_{Obs}$
                \item counterfactuals
                \item \dots
            \end{itemize}
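            A minimal example (illustrative; the concrete equations are hypothetical): the SEM $X:=N_X$, $Y:=4X+N_Y$ with independent noise variables induces the graph $G:X\rightarrow Y$ and an observational distribution $P_{Obs}$ over $(X,Y)$. The intervention $do(X:=x)$ replaces the first equation and in general changes the distribution of $Y$, while intervening on $Y$ leaves $X$ unchanged; this asymmetry is not visible in $P_{Obs}$ alone.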
        % subsection causality (end)
    % section summary (end)
\end{document}