\documentclass[a4paper,12pt]{scrartcl}
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{dsfont}
\newcommand{\iid}{\overset{\text{iid}}{\sim}}
\begin{document}
\section{Summary} % (fold)
\label{sec:summary}
\subsection{Probability Theory \& Statistics} % (fold)
\label{sub:probTheo}
\begin{itemize}
\item random variables, distributions, expectations, variance, Bayes' rule, law of total probability
\item $X_1,\dots,X_N\iid p_\theta$
\item the MLE $\hat\theta$ is chosen to maximize the likelihood $\prod\limits_{i=1}^N p_\theta(x_i)$ (worked example after this list)
\end{itemize}
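A standard worked instance (added here for illustration): for $X_1,\dots,X_N\iid\mathcal{N}(\mu,\sigma^2)$ the log-likelihood is
\[
\log\prod_{i=1}^N p_\mu(x_i)=-\frac{N}{2}\log(2\pi\sigma^2)-\frac{1}{2\sigma^2}\sum_{i=1}^N(x_i-\mu)^2,
\]
and setting the derivative with respect to $\mu$ to zero yields $\hat\mu=\frac{1}{N}\sum_{i=1}^N x_i$, the sample mean.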
% subsection probTheo (end)
\subsection{Classification} % (fold)
\label{sub:classification}
\begin{itemize}
\item[given:] $(X_1,Y_1),\dots, (X_N,Y_N)\iid \mathds{P}$
\item[goal:] find $f:\mathds{R}^m\rightarrow \{-1,1\}$ (classifier)
\item criterion: 0-1 loss $l:\{-1,1\}^2\rightarrow \{0,1\}: (y_1,y_2)\mapsto \mathds{1}_{y_1\neq y_2}$
\item SVM: consider $f_{w,b}:\mathds{R}^m\rightarrow\{-1,1\}:x\mapsto \operatorname{sgn}(w^Tx+b)$
\begin{itemize}
\item e.g. hard margin SVM (primal)
\item $O_1:\ \min\limits_{w,b}\ \frac{1}{2}\|w\|^2\ \text{ s.t. }\ y_i(w^Tx_i+b)\geq 1\ \forall i$ (margin computation after this list)
\end{itemize}
\end{itemize}
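Why minimizing $\frac{1}{2}\|w\|^2$ maximizes the margin: points on the two margin hyperplanes satisfy $w^Tx+b=\pm 1$, and the distance between these hyperplanes is
\[
\frac{(1-b)-(-1-b)}{\|w\|}=\frac{2}{\|w\|},
\]
so a smaller $\|w\|$ means a larger margin.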
% subsection classification (end)
\subsection{Regression} % (fold)
\label{sub:regression}
\begin{itemize}
\item[given:] $(X_1,Y_1),\dots, (X_N,Y_N)\iid \mathds{P}$
\item[goal:] find $f:\mathds{R}^m\rightarrow \mathds{R}$
\item e.g. linear functions $x\mapsto w^Tx$
\end{itemize}
\begin{enumerate}
\item e.g. squared loss $l:(y_1,y_2)\mapsto (y_1-y_2)^2$\\
$\leadsto$ OLS $w_{OLS}:=\arg\min_w \frac{1}{N}\sum\limits_{i=1}^N\left(w^Tx_i-y_i\right)^2$
\item equivalently (derivation after this list):\\
assume $Y=w^TX+\epsilon,\ \epsilon\sim\mathcal{N}(0,\sigma^2)$ independent of $X$\\
find the MLE for $(w,\sigma^2)$
\item penalized regression
\begin{itemize}
\item e.g. Ridge regression $w_{\text{Ridge}}:=\arg\min\limits_w \frac{1}{N}\sum\limits_{i=1}^N\left(w^Tx_i-y_i\right)^2+\lambda \|w\|_2^2$\\
closed form (for the unnormalized objective $\|Xw-y\|_2^2+\lambda\|w\|_2^2$, i.e.\ up to rescaling of $\lambda$):\\
$w_{\text{Ridge}}=\left(X^TX+\lambda \mathds{1}\right)^{-1}X^Ty$
\item $w_{\text{LASSO}}:=\arg\min\limits_w \frac{1}{N}\sum\limits_{i=1}^N \left(w^Tx_i-y_i\right)^2+\lambda\|w\|_1$
\end{itemize}
\item put a prior $p(w)$ on $w$ and compute the posterior mean\\
$w_{\text{Bayes}}:=E\left[w\mid X_1,Y_1,\dots,X_N,Y_N\right]$\\
a Gaussian prior corresponds to Ridge regression (the Gaussian posterior's mean coincides with the Ridge estimator)
\end{enumerate}
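Sketch of the equivalence in 2.\ and of the Ridge closed form (standard derivations, written out here for completeness): under $Y_i=w^TX_i+\epsilon_i$ with $\epsilon_i\iid\mathcal{N}(0,\sigma^2)$,
\[
\log\prod_{i=1}^N p_{w,\sigma^2}(y_i\mid x_i)=-\frac{N}{2}\log(2\pi\sigma^2)-\frac{1}{2\sigma^2}\sum_{i=1}^N\left(y_i-w^Tx_i\right)^2,
\]
which for every fixed $\sigma^2$ is maximized in $w$ exactly where $\sum_i\left(w^Tx_i-y_i\right)^2$ is minimized, i.e.\ the MLE for $w$ equals $w_{OLS}$. The Ridge closed form follows from setting the gradient of $\|Xw-y\|_2^2+\lambda\|w\|_2^2$ to zero:
\[
2X^T(Xw-y)+2\lambda w=0\ \Leftrightarrow\ \left(X^TX+\lambda\mathds{1}\right)w=X^Ty.
\]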
% subsection regression (end)
\subsection{Cross Validation} % (fold)
\label{sub:cross_validation}
\begin{itemize}
\item for choosing hyperparameters such as $\lambda$
\item e.g. 10-fold CV (estimate written out below)
\end{itemize}
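One standard way to write the $K$-fold CV criterion (the notation here is this summary's, not necessarily the lecture's): partition $\{1,\dots,N\}$ into folds $I_1,\dots,I_K$ and choose $\lambda$ minimizing
\[
\mathrm{CV}(\lambda)=\frac{1}{K}\sum_{k=1}^{K}\frac{1}{|I_k|}\sum_{i\in I_k} l\left(\hat{f}^{(-k)}_\lambda(x_i),y_i\right),
\]
where $\hat{f}^{(-k)}_\lambda$ is fitted on all data except fold $I_k$; $K=10$ gives 10-fold CV.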
% subsection cross_validation (end)
\subsection{Features} % (fold)
\label{sub:features}
\begin{itemize}
\item use $\phi(x_1),\dots,\phi(x_N)$ instead of $x_1,\dots,x_N$ (example below)
\end{itemize}
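For instance, polynomial features for $x\in\mathds{R}$:
\[
\phi(x)=\left(1,x,x^2\right)^T,\qquad w^T\phi(x)=w_0+w_1x+w_2x^2,
\]
so a model linear in $\phi(x)$ fits a quadratic function of $x$.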
% subsection features (end)
\subsection{Kernels} % (fold)
\label{sub:kernels}
\begin{itemize}
\item $k: X\times X\rightarrow \mathds{R}$
\item positive definite / positive semi-definite
\item for each positive semi-definite $k$ $\exists\text{ RKHS }\mathcal{H}\subseteq\{f:X\rightarrow \mathds{R}\}$ with
\begin{itemize}
\item $k(x,\cdot)\in\mathcal{H}$
\item reproducing property: $\left< f,k(x,\cdot)\right>=f(x)$ for all $f\in\mathcal{H}$, in particular $\left< k(x,\cdot),k(\tilde{x},\cdot)\right>=k(x,\tilde{x})$
\item the inner product is linear in each argument, e.g. $\left< k(x,\cdot) + k(\tilde{x},\cdot),f\right>=\left< k(x,\cdot),f\right>+\left< k(\tilde{x},\cdot),f\right>$
\end{itemize}
\item e.g. kernel ridge regression (sketched after this list)
\item e.g. kernel mean embedding of a distribution: $\mu:\mathds{P}\mapsto \mu(\mathds{P}):=E_{X\sim\mathds{P}}\,k(X,\cdot)$
\begin{itemize}
\item Example:\\
$\mathds{P}(\{x_1\})=\mathds{P}(\{x_2\})=\frac{1}{2} \Rightarrow \mu(\mathds{P})=\frac{1}{2}\left(k(x_1,\cdot)+k(x_2,\cdot)\right)$
\end{itemize}
\end{itemize}
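Sketch of kernel ridge regression (standard form; the lecture's normalization of $\lambda$ may differ): with kernel matrix $K_{ij}=k(x_i,x_j)$, the minimizer over $f\in\mathcal{H}$ of
\[
\sum_{i=1}^N\left(f(x_i)-y_i\right)^2+\lambda\|f\|_{\mathcal{H}}^2
\]
can, by the representer theorem, be written as $\hat{f}(x)=\sum_{i=1}^N\alpha_i k(x_i,x)$ with $\alpha=\left(K+\lambda\mathds{1}\right)^{-1}y$.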
% subsection kernels (end)
\subsection{PCA} % (fold)
\label{sub:pca}
\begin{itemize}
\item[goal:] find principal components (directions of highest variance)
\item[idea:] eigenvalue decomposition of the covariance of the centered data, $\Sigma=\frac{1}{N}X^TX=W\Lambda W^T$, where $\Lambda$ is diagonal and $W$ is orthogonal (justification below)
\item $\tilde{X}:=XW\ \left(\operatorname{cov}(\tilde{X})=\frac{1}{N}W^TX^TXW=\Lambda\right)$
\item the columns of $\tilde{X}=XW$ are the principal components, ordered by decreasing variance $\lambda_1\geq\dots\geq\lambda_m$
\end{itemize}
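Why eigenvectors: the direction of highest variance solves
\[
\max_{\|w\|_2=1}\operatorname{var}(Xw)=\max_{\|w\|_2=1} w^T\Sigma w,
\]
which is attained at the top eigenvector of $\Sigma$ with value $\lambda_1$; each further component maximizes variance among directions orthogonal to the previous ones.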
% subsection pca (end)
\subsection{Causality} % (fold)
\label{sub:causality}
\begin{itemize}
\item structural equation models (SEMs)
\item causal graph $G$
\item observational distribution $P_{\text{obs}}$
\item counterfactuals (toy example below)
\item \dots
\end{itemize}
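A minimal toy SEM (an illustration added in this summary, not taken from the lecture):
\[
X:=N_X,\qquad Y:=f(X)+N_Y,\qquad N_X\perp N_Y,
\]
with causal graph $G: X\rightarrow Y$; $P_{\text{obs}}$ is the joint distribution of $(X,Y)$ that the SEM entails.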
% subsection causality (end)
% section summary (end)
\end{document}