diff --git a/isSummary.pdf b/isSummary.pdf
new file mode 100644
index 0000000..873ce38
--- /dev/null
+++ b/isSummary.pdf
Binary files differ
diff --git a/isSummary.tex b/isSummary.tex
new file mode 100644
index 0000000..67e7dd6
--- /dev/null
+++ b/isSummary.tex
@@ -0,0 +1,116 @@
+\documentclass[a4paper,12pt]{scrartcl}
+\usepackage[english]{babel}
+\usepackage[utf8]{inputenc}
+\usepackage{amsmath}
+\usepackage{amsfonts}
+\usepackage{dsfont}
+
+\newcommand{\iid}{\overset{\text{iid}}{\sim}}
+
+\begin{document}
+	\section{Summary} % (fold)
+	\label{sec:summary}
+	\subsection{Probability Theory \& Statistics} % (fold)
+	\label{sub:probTheo}
+	\begin{itemize}
+		\item random variables, distributions, expectations, variance, Bayes' rule, law of total probability
+		\item $X_1,\dots,X_N\iid p_\theta$
+		\item the MLE $\hat\theta$ is chosen to maximize the likelihood $\prod\limits_{i=1}^N p_\theta(x_i)$
+	\end{itemize}
+	% subsection probTheo (end)
+	\subsection{Classification} % (fold)
+	\label{sub:classification}
+	\begin{itemize}
+		\item[given:] $(X_1,Y_1),\dots,(X_N,Y_N)\iid \mathds{P}$
+		\item[goal:] find a classifier $f:\mathds{R}^m\rightarrow \{-1,1\}$
+		\item criterion: 0--1 loss $l:\{-1,1\}^2\rightarrow \{0,1\}:(y_1,y_2)\mapsto 1_{y_1\not= y_2}$
+		\item SVM: consider $f_{w,b}:\mathds{R}^m\rightarrow\{-1,1\}:x\mapsto \operatorname{sgn}(w^Tx+b)$
+		\begin{itemize}
+			\item e.g. hard-margin SVM (primal)
+			\item $O_1:\ \min\limits_{w,b}\ \frac{1}{2}||w||^2\quad \text{s.t.}\quad y_i(w^Tx_i+b)\geq 1\ \forall i$
+		\end{itemize}
+	\end{itemize}
+	% subsection classification (end)
+	\subsection{Regression} % (fold)
+	\label{sub:regression}
+	\begin{itemize}
+		\item[given:] $(X_1,Y_1),\dots,(X_N,Y_N)\iid \mathds{P}$
+		\item[goal:] find $f:\mathds{R}^m\rightarrow \mathds{R}$
+		\item e.g. linear functions $x\mapsto w^Tx$
+	\end{itemize}
+	\begin{enumerate}
+		\item e.g. squared loss $l:(y_1,y_2)\mapsto (y_1-y_2)^2$\\
+		$\leadsto$ OLS $w_{OLS}:=\arg\min\limits_w \frac{1}{N}\sum\limits_{i=1}^N\left(w^Tx_i-y_i\right)^2$
+		\item equivalently:\\
+		assume $Y=w^TX+\epsilon,\ \epsilon\sim\mathcal{N}(0,\sigma^2)$,\\
+		then find the MLE for $(w,\sigma^2)$
+		\item penalized regression
+		\begin{itemize}
+			\item e.g. ridge regression $w_{Ridge}:=\arg\min\limits_w \sum\limits_{i=1}^N\left(w^Tx_i-y_i\right)^2+\lambda ||w||_2^2$\\
+			$w_{Ridge}=\left(X^TX+\lambda \mathds{1}\right)^{-1}X^Ty$, see the derivation after this list (a $\frac{1}{N}$ factor in the objective only rescales $\lambda$)
+			\item LASSO $w_{LASSO}:=\arg\min\limits_w \sum\limits_{i=1}^N \left(w^Tx_i-y_i\right)^2+\lambda||w||_1$
+		\end{itemize}
+		\item put a prior $p(w)$ on $w$ and compute the posterior mean\\
+		$w_{Bayes}=E\left[w\mid X_1,Y_1,\dots,X_N,Y_N\right]$\\
+		a Gaussian prior corresponds to ridge regression
+	\end{enumerate}
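+	The closed form follows from setting the gradient of the objective to zero
+	(a sketch, with the $x_i^T$ stacked as the rows of $X$ and the $y_i$ stacked into $y$):
+	\begin{align*}
+		\nabla_w\left(||Xw-y||_2^2+\lambda||w||_2^2\right) &= 2X^T(Xw-y)+2\lambda w\overset{!}{=}0\\
+		\Rightarrow\ \left(X^TX+\lambda\mathds{1}\right)w_{Ridge} &= X^Ty
+	\end{align*}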
+	% subsection regression (end)
+	\subsection{Cross Validation} % (fold)
+	\label{sub:cross_validation}
+	\begin{itemize}
+		\item for choosing tuning parameters such as $\lambda$
+		\item e.g. 10-fold CV
+	\end{itemize}
+	% subsection cross_validation (end)
+	\subsection{Features} % (fold)
+	\label{sub:features}
+	\begin{itemize}
+		\item use $\phi(x_1),\dots,\phi(x_N)$ instead of $x_1,\dots,x_N$
+	\end{itemize}
+	% subsection features (end)
+	\subsection{Kernels} % (fold)
+	\label{sub:kernels}
+	\begin{itemize}
+		\item $k: X\times X\rightarrow \mathds{R}$
+		\item positive definite / positive semi-definite
+		\item for every such $k$ there exists an RKHS $\mathcal{H}\subseteq\{f:X\rightarrow \mathds{R}\}$ with
+		\begin{itemize}
+			\item $k(x,\cdot)\in\mathcal{H}$
+			\item $\left< k(x,\cdot),k(\tilde{x},\cdot)\right>=k(x,\tilde{x})$
+			\item linearity: $\left< k(x,\cdot)+k(\tilde{x},\cdot),f\right>=\left< k(x,\cdot),f\right>+\left< k(\tilde{x},\cdot),f\right>$
+		\end{itemize}
+		\item e.g. kernel ridge regression
+		\item e.g. mapping of distributions $\mu:\mathds{P}\mapsto \mu(\mathds{P}):=E_{X\sim\mathds{P}}\left[k(X,\cdot)\right]$
+		\begin{itemize}
+			\item example:\\
+			$\mathds{P}(\{x_1\})=\mathds{P}(\{x_2\})=\frac{1}{2} \Rightarrow \mu(\mathds{P})=\frac{1}{2}\left(k(x_1,\cdot)+k(x_2,\cdot)\right)$
+		\end{itemize}
+	\end{itemize}
+	% subsection kernels (end)
+	\subsection{PCA} % (fold)
+	\label{sub:pca}
+	\begin{itemize}
+		\item[goal:] find principal components (directions of highest variance)
+		\item[idea:] eigenvalue decomposition of the covariance of the centered data, $\Sigma=\frac{1}{N}X^TX=W\Lambda W^T$, where $\Lambda$ is diagonal
+		\item $\tilde{X}:=XW\ \left(cov(\tilde{X})=\frac{1}{N}W^TX^TXW=\Lambda\right)$
+		\item the columns $\tilde{X}_1,\dots,\tilde{X}_p$ of $\tilde{X}=XW$ are the principal components
+	\end{itemize}
+	% subsection pca (end)
+	\subsection{Causality} % (fold)
+	\label{sub:causality}
+	\begin{itemize}
+		\item structural equation models (SEMs)
+		\item causal graph $G$
+		\item observational distribution $P_{Obs}$
+		\item counterfactuals
+		\item \dots
+	\end{itemize}
+	% subsection causality (end)
+	% section summary (end)
+\end{document}