diff --git a/isSummary.pdf b/isSummary.pdf
new file mode 100644
index 0000000..873ce38
--- /dev/null
+++ b/isSummary.pdf
Binary files differ
diff --git a/isSummary.tex b/isSummary.tex
new file mode 100644
index 0000000..67e7dd6
--- /dev/null
+++ b/isSummary.tex
@@ -0,0 +1,116 @@
+\documentclass[a4paper,12pt]{scrartcl}
+\usepackage[english]{babel}
+\usepackage[utf8]{inputenc}
+\usepackage{amsmath}
+\usepackage{amsfonts}
+\usepackage{dsfont}
+
+\newcommand{\iid}{\overset{\text{iid}}{\sim}}
+
+\begin{document}
+	\section{Summary} % (fold)
+	\label{sec:summary}
+	\subsection{Probability Theory \& Statistics} % (fold)
+	\label{sub:probTheo}
+	\begin{itemize}
+		\item random variables, distributions, expectations, variance, Bayes' rule, law of total probability
+		\item $X_1,\dots,X_N\iid p_\theta$
+		\item the MLE $\hat\theta$ is chosen to maximize the likelihood $\prod\limits_{i=1}^N p_\theta(x_i)$
+	\end{itemize}
+	% subsection probTheo (end)
+	\subsection{Classification} % (fold)
+	\label{sub:classification}
+	\begin{itemize}
+		\item[given:] $(X_1,Y_1),\dots,(X_N,Y_N)\iid \mathds{P}$
+		\item[goal:] find a classifier $f:\mathds{R}^m\rightarrow \{-1,1\}$
+		\item criterion: 0--1 loss $l:\{-1,1\}^2\rightarrow \{0,1\}:(y_1,y_2)\mapsto 1_{y_1\not= y_2}$
+		\item SVM: consider $f_{w,b}:\mathds{R}^m\rightarrow\{-1,1\}:x\mapsto \operatorname{sgn}(w^Tx+b)$
+		\begin{itemize}
+			\item e.g. hard-margin SVM (primal)
+			\item $O_1:\ \min\limits_{w,b}\ \frac{1}{2}||w||^2\quad \text{s.t.}\quad y_i(w^Tx_i+b)\geq 1\ \forall i$
+		\end{itemize}
+	\end{itemize}
+	% subsection classification (end)
+	\subsection{Regression} % (fold)
+	\label{sub:regression}
+	\begin{itemize}
+		\item[given:] $(X_1,Y_1),\dots,(X_N,Y_N)\iid \mathds{P}$
+		\item[goal:] find $f:\mathds{R}^m\rightarrow \mathds{R}$
+		\item e.g. linear functions $x\mapsto w^Tx$
+	\end{itemize}
+	\begin{enumerate}
+		\item e.g. squared loss $l:(y_1,y_2)\mapsto (y_1-y_2)^2$\\
+		$\leadsto$ OLS $w_{OLS}:=\arg\min\limits_w \frac{1}{N}\sum\limits_{i=1}^N\left(w^Tx_i-y_i\right)^2$
+		\item equivalently:\\
+		assume $Y=w^TX+\epsilon,\ \epsilon\sim\mathcal{N}(0,\sigma^2)$,\\
+		then find the MLE for $(w,\sigma^2)$
+		\item penalized regression
+		\begin{itemize}
+			\item e.g. ridge regression $w_{Ridge}:=\arg\min\limits_w \sum\limits_{i=1}^N\left(w^Tx_i-y_i\right)^2+\lambda ||w||_2^2$\\
+			$w_{Ridge}=\left(X^TX+\lambda \mathds{1}\right)^{-1}X^Ty$, see the derivation after this list (a $\frac{1}{N}$ factor in the objective only rescales $\lambda$)
+			\item LASSO $w_{LASSO}:=\arg\min\limits_w \sum\limits_{i=1}^N \left(w^Tx_i-y_i\right)^2+\lambda||w||_1$
+		\end{itemize}
+		\item put a prior $p(w)$ on $w$ and compute the posterior mean\\
+		$w_{Bayes}=E\left[w\mid X_1,Y_1,\dots,X_N,Y_N\right]$\\
+		a Gaussian prior corresponds to ridge regression
+	\end{enumerate}
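+	The closed form follows from setting the gradient of the objective to zero
+	(a sketch, with the $x_i^T$ stacked as the rows of $X$ and the $y_i$ stacked into $y$):
+	\begin{align*}
+		\nabla_w\left(||Xw-y||_2^2+\lambda||w||_2^2\right) &= 2X^T(Xw-y)+2\lambda w\overset{!}{=}0\\
+		\Rightarrow\ \left(X^TX+\lambda\mathds{1}\right)w_{Ridge} &= X^Ty
+	\end{align*}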
+	% subsection regression (end)
+	\subsection{Cross Validation} % (fold)
+	\label{sub:cross_validation}
+	\begin{itemize}
+		\item for choosing tuning parameters such as $\lambda$
+		\item e.g. 10-fold CV
+	\end{itemize}
+	% subsection cross_validation (end)
+	\subsection{Features} % (fold)
+	\label{sub:features}
+	\begin{itemize}
+		\item use $\phi(x_1),\dots,\phi(x_N)$ instead of $x_1,\dots,x_N$
+	\end{itemize}
+	% subsection features (end)
+	\subsection{Kernels} % (fold)
+	\label{sub:kernels}
+	\begin{itemize}
+		\item $k: X\times X\rightarrow \mathds{R}$
+		\item positive definite / positive semi-definite
+		\item for every such $k$ there exists an RKHS $\mathcal{H}\subseteq\{f:X\rightarrow \mathds{R}\}$ with
+		\begin{itemize}
+			\item $k(x,\cdot)\in\mathcal{H}$
+			\item $\left< k(x,\cdot),k(\tilde{x},\cdot)\right>=k(x,\tilde{x})$
+			\item linearity: $\left< k(x,\cdot)+k(\tilde{x},\cdot),f\right>=\left< k(x,\cdot),f\right>+\left< k(\tilde{x},\cdot),f\right>$
+		\end{itemize}
+		\item e.g. kernel ridge regression
+		\item e.g. mapping of distributions $\mu:\mathds{P}\mapsto \mu(\mathds{P}):=E_{X\sim\mathds{P}}\left[k(X,\cdot)\right]$
+		\begin{itemize}
+			\item example:\\
+			$\mathds{P}(\{x_1\})=\mathds{P}(\{x_2\})=\frac{1}{2} \Rightarrow \mu(\mathds{P})=\frac{1}{2}\left(k(x_1,\cdot)+k(x_2,\cdot)\right)$
+		\end{itemize}
+	\end{itemize}
+	% subsection kernels (end)
+	\subsection{PCA} % (fold)
+	\label{sub:pca}
+	\begin{itemize}
+		\item[goal:] find principal components (directions of highest variance)
+		\item[idea:] eigenvalue decomposition of the covariance of the centered data, $\Sigma=\frac{1}{N}X^TX=W\Lambda W^T$, where $\Lambda$ is diagonal
+		\item $\tilde{X}:=XW\ \left(cov(\tilde{X})=\frac{1}{N}W^TX^TXW=\Lambda\right)$
+		\item the columns $\tilde{X}_1,\dots,\tilde{X}_p$ of $\tilde{X}=XW$ are the principal components
+	\end{itemize}
+	% subsection pca (end)
+	\subsection{Causality} % (fold)
+	\label{sub:causality}
+	\begin{itemize}
+		\item structural equation models (SEMs)
+		\item causal graph $G$
+		\item observational distribution $P_{Obs}$
+		\item counterfactuals
+		\item \dots
+	\end{itemize}
+	% subsection causality (end)
+	% section summary (end)
+\end{document}