diff --git a/.gitignore b/.gitignore
index 72d8f38..0825848 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@
*.run.xml
.Rproj.user
.Rhistory
+*.bbl
+*.blg
diff --git a/07_final_assignment/paper/main.tex b/07_final_assignment/paper/main.tex
index 58f3201..32a1cf2 100644
--- a/07_final_assignment/paper/main.tex
+++ b/07_final_assignment/paper/main.tex
@@ -10,7 +10,9 @@
\usepackage{color}
\usepackage{todonotes}
-\title{Simulation of Grainger et al. (2012) with Rescorla Wagner equations}
+
+
+\title{Simulation of \cite{Grainger245} with Rescorla-Wagner equations}
\shorttitle{Grainger et al. (2012) simulation with RW equations}
\author{Robert Geirhos (3827808), Klara Grethen (3899962), \\David-Elias Künstle (3822829), Felicia Saar (3818590),\\Julia Maier (3879869), Marlene Weller (3837283), Anne-Kathrin Mahlke (3897867)}
\affiliation{Linguistics for Cognitive Science Course, University of Tübingen}
@@ -46,15 +48,38 @@
The results show that the correctness of the responses for both words and nonwords grew above chance very quickly, while word accuracy was slightly higher overall. It also became clear that the monkeys did not recognize words merely because they appeared more often, but were also able to find patterns and thereby recognize new words as words quite quickly.\\
We decided to try to model the results from Grainger et al. (2012) using Naive Discriminative Learning (NDL), an approach to modelling learning (and also an R package) based on the Rescorla-Wagner model (Rescorla \& Wagner, 1972) and the equilibrium equations by Danks (2003).
+\subsection{Naive Discriminative Learning}
+Since Ivan Pavlov's first experiments in modern learning theory, it has been evident that learning is not merely the formation of associations between co-occurring cues and outcomes, but the discrimination of which cues predict the presence or absence of an outcome \parencite{baayen2015abstraction}.
+
+A naive discriminative learning (\emph{ndl}) model, a two-layer network implementation of the learning rules described by \cite{rescorla1972theory}, captures exactly this and has been applied successfully in the language domain (e.g. \cite{baayen2016comprehension}).
+
\section{Simulations}
\subsection{Stimuli}
For stimuli we used the words given in the supplementary material of the original paper. The list contained 307 four-letter words and 7832 non-words, each also made up of four letters. In every trial, the word or non-word was presented split into overlapping trigrams (for example, for the word atom: \#at, ato, tom, om\#), one trigram after the other, as proposed by Baayen et al. (2016).
\subsection{Experimental Code}
-%\todo{why we didn't use the given code, what we improved, how the result is structured - Goal: modular and comprehensive experiment. Problems with paper and given code. What's a block in our experiment.}
+
+The simulation code is split into three parts: the creation of the trials, the learning of the monkey, and the analysis of the learning results. It is implemented in the \emph{R programming language} \parencite{Rcore}.
+
+\subsubsection{Trial creation}
+The algorithm generally follows the structure defined in the reference paper and its supplemental materials, as described above.
+The word-nonword corpus is the one used for the monkey DAN in \cite{Grainger245}.
+
+Where information was missing, we had to make our own design decisions for some edge cases.
+Trials are always created in blocks of 100.
+To satisfy this constraint, the new-word part of a block is filled with already learned words if no new word is left in the corpus; conversely, if no word has been learned yet, the learned-word part is filled with the new word.
+The new words, learned words and nonwords are drawn randomly from their respective pools, with repetition allowed.
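+
+As an illustration, the block composition can be sketched as follows. This is a simplified sketch rather than the code shipped in \emph{baboonSimulation.R}: the function name is made up for this example, and the exact proportions shown (25 new-word, 25 learned-word and 50 nonword trials) are only illustrative.
+\begin{lstlisting}[language=R]
+# Sketch: compose one block of 100 trials (proportions illustrative only)
+make_block <- function(new_word, learned_words, nonwords, block_size = 100) {
+  n_new     <- round(0.25 * block_size)
+  n_learned <- round(0.25 * block_size)
+  n_nonword <- block_size - n_new - n_learned
+  if (is.null(new_word)) {
+    new_part <- sample(learned_words, n_new, replace = TRUE)   # no new word left
+  } else {
+    new_part <- rep(new_word, n_new)
+  }
+  if (length(learned_words) == 0) {
+    learned_part <- rep(new_word, n_learned)                   # nothing learned yet
+  } else {
+    learned_part <- sample(learned_words, n_learned, replace = TRUE)
+  }
+  nonword_part <- sample(nonwords, n_nonword, replace = TRUE)
+  sample(c(new_part, learned_part, nonword_part))              # shuffle trial order
+}
+\end{lstlisting}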
+
+\subsubsection{Monkey learning}
+After each block, the presented new word can be marked as learned according to the definition in \cite{Grainger245}. The Rescorla-Wagner learner therefore has to learn a block, return its guesses, and then continue learning with the next block.
+Since this is not easily possible with \emph{ndl} \parencite{Rndl}, we implemented a Rescorla-Wagner learner ourselves.
+
Since preliminary experiments showed that our simulated monkey performed with very high accuracies ($>$90\%), we decided to introduce a random parameter $ r $ in the experiment, defined as the fraction of times the monkey would make a random guess instead of an experience-based prediction.
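+
+The core of this learner is the Rescorla-Wagner update applied to the trigram cues of the presented string and the two outcomes \emph{word} and \emph{nonword}. The following is a minimal sketch of a single trial under these assumptions; function and variable names are ours and may differ from the actual code in \emph{baboonSimulation.R}.
+\begin{lstlisting}[language=R]
+# W is a weight matrix with one row per trigram cue and the columns
+# "word" and "nonword", initialised with zeros before the first trial.
+rw_trial <- function(W, cues, outcome, alpha, beta, lambda = 1, r = 0) {
+  act <- colSums(W[cues, , drop = FALSE])   # summed activation of both outcomes
+  if (runif(1) < r) {
+    guess <- sample(colnames(W), 1)         # random guess with probability r
+  } else {
+    guess <- names(which.max(act))          # experience-based prediction
+  }
+  teacher <- lambda * (colnames(W) == outcome)
+  # every present cue is shifted towards the prediction error (teacher - act)
+  W[cues, ] <- W[cues, ] + alpha * beta * rep(teacher - act, each = length(cues))
+  list(W = W, guess = guess)
+}
+\end{lstlisting}
+Looping this function over the trials of a block and checking the guesses for the block's new word against the learning criterion of \cite{Grainger245} then yields the ``learned'' flag used above.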
+\subsubsection{Data analysis}
+To compare the accuracies obtained with different learning rates, we used not only standard tools such as linear regression models (\emph{lm}) and \emph{anova} \parencite{Rcore}, but also non-linear generalized additive models (GAMs) provided by the package \emph{mgcv} \parencite{Rmgcv}, which we compared and visualized with \emph{itsadug} \parencite{Ritsadug}.
+
\subsection{Choice of Parameters}
\subsubsection{Number of Trials}
The six monkeys in the original experiment participated in different numbers of trials (min: 43,041; max: 61,142; mean: 52,812). For the sake of simplicity, we presented exactly 50,000 trials in each of our experiments.
@@ -67,8 +92,8 @@
It makes it possible to modulate the saliency of a stimulus. A more salient stimulus could not only have higher learning rates but also a higher maximum activation. In the original experiment the stimuli were four-letter words and nonwords presented in the same color on a uniformly colored background. We assume that the single words and nonwords are equally salient and therefore keep $\lambda$ constant at 1.
\subsection{Running Parallelized Experiments}
-Running an experiment with a single combination of $ \alpha $ and $ \beta $ on a normal desktop computer took about 75 minutes. Therefore, the parameter space one could explore within a reasonable amount of time was quite restricted. We decided to write a parallelized version of the code to reduce the overall runtime. Using the R packages foreach, parallel and doParallel %\todo{(TODO: Cite them properly)}
-, we restructured the experiment. Since conflicts can easily occur when more than one core is trying to access a shared data structure at the same time, we implemented a parallelized version that is able to run without even containing critical sections. Instead, each thread has its own data structure, a .txt file, and in the end the results are harvested and combined. This version of the experiment ran on a cluster with 15 cores, each performing a total amount of eight experiments. Altogether, 120 combinations of $ \alpha $ and $ \beta $ were explored overnight, which would have taken about 150 hours in a non-parallelized version.
+
+Running an experiment with a single combination of $ \alpha $ and $ \beta $ on a normal desktop computer took about 75 minutes. Therefore, the parameter space one could explore within a reasonable amount of time was quite restricted. We decided to write a parallelized version of the code to reduce the overall runtime.
+Using the R packages foreach \parencite{Rforeach}, parallel \parencite{Rparallel} and doParallel \parencite{RdoParallel}, we restructured the experiment. Since conflicts can easily occur when more than one core tries to access a shared data structure at the same time, we implemented a parallelized version that runs without any critical sections. Instead, each thread has its own data structure, a .txt file, and in the end the results are harvested and combined. This version of the experiment ran on a cluster with 15 cores, each performing a total of eight experiments. Altogether, 120 combinations of $ \alpha $ and $ \beta $ were explored overnight, which would have taken about 150 hours in a non-parallelized version.
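+
+A skeleton of this setup is shown below. It is only a sketch of the approach: \emph{run\_experiment()} stands in for the actual simulation and is not part of our code, and the $\alpha$ and $\beta$ grid values are placeholders; only the 15 cores and the total of 120 combinations match the setup described above.
+\begin{lstlisting}[language=R]
+library(foreach)
+library(doParallel)
+
+# placeholder grid; 12 x 10 = 120 parameter combinations
+grid <- expand.grid(alpha = seq(0.001, 0.1, length.out = 12),
+                    beta  = seq(0.001, 0.1, length.out = 10))
+
+cl <- makeCluster(15)                    # one worker per core
+registerDoParallel(cl)
+
+foreach(i = seq_len(nrow(grid))) %dopar% {
+  res <- run_experiment(alpha = grid$alpha[i], beta = grid$beta[i])
+  # each worker writes to its own file, so no critical section is needed
+  write.table(res, file = sprintf("result_%03d.txt", i), row.names = FALSE)
+}
+stopCluster(cl)
+
+# harvest: read the per-worker files back in and combine them
+files   <- list.files(pattern = "^result_.*\\.txt$")
+results <- do.call(rbind, lapply(files, read.table, header = TRUE))
+\end{lstlisting}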
\section{Results}
The number of words learned by the actual monkeys ranged between 87 and 308. With the chosen range for $\alpha$ and $\beta$, we obtained between 275 and 307 learned words; however, it is important to note that we presented only 307 words, so the model reached its maximum learning potential. The general accuracy of the real monkeys lay between 71.14\% and 79.81\%, while our accuracies ranged between 60\% and 68\%. Accuracies for word and non-word decisions are similar in both cases.
@@ -99,7 +124,7 @@
\section{Discussion}
-%\todo{"your conclusions about what is most likely to underlie the different success rates of the baboons"}
+
The results show that our model is actually too good compared to the real monkeys. Only the random parameter we introduced made it possible to obtain results similar to those of the original experiment. When trying to account for this discrepancy only by lowering the learning rates, we encountered a restriction in the form of very small floating-point numbers, which might have led to unforeseeable behaviour. Therefore, we chose to use the random parameter instead.\\
Unfortunately, some information on the exact conduct of the original experiment was missing in the paper, so we had to guess some of the details. For example, it was not made clear what a block of trials would have looked like in the first few blocks, when there were no already known words to be used in the corresponding 25\% of the block.\\
We were also slightly unhappy with the definition of a word being learned, namely that the word had been recognized with 80\% accuracy. We would expect this definition to become problematic when a word was 'almost' learned but did not quite reach the 80\%. In the next block with that word, the learning would be a lot quicker than for an actually new word. It might be a good idea to monitor and save the knowledge level for each specific word and to measure the actual number of repetitions a word needed to become known.\\
@@ -108,7 +133,13 @@
Lastly, of course, different models could be used in the experiment to see if other models fit the results of the actual monkeys better.
\newpage
+
+\printbibliography{}
+
\appendix
+
+\onecolumn
+
\section{Complete Results}
Here are the complete results of our experiments.
The abbreviations used are:
\begin{APAitemize}
@@ -120,15 +151,12 @@
\item NWAcc: Nonword accuracy
\end{APAitemize}
-\onecolumn
\input{result_tables.tex}
\lstinputlisting[language=R]{../baboonSimulation.R}
-\printbibliography{}
-
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
diff --git a/07_final_assignment/paper/references.bib b/07_final_assignment/paper/references.bib
index e69de29..de6c0ac 100644
--- a/07_final_assignment/paper/references.bib
+++ b/07_final_assignment/paper/references.bib
@@ -0,0 +1,95 @@
+@article{Grainger245,
+  author = {Grainger, Jonathan and Dufau, St{\'e}phane and Montant, Marie and Ziegler, Johannes C. and Fagot, Jo{\"e}l},
+  title = {Orthographic Processing in Baboons (Papio papio)},
+  volume = {336},
+  number = {6078},
+  pages = {245--248},
+  year = {2012},
+  doi = {10.1126/science.1218152},
+  publisher = {American Association for the Advancement of Science},
+  abstract = {An orthographic object such as a set of letters, and the ability to recognize such sets as words, is a key component of reading. The ability to develop these skills has often been attributed to the prior acquisition of a complex language. For example, we learn how letters sound and thus recognize when a particular letter makes up part of a word. However, orthographic processing is also a visual process, because we learn to recognize words as discrete objects, and the ability to read may thus be related to an ability to recognize and classify objects. Grainger et al. (p. 245; see the Perspective by Platt and Adams) tested orthographic skills in baboons. Captive, but freely ranging, baboons were trained to distinguish real English words from combinations of similar letters that are not words, and they were able to distinguish real words with remarkable accuracy. Thus, a basic ability to recognize words as objects does not require complex linguistic understanding. Skilled readers use information about which letters are where in a word (orthographic information) in order to access the sounds and meanings of printed words. We asked whether efficient processing of orthographic information could be achieved in the absence of prior language knowledge. To do so, we trained baboons to discriminate English words from nonsense combinations of letters that resembled real words. The results revealed that the baboons were using orthographic information in order to efficiently discriminate words from letter strings that were not words. Our results demonstrate that basic orthographic processing skills can be acquired in the absence of preexisting linguistic representations.},
+  issn = {0036-8075},
+  url = {http://science.sciencemag.org/content/336/6078/245},
+  eprint = {http://science.sciencemag.org/content/336/6078/245.full.pdf},
+  journal = {Science}
+}
+
+@Manual{Rparallel,
+  title = {R: A Language and Environment for Statistical Computing},
+  author = {{R Core Team}},
+  organization = {R Foundation for Statistical Computing},
+  address = {Vienna, Austria},
+  year = {2015},
+  url = {https://www.R-project.org/},
+}
+
+@Manual{Rforeach,
+  title = {foreach: Provides Foreach Looping Construct for R},
+  author = {Revolution Analytics and Steve Weston},
+  year = {2015},
+  note = {R package version 1.4.3},
+  url = {https://CRAN.R-project.org/package=foreach},
+}
+
+@Manual{RdoParallel,
+  title = {doParallel: Foreach Parallel Adaptor for the 'parallel' Package},
+  author = {Revolution Analytics and Steve Weston},
+  year = {2015},
+  note = {R package version 1.0.10},
+  url = {https://CRAN.R-project.org/package=doParallel},
+}
+
+@Manual{Rndl,
+  title = {ndl: Naive Discriminative Learning},
+  author = {{Antti Arppe} and {Peter Hendrix} and {Petar Milin} and {R. Harald Baayen} and {Cyrus Shaoul}},
+  year = {2014},
+  note = {R package version 0.2.16},
+  url = {https://CRAN.R-project.org/package=ndl},
+}
+
+@Manual{Rcore,
+  title = {R: A Language and Environment for Statistical Computing},
+  author = {{R Core Team}},
+  organization = {R Foundation for Statistical Computing},
+  address = {Vienna, Austria},
+  year = {2016},
+  url = {https://www.R-project.org/},
+}
+
+@Book{Rmgcv,
+  title = {Generalized Additive Models: An Introduction with R},
+  author = {Simon N. Wood},
+  year = {2006},
+  publisher = {Chapman and Hall/CRC},
+}
+
+@Misc{Ritsadug,
+  title = {{itsadug}: Interpreting Time Series and Autocorrelated Data Using GAMMs},
+  author = {Jacolien {van Rij} and Martijn Wieling and R. Harald Baayen and Hedderik {van Rijn}},
+  year = {2016},
+  note = {R package version 2.0},
+}
+
+@incollection{baayen2015abstraction,
+  title = {Abstraction, storage and naive discriminative learning},
+  author = {Baayen, R. Harald and Ramscar, Michael},
+  booktitle = {Handbook of Cognitive Linguistics},
+  pages = {99--120},
+  year = {2015},
+  publisher = {De Gruyter Mouton}
+}
+
+@incollection{rescorla1972theory,
+  title = {A theory of Pavlovian conditioning: Variations in the effectiveness of reinforcement and nonreinforcement},
+  author = {Rescorla, Robert A. and Wagner, Allan R.},
+  booktitle = {Classical Conditioning II: Current Research and Theory},
+  editor = {Black, Abraham H. and Prokasy, William F.},
+  pages = {64--99},
+  year = {1972},
+  publisher = {Appleton-Century-Crofts},
+  address = {New York}
+}
+
+@article{baayen2016comprehension,
+  title = {Comprehension without segmentation: A proof of concept with naive discriminative learning},
+  author = {Baayen, R. Harald and Shaoul, Cyrus and Willits, Jon and Ramscar, Michael},
+  journal = {Language, Cognition and Neuroscience},
+  volume = {31},
+  number = {1},
+  pages = {106--128},
+  year = {2016},
+  publisher = {Taylor \& Francis}
+}
\ No newline at end of file