Newer
Older
linguistic_assignments / 07_final_assignment / baboonSimulation.R
library(testthat)
library(ndl)
library(compiler)
library(fields)
library(parallel)
library(doParallel)
library(foreach)
library(itsadug)
library(mgcv)
library(xtable)
enableJIT(3)


########################################################################################
####                           DATA OF ORIGINAL EXPERIMENT
########################################################################################

monkeydat <- data.frame(Monkey=c("DAN", "ART", "CAU", "DOR", "VIO", "ARI"),
                        NumTrials=c(56689, 50985, 61142, 49608, 43041, 55407),
                        NumWordsLearned=c(308, 125, 112, 121, 81, 87),
                        NumNonwordsPresented=c(7832, 7832, 7832, 7832, 7832, 7832),
                        GeneralAccuracy=c(79.81, 73.41, 72.43, 73.15, 71.55, 71.14),
                        WordAccuracy=c(80.01, 74.83, 73.15, 79.26, 76.75, 75.38),
                        NonwordAccuracy=c(79.61, 72.00, 71.72, 67.06, 66.33, 66.90),
                        stringsAsFactors=F)


########################################################################################
####                           DEFINING THE EXPERIMENTAL FUNCTIONS
########################################################################################

Resample <- function(x){
  x[sample.int(length(x))]
}

test_that("Resample", {
  x <- 1:10
  expect_that(length(Resample(x[x >  8])), equals(2))
  expect_that(length(Resample(x[x >  9])), equals(1))
  expect_that(length(Resample(x[x >  10])), equals(0))
})

RandomPick <- function(data, n) {
  if ( is.data.frame(data) ){
    data[sample(nrow(data), n, replace=TRUE),]
  } else {
    data[sample(length(data), n, replace=TRUE)]
  }
}

test_that("RandomPick", {
  n <- 10
  
  bigger <- 1:(3 * n)
  equal <- 1:n
  smaller <- c(1)
  empty <- c()
  df <- data.frame(a=c(1,2,3), b=c(4,5,6))
  
  expect_equal(length(RandomPick(bigger, n)), n)
  expect_equal(length(RandomPick(equal, n)), n)
  expect_equal(length(RandomPick(smaller, n)), n)
  expect_equal(length(RandomPick(empty, n)), 0)
  expect_equal(nrow(RandomPick(df, n)), n)
})


CreateBlock <- function(data){
  result <- data.frame(Stimulus=character(BLOCK_SIZE),
                       StimulusType=factor(character(BLOCK_SIZE),
                                           c("LearnedWord","NewWord","Nonword")),
                       stringsAsFactors=F)
  data <- data[,c("Stimulus","StimulusType")]
  stopifnot(is.character(data$Stimulus))
  
  newSample <- RandomPick(data[data$StimulusType == "NewWord",], 1)
  learnedSample <- RandomPick(data[data$StimulusType == "LearnedWord",], 25)
  if( nrow(newSample) > 0 ) {
    result[1:25,] <- newSample
  } else {
    result[1:25,] <- learnedSample
  }
  if( nrow(learnedSample) > 0 ){
    result[26:50,] <- learnedSample
  } else {
    result[26:50,] <- newSample
  }
  
  result[51:100,] <- RandomPick(data[data$StimulusType == "Nonword",], 50)
  
  result <- result[sample(nrow(result)),]
  rownames(result) <- 1:nrow(result)
  result
}

BLOCK_SIZE <- 100
test_that("CreateBlock", {
  stim <- as.character(1:300)
  data <- data.frame(Stimulus=stim, stringsAsFactors=F)
  data$StimulusType[1:100] <- "NewWord"
  data$StimulusType[101:200] <- "LearnedWord"
  data$StimulusType[201:300] <- "Nonword"
  
  block <- CreateBlock(data)
  trialNewWords <- block$Stimulus[ block$StimulusType == "NewWord" ]
  trialLearnedWords <- block$Stimulus[ block$StimulusType == "LearnedWord" ]
  trialNonwords <- block$Stimulus[ block$StimulusType == "Nonword" ]
  
  expect_that(nrow(block), equals(BLOCK_SIZE))
  expect_that(length(trialNewWords), equals(25))
  expect_that(length(unique(trialNewWords)), equals(1))
  expect_that(length(trialLearnedWords), equals(25))
  expect_that(length(trialNonwords), equals(50))
  # TODO: Decide what should be expected for empty vectors?
})


PresentTrials <- function(trialCount, present, data){
  
  trials = data.frame()
  
  for ( curTrialNum in 1:trialCount ) {
    curBlock <- ( (curTrialNum - 1) %/% BLOCK_SIZE) + 1
    curTrialNumInBlock <- ( (curTrialNum - 1) %% BLOCK_SIZE ) + 1
    isNewBlock <- curTrialNumInBlock == 1
    
    if ( isNewBlock ) {
      if ( curBlock > 1 ){
        newWordTrials <- block[block$StimulusType == "NewWord", ]
        if( length(unique(newWordTrials$Stimulus)) == 1 ){
          newWordTrials <- newWordTrials[1, ]
          wordResponses <- newWordTrials[newWordTrials$Response == "Word", ]
          wordResponseRate <- nrow(wordResponses) / nrow(newWordTrials)
          if ( wordResponseRate >= 0.8  ){
            data$StimulusType[data$Stimulus == newWordTrials$Stimulus] <- "LearnedWord"
          }
        }
      }
      block <- CreateBlock(data)
      block$Block <- curBlock
      block$TrialInBlock <- 1:nrow(block)
    }
    
    curTrial <- block[curTrialNumInBlock,]
    curStim <- curTrial$Stimulus
    curStimIsWord <- present(curStim, ifelse(curTrial$StimulusType == "Nonword", "Nonword", "Word"))
    
    block$Trial[curTrialNumInBlock] <- curTrialNum
    block$Response[curTrialNumInBlock] <- curStimIsWord
    
    trials <- rbind(trials, block[curTrialNumInBlock, ])
  }
  
  rownames(trials) <- 1:trialCount
  
  result <- list(Trials=trials, Stimuli=data)
  
  return(result)
}

test_that("PresentTrials", {
  stim <- as.character(1:300)
  data <- data.frame(Stimulus=stim, stringsAsFactors=F)
  data$StimulusType[1:100] <- "NewWord"
  data$StimulusType[101:200] <- "LearnedWord"
  data$StimulusType[201:300] <- "Nonword"
  count <- BLOCK_SIZE * 2.5
  
  result <- PresentTrials(count,
                          function(cue, outcome){
                            return(outcome)
                          },
                          data)
  trials <- result$Trials
  stimuli <- result$Stimuli
  expect_equal(nrow(trials), count)
  expect_equal(length(stimuli$StimulusType[stimuli$StimulusType == "LearnedWord"]), 102)
})

# The original colSums fails if matrix filters to 1 row and gets implicit casted to a vector
ColSums <- function(x) { 
  if(is.matrix(x)){
    return(colSums(x))
  } else {
    return(x)
  }
}

test_that("ColSums",{
  m2x2 <- matrix(1,2,2)
  colnames(m2x2) <- c("a","b")
  
  expect_equal(ColSums(m2x2), c(a=2,b=2))
  expect_equal(ColSums(m2x2[1,]), c(a=1,b=1))
})

MakeMonkey <- function(cueset, outcomeset=c("Word", "Nonword"),
                       alpha=sqrt(0.001), beta=sqrt(0.001), lambda=1.0, randomRatio=0.2){
  weights <- matrix(0, length(cueset), length(outcomeset))
  rownames(weights) <- cueset
  colnames(weights) <- outcomeset
  
  rate <- function(cue){
    cues <- unlist(strsplit(cue, "_"))
    cueWeights <- weights[cues, outcomeset]
    totalActivation <- ColSums(cueWeights)
    
    maxActivated <- names(totalActivation[totalActivation == max(totalActivation)])
    return(RandomPick(maxActivated,1))
  }
  randomLearner <- function(stim,resp){
      c("Word","Nonword")[sample(1:2,1)]
  }
  
  realLearner <- function(stim, resp){
    cues <- unlist(strsplit(stim, "_"))
    
    cueWeights <- weights[cues, outcomeset]
    totalActivation <- ColSums(cueWeights)
    
    for (j in 1:length(cues)) {
      if (resp == "Nonword") {
        yesType <- "Nonword"
        noType <- "Word"
      } else if (resp == "Word") {
        yesType <- "Word"
        noType <- "Nonword"
      } else {
        stop("Unknown outcome", resp)
      }
      weights[cues[j],yesType] <<-  weights[cues[j],yesType] +
        alpha * beta * (lambda - totalActivation[yesType])
      weights[cues[j],noType] <<-  weights[cues[j],noType] +
        alpha * beta * (0 - totalActivation[noType])
    }
    monkeyGuess <- rate(stim)
    return(monkeyGuess)
  }

  learner <- function(stim, resp) {
      if( runif(1) < randomRatio ){
          randomLearner(stim,resp)
      } else {
          realLearner(stim,resp)
      }
  }
  
  give_weights <- function(){
    return(weights)
  }
  
  return(list(
    GetWeights=give_weights,
    Learner=learner,
    Rate=rate))
}


test_that("MakeMonkey",{
  cueset <- c("a","b","c")
  
  monkey <- MakeMonkey(cueset)
  for (i in 1:1000){
    monkey$Learner("a_b","Word")
    monkey$Learner("b_c","Nonword")
  }
  
  print(monkey$GetWeights())
  expect_equal(monkey$Rate("a"), "Word")
  expect_equal(monkey$Rate("c"), "Nonword")
})

prepareDat <- function(){
  dat <- read.table("dataDan.txt", header=T, stringsAsFactors=F)
  dat$Cues <- orthoCoding(dat$String, grams=2)
  dat$Frequency <- 1
  dat$Stimulus <- dat$Cues
  dat$StimulusType <- factor(ifelse(dat$Type == "word", "NewWord", "Nonword"), c("Nonword","NewWord","LearnedWord"))
  dat$Outcomes <- factor(ifelse(dat$Type == "word", "Word", "Nonword"), c("Word","Nonword"))
  dat <- dat[sample(1:nrow(dat)),]
  cues <- unique(unlist(strsplit(dat$Cues, "_")))
  outcomes <- levels(dat$Outcomes)
  list(cues=cues,outcomes=outcomes,dat=dat)
}

runSim <- function(data, trialCount, alpha, beta, lambda=1, randomRatio) {
  monkey <- MakeMonkey(data$cues, data$outcomes, alpha, beta, lambda=lambda, randomRatio=randomRatio)
  trialNum <- round(trialCount)
  pres <- PresentTrials(
    trialNum,
    monkey$Learner, data$dat)
  return(pres)
}

analyseSim <- function(pres){
  learnedWordNum <- nrow(pres$Stimuli[pres$Stimuli$StimulusType == "LearnedWord",])
  trialNum <- nrow(pres$Trials)
  nonwordTrials <- pres$Trials[pres$Trials$StimulusType == "Nonword",]
  wordTrials <- pres$Trials[pres$Trials$StimulusType != "Nonword",]
  presNonwordNum <- length(unique(nonwordTrials$Stimulus))
  presWordNum <- length(unique(wordTrials$Stimulus))
  nonwordAcc <- nrow(nonwordTrials[nonwordTrials$Response == "Nonword",]) / nrow(nonwordTrials)
  wordAcc <- nrow(nonwordTrials[wordTrials$Response == "Word",]) / nrow(wordTrials)
  genAcc <- (nrow(nonwordTrials) * nonwordAcc + nrow(wordTrials) * wordAcc) / trialNum
  
  c("Sim",
    trialNum,
    learnedWordNum,
    presNonwordNum,
    genAcc,
    wordAcc,
    nonwordAcc)
}

simulateAndAnalyse <- function(data, trialCount, alpha=sqrt(0.001), beta=sqrt(0.001), randomRatio=0.2){
  p <- runSim(data, trialCount, alpha, beta, lambda=1, randomRatio)
  analyseSim(p)
}


########################################################################################
####                            IGNORE THIS BLOCK
########################################################################################
if (FALSE) {
  data <- prepareDat()
  
  testfunc = function() {
    n <- 10
    trialCount <- 50000
    for( a in 0:n) {
      k <- 1.0 / n * a
      print(k)
      #s <- simulateAndAnalyse(data, trialCount, alpha=1, beta=k)
      s <- simulateAndAnalyse(data, trialCount, alpha=0.000001, beta=0.00001)
      s[1] <- as.character(k)
      monkeydat <- rbind(monkeydat,
                         s)
    }
    return(monkeydat)
  }
  
  
}

########################################################################################
####                            RUNNING THE PARALLELIZED EXPERIMENTS
########################################################################################

# code for parallelizing the experiment.
# Goal: Being able to run without a shared data structure
# and thus avoiding conflicts with critical sections.
# Solution: Reading and printing from and to different files
# for each thread (divide-and-conquer). Results need to be combined
# in the end.

print.iterations <- FALSE #Note: Not meaningful for parallel execution,
#see http://blog.revolutionanalytics.com/2015/02/monitoring-progress-of-a-foreach-parallel-job.html
#for a discussion

num.cores <- 15       #Can NOT be changed without further code adjustment
monkeys.per.core <- 8 #Can NOT be changed without further code adjustment


start.of.a.and.b <- 0.001 #Can be changed with no further code adjustment
end.of.a.and.b <- 0.300   #Can be changed with no further code adjustment

debug <- FALSE


#8 experiments for each core, 15 cores
# -> 120 Experiments possible.
#This is exactly the number of experiments
#necessary to run 15 x 15 conditions in
#a symmetrical setting (= little Gauss of 15)


a.and.b.combinations <- matrix(nrow=2, ncol=0)
aseq <- seq(from=start.of.a.and.b, to=end.of.a.and.b, length.out=15)
step <- aseq[2] - aseq[1]
for(a in aseq) {
  for(b in seq(from=start.of.a.and.b, to=a, by=step)) {
    a.and.b.combinations <- cbind(a.and.b.combinations, c(a, b))
  }
}

get.a <- function(i, index) {
  return((a.and.b.combinations[1, ((i*monkeys.per.core)-(monkeys.per.core-1)) : (i*monkeys.per.core)])[index])
}

get.b <- function(i, index) {
  return((a.and.b.combinations[2, ((i*monkeys.per.core)-(monkeys.per.core-1)) : (i*monkeys.per.core)])[index])
}

do.experiment <- function(i) {
  resultdir <- "results"
  dir.create(resultdir, showWarnings = FALSE) # dont show warning if dir exists
  filename <- paste(resultdir, "/", "resultdat", i, ".txt", sep="")
  
  if(debug) {
    resultdat <- data.frame(alpha=numeric(1), beta=numeric(1),
                            Time=character(1),
                            NumWordsLearned=numeric(1),
                            NumNonwordsPresented=numeric(1),
                            GeneralAccuracy=numeric(1),
                            WordAccuracy=numeric(1),
                            NonwordAccuracy=numeric(1))
  } else {
    resultdat <- data.frame(alpha=numeric(1), beta=numeric(1),
                            NumTrials=numeric(1),
                            NumWordsLearned=numeric(1),
                            NumNonwordsPresented=numeric(1),
                            GeneralAccuracy=numeric(1),
                            WordAccuracy=numeric(1),
                            NonwordAccuracy=numeric(1))
  }
  write.table(resultdat, file=filename, col.names=names(resultdat))
  
  trialCount <- 50000 
  r <- 0.65
  for(index in 1:monkeys.per.core) {
    
    a = get.a(i, index)
    b = get.b(i, index)
    resultdat = read.table(filename)
    
    if(print.iterations) {
      print(paste("--running-- i: ", i, " a: ", a, " b: ", b, " r: ", r, sep="" ))
    }
    
    if(debug) {
      Sys.sleep(1)
      current.time = Sys.time()
      debug.frame = data.frame(alpha=a, beta=b,
                               Time=as.character(current.time),
                               NumWordsLearned=numeric(1),
                               NumNonwordsPresented=numeric(1),
                               GeneralAccuracy=numeric(1),
                               WordAccuracy=numeric(1),
                               NonwordAccuracy=numeric(1))
      
      resultdat <- rbind(resultdat, debug.frame)
      
    } else {
      
      s <- simulateAndAnalyse(data, trialCount, alpha=a, beta=b, randomRatio=r )
      s[1] <- as.character(trialCount)
      resultdat <- rbind(resultdat,
                         c(a, b, s[2:length(s)]))
      
    }
    
    #write preliminary result to file -> in case of interruption nothing gets lost
    write.table(resultdat, file=filename, col.names=names(resultdat))
  }
  
  resultdat <- resultdat[-1, ] #remove 1st row which is just always 0 
  write.table(resultdat, file=filename, col.names=names(resultdat))
  return(resultdat)
}


#...RUN, MONKEY, RUN!
cl <- makeCluster(num.cores)
registerDoParallel(cl)
foreach(i=1:num.cores) %dopar% do.experiment(i)


########################################################################################
####                            HARVESTING DATA & VISUALIZATION I
########################################################################################

harvest.data <- function(write.to.file=FALSE, folder="results_backup") {
  data = data.frame()
  for(i in 1:num.cores) {
    d <- read.table(paste(folder, "resultdat", i, ".txt", sep=""))
    data <- rbind(data, d)
  }
  if(write.to.file) {
    write.table(data, file="preliminary_results.txt", col.names=names(data))
  }
  return(data)
}


get.index <- function(sequence, value) {
  res <- logical(length(sequence))
  for(i in 1:length(sequence)) {
    res[i] <- isTRUE(all.equal(sequence[i], value))
  }
  return(which(res))
}


get.matrix <- function(data, value, sequence) {
  if(!any(names(data)==value)) {
    stop("'value' needs to be part of the data.frame")
  }
  v.index = which(names(data)==value)
  
  sequence <- round(sequence, 3)
  
  z <- matrix(nrow=length(sequence), ncol=length(sequence))
  
  for(i in 1:nrow(data)) {
    
    alpha <- round(data[i, ]$alpha, 3)
    beta  <- round(data[i, ]$beta, 3)

    a.index <- get.index(sequence, alpha)
    b.index <- get.index(sequence, beta)
    if(isTRUE(all.equal(length(a.index),  0)) ||
         isTRUE(all.equal(length(b.index), 0))){
      print("problem: ")
      print(alpha)
      print(beta)
      stop("sequence must contain alpha and beta values")
    }
    z[a.index, b.index] <- data[i, v.index]
    z[b.index, a.index] <- data[i, v.index]
  }
  return(z)
}

data = harvest.data(TRUE)


mat1 = matrix(rexp(15*15, rate=.1), ncol=15, nrow=15)

conditions <- c("GeneralAccuracy", "WordAccuracy",
                "NonwordAccuracy", "NumWordsLearned")

for(c in conditions) {
  image.plot(x=sort(aseq),
             y=sort(aseq),
             z=get.matrix(data, c, aseq),
             #z = mat1, #alternative while results are not here yet
             main=c,
             xlab="alpha",
             ylab="beta",
             useRaster=TRUE,
             col = topo.colors(100))
}


########################################################################################
####                            MODEL FITTING & VISUALIZATION II
########################################################################################

### GeneralAccuracy
m1.g = gam(GeneralAccuracy ~ s(alpha) + s(beta), data=data)
summary(m1)

m2.g = gam(GeneralAccuracy ~ s(alpha) + s(beta) + ti(alpha, beta), data=data)
summary(m2.g)

compareML(m1.g, m2.g)
#m1 preferred

vis.gam(m1.g, plot.type="contour", color="topo", main="GeneralAccuracy")




### WordAccuracy
m1.w = gam(WordAccuracy ~ s(alpha) + s(beta), data=data)
summary(m1.w)

m2.w = gam(WordAccuracy ~ s(alpha) + s(beta) + ti(alpha, beta), data=data)
summary(m2.w)

compareML(m1.w, m2.w)
#m1 preferred

vis.gam(m1.w, plot.type="contour", color="topo", main="WordAccuracy")



### NonwordAccuracy
m1.n = gam(NonwordAccuracy ~ s(alpha) + s(beta), data=data)
summary(m1.n)

m2.n = gam(NonwordAccuracy ~ s(alpha) + s(beta) + ti(alpha, beta), data=data)
summary(m2.n)

compareML(m1.n, m2.n)
#m1 preferred

vis.gam(m1.n, plot.type="contour", color="topo", main="NonwordAccuracy")


### NumWordsLearned
m1.l = gam(NumWordsLearned ~ s(alpha) + s(beta), data=data)
summary(m1.l)

m2.l = gam(NumWordsLearned ~ s(beta), data=data)
summary(m2.l)

compareML(m1.l, m2.l)
#m2 preferred

m3.l = gam(NumWordsLearned ~ te(alpha, beta), data=data)
summary(m3.l)

compareML(m2.l, m3.l)
#m2 preferred

vis.gam(m1.l, plot.type="contour", color="topo", main="NumWordsLearned")
#note: this plot shows a model which is not justified because m2 is preferred.
#it still is interesting, though.

#explains the rather strange gam fit
hist(data$NumWordsLearned, main="Histogram of NumWordsLearned",
     xlab="NumWordsLearned")


### model criticism
qqnorm(m1.g$residuals)
qqnorm(m1.w$residuals)
qqnorm(m1.n$residuals)
# mostly ok

acf(resid(m1.g))
acf(resid(m1.w))
acf(resid(m1.n))
# ok


########################################################################################
####                            OTHER ANALYSIS SNIPPETS
########################################################################################



#0. preprocessing
monkeydat$GeneralAccuracy = as.numeric(monkeydat$GeneralAccuracy)
monkeydat$WordAccuracy = as.numeric(monkeydat$WordAccuracy)
monkeydat$NonwordAccuracy = as.numeric(monkeydat$NonwordAccuracy)
monkeydat$NumTrials = as.numeric(monkeydat$NumTrials)
monkeydat$NumWordsLearned = as.numeric(monkeydat$NumWordsLearned)
monkeydat$NumNonwordsPresented = as.numeric(monkeydat$NumNonwordsPresented)


#1. chance level
chance.level <- r / 2
chance.level

max.accuracy <- 1-chance.level
max.accuracy

max(data$GeneralAccuracy)
max(data$WordAccuracy)
max(data$NonwordAccuracy)


#2. correlations
#interesting to look at (& to write in the paper):
cor(data$WordAccuracy, data$NonwordAccuracy)
cor(monkeydat$WordAccuracy, monkeydat$NonwordAccuracy)
#why is this correlation so different???

cor(data$GeneralAccuracy, data$NonwordAccuracy)
cor(monkeydat$GeneralAccuracy, monkeydat$NonwordAccuracy)

cor(data$GeneralAccuracy, data$WordAccuracy)
cor(monkeydat$GeneralAccuracy, monkeydat$NonwordAccuracy)


#3. linear model
#fitting a linear model:
m1 = lm(GeneralAccuracy ~ alpha, data=data)
summary(m1)

m2 = lm(GeneralAccuracy ~ alpha + beta, data=data)
summary(m2)

anova(m1, m2)

m3 = lm(GeneralAccuracy ~ alpha * beta, data=data)
summary(m3)

anova(m2, m3)


########################################################################################
####                            EXPORT RESULTS TO LaTeX
########################################################################################


data.rounded = data
data.rounded$alpha = round(data.rounded$alpha, 3)
data.rounded$beta = round(data.rounded$beta, 3)
data.rounded$GeneralAccuracy = round(data.rounded$GeneralAccuracy, 2)
data.rounded$WordAccuracy = round(data.rounded$WordAccuracy, 2)
data.rounded$NonwordAccuracy = round(data.rounded$NonwordAccuracy, 2)
rownames(data.rounded) = NULL
colnames(data.rounded)[3] = "#Trials"
colnames(data.rounded)[4] = "#LearnedW"
colnames(data.rounded)[5] = "#PresW"
colnames(data.rounded)[6] = "GenAcc"
colnames(data.rounded)[7] = "WAcc"
colnames(data.rounded)[8] = "NWAcc"
View(data.rounded)

xtable(data.rounded[1:40, ])
xtable(data.rounded[41:80, ])
xtable(data.rounded[81:120, ])

# EOF