# This is a sampler of the R code used for the Mental Lexicon talk
# "Form and meaning in the reading of Chinese compounds"
# <http://www.ccunix.ccu.edu.tw/~lngproc/MLD-ML5-HO.pdf>. 
# The analyses rely on tools in the R packages "lme4"
# (which permits subjects and items to be treated as random simultaneously)
# and "nlme" (which has some tools for plotting, but only for by-subject analyses).
# Thanks to Harald Baayen and Lee Wurm for tips, but neither is responsible
# for any misunderstandings -- so caveat emptor!

# I first demo how the analyses were done (as summarized in slides 16ff)
# (note that the slides show mean RTs, though the stats were actually based on
# ALL raw data, and using log(RT) rather than RT).
# Then I show how I made the partial effect plots in slide 17.

# The data file consists of the following variables:

# Subject: ID numbers for subjects (permits grouping as random variable)
# Item: ID numbers for items (permits grouping as random variable)
# LR: direction (1 = LR, -1 = RL)
# Alone: whether direction differed across groups or within (1 = separate experiments, -1 = mixed)
# C1S: 1st char transparency (1 = T, -1 = O)
# C2S: ditto for 2nd char
# C1F: log freq of 1st char
# C2F: ditto for 2nd char
# WF: log word freq
# RC: whether compound is right-headed (1 = yes, -1 = no)

# Load the lme4 package (it must first be installed from CRAN)
library(lme4)

# Name the data (this is for horizontal-running text only)
hdat = read.table("HorizontalData.txt",T)

# Treat the headers in the data file as variable names
attach(hdat)

# Test if RT is normally distributed
qqnorm(RT) # It's not normal
qqnorm(log(RT)) # Now it's better

# Simplify notation
logRT = log(RT)

# The simplest model (by subject only)
hdat.lme01 = lmer(logRT~LR*Alone*C1S*C2S+C1F*C2F+WF+(1|Subject), data=hdat)
hdat.lme01 

# The simplest model (by subject and by item)
hdat.lme02 = lmer(logRT~LR*Alone*C1S*C2S+C1F*C2F+WF+(1|Subject)+(1|Item), data=hdat)
hdat.lme02

# The above two models are nested, so can be compared to see if items should indeed
# be treated as random
anova(hdat.lme01,hdat.lme02)

# Add RC as additive effect
hdat.lmeRC01 = lmer(logRT~LR*Alone*C1S*C2S+RC+C1F*C2F+WF+(1|Subject), data=hdat)
hdat.lmeRC01 

hdat.lmeRC02 = lmer(logRT~LR*Alone*C1S*C2S+RC+C1F*C2F+WF+(1|Subject)+(1|Item), data=hdat)
hdat.lmeRC02

anova(hdat.lmeRC01,hdat.lmeRC02)

# Include interactions with RC
hdat.lmeRCint01 = lmer(logRT~LR*Alone*C1S*C2S*RC+C1F*C2F+WF+(1|Subject), data=hdat)
hdat.lmeRCint01 

## BEST MODEL: By-subj-&-item, including "+/-right-headed compounds" as factor
## and its interactions with direction and whether the direction was within or between subject groups

hdat.lmeRCint02 = lmer(logRT~LR*Alone*C1S*C2S*RC+C1F*C2F+WF+(1|Subject)+(1|Item), data=hdat)
hdat.lmeRCint02

anova(hdat.lmeRCint01,hdat.lmeRCint02)

# Without including C1F*C2F, the effect of C2F is counterintuitive: negative!
hdat.lmeRCintnoCFint02 = lmer(logRT~LR*Alone*C1S*C2S*RC+C1F+C2F+WF+(1|Subject)+(1|Item), data=hdat)
hdat.lmeRCintnoCFint02

anova(hdat.lmeRCintnoCFint02,hdat.lmeRCint02)

# Comparing the relevance of RC and its interactions
anova(hdat.lmeRCint02,hdat.lmeRC02)
anova(hdat.lmeRC02,hdat.lme02)
anova(hdat.lmeRCint02,hdat.lme02)

# Now make partial effect plots, which requires a different package
library(nlme)

# As a reminder, here's the model (by-subj only, using nlme syntax):
# hdat.lme = lmer(logRT~LR*Alone*C1S*C2S*RC+C1F*C2F+WF, random=~1|Subject, data=hdat)

# But this time we must compute interactions "by hand" so model can plot properly
# (not sure why this is, but it just is)

LRAlone = LR*Alone
LRC1S = LR*C1S
AloneC1S = Alone*C1S
LRC2S = LR*C2S
AloneC2S = Alone*C2S
C1SC2S = C1S*C2S
LRRC = LR*RC
AloneRC = Alone*RC
C1SRC = C1S*RC
C2SRC = C2S*RC
C1FC2F = C1F*C2F
LRAloneC1S = LR*Alone*C1S
LRAloneC2S = LR*Alone*C2S
LRC1SC2S = LR*C1S*C2S
AloneC1SC2S = Alone*C1S*C2S
LRAloneRC = LR*Alone*RC
LRC1SRC = LR*C1S*RC
AloneC1SRC = Alone*C1S*RC
LRC2SRC = LR*C2S*RC
AloneC2SRC = Alone*C2S*RC
C1SC2SRC = C1S*C2S*RC
LRAloneC1SC2S = LR*Alone*C1S*C2S
LRAloneC1SRC = LR*Alone*C1S*RC
LRAloneC2SRC = LR*Alone*C2S*RC
LRC1SC2SRC = LR*C1S*C2S*RC
AloneC1SC2SRC = Alone*C1S*C2S*RC
LRAloneC1SC2SRC = LR*Alone*C1S*C2S*RC

# Now redefine model using those hand-made interaction factors:

hdat.lme = lme(logRT ~ LR + Alone + C1S + C2S + RC + C1F + C2F + WF +
LRAlone + LRC1S + AloneC1S + LRC2S + AloneC2S + C1SC2S + 
LRRC + AloneRC + C1SRC + C2SRC + C1FC2F + LRAloneC1S + LRAloneC2S + LRC1SC2S + 
AloneC1SC2S + LRAloneRC + LRC1SRC + AloneC1SRC + LRC2SRC + AloneC2SRC + 
C1SC2SRC + LRAloneC1SC2S + LRAloneC1SRC + LRAloneC2SRC + LRC1SC2SRC + 
AloneC1SC2SRC + LRAloneC1SC2SRC, random=~1|Subject, data=hdat)

# Unlike the lmer function (in lme4 package), the lme function (in nlme package)
# lets you extract the coefficients table, so here it is:

round(summary(hdat.lme)$tTable,4)

# Now define the new dataframe that keeps everything constant except
# the independent variable we're looking at

# First define the number of observations
# (This value is arbitrary; with linear models, it'll be straight anyway)

nobs = 8

# Define fake data frame with fixed values:

predictdata=data.frame(
Subject = rep("99999", nobs),
LR = rep(1, nobs),
TD = rep(1, nobs),
Alone = rep(1, nobs),
RC = rep(1, nobs),
CC = rep(1, nobs),
EC = rep(1, nobs),
C1S = rep(1, nobs),
C2S = rep(1, nobs),
C1F = rep(median(hdat$C1F), nobs),
C2F = rep(median(hdat$C2F), nobs),
WF = rep(median(hdat$WF), nobs),
LRAlone = rep(1, nobs),
LRC1S = rep(1, nobs),
AloneC1S = rep(1, nobs),
LRC2S = rep(1, nobs),
AloneC2S = rep(1, nobs),
C1SC2S = rep(1, nobs),
LRRC = rep(1, nobs),
AloneRC = rep(1, nobs),
C1SRC = rep(1, nobs),
C2SRC = rep(1, nobs),
C1FC2F = rep(1, nobs),
LRAloneC1S = rep(1, nobs),
LRAloneC2S = rep(1, nobs),
LRC1SC2S = rep(1, nobs),
AloneC1SC2S = rep(1, nobs),
LRAloneRC = rep(1, nobs),
LRC1SRC = rep(1, nobs),
AloneC1SRC = rep(1, nobs),
LRC2SRC = rep(1, nobs),
AloneC2SRC = rep(1, nobs),
C1SC2SRC = rep(1, nobs),
LRAloneC1SC2S = rep(1, nobs),
LRAloneC1SRC = rep(1, nobs),
LRAloneC2SRC = rep(1, nobs),
LRC1SC2SRC = rep(1, nobs),
AloneC1SC2SRC = rep(1, nobs),
LRAloneC1SC2SRC = rep(1, nobs)
)

# Default plotting range go across all RT values;
# you can improve appearance of plot by adjusting range later

ylimit = c(min(logRT), max(logRT))

# In this demo, I'm making four plots: C1F, C2F, C1F*C2F, WF
# so let's arrange them in a nice 2*2 matrix, for easier comparison

par(mfrow = c(2,2))

# C1F effect

predictX = predictdata # Copy data for safe keeping

predictX$C1F = seq(min(hdat$C1F), max(hdat$C1F), length = nobs)
plot(as.numeric(predict(hdat.lme, predictX, level=0)) ~
predictX$C1F, type="l", ylim=ylimit, xlab="CF1", ylab="log RT",
col = "red")

# C2F effect

predictX = predictdata

predictX$C2F = seq(min(hdat$C2F), max(hdat$C2F), length = nobs)
plot(as.numeric(predict(hdat.lme, predictX, level=0)) ~
predictX$C2F, type="l", ylim=ylimit, xlab="CF2", ylab="log RT",
col = "red")

# C1F*C2F interaction

predictX = predictdata

predictX$C1FC2F = seq(min(hdat$C1F*hdat$C2F), max(hdat$C1F*hdat$C2F), length = nobs)
plot(as.numeric(predict(hdat.lme, predictX, level=0)) ~
predictX$C1FC2F, type="l", ylim=ylimit, xlab="CF1 * CF2", ylab="log RT",
col = "red")

# WF effect

predictX = predictdata

predictX$WF = seq(min(hdat$WF), max(hdat$WF), length = nobs)
plot(as.numeric(predict(hdat.lme, predictX, level=0)) ~
predictX$WF, type="l", ylim=ylimit, xlab="WF", ylab="log RT",
col = "red")

# Close up shop

detach(hdat)