###############################
# R-copauxmodeling-HbkVarSlx.R
#  Kendall, Tyler. forthcoming. Quantitatively analyzing variation and change. In Paul Kerswill, 
#  Yoshiyuki Asahi, and Alexandra D'Arcy (eds.), Handbook of Variationist Sociolinguistics. London: Routledge.
#
# R code to load the (cop/aux) data from CORAAL, run the R-based statistical models, and generate 
#  the figures presented in Tyler Kendall's chapter.
#
# Tyler Kendall
#  Last mod. November 3, 2022


# Set where you want to save things 
#  (make sure this directory exists if you want to save things)
my.wd <- "~/Documents/QuantAnalysis/"
setwd(my.wd)

# Set up graphical parameters for later
par(bg='white')
defaults<-par(no.readonly=TRUE)

# Load the necessary libraries (you will need to install these and their dependencies from CRAN 
#  if you have not already; see a source like https://r-coder.com/install-r-packages/ if you need help)
library(lmerTest)
library(rms)
library(languageR)
library(partykit)

# Load the data in (directly from the website; or can download and replace URL with file loc)
cop <- read.delim("http://lingtools.uoregon.edu/coraal/explorer/R/CORAAL-copaux-2500tokens-HbkVarSlx.txt")

# Examine the data in some preliminary ways
head(cop)

table(cop$SUBJTYPE)
table(cop$SUBJTYPE, cop$Cat)

table(cop$PERSNUM)
table(cop$PERSNUM, cop$Cat)

table(cop$FOLLGRAM)
table(cop$FOLLGRAM, cop$Cat)

table(cop$Gender)
table(cop$Gender, cop$Cat)

table(cop$SEC)
table(cop$SEC, cop$Cat)

table(cop$Age.Grp)
table(cop$Age.Grp, cop$Cat)

# Generate bar plots (these are Figures 2 - 5 in the chapter)

# Figures

# Can uncomment jpeg() and dev.off() lines to save as files (will save in working directory)
#jpeg(filename="CopAux-SubjType+PersNum-barplot.jpg", width = 6, height = 3, units = "in", res = 300, pointsize = 8, quality=100, bg = "white", type = "quartz", antialias="none")
par(mfrow=c(1,2), mai=c(0.4, 0.55, 0.1, 0.1))
barplot(t(round(100*prop.table(table(cop$SUBJTYPE, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$SUBJTYPE)), paste("N=",table(cop$SUBJTYPE), sep=""), sep="\n"), cex.names=0.9, main="Subject Type")
barplot(t(round(100*prop.table(table(cop$PERSNUM, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$PERSNUM)), paste("N=",table(cop$PERSNUM), sep=""), sep="\n"), cex.names=0.9, main="Person & Number")
par(defaults)
#dev.off()

#jpeg(filename="CopAux-FollGram-barplot.jpg", width = 6, height = 3, units = "in", res = 300, pointsize = 8, quality=100, bg = "white", type = "quartz", antialias="none")
par(mfrow=c(1,1), mai=c(0.4, 0.55, 0.1, 0.1))
barplot(t(round(100*prop.table(table(cop$FOLLGRAM, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$FOLLGRAM)), paste("N=",table(cop$FOLLGRAM), sep=""), sep="\n"), cex.names=0.9, main="Following Grammatical Environment")
par(defaults)
#dev.off()

#jpeg(filename="CopAux-Gender+SEC-barplot.jpg", width = 6, height = 3, units = "in", res = 300, pointsize = 8, quality=100, bg = "white", type = "quartz", antialias="none")
par(mfrow=c(1,2), mai=c(0.4, 0.55, 0.1, 0.1))
barplot(t(round(100*prop.table(table(cop$Gender, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$Gender)), paste("N=",table(cop$Gender), sep=""), sep="\n"), cex.names=0.9, main="Gender")
barplot(t(round(100*prop.table(table(cop$SEC, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$SEC)), paste("N=",table(cop$SEC), sep=""), sep="\n"), cex.names=0.9, main="Socioeconomic Status")
par(defaults)
#dev.off()

#jpeg(filename="CopAux-AgeGroup-barplot.jpg", width = 6, height = 3, units = "in", res = 300, pointsize = 8, quality=100, bg = "white", type = "quartz", antialias="none")
par(mfrow=c(1,1), mai=c(0.4, 0.55, 0.1, 0.1))
barplot(t(round(100*prop.table(table(cop$Age.Grp, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$Age.Grp)), paste("N=",table(cop$Age.Grp), sep=""), sep="\n"), cex.names=0.9, main="Age Group")
par(defaults)
#dev.off()


# For analysis, dropping the very rare sposta cases and also the Other prep phrase tokens 
#  (i.e. locative like but not loc) there aren't many)
cop2 <- droplevels(cop[cop$FOLLGRAM!="sposta" & cop$FOLLGRAM!="OtherPP",])

# This dropped 41 tokens
nrow(cop) - nrow(cop2)

# Now we do some simple recoding to prepare for statistical analysis
cop2$SUBJTYPE <- relevel(as.factor(cop2$SUBJTYPE), ref="PersPron")
contrasts(cop2$SUBJTYPE)

cop2$PERSNUM <- as.factor(cop2$PERSNUM)
contrasts(cop2$PERSNUM)

cop2$FOLLGRAM <- relevel(as.factor(cop2$FOLLGRAM), ref="V+ing")
contrasts(cop2$FOLLGRAM)

cop2$Gender <- as.factor(cop2$Gender)
cop2$GenderC <- cop2$Gender
contrasts(cop2$GenderC) <- contr.sum(2)/2

cop2$AgeC <- scale(cop2$Age, center=T, scale=F)

cop2$SEC <- relevel(as.factor(cop2$SEC), ref="2")

# And then we will also drop the very common but mostly invariant it/there 
#  subjects (nearly categorical: 95+% contracted, only 0.4% abs)
cop3 <- droplevels(cop2[cop2$SUBJTYPE!="it/there",])

# Overall amount of absence is 27.9%
100*prop.table(table(cop3$IsAbs))

# What range of token amounts (Ns) do we have across the speakers?
range(table(cop3$Spkr))

# For Varbrul analysis, other software, GoldVarb, needs to be used
# Available at http://individual.utoronto.ca/tagliamonte/goldvarb.html
#  or Google "GoldVarb download".
# But R can be used to simplify the preparation of the data for GoldVarb,
#  which needs each factor represented as a single character
# This code will convert the data for importing and analyzing in GoldVarb

gv <- cop3[,c("IsAbs", "Gender", "Age.Grp", "SEC", "FOLLGRAM", "SUBJTYPE", "PERSNUM")]
head(gv)

gv$IsAbs <- as.factor(gv$IsAbs)
levels(gv$IsAbs)
levels(gv$IsAbs) <- c("(P", "(A")
levels(gv$Gender)
levels(gv$Gender) <- c("F", "M")
gv$Age.Grp <- as.factor(gv$Age.Grp)
levels(gv$Age.Grp)
levels(gv$Age.Grp) <- c("V", "Y", "T", "O")
levels(gv$SEC) # don't need to change anything - keep 1, 2, and 3
levels(gv$FOLLGRAM)
levels(gv$FOLLGRAM) <- c("v", "a", "g", "l", "k", "n")
levels(gv$SUBJTYPE)
levels(gv$SUBJTYPE) <- c("R", "N", "W")
levels(gv$PERSNUM)
levels(gv$PERSNUM) <- c("r", "s")

head(gv)

# And save out the file in the working directory (note no separator)
write.table(gv, file="CORAAL-copaux-4goldvarb.tok", quote=F, sep="", row.names=F, col.names=F)
# Now you'll need to open this in GoldVarb and walk through its steps there
# Sorry, not including instructions here.


# Logistic regressions (no random effects) with lrm() from rms library

# Set up the data for the rms library; this first step is generic for lrm
dd <- datadist(cop3)
options(datadist='dd')

# This is the basic logistic regression model in the chapter
lrm.fit0 <- lrm(IsAbs ~ GenderC + AgeC + SEC + FOLLGRAM + SUBJTYPE + PERSNUM, data=cop3)
print(lrm.fit0) # this is Table 3 in chapter

# Could go beyond what was done in the chapter - e.g. testing interaction btw Age and SEC
lrm.fit1 <- lrm(IsAbs ~ GenderC + AgeC * SEC + FOLLGRAM + SUBJTYPE + PERSNUM, data=cop3)
print(lrm.fit1) # Appears significant

# Or testing non-linearity for Age
lrm.fit2 <- lrm(IsAbs ~ GenderC + rcs(AgeC, 5) + SEC + FOLLGRAM + SUBJTYPE + PERSNUM, data=cop3)
print(lrm.fit2) # Does not appear significant


# Mixed-effect logistic regression using glmer() in LmerTest library

# Build the model above but now with a random intercept for speaker
# This is the model for Table 4 in the chapter
glmr.fit0 <- glmer(IsAbs ~ GenderC + AgeC + SEC + FOLLGRAM + SUBJTYPE + PERSNUM + (1|Spkr), data=cop3, family="binomial", control=glmerControl(optimizer="bobyqa"))
summary(glmr.fit0)

# Can then try additional models, like Age * SEC interaction
glmr.fit1 <- glmer(IsAbs ~ GenderC + AgeC * SEC + FOLLGRAM + SUBJTYPE + PERSNUM + (1|Spkr), data=cop3, family="binomial", control=glmerControl(optimizer="bobyqa"))
summary(glmr.fit1)
anova(glmr.fit0, glmr.fit1) # Note that it is not quite significant, p = 0.05211
#  Adding the interaction does not significantly improve the model

# Age isn't significant as a main effect, so we might want to remove it
glmr.fit2 <- glmer(IsAbs ~ GenderC + SEC + FOLLGRAM + SUBJTYPE + PERSNUM + (1|Spkr), data=cop3, family="binomial", control=glmerControl(optimizer="bobyqa"))
summary(glmr.fit2)
anova(glmr.fit0, glmr.fit2) # And confirmation that the model is not better with Age, p = 0.6247

# what about gender now
glmr.fit3 <- glmer(IsAbs ~ SEC + FOLLGRAM + SUBJTYPE + PERSNUM + (1|Spkr), data=cop3, family="binomial", control=glmerControl(optimizer="bobyqa"))
summary(glmr.fit3)
anova(glmr.fit2, glmr.fit3) # And actually Gender is not quite significant, p = 0.05748

# From here an analysis would likely want to test other potential interactions of interest
#  And to develop models that include random slopes (which often won't converge)

# Examine the cop/aux data using conditional inference trees
#  Using the ctree() function from partykit library (which we loaded at start)

# We'll simplify the data coding a little to make it earlier to read in the tree format
cop4 <- cop3
levels(cop4$FOLLGRAM) <- c("Ving", "Adj", "gna", "QLike", "Loc", "NP")
levels(cop4$PERSNUM) <- c("are", "is")
cop4$Cat <- as.factor(cop4$Cat)
levels(cop4$Cat) <- c("A", "C", "F")
names(cop4)[which(names(cop4)=="SUBJTYPE")] <- "SubjType"
names(cop4)[which(names(cop4)=="PERSNUM")] <- "PersNum"
names(cop4)[which(names(cop4)=="FOLLGRAM")] <- "FolGram"
cop4$IsAbs <- as.factor(cop4$IsAbs)

ctree.fit0 <- ctree(Cat ~ Gender + Age + SEC + FolGram + SubjType + PersNum, data=cop4)

# Examine the ctree output; this is Figure 6 in the chapter
#  Printing this to a file (in your working dir) - it is much easier to read in the 
#  dimensions that can be printed to a file than in the R window
jpeg(filename="CopAux-CTREE_partykit_output.jpg", width = 12, height = 6, units = "in", res = 400, pointsize = 7, quality=100, bg = "white", type = "quartz", antialias="none")
par(mfrow=c(1,1), mai=c(0.1, 0.1, 0.1, 0.1))
plot(ctree.fit0, terminal_panel=node_barplot(ctree.fit0, id=F, text="none", beside=F))
par(defaults)
dev.off()


# A simple random forest of the cop/aux data; details not reported in the chapter
#  This can be slow to run.
cf.fit0 <- cforest(IsAbs ~ Gender + Age + SEC + FolGram + SubjType + PersNum, data=cop4)
summary(cf.fit0)
vimp <- varimp(cf.fit0)
barplot(sort(vimp, decreasing=F), horiz=T, las=1)

# And that's it. Hopefully this code is a helpful addition to the chapter!