############################### # R-copauxmodeling-HbkVarSlx.R # Kendall, Tyler. forthcoming. Quantitatively analyzing variation and change. In Paul Kerswill, # Yoshiyuki Asahi, and Alexandra D'Arcy (eds.), Handbook of Variationist Sociolinguistics. London: Routledge. # # R code to load the (cop/aux) data from CORAAL, run the R-based statistical models, and generate # the figures presented in Tyler Kendall's chapter. # # Tyler Kendall # Last mod. November 3, 2022 # Set where you want to save things # (make sure this directory exists if you want to save things) my.wd <- "~/Documents/QuantAnalysis/" setwd(my.wd) # Set up graphical parameters for later par(bg='white') defaults<-par(no.readonly=TRUE) # Load the necessary libraries (you will need to install these and their dependencies from CRAN # if you have not already; see a source like https://r-coder.com/install-r-packages/ if you need help) library(lmerTest) library(rms) library(languageR) library(partykit) # Load the data in (directly from the website; or can download and replace URL with file loc) cop <- read.delim("http://lingtools.uoregon.edu/coraal/explorer/R/CORAAL-copaux-2500tokens-HbkVarSlx.txt") # Examine the data in some preliminary ways head(cop) table(cop$SUBJTYPE) table(cop$SUBJTYPE, cop$Cat) table(cop$PERSNUM) table(cop$PERSNUM, cop$Cat) table(cop$FOLLGRAM) table(cop$FOLLGRAM, cop$Cat) table(cop$Gender) table(cop$Gender, cop$Cat) table(cop$SEC) table(cop$SEC, cop$Cat) table(cop$Age.Grp) table(cop$Age.Grp, cop$Cat) # Generate bar plots (these are Figures 2 - 5 in the chapter) # Figures # Can uncomment jpeg() and dev.off() lines to save as files (will save in working directory) #jpeg(filename="CopAux-SubjType+PersNum-barplot.jpg", width = 6, height = 3, units = "in", res = 300, pointsize = 8, quality=100, bg = "white", type = "quartz", antialias="none") par(mfrow=c(1,2), mai=c(0.4, 0.55, 0.1, 0.1)) barplot(t(round(100*prop.table(table(cop$SUBJTYPE, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$SUBJTYPE)), paste("N=",table(cop$SUBJTYPE), sep=""), sep="\n"), cex.names=0.9, main="Subject Type") barplot(t(round(100*prop.table(table(cop$PERSNUM, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$PERSNUM)), paste("N=",table(cop$PERSNUM), sep=""), sep="\n"), cex.names=0.9, main="Person & Number") par(defaults) #dev.off() #jpeg(filename="CopAux-FollGram-barplot.jpg", width = 6, height = 3, units = "in", res = 300, pointsize = 8, quality=100, bg = "white", type = "quartz", antialias="none") par(mfrow=c(1,1), mai=c(0.4, 0.55, 0.1, 0.1)) barplot(t(round(100*prop.table(table(cop$FOLLGRAM, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$FOLLGRAM)), paste("N=",table(cop$FOLLGRAM), sep=""), sep="\n"), cex.names=0.9, main="Following Grammatical Environment") par(defaults) #dev.off() #jpeg(filename="CopAux-Gender+SEC-barplot.jpg", width = 6, height = 3, units = "in", res = 300, pointsize = 8, quality=100, bg = "white", type = "quartz", antialias="none") par(mfrow=c(1,2), mai=c(0.4, 0.55, 0.1, 0.1)) barplot(t(round(100*prop.table(table(cop$Gender, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$Gender)), paste("N=",table(cop$Gender), sep=""), sep="\n"), cex.names=0.9, main="Gender") barplot(t(round(100*prop.table(table(cop$SEC, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$SEC)), paste("N=",table(cop$SEC), sep=""), sep="\n"), cex.names=0.9, main="Socioeconomic Status") par(defaults) #dev.off() #jpeg(filename="CopAux-AgeGroup-barplot.jpg", width = 6, height = 3, units = "in", res = 300, pointsize = 8, quality=100, bg = "white", type = "quartz", antialias="none") par(mfrow=c(1,1), mai=c(0.4, 0.55, 0.1, 0.1)) barplot(t(round(100*prop.table(table(cop$Age.Grp, cop$Cat, useNA="ifany"), 1), 2)), beside=T, leg=T, args.legend=list(cex=0.8, ncol=3, bty="n"), ylim=c(0, 110), ylab="%", names=paste(names(table(cop$Age.Grp)), paste("N=",table(cop$Age.Grp), sep=""), sep="\n"), cex.names=0.9, main="Age Group") par(defaults) #dev.off() # For analysis, dropping the very rare sposta cases and also the Other prep phrase tokens # (i.e. locative like but not loc) there aren't many) cop2 <- droplevels(cop[cop$FOLLGRAM!="sposta" & cop$FOLLGRAM!="OtherPP",]) # This dropped 41 tokens nrow(cop) - nrow(cop2) # Now we do some simple recoding to prepare for statistical analysis cop2$SUBJTYPE <- relevel(as.factor(cop2$SUBJTYPE), ref="PersPron") contrasts(cop2$SUBJTYPE) cop2$PERSNUM <- as.factor(cop2$PERSNUM) contrasts(cop2$PERSNUM) cop2$FOLLGRAM <- relevel(as.factor(cop2$FOLLGRAM), ref="V+ing") contrasts(cop2$FOLLGRAM) cop2$Gender <- as.factor(cop2$Gender) cop2$GenderC <- cop2$Gender contrasts(cop2$GenderC) <- contr.sum(2)/2 cop2$AgeC <- scale(cop2$Age, center=T, scale=F) cop2$SEC <- relevel(as.factor(cop2$SEC), ref="2") # And then we will also drop the very common but mostly invariant it/there # subjects (nearly categorical: 95+% contracted, only 0.4% abs) cop3 <- droplevels(cop2[cop2$SUBJTYPE!="it/there",]) # Overall amount of absence is 27.9% 100*prop.table(table(cop3$IsAbs)) # What range of token amounts (Ns) do we have across the speakers? range(table(cop3$Spkr)) # For Varbrul analysis, other software, GoldVarb, needs to be used # Available at http://individual.utoronto.ca/tagliamonte/goldvarb.html # or Google "GoldVarb download". # But R can be used to simplify the preparation of the data for GoldVarb, # which needs each factor represented as a single character # This code will convert the data for importing and analyzing in GoldVarb gv <- cop3[,c("IsAbs", "Gender", "Age.Grp", "SEC", "FOLLGRAM", "SUBJTYPE", "PERSNUM")] head(gv) gv$IsAbs <- as.factor(gv$IsAbs) levels(gv$IsAbs) levels(gv$IsAbs) <- c("(P", "(A") levels(gv$Gender) levels(gv$Gender) <- c("F", "M") gv$Age.Grp <- as.factor(gv$Age.Grp) levels(gv$Age.Grp) levels(gv$Age.Grp) <- c("V", "Y", "T", "O") levels(gv$SEC) # don't need to change anything - keep 1, 2, and 3 levels(gv$FOLLGRAM) levels(gv$FOLLGRAM) <- c("v", "a", "g", "l", "k", "n") levels(gv$SUBJTYPE) levels(gv$SUBJTYPE) <- c("R", "N", "W") levels(gv$PERSNUM) levels(gv$PERSNUM) <- c("r", "s") head(gv) # And save out the file in the working directory (note no separator) write.table(gv, file="CORAAL-copaux-4goldvarb.tok", quote=F, sep="", row.names=F, col.names=F) # Now you'll need to open this in GoldVarb and walk through its steps there # Sorry, not including instructions here. # Logistic regressions (no random effects) with lrm() from rms library # Set up the data for the rms library; this first step is generic for lrm dd <- datadist(cop3) options(datadist='dd') # This is the basic logistic regression model in the chapter lrm.fit0 <- lrm(IsAbs ~ GenderC + AgeC + SEC + FOLLGRAM + SUBJTYPE + PERSNUM, data=cop3) print(lrm.fit0) # this is Table 3 in chapter # Could go beyond what was done in the chapter - e.g. testing interaction btw Age and SEC lrm.fit1 <- lrm(IsAbs ~ GenderC + AgeC * SEC + FOLLGRAM + SUBJTYPE + PERSNUM, data=cop3) print(lrm.fit1) # Appears significant # Or testing non-linearity for Age lrm.fit2 <- lrm(IsAbs ~ GenderC + rcs(AgeC, 5) + SEC + FOLLGRAM + SUBJTYPE + PERSNUM, data=cop3) print(lrm.fit2) # Does not appear significant # Mixed-effect logistic regression using glmer() in LmerTest library # Build the model above but now with a random intercept for speaker # This is the model for Table 4 in the chapter glmr.fit0 <- glmer(IsAbs ~ GenderC + AgeC + SEC + FOLLGRAM + SUBJTYPE + PERSNUM + (1|Spkr), data=cop3, family="binomial", control=glmerControl(optimizer="bobyqa")) summary(glmr.fit0) # Can then try additional models, like Age * SEC interaction glmr.fit1 <- glmer(IsAbs ~ GenderC + AgeC * SEC + FOLLGRAM + SUBJTYPE + PERSNUM + (1|Spkr), data=cop3, family="binomial", control=glmerControl(optimizer="bobyqa")) summary(glmr.fit1) anova(glmr.fit0, glmr.fit1) # Note that it is not quite significant, p = 0.05211 # Adding the interaction does not significantly improve the model # Age isn't significant as a main effect, so we might want to remove it glmr.fit2 <- glmer(IsAbs ~ GenderC + SEC + FOLLGRAM + SUBJTYPE + PERSNUM + (1|Spkr), data=cop3, family="binomial", control=glmerControl(optimizer="bobyqa")) summary(glmr.fit2) anova(glmr.fit0, glmr.fit2) # And confirmation that the model is not better with Age, p = 0.6247 # what about gender now glmr.fit3 <- glmer(IsAbs ~ SEC + FOLLGRAM + SUBJTYPE + PERSNUM + (1|Spkr), data=cop3, family="binomial", control=glmerControl(optimizer="bobyqa")) summary(glmr.fit3) anova(glmr.fit2, glmr.fit3) # And actually Gender is not quite significant, p = 0.05748 # From here an analysis would likely want to test other potential interactions of interest # And to develop models that include random slopes (which often won't converge) # Examine the cop/aux data using conditional inference trees # Using the ctree() function from partykit library (which we loaded at start) # We'll simplify the data coding a little to make it earlier to read in the tree format cop4 <- cop3 levels(cop4$FOLLGRAM) <- c("Ving", "Adj", "gna", "QLike", "Loc", "NP") levels(cop4$PERSNUM) <- c("are", "is") cop4$Cat <- as.factor(cop4$Cat) levels(cop4$Cat) <- c("A", "C", "F") names(cop4)[which(names(cop4)=="SUBJTYPE")] <- "SubjType" names(cop4)[which(names(cop4)=="PERSNUM")] <- "PersNum" names(cop4)[which(names(cop4)=="FOLLGRAM")] <- "FolGram" cop4$IsAbs <- as.factor(cop4$IsAbs) ctree.fit0 <- ctree(Cat ~ Gender + Age + SEC + FolGram + SubjType + PersNum, data=cop4) # Examine the ctree output; this is Figure 6 in the chapter # Printing this to a file (in your working dir) - it is much easier to read in the # dimensions that can be printed to a file than in the R window jpeg(filename="CopAux-CTREE_partykit_output.jpg", width = 12, height = 6, units = "in", res = 400, pointsize = 7, quality=100, bg = "white", type = "quartz", antialias="none") par(mfrow=c(1,1), mai=c(0.1, 0.1, 0.1, 0.1)) plot(ctree.fit0, terminal_panel=node_barplot(ctree.fit0, id=F, text="none", beside=F)) par(defaults) dev.off() # A simple random forest of the cop/aux data; details not reported in the chapter # This can be slow to run. cf.fit0 <- cforest(IsAbs ~ Gender + Age + SEC + FolGram + SubjType + PersNum, data=cop4) summary(cf.fit0) vimp <- varimp(cf.fit0) barplot(sort(vimp, decreasing=F), horiz=T, las=1) # And that's it. Hopefully this code is a helpful addition to the chapter!