############################### # ExploreCORAALSpeechRatePause_HbkofSociophonetics.R # Kendall, Tyler. forthcoming. Sociophonetics and Speech Rate and Pause. # In Christopher Strelluf (ed.) Routledge Handbook of Sociophonetics. # London: Routledge. # # Code to explore the Corpus of Regional African American Language (CORAAL) # for some initial forays into speech rate and pause. Code replicates the # analyses in the Handbook chapter and provides opportunities to expand as # desired. Note this is all meant to exemplify some sociophonetic inquiring # about speech rate and pause and to provide some examples for how to work # with CORAAL transcripts using R; these are not meant to represent definitive # analyses or best-practices for e.g. statistical analysis. # # Tyler Kendall # Last mod. April 30, 2022 # Set where you want to save things # (make sure this directory exists if you want to save things) my.wd <- "~/Desktop/CORAAL_SRP_Examples/" setwd(my.wd) # Set up graphical parameters for later par(bg="white") defaults <- par(no.readonly=TRUE) # Load the CORAAL Explore R functions from web # This requires that RCurl is installed source('http://lingtools.uoregon.edu/coraal/explorer/R/CORAAL_web.R') # Download and set up CORAAL data and metadata # The argument collapse = F gets each utterance as a separate row in the data # Nb. this downloads the current version of CORAAL; results here may differ # from the chapter due to updates and new data available over time. coraal.init <- coraal.webget.data(collapse=F) meta <- coraal.webget.meta() # Examine the data (for instance)... head(coraal.init) # As of CORAAL v. 2021.07 there are 7 components in CORAAL. For simplicity and to follow # the components used in Kendall's chapter in the handbook, we'll drop DCA, the 1968 # Washington DC data, and the LES and VLD components which were released most recently unique(coraal.init$Comp) nrow(coraal.init) coraal <- droplevels(coraal.init[coraal.init$Comp %in% c("DCB", "ATL", "PRV", "ROC"),]) nrow(coraal) # Add columns to the data for metadata of interest # Relevel some factors for later analysis coraal$Gender <- coraal.add.meta("Gender", meta=meta, src=coraal) coraal$IntGender <- coraal.add.meta("Interviewer.Gender", meta=meta, src=coraal) # Interviewer.Relationship has two different entries for ROC_se0_ag3_f_02 (as of v. 2021.07) # This will be corrected in an upcoming version but for now needs to be hand-corrected meta[meta$CORAAL.Spkr=="ROC_se0_ag3_f_02",] meta$Interviewer.Relationship[meta$CORAAL.Spkr=="ROC_se0_ag3_f_02"] <- "Acquaintance" coraal$IntRelat <- coraal.add.meta("Interviewer.Relationship", meta=meta, src=coraal) coraal$IntRelat <- as.factor(coraal$IntRelat) # CORAAL v. 2021.07 introduced some irregularities with LES and VLD # This isn't necessary for just the four components being used here, # but if you expand to the newer components you'll need this to check for them and fix levels(coraal$IntRelat) levels(coraal$IntRelat)[which(levels(coraal$IntRelat)=="Friend")] <- "Close Relationship" levels(coraal$IntRelat)[which(levels(coraal$IntRelat)=="No previous relationship")] <- "No Prev. Relationship" coraal$IntRelat <- relevel(as.factor(coraal$IntRelat), ref="No Prev. Relationship") coraal$Interviewer <- coraal.add.meta("Interviewer.Code", meta=meta, src=coraal) coraal$Interviewer <- relevel(as.factor(coraal$Interviewer), ref="DCB_int_01") coraal$Age <- coraal.add.meta("Age", meta=meta, src=coraal) coraal$Age <- as.numeric(coraal$Age) # The CORAAL AgeGrp categories are not perfectly aligned across components (as of v. 2021.07) # So instead of using these: #coraal$AgeGrp <- coraal.add.meta("Age.Group", meta=meta, src=coraal) #coraal$AgeGrp <- relevel(as.factor(coraal$AgeGrp), ref="30 to 50") # Set up categorical age groups based on the individual ages: coraal$AgeGrp2 <- cut(coraal$Age, c(10, 19, 29, 39, 49, 59, 69, 85)) table(coraal$AgeGrp2) coraal$AgeGrp2 <- relevel(coraal$AgeGrp2, ref="(29,39]") # Note that some factors, like educational background, don't end up being used # in this code - they either don't emerge as factors of relevance in modeling or # just weren't explored by Tyler in the end. They are included here in case you # want to explore (and to exemplify some ways to work with the data) # We'll simplify the educational groups from the metadata to just three categories coraal$EduGrp <- coraal.add.meta("Edu.Group", meta=meta, src=coraal) coraal$EduGrp2 <- as.factor(tolower(coraal$EduGrp)) levels(coraal$EduGrp2) levels(coraal$EduGrp2) <- c("≥College", " 0.2 & < 0.4 seconds? length(which(arate$Dur > 0.2 & arate$Dur < 0.4)) sample(arate$Content[arate$Dur > 0.2 & arate$Dur < 0.4], 40) # We're going to drop all the utterances < 0.2 seconds in Dur # These are so short and rarely complete utterances, it seems better to exclude from # an articulation rate analysis. You could make a different decision about this... arate2 <- droplevels(arate[arate$Dur >= 0.2,]) dim(arate2) # Examine the distribution of the articulation rate data plot(density(arate2$ARate)) # Very few extremely fast utterances (< 0.2%); you could trim them but we don't here # We'll trim outliers more systematically in a moment length(which(arate2$ARate >= 10)) 100*length(which(arate2$ARate >= 10))/nrow(arate2) # Can examine the data in other ways, e.g. plot(arate2$ARate, arate2$Dur) # Now for each speaker, drop art rates that are >= 2sds ± mean # and then center, per speaker, ARate & NSyls, while we're doing this # (we'll want to do that later and it's efficient to just do it here) # We'll set up a function for this trim.artrates <- function(spkr.dat) { spkr.mn <- mean(spkr.dat$ARate) spkr.sd <- sd(spkr.dat$ARate) trim.dat <- spkr.dat[(spkr.dat$ARate >= (spkr.mn - 2*spkr.sd)) & (spkr.dat$ARate <= (spkr.mn + 2*spkr.sd)),] trim.dat$NSyls.sC <- scale(trim.dat$NSyls, center=T, scale=F) trim.dat$ARate.sC <- scale(trim.dat$ARate, center=T, scale=F) trim.dat } # Now to apply the function, we split the data by speaker, apply the function to the list # of speakers and then recombine spkr.arlist <- split(arate2, arate2$Spkr) ar.data <- do.call("rbind", lapply(spkr.arlist, trim.artrates)) # This trimmed about 40 utterances per speaker mean(table(arate2$Spkr) - table(ar.data$Spkr)) # Examined the trimmed data plot(density(ar.data$ARate)) plot(ar.data$ARate, ar.data$Dur) plot(density(ar.data$ARate.sC)) # Articulation rate data is now pretty much set up for analysis # Now set up pause data pauses <- droplevels(coraal[pause.locs,]) # Remove interviewers pauses2 <- droplevels(pauses[!is.na(pauses$Gender),]) # Examine the data a little table(pauses2$Spkr) head(pauses2) # Convert pauses to millisec and log-millisec pauses2$DurMS <- 1000*pauses2$Dur pauses2$LogDur <- log(pauses2$DurMS) # Calculate turn durations pauses2$Turn.Dur <- pauses2$Turn.EnTime - pauses2$Turn.StTime pauses2$Turn.LogDurMS <- log(1000*pauses2$Turn.Dur) # Examine plot(density(pauses2$Turn.Dur)) plot(density(pauses2$Turn.LogDurMS)) # Set up turn durations with the individual pauses subtracted pauses2$Turn.Dur.woP <- pauses2$Turn.Dur - pauses2$Dur pauses2$Turn.Log.woPDurMS <- log(1000*pauses2$Turn.Dur.woP) # Remove any turns that are longer than 300 seconds # With the logic that turns this long may be weird so may exhibit unusual pause # Of course, this could be worth it's own investigation length(which(pauses2$Turn.Dur >= 300)) pauses3 <- droplevels(pauses2[pauses2$Turn.Dur < 300,]) # Now for each speaker, drop pauses that are >= 2sds ± mean log duration # and then center, per speaker, LogDur, while we're doing this; again set up a function trim.pauses <- function(spkr.dat) { spkr.mn <- mean(spkr.dat$LogDur) spkr.sd <- sd(spkr.dat$LogDur) trim.dat <- spkr.dat[(spkr.dat$LogDur >= (spkr.mn - 2*spkr.sd)) & (spkr.dat$LogDur <= (spkr.mn + 2*spkr.sd)),] trim.dat$LogDur.sC <- scale(trim.dat$LogDur, center=T, scale=F) trim.dat } # And apply the function to the data for each speaker spkr.plist <- split(pauses3, pauses3$Spkr) p.data <- do.call("rbind", lapply(spkr.plist, trim.pauses)) # This trimmed about 30 pauses per speaker mean(table(pauses3$Spkr) - table(p.data$Spkr)) # Examine the pause data plot(density(p.data$LogDur)) plot(density(p.data$LogDur.sC)) # Calculate the number of utterances in each row in the data # There are other ways to do this, but this works p.data$NUtts <- 1 + ceiling((p.data$Turn.EnLine - p.data$Turn.StLine)/2) # Examine the data a little plot(density(p.data$NUtts)) plot(density(p.data$Turn.Log.woPDurMS)) plot(density(p.data$Turn.LogDur)) # Uncomment and execute if you want to save the data for later re-use # (remember we set the working directory at the start of the script, # so these should all save in the folder that you set there) #write.table(ar.data, file="CORAAL_ARdata.txt") #write.table(p.data, file="CORAAL_Pdata.txt") # At this point we have processed the CORAAL transcripts and generated basic data # for articulation rates and pauses. The rest of this script does some exploratory # data analysis. Some of these are the basis for the case studies in the handbook # chapter. Of course, feel free to explore more/differently. # We can examine the data in various ways, e.g. boxplot(ARate ~ Gender * Comp, data=ar.data, notch=T, las=2, xlab="", ylab="Articulation Rate") # But we will primarily run some statistics models. # First, load various R packages for data analysis and plotting # You will need to make sure you have installed these packages (and their dependencies) library(lmerTest) library(rms) library(car) library(languageR) library(emmeans) library(multcomp) library(sjPlot) # Part 1. Articulation Rate: Does region (CORAAL component) predict articulation rates? # NSyls effect on ARate is ~logarithmic plot(ar.data$NSyls, ar.data$ARate, pch='.') lines(lowess(ar.data$NSyls, ar.data$ARate)) # So we'll use this as a predictor ar.data$LogNSyls <- log(ar.data$NSyls) # Of course, statistical modeling is not straightforward. The point of this chapter # is not to overview statistical best practices so this script doesn't include the # extensive modeling criticism that would be done around here. The following # model appears to be a pretty good model for the articulation rate data, testing # these factors. As noted in the chapter, Age does not appear to influence art. rates. # Model criticism did not indicate that any interactions were significant. ar.model <- lmer(ARate ~ LogNSyls + Gender + Comp + (1+LogNSyls||Spkr), data=ar.data) summary(ar.model) Anova(ar.model) # Uncomment to save the plots, rather than display them in R #jpeg(filename = "ArtRateModel.jpg", width = 5, height = 3.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(ar.model, terms=c("Comp", "Gender"), type="pred", title="Predicted values of articulation rate", axis.title="σ/sec") #dev.off() #jpeg(filename = "ArtRateModel-allests.jpg", width = 5, height = 3, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(ar.model, title="Model estimates for articulation rate") #dev.off() # These next three plots create Figure 6 in the chapter #jpeg(filename = "ArtRateModel-nsyls.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(ar.model, terms=c("LogNSyls"), type="pred", title="", axis.title=c("Log N Syllables", "σ/sec")) #dev.off() #jpeg(filename = "ArtRateModel-gender.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(ar.model, terms=c("Gender"), type="pred", title="", axis.title=c("Gender", "σ/sec")) #dev.off() #jpeg(filename = "ArtRateModel-comp.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(ar.model, terms=c("Comp"), type="pred", title="", axis.title=c("CORAAL Component", "σ/sec")) #dev.off() # Part 2. Pauses. Does region (CORAAL component) predict pause durations? # Center Turn duration measure p.data$Turn.LogDurMSC <- scale(p.data$Turn.Log.woPDurMS, center=T, scale=F) # As with articulation rate, a lot of model criticism should be done. This model # appears quite good, with model criticism supporting the inclusion of the interaction # and the exclusion of Gender and Age as predictors p.model <- lmer(LogDur ~ Comp * Turn.LogDurMSC + (1 + Turn.LogDurMSC||Spkr), data=p.data) summary(p.model) Anova(p.model) # These next three plots create Figure 7 in the chapter #jpeg(filename = "PauseModel-TurnDur.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(p.model, terms=c("Turn.LogDurMSC"), type="pred", title="", axis.title=c("Turn Dur (log millisec)", "Pause Dur (log millisec)")) #dev.off() #jpeg(filename = "PauseModel-comp.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(p.model, terms=c("Comp"), type="pred", title="", axis.title=c("CORAAL Component", "Pause Dur (log millisec)")) #dev.off() #jpeg(filename = "PauseModel-compTurndurX.jpg", width = 2.8, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(p.model, terms=c("Turn.LogDurMSC", "Comp"), type="pred", title="", axis.title=c("Turn Dur (log millisec)", "Pause Dur (log millisec)")) #dev.off() # Part 3. Back to Articulation Rate. What about the interviewer's gender and relationship # with the interviewee? # Shorten the Interviewer relationship coding levels(ar.data$IntRelat) <- c("None", "Acquaint.", "Close") # Again, lots of work goes in to identifying the best model. Model criticism indicates # that this model is a good model and improves the fit over the simpler articulation # rate model above. ar.model2 <- lmer(ARate ~ LogNSyls + Gender * IntGender + IntGender * IntRelat + (1+LogNSyls||Spkr), data=ar.data) summary(ar.model2) Anova(ar.model2) # Can check Component here, which is not sig (hence not included in final model) ar.model2.comp <- lmer(ARate ~ LogNSyls + Gender * IntGender + IntGender * IntRelat + Comp + (1+LogNSyls||Spkr), data=ar.data) anova(ar.model2, ar.model2.comp) #p = 0.259 Anova(ar.model2.comp) # This creates the plots for Figure 8 in the chapter #jpeg(filename = "ArtRateModel2-nsyls.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(ar.model2, terms=c("LogNSyls"), type="pred", title="", axis.title=c("Log N Syllables", "σ/sec")) #dev.off() #jpeg(filename = "ArtRateModel2-genderXintgender.jpg", width = 2.8, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(ar.model2, terms=c("IntGender", "Gender"), type="pred", title="", axis.title=c("Interviewer Gender", "σ/sec"), legend.title="Speaker\nGender") #dev.off() #jpeg(filename = "ArtRateModel2-intgenderXintrelat.jpg", width = 2.8, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz") plot_model(ar.model2, terms=c("IntRelat", "IntGender"), type="pred", title="", axis.title=c("Interviewer's Relationship", "σ/sec"), legend.title="Interv'er\nGender") #dev.off() # And that wraps up what was included in the chapter. Lots of opportunities remain # for further exploring!