###############################
# ExploreCORAALSpeechRatePause_HbkofSociophonetics.R
#  Kendall, Tyler. forthcoming. Sociophonetics and Speech Rate and Pause. 
#  In Christopher Strelluf (ed.) Routledge Handbook of Sociophonetics. 
#  London: Routledge.
#
# Code to explore the Corpus of Regional African American Language (CORAAL)
#  for some initial forays into speech rate and pause. Code replicates the
#  analyses in the Handbook chapter and provides opportunities to expand as
#  desired. Note this is all meant to exemplify some sociophonetic inquiring
#  about speech rate and pause and to provide some examples for how to work
#  with CORAAL transcripts using R; these are not meant to represent definitive
#  analyses or best-practices for e.g. statistical analysis.
#
# Tyler Kendall
#  Last mod. April 30, 2022


# Set where you want to save things 
#  (make sure this directory exists if you want to save things)
my.wd <- "~/Desktop/CORAAL_SRP_Examples/"
setwd(my.wd)

# Set up graphical parameters for later
par(bg="white")
defaults <- par(no.readonly=TRUE)

# Load the CORAAL Explore R functions from web
# This requires that RCurl is installed
source('http://lingtools.uoregon.edu/coraal/explorer/R/CORAAL_web.R')

# Download and set up CORAAL data and metadata
#  The argument collapse = F gets each utterance as a separate row in the data
#  Nb. this downloads the current version of CORAAL; results here may differ
#  from the chapter due to updates and new data available over time.
coraal.init <- coraal.webget.data(collapse=F)
meta <- coraal.webget.meta()

# Examine the data (for instance)...
head(coraal.init)

# As of CORAAL v. 2021.07 there are 7 components in CORAAL. For simplicity and to follow
#  the components used in Kendall's chapter in the handbook, we'll drop DCA, the 1968
#  Washington DC data, and the LES and VLD components which were released most recently
unique(coraal.init$Comp)
nrow(coraal.init)
coraal <- droplevels(coraal.init[coraal.init$Comp %in% c("DCB", "ATL", "PRV", "ROC"),])
nrow(coraal)

# Add columns to the data for metadata of interest
#  Relevel some factors for later analysis
coraal$Gender <- coraal.add.meta("Gender", meta=meta, src=coraal)
coraal$IntGender <- coraal.add.meta("Interviewer.Gender", meta=meta, src=coraal)

# Interviewer.Relationship has two different entries for ROC_se0_ag3_f_02 (as of v. 2021.07)
#  This will be corrected in an upcoming version but for now needs to be hand-corrected
meta[meta$CORAAL.Spkr=="ROC_se0_ag3_f_02",]
meta$Interviewer.Relationship[meta$CORAAL.Spkr=="ROC_se0_ag3_f_02"] <- "Acquaintance"


coraal$IntRelat <- coraal.add.meta("Interviewer.Relationship", meta=meta, src=coraal)
coraal$IntRelat <- as.factor(coraal$IntRelat)
# CORAAL v. 2021.07 introduced some irregularities with LES and VLD
#  This isn't necessary for just the four components being used here, 
#  but if you expand to the newer components you'll need this to check for them and fix
levels(coraal$IntRelat)
levels(coraal$IntRelat)[which(levels(coraal$IntRelat)=="Friend")] <- "Close Relationship"
levels(coraal$IntRelat)[which(levels(coraal$IntRelat)=="No previous relationship")] <- "No Prev. Relationship"
coraal$IntRelat <- relevel(as.factor(coraal$IntRelat), ref="No Prev. Relationship")
coraal$Interviewer <- coraal.add.meta("Interviewer.Code", meta=meta, src=coraal)
coraal$Interviewer <- relevel(as.factor(coraal$Interviewer), ref="DCB_int_01")

coraal$Age <- coraal.add.meta("Age", meta=meta, src=coraal)
coraal$Age <- as.numeric(coraal$Age)

# The CORAAL AgeGrp categories are not perfectly aligned across components (as of v. 2021.07)
#  So instead of using these:
#coraal$AgeGrp <- coraal.add.meta("Age.Group", meta=meta, src=coraal)
#coraal$AgeGrp <- relevel(as.factor(coraal$AgeGrp), ref="30 to 50")
#  Set up categorical age groups based on the individual ages:
coraal$AgeGrp2 <- cut(coraal$Age, c(10, 19, 29, 39, 49, 59, 69, 85))
table(coraal$AgeGrp2)
coraal$AgeGrp2 <- relevel(coraal$AgeGrp2, ref="(29,39]")

# Note that some factors, like educational background, don't end up being used
#  in this code - they either don't emerge as factors of relevance in modeling or
#  just weren't explored by Tyler in the end. They are included here in case you
#  want to explore (and to exemplify some ways to work with the data)
# We'll simplify the educational groups from the metadata to just three categories
coraal$EduGrp <- coraal.add.meta("Edu.Group", meta=meta, src=coraal)
coraal$EduGrp2 <- as.factor(tolower(coraal$EduGrp))
levels(coraal$EduGrp2)
levels(coraal$EduGrp2) <- c("≥College", "<High School", "≥College", "≥High School", "<High School", "≥High School", "<High School", "<High School", "≥High School", "<High School", "<High School")
coraal$EduGrp2 <- relevel(coraal$EduGrp2, ref="≥High School")

# Just get SEC from the speaker labels, but don't treat se0 as lower than se1!
coraal$SEC <- gsub("^.*_se(\\d)_.*$", "\\1", coraal$Spkr, perl=T)
coraal$SEC[nchar(coraal$SEC)!=1] <- NA
coraal$SEC <- as.factor(coraal$SEC)
coraal$SEC <- relevel(coraal$SEC, ref="1")

coraal$CorpSEC <- as.factor(paste(coraal$Comp, coraal$SEC, sep="-"))
coraal$CorpSEC <- relevel(coraal$CorpSEC, ref="DCB-1")

coraal$Comp <- relevel(as.factor(coraal$Comp), ref="DCB")

# Load the syllable counter function, based on Kendall (2013)
#  Currently this assumes its in the working directory
source("english_syllable_counter-coraal.R")

# Set up some relevant columns in the data
coraal$Dur <- coraal$EnTime - coraal$StTime
coraal$Turn.Dur <- coraal$Turn.EnTime - coraal$Turn.StTime
coraal$NSyls <- NA

# Make sure transcripts are character data (and not factors)
coraal$Turn.Simple <- as.character(coraal$Turn.Simple)
coraal$Content <- as.character(coraal$Content)

# Find all the pauses
pause.locs <- grep("^\\(pause", coraal$Content, perl=T)
# How many pauses?
length(pause.locs)

# And add a column to mark the pauses
coraal$IsPause <- FALSE
coraal$IsPause[pause.locs] <- TRUE

# Now identify utterances that shouldn't be included in an articulation rate analysis
#  Start by marking everything as good for articulation rate analysis
coraal$Good4AR <- TRUE

# Then remove utterances with overlapping speech
coraal$IsOverlap <- FALSE
coraal$IsOverlap[grep("\\[", coraal$Content, perl=T)] <- TRUE
table(coraal$IsOverlap)
coraal$Good4AR[coraal$IsOverlap] <- FALSE

# Noises
coraal$Good4AR[grep("\\<", coraal$Content, perl=T)] <- FALSE

# Utterances with comments or laughing etc.
#  Nb. this will also flag pauses, but that is fine since we don't use them in art rate study
coraal$Good4AR[grep("\\(", coraal$Content, perl=T)] <- FALSE

# How many lines are utterances that are good for articulation rate analysis
table(coraal$Good4AR)

# Get comfortable; this takes a long time...
coraal$NSyls[-pause.locs] <- sapply(coraal$Content[-pause.locs], english_syllable_count)

coraal$ARate <- NA
coraal$ARate[coraal$Good4AR] <- round(coraal$NSyls[coraal$Good4AR]/coraal$Dur[coraal$Good4AR], 3)

# Make a dataframe for just the utterances, removing pauses
cutts <- droplevels(coraal[-pause.locs,])
dim(cutts)

# Remove the interviewers and utterances that are not good analysis, 
#  leaving just the data we want for analysis
#  Note is.na(Gender) is a short cut to find interviewers, since interviewers don't have
#  demographic info like Gender
arate <- droplevels(cutts[!is.na(cutts$Gender) & cutts$Good4AR & cutts$NSyls!=0,])
dim(arate)

# Before analyzing the data, we'll examine the shortest utterances
plot(density(arate$Dur[arate$Dur < 1]))

# What kinds of utterances are < 0.2 seconds?
length(which(arate$Dur < 0.2))
sample(arate$Content[arate$Dur < 0.2], 40)
sample(arate$Content[arate$Dur < 0.2], 40)

# What kinds of utterances are > 0.2 & < 0.4 seconds?
length(which(arate$Dur > 0.2 & arate$Dur < 0.4))
sample(arate$Content[arate$Dur > 0.2 & arate$Dur < 0.4], 40)

# We're going to drop all the utterances < 0.2 seconds in Dur
#  These are so short and rarely complete utterances, it seems better to exclude from
#  an articulation rate analysis. You could make a different decision about this...
arate2 <- droplevels(arate[arate$Dur >= 0.2,])
dim(arate2)

# Examine the distribution of the articulation rate data
plot(density(arate2$ARate))

# Very few extremely fast utterances (< 0.2%); you could trim them but we don't here
#  We'll trim outliers more systematically in a moment
length(which(arate2$ARate >= 10))
100*length(which(arate2$ARate >= 10))/nrow(arate2)

# Can examine the data in other ways, e.g.
plot(arate2$ARate, arate2$Dur)

# Now for each speaker, drop art rates that are >= 2sds ± mean
#  and then center, per speaker, ARate & NSyls, while we're doing this
#  (we'll want to do that later and it's efficient to just do it here)
#  We'll set up a function for this
trim.artrates <- function(spkr.dat) {
	spkr.mn <- mean(spkr.dat$ARate)
	spkr.sd <- sd(spkr.dat$ARate)
	
	trim.dat <- spkr.dat[(spkr.dat$ARate >= (spkr.mn - 2*spkr.sd)) & (spkr.dat$ARate <= (spkr.mn + 2*spkr.sd)),]
	
	trim.dat$NSyls.sC <- scale(trim.dat$NSyls, center=T, scale=F)
	trim.dat$ARate.sC <- scale(trim.dat$ARate, center=T, scale=F)
	
	trim.dat
}

# Now to apply the function, we split the data by speaker, apply the function to the list
#  of speakers and then recombine
spkr.arlist <- split(arate2, arate2$Spkr)
ar.data <- do.call("rbind", lapply(spkr.arlist, trim.artrates))

# This trimmed about 40 utterances per speaker
mean(table(arate2$Spkr) - table(ar.data$Spkr))

# Examined the trimmed data
plot(density(ar.data$ARate))
plot(ar.data$ARate, ar.data$Dur)
plot(density(ar.data$ARate.sC))

# Articulation rate data is now pretty much set up for analysis

# Now set up pause data
pauses <- droplevels(coraal[pause.locs,])

# Remove interviewers
pauses2 <- droplevels(pauses[!is.na(pauses$Gender),])

# Examine the data a little
table(pauses2$Spkr)
head(pauses2)

# Convert pauses to millisec and log-millisec
pauses2$DurMS <- 1000*pauses2$Dur
pauses2$LogDur <- log(pauses2$DurMS)

# Calculate turn durations
pauses2$Turn.Dur <- pauses2$Turn.EnTime - pauses2$Turn.StTime
pauses2$Turn.LogDurMS <- log(1000*pauses2$Turn.Dur)
# Examine
plot(density(pauses2$Turn.Dur))
plot(density(pauses2$Turn.LogDurMS))

# Set up turn durations with the individual pauses subtracted
pauses2$Turn.Dur.woP <- pauses2$Turn.Dur - pauses2$Dur
pauses2$Turn.Log.woPDurMS <- log(1000*pauses2$Turn.Dur.woP)

# Remove any turns that are longer than 300 seconds
#  With the logic that turns this long may be weird so may exhibit unusual pause
#  Of course, this could be worth it's own investigation
length(which(pauses2$Turn.Dur >= 300))
pauses3 <- droplevels(pauses2[pauses2$Turn.Dur < 300,])

# Now for each speaker, drop pauses that are >= 2sds ± mean log duration
# and then center, per speaker, LogDur, while we're doing this; again set up a function
trim.pauses <- function(spkr.dat) {
	spkr.mn <- mean(spkr.dat$LogDur)
	spkr.sd <- sd(spkr.dat$LogDur)
	
	trim.dat <- spkr.dat[(spkr.dat$LogDur >= (spkr.mn - 2*spkr.sd)) & (spkr.dat$LogDur <= (spkr.mn + 2*spkr.sd)),]
	
	trim.dat$LogDur.sC <- scale(trim.dat$LogDur, center=T, scale=F)
	
	trim.dat
}

# And apply the function to the data for each speaker
spkr.plist <- split(pauses3, pauses3$Spkr)
p.data <- do.call("rbind", lapply(spkr.plist, trim.pauses))

# This trimmed about 30 pauses per speaker
mean(table(pauses3$Spkr) - table(p.data$Spkr))

# Examine the pause data
plot(density(p.data$LogDur))
plot(density(p.data$LogDur.sC))

# Calculate the number of utterances in each row in the data
#  There are other ways to do this, but this works
p.data$NUtts <- 1 + ceiling((p.data$Turn.EnLine - p.data$Turn.StLine)/2)

# Examine the data a little
plot(density(p.data$NUtts))
plot(density(p.data$Turn.Log.woPDurMS))
plot(density(p.data$Turn.LogDur))

# Uncomment and execute if you want to save the data for later re-use  
#  (remember we set the working directory at the start of the script, 
#  so these should all save in the folder that you set there)
#write.table(ar.data, file="CORAAL_ARdata.txt")
#write.table(p.data, file="CORAAL_Pdata.txt")

# At this point we have processed the CORAAL transcripts and generated basic data
#  for articulation rates and pauses. The rest of this script does some exploratory
#  data analysis. Some of these are the basis for the case studies in the handbook
#  chapter. Of course, feel free to explore more/differently.

# We can examine the data in various ways, e.g.
boxplot(ARate ~ Gender * Comp, data=ar.data, notch=T, las=2, xlab="", ylab="Articulation Rate")

# But we will primarily run some statistics models.

# First, load various R packages for data analysis and plotting
#  You will need to make sure you have installed these packages (and their dependencies)
library(lmerTest)
library(rms)
library(car)
library(languageR)
library(emmeans)
library(multcomp)
library(sjPlot)

# Part 1. Articulation Rate: Does region (CORAAL component) predict articulation rates?

# NSyls effect on ARate is ~logarithmic
plot(ar.data$NSyls, ar.data$ARate, pch='.')
lines(lowess(ar.data$NSyls, ar.data$ARate))

# So we'll use this as a predictor
ar.data$LogNSyls <- log(ar.data$NSyls)

# Of course, statistical modeling is not straightforward. The point of this chapter
#  is not to overview statistical best practices so this script doesn't include the
#  extensive modeling criticism that would be done around here. The following
#  model appears to be a pretty good model for the articulation rate data, testing
#  these factors. As noted in the chapter, Age does not appear to influence art. rates.
#  Model criticism did not indicate that any interactions were significant.
ar.model <- lmer(ARate ~ LogNSyls + Gender + Comp + (1+LogNSyls||Spkr), data=ar.data)
summary(ar.model)
Anova(ar.model)

# Uncomment to save the plots, rather than display them in R
#jpeg(filename = "ArtRateModel.jpg", width = 5, height = 3.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(ar.model, terms=c("Comp", "Gender"), type="pred", title="Predicted values of articulation rate", axis.title="σ/sec")
#dev.off()

#jpeg(filename = "ArtRateModel-allests.jpg", width = 5, height = 3, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(ar.model, title="Model estimates for articulation rate")
#dev.off()

# These next three plots create Figure 6 in the chapter
#jpeg(filename = "ArtRateModel-nsyls.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(ar.model, terms=c("LogNSyls"), type="pred", title="", axis.title=c("Log N Syllables", "σ/sec"))
#dev.off()
#jpeg(filename = "ArtRateModel-gender.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(ar.model, terms=c("Gender"), type="pred", title="", axis.title=c("Gender", "σ/sec"))
#dev.off()
#jpeg(filename = "ArtRateModel-comp.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(ar.model, terms=c("Comp"), type="pred", title="", axis.title=c("CORAAL Component", "σ/sec"))
#dev.off()

# Part 2. Pauses. Does region (CORAAL component) predict pause durations?

# Center Turn duration measure
p.data$Turn.LogDurMSC <- scale(p.data$Turn.Log.woPDurMS, center=T, scale=F)

# As with articulation rate, a lot of model criticism should be done. This model
#  appears quite good, with model criticism supporting the inclusion of the interaction
#  and the exclusion of Gender and Age as predictors
p.model <- lmer(LogDur ~ Comp * Turn.LogDurMSC + (1 + Turn.LogDurMSC||Spkr), data=p.data)
summary(p.model)
Anova(p.model)

# These next three plots create Figure 7 in the chapter
#jpeg(filename = "PauseModel-TurnDur.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(p.model, terms=c("Turn.LogDurMSC"), type="pred", title="", axis.title=c("Turn Dur (log millisec)", "Pause Dur (log millisec)"))
#dev.off()
#jpeg(filename = "PauseModel-comp.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(p.model, terms=c("Comp"), type="pred", title="", axis.title=c("CORAAL Component", "Pause Dur (log millisec)"))
#dev.off()
#jpeg(filename = "PauseModel-compTurndurX.jpg", width = 2.8, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(p.model, terms=c("Turn.LogDurMSC", "Comp"), type="pred", title="", axis.title=c("Turn Dur (log millisec)", "Pause Dur (log millisec)"))
#dev.off()

# Part 3. Back to Articulation Rate. What about the interviewer's gender and relationship
#  with the interviewee?

# Shorten the Interviewer relationship coding
levels(ar.data$IntRelat) <- c("None", "Acquaint.", "Close")

# Again, lots of work goes in to identifying the best model. Model criticism indicates
#  that this model is a good model and improves the fit over the simpler articulation
#  rate model above.

ar.model2 <- lmer(ARate ~ LogNSyls + Gender * IntGender + IntGender * IntRelat + (1+LogNSyls||Spkr), data=ar.data)
summary(ar.model2)
Anova(ar.model2)

# Can check Component here, which is not sig (hence not included in final model)
ar.model2.comp <- lmer(ARate ~ LogNSyls + Gender * IntGender + IntGender * IntRelat + Comp + (1+LogNSyls||Spkr), data=ar.data)
anova(ar.model2, ar.model2.comp) #p = 0.259 
Anova(ar.model2.comp)

# This creates the plots for Figure 8 in the chapter
#jpeg(filename = "ArtRateModel2-nsyls.jpg", width = 2.5, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(ar.model2, terms=c("LogNSyls"), type="pred", title="", axis.title=c("Log N Syllables", "σ/sec"))
#dev.off()
#jpeg(filename = "ArtRateModel2-genderXintgender.jpg", width = 2.8, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(ar.model2, terms=c("IntGender", "Gender"), type="pred", title="", axis.title=c("Interviewer Gender", "σ/sec"), legend.title="Speaker\nGender")
#dev.off()
#jpeg(filename = "ArtRateModel2-intgenderXintrelat.jpg", width = 2.8, height = 2.5, units = "in", res = 300, pointsize = 9, quality = 100, bg = "white", type = "quartz")
plot_model(ar.model2, terms=c("IntRelat", "IntGender"), type="pred", title="", axis.title=c("Interviewer's Relationship", "σ/sec"), legend.title="Interv'er\nGender")
#dev.off()

# And that wraps up what was included in the chapter. Lots of opportunities remain 
#  for further exploring!