###############################
# ExploreCORAALBetweenSpeakerIntervals_KendallNWAV2022.R
#  Kendall, Tyler. 2022. Interturn pausing, overlaps, and the co-construction of linguistic variation. 
#  Paper presented at NWAV 50. San Jose, CA. October 15, 2022.
#
# Code to extract between-speaker interval (BSI) data from CORAAL and to analyze and visualize the
#  data presented in Kendall (2022).
#
# Tyler Kendall
#  Last mod. October 15, 2022


# Set where you want to save things 
#  (make sure this directory exists if you want to save things)
my.wd <- "~/Documents/CORAAL-BetweenSpeakerIntervals/"
setwd(my.wd)

# Set up graphical parameters for later
par(bg="white")
defaults <- par(no.readonly=TRUE)

# Load the CORAAL Explore R functions from web
# This requires that RCurl is installed
source('http://lingtools.uoregon.edu/coraal/explorer/R/CORAAL_web.R')

# Download and set up CORAAL data and metadata
# USE COLLAPSE=T to get combined speaker intervals rather than each individual utterance
coraal.init <- coraal.webget.data(collapse=T)
meta <- coraal.webget.meta()

# Examine the data (for instance)...
head(coraal.init)

# As of CORAAL v. 2021.07 there are 7 components in CORAAL. For simplicity we drop DCA, the 1968
#  Washington DC data, for this analysis (older data, lots of interviewers and young kids...)
unique(coraal.init$Comp)
nrow(coraal.init)
coraal <- droplevels(coraal.init[coraal.init$Comp!="DCA",])
nrow(coraal)

# Add columns to the data for metadata of potential interest 
# Note that some factors, like educational background, don't end up being used
#  in this code - they either don't emerge as factors of relevance in modeling or
#  just weren't explored by Tyler in the end. They are included here in case you
#  want to explore (and to exemplify some ways to work with the data)
# Relevel and/or adjust some factors for later analysis
coraal$Gender <- as.factor(coraal.add.meta("Gender", meta=meta, src=coraal))
coraal$IntGender <- as.factor(coraal.add.meta("Interviewer.Gender", meta=meta, src=coraal))
coraal$IntEthn <- as.factor(coraal.add.meta("Interviewer.Ethnicity", meta=meta, src=coraal))
levels(coraal$IntEthn) <- c("AfrAm", "EurAm", "EurAm")

# Interviewer.Relationship has two different entries for ROC_se0_ag3_f_02 (as of v. 2021.07)
#  This will be corrected in an upcoming version but for now needs to be hand-corrected
meta[meta$CORAAL.Spkr=="ROC_se0_ag3_f_02",]
meta$Interviewer.Relationship[meta$CORAAL.Spkr=="ROC_se0_ag3_f_02"] <- "Acquaintance"

coraal$IntRelat <- as.factor(coraal.add.meta("Interviewer.Relationship", meta=meta, src=coraal))
# CORAAL v. 2021.07 introduced some irregularities with LES and VLD
#  This isn't necessary for just the four components being used here, 
#  but if you expand to the newer components you'll need this to check for them and fix
levels(coraal$IntRelat)
levels(coraal$IntRelat)[which(levels(coraal$IntRelat)=="Friend")] <- "Close Relationship"
levels(coraal$IntRelat)[which(levels(coraal$IntRelat)=="No previous relationship")] <- "No Prev. Relationship"
coraal$IntRelat <- relevel(as.factor(coraal$IntRelat), ref="No Prev. Relationship")
coraal$Interviewer <- as.factor(coraal.add.meta("Interviewer.Code", meta=meta, src=coraal))
coraal$Interviewer <- relevel(coraal$Interviewer, ref="DCB_int_01")

coraal$IsInterviewer <- is.na(coraal$Interviewer)
coraal$SameGender <- NA
coraal$SameGender[!coraal$IsInterviewer] <- coraal$Gender[!coraal$IsInterviewer]==coraal$IntGender[!coraal$IsInterviewer]
table(coraal$SameGender, useNA="always")

coraal$Age <- as.numeric(coraal.add.meta("Age", meta=meta, src=coraal))

# The CORAAL AgeGrp categories are not perfectly aligned across components (as of v. 2021.07)
#  So instead of using these:
#coraal$AgeGrp <- coraal.add.meta("Age.Group", meta=meta, src=coraal)
#coraal$AgeGrp <- relevel(as.factor(coraal$AgeGrp), ref="30 to 50")
#  Set up categorical age groups based on the individual ages:
coraal$AgeGrp2 <- cut(coraal$Age, c(10, 19, 29, 39, 49, 59, 69, 85))
table(coraal$AgeGrp2)
coraal$AgeGrp2 <- relevel(coraal$AgeGrp2, ref="(29,39]")

# Simplify the educational groups from the metadata to just three categories
coraal$EduGrp <- coraal.add.meta("Edu.Group", meta=meta, src=coraal)
coraal$EduGrp2 <- as.factor(tolower(coraal$EduGrp))
levels(coraal$EduGrp2)
levels(coraal$EduGrp2) <- c("≥College", "<High School", "≥College", "≥High School", "<High School", "≥High School", "<High School", "<High School", "≥High School", "<High School", "<High School")
coraal$EduGrp2 <- relevel(coraal$EduGrp2, ref="≥High School")

# Just get SEC from the speaker labels, but don't treat se0 as lower than se1!
coraal$SEC <- gsub("^.*_se(\\d)_.*$", "\\1", coraal$Spkr, perl=T)
coraal$SEC[nchar(coraal$SEC)!=1] <- NA
coraal$SEC <- as.factor(coraal$SEC)
coraal$SEC <- relevel(coraal$SEC, ref="1")

coraal$CorpSEC <- as.factor(paste(coraal$Comp, coraal$SEC, sep="-"))
coraal$CorpSEC <- relevel(coraal$CorpSEC, ref="DCB-1")

coraal$Comp <- relevel(as.factor(coraal$Comp), ref="DCB")

# Can take a look at social factor columns
head(unique(coraal[,c("Comp", "File", "Spkr", "Gender", "IntGender", "SameGender", "IntRelat", "Interviewer", "Age", "AgeGrp2", "EduGrp", "EduGrp2", "SEC")]), 20)

# Load the syllable counter function, based on Kendall (2013), from the web (or download it)
source("http://lingtools.uoregon.edu/coraal/explorer/R/english_syllable_counter-coraal.R")

# Make sure transcripts are character data (and not factors)
coraal$Turn.Simple <- as.character(coraal$Turn.Simple)
coraal$Turn.Content <- as.character(coraal$Turn.Content)

# Set up other relevant columns in the data
coraal$Turn.Dur <- coraal$Turn.EnTime - coraal$Turn.StTime
# syllable counting takes a while; get comfortable
coraal$NSyls <- sapply(coraal$Turn.Simple, english_syllable_count)
coraal$NUtts <- coraal$Turn.EnLine - coraal$Turn.StLine + 1
coraal$Mean.UttSyls <- coraal$NSyls/coraal$NUtts
coraal$SpkRate <- coraal$NSyls/coraal$Turn.Dur

# NOW WALK THROUGH AND CALCULATE BETWEEN-SPEAKER INTERVAL DURATIONS
#  BETWEEN TURNS; ASSIGN THE INTERVAL TO THE SPEAKER ABOUT TO SPEAK
coraal$BSI <- NA
coraal$PrevBSI <- NA
coraal$PrevSpkRate <- NA
upto <- nrow(coraal)
for (i in 2:upto) {
	if (coraal$File[i-1]==coraal$File[i]) {
		coraal$BSI[i] <- coraal$Turn.StTime[i] - coraal$Turn.EnTime[i-1]
		coraal$PrevSpkRate[i] <- coraal$SpkRate[i-1]
		coraal$PrevBSI[i] <- coraal$BSI[i-1]
	}
}

# Following Heldner & Edlund 2010 (and empirical pilots) we analyze
#  between-speaker intervals as a single distribution. But could create
#  columns to separate GAPs from Overlaps, e.g. (uncomment to use)
# head(coraal[which(coraal$BSI < 0),])
# coraal$GAP <- NA
# coraal$GAP[which(coraal$BSI >= 0)] <- coraal$BSI[which(coraal$BSI >= 0)]
# coraal$OLap <- NA
# coraal$OLap[which(coraal$BSI < 0)] <- coraal$BSI[which(coraal$BSI < 0)]

# Convert GAP and Overlap to ms (make OLap positive) and natural log transform
#  Nudge GAP values slightly to avoid any actual 0s
# coraal$lGAP <- log(1000*(coraal$GAP+0.00001))
# coraal$lOLap <- log(-1000*coraal$OLap)
# Again though note that Heldner & Edlund show analyzing these transforms is worse than
#  the whole distribution untransformed.

# Calculate speaker summary statistics, by looping through the data
coraal$FileSpkr <- abbreviate(paste(as.character(coraal$File), as.character(coraal$Spkr), sep="-"))

speakers <- unique(coraal[c("FileSpkr", "Comp", "File", "Spkr", "IsInterviewer", "Gender", "IntGender", "SameGender", "IntRelat", "Interviewer", "Age", "AgeGrp2", "EduGrp", "EduGrp2", "SEC")])
speakers$NPartics <- NA
speakers$NTurns <- NA
speakers$TotalDur <- NA
speakers$Mean.BSI <- NA
speakers$SD.BSI <- NA
speakers$SE.BSI <- NA
speakers$Other.BSI <- NA
speakers$Mean.NUtts <- NA
speakers$SD.NUtts <- NA
speakers$Mean.UttSyls <- NA
speakers$Mean.SpkRate <- NA
speakers$SD.SpkRate <- NA
speakers$Other.NUtts <- NA
speakers$Other.UttSyls <- NA
speakers$Other.SpkRate <- NA

# While doing this we'll add some additional information to the main data.frame
coraal$NPartics <- NA
coraal$Other.MnSpkRate <- NA
coraal$Other.MnBSI <- NA

for (fsi in 1:nrow(speakers)) {
	fs <- speakers$FileSpkr[fsi]
	file <- speakers$File[fsi]
	speakers$NPartics[fsi] <- length(which(speakers$File==file))
	speakers$NTurns[fsi] <- length(which(coraal$FileSpkr==fs))
	speakers$TotalDur[fsi] <- max(coraal$Turn.EnTime[which(coraal$File==file)])
	
	speakers$Mean.BSI[fsi] <- mean(coraal$BSI[coraal$FileSpkr==fs], na.rm=T)
	speakers$SD.BSI[fsi] <- sd(coraal$BSI[coraal$FileSpkr==fs], na.rm=T)
	speakers$SE.BSI[fsi] <- speakers$SD.BSI[fsi]/sqrt(speakers$NTurns[fsi])
	speakers$Other.BSI[fsi] <- mean(coraal$BSI[which(coraal$File==file & coraal$FileSpkr!=fs)], na.rm=T)
	
	speakers$Mean.NUtts[fsi] <- mean(coraal$NUtts[coraal$FileSpkr==fs], na.rm=T)
	speakers$SD.NUtts[fsi] <- sd(coraal$NUtts[coraal$FileSpkr==fs], na.rm=T)
	speakers$Other.NUtts[fsi] <- mean(coraal$NUtts[which(coraal$File==file & coraal$FileSpkr!=fs)], na.rm=T)
	speakers$Mean.UttSyls[fsi] <- mean(coraal$Mean.UttSyls[coraal$FileSpkr==fs], na.rm=T)
	speakers$Other.UttSyls[fsi] <- mean(coraal$Mean.UttSyls[which(coraal$File==file & coraal$FileSpkr!=fs)], na.rm=T)
	speakers$Mean.SpkRate[fsi] <- mean(coraal$SpkRate[coraal$FileSpkr==fs], na.rm=T)
	speakers$SD.SpkRate[fsi] <- sd(coraal$SpkRate[coraal$FileSpkr==fs], na.rm=T)
	speakers$Other.SpkRate[fsi] <- mean(coraal$SpkRate[which(coraal$File==file & coraal$FileSpkr!=fs)], na.rm=T)
			
	coraal$NPartics[coraal$FileSpkr==fs] <- speakers$NPartics[fsi]
	coraal$Other.MnSpkRate[coraal$FileSpkr==fs] <- speakers$Other.SpkRate[fsi]
	coraal$Other.MnBSI[coraal$FileSpkr==fs] <- speakers$Other.BSI[fsi]
}
head(speakers, 10)

# Focus in on just the interviewees
sonly <- droplevels(speakers[!speakers$IsInterviewer,])
dim(sonly)

# We'll focus on just interviews with 2 interlocutors for simplicity
sonly2 <- droplevels(sonly[sonly$NPartics==2,])
dim(sonly2)

# And generate a 3 x 4 grid of interval distributions for a sample of interviews
set.seed(101010)
samp.files <- sample(sonly2$File[sonly2$NTurns>200], 12)

jpeg(filename = "sampledistributions.jpg", width = 8, height = 6, units = "in", res = 300, pointsize = 8, quality = 100, bg = "white", type = "quartz")
par(mfrow=c(3,4), mai=c(0.2, 0.4, 0.4, 0.1))
first <- TRUE
for (s in samp.files) {
	dat <- droplevels(coraal[coraal$File==s & !is.na(coraal$BSI),])
	plot(density(dat$BSI[!dat$IsInterviewer]), xlim=c(-3, 3), ylim=c(0, 1.8), xlab="", col="blue", main=s)
	lines(density(dat$BSI[dat$IsInterviewer]), col="red", lty=2)
	#legend("topright", c(unique(dat$Spkr[!is.na(dat$IntGender)]), unique(dat$Spkr[is.na(dat$IntGender)])), lty=c(1,2), col=c("blue", "red"), cex=0.6, bty="n")
	if (first) {
		legend("topright", c("Int'EE", "Int'er"), lty=1:2, col=c("blue", "red"), ncol=1, cex=1.2, bty="n")
		first <- FALSE
	}
	abline(v=mean(dat$BSI[!is.na(dat$IntGender)]), col="blue")
	abline(v=mean(dat$BSI[is.na(dat$IntGender)]), col="red", lty=2)
}
par(defaults)
dev.off()

# Plot (sample) individual interviews
jpeg(filename = "sampleinterviewpatterns.jpg", width = 8, height = 6, units = "in", res = 300, pointsize = 8, quality = 100, bg = "white", type = "quartz")
par(mfrow=c(3,4), mai=c(0.4, 0.4, 0.3, 0.06))
first <- TRUE
for (s in samp.files) {
	dat <- droplevels(coraal[coraal$File==s & !is.na(coraal$BSI),])
	plot(dat$Turn.StTime[!dat$IsInterviewer], dat$BSI[!dat$IsInterviewer], ylim=c(-2, 4), xlab="Time (s)", ylab="BSI (s)", col="lightblue", main=s)
	points(dat$Turn.StTime[dat$IsInterviewer], dat$BSI[dat$IsInterviewer], col="pink", pch=2)
	if (first) {
		legend("topright", c("InterviewEE", "Interviewer"), pch=1:2, lty=1:2, lwd=2, col=c("blue", "red"), ncol=1, cex=1.2, bty="n")
		first <- FALSE
	}
	lines(lowess(dat$Turn.StTime[!dat$IsInterviewer], dat$BSI[!dat$IsInterviewer]), col="blue", lwd=2)
	lines(lowess(dat$Turn.StTime[dat$IsInterviewer], dat$BSI[dat$IsInterviewer]), col="red", lty=2, lwd=2)
}
par(defaults)
dev.off()

(ct <- cor.test(sonly2$Mean.BSI, sonly2$Other.BSI))
jpeg(filename = "speakercorrelations.jpg", width = 4, height = 4, units = "in", res = 300, pointsize = 8, quality = 100, bg = "white", type = "quartz")
plot(sonly2$Mean.BSI, sonly2$Other.BSI, xlab="InterviewEE Mean BSI Duration (sec)", ylab="Interviewer Mean BSI Duration (sec)")
abline(lsfit(sonly2$Mean.BSI, sonly2$Other.BSI))
# p value is tiny and will appear as 0 if rounded, so just write out a < value
legend("topleft", paste("r = ", round(ct$estimate, 3), "; p < 0.000001", sep=""), cex=1.8, bty="n")
dev.off()

# Statistical modeling
#  You'll need to install these packages if you don't have them
library(lmerTest)
library(car)
library(sjPlot)

fdata <- droplevels(coraal[!is.na(coraal$BSI) & !is.na(coraal$PrevBSI) & (coraal$NPartics==2),])
dim(fdata)

fdata <- droplevels(fdata[!fdata$IsInterviewer,])
dim(fdata)

# Center predictors
fdata$AgeC <- scale(fdata$Age, center=T, scale=F)
fdata$NUttsC <- scale(fdata$NUtts, center=T, scale=F)
fdata$SpkRateC <- scale(fdata$SpkRate, center=T, scale=F)
fdata$PrevSpkRateC <- scale(fdata$PrevSpkRate, center=T, scale=F)
fdata$PrevBSIC <- scale(fdata$PrevBSI, center=T, scale=F)
fdata$NParticsC <- scale(fdata$NPartics, center=T, scale=F)
fdata$Other.MnBSIC <- scale(fdata$Other.MnBSI, center=T, scale=F)
fdata$Other.MnSpkRateC <- scale(fdata$Other.MnSpkRate, center=T, scale=F)
fdata$Time <- scale(fdata$Turn.StTime, center=T, scale=T)

fdata$SameGenderC <- scale(as.numeric(fdata$SameGender), center=T, scale=F)
fdata$GenderC <- scale(as.numeric(fdata$Gender), center=T, scale=F)
fdata$IntGenderC <- scale(as.numeric(fdata$IntGender), center=T, scale=F)

# Various initial modeling attempts led to this as a good model
lmer.mod <- lmer(BSI ~ SameGenderC + AgeC + NUttsC + SpkRateC + Time*(Other.MnBSIC + Other.MnSpkRateC + PrevBSIC) + (1|Spkr) + (1|File), data=fdata)
summary(lmer.mod)
Anova(lmer.mod)

# Some testing of different random effect structures
lmer.mod.s <- lmer(BSI ~ SameGenderC + AgeC + NUttsC + SpkRateC + Time*(Other.MnBSIC + Other.MnSpkRateC + PrevBSIC) + (1|Spkr), data=fdata)
lmer.mod.f <- lmer(BSI ~ SameGenderC + AgeC + NUttsC + SpkRateC + Time*(Other.MnBSIC + Other.MnSpkRateC + PrevBSIC) + (1|File), data=fdata)
anova(lmer.mod, lmer.mod.s)
anova(lmer.mod, lmer.mod.f) # Both random intercepts improve model but File is a little more important than Spkr

lmer.mod.1 <- lmer(BSI ~ SameGenderC + AgeC + NUttsC + SpkRateC + Time*(Other.MnBSIC + Other.MnSpkRateC + PrevBSIC) + (1|Spkr) + (1+Other.MnBSIC||File), data=fdata)
anova(lmer.mod, lmer.mod.1)
lmer.mod.2 <- lmer(BSI ~ SameGenderC + AgeC + NUttsC + SpkRateC + Time*(Other.MnBSIC + Other.MnSpkRateC + PrevBSIC) + (1|Spkr) + (1 + SameGenderC + Other.MnBSIC||File), data=fdata)
anova(lmer.mod.1, lmer.mod.2)
summary(lmer.mod.2)
Anova(lmer.mod.2)

# Many more complex models don't converge or throw other errors, but overall yield the same general patterns
#  So presenting lmer.mod.3 in the talk
lmer.mod.3 <- lmer(BSI ~ SameGenderC + AgeC + NUttsC + SpkRateC + Time*(Other.MnBSIC + Other.MnSpkRateC + PrevBSIC) + (1|Spkr) + (1 + SameGenderC + Other.MnBSIC + PrevBSIC||File), data=fdata)
anova(lmer.mod.2, lmer.mod.3)
summary(lmer.mod.3)
Anova(lmer.mod.3)

# Plot summary of model estimates
jpeg(filename = "model_mod3_effects.jpg", width = 5, height = 6, units = "in", res = 300, pointsize = 8, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, type="est", sort.est=T, title="Model estimates for BSI Durations", show.values=T)
dev.off()

# Plot main effects that the NWAV talk might want to discuss
jpeg(filename = "model_mod3_OtherMnBSI.jpg", width = 4, height = 2.5, units = "in", res = 300, pointsize = 6, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, terms=c("Other.MnBSIC"), type="pred", title="Model estimates for BSI Durations", axis.lim=list(c(-1.5, 2.5), c(-2, 2)))
dev.off()
jpeg(filename = "model_mod3_SpkRate.jpg", width = 4, height = 2.5, units = "in", res = 300, pointsize = 6, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, terms=c("SpkRateC"), type="pred", title="Model estimates for BSI Durations", axis.lim=list(c(-5, 15), c(-2, 2)))
dev.off()
jpeg(filename = "model_mod3_Age.jpg", width = 4, height = 2.5, units = "in", res = 300, pointsize = 6, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, terms=c("AgeC"), type="pred", title="Model estimates for BSI Durations", axis.lim=list(c(-30, 45), c(-2, 2)))
dev.off()
jpeg(filename = "model_mod3_SameGender.jpg", width = 4, height = 2.5, units = "in", res = 300, pointsize = 6, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, terms=c("SameGenderC"), type="pred", title="Model estimates for BSI Durations", axis.lim=list(c(-0.62, 0.42), c(-2, 2)))
dev.off()
jpeg(filename = "model_mod3_PrevBSI.jpg", width = 4, height = 2.5, units = "in", res = 300, pointsize = 6, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, terms=c("PrevBSIC"), type="pred", title="Model estimates for BSI Durations", axis.lim=list(c(-5, 5), c(-2, 2)))
dev.off()

# Plot Time interactions
jpeg(filename = "model_mod3_OtherMnBSIInteraction.jpg", width = 4, height = 2.5, units = "in", res = 300, pointsize = 6, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, terms=c("Other.MnBSIC", "Time"), type="pred", title="Model estimates for BSI Durations", axis.lim=list(c(-1.5, 2.5), c(-2, 2)))
dev.off()
jpeg(filename = "model_mod3_OtherMnSpkRateInteraction.jpg", width = 4, height = 2.5, units = "in", res = 300, pointsize = 6, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, terms=c("Other.MnSpkRateC", "Time"), type="pred", title="Model estimates for BSI Durations", axis.lim=list(c(-1.5, 1.5), c(-2, 2)))
dev.off()
jpeg(filename = "model_mod3_PrevBSIInteraction.jpg", width = 4, height = 2.5, units = "in", res = 300, pointsize = 6, quality = 100, bg = "white", type = "quartz")
plot_model(lmer.mod.3, terms=c("PrevBSIC", "Time"), type="pred", title="Model estimates for BSI Durations", axis.lim=list(c(-5, 5), c(-2, 2)))
dev.off()

# Back to modeling, jump to something maximal to see if/how it differs from the model above
lmer.rmax <- lmer(BSI ~ SameGenderC + AgeC + NUttsC + SpkRateC + Time*(Other.MnBSIC + Other.MnSpkRateC + PrevBSIC) + (1|Spkr) + (1 + SameGenderC + SpkRateC + NUttsC + Time*(Other.MnBSIC + Other.MnSpkRateC + PrevBSIC)||File), data=fdata)
summary(lmer.rmax) # interactions are non sig but model didn't converge
Anova(lmer.rmax)
plot_model(lmer.rmax, type="est", sort.est=T, title="Model estimates for Between-Speaker Intervals (MAX model)")
# Models are very similar except that the Time interactions move around a little and have much bigger CIs in
#  MAX model. Time interactions are not sig in max model

# Look at SameGender effect empirically (since it can be easy to lose track of 
#  the meaning of centered categorical factors)
boxplot(BSI ~ SameGender, data=fdata, ylim=c(-1, 1), notch=T)

# Turn to a simple linear model on the speaker means
#  mostly just to identify how well a model can fit:
library(rms)

dd <- datadist(sonly2)
options(datadist=dd)

lin.mod <- ols(Mean.BSI ~ SameGender + Mean.UttSyls + Other.SpkRate + Other.BSI, data=sonly2)
print(lin.mod)
anova(lin.mod)
fastbw(lin.mod)


# Following Edlund et al. 2009, creating a moving window of 20 points to smooth values, on a per speaker basis

smoothed <- NA
first <- TRUE
for (fs in unique(speakers$FileSpkr)) {
	fsdat <- droplevels(coraal[coraal$FileSpkr==fs,])
	if (nrow(fsdat) > 30) {
		i <- 1
		for (j in 21:nrow(fsdat)) {
			temp <- data.frame(FileSpkr = fs, File = unique(coraal$File[coraal$FileSpkr==fs]), Spkr = unique(coraal$Spkr[coraal$FileSpkr==fs]), NPartics = unique(coraal$NPartics[coraal$FileSpkr==fs]), IsInterviewer = unique(coraal$IsInterviewer[coraal$FileSpkr==fs]), TimePoint = i, Duration = fsdat$Turn.StTime[j] - fsdat$Turn.StTime[j-20], MidTime = mean(range(fsdat$Turn.StTime[j-20], fsdat$Turn.StTime[j])), SmBSI = mean(fsdat$BSI[(j-20):j], na.rm=T))
			i <- i + 1
			if (first) {
				smoothed <- temp
				first <- FALSE
			} else {
				smoothed <- rbind(smoothed, temp)
			}
		}
	}
}

# Plot (sample) individual interviews, smoothed
jpeg(filename = "smoothedsampleinterviewpatterns.jpg", width = 8, height = 6, units = "in", res = 300, pointsize = 8, quality = 100, bg = "white", type = "quartz")
par(mfrow=c(3,4), mai=c(0.4, 0.4, 0.3, 0.06))
first <- TRUE
for (s in samp.files) {
	dat <- droplevels(smoothed[smoothed$File==s,])
	plot(dat$MidTime[!dat$IsInterviewer], dat$SmBSI[!dat$IsInterviewer], ylim=c(-1.8, 2.2), xlab="Time (s)", ylab="Smoothed BSI (s)", col="blue", main=s, typ="l")
	lines(dat$MidTime[dat$IsInterviewer], dat$SmBSI[dat$IsInterviewer], col="red", lty=2)
	#legend("topright", c(unique(dat$Spkr[!is.na(dat$IntGender)]), unique(dat$Spkr[is.na(dat$IntGender)])), lty=c(1,2), col=c("blue", "red"), cex=0.6, bty="n")
	if (first) {
		legend("topright", c("InterviewEE", "Interviewer"), lty=1:2, lwd=2, col=c("blue", "red"), ncol=1, cex=1.4, bty="n")
		first <- FALSE
	}
	length(dat$SmBSI[!dat$IsInterviewer])
	length(dat$SmBSI[dat$IsInterviewer])
	sct <- cor.test(dat$SmBSI[!dat$IsInterviewer][1:min(length(dat$SmBSI[!dat$IsInterviewer]), length(dat$SmBSI[dat$IsInterviewer]))], dat$SmBSI[dat$IsInterviewer][1:min(length(dat$SmBSI[!dat$IsInterviewer]), length(dat$SmBSI[dat$IsInterviewer]))])
	legend("bottomleft", paste("r = ", round(sct$estimate, 3), "; p = ", round(sct$p.val, 6), sep=""), cex=1.2, bty="n")
	#lines(lowess(dat$Turn.TimePoint[!dat$IsInterviewer], dat$SmBSI[!dat$IsInterviewer]), col="blue", lwd=2)
	#lines(lowess(dat$Turn.TimePoint[dat$IsInterviewer], dat$SmBSI[dat$IsInterviewer]), col="red", lty=2, lwd=2)
}
par(defaults)
dev.off()

smsummary <- unique(smoothed[!smoothed$IsInterview & smoothed$NPartics==2, c("File", "Spkr", "NPartics")])
smsummary$R <- NA
smsummary$P <- NA
for (f in unique(smsummary$File)) {
	spkr <- smsummary$Spkr[smsummary$File==f][1]
	dat <- droplevels(smoothed[smoothed$File==f,])
	sct <- cor.test(dat$SmBSI[dat$Spkr==spkr][1:min(length(dat$SmBSI[dat$Spkr==spkr]), length(dat$SmBSI[dat$Spkr!=spkr]))], dat$SmBSI[dat$Spkr!=spkr][1:min(length(dat$SmBSI[dat$Spkr==spkr]), length(dat$SmBSI[dat$Spkr!=spkr]))])
	smsummary$R[smsummary$File==f] <- sct$estimate
	smsummary$P[smsummary$File==f] <- sct$p.val
}

# How many of these smoothed interviews show correlations among interviewees and interviewers?
dim(smsummary)
table(smsummary$P < 0.05)
prop.table(table(smsummary$P < 0.05))

table(smsummary$P < 0.01)
prop.table(table(smsummary$P < 0.01))

table(smsummary$P < 0.001)
prop.table(table(smsummary$P < 0.001))

# And how many are positive vs. negative correlations?
table((smsummary$R > 0) & (smsummary$P < 0.05))
prop.table(table((smsummary$R > 0) & (smsummary$P < 0.05)))

table((smsummary$R < 0) & (smsummary$P < 0.05))
prop.table(table((smsummary$R < 0) & (smsummary$P < 0.05)))