###### CORAAL_web.R # # To download CORAAL and some helper functions, just source() the file. E.g.: # > source("http://lingtools.uoregon.edu/coraal/explorer/R/CORAAL_web.R") # # Originally created for CORAAL v. 2018.10.06 # Updated May 2020 for CORAAL v. 2020.05 # Updated July 2021 for CORAAL v. 2021.07 # Updated July 2023 for CORAAL v. 2023.06 # T. Kendall July 21, 2023 ###### HELPER FUNCTIONS FOR BUILDING CORAAL DATA STRUCTURE IN R # This function pastes together utterances into turns paste.lines <- function(turn.lines) { nrows <- nrow(turn.lines) content <- rep(paste(turn.lines$Content, collapse=" "), times=nrows) simple <- gsub("\\(pause .*?\\)", "|", content, perl=T) # replace pauses with single | character simple <- gsub("\\s?\\(.*?\\)\\s?", " ", simple, perl=T) # remove line-level comments stline <- rep(min(turn.lines$Line), times=nrows) enline <- rep(max(turn.lines$Line), times=nrows) sttime <- rep(min(turn.lines$StTime), times=nrows) entime <- rep(max(turn.lines$EnTime), times=nrows) utimes <- rep(paste(round(turn.lines$StTime, 2), collapse="|"), times=nrows) data.frame(Turn.StLine = stline, Turn.StTime = sttime, Turn.Content = content, Turn.Simple = simple, Turn.EnTime = entime, Turn.EnLine = enline, Turn.Utt.Times = utimes) } # This function breaks up a transcript into chunks organized by turns split.trans <- function(trans) { turn.no <- rep(0, times=nrow(trans)) cspkr <- "" for (i in 1:nrow(trans)) { if (trans$Spkr[i]!=cspkr) { cspkr <- trans$Spkr[i] turn.no[i:length(turn.no)] <- turn.no[i]+1 } } spkr.turn <- paste(trans$Spkr, turn.no, sep="-") split(trans, turn.no) } # This function reads in a basic CORAAL text file and builds an R data.frame for it # Default is to build turn-level information, but turns=F will skip that part load.file <- function(f, turns=TRUE) { cat("\nDownloading and processing CORAAL transcript: ", basename(f), "...", sep="") inp <- read.delim(f, quote="") if (turns) { turns.list <- split.trans(inp) turns.df <- do.call("rbind", lapply(turns.list, paste.lines)) out <- data.frame(Comp = gsub("^(\\w+?)_.*", "\\1", basename(f), perl=T), File = gsub("\\.txt", "", basename(f), perl=T), inp, turns.df) } else { out <- data.frame(Comp = gsub("^(\\w+?)_.*", "\\1", basename(f), perl=T), File = gsub("\\.txt", "", basename(f), perl=T), inp) } out } ###### MAIN FUNCTIONS FOR BUILDING CORAAL DATA STRUCTURE IN R # This function loads CORAAL, or a subset of CORAAL, over the web # By default creates a "big" version of CORAAL where each individual transcript line # (utterance) also contains a copy of the speaker's entire turn; this can be turned # off with turns=FALSE . Set turns=TRUE and collapse=TRUE to remove individual # utterances and only get one row per turn # Requires RCurl library to be installed! coraal.webget.data <- function(subcorpus.only=NA, turns=TRUE, collapse=FALSE, no.pauses=FALSE) { library(RCurl) txtfiles.url <- paste("http://lingtools.uoregon.edu/coraal/explorer/files/TXTs/", sep="") file.list.html <- getURL(txtfiles.url) file.list.list <- unlist(strsplit(file.list.html, "", perl=T)) file.list <- gsub("(.*txt).*", "\\1", grep(".*txt.*", file.list.list, perl=T, value=T), perl=T) if (!is.na(subcorpus.only) & subcorpus.only %in% c("ATL", "DC", "DCA", "DCB", "DTA", "LES", "PRV", "ROC", "VLD")) { file.list <- grep(paste("^",subcorpus.only,sep=""), file.list, perl=T, value=T) } urls.to.load <- paste(txtfiles.url, file.list, sep="") out <- do.call("rbind", lapply(urls.to.load, load.file, turns)) if (no.pauses) { out <- droplevels(out[-grep("^\\(pause .*?\\)$", out$Content, perl=T),]) } if (collapse) { out <- unique(out[,c("Comp", "File", "Spkr", grep("Turn", names(out), perl=T, value=T))]) } out } # This function loads metadata for CORAAL, or a subset of CORAAL, over the web # Note that only columns shared across individual metadata files are included in the output, # so to retrieve all metadata columns it is best to run this for individual subcomponents # one-at-a-time, e.g. > coraal.webget.meta("DCA") # Requires RCurl library to be installed! coraal.webget.meta <- function(subcorpus.only=NA) { library(RCurl) metafiles.url <- paste("http://lingtools.uoregon.edu/coraal/explorer/files/METAs/", sep="") meta.list.html <- getURL(metafiles.url) meta.list.list <- unlist(strsplit(meta.list.html, "", perl=T)) meta.list <- gsub(" 1) { for (i in 2:length(metalist)) { fields.in.all <- intersect(fields.in.all, names(metalist[[i]])) } } # This temporary function extracts from a data.frame only the columns specified select.cols <- function(df.obj, select.cols) { df.obj[,select.cols] } # Output the combined metadata data.frame containing only the shared columns do.call("rbind", lapply(metalist, select.cols, select.cols=fields.in.all)) } ###### FUNCTIONS FOR MANIPULATING AND SEARCHING CORAAL DATA STRUCTURE IN R # This function filters CORAAL by subcomponent (DCA, DCB, DTA, LES, PRV, ROC, ATL, VLD, # or DC for both DCA & DCB) # Note default arguments assume CORAAL data.frame is called 'coraal' and metadata is # called 'metadat' # but these can be overridden via specifying 'meta' and 'src' in function call # Can also set output.meta.cols to TRUE to have basic metadata fields added to the output # data structure coraal.filter <- function(subcorpus = "", se.grp = "", age.grp = "", gender = "", meta = metadat, src = coraal, output.meta.cols = F) { corpus <- src if (toupper(subcorpus) %in% c("ATL", "DCA", "DCB", "DTA", "LES", "PRV", "ROC", "VLD")) { corpus <- droplevels(corpus[corpus$Comp==subcorpus,]) } else if (toupper(subcorpus)=="DC") { corpus <- droplevels(corpus[corpus$Comp %in% c("DCA", "DCB"),]) } if (se.grp %in% c(0:3, "se0", "se1", "se2", "se3")) { se <- gsub("^.*(\\d).*$", "\\1", se.grp, perl=T) corpus <- droplevels(corpus[grep(paste("_se", se, "_", sep=""), corpus$Spkr, perl=T),]) } if (age.grp %in% c(1:4, "ag1", "ag2", "ag3", "ag4")) { ag <- gsub("^.*(\\d).*$", "\\1", age.grp, perl=T) corpus <- droplevels(corpus[grep(paste("_ag", ag, "_", sep=""), corpus$Spkr, perl=T),]) } if (gender %in% c("m", "f", "n", "male", "female", "non-binary")) { gen <- gsub("^(\\w).*$", "\\1", gender, perl=T) corpus <- droplevels(corpus[grep(paste("_ag\\d_", gen, "_", sep=""), corpus$Spkr, perl=T),]) } if (output.meta.cols) { corpus$SE.Grp <- NA corpus$SE.Grp[grep("_se\\d_", corpus$Spkr, perl=T)] <- toupper(gsub("^.*(se\\d).*$", "\\1", corpus$Spkr[grep("_se\\d_", corpus$Spkr, perl=T)])) corpus$Age.Grp <- NA corpus$Age.Grp[grep("_ag\\d_", corpus$Spkr, perl=T)] <- toupper(gsub("^.*(ag\\d).*$", "\\1", corpus$Spkr[grep("_ag\\d_", corpus$Spkr, perl=T)])) corpus$Gender <- NA corpus$Gender[grep("_[fmn]_", corpus$Spkr, perl=T)] <- toupper(gsub("^.*ag\\d_([fmn])_.*$", "\\1", corpus$Spkr[grep("_[fmn]_", corpus$Spkr, perl=T)])) corpus$Gender <- as.factor(corpus$Gender) levels(corpus$Gender)[which(levels(corpus$Gender)=="F")] <- "Female" levels(corpus$Gender)[which(levels(corpus$Gender)=="M")] <- "Male" levels(corpus$Gender)[which(levels(corpus$Gender)=="N")] <- "Non-binary" corpus$Gender <- as.character(corpus$Gender) } corpus } # This function is available to retreive metadata fields from the metadata (meta) and to # add the fields as columns to the corpus data (src) coraal.add.meta <- function(meta.col, meta = metadat, src = coraal) { get.col.for.spkr <- function(spkr, meta.col) { if (length(which(meta$CORAAL.Spkr==as.character(spkr)))>0) { rslt <- unique(as.character(meta[meta$CORAAL.Spkr==as.character(spkr), which(names(meta)==meta.col)])) } else { rslt <- NA } rslt } sapply(src$Spkr, get.col.for.spkr, meta.col=meta.col) } # This function retrieves individual CORAAL utterances (or any data.frame with 'Content' col) # that match a search term (using grep; so can use R formatted regular expressions) # Note default arguments strip turn-level columns from output (if they exist) but this can # be turned off using remove.turn.cols = FALSE coraal.search <- function(search, ignore.case = TRUE, remove.turn.cols = TRUE, sample.at = NA, src = coraal) { matches <- src[grep(search, src$Content, ignore.case=ignore.case, perl=T),] results <- NA if (nrow(matches) < 1) { results <- paste("No matches for *", search, "* were found.", sep="") } else if (!is.na(sample.at) & (nrow(matches) >= sample.at)) { results <- matches[sample(1:nrow(matches), sample.at),] } else { results <- matches } if (remove.turn.cols & is.data.frame(results)) { results <- results[,-grep("Turn\\.", names(results), perl=T)] } results } # This function retrieves speaker turns in CORAAL (or any data.frame with 'Turn.Simple' col) # that match a search term (using grep; so search can use R formatted regular expressions) coraal.search.turn <- function(search, ignore.case = TRUE, src = coraal) { matches <- src[grep(search, src$Turn.Simple, ignore.case=ignore.case, perl=T),] results <- NA if (nrow(matches) < 1) { results <- paste("No matches for *", gsub("\\[.*?\\]\\+", " ", search, perl=T), "* were found.", sep="") } else { results <- matches } results } # This function is a helper function for coraal.concord # It retrieves the individual utterance locations (line) that match a search term # (using grep). This is so concordances for turns can attempt to indicate which actual lines # within the turn are the location of the match. coraal.findline <- function(in.lines, search, ignore.case = TRUE) { pause.lines <- grep("^\\(pause .*?\\)$", in.lines$Content, perl=T) where <- in.lines if (length(pause.lines) > 0) { where <- in.lines[-pause.lines,] } matches <- grep(search, where$Content, ignore.case, perl=T) results <- where[1,] if (length(matches)==1) { results <- where[matches,] } results } # This function uses coraal.search.turn() function to search for matches and then # format search results. # This is used by the CORAAL Explorer website to generate results for web-presentation. coraal.concord <- function(search.inp, ignore.ps = TRUE, ignore.case = TRUE, sample.at = NA, limit.nchar = NA, all.columns = FALSE, src = coraal) { search <- search.inp if (ignore.ps) { search <- gsub("\\s+", "[-\\\\| \\\\.\\\\,\\\\[\\\\]\\\\?/]+", search.inp, perl=T) } matches <- coraal.search.turn(search, ignore.case, src) if (is.data.frame(matches)) { # If the data.frame has both utterance (Content) and turn-level info (Turn.Content) # then we need coraal.findline() to trim the extra rows from the results if (length(grep("^Content$", names(src), ignore.case=ignore.case, perl=T))==1) { results <- do.call("rbind", lapply(split(matches, paste(matches$File, matches$Turn.StLine, sep="-")), coraal.findline, search=search, ignore.case=ignore.case)) results$HitTime <- results$StTime results$PreMatch <- gsub(paste("^(.*?)", search, ".*$", sep=""), "\\1", results$Turn.Simple, ignore.case=ignore.case, perl=T) } else { results <- matches results$PreMatch <- gsub(paste("^(.*?)", search, ".*$", sep=""), "\\1", results$Turn.Simple, ignore.case=ignore.case, perl=T) utt.times.list <- strsplit(as.character(results$Turn.Utt.Times), "\\|") n.utt <- (nchar(gsub("[^|]", "", results$PreMatch, perl=T)) * 2) + 1 results$Line <- results$Turn.StLine+n.utt-1 pick.times <- function(ut.list, pos.vec) { times <- rep(NA, times=length(ut.list)) for (i in 1:length(ut.list)) { times[i] <- ut.list[[i]][pos.vec[i]] } times } results$HitTime <- pick.times(utt.times.list, n.utt) } results$HitNo <- 1:nrow(results) results$OfHits<- nrow(results) if (!is.na(limit.nchar)) { results$PreMatch[nchar(results$PreMatch) > limit.nchar] <- paste("...", substr(results$PreMatch[nchar(results$PreMatch) > limit.nchar], nchar(results$PreMatch[nchar(results$PreMatch) > limit.nchar])-limit.nchar, nchar(results$PreMatch[nchar(results$PreMatch) > limit.nchar])), sep="") } results$Match <- gsub(paste("^.*?(", search, ").*$", sep=""), "\\1", results$Turn.Simple, ignore.case=ignore.case, perl=T) results$PostMatch <- gsub(paste("^.*?", search, "(.*)$", sep=""), "\\1", results$Turn.Simple, ignore.case=ignore.case, perl=T) if (!is.na(limit.nchar)) { results$PostMatch[nchar(results$PostMatch) > limit.nchar] <- paste(substr(results$PostMatch[nchar(results$PostMatch) > limit.nchar], 1, limit.nchar), "...", sep="") } if (!is.na(sample.at) & (nrow(results) >= sample.at)) { results <- results[sample(1:nrow(results), sample.at),] } if (!all.columns) { results <- results[,c("Comp", "File", "Spkr", "Turn.StLine", "Turn.StTime", "PreMatch", "Match", "PostMatch", "Turn.EnTime", "Turn.EnLine", "Line", "HitTime", "HitNo", "OfHits")] } } else { # there were no matches, just a string saying no matches results <- matches } results } # This function outputs some basic info about this code for users coraal.help <- function() { cat("To download and build CORAAL as an R data structure (data.frame), execute:\n > coraal <- coraal.webget.data()\nThere are three main versions you can create this way:\n 1. coraal.webget.data() will create a large data.frame that contains individual utterances as lines (rows) but each line will also contain the entire turn's transcript.\n 2. coraal.webget.data(turns=FALSE) will concatenate each of the transcript files together to form a long data.frame with each utterance as a row. (This works with coraal.search() function.)\n 3. coraal.webget.data(collapse=TRUE) will create a data.frame with only turn-level information. (This works with coraal.search.turn() and coraal.concord() functions.)\nYou can add the argument no.pauses=TRUE to any of the above to strip rows that are pauses.\nTo download and build CORAAL metadata as an R data structure (data.frame), execute:\n > metadat <- coraal.webget.meta()\nFor both functions, you can include a component prefix (ATL, DCA, DCB, DTA, LES, PRV, ROC, VLD, or DC for both DCA & DCB) to download just a part of CORAAL, e.g.:\n > coraal.dc <- coraal.webget.data(\"DC\")\nNb. You need the RCurl package and its dependecies installed to web-get CORAAL.\n\n") cat("To use CORAAL functions to search CORAAL, try e.g.\n > gogo.utts <- coraal.search(\"go go\", src=coraal)\nAnd to search individual lines (utterances). You can search across turns as well, e.g.\n > gogo.trns <- coraal.search.turn(\"go go\", src=coraal)\nPlease note that this retrieves rows for every utterance of matching turns. Use coraal.concord() to find and format turn-level search results. E.g.\n > gogo.conc <- coraal.concord(\"go go\", src=coraal)\nFinally, coraal.filter() is available to filter the data by some interviewee demographics.\nIf you call the CORAAL data 'coraal' and metadata 'metadat' as in the above examples, you do not need to specify the src and meta arguments for many functions.\n\n") cat("Type coraal.help() to see this again.") } cat("CORAAL functions loaded.\n\n") coraal.help()