############## SUPPORT VECTOR MACHINES AS A SOCIOLINGUISTIC TOOL ##############
# Tyler Kendall, University of Oregon
# Computational Sociolinguistics Workshop 
# NWAV 47 | NYU | Oct 18, 2018
#
# Example Code for running a Support Vector Machine to classify speaker
#  social categories (e.g. gender and age) from word usage, and then to 
#  poke at what words are predictive (of speaker genders and age groups).

#### Part 0. Load library

# Load e1071 library (R interface for libsvm; library must be installed first of course)
# See https://cran.r-project.org/web/packages/e1071/vignettes/svmdoc.pdf
library(e1071)


#### Part 1. Get Corpus of Regional African American Language DCB component data

# Download CORAAL:DCB through its web R functions. 
# This requires that the RCurl library is installed.
source("http://lingtools.uoregon.edu/coraal/explorer/R/CORAAL_web.R")

coraal.dcb <- coraal.webget.data("DCB", collapse=T)
meta.dcb <- coraal.webget.meta("DCB")

# We can inspect the data and metadata of course, e.g.
dim(coraal.dcb)
head(coraal.dcb)
head(meta.dcb)


#### Part 2. Prepare CORAAL data (add metadata for classifying and clean up text)

# We can add some metadata columns to coraal.dcb (this isn't strictly necessary)

# We'll look at speaker gender
coraal.dcb$Gender <- coraal.add.meta("Gender", meta=meta.dcb, src=coraal.dcb)

# And speaker age group (mostly as a binary < 30 vs. ≥ 30)
coraal.dcb$Age.Group <- coraal.add.meta("Age.Group", meta=meta.dcb, src=coraal.dcb)
coraal.dcb$Under30 <- coraal.dcb$Age.Group %in% c("-19", "20 to 29")

# And we can also try things like what region (quadrant) of DC a speaker is from
coraal.dcb$Region.in.City <- coraal.add.meta("Region.in.City", meta=meta.dcb, src=coraal.dcb)
coraal.dcb$Region.SE <- coraal.dcb$Region.in.City=="SE"

# Inspect these social factors that we'll try to "learn" with SVMs
unique(coraal.dcb[,c("Spkr", "Gender", "Age.Group", "Under30", "Region.in.City", "Region.SE")])

# Drop turns from speakers with NAs for social factors (just use Gender to catch this...
#  this removes 'misc' speakers and interviewers)
dcb <- droplevels(coraal.dcb[!is.na(coraal.dcb$Gender),])
dim(dcb)

# Now we will clean up the transcripts a little; make all text lowercase, 
#  remove special characters, notes, etc., and put # to mark turn boundaries
#  (turn boundaries don't matter for word-level analysis, but we'll do it here anyway)
# Cleaning up the data so we only have tokens we want to think of as words is
#  very important. The code here isn't doing everything we probably would want to... So
#  we probably want to remove other special conventions in the transcripts, e.g. could do: 
# dcb$Turn.Cl <- tolower(gsub("/(inaudible|unintelligible)/", "", dcb$Turn.Simple, perl=T))
dcb$Turn.Cl <- tolower(dcb$Turn.Simple) 
dcb$Turn.Cl <- paste("#", gsub("[-\\./,\\?!\\d\\[\\]]", "", dcb$Turn.Cl, perl=T), "#", sep=" ")

# We'll keep apostophes but we'll convert them to _ (later column names won't like ')
dcb$Turn.Cl <- gsub("'", "_", dcb$Turn.Cl, perl=T)

# We can inspect our current data
head(dcb)
dcb[sample(nrow(dcb), 1),]


#### Part 3. Generate a big matrix to characterize the entire dataset's word use

# First create a set of demographic info we want to classify, in its own data.frame
#  This will give us one row per spkr and be ordered in the same way as our big matrix
#  of words.
sinfo <- unique(dcb[,c("Spkr", "Gender", "Age.Group", "Under30", "Region.in.City", "Region.SE")])
table(sinfo$Under30)
table(sinfo$Region.SE)

# Then create a list of word vectors, one list element for each speaker (crucially,
#  speakers here are in the same order as in the 'sinfo' data.frame created just above)
wrds <- list(length = nrow(sinfo))
for (i in 1:length(sinfo$Spkr)) {
	wrds[[i]] <- unlist(strsplit(dcb$Turn.Cl[dcb$Spkr==as.character(sinfo$Spkr[i])], "\\s"))
}

# Set a word limit. Can set to 0 to consider all words, 1 for no hapax, 2 for only 
#  words that occur 2+ times, etc. We don't care about very rare words so here we 
#  only consider words that occur >10 times
word.limit <- 10 

# Now combine the words for all speakers in data
word.vec <- unlist(wrds)
allwords <- sort(word.vec)
length(unique(allwords))

# What are the most frequent overall words (note we still have some special chars here)
head(sort(table(allwords), decreasing=T), 25)

# Trim words based on our word.limit
length(which(table(allwords) > word.limit))
words <- names(table(allwords))[which(table(allwords) > word.limit)]

# And remove various "weird" words and bigrams, including noises like <laugh>
words <- words[-grep("^\\W*$", words, perl=T)]
words <- words[-grep("[<>]", words, perl=T)]
length(words)

# Now make a list of word frequency tables, again one element per speaker
spkr.wrd.tabl <- lapply(wrds, table)

# And set up an initial matrix, one row per speaker, one column per word
#  and everything starts as 0
spk.matrix <- matrix(data=0, nrow=nrow(sinfo), ncol=length(words))
colnames(spk.matrix) <- words
dim(spk.matrix)

# Now we add entries in the matrix for each speaker
# For the data here there aren't that many elements so it's fairly fast but the loop 
#  also outputs its status which is helpful for cases where this process is much slower
for (i in 1:nrow(spk.matrix)) {
	spk.matrix[i, which(colnames(spk.matrix) %in% names(spkr.wrd.tabl[[i]])) ] <- spkr.wrd.tabl[[i]][which(names(spkr.wrd.tabl[[i]]) %in% colnames(spk.matrix))]
	if (i %% 10 == 0) cat("\nGenerating word & bigram matrix... at i = ", i, sep="")
}
row.names(spk.matrix) <- sinfo$Spkr

# The svm() function in e1071 scales the data automatically, so we don't need to scale
#  our data, but we could do the below if we wanted to manually scale our word counts
# scaled.matrix <- scale(spk.matrix, center=T, scale=T)

# Finally, just so we can look at relative sizes of our different cases (in number of words)
#  let's add to our sinfo data speaker-level word counts
sinfo$NumWords <- rowSums(spk.matrix)
head(sinfo)

#### Part 4. Build (i.e. train) an SVM on some social factors

# We'll train on a random 80% of data and test on 20%

# Randomly sample some indices
sampled.indices <- sample(1:nrow(sinfo), size=round(8*nrow(sinfo)/10))
length(sampled.indices) # = 41 (randomly selected) speakers being used for training
nrow(sinfo) - length(sampled.indices) # = 10 left out for testing

# Now set up our training and test matrices and vectors containing the class labels
trn.x <- spk.matrix[sampled.indices,]
trn.age <- sinfo$Age.Group[sampled.indices]
trn.u30 <- sinfo$Under30[sampled.indices]
trn.gnd <- sinfo$Gender[sampled.indices]
trn.rse <- sinfo$Region.SE[sampled.indices]

tst.x <- spk.matrix[-sampled.indices,]
tst.age <- sinfo$Age.Group[-sampled.indices]
tst.u30 <- sinfo$Under30[-sampled.indices]
tst.gnd <- sinfo$Gender[-sampled.indices]
tst.rse <- sinfo$Region.SE[-sampled.indices]

# Remove from training and test matrices columns where training data has no words
#  this shouldn't matter but svm() will give warnings if columns are filled with
#  identical values
trn.x <- trn.x[,-which(colSums(spk.matrix[sampled.indices,])==0)]
tst.x <- tst.x[,-which(colSums(spk.matrix[sampled.indices,])==0)]


## Let's go through some examples now, though please note these are just samples
#  for exploring SVMs and there are a lot of steps that we don't focus on. E.g.,
#  for each example we would really want to run many iterations of the random
#  sampling and SVM modeling to ensure that any "answers" we get are not just
#  properties of the sample we happened to take... Lots of good resources exist
#  online for running SVMs. Tyler thinks the code above, for setting up the 
#  spk.matrix, is more useful than any of the code below...

# Example 1: Classify gender
#  Example RQ: Do word tokens differentiate speakers by gender in CORAAL:DCB?
#  And what words drive the classifiable gender differences?

# First we fit our SVM using svm()
# Note: SVM parameters are highly task and data dependendent.  Here a linear kernel
#  works well (this is often the case when there are lots more columns than rows),
#  and different cost parameters don't seem to matter; but this needs to be assessed
#  on a per-problem basis (see discussions of SVM "tuning", e.g., in
#  https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf )
gnd.svm <- svm(trn.x, trn.gnd, type="C", kernel="linear", cost=1)

# How does our fitted SVM model do on the training data? (It should do well)
gnd.fitd <- fitted(gnd.svm)
# The diagonal of this table are the correct labels
table(gnd.fitd, as.character(trn.gnd))
# % accuracy:
round(100*sum(diag(table(gnd.fitd, as.character(trn.gnd))))/nrow(trn.x), 1)

# Now we use it to classify (predict the labels of) the test speakers
test.preds <- predict(gnd.svm, tst.x)

# How did it do?...
table(test.preds, tst.gnd)
# % accuracy:
round(100*sum(diag(table(test.preds, tst.gnd)))/nrow(tst.x), 1)

# What features are most important? Several potential ways to address this question, better would
#  be feature selection (trying lots of different subsets of words)... see e.g.
#  (Guyon & Elisseeff 2003; http://jmlr.csail.mit.edu/papers/v3/guyon03a.html)
# But one quick way is something like this (works for linear kernal only!)
# First, weight vectors with matrix multiplication
w <- t(gnd.svm$coefs) %*% gnd.svm$SV       
# Then take absolute values
w.abs <- abs(w)
# Sort and look at the top 25 
names(w.abs) <- colnames(w)  							    
w.abs <- sort(w.abs, decreasing = T)
head(w.abs, 25)

# And we can compare the top 25 words' frequencies, side-by-side
data.frame(Female = colSums(trn.x[trn.gnd=="Female",])[head(names(w.abs), 25)], Male = colSums(trn.x[trn.gnd=="Male",])[head(names(w.abs), 25)], row.names=head(names(w.abs), 25))

# We probably would want to normalize the word frequencies to account for different sizes 
#  of the classes but here the two groups are pretty similarly sized so we won't bother
tapply(sinfo$NumWords, sinfo$Gender, sum)
prop.table(tapply(sinfo$NumWords, sinfo$Gender, sum))


# Example 2: Classify Age Groups (two groups, under and over 30)
#  Example RQ: Do word tokens differentiate speakers by broad ages in CORAAL:DCB?
#  And what words drive the classifiable age-related differences?

# Same exact steps as above; all that is different is the labels we're asking the
#  SVM to learn and the labels assigned to the test speakers.

# Again, first we fit our SVM using svm()
u30.svm <- svm(trn.x, trn.u30, type="C", kernel="linear", cost=1)

# How does our fitted SVM model do on the training data (it should do well)
u30.fitd <- fitted(u30.svm)
# The diagonal of this table are the correct labels
table(u30.fitd, as.character(trn.u30))
# % accuracy:
round(100*sum(diag(table(u30.fitd, as.character(trn.u30))))/nrow(trn.x), 1)

# Now we use it to classify (predict the labels of) the test speakers
test.preds <- predict(u30.svm, tst.x)

# How did it do?...  (Does quite well, 90-100%, on Tyler's tests)
table(test.preds, tst.u30)
# % accuracy:
round(100*sum(diag(table(test.preds, tst.u30)))/nrow(tst.x), 1)

# What features are most important? Same approach as before...
w <- t(u30.svm$coefs) %*% u30.svm$SV
w.abs <- abs(w)
names(w.abs) <- colnames(w)  							    
w.abs <- sort(w.abs, decreasing = T)
head(w.abs, 25)

# Again, we can compare the top 25 words' frequencies, side-by-side
data.frame(Under30 = colSums(trn.x[trn.u30,])[head(names(w.abs), 25)], Over30 = colSums(trn.x[!trn.u30,])[head(names(w.abs), 25)], row.names=head(names(w.abs), 25))

# Here we probably do want to weight the groups; there are 2x more words for Over30
tapply(sinfo$NumWords, sinfo$Under30, sum)
prop.table(tapply(sinfo$NumWords, sinfo$Under30, sum))

round(100000*data.frame(Under30 = colSums(trn.x[trn.u30,])[head(names(w.abs), 25)]/sum(trn.x[!trn.u30,]), Over30 = colSums(trn.x[!trn.u30,])[head(names(w.abs), 25)]/sum(trn.x[!trn.u30,]), row.names=head(names(w.abs), 25)), 1)

# Example 3: Classify Speakers by their home region of DC (SE vs. other)
#  Example RQ: Do word tokens differentiate speakers by where in DC they are from?
#  And what words drive the differences?

# Same exact steps as above; again, all that is different is the labels we're 
#  asking the SVM to learn and the labels assigned to the test speakers.

# Again, first we fit our SVM using svm()
rse.svm <- svm(trn.x, trn.rse, type="C", kernel="linear", cost=1)

rse.fitd <- fitted(rse.svm)
# This tends to achieve good training accuracy
table(rse.fitd, as.character(trn.rse))
round(100*sum(diag(table(rse.fitd, as.character(trn.rse))))/nrow(trn.x), 1)

test.preds <- predict(rse.svm, tst.x)

# But in Tyler's experience this tends to not predict as well as gender and age group
table(test.preds, tst.rse)
round(100*sum(diag(table(test.preds, tst.rse)))/nrow(tst.x), 1)

# Again, what features in the training data are most important?
# What features are most important? Same approach as before...
w <- t(rse.svm$coefs) %*% rse.svm$SV
w.abs <- abs(w)
names(w.abs) <- colnames(w)  							    
w.abs <- sort(w.abs, decreasing = T)
head(w.abs, 25)

data.frame(Reg.is.SE = colSums(trn.x[trn.rse,])[head(names(w.abs), 20)], Reg.not.SE = colSums(trn.x[!trn.rse,])[head(names(w.abs), 20)], row.names=head(names(w.abs), 20))

# Probably no need to weight the groups
tapply(sinfo$NumWords, sinfo$Region.SE, sum)


# Example 4: Classify Speakers by their age again but this time into the 4 CORAAL age groups
#  Example RQ: Can we determine finer-level age groupings by word tokens?
#  Note in this case we are doing multi-class classification; this is a special case for
#  SVMs but the software doesn't make us care about the details

age.svm <- svm(trn.x, trn.age, type="C", kernel="linear", cost=1)
age.fitd <- fitted(age.svm)
# Generally, this still trains well
table(age.fitd, as.character(trn.age))
round(100*sum(diag(table(age.fitd, as.character(trn.age))))/nrow(trn.x), 1)

test.preds <- predict(age.svm, tst.x)
# But test results show that the SVM isn't good at separating the four groups,
#  younger speakers (< 30) and older speakers (≥ 30) are often lumped together
table(test.preds, tst.age)
round(100*sum(diag(table(test.preds, tst.age)))/nrow(tst.x), 1)  


# Example 5. One last version looking at age, focusing on just the most frequent words
#  An issue with the above is that it's really just about what speakers talk about
#  Focusing on very frequent words could get at more structural kinds of differences...
#  Let's try just the 100 most frequent words

# 100 most frequent words are:
head(sort(colSums(spk.matrix), decreasing=T), 100)

# So we'll now subset columns from our main matrix
frq.matrix <- spk.matrix[,which(colnames(spk.matrix) %in% names(head(sort(colSums(spk.matrix), decreasing=T), 100)))]

# Now set up new training and test matrices, can use the class labels we defined earlier
trn.frq.x <- frq.matrix[sampled.indices,]
tst.frq.x <- frq.matrix[-sampled.indices,]

# As before, first we fit an SVM
u30.frq.svm <- svm(trn.frq.x, trn.u30, type="C", kernel="linear", cost=1)

# How does our fitted SVM model do on the training data (it should do well)
u30.frq.fitd <- fitted(u30.frq.svm)
# The diagonal of this table are the correct labels
table(u30.frq.fitd, as.character(trn.u30))
# Still does well. % accuracy:
round(100*sum(diag(table(u30.frq.fitd, as.character(trn.u30))))/nrow(trn.frq.x), 1)

# Now we use it to classify (predict the labels of) the test speakers
test.frq.preds <- predict(u30.frq.svm, tst.frq.x)

# How did it do?...  (Still does really well! 90-100%, on Tyler's tests)
table(test.frq.preds, tst.u30)
# % accuracy:
round(100*sum(diag(table(test.frq.preds, tst.u30)))/nrow(tst.frq.x), 1)

# What features are most important? Same approach as before...
w <- t(u30.frq.svm$coefs) %*% u30.frq.svm$SV
w.abs <- abs(w)
names(w.abs) <- colnames(w)  							    
w.abs <- sort(w.abs, decreasing = T)
head(w.abs, 25)

# Again, we can compare the top 25 words frequencies, side-by-side
data.frame(Under30 = colSums(trn.frq.x[trn.u30,])[head(names(w.abs), 25)], Over30 = colSums(trn.frq.x[!trn.u30,])[head(names(w.abs), 25)], row.names=head(names(w.abs), 25))

# Again, we probably do want to weight the groups since they are so unbalanced
tapply(sinfo$NumWords, sinfo$Under30, sum)

round(100000*data.frame(Under30 = colSums(trn.frq.x[trn.u30,])[head(names(w.abs), 25)]/sum(trn.frq.x[!trn.u30,]), Over30 = colSums(trn.frq.x[!trn.u30,])[head(names(w.abs), 25)]/sum(trn.frq.x[!trn.u30,]), row.names=head(names(w.abs), 25)), 1)

# That's it.  I hope this is a useful as a demo of runnings SVMs in R.
#  And of working with CORAAL in R!