############## SUPPORT VECTOR MACHINES AS A SOCIOLINGUISTIC TOOL ############## # Tyler Kendall, University of Oregon # Computational Sociolinguistics Workshop # NWAV 47 | NYU | Oct 18, 2018 # # Example Code for running a Support Vector Machine to classify speaker # social categories (e.g. gender and age) from word usage, and then to # poke at what words are predictive (of speaker genders and age groups). #### Part 0. Load library # Load e1071 library (R interface for libsvm; library must be installed first of course) # See https://cran.r-project.org/web/packages/e1071/vignettes/svmdoc.pdf library(e1071) #### Part 1. Get Corpus of Regional African American Language DCB component data # Download CORAAL:DCB through its web R functions. # This requires that the RCurl library is installed. source("http://lingtools.uoregon.edu/coraal/explorer/R/CORAAL_web.R") coraal.dcb <- coraal.webget.data("DCB", collapse=T) meta.dcb <- coraal.webget.meta("DCB") # We can inspect the data and metadata of course, e.g. dim(coraal.dcb) head(coraal.dcb) head(meta.dcb) #### Part 2. Prepare CORAAL data (add metadata for classifying and clean up text) # We can add some metadata columns to coraal.dcb (this isn't strictly necessary) # We'll look at speaker gender coraal.dcb$Gender <- coraal.add.meta("Gender", meta=meta.dcb, src=coraal.dcb) # And speaker age group (mostly as a binary < 30 vs. ≥ 30) coraal.dcb$Age.Group <- coraal.add.meta("Age.Group", meta=meta.dcb, src=coraal.dcb) coraal.dcb$Under30 <- coraal.dcb$Age.Group %in% c("-19", "20 to 29") # And we can also try things like what region (quadrant) of DC a speaker is from coraal.dcb$Region.in.City <- coraal.add.meta("Region.in.City", meta=meta.dcb, src=coraal.dcb) coraal.dcb$Region.SE <- coraal.dcb$Region.in.City=="SE" # Inspect these social factors that we'll try to "learn" with SVMs unique(coraal.dcb[,c("Spkr", "Gender", "Age.Group", "Under30", "Region.in.City", "Region.SE")]) # Drop turns from speakers with NAs for social factors (just use Gender to catch this... # this removes 'misc' speakers and interviewers) dcb <- droplevels(coraal.dcb[!is.na(coraal.dcb$Gender),]) dim(dcb) # Now we will clean up the transcripts a little; make all text lowercase, # remove special characters, notes, etc., and put # to mark turn boundaries # (turn boundaries don't matter for word-level analysis, but we'll do it here anyway) # Cleaning up the data so we only have tokens we want to think of as words is # very important. The code here isn't doing everything we probably would want to... So # we probably want to remove other special conventions in the transcripts, e.g. could do: # dcb$Turn.Cl <- tolower(gsub("/(inaudible|unintelligible)/", "", dcb$Turn.Simple, perl=T)) dcb$Turn.Cl <- tolower(dcb$Turn.Simple) dcb$Turn.Cl <- paste("#", gsub("[-\\./,\\?!\\d\\[\\]]", "", dcb$Turn.Cl, perl=T), "#", sep=" ") # We'll keep apostophes but we'll convert them to _ (later column names won't like ') dcb$Turn.Cl <- gsub("'", "_", dcb$Turn.Cl, perl=T) # We can inspect our current data head(dcb) dcb[sample(nrow(dcb), 1),] #### Part 3. Generate a big matrix to characterize the entire dataset's word use # First create a set of demographic info we want to classify, in its own data.frame # This will give us one row per spkr and be ordered in the same way as our big matrix # of words. sinfo <- unique(dcb[,c("Spkr", "Gender", "Age.Group", "Under30", "Region.in.City", "Region.SE")]) table(sinfo$Under30) table(sinfo$Region.SE) # Then create a list of word vectors, one list element for each speaker (crucially, # speakers here are in the same order as in the 'sinfo' data.frame created just above) wrds <- list(length = nrow(sinfo)) for (i in 1:length(sinfo$Spkr)) { wrds[[i]] <- unlist(strsplit(dcb$Turn.Cl[dcb$Spkr==as.character(sinfo$Spkr[i])], "\\s")) } # Set a word limit. Can set to 0 to consider all words, 1 for no hapax, 2 for only # words that occur 2+ times, etc. We don't care about very rare words so here we # only consider words that occur >10 times word.limit <- 10 # Now combine the words for all speakers in data word.vec <- unlist(wrds) allwords <- sort(word.vec) length(unique(allwords)) # What are the most frequent overall words (note we still have some special chars here) head(sort(table(allwords), decreasing=T), 25) # Trim words based on our word.limit length(which(table(allwords) > word.limit)) words <- names(table(allwords))[which(table(allwords) > word.limit)] # And remove various "weird" words and bigrams, including noises like words <- words[-grep("^\\W*$", words, perl=T)] words <- words[-grep("[<>]", words, perl=T)] length(words) # Now make a list of word frequency tables, again one element per speaker spkr.wrd.tabl <- lapply(wrds, table) # And set up an initial matrix, one row per speaker, one column per word # and everything starts as 0 spk.matrix <- matrix(data=0, nrow=nrow(sinfo), ncol=length(words)) colnames(spk.matrix) <- words dim(spk.matrix) # Now we add entries in the matrix for each speaker # For the data here there aren't that many elements so it's fairly fast but the loop # also outputs its status which is helpful for cases where this process is much slower for (i in 1:nrow(spk.matrix)) { spk.matrix[i, which(colnames(spk.matrix) %in% names(spkr.wrd.tabl[[i]])) ] <- spkr.wrd.tabl[[i]][which(names(spkr.wrd.tabl[[i]]) %in% colnames(spk.matrix))] if (i %% 10 == 0) cat("\nGenerating word & bigram matrix... at i = ", i, sep="") } row.names(spk.matrix) <- sinfo$Spkr # The svm() function in e1071 scales the data automatically, so we don't need to scale # our data, but we could do the below if we wanted to manually scale our word counts # scaled.matrix <- scale(spk.matrix, center=T, scale=T) # Finally, just so we can look at relative sizes of our different cases (in number of words) # let's add to our sinfo data speaker-level word counts sinfo$NumWords <- rowSums(spk.matrix) head(sinfo) #### Part 4. Build (i.e. train) an SVM on some social factors # We'll train on a random 80% of data and test on 20% # Randomly sample some indices sampled.indices <- sample(1:nrow(sinfo), size=round(8*nrow(sinfo)/10)) length(sampled.indices) # = 41 (randomly selected) speakers being used for training nrow(sinfo) - length(sampled.indices) # = 10 left out for testing # Now set up our training and test matrices and vectors containing the class labels trn.x <- spk.matrix[sampled.indices,] trn.age <- sinfo$Age.Group[sampled.indices] trn.u30 <- sinfo$Under30[sampled.indices] trn.gnd <- sinfo$Gender[sampled.indices] trn.rse <- sinfo$Region.SE[sampled.indices] tst.x <- spk.matrix[-sampled.indices,] tst.age <- sinfo$Age.Group[-sampled.indices] tst.u30 <- sinfo$Under30[-sampled.indices] tst.gnd <- sinfo$Gender[-sampled.indices] tst.rse <- sinfo$Region.SE[-sampled.indices] # Remove from training and test matrices columns where training data has no words # this shouldn't matter but svm() will give warnings if columns are filled with # identical values trn.x <- trn.x[,-which(colSums(spk.matrix[sampled.indices,])==0)] tst.x <- tst.x[,-which(colSums(spk.matrix[sampled.indices,])==0)] ## Let's go through some examples now, though please note these are just samples # for exploring SVMs and there are a lot of steps that we don't focus on. E.g., # for each example we would really want to run many iterations of the random # sampling and SVM modeling to ensure that any "answers" we get are not just # properties of the sample we happened to take... Lots of good resources exist # online for running SVMs. Tyler thinks the code above, for setting up the # spk.matrix, is more useful than any of the code below... # Example 1: Classify gender # Example RQ: Do word tokens differentiate speakers by gender in CORAAL:DCB? # And what words drive the classifiable gender differences? # First we fit our SVM using svm() # Note: SVM parameters are highly task and data dependendent. Here a linear kernel # works well (this is often the case when there are lots more columns than rows), # and different cost parameters don't seem to matter; but this needs to be assessed # on a per-problem basis (see discussions of SVM "tuning", e.g., in # https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf ) gnd.svm <- svm(trn.x, trn.gnd, type="C", kernel="linear", cost=1) # How does our fitted SVM model do on the training data? (It should do well) gnd.fitd <- fitted(gnd.svm) # The diagonal of this table are the correct labels table(gnd.fitd, as.character(trn.gnd)) # % accuracy: round(100*sum(diag(table(gnd.fitd, as.character(trn.gnd))))/nrow(trn.x), 1) # Now we use it to classify (predict the labels of) the test speakers test.preds <- predict(gnd.svm, tst.x) # How did it do?... table(test.preds, tst.gnd) # % accuracy: round(100*sum(diag(table(test.preds, tst.gnd)))/nrow(tst.x), 1) # What features are most important? Several potential ways to address this question, better would # be feature selection (trying lots of different subsets of words)... see e.g. # (Guyon & Elisseeff 2003; http://jmlr.csail.mit.edu/papers/v3/guyon03a.html) # But one quick way is something like this (works for linear kernal only!) # First, weight vectors with matrix multiplication w <- t(gnd.svm$coefs) %*% gnd.svm$SV # Then take absolute values w.abs <- abs(w) # Sort and look at the top 25 names(w.abs) <- colnames(w) w.abs <- sort(w.abs, decreasing = T) head(w.abs, 25) # And we can compare the top 25 words' frequencies, side-by-side data.frame(Female = colSums(trn.x[trn.gnd=="Female",])[head(names(w.abs), 25)], Male = colSums(trn.x[trn.gnd=="Male",])[head(names(w.abs), 25)], row.names=head(names(w.abs), 25)) # We probably would want to normalize the word frequencies to account for different sizes # of the classes but here the two groups are pretty similarly sized so we won't bother tapply(sinfo$NumWords, sinfo$Gender, sum) prop.table(tapply(sinfo$NumWords, sinfo$Gender, sum)) # Example 2: Classify Age Groups (two groups, under and over 30) # Example RQ: Do word tokens differentiate speakers by broad ages in CORAAL:DCB? # And what words drive the classifiable age-related differences? # Same exact steps as above; all that is different is the labels we're asking the # SVM to learn and the labels assigned to the test speakers. # Again, first we fit our SVM using svm() u30.svm <- svm(trn.x, trn.u30, type="C", kernel="linear", cost=1) # How does our fitted SVM model do on the training data (it should do well) u30.fitd <- fitted(u30.svm) # The diagonal of this table are the correct labels table(u30.fitd, as.character(trn.u30)) # % accuracy: round(100*sum(diag(table(u30.fitd, as.character(trn.u30))))/nrow(trn.x), 1) # Now we use it to classify (predict the labels of) the test speakers test.preds <- predict(u30.svm, tst.x) # How did it do?... (Does quite well, 90-100%, on Tyler's tests) table(test.preds, tst.u30) # % accuracy: round(100*sum(diag(table(test.preds, tst.u30)))/nrow(tst.x), 1) # What features are most important? Same approach as before... w <- t(u30.svm$coefs) %*% u30.svm$SV w.abs <- abs(w) names(w.abs) <- colnames(w) w.abs <- sort(w.abs, decreasing = T) head(w.abs, 25) # Again, we can compare the top 25 words' frequencies, side-by-side data.frame(Under30 = colSums(trn.x[trn.u30,])[head(names(w.abs), 25)], Over30 = colSums(trn.x[!trn.u30,])[head(names(w.abs), 25)], row.names=head(names(w.abs), 25)) # Here we probably do want to weight the groups; there are 2x more words for Over30 tapply(sinfo$NumWords, sinfo$Under30, sum) prop.table(tapply(sinfo$NumWords, sinfo$Under30, sum)) round(100000*data.frame(Under30 = colSums(trn.x[trn.u30,])[head(names(w.abs), 25)]/sum(trn.x[!trn.u30,]), Over30 = colSums(trn.x[!trn.u30,])[head(names(w.abs), 25)]/sum(trn.x[!trn.u30,]), row.names=head(names(w.abs), 25)), 1) # Example 3: Classify Speakers by their home region of DC (SE vs. other) # Example RQ: Do word tokens differentiate speakers by where in DC they are from? # And what words drive the differences? # Same exact steps as above; again, all that is different is the labels we're # asking the SVM to learn and the labels assigned to the test speakers. # Again, first we fit our SVM using svm() rse.svm <- svm(trn.x, trn.rse, type="C", kernel="linear", cost=1) rse.fitd <- fitted(rse.svm) # This tends to achieve good training accuracy table(rse.fitd, as.character(trn.rse)) round(100*sum(diag(table(rse.fitd, as.character(trn.rse))))/nrow(trn.x), 1) test.preds <- predict(rse.svm, tst.x) # But in Tyler's experience this tends to not predict as well as gender and age group table(test.preds, tst.rse) round(100*sum(diag(table(test.preds, tst.rse)))/nrow(tst.x), 1) # Again, what features in the training data are most important? # What features are most important? Same approach as before... w <- t(rse.svm$coefs) %*% rse.svm$SV w.abs <- abs(w) names(w.abs) <- colnames(w) w.abs <- sort(w.abs, decreasing = T) head(w.abs, 25) data.frame(Reg.is.SE = colSums(trn.x[trn.rse,])[head(names(w.abs), 20)], Reg.not.SE = colSums(trn.x[!trn.rse,])[head(names(w.abs), 20)], row.names=head(names(w.abs), 20)) # Probably no need to weight the groups tapply(sinfo$NumWords, sinfo$Region.SE, sum) # Example 4: Classify Speakers by their age again but this time into the 4 CORAAL age groups # Example RQ: Can we determine finer-level age groupings by word tokens? # Note in this case we are doing multi-class classification; this is a special case for # SVMs but the software doesn't make us care about the details age.svm <- svm(trn.x, trn.age, type="C", kernel="linear", cost=1) age.fitd <- fitted(age.svm) # Generally, this still trains well table(age.fitd, as.character(trn.age)) round(100*sum(diag(table(age.fitd, as.character(trn.age))))/nrow(trn.x), 1) test.preds <- predict(age.svm, tst.x) # But test results show that the SVM isn't good at separating the four groups, # younger speakers (< 30) and older speakers (≥ 30) are often lumped together table(test.preds, tst.age) round(100*sum(diag(table(test.preds, tst.age)))/nrow(tst.x), 1) # Example 5. One last version looking at age, focusing on just the most frequent words # An issue with the above is that it's really just about what speakers talk about # Focusing on very frequent words could get at more structural kinds of differences... # Let's try just the 100 most frequent words # 100 most frequent words are: head(sort(colSums(spk.matrix), decreasing=T), 100) # So we'll now subset columns from our main matrix frq.matrix <- spk.matrix[,which(colnames(spk.matrix) %in% names(head(sort(colSums(spk.matrix), decreasing=T), 100)))] # Now set up new training and test matrices, can use the class labels we defined earlier trn.frq.x <- frq.matrix[sampled.indices,] tst.frq.x <- frq.matrix[-sampled.indices,] # As before, first we fit an SVM u30.frq.svm <- svm(trn.frq.x, trn.u30, type="C", kernel="linear", cost=1) # How does our fitted SVM model do on the training data (it should do well) u30.frq.fitd <- fitted(u30.frq.svm) # The diagonal of this table are the correct labels table(u30.frq.fitd, as.character(trn.u30)) # Still does well. % accuracy: round(100*sum(diag(table(u30.frq.fitd, as.character(trn.u30))))/nrow(trn.frq.x), 1) # Now we use it to classify (predict the labels of) the test speakers test.frq.preds <- predict(u30.frq.svm, tst.frq.x) # How did it do?... (Still does really well! 90-100%, on Tyler's tests) table(test.frq.preds, tst.u30) # % accuracy: round(100*sum(diag(table(test.frq.preds, tst.u30)))/nrow(tst.frq.x), 1) # What features are most important? Same approach as before... w <- t(u30.frq.svm$coefs) %*% u30.frq.svm$SV w.abs <- abs(w) names(w.abs) <- colnames(w) w.abs <- sort(w.abs, decreasing = T) head(w.abs, 25) # Again, we can compare the top 25 words frequencies, side-by-side data.frame(Under30 = colSums(trn.frq.x[trn.u30,])[head(names(w.abs), 25)], Over30 = colSums(trn.frq.x[!trn.u30,])[head(names(w.abs), 25)], row.names=head(names(w.abs), 25)) # Again, we probably do want to weight the groups since they are so unbalanced tapply(sinfo$NumWords, sinfo$Under30, sum) round(100000*data.frame(Under30 = colSums(trn.frq.x[trn.u30,])[head(names(w.abs), 25)]/sum(trn.frq.x[!trn.u30,]), Over30 = colSums(trn.frq.x[!trn.u30,])[head(names(w.abs), 25)]/sum(trn.frq.x[!trn.u30,]), row.names=head(names(w.abs), 25)), 1) # That's it. I hope this is a useful as a demo of runnings SVMs in R. # And of working with CORAAL in R!