#This is my attempt to provide a relatively brief overview of the different text tools available in quanteda.
#Note that these are in alphabetical order.
#The quanteda documentation has more details and examples, of course.

#Start by going to the correct folder and installing the relevant packages and libraries
setwd("C:/Users/John.DESKTOP-ES1A8AS/Desktop/Text/R")
install.packages("quanteda")
install.packages("ggplot2", dependencies = TRUE)
library(quanteda)
library(ggplot2)

#---------------------------------------
#DICTIONARY - produce counts of specific keywords found in documents
#options include exact matches, wildcards, or regular expressions
mycorpus <- subset(inaugCorpus, Year > 1900)
mydict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
                          opposition = c("Opposition", "reject", "notincorpus"),
                          taxing = "taxing",
                          taxation = "taxation",
                          taxregex = "tax*",
                          country = "united states"))
head(dfm(mycorpus, dictionary = mydict))

#------------------------------------------
#CHANGEUNITS - parse or aggregate documents in a corpus
#create a corpus
mycorpus <- corpus(c(textone = "This is a sentence. Another sentence. Yet another.",
                     textwo = "Premiere phrase. Deuxieme phrase."),
                   docvars = data.frame(country = c("UK", "USA"), year = c(1990, 2000)),
                   notes = "This is a simple example to show how changeunits() works.")
summary(mycorpus)
#change the units from documents down to sentences or paragraphs, or from sentences or paragraphs up to documents
summary(changeunits(mycorpus, to = "sentences"), showmeta = TRUE)

#------------------------------------------
#CREATE A CORPUS - a corpus is a collection of text documents.
#Many quanteda operations start with a corpus.
#inaugTexts is a vector of presidential inaugural address texts already included in quanteda
inaug <- corpus(inaugTexts)
#there are a lot of additional options
#see also corpus-class

#------------------------------------------
#CREATE A (SPARSE) DOCUMENT-FEATURE MATRIX
#convert a set of documents into a matrix where each row is a document,
#each column is a unique feature, and the cells are counts of those features
#inaugCorpus is a pre-existing corpus that includes document variables such as Year
mydfm <- dfm(subset(inaugCorpus, Year > 1980))
is.dfm(mydfm)
#there are a ton of arguments to improve the fidelity of the features (such as removing stopwords or punctuation)
head(mydfm)
#see also dfm-class
#need to do this using something other than a pre-existing corpus

#-----------------------------------------
#DOCFREQ - compute the document frequency of each term or feature in a dfm
mydfm2 <- dfm(inaugTexts[1:2], verbose = FALSE)
docfreq(mydfm2[, 1:20])
#can impose weighting, e.g. to count a feature only when its instances in a document exceed some threshold
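#a minimal sketch of that weighting, assuming your quanteda build's docfreq()
#has the scheme and threshold arguments (check ?docfreq before relying on them)
docfreq(mydfm2[, 1:20], scheme = "inverse")  #inverse document frequency weighting
docfreq(mydfm2[, 1:20], threshold = 2)       #count a feature in a document only when it occurs more than twice there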
#------------------------------------------------
#DOCNAMES - query the names of the documents in a corpus or dfm
docnames(inaugCorpus)
docnames(dfm(inaugTexts[1:2]))

#------------------------------------------------
#DOCVARS - query the document variables, and the values of a variable, in a corpus
head(docvars(inaugCorpus))
head(inaugCorpus[, "Year"])

#------------------------------------------------
#ENCODING - detect the encoding of texts
encoding(inaugTexts)

#-----------------------------------------------
#FEATURES - extract the features of a dfm
features(mydfm)

#-----------------------------------------------
#KWIC - output the context in which a keyword is used
head(kwic(inaugTexts, "security", window = 3, valuetype = "fixed"))

#------------------------------------------------
#LEXDIV - lexical diversity
mydfm <- dfm(subset(inaugCorpus, Year > 1980), verbose = FALSE)
(results <- lexdiv(mydfm, c("CTTR", "TTR", "U")))
cor(lexdiv(mydfm, "all"))

#------------------------------------------------
#NDOC - count the number of documents in a corpus or dfm
#see also NSENTENCE and NTOKEN
ndoc(mydfm)

#------------------------------------------------
#NGRAMS - see tokenize

#------------------------------------------------
#PHRASETOTOKEN - replace the spaces in a multi-word expression so it becomes a single token
myDict <- dictionary(list(negative = c("bad* word*", "negative", "awful text"),
                          positive = c("good stuff", "like? th??")))
txt <- c("I liked this, when we can use bad words, in awful text.",
         "Some damn good stuff, like the text, she likes that too.")
phrasetotoken(txt, myDict)
phrasetotoken("This is a simpler version of multi word expressions.", "multi word expression*")

#------------------------------------------------
#PLOT.DFM - make a wordcloud from a dfm
mydfm <- dfm(subset(inaugCorpus, President == "Obama"), verbose = FALSE,
             ignoredFeatures = stopwords("english"))
plot(mydfm, random.color = TRUE, rot.per = .25, colors = sample(colors()[2:128], 5))

#------------------------------------------------
#PLOT.KWIC
inaugCorpusPost70 <- subset(inaugCorpus, Year > 1970)
#compare multiple documents
plot(kwic(inaugCorpusPost70, "american"))

#------------------------------------------------
#PRINT.DFM - print a dfm
print(mydfm)

#------------------------------------------------
#READABILITY - calculate readability
readability(inaugCorpus, measure = "Flesch.Kincaid")

#------------------------------------------------
#REMOVEFEATURES - for example, to remove punctuation or stopwords
txt <- c(wash1 = "Fellow citizens, I am again called upon by the voice of my country to execute the functions of its Chief Magistrate.",
         wash2 = "When the occasion proper for it shall arrive, I shall endeavor to express the high sense I entertain of this distinguished honor.")
removeFeatures(tokenize(txt, removePunct = TRUE), stopwords("english"))
mydfm <- dfm(ukimmigTexts, verbose = FALSE)
removeFeatures(mydfm, stopwords("english"))

#------------------------------------------------
#SAMPLE - take a random sample of documents from a corpus or dfm; sampling is without replacement by default
summary(sample(inaugCorpus, 10, replace = TRUE))

#------------------------------------------------
#SEGMENT - segment text into tokens, sentences, paragraphs, tags, or other units; can specify regex delimiters etc.
segment(ukimmigTexts[3:4], "paragraphs")

#------------------------------------------------
#SELECTFEATURES - can specify which features to keep as well as which to remove (a sketch follows the SHOW note below)

#------------------------------------------------
#SHOW - difference from print?
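#a minimal sketch of selectFeatures(), referenced above; the selection and
#valuetype arguments are as documented in ?selectFeatures, so confirm them for your version
mydfm <- dfm(ukimmigTexts, verbose = FALSE)
selectFeatures(mydfm, stopwords("english"), selection = "remove")        #drop stopwords
selectFeatures(mydfm, "immig*", selection = "keep", valuetype = "glob")  #keep only features matching a pattern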
#------------------------------------------------
#SIMILARITY - similarity of the documents in a dfm (methods as found in the simil package)
#create a dfm from the inaugural addresses from Reagan onwards
presDfm <- dfm(subset(inaugCorpus, Year > 1980),
               ignoredFeatures = stopwords("english"), stem = TRUE)
#compute some document similarities
(tmp <- similarity(presDfm, margin = "documents", method = "cosine"))
#output as a matrix
as.matrix(tmp)

#-----------------------------------------------
#SORT.DFM - sort a dfm by frequency of total features, total features in documents, or both,
#so that the most important features or cases appear at the left or the top respectively
dtm <- dfm(inaugCorpus)
dtm[1:10, 1:5]
dtm <- sort(dtm)
sort(dtm, TRUE, "both")[1:10, 1:5]

#-----------------------------------------------
#SUBSET.CORPUS - select a portion of a corpus
summary(subset(inaugCorpus, Year > 1980))

#-----------------------------------------------
#SUMMARY.CORPUS
summary(inaugCorpus, n = 10)

#-----------------------------------------------
#SYLLABLES - count syllables
syllables("This is an example sentence.")

#----------------------------------------------
#TEXTFILE - read in a .json, .txt, .xml, or .csv file
getwd()
setwd("C:/Users/John.DESKTOP-ES1A8AS/Desktop/POLS559/Text/R")
mytf7 <- textfile("bill_law_matches.csv", textField = "version_b")
head(corpus(mytf7))
#how to read the textField?
#how to manipulate it?

#-----------------------------------------------
#TOKENIZE - really important tool for pre-processing text. To tokenize is to break the text up
#into the desired features, such as words, sentences, etc.
#Lots of options, such as removing numbers, Twitter characters, etc. (see also ngrams)
#a sketch appears at the end of this file

#-----------------------------------------------
#TOPFEATURES - list the most common features. Good way to check for weird stuff
topfeatures(dfm(subset(inaugCorpus, Year > 1980), verbose = FALSE))

#-----------------------------------------------
#TRIM - reduce the selected features of a dfm (for example, by specifying a minimum count) to create a smaller dfm (see also selectFeatures)
(myDfm <- dfm(inaugCorpus, verbose = FALSE))
#only words occurring >= 10 times and in >= 2 documents
trim(myDfm, minCount = 10, minDoc = 2)

#-----------------------------------------------
#WEIGHT - weight feature frequencies
dtm <- dfm(inaugCorpus)
tfidfDtm <- weight(dtm, type = "tfidf")

#-----------------------------------------------
#WORDSTEM - stem words by removing suffixes (to treat different usages as the same feature)
#Can apply the general Porter stemmer, or stem specific words
wordstem(c("win", "winning", "wins", "won", "winner"))
wordstem(dtm, language = "porter")

#----------------------------------------------
#Yeah!!
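#-----------------------------------------------
#TOKENIZE/NGRAMS sketch, referenced above - a minimal example; the argument names
#(what, removeNumbers, removePunct, removeTwitter, ngrams) should match ?tokenize
#for this quanteda version, but confirm them against your install
tokenize("Sentence one. Sentence two!", what = "sentence")
tokenize("A text with 3 numbers, @handles, and #tags.",
         removeNumbers = TRUE, removePunct = TRUE, removeTwitter = TRUE)
tokenize("form bigrams from adjacent tokens", ngrams = 2)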