#This is my attempt to provide a relatively brief overview of the different text tools available in quanteda.
#Note that these are in alphabetical order.
#The quanteda documentation has more details and examples, of course.

#Start by going to the correct folder and installing the relevant packages and libraries
setwd("C:/Users/John.DESKTOP-ES1A8AS/Desktop/Text/R")
install.packages("quanteda")
install.packages("ggplot2", dependencies = TRUE)
library(quanteda)
library(ggplot2)

#---------------------------------------
#DICTIONARY - produce counts of specific keywords found in documents
#options include exact matches, wildcards, or regular expressions
mycorpus <- subset(inaugCorpus, Year > 1900)
mydict <- dictionary(list(christmas = c("Christmas", "Santa", "holiday"),
                          opposition = c("Opposition", "reject", "notincorpus"),
                          taxing = "taxing",
                          taxation = "taxation",
                          taxregex = "tax*",
                          country = "united states"))
head(dfm(mycorpus, dictionary = mydict))

#------------------------------------------
#CHANGEUNITS - parse or aggregate documents in a corpus
#create a corpus
mycorpus <- corpus(c(textone = "This is a sentence. Another sentence. Yet another.",
                     textwo = "Premiere phrase. Deuxieme phrase."),
                   docvars = data.frame(country = c("UK", "USA"), year = c(1990, 2000)),
                   notes = "This is a simple example to show how changeunits() works.")
summary(mycorpus)
#change the units from documents down to sentences or paragraphs, or from sentences or paragraphs up to documents
summary(changeunits(mycorpus, to = "sentences"), showmeta = TRUE)

#------------------------------------------
#CREATE A CORPUS - a corpus is a collection of text documents.
#Many quanteda operations start with a corpus.
#inaugTexts is a vector of presidential inaugural address texts already included in quanteda
inaug <- corpus(inaugTexts)
#there are a lot of additional options
#see also corpus-class

#------------------------------------------
#CREATE A (SPARSE) DOCUMENT-FEATURE MATRIX
#convert a set of documents into a matrix where each row is a document,
#each column is a unique feature, and the cells are counts of those features
#inaugCorpus is a pre-existing corpus that includes document variables such as Year
mydfm <- dfm(subset(inaugCorpus, Year > 1980))
is.dfm(mydfm)
#there are a ton of arguments to improve the fidelity of the features (such as removing stopwords or punctuation)
head(mydfm)
#see also dfm-class
#need to do this using something other than a pre-existing corpus

#-----------------------------------------
#DOCFREQ - compute the document frequency of each term or feature in a dfm
mydfm2 <- dfm(inaugTexts[1:2], verbose = FALSE)
docfreq(mydfm2[, 1:20])
#can impose weighting, e.g. to count a feature only when its instances in a document exceed some threshold
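#a minimal sketch of that weighting, assuming your quanteda build's docfreq()
#has the scheme and threshold arguments (check ?docfreq before relying on them)
docfreq(mydfm2[, 1:20], scheme = "inverse")  #inverse document frequency weighting
docfreq(mydfm2[, 1:20], threshold = 2)       #count a feature in a document only when it occurs more than twice there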
#------------------------------------------------
#DOCNAMES - query the names of the documents in a corpus or dfm
docnames(inaugCorpus)
docnames(dfm(inaugTexts[1:2]))

#------------------------------------------------
#DOCVARS - query the document variables, and the values of a variable, in a corpus
head(docvars(inaugCorpus))
head(inaugCorpus[, "Year"])

#------------------------------------------------
#ENCODING - detect the encoding of texts
encoding(inaugTexts)

#-----------------------------------------------
#FEATURES - extract the features of a dfm
features(mydfm)

#-----------------------------------------------
#KWIC - output the context in which a keyword is used
head(kwic(inaugTexts, "security", window = 3, valuetype = "fixed"))

#------------------------------------------------
#LEXDIV - lexical diversity
mydfm <- dfm(subset(inaugCorpus, Year > 1980), verbose = FALSE)
(results <- lexdiv(mydfm, c("CTTR", "TTR", "U")))
cor(lexdiv(mydfm, "all"))

#------------------------------------------------
#NDOC - count the number of documents in a corpus or dfm
#see also NSENTENCE and NTOKEN
ndoc(mydfm)

#------------------------------------------------
#NGRAMS - see tokenize

#------------------------------------------------
#PHRASETOTOKEN - replace the spaces in a multi-word expression so it becomes a single token
myDict <- dictionary(list(negative = c("bad* word*", "negative", "awful text"),
                          positive = c("good stuff", "like? th??")))
txt <- c("I liked this, when we can use bad words, in awful text.",
         "Some damn good stuff, like the text, she likes that too.")
phrasetotoken(txt, myDict)
phrasetotoken("This is a simpler version of multi word expressions.", "multi word expression*")

#------------------------------------------------
#PLOT.DFM - make a wordcloud from a dfm
mydfm <- dfm(subset(inaugCorpus, President == "Obama"), verbose = FALSE,
             ignoredFeatures = stopwords("english"))
plot(mydfm, random.color = TRUE, rot.per = .25, colors = sample(colors()[2:128], 5))

#------------------------------------------------
#PLOT.KWIC
inaugCorpusPost70 <- subset(inaugCorpus, Year > 1970)
#compare multiple documents
plot(kwic(inaugCorpusPost70, "american"))

#------------------------------------------------
#PRINT.DFM - print a dfm
print(mydfm)

#------------------------------------------------
#READABILITY - calculate readability
readability(inaugCorpus, measure = "Flesch.Kincaid")

#------------------------------------------------
#REMOVEFEATURES - for example, to remove punctuation or stopwords
txt <- c(wash1 = "Fellow citizens, I am again called upon by the voice of my country to execute the functions of its Chief Magistrate.",
         wash2 = "When the occasion proper for it shall arrive, I shall endeavor to express the high sense I entertain of this distinguished honor.")
removeFeatures(tokenize(txt, removePunct = TRUE), stopwords("english"))
mydfm <- dfm(ukimmigTexts, verbose = FALSE)
removeFeatures(mydfm, stopwords("english"))

#------------------------------------------------
#SAMPLE - take a random sample of documents from a corpus or dfm; sampling is without replacement by default
summary(sample(inaugCorpus, 10, replace = TRUE))

#------------------------------------------------
#SEGMENT - segment text into tokens, sentences, paragraphs, tags, or other units; can specify regex delimiters etc.
segment(ukimmigTexts[3:4], "paragraphs")

#------------------------------------------------
#SELECTFEATURES - can specify which features to keep as well as which to remove (a sketch follows the SHOW note below)

#------------------------------------------------
#SHOW - difference from print?
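#a minimal sketch of selectFeatures(), referenced above; the selection and
#valuetype arguments are as documented in ?selectFeatures, so confirm them for your version
mydfm <- dfm(ukimmigTexts, verbose = FALSE)
selectFeatures(mydfm, stopwords("english"), selection = "remove")        #drop stopwords
selectFeatures(mydfm, "immig*", selection = "keep", valuetype = "glob")  #keep only features matching a pattern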
#------------------------------------------------
#SIMILARITY - similarity of the documents in a dfm (methods as found in the simil package)
#create a dfm from the inaugural addresses from Reagan onwards
presDfm <- dfm(subset(inaugCorpus, Year > 1980),
               ignoredFeatures = stopwords("english"), stem = TRUE)
#compute some document similarities
(tmp <- similarity(presDfm, margin = "documents", method = "cosine"))
#output as a matrix
as.matrix(tmp)

#-----------------------------------------------
#SORT.DFM - sort a dfm by frequency of total features, total features in documents, or both,
#so that the most important features or cases appear at the left or the top respectively
dtm <- dfm(inaugCorpus)
dtm[1:10, 1:5]
dtm <- sort(dtm)
sort(dtm, TRUE, "both")[1:10, 1:5]

#-----------------------------------------------
#SUBSET.CORPUS - select a portion of a corpus
summary(subset(inaugCorpus, Year > 1980))

#-----------------------------------------------
#SUMMARY.CORPUS
summary(inaugCorpus, n = 10)

#-----------------------------------------------
#SYLLABLES - count syllables
syllables("This is an example sentence.")

#----------------------------------------------
#TEXTFILE - read in a .json, .txt, .xml, or .csv file
getwd()
setwd("C:/Users/John.DESKTOP-ES1A8AS/Desktop/POLS559/Text/R")
mytf7 <- textfile("bill_law_matches.csv", textField = "version_b")
head(corpus(mytf7))
#how to read the textField?
#how to manipulate it?

#-----------------------------------------------
#TOKENIZE - really important tool for pre-processing text. To tokenize is to break the text up
#into the desired features, such as words, sentences, etc.
#Lots of options, such as removing numbers, Twitter characters, etc. (see also ngrams)
#a sketch appears at the end of this file

#-----------------------------------------------
#TOPFEATURES - list the most common features. Good way to check for weird stuff
topfeatures(dfm(subset(inaugCorpus, Year > 1980), verbose = FALSE))

#-----------------------------------------------
#TRIM - reduce the selected features of a dfm (for example, by specifying a minimum count) to create a smaller dfm (see also selectFeatures)
(myDfm <- dfm(inaugCorpus, verbose = FALSE))
#only words occurring >= 10 times and in >= 2 documents
trim(myDfm, minCount = 10, minDoc = 2)

#-----------------------------------------------
#WEIGHT - weight feature frequencies
dtm <- dfm(inaugCorpus)
tfidfDtm <- weight(dtm, type = "tfidf")

#-----------------------------------------------
#WORDSTEM - stem words by removing suffixes (to treat different usages as the same feature)
#Can apply the general Porter stemmer, or stem specific words
wordstem(c("win", "winning", "wins", "won", "winner"))
wordstem(dtm, language = "porter")

#----------------------------------------------
#Yeah!!
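#-----------------------------------------------
#TOKENIZE/NGRAMS sketch, referenced above - a minimal example; the argument names
#(what, removeNumbers, removePunct, removeTwitter, ngrams) should match ?tokenize
#for this quanteda version, but confirm them against your install
tokenize("Sentence one. Sentence two!", what = "sentence")
tokenize("A text with 3 numbers, @handles, and #tags.",
         removeNumbers = TRUE, removePunct = TRUE, removeTwitter = TRUE)
tokenize("form bigrams from adjacent tokens", ngrams = 2)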