SOLUTIONS

Task 1: Creating your Corpus

Select three assignments. Convert them to RAW text files. Import into R and create a corpus.

# Load packages. library() stops with an error when a package is missing,
# whereas require() only warns and returns FALSE, which would let the script
# continue and fail later with a less helpful message.
library(readr)
library(quanteda)

# Import the three raw text files; read_file() returns each file as one string.
text1 <- read_file("data/GutenbergBooks/AnAfricanMillionaire.txt")
text2 <- read_file("data/GutenbergBooks/CrimeAndPunishment.txt")
text3 <- read_file("data/GutenbergBooks/DeadMenTellNoTales.txt")

# Create a corpus with one document per text.
# NOTE: the variable is named like the quanteda function corpus(); calls to
# corpus(...) still resolve to the function, but the name is kept because the
# later tasks refer to this object as `corpus`.
corpus <- corpus(c(text1, text2, text3))

Task 2: Text Statistics

Calculate the average number of characters per word and the average number of words per sentence.

# Per-document counts, computed over every document of the corpus.
charCounts <- nchar(corpus[])
tokenCounts <- ntoken(corpus[])
sentenceCounts <- nsentence(corpus[])

# Average number of characters per word (characters divided by tokens).
avgCharPerWord <- charCounts / tokenCounts
# Display the answer.
avgCharPerWord

# Average number of words per sentence (tokens divided by sentences).
avgWordPerSentence <- tokenCounts / sentenceCounts
# Display the answer.
avgWordPerSentence

Task 3: Text Metrics

What is the type-token ratio (TTR) of your texts (each text individually)?

# Type-token ratio (TTR) of each text: number of distinct types divided by
# the total number of tokens in that document.
Text1_TTR <- ntype(corpus[1]) / ntoken(corpus[1])
Text2_TTR <- ntype(corpus[2]) / ntoken(corpus[2])
Text3_TTR <- ntype(corpus[3]) / ntoken(corpus[3])

# Display the answers. The names must match the variables assigned above;
# the previous TTRText1/TTRText2/TTRText3 were never defined.
print(c(Text1_TTR, Text2_TTR, Text3_TTR))

Task 4: Word Frequencies

Build a term frequency count representation, and retrieve the top features (hint: topfeatures) for each text.

# Create the document-feature matrix (term counts per document).
DFM <- dfm(corpus)

# Use topfeatures() to get the most frequent features of each text.
# Documents are rows of the dfm, so each one is selected with explicit
# [i, ] row indexing; the single-index form DFM[i] is not accepted by every
# quanteda version.
topfeatures(DFM[1, ])
topfeatures(DFM[2, ])
topfeatures(DFM[3, ])

Task 5: TF-IDF

Now build a TF-IDF weighted representation of your corpus. Perform this transformation in five different ways: (1) based on the raw texts, (2) removing stopwords, (3) removing punctuation, (4) stemming the words, and (5) combining (2)-(4).

# Create TF-IDF weightings for five preprocessing variants. Every variant uses
# proportional term frequency and raw document counts, and prints the first
# five feature columns of the weighted matrix.

# (1) raw text
DFM <- dfm(corpus, tolower = FALSE, stem = FALSE, remove = NULL)
text_tfidf <- dfm_tfidf(DFM, scheme_tf = "prop", scheme_df = "count")
head(text_tfidf[, 1:5])

# (2) removing stopwords
DFM <- dfm(corpus, tolower = FALSE, stem = FALSE,
           remove = stopwords("english"))
text_tfidf <- dfm_tfidf(DFM, scheme_tf = "prop", scheme_df = "count")
head(text_tfidf[, 1:5])

# (3) removing punctuation
# remove_punct = TRUE is forwarded to the tokenizer and drops all punctuation
# tokens. A hand-written list such as c(".", ",", ":", ";", "(", ")") leaves
# "!", "?", quotes, dashes, etc. in the matrix.
DFM <- dfm(corpus, tolower = FALSE, stem = FALSE, remove = NULL,
           remove_punct = TRUE)
text_tfidf <- dfm_tfidf(DFM, scheme_tf = "prop", scheme_df = "count")
head(text_tfidf[, 1:5])

# (4) stemming words
DFM <- dfm(corpus, tolower = FALSE, stem = TRUE, remove = NULL)
text_tfidf <- dfm_tfidf(DFM, scheme_tf = "prop", scheme_df = "count")
head(text_tfidf[, 1:5])

# (5) combining (2)-(4): stopwords removed, punctuation removed, words stemmed
DFM <- dfm(corpus, tolower = FALSE, stem = TRUE,
           remove = stopwords("english"), remove_punct = TRUE)
text_tfidf <- dfm_tfidf(DFM, scheme_tf = "prop", scheme_df = "count")
head(text_tfidf[, 1:5])

Task 6: Parts-of-Speech

For all texts, calculate the part-of-speech proportions and find out which POS tag is used most often by you in each text.

#install.packages("qdap")
library("qdap")

# Apply the part-of-speech tagger to each text of the corpus, then sort the
# tag proportions in decreasing order. R is case-sensitive: the corpus object
# is `corpus`; `Corpus` was never defined.
PoSText1 <- pos(corpus[1])
#PT1Sort = sort(PoSText1$POSfreq, decreasing = TRUE)
PT1Sort <- sort(PoSText1$POSprop, decreasing = TRUE)

PoSText2 <- pos(corpus[2])
#PT2Sort = sort(PoSText2$POSfreq, decreasing = TRUE)
PT2Sort <- sort(PoSText2$POSprop, decreasing = TRUE)

PoSText3 <- pos(corpus[3])
#PT3Sort = sort(PoSText3$POSfreq, decreasing = TRUE)
PT3Sort <- sort(PoSText3$POSprop, decreasing = TRUE)

# Display the proportions of PoS tags (decreasing); look up the PoS tag
# abbreviations online for a better understanding.
PT1Sort
PT2Sort
PT3Sort

LS0tCnRpdGxlOiAiU29sdXRpb25zOiBUZXh0IGRhdGEgJiB0ZXh0IG1pbmluZyIKc3VidGl0ZTogIkhvbWV3b3JrIHdlZWsgNCIKYXV0aG9yOiAiQiBLbGVpbmJlcmcsIEQgSGFtbW9ja3MsIEYgU29sZG5lciIKc3VidGl0bGU6IEFkdmFuY2VkIENyaW1lIEFuYWx5c2lzLCBVQ0wKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKKipTT0xVVElPTlMqKgoKIyMjIFRhc2sgMTogQ3JlYXRpbmcgeW91ciBDb3JwdXMKClNlbGVjdCB0aHJlZSBhc3NpZ25tZW50cy4gQ29udmVydCB0aGVtIHRvIFJBVyB0ZXh0IGZpbGVzLiBJbXBvcnQgaW50byBSIGFuZCBjcmVhdGUgYSBjb3JwdXMuCgoKYGBge3J9CiNMb2FkIFBhY2thZ2VzCnJlcXVpcmUocmVhZHIpCiNJbXBvcnQgVGV4dCBmaWxlcy4KCgp0ZXh0MTwtcmVhZF9maWxlKCdkYXRhL0d1dGVuYmVyZ0Jvb2tzL0FuQWZyaWNhbk1pbGxpb25haXJlLnR4dCcpCnRleHQyPC1yZWFkX2ZpbGUoJ2RhdGEvR3V0ZW5iZXJnQm9va3MvQ3JpbWVBbmRQdW5pc2htZW50LnR4dCcpCnRleHQzPC1yZWFkX2ZpbGUoJ2RhdGEvR3V0ZW5iZXJnQm9va3MvRGVhZE1lblRlbGxOb1RhbGVzLnR4dCcpCgojTG9hZCBQYWNrYWdlCnJlcXVpcmUocXVhbnRlZGEpCgojQ3JlYXRlIGEgY29ycHVzCmNvcnB1cyA9IGNvcnB1cyhjKHRleHQxLCB0ZXh0MiwgdGV4dDMpKQpgYGAKCgoKIyMjIFRhc2sgMjogVGV4dCBTdGF0aXN0aWNzCgpDYWxjdWFsdGUgdGhlIGF2ZXJhZ2UgbnVtYmVyIG9mIGNoYXJhY3RlcnMgcGVyIHdvcmQgYW5kIHRoZSBhdmVyYWdlIG51bWJlciBvZiB3b3JkcyBwZXIgc2VudGVuY2UuCgpgYGB7cn0KI0F2ZXJhZ2UgbnVtYmVyIG9mIGNoYXJhY3RlcnMgcGVyIHdvcmQKYXZnQ2hhclBlcldvcmQ8LW5jaGFyKGNvcnB1c1tdKS9udG9rZW4oY29ycHVzW10pCiNEaXNwbGF5IEFuc3dlciNhdmdDaGFyUGVyV29yZAphdmdDaGFyUGVyV29yZAoKI0F2ZXJhZ2UgbnVtYmVyIG9mIHdvcmRzIHBlciBzZW50ZW5jZQphdmdXb3JkUGVyU2VudGVuY2U8LW50b2tlbihjb3JwdXNbXSkvbnNlbnRlbmNlKGNvcnB1c1tdKQojRGlzcGxheSBSZXNwb25zZQphdmdXb3JkUGVyU2VudGVuY2UKYGBgCgoKIyMjIFRhc2sgMzogVGV4dCBNZXRyaWNzCldoYXQgaXMgdGhlIHR5cGUtdG9rZW4gcmF0aW8gKFRUUikgb2YgeW91ciB0ZXh0cyAoZWFjaCB0ZXh0IGluZGl2aWR1YWxseSk/CgpgYGB7cn0KI1RUUiBvZiBlYWNoIHRleHQKVGV4dDFfVFRSPC1udHlwZShjb3JwdXNbMV0pL250b2tlbihjb3JwdXNbMV0pClRleHQyX1RUUjwtbnR5cGUoY29ycHVzWzJdKS9udG9rZW4oY29ycHVzWzJdKQpUZXh0M19UVFI8LW50eXBlKGNvcnB1c1szXSkvbnRva2VuKGNvcnB1c1szXSkKCiNEaXNwbGF5IFRoZSBBbnN3ZXJzCnByaW50KGMoVFRSVGV4dDEsIFRUUlRleHQyLCBUVFJUZXh0MykpCmBgYAoKCiMjIyBUYXNrIDQ6IFdvcmQgRnJlcXVlbmNpZXMKQnVpbGQgYSB0ZXJtIGZyZXF1ZW5jeSBjb3VudCByZXByZXNlbnRhdGlvbiwgYW5kIHJldHJp
ZXZlIHRoZSB0b3AgZmVhdHVyZXMgKGhpbnQ6IHRvcGZlYXR1cmVzKSBmb3IgZWFjaCB0ZXh0LgoKYGBge3J9CiMgY3JlYXRlIGRvY3VtZW50LWZlYXR1cmUgbWF0cml4CkRGTSA9IGRmbShjb3JwdXMpCgojVVNlIHRvcGZlYXR1cmVzIHRvIGdldCB0aGUgdG9wIGZlYXR1cmVzCnRvcGZlYXR1cmVzKERGTVsxXSkKdG9wZmVhdHVyZXMoREZNWzJdKQp0b3BmZWF0dXJlcyhERk1bM10pCmBgYAoKCiMjIyBUYXNrIDU6IFRGLUlERgpOb3cgYnVpbGQgYSBURi1JREYgd2VpZ2h0ZWQgcmVwcmVzZW50YXRpb24gb2YgeW91ciBjb3JwdXMuIFBlcmZvcm0gdGhpcyB0cmFuc2Zvcm1hdGlvbiBpbiBmaXZlIGRpZmZlcmVudCB3YXlzOiAoMSkgYmFzZWQgb24gdGhlIHJhdyB0ZXh0cywgKDIpIHJlbW92aW5nIHN0b3B3b3JkcywgKDMpIHJlbW92aW5nIHB1bmN0dWF0aW9uLCAoNCkgc3RlbW1pbmcgdGhlIHdvcmRzLCBhbmQgKDUpIGNvbWJpbmluZyAoMiktKDQpLgoKYGBge3J9CiMgY3JlYXRlIHRmaWRmIHdlaWdodGluZwojIHJhdyB0ZXh0CkRGTSA9IGRmbShjb3JwdXMsIHRvbG93ZXIgPSBGQUxTRSwgc3RlbSA9IEZBTFNFLCByZW1vdmUgPSBOVUxMKQp0ZXh0X3RmaWRmID0gZGZtX3RmaWRmKERGTSwgc2NoZW1lX3RmID0gInByb3AiLCBzY2hlbWVfZGYgPSAiY291bnQiKQpoZWFkKHRleHRfdGZpZGZbLDE6NV0pCgojIHJlbW92aW5nIHN0b3B3b3JkcwpERk0gPSBkZm0oY29ycHVzLCB0b2xvd2VyID0gRkFMU0UsIHN0ZW0gPSBGQUxTRSwgcmVtb3ZlID0gc3RvcHdvcmRzKCJlbmdsaXNoIikpCnRleHRfdGZpZGYgPSBkZm1fdGZpZGYoREZNLCBzY2hlbWVfdGYgPSAicHJvcCIsIHNjaGVtZV9kZiA9ICJjb3VudCIpCmhlYWQodGV4dF90ZmlkZlssMTo1XSkKIyByZW1vdmluZyBwdW5jdHVhdGlvbgoKcHVuY3R1YXRpb24gPSBjKCIuIiwgIiwiLCAiOiIsICI7IiwgIigiLCAiKSIpCkRGTSA9IGRmbShjb3JwdXMsIHRvbG93ZXIgPSBGQUxTRSwgc3RlbSA9IEZBTFNFLCByZW1vdmUgPSBwdW5jdHVhdGlvbikKdGV4dF90ZmlkZiA9IGRmbV90ZmlkZihERk0sIHNjaGVtZV90ZiA9ICJwcm9wIiwgc2NoZW1lX2RmID0gImNvdW50IikKaGVhZCh0ZXh0X3RmaWRmWywxOjVdKQoKIyBzdGVtbWluZyB3b3JkcwpERk0gPSBkZm0oY29ycHVzLCB0b2xvd2VyID0gRkFMU0UsIHN0ZW0gPSBUUlVFLCByZW1vdmUgPSBOVUxMKQp0ZXh0X3RmaWRmID0gZGZtX3RmaWRmKERGTSwgc2NoZW1lX3RmID0gInByb3AiLCBzY2hlbWVfZGYgPSAiY291bnQiKQpoZWFkKHRleHRfdGZpZGZbLDE6NV0pCgojIGNvbWJpbmluZyAyLTQKcHVuY3R1YXRpb24gPSBjKCIuIiwgIiwiLCAiOiIsICI7IiwgIigiLCAiKSIpCkRGTSA9IGRmbShjb3JwdXMsIHRvbG93ZXIgPSBGQUxTRSwgc3RlbSA9IFRSVUUsIHJlbW92ZSA9IGMocHVuY3R1YXRpb24sIHN0b3B3b3JkcygiZW5nbGlzaCIpKSkKdGV4dF90ZmlkZiA9IGRmbV90ZmlkZihERk0sIHNjaGVtZV90ZiA9ICJwcm9wIiwgc2NoZW1lX2RmID0g
ImNvdW50IikKaGVhZCh0ZXh0X3RmaWRmWywxOjVdKQpgYGAKCiMjIyBUYXNrIDY6IFBhcnRzLW9mLVNwZWVjaApGb3IgYWxsIHRleHRzLCBjYWxjdWxhdGUgdGhlIHBhcnQtb2Ytc3BlZWNoIHByb3BvcnRpb25zIGFuZCBmaW5kIG91dCB3aGljaCBQT1MgdGFnIGlzIHVzZWQgbW9zdCBvZnRlbiBieSB5b3UgaW4gZWFjaCB0ZXh0LgoKYGBge3J9CiNpbnN0YWxsLnBhY2thZ2VzKCJxZGFwIikKbGlicmFyeSgicWRhcCIpCiMgYXBwbHkgcGFydCBvZiBzcGVlY2ggZnVuY3Rpb24gb24gZWFjaCB0ZXh0IG9mIHRoZSBjb3JwdXMgKyBzb3J0IGxpc3Qgb2YgZnJlcXVlbmNpZXMvcHJvcG9ydGlvbnMgb2YgdGFncwpQb1NUZXh0MSA9IHBvcyhDb3JwdXNbMV0pCiNQVDFTb3J0ID0gc29ydChQb1NUZXh0MSRQT1NmcmVxLCBkZWNyZWFzaW5nID0gVFJVRSkKUFQxU29ydCA9IHNvcnQoUG9TVGV4dDEkUE9TcHJvcCwgZGVjcmVhc2luZyA9IFRSVUUpCgpQb1NUZXh0MiA9IHBvcyhDb3JwdXNbMl0pCiNQVDJTb3J0ID0gc29ydChQb1NUZXh0MiRQT1NmcmVxLCBkZWNyZWFzaW5nID0gVFJVRSkKUFQyU29ydCA9IHNvcnQoUG9TVGV4dDIkUE9TcHJvcCwgZGVjcmVhc2luZyA9IFRSVUUpCgpQb1NUZXh0MyA9IHBvcyhDb3JwdXNbM10pCiNQVDNTb3J0ID0gc29ydChQb1NUZXh0MyRQT1NmcmVxLCBkZWNyZWFzaW5nID0gVFJVRSkKUFQzU29ydCA9IHNvcnQoUG9TVGV4dDMkUE9TcHJvcCwgZGVjcmVhc2luZyA9IFRSVUUpCgojIGRpc3BsYXkgZnJlcXVlbmNpZXMgb2YgUG9TIHRhZ3MgKGRlY3JlYXNpbmcpIC0gbG9vayB1cCBQb1MgYWJicmV2aWF0aW9ucyBvbmxpbmUgZm9yIGJldHRlciB1bmRlcnN0YW5kaW5nClBUMVNvcnQKUFQyU29ydApQVDNTb3J0CmBgYAoKLS0t