library(mallet)
library(wordcloud)

# Insert path to the directory where this script is kept.
# Relative paths used by this script (e.g. "stopwords.txt") resolve against it.
setwd("~/Desktop/topic_model")

# Number of topics; passed to MalletLDA() as num.topics.
n.topics <- 20

# Insert path to the modified_texts directory.
# The result's id and text columns are passed to mallet.import() below.
documents <- mallet.read.dir("~/Desktop/topic_model/modified_texts")

# Build the MALLET instances from the document ids and texts.
# "stopwords.txt" is presumably the stop-word list file (confirm against the
# mallet.import() signature of the installed package version).
# token.regexp: a token starts and ends with a Unicode letter (\p{L}) and has
# one or more letters or punctuation characters (\p{P}) in between, so tokens
# are at least three characters long and may contain internal punctuation.
mallet.instances <- mallet.import(documents$id, documents$text, "stopwords.txt",
                                  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}")

# Create a topic trainer object with n.topics topics and load the imported
# documents into it.
topic.model <- MalletLDA(num.topics=n.topics)
topic.model$loadDocuments(mallet.instances)
# NOTE(review): in the mallet package's own example these arguments mean
# "optimize hyperparameters every 20 iterations, after 50 burn-in
# iterations" -- confirm against the installed version.
topic.model$setAlphaOptimization(20, 50)

# Train a model.
# We can specify the number of iterations. Here we'll use a large round number.
topic.model$train(1000)

# Run through a few iterations where we pick the best topic for each token,
# rather than sampling from the posterior distribution.
topic.model$maximize(50)

# Basic variables extracted from the trained model.
# Both matrices are requested smoothed and normalized.
doc.topics <- mallet.doc.topics(
  topic.model,
  normalized = TRUE,
  smoothed = TRUE
)
topic.words <- mallet.topic.words(
  topic.model,
  normalized = TRUE,
  smoothed = TRUE
)
# One label per topic, built from three top words.
topic.labels <- mallet.topic.labels(topic.model, topic.words, num.top.words = 3)
# NOTE: this variable shares its name with the mallet.word.freqs() function;
# the name is kept so existing references to it keep working.
mallet.word.freqs <- mallet.word.freqs(topic.model)

# Doc id vector: each document's file name with its directory and the
# trailing ".txt" extension removed.
# basename() drops the directory part regardless of which path was given to
# mallet.read.dir(), so no path has to be repeated here. The extension
# pattern escapes the dot and is anchored at the end: an unescaped,
# unanchored ".txt" would also delete any character followed by "txt"
# anywhere in the name (e.g. "my_txt_notes" would become "my_notes").
doc.ids <- sub("\\.txt$", "", basename(as.character(documents$id)))

# Return the `num` documents in which topic `x` has the largest share.
#
# Arguments:
#   x   - topic index, used as a column of `doc.topics` and as a position
#         in `topic.labels`.
#   num - maximum number of rows to return.
#
# Returns a data frame with four columns, in this order: the topic index,
# the topic's label, the topic's share of each document as a percentage
# rounded to two decimals, and the document id. Rows are sorted by
# percentage, largest first.
#
# Reads the global objects `doc.topics`, `topic.labels` and `doc.ids`.
top.docs <- function(x, num) {
  df <- data.frame(
    x,
    topic.labels[x],
    round(100 * (doc.topics[, x]), digits = 2),
    doc.ids
  )
  # Sort on the percentage column as a plain vector. Passing a one-column
  # data frame (df[3]) to order() is fragile: recent R versions warn
  # "cannot xtfrm data frames".
  docs <- df[order(df[[3]], decreasing = TRUE), ]
  head(docs, num)
}

# Create file containing a table of the top 50 articles for each topic.
# The topic range comes from n.topics rather than a hard-coded 1:20, so the
# table stays complete (and in bounds) when the number of topics changes.
# The per-topic tables are built first and bound once, instead of growing a
# data frame inside a loop.
topic.data <- do.call(rbind, lapply(seq_len(n.topics), top.docs, num = 50))
colnames(topic.data) <- c("topic number", "top three words", "percentage", "document")
write.csv(topic.data, file = "top_docs.csv")


# Make wordcloud PNG files: one file per topic, named topic_<index>.png,
# drawn from that topic's 50 top words and their weights.
# The topic range comes from n.topics rather than a hard-coded 1:20.
for (x in seq_len(n.topics)) {
  top.words <- mallet.top.words(topic.model, topic.words[x, ], 50)
  png(paste0("topic_", x, ".png"), width = 1200, height = 1200, units = "px", res = 300)
  # Close the PNG device even when drawing fails, so an error does not leave
  # a graphics device open; the error itself is still raised.
  tryCatch(
    wordcloud(top.words[, 1], top.words[, 2], scale = c(3, .5),
              max.words = Inf, min.freq = 3, random.order = FALSE,
              rot.per = 0, use.r.layout = FALSE, fixed.asp = TRUE),
    finally = dev.off()
  )
}



