Text mining on a large dataset

data-mining r big-data text-mining
2022-02-09 17:11:16

I have a large dataset (460 MB) with a single column of logs containing 386,551 rows. I want to build a word cloud using clustering and an N-gram approach. My code is as follows:

library(readr)
AMC <- read_csv("All Tickets.csv")
Desc <- AMC[,4]

# Very large data, hence breaking it down before creating the corpus
# DataframeSource has been used instead of VectorSource to be able to handle the data

library(tm)
docs_new <- data.frame(Desc)

test1 <- docs_new[1:100000,]
test2 <- docs_new[100001:200000,]
test3 <- docs_new[200001:300000,]
test4 <- docs_new[300001:386551,]
test1 <- data.frame(test1)
test1 <- Corpus(DataframeSource(test1))
test2 <- data.frame(test2)
test2 <- Corpus(DataframeSource(test2))
test3 <- data.frame(test3)
test3 <- Corpus(DataframeSource(test3))
test4 <- data.frame(test4)
test4 <- Corpus(DataframeSource(test4))

# attach all the corpus
docs_new <- c(test1,test2,test3,test4)

docs_new <- tm_map(docs_new, tolower)
docs_new <- tm_map(docs_new, removePunctuation)
docs_new <- tm_map(docs_new, removeNumbers)
docs_new <- tm_map(docs_new, removeWords, stopwords(kind = "en"))
docs_new <- tm_map(docs_new, stripWhitespace)
docs_new <- tm_map(docs_new, stemDocument)
docs_new <- tm_map(docs_new, PlainTextDocument)

#tokenizer for tdm with ngrams
library(RWeka)
options(mc.cores=1) 
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm <- TermDocumentMatrix(docs_new, control = list(tokenize = BigramTokenizer))

This gives me the following result:

<<TermDocumentMatrix (terms: 1874071, documents: 386551)>>
Non-/sparse entries: 17313767/724406705354
Sparsity           : 100%
Maximal term length: 733
Weighting          : term frequency (tf)

I then convert it to a sparse dgCMatrix using:

library("Matrix")
mat <- sparseMatrix(i=tdm$i, j=tdm$j, x=tdm$v, dims=c(tdm$nrow, tdm$ncol))

When I try the following, I get a memory size error:

removeSparseTerms(tdm, 0.2)

Please advise further, as I am new to text analytics.

1 Answer

You are using R, and everything you are currently working on is kept in memory, hence the error. You simply don't have enough of it.
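
For a sense of scale, here is a rough back-of-the-envelope calculation (my own, based on the dimensions printed above and assuming 8 bytes per cell of a dense numeric matrix):

# Size of the reported term-document matrix if it were ever held densely:
# 1,874,071 terms x 386,551 documents x 8 bytes per cell.
terms <- 1874071
docs  <- 386551
terms * docs * 8 / 1024^3   # roughly 5400 GiB -- far beyond any workstation's RAM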

You would be better off creating the term frequencies from your original splits, rather than creating one large object.

Then add the frequencies together afterwards.
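
Here is a minimal sketch of that chunk-and-sum idea (assuming, as in your code, that the raw text is the fourth column of AMC, and reusing the RWeka bigram tokenizer; the 50,000-row chunk size and the helper name chunk_bigram_freq are arbitrary choices of mine):

library(tm)
library(RWeka)
library(slam)    # row_sums() works directly on the sparse matrix behind a TDM

# Count bigram frequencies for one chunk of raw text.
chunk_bigram_freq <- function(texts) {
  corp <- VCorpus(VectorSource(texts))
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, removeWords, stopwords("en"))
  corp <- tm_map(corp, stripWhitespace)
  BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
  tdm <- TermDocumentMatrix(corp, control = list(tokenize = BigramTokenizer))
  row_sums(tdm)                       # named vector: bigram -> count in this chunk
}

# Split the log column into chunks, count per chunk, then add the counts together.
chunks    <- split(AMC[[4]], ceiling(seq_along(AMC[[4]]) / 50000))
freq_list <- lapply(chunks, chunk_bigram_freq)

total_freq <- Reduce(function(a, b) {
  all_terms     <- union(names(a), names(b))
  out           <- setNames(numeric(length(all_terms)), all_terms)
  out[names(a)] <- a
  out[names(b)] <- out[names(b)] + b
  out
}, freq_list)

head(sort(total_freq, decreasing = TRUE), 20)   # most frequent bigrams overall

This keeps only one chunk's corpus and term-document matrix in memory at a time; the merged result is just a named numeric vector of bigram counts.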

Personally, I use the following code to create my word cloud.

## Clean code for a single column of a data frame, in this case the column named text
library(tm)

x <- alltweets
tweets.text <- x$text
tweets.text.cleaned <- gsub("@\\w+ *#", "", tweets.text)
tweets.text.cleaned <- gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", "", tweets.text.cleaned)
tweets.text.cleaned <- gsub("[^0-9A-Za-z///' ]", "", tweets.text.cleaned)
tweets.text.corpus <- Corpus(VectorSource(tweets.text.cleaned))
tweets.text.final <- tm_map(tweets.text.corpus, removePunctuation, mc.cores = 1)
tweets.text.final2 <- tm_map(tweets.text.final, content_transformer(tolower), mc.cores = 1)
tweets.text.final2 <- tm_map(tweets.text.final2, removeNumbers, mc.cores = 1)
tweets.text.final2 <- tm_map(tweets.text.final2, removePunctuation, mc.cores = 1)
tweets.text.final2 <- tm_map(tweets.text.final2, removeWords, stopwords("english"), mc.cores = 1)
tweets.text.final2 <- tm_map(tweets.text.final2, removeWords, c("amp", "&"))

# create corpus
housing.tweets.corpus <- Corpus(VectorSource(tweets.text))

# clean up by removing stop words
housing.tweets.corpus <- tm_map(housing.tweets.corpus, function(x) removeWords(x, stopwords()))

# install wordcloud if not already installed
install.packages("wordcloud")
library("wordcloud")

# generate the wordcloud
wordcloud(housing.tweets.corpus, min.freq = 2, scale = c(7, 0.5), colors = brewer.pal(8, "Dark2"),
          random.color = TRUE, random.order = FALSE, max.words = 500)
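
If you do go the per-chunk frequency route, note that wordcloud() also accepts a vector of words plus a matching vector of counts, so the summed frequencies can be plotted directly without rebuilding a corpus. A sketch, reusing the total_freq vector from the earlier example:

library(wordcloud)

# Plot the cloud straight from the aggregated bigram frequencies.
wordcloud(words = names(total_freq), freq = total_freq,
          min.freq = 2, scale = c(7, 0.5), max.words = 500,
          random.order = FALSE, random.color = TRUE,
          colors = brewer.pal(8, "Dark2"))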