下面是一个基于 n-gram 的(二分类)文本分类的最小实现,使用基础 R 编写。首先需要从文本中提取 n-gram,然后用某个模型(本例中为 Lasso)预测每个 n-gram 所属的类别。只需相应地更换模型,即可轻松扩展到多分类问题。请注意,这只是一个仍有改进空间的最小示例。
另请参阅:https://github.com/Bixi81/R-ml/blob/master/NLP_ngram_short_text.R
glmnet 的 Python 实现见:https://web.stanford.edu/~hastie/glmnet_python/
数据:
# Toy data set: six short strings, each with a binary class label.
# The strings differ slightly from one another, but the pattern is clear:
# the "ab..." style strings are class 1, the "fg..." style strings are class 0.
df <- data.frame(
  text = c(
    "ab13ab12ab16",
    "ag16ag16fg16",
    "ab12ab12ab12",
    "fg12fg16fg16",
    "ab16ab12af12",
    "fg16fg16fg16"
  ),
  target = c(1, 0, 1, 0, 1, 0)
)
head(df)
text target
1 ab13ab12ab16 1
2 ag16ag16fg16 0
3 ab12ab12ab12 1
4 fg12fg16fg16 0
5 ab16ab12af12 1
6 fg16fg16fg16 0
Ngram:
# Split every text in `df$text` into overlapping character n-grams and build
# the training frame: one row per n-gram, holding the label of the text the
# n-gram came from plus a one-hot encoding of the n-gram itself.
#
# Produces:
#   ngramdf - data frame with one column `ngram` (one row per n-gram)
#   targets - data frame with one column `target` (label of the source row)
#   obs     - data frame with one column `obs` (index of the source row in df)
#   dummies - one-hot matrix with one column per distinct n-gram
#   train   - `targets` followed by `dummies`; the label is the first column

# Width of the n-grams; 2 gives sequences of two letters/digits.
ngram_size <- 2L
texts <- as.character(df$text)

# Number of complete n-grams per text. Texts shorter than `ngram_size` yield
# zero n-grams and therefore contribute no rows.
ngram_counts <- pmax(nchar(texts) - ngram_size + 1L, 0L)

# One entry per n-gram: the row it came from and its start position there.
# `sequence()` concatenates 1..k for every count k, so the order is row by
# row, left to right within each text.
row_index <- rep(seq_along(texts), times = ngram_counts)
start_pos <- sequence(ngram_counts)

ngramdf <- data.frame(
  ngram = substring(texts[row_index], start_pos, start_pos + ngram_size - 1L),
  stringsAsFactors = FALSE
)
targets <- data.frame(target = df$target[row_index])
obs <- data.frame(obs = row_index)

# Dummy ("one-hot") encoding of all n-grams; dropping the intercept (-1)
# keeps an indicator column for every distinct n-gram.
dummies <- model.matrix(~ . - 1, data = ngramdf)

# Bind target and dummies into the training data frame.
train <- cbind(targets, dummies)
拟合模型:
# Fit a cross-validated Lasso (binomial) on the per-n-gram training frame and
# average the predicted probabilities over the n-grams of each original text.
# https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
#
# Expects `train` (label in the first column, one-hot n-gram indicators in the
# remaining columns), `obs` (source row of every n-gram) and `df` (original
# texts and labels).
# NOTE: cv.glmnet draws its cross-validation folds at random, so the fitted
# probabilities vary between runs unless a seed is set beforehand.
library(glmnet)

# Build the design matrix and response once instead of on every call.
x <- as.matrix(train[, -1])
y <- train[, 1]

cvfit <- cv.glmnet(x, y, family = "binomial", type.measure = "class")

# Predicted class per n-gram.
classes <- predict(cvfit, newx = x, s = "lambda.min", type = "class")
# Predicted probability per n-gram (needs type = "response").
probs <- predict(cvfit, newx = x, s = "lambda.min", type = "response")

# Mean predicted probability over all n-grams belonging to the same text.
per_ngram <- data.frame(
  observation = obs$obs,
  estimated_prob = as.numeric(probs)
)
result <- aggregate(estimated_prob ~ observation, data = per_ngram, FUN = mean)

# Attach the original text and label by observation index rather than by
# position, so texts that produced no n-grams cannot shift the alignment.
result <- cbind(result, df[result$observation, c("text", "target")])
colnames(result) <- c("observation", "estimated_prob", "text", "original_target")
rownames(result) <- NULL
结果:
observation estimated_prob text original_target
1 1 0.7173567 ab13ab12ab16 1
2 2 0.3020248 ag16ag16fg16 0
3 3 0.7674571 ab12ab12ab12 1
4 4 0.2922798 fg12fg16fg16 0
5 5 0.6815783 ab16ab12af12 1
6 6 0.2393033 fg16fg16fg16 0