其他选择是...
- 比较相似的文本序列(以单词为单位的 n-gram)
- 比较相似的字符序列(以单个字符为单位的 n-gram)
- 使用模糊匹配
模糊匹配:
library(fuzzyjoin)
library(dplyr) # supplies %>%, group_by() and slice_min() used below
# https://stackoverflow.com/questions/26405895/how-can-i-match-fuzzy-match-strings-from-two-datasets
# Two small example tables whose `name` columns are similar but not identical.
a <- data.frame(
  name = c("Ace Co", "Bayes", "asd", "Bcy", "Baes", "Bays"),
  price = c(10, 13, 2, 1, 15, 1)
)
b <- data.frame(
  name = c("Ace Co.", "Bayes Inc.", "asdf"),
  qty = c(9, 99, 10)
)
# Find matches
# Pair every row of `a` with rows of `b` and record the Jaro-Winkler distance
# between the two names in `dist`. Jaro-Winkler distances lie in [0, 1], so
# max_dist = 99 never filters a pair out; the closest candidate per `a` name
# is picked afterwards.
stringdist_join(
  a, b,
  by = "name",
  mode = "left",
  ignore_case = FALSE,
  method = "jw",
  max_dist = 99,
  distance_col = "dist"
) %>%
  group_by(name.x) %>%
  # Keep the row(s) with the smallest distance in each group (ties are kept).
  slice_min(dist, n = 1)
这给出了一个词和另一个词之间的“距离”。所以如果你知道真正的产品名称,你也许可以通过距离找到错误的名称。
# A tibble: 6 x 5
# Groups: name.x [6]
name.x price name.y qty dist
<fct> <dbl> <fct> <dbl> <dbl>
1 Ace Co 10 Ace Co. 9 0.0476
2 Bayes 13 Bayes Inc. 99 0.167
3 asd 2 asdf 10 0.0833
4 Bcy 1 Bayes Inc. 99 0.378
5 Baes 15 Bayes Inc. 99 0.2
6 Bays 1 Bayes Inc. 99 0.2
或者,您可以查看字符串序列中的相似性:
如果您的产品名称所包含的字符序列比较独特(不易与其他名称混淆),并且错误的名称只是其中一部分写错了,您可以尝试以下方法:
library(dplyr)
library(tidytext)
library(fuzzyjoin)
library(tokenizers)
##############################
# Compare text sequences
# Two sample sentences, each stored in a one-row, one-column tibble
# (the column is named after the variable: `text1` / `text2`).
text1 <- "Hi my name is Bixi and I like cycling a lot. It is just great!"
mytext1 <- tibble(text1)
text2 <- "Hi my name is Lissi and I'm good in swimming. It is just great!"
mytext2 <- tibble(text2)
# Split each sentence into overlapping 4-word n-grams, one per row, in a
# column called `ngram`.
ngram1 <- unnest_tokens(mytext1, ngram, text1, token = "ngrams", n = 4)
ngram2 <- unnest_tokens(mytext2, ngram, text2, token = "ngrams", n = 4)
# Find matching sequence(s)
# Keep the n-grams of the first text that also occur in the second one.
semi_join(ngram1, ngram2, by = "ngram")
##############################
# Compare sequences of single letters
# tokenize_character_shingles() returns a list holding one character vector
# per input string; each vector contains the 10-character shingles of that
# string (punctuation and spaces are kept: strip_non_alphanum = FALSE).
ngram3 <- tokenize_character_shingles(
  mytext1$text1,
  n = 10, n_min = 10, strip_non_alphanum = FALSE
)
ngram4 <- tokenize_character_shingles(
  mytext2$text2,
  n = 10, n_min = 10, strip_non_alphanum = FALSE
)
# Each input is a single string, so take the first (only) list element and
# give the column an explicit name. as.data.frame() on the unnamed list would
# derive the column name from the token content, which makes the join key
# depend on the data.
ngram3 <- data.frame(ngram = ngram3[[1]], stringsAsFactors = FALSE)
ngram4 <- data.frame(ngram = ngram4[[1]], stringsAsFactors = FALSE)
# Find matching sequences of single letters
# Keep the shingles of the first text that also occur in the second one.
semi_join(ngram3, ngram4, by = "ngram")
你也可以看看我的 GitHub,那里还有一些与此相关的其他方案。