这可能算不上一个直接的答案,但您可以通过模拟来考察经验分布,从而获得一个粗略的认识(R 代码见下文)。
模拟结果表明,对于长度超过约 100 的字符串,Levenshtein 距离的分布似乎是对称的,并且非常窄,集中在字符串长度的约 0.53 倍附近。
# dependencies
library(ggplot2); theme_set(theme_classic())
library(parallel)
library(RecordLinkage)
# Simulation settings ----
# Letters that the random strings are drawn from.
alphabet <- strsplit("ACGT", split = "", fixed = TRUE)[[1]]
# Number of random string pairs simulated for every string length.
Nsim <- 1000
# String lengths to simulate: 60, 80, ..., 500.
read_lengths <- seq(from = 60, to = 500, by = 20)
#' Create a random string of a given length
#'
#' Draws `n` letters from `alph` with replacement and concatenates them
#' into a single string.
#'
#' @param n Number of letters in the generated string.
#' @param alph Vector of letters to sample from; defaults to the `alphabet`
#'   variable defined in the script settings.
#' @return A length-one character vector made of `n` letters from `alph`.
random_read <- function(n, alph = alphabet) {
  # Spell out TRUE: the shorthand `T` is an ordinary variable that can be
  # reassigned, which would silently switch this to sampling without
  # replacement.
  letters_drawn <- sample(alph, size = n, replace = TRUE)
  paste(letters_drawn, collapse = "")
}
# simulate: for every string length N, compute the Levenshtein distance
# between Nsim independently drawn pairs of random strings of length N.
# `res` is a list with one element (the Nsim distances) per entry of
# `read_lengths`.
res <- mclapply(
  read_lengths,
  function(N) {
    replicate(Nsim, levenshteinDist(random_read(N), random_read(N)))
  },
  # mclapply() relies on forking and stops with an error on Windows when
  # mc.cores > 1, so run serially there and keep 6 workers elsewhere.
  mc.cores = if (.Platform$OS.type == "windows") 1L else 6L
)
# arrange results as a data.frame with one row per simulated pair:
# `dist` is the simulated distance and `length` the string length it was
# simulated for. lengths() always returns an integer vector, whereas
# sapply(res, length) changes its return type on empty input.
res_df <- data.frame(
  dist = unlist(res),
  length = rep(read_lengths, times = lengths(res))
)
# Density of the length-normalised distance, one curve per string length
# (colour encodes the length).
ggplot(
  data = res_df,
  mapping = aes(x = dist / length, colour = length, group = length)
) +
  geom_density() +
  ggtitle(label = "Distribution of Levenshtein distance / length")
# Length-normalised distance plotted against string length: one violin per
# length, with a black smoothed trend line drawn on top.
ggplot(
  data = res_df,
  mapping = aes(x = length, y = dist / length, colour = length)
) +
  geom_violin(mapping = aes(group = length)) +
  geom_smooth(colour = "black", lwd = 1) +
  ggtitle(label = "Distribution of Levenshtein distance / length")