在 JavaScript 中比较两个字符串并返回相似度百分比

IT技术 javascript string algorithm comparison
2021-02-27 22:49:15

我正在寻找一个 JavaScript 函数,它可以比较两个字符串并返回它们相似的可能性。我看过 soundex 但这对于多字串或非名称并不是很好。我正在寻找这样的功能:

    // Placeholder for the function being asked for: it should take two strings
    // and report how similar they are as a percentage. The body is intentionally empty.
    function compare(strA,strB){
    
    }
    
    compare("Apples","apple") = Some X Percentage.

该函数适用于所有类型的字符串,包括数字、多字值和名称。也许我可以使用一个简单的算法?

最终这些都没有达到我的目的,所以我使用了这个:

     // Decide whether a user's answer matches any of the accepted answers.
     //
     // c: comma-separated list of accepted answers; each answer is split into words.
     // u: raw user answer; tokenised with clean().
     //
     // Every (answer word, user word) pair scores one point when both words have a
     // SOUNDEX code and the codes are equal, or -- when either word has no code --
     // when the user word contains the answer word as a substring. An accepted
     // answer matches when points / (number of its words) exceeds 0.5.
     //
     // Returns true as soon as one accepted answer matches, otherwise false.
     function compare(c, u) {
         var answers = c.split(",");
         var userWords = clean(u);
         for (var z = 0; z < answers.length; z++) {
             // Native trim() replaces the deprecated jQuery $.trim().
             var rawWords = answers[z].trim().split(" ");
             // Drop empty words (blank answer, trailing comma, double spaces):
             // an empty word is a substring of everything and would match any input.
             var answerWords = [];
             for (var w = 0; w < rawWords.length; w++) {
                 if (rawWords[w] !== "") {
                     answerWords.push(rawWords[w]);
                 }
             }
             if (answerWords.length === 0) {
                 continue;
             }
             var points = 0;
             for (var x = 0; x < answerWords.length; x++) {
                 // Loop-invariant work hoisted out of the inner loop.
                 var answerCode = soundex(answerWords[x]);
                 // clean() lowercases the user words, so the substring fallback
                 // must compare against a lowercased answer word as well.
                 var answerLower = answerWords[x].toLowerCase();
                 for (var y = 0; y < userWords.length; y++) {
                     var userCode = soundex(userWords[y]);
                     if (userCode != null && answerCode != null) {
                         if (userCode === answerCode) {
                             points = points + 1;
                         }
                     } else if (userWords[y].indexOf(answerLower) > -1) {
                         points = points + 1;
                     }
                 }
             }
             if ((points / answerWords.length) > 0.5) {
                 return true;
             }
         }
         return false;
     }
        
        // SOUNDEX code table, keyed by lowercase letter.
        //   -1  A E I O U Y: not coded themselves, but still take part in coding
        //    0  H W: omitted under the modern census-archive rules,
        //       handled like -1 under the older rules
        //    1  B F P V
        //    2  C G J K Q S X Z
        //    3  D T
        //    4  L
        //    5  M N
        //    6  R
        // Intended to be invoked with `new`; fills `this` with one property per letter.
        function makesoundex() {
            var letters = "abcdefghijklmnopqrstuvwxyz"
            var codes = [-1, 1, 2, 3, -1, 1, 2, 0, -1, 2, 2, 4, 5,
                         5, -1, 1, 2, 6, 2, 3, -1, 1, 0, 2, -1, 2]
            // letters and codes are parallel: codes[i] belongs to letters.charAt(i)
            for (var i = 0; i < letters.length; i++) {
                this[letters.charAt(i)] = codes[i]
            }
        }

        // Single shared instance of the table.
        var sndx = new makesoundex()
        
        // Check that the input is valid: a non-empty string made up solely of the
        // ASCII letters a-z / A-Z.
        // Returns false for "", null/undefined and any non-string value (a number
        // has no characters to inspect and would otherwise be accepted by accident).
        function isSurname(name) {
            if (typeof name !== "string") {
                return false
            }
            return /^[A-Za-z]+$/.test(name)
        }
        
        // Collapse out directly adjacent sounds: a character is dropped when its
        // sndx code equals the code of the character immediately before it in the
        // input (this includes two neighbouring uncoded letters, e.g. "oy").
        // The first character is always kept.
        // 1. Assume that surname contains only lowercase letters
        // 2. An empty string is returned unchanged
        // Written as a loop so that empty or very long input cannot exhaust the stack.
        function collapse(surname) {
            if (surname.length <= 1) {
                return surname
            }
            var result = surname.charAt(0)
            for (var i = 1; i < surname.length; i++) {
                var current = surname.charAt(i)
                if (sndx[current] !== sndx[surname.charAt(i - 1)]) {
                    result += current
                }
            }
            return result
        }
        
        // Pre-pass for the new National Archives method: H and W are completely
        // ignored. Every character after the first whose sndx code is falsy
        // (0 for h/w, or no table entry at all) is removed; the first character
        // is always kept.
        // 1. Assume that surname contains only lowercase letters
        // 2. An empty string is returned unchanged
        // Written as a loop so that empty or very long input cannot exhaust the stack.
        function omit(surname) {
            if (surname.length <= 1) {
                return surname
            }
            var kept = surname.charAt(0)
            for (var i = 1; i < surname.length; i++) {
                var current = surname.charAt(i)
                if (sndx[current]) {
                    kept += current
                }
            }
            return kept
        }
        
        // Format the coded sequence: the upper-cased first character, a dash, then
        // exactly three digits. The digits are the first three positive sndx codes
        // found after the first character; missing digits are filled with "0".
        function output_sequence(seq) {
            var digits = []
            var tail = seq.slice(1)
            var pos = 0
            while (pos < tail.length && digits.length < 3) {
                var code = sndx[tail.charAt(pos)]
                // codes of -1, 0 and unknown characters contribute no digit
                if (code > 0) {
                    digits.push(code)
                }
                pos++
            }
            while (digits.length < 3) {
                digits.push("0")
            }
            return seq.charAt(0).toUpperCase() + "-" + digits.join("")
        }
        
        // Compute the SOUNDEX code for the surname.
        // Returns null when isSurname() rejects the value. Otherwise the value is
        // lowercased, passed through omit() and then collapse(), and the result is
        // formatted by output_sequence().
        function soundex(value) {
            if (!isSurname(value)) {
                return null
            }
            var withoutIgnored = omit(value.toLowerCase())
            return output_sequence(collapse(withoutIgnored))
        }
        
        // Tokenise a raw answer for comparison.
        //
        // u:        the raw text. Commas are removed, the text is lowercased and
        //           split on single spaces.
        // excluded: optional array of (lowercase) words to leave out of the result.
        //           Defaults to the original placeholder list.
        //
        // Returns the remaining words with every "$" and "&" stripped. Empty words
        // (including words that become empty after stripping) are dropped.
        function clean(u, excluded) {
            var cw = excluded == null ? ["ARRAY OF WORDS TO BE EXCLUDED FROM COMPARISON"] : excluded;
            var words = u.replace(/,/g, "").toLowerCase().split(" ");
            var n = [];
            for (var y = 0; y < words.length; y++) {
                var word = words[y];
                // A word is excluded when it equals ANY entry of the list, not
                // merely when it differs from the first one.
                if (word === "" || cw.indexOf(word) !== -1) {
                    continue;
                }
                // Don't use & or $ in comparison (strip every occurrence)
                var val = word.replace(/[$&]/g, "");
                if (val !== "") {
                    n.push(val);
                }
            }
            return n;
        }
6个回答

这是基于 Levenshtein 距离的答案https://en.wikipedia.org/wiki/Levenshtein_distance

/**
 * Similarity of two strings as a ratio between 0 and 1, derived from their
 * edit distance: (longerLength - editDistance) / longerLength.
 *
 * @param {string} s1 First string.
 * @param {string} s2 Second string.
 * @returns {number} 1.0 when both strings are empty; otherwise the ratio above.
 */
function similarity(s1, s2) {
  // editDistance is always called as (longer, shorter); on a tie s1 counts as longer.
  var s1IsShorter = s1.length < s2.length;
  var longer = s1IsShorter ? s2 : s1;
  var shorter = s1IsShorter ? s1 : s2;
  var size = longer.length;
  if (size === 0) {
    return 1.0;
  }
  var distance = editDistance(longer, shorter);
  return (size - distance) / size;
}

用于计算编辑距离

/**
 * Case-insensitive edit distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions that turn one
 * into the other.
 *
 * @param {string} s1 First string.
 * @param {string} s2 Second string.
 * @returns {number} The edit distance (0 when the strings match ignoring case).
 */
function editDistance(s1, s2) {
  var a = s1.toLowerCase();
  var b = s2.toLowerCase();

  // previous[k] = distance between the first (row - 1) chars of a and the first k chars of b
  var previous = [];
  for (var k = 0; k <= b.length; k++) {
    previous.push(k);
  }

  for (var row = 1; row <= a.length; row++) {
    var current = [row];
    for (var col = 1; col <= b.length; col++) {
      var diagonal = previous[col - 1];
      if (a.charAt(row - 1) === b.charAt(col - 1)) {
        // matching characters cost nothing extra
        current.push(diagonal);
      } else {
        // cheapest of substitute / insert / delete, plus one edit
        current.push(Math.min(diagonal, current[col - 1], previous[col]) + 1);
      }
    }
    previous = current;
  }
  return previous[b.length];
}

用法

similarity('Stack Overflow','Stack Ovrflw')

返回 0.8571428571428571


你可以在下面玩它:

针对多个单词的改进: var similarity2 = function(s1, s2){ var split1 = s1.split(' '); var split2 = s2.split(' '); var sum = 0; var max = 0; var temp = 0; for(var i=0; i<split1.length;i++){ max = 0; for(var j=0; j<split2.length;j++){ temp = similarity(split1[i], split2[j]); if(max < temp) max = temp; } console.log(max); sum += max / split1.length; } return sum; };
2021-04-18 22:49:15
@overlord1234 上面的方法是否适用于这样的字符串: 9e27dbb9ff6eea70821c02b4457cbc6b7eb8e12a64f46c192c3a05f1bc1519acd101193dac157c6233d907c6233d90808c6208080808c65c608c65c65c608c65c65c808c68c65c68c68c6b7b8e13
2021-04-19 22:49:15
它确实可以处理没有附加语义的字符串。请尝试运行内嵌代码片段(感谢 David)。当我输入上述字符串时,我得到 0.17857142857142858 的相似度。
2021-04-25 22:49:15
但是,对于长字符串来说太慢了。
2021-04-25 22:49:15
@hyperfkcb 他实现的是编辑距离算法,该算法(大致上)计算有多少字符处于错误位置;因此为了计算百分比,他取编辑距离可能的最大值(longerLength),然后计算 (longerLength - editDistance) / longerLength。
2021-05-16 22:49:15

这个用于字符串相似度的库(string-similarity)对我来说非常好用!

这是示例 -

var similarity = stringSimilarity.compareTwoStrings("Apples","apple");    // => 0.88
他们已经删除了大多数依赖项,包括 lodash
2021-04-26 22:49:15
是的,在本地添加包时会发生这种情况。但是,我们可以使用CDN来减小包大小。这里是 CDN 链接 - jsdelivr.com/package/npm/lodash - jsdelivr.com/package/npm/string-similarity
2021-05-02 22:49:15
太好了,除了 stringSimilarity 有一个名为 lodash 的依赖项,其中包含 1,000 多个文件被放入您的项目,以便您可以获得字符串相似度。
2021-05-16 22:49:15

这是一个非常简单的函数,它进行比较并返回基于等价的百分比。虽然它尚未针对所有可能的场景进行测试,但它可能会帮助您入门。

/**
 * Position-by-position similarity of two strings, as a percentage string.
 *
 * Characters are compared index by index (case-sensitively) over the shorter
 * length; the number of matches is divided by the longer length.
 *
 * @param {string} a First string.
 * @param {string} b Second string.
 * @returns {string} The percentage followed by "%", e.g. "75%". Two empty
 *   strings are identical and yield "100%".
 */
function similar(a,b) {
    var minLength = Math.min(a.length, b.length);
    var maxLength = Math.max(a.length, b.length);
    // Avoid 0 / 0 (which produced "NaN%") when both inputs are empty.
    if (maxLength === 0) {
        return "100%";
    }

    var equivalency = 0;
    for (var i = 0; i < minLength; i++) {
        if (a[i] === b[i]) {
            equivalency++;
        }
    }

    var weight = equivalency / maxLength;
    return (weight * 100) + "%";
}
// Example calls (alert() is a browser API); the trailing comment on each line
// is the string that is displayed.
alert(similar("test","tes"));   // 75%
alert(similar("test","test"));  // 100%
alert(similar("test","testt")); // 80%
alert(similar("test","tess"));  // 75%
这种方法的问题在于 "atest" 和 "test" 会返回 0%,而我们知道这并不正确。
2021-04-19 22:49:15

求两个字符串的相似度,可用的方法不止一两种,但我最倾向于使用 Dice 系数(Dice's Coefficient)。据我所知,它比使用 Levenshtein 距离效果更好。

使用npm 中的这个“字符串相似性”包,您将能够处理我上面所说的内容。

一些简单的使用示例是

// Load the third-party "string-similarity" package (installed from npm).
var stringSimilarity = require('string-similarity');

// Score how similar the two strings are.
var similarity = stringSimilarity.compareTwoStrings('healed', 'sealed'); 

// Compare the first argument against each candidate in the array.
// NOTE(review): the shape of the returned value is not shown here -- check the package docs.
var matches = stringSimilarity.findBestMatch('healed', ['edward', 'sealed', 'theatre']);

有关更多信息,请访问上面给出的链接。谢谢你。

欢迎提供指向解决方案的链接,但请确保您的答案在没有该链接的情况下也有用:在链接周围添加上下文,以便其他用户了解它是什么以及为什么要放在那里,然后引用您所链接页面中最相关的部分,以防目标页面不可用。仅包含链接的回答可能会被删除。
2021-05-01 22:49:15

只是我很快写的一个可能足以满足您的目的:

/**
 * Positional similarity score of two strings.
 *
 * Characters are compared index by index. A pair that differs only in case
 * costs 1; a pair that differs outright costs 4; every character by which
 * the lengths differ costs 4. The total penalty is divided by
 * 2 * (strA.length + strB.length) and subtracted from 1.
 *
 * @param {string} strA First string.
 * @param {string} strB Second string.
 * @returns {number} A score from 0 (nothing in common) to 1 (identical).
 *   Two empty strings score 1; scores that would fall below 0 are clamped to 0.
 */
function Compare(strA,strB){
    var totalLength = strA.length + strB.length;
    // Two empty strings are identical; this also avoids 0 / 0 = NaN.
    if (totalLength === 0) {
        return 1;
    }

    // Each character of length difference counts as a missing character.
    var penalty = 4 * Math.abs(strA.length - strB.length);
    for (var i = 0; i < strA.length; i++) {
        var other = strB[i];
        // Past the end of strB (already charged above) or an exact match: no cost.
        if (typeof other == 'undefined' || strA[i] === other) {
            continue;
        }
        penalty += (strA[i].toLowerCase() === other.toLowerCase()) ? 1 : 4;
    }

    // With very different lengths the penalty can exceed the normaliser;
    // clamp so the result stays inside the documented 0..1 range.
    return Math.max(0, 1 - penalty / (2 * totalLength));
}

对于相同但大小写不同的字符,其权重是完全不同或缺失的字符的四分之一。它返回一个介于 0 和 1 之间的数字:1 表示两个字符串相同,0 表示它们没有相似之处。例子:

// Example calls; the trailing comment on each line is the value Compare returns.
Compare("Apple", "Apple")    // 1
Compare("Apples", "Apple")   // 0.8181818181818181
Compare("Apples", "apple")   // 0.7727272727272727
Compare("a", "A")            // 0.75
Compare("Apples", "appppp")  // 0.45833333333333337
Compare("a", "b")            // 0
@Paulpro:这不会使您的算法不正确,但会使其成为这个问题的糟糕答案......
2021-04-24 22:49:15
@Kousha,它是按位置比较的。“Apple”和“zApple”在相同位置上只有一个共同的字母(第二个 p)。
2021-04-27 22:49:15
@Paulpro Apple 和 zApple 在逻辑上有五个共同的字母。这是你的实现错误。Apple、zApple、Applez 都是类似的。
2021-04-27 22:49:15
@Kousha,根据此算法,zApple 并不相似,因为它是按位置比较的。这并不会使算法不正确。
2021-05-01 22:49:15
不太准确: Compare("Apple", "zApple") = 0.07 ,而 Compare("Apple", "Applez") = 0.84
2021-05-02 22:49:15