Lucene4.3.1字符串距离接口StringDistance实现之JaroWinklerDistance源码解析

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/asty9000/article/details/81349129

JaroWinklerDistance类是Jaro-Winkler距离(Jaro-Winkler Distance)算法的实现,在标准Jaro-Winkler算法的基础上做了一些小的调整。与标准算法不同的是:计算Jaro-Winkler相似度时使用的前缀缩放系数p取值为两个字符串中较长者长度的倒数,且最大为0.1;另外公共前缀的长度没有标准算法中最多4个字符的限制,而是最多可达较短字符串的长度。

package org.apache.lucene.search.spell;

import java.util.Arrays;


/**
 * String similarity based on the Jaro-Winkler metric.
 *
 * <p>The score is the Jaro similarity of the two strings; when that score
 * reaches the configured threshold (0.7 by default) it is boosted in
 * proportion to the length of the strings' common prefix. The prefix scaling
 * factor is {@code min(0.1, 1 / length of the longer string)}, and the prefix
 * length is not capped other than by the length of the shorter string.
 */
public class JaroWinklerDistance implements StringDistance {

  /** Jaro score from which the common-prefix boost is applied. */
  private float threshold = 0.7f;

  /** Creates an instance using the default threshold of 0.7. */
  public JaroWinklerDistance() {}

  /**
   * Gathers the raw counts needed to score two strings.
   *
   * @param s1 first string
   * @param s2 second string
   * @return a four-element array: number of matching characters, number of
   *         transpositions (already halved), length of the common prefix,
   *         and length of the longer string
   */
  private int[] matches(String s1, String s2) {
    // On equal lengths s2 is treated as the "longer" string.
    final boolean firstIsLonger = s1.length() > s2.length();
    final String longer = firstIsLonger ? s1 : s2;
    final String shorter = firstIsLonger ? s2 : s1;
    final int longLen = longer.length();
    final int shortLen = shorter.length();

    // How far from its own index a character may be matched.
    final int window = Math.max(longLen / 2 - 1, 0);

    // Which positions of each string have been paired with the other one.
    final boolean[] shorterMatched = new boolean[shortLen];
    final boolean[] longerMatched = new boolean[longLen];
    int matchCount = 0;
    for (int i = 0; i < shortLen; i++) {
      final char wanted = shorter.charAt(i);
      final int end = Math.min(i + window + 1, longLen);
      int j = Math.max(i - window, 0);
      // Advance to the first unclaimed occurrence of the character in the window.
      while (j < end && (longerMatched[j] || longer.charAt(j) != wanted)) {
        j++;
      }
      if (j < end) {
        shorterMatched[i] = true;
        longerMatched[j] = true;
        matchCount++;
      }
    }

    // Matched characters of the shorter string, in their original order.
    final char[] shorterSeq = new char[matchCount];
    for (int i = 0, next = 0; i < shortLen; i++) {
      if (shorterMatched[i]) {
        shorterSeq[next++] = shorter.charAt(i);
      }
    }

    // Walk the matched characters of the longer string in order and count the
    // positions where the two matched sequences differ; the result is halved
    // below to obtain the number of transpositions.
    int outOfOrder = 0;
    for (int j = 0, next = 0; j < longLen; j++) {
      if (longerMatched[j]) {
        if (longer.charAt(j) != shorterSeq[next]) {
          outOfOrder++;
        }
        next++;
      }
    }

    // Length of the prefix shared by both strings.
    int prefix = 0;
    while (prefix < shortLen && s1.charAt(prefix) == s2.charAt(prefix)) {
      prefix++;
    }

    return new int[] { matchCount, outOfOrder / 2, prefix, longLen };
  }

  /**
   * Computes the similarity of two strings.
   *
   * @param s1 first string
   * @param s2 second string
   * @return {@code 0} when no characters match; otherwise the Jaro score,
   *         boosted by the common-prefix term when it is not below the
   *         threshold
   */
  @Override
  public float getDistance(String s1, String s2) {
    final int[] stats = matches(s1, s2);
    final float matched = stats[0];
    if (matched == 0) {
      return 0f;
    }
    final int transpositions = stats[1];
    final int prefix = stats[2];
    final int longest = stats[3];

    // Jaro score: mean of the two match ratios and the in-order ratio.
    final float jaro = ((matched / s1.length() + matched / s2.length()
        + (matched - transpositions) / matched)) / 3;
    if (jaro < getThreshold()) {
      return jaro;
    }
    // Winkler boost, scaled by the common prefix length.
    final float scale = Math.min(0.1f, 1f / longest);
    return jaro + scale * prefix * (1 - jaro);
  }

  /**
   * Sets the Jaro score from which the common-prefix boost is applied.
   *
   * @param threshold the new threshold
   */
  public void setThreshold(float threshold) {
    this.threshold = threshold;
  }

  /**
   * Returns the current threshold.
   *
   * @return the Jaro score from which the common-prefix boost is applied
   */
  public float getThreshold() {
    return threshold;
  }

  @Override
  public int hashCode() {
    final int thresholdBits = Float.floatToIntBits(threshold);
    return 113 * thresholdBits * getClass().hashCode();
  }

  @Override
  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }
    if (obj == null || obj.getClass() != getClass()) {
      return false;
    }
    final JaroWinklerDistance other = (JaroWinklerDistance) obj;
    // Compare bit patterns so the result stays consistent with hashCode().
    final int otherBits = Float.floatToIntBits(other.threshold);
    final int ownBits = Float.floatToIntBits(this.threshold);
    return otherBits == ownBits;
  }

  @Override
  public String toString() {
    return "jarowinkler(" + threshold + ")";
  }

}

猜你喜欢

转载自blog.csdn.net/asty9000/article/details/81349129