様々な類似度の計算
ある2つの単語a, bの関連を調べる場合に検索エンジンのヒット数を利用することが多いですが、ヒット数から類似度を計算する方法はたくさんあります。そこで様々な類似度の計算方法をPythonで実装しました。なお、類似度や距離についての解説は類似度と距離 - CatTail Wiki*が詳しいです。
similarity.py
# -*- coding: utf-8 -* def calc_similarity(hits0, hits1, hits01, metric): numerator = 0 denominator = 0 if metric == "jaccard": # Jaccard(A,B) = |A and B| / |A or B| numerator = hits01 denominator = hits0 + hits1 - hits01 elif metric == "dice": # Dice(A,B) = 2 * |A and B| / |A| + |B| numerator = hits01 denominator = hits0 + hits1 elif metric == "overlap": # Overlap(A,B) = |A and B| / min(|A|,|B|) numerator = hits01 denominator = min(hits0, hits1) elif metric == "pmi": # pmi(A,B) = log (p(A and B) / (p(A) * p(B))) numerator = hits01 denominator = hits0 * hits1 elif metric == "conditional_probability": # p(A|B) = p(A and B) / p(B) numerator = hits01 denominator = hits0 else: raise TypeError, "metric ::= (jaccard|dice|overlap|pmi|conditional_probability)" try: return 1.0 * numerator / denominator except ZeroDivisionError: return 0.0 def main(): import sys argv = sys.argv argc = len(argv) if (argc != 4): exit("Usage: python %s hits0 hits1 hits01" % argv[0]) hits0 = int(argv[1]) hits1 = int(argv[2]) hits01 = int(argv[3]) for metric in ["jaccard", "dice", "overlap", "pmi", "conditional_probability"]: similarity = calc_similarity(hits0, hits1, hits01, metric) print "%s: %f" % (metric, similarity) if __name__ == "__main__": main()
$ python similarity.py 5 10 3 jaccard: 0.250000 dice: 0.200000 overlap: 0.600000 pmi: 0.060000 conditional_probability: 0.600000