转自:
def Ngram_distance(str1, str2, n=2):
    """Compare two strings by their sets of character n-grams.

    Each string is padded with ``n - 1`` spaces on both sides and then cut
    into the set of its distinct substrings of length ``n``.

    Args:
        str1: First string to compare.
        str2: Second string to compare.
        n: Length of each n-gram; must be at least 1. Defaults to 2.

    Returns:
        A 3-tuple ``(set1, set2, scores)``. ``set1`` and ``set2`` are the
        n-gram sets of the padded ``str1`` and ``str2``. ``scores`` is a dict
        with two keys:

        * ``'dist'`` -- number of n-grams that occur in exactly one of the
          two sets (``len(set1) + len(set2) - 2 * len(set1 & set2)``).
        * ``'sim'`` -- ``1 - dist / (len(set1) + len(set2))``; ``1.0`` when
          both sets are empty.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be at least 1, got %r' % (n,))

    # Pad both ends so that the n-grams also record which character the
    # string starts with and which character it ends with.
    pad = ' ' * (n - 1)
    str1 = pad + str1 + pad
    str2 = pad + str2 + pad

    set1 = {str1[i:i + n] for i in range(len(str1) - (n - 1))}
    set2 = {str2[i:i + n] for i in range(len(str2) - (n - 1))}

    len1 = len(set1)
    len2 = len(set2)
    lenx = len(set1 & set2)

    num_dist = len1 + len2 - 2 * lenx
    total = len1 + len2
    # total is 0 only for n == 1 with two empty strings (no padding, no
    # n-grams); treat two empty inputs as identical instead of dividing by 0.
    num_sim = 1 - num_dist / total if total else 1.0

    return set1, set2, {'dist': num_dist, 'sim': num_sim}


print(Ngram_distance('girl', 'girlfriend'))
输出结果:
({'gi', 'ir', 'rl', 'l ', ' g'}, {'gi', 'en', 'd ', 'ir', 'lf', 'ie', 'rl', 'fr', 'ri', ' g', 'nd'}, {'dist': 8, 'sim': 0.5})