from math import e, log, sqrt

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown

# The Brown corpus flattened into one token list, used for information content.
corpus = []
for category in brown.categories():
    corpus.extend(brown.words(categories=category))

word_buff = {}  # cache: word -> list of WordNet synsets
threshold = 0.25  # minimum word-word similarity accepted as a match
semantic_and_word_order_factor = 0.8  # weight of semantic vs. word-order similarity
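# One-time environment setup (an assumption about the runtime, not part of the
# original listing): the Brown corpus, WordNet, and the Punkt tokenizer models
# must be available before the code below will run.
# nltk.download('brown')
# nltk.download('wordnet')
# nltk.download('punkt')  # 'punkt_tab' on newer NLTK releases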
def get_min_path_distance_and_subsumer_between_two_words(word1, word2):
    """
    Return the shortest WordNet path distance between two words and the
    depth of the lowest common subsumer (ancestor) of the closest synset pair.
    """
    if word1 in word_buff:
        word1_synsets = word_buff[word1]
    else:
        word1_synsets = wn.synsets(word1)
        word_buff[word1] = word1_synsets
    if word2 in word_buff:
        word2_synsets = word_buff[word2]
    else:
        word2_synsets = wn.synsets(word2)
        word_buff[word2] = word2_synsets
    if not word1_synsets or not word2_synsets:
        return None, 0
    min_distance = float('inf')
    min_pair = None
    for word1_synset in word1_synsets:
        for word2_synset in word2_synsets:
            distance = word1_synset.shortest_path_distance(word2_synset)
            # A distance of 0 (same synset) is legitimate, so test for None
            # explicitly rather than for truthiness.
            if distance is not None and distance < min_distance:
                min_distance = distance
                min_pair = (word1_synset, word2_synset)
    subsumer_depth = 0
    if min_pair:
        # The common subsumer is computed between the *two* closest synsets;
        # several equally deep hypernyms may come back, so take the deepest.
        subsumer = min_pair[0].lowest_common_hypernyms(min_pair[1])
        if subsumer:
            subsumer_depth = max(s.min_depth() for s in subsumer)
    else:
        min_distance = None
    return min_distance, subsumer_depth
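# Illustrative call (exact numbers vary with the installed WordNet release):
# get_min_path_distance_and_subsumer_between_two_words('dog', 'cat') yields a
# short path with a deep common subsumer (e.g. carnivore.n.01 for the animal
# senses), while a token absent from WordNet yields (None, 0).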
def similarity_between_two_words(word1, word2, length_factor=0.2, depth_factor=0.45):
    # Word similarity combining path length l and subsumer depth h:
    # s(w1, w2) = e^(-length_factor * l) * tanh(depth_factor * h).
    length, subsumer_depth = get_min_path_distance_and_subsumer_between_two_words(word1, word2)
    if length is None:
        return 0
    function_length = e ** -(length_factor * length)
    temp1 = e ** (depth_factor * subsumer_depth)
    temp2 = e ** -(depth_factor * subsumer_depth)
    function_depth = (temp1 - temp2) / (temp1 + temp2)  # tanh(depth_factor * h)
    return function_length * function_depth
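# Quick sanity checks (illustrative; exact values depend on the WordNet data):
# similarity_between_two_words('car', 'automobile')  # near 1: same synset, path 0
# similarity_between_two_words('car', 'democracy')   # near 0: long path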
def get_information_content(word, corpus):
    # Information content scaled to [0, 1]:
    # I(w) = 1 - log(n + 1) / log(N + 1), where n is the word's frequency in
    # the corpus and N is the corpus size. Note that list.count() rescans the
    # whole corpus on every call; see the cached variant below.
    n = corpus.count(word)
    N = len(corpus)
    I_w = 1 - (log(n + 1) / log(N + 1))
    return I_w
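# A faster drop-in variant (a sketch, not part of the original listing):
# counting against a plain list is O(N) per lookup, so precomputing a
# frequency table makes each lookup O(1). The names `_corpus_counts`,
# `_corpus_size`, and `get_information_content_fast` are illustrative.
from collections import Counter

_corpus_counts = Counter(corpus)
_corpus_size = len(corpus)

def get_information_content_fast(word):
    # Same formula as above, using the precomputed frequencies.
    return 1 - (log(_corpus_counts[word] + 1) / log(_corpus_size + 1))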
def word_order_vector(word_vector, joint_words):
    # Word-order vector: for each word in the joint set, record the 1-based
    # position in the *sentence* of that word or of its closest match.
    res = []
    for word in joint_words:
        if word in word_vector:
            res.append(word_vector.index(word) + 1)
        else:
            # Find the most similar word in the sentence above the threshold.
            max_similarity_word = None
            max_similarity = -1
            for t_word in word_vector:
                current_similarity = similarity_between_two_words(word, t_word)
                if current_similarity > threshold and current_similarity > max_similarity:
                    max_similarity = current_similarity
                    max_similarity_word = t_word
            if max_similarity_word is not None:
                res.append(word_vector.index(max_similarity_word) + 1)
            else:
                res.append(0)  # no sufficiently similar word in the sentence
    return res
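# Worked example (illustrative): with word_vector = ['a', 'quick', 'fox'] and
# joint_words = ['quick', 'fox', 'dog'], 'quick' -> 2 and 'fox' -> 3 (their
# positions in the sentence); 'dog' gets the position of its closest match in
# the sentence if that similarity clears the threshold, otherwise 0.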
def semantic_vector(word_vector, joint_words):
    # Semantic vector: entries are information-content-weighted similarities
    # between each joint-set word and the sentence.
    res = []
    for word in joint_words:
        i_w1 = get_information_content(word, corpus)
        if word in word_vector:
            res.append(i_w1 * i_w1)
        else:
            # Scan word_vector for the word most similar to `word`.
            max_similarity_word = None
            max_similarity = -1
            for t1_word in word_vector:
                current_similarity = similarity_between_two_words(word, t1_word)
                if current_similarity > threshold and current_similarity > max_similarity:
                    max_similarity = current_similarity
                    max_similarity_word = t1_word
            if max_similarity != -1:
                i_w2 = get_information_content(max_similarity_word, corpus)
                res.append(max_similarity * i_w1 * i_w2)
            else:
                res.append(0)
    return res
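# Shape check (illustrative): the result has len(joint_words) entries; a word
# present in the sentence contributes I(w)^2, an absent word contributes
# s(w, w~) * I(w) * I(w~) for its best match w~ above the threshold, else 0.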
def sentence_similarity(sentence1, sentence2):
    words_1 = nltk.word_tokenize(sentence1)
    words_2 = nltk.word_tokenize(sentence2)
    if not words_1 or not words_2:
        return 0
    joint_words = list(set(words_1 + words_2))
    semantic_vector1 = semantic_vector(words_1, joint_words)
    semantic_vector2 = semantic_vector(words_2, joint_words)
    word_order1 = word_order_vector(words_1, joint_words)
    word_order2 = word_order_vector(words_2, joint_words)
    # Semantic similarity: cosine between the two semantic vectors.
    s_s = sum(a * b for a, b in zip(semantic_vector1, semantic_vector2)) / sqrt(
        sum(a ** 2 for a in semantic_vector1) * sum(b ** 2 for b in semantic_vector2))
    # Word-order similarity: 1 - ||r1 - r2|| / ||r1 + r2||
    # (identical orderings score 1).
    s_r = 1 - sqrt(sum((a - b) ** 2 for a, b in zip(word_order1, word_order2))) / sqrt(
        sum((a + b) ** 2 for a, b in zip(word_order1, word_order2)))
    similarity = semantic_and_word_order_factor * s_s + (1 - semantic_and_word_order_factor) * s_r
    print(sentence1, '%%', sentence2, ':', similarity)
    return similarity
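# Minimal usage sketch (the sentences are illustrative; scores depend on the
# installed WordNet and Brown data):
if __name__ == '__main__':
    sentence_similarity('A quick brown fox jumped over the lazy dog.',
                        'A fast brown fox leaped over a lazy dog.')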