You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
50 lines
1.5 KiB
50 lines
1.5 KiB
import edlib
|
|
import re
|
|
# Word separator used by ed_words: matches a single space that is immediately
# followed by a non-space character. The lookahead makes a second repetition of
# the group impossible, so in a run of spaces only the last one is a separator
# (earlier spaces stay attached to the preceding token) and trailing spaces at
# the end of a sentence are never split off.
space = re.compile('(?: (?=[^ ]))+')
|
|
def ed_words(sentence1: str, sentence2: str) -> float:
    """Return the normalised word-level edit distance between two sentences.

    Both sentences are split into words with the module-level ``space``
    pattern. Every distinct word is mapped to a one-byte id, and the two
    resulting byte strings are aligned with ``edlib.align`` so that the
    edit distance counts whole-word insertions, deletions and substitutions.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The edit distance divided by the character length of the longer
        sentence, or ``0.0`` when both sentences are empty.

    Raises:
        ValueError: If the two sentences contain more than 256 distinct
            words in total, because the word ids are packed into ``bytes``.
    """
    longest = max(len(sentence1), len(sentence2))
    if longest == 0:
        # Two empty sentences are identical; without this guard the final
        # division would raise ZeroDivisionError.
        return 0.0

    words1 = space.split(sentence1)
    words2 = space.split(sentence2)

    # Only equality between ids matters to the aligner, so the order in which
    # ids are handed out is irrelevant.
    vocabulary = set(words1).union(words2)
    word_ids = {word: word_id for word_id, word in enumerate(vocabulary)}

    distance = edlib.align(
        bytes(word_ids[word] for word in words1),
        bytes(word_ids[word] for word in words2),
    )['editDistance']

    # NOTE(review): the distance counts words but the divisor counts
    # characters. Kept as-is so existing thresholds keep their meaning;
    # confirm whether normalising by word count was intended.
    return distance / longest
|
|
|
|
def cluster_by_ed(sentences, threshold):
    """Group consecutive sentences into clusters by word edit distance.

    The sentences are visited once, in order. Each sentence is compared,
    via ``ed_words``, with the FIRST sentence of the cluster currently being
    built. If that distance is strictly below ``threshold`` the sentence
    joins the cluster; otherwise the cluster is closed and the sentence
    becomes the first member of a new one.

    Args:
        sentences: Iterable of sentences, in the order they should be scanned.
        threshold: Upper bound (exclusive) on the ``ed_words`` distance for a
            sentence to join the current cluster.

    Returns:
        A list of clusters, each a non-empty list of sentences in their
        original order. An empty input yields an empty list.
    """
    clusters = []
    current = None
    for sentence in sentences:
        # The very first sentence always opens a cluster; the short-circuit
        # keeps ed_words from being called before there is anything to
        # compare against.
        if current is not None and ed_words(current[0], sentence) < threshold:
            current.append(sentence)
        else:
            current = [sentence]
            clusters.append(current)
    return clusters
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: cluster the sentences stored in a JSON file
    # and report how many clusters were found.
    import argparse
    import json
    import pprint

    parser = argparse.ArgumentParser(
        description='Cluster consecutive sentences by word edit distance.'
    )
    parser.add_argument(
        'threshold',
        type=float,
        help='a sentence joins the current cluster when its distance is '
             'below this value',
    )
    # Optional so that the original invocation (threshold only, reading
    # test.json from the working directory) keeps working unchanged.
    parser.add_argument(
        '--input',
        default='test.json',
        help='path of the JSON file holding the sentences '
             '(default: test.json)',
    )
    args = parser.parse_args()

    # Read the JSON as UTF-8 explicitly instead of relying on the
    # platform's locale encoding.
    with open(args.input, encoding='utf-8') as file:
        data = json.load(file)

    clusters = cluster_by_ed(data, args.threshold)
    print(len(clusters))
    # Printing every cluster can be long, so ask before dumping them.
    if input('pprint?: ') == 'y':
        pprint.pprint(clusters)
|