@@ -0,0 +1,50 @@
import edlib
import re

# splits a sentence into words on spaces; the lookahead stops a trailing
# space from producing an empty final token
space = re.compile('(?: (?=[^ ]))+')


def ed_words(sentence1, sentence2):
    words1 = space.split(sentence1)
    words2 = space.split(sentence2)
    # map every distinct word to a single byte so edlib aligns the sentences
    # as sequences of words rather than characters (this assumes fewer than
    # 256 distinct words across the two sentences)
    vocab = set(words1).union(set(words2))
    translation = {}
    for i, word in enumerate(vocab):
        translation[word] = i
    ed = edlib.align(
        bytes(translation[word] for word in words1),
        bytes(translation[word] for word in words2)
    )['editDistance']
    # normalise by the character length of the longer sentence
    l = max(map(len, (sentence1, sentence2)))
    return ed / l
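# Illustrative example (not part of the original file): with one word inserted,
#   ed_words("the cat sat", "the cat sat down")
# has a word edit distance of 1 and a longest sentence of 16 characters,
# i.e. it returns 1/16 = 0.0625.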


def cluster_by_ed(sentences, threshold):
    '''Greedy single-pass clustering: walk the sentences in order, keeping each
    sentence in the current cluster while its word edit distance to the
    cluster's first sentence is below the threshold; the sentence that reaches
    the threshold closes the cluster and becomes the first sentence of a new
    one.'''
    ret = []
    sentence_list = list(sentences)
    index = 0
    while index < len(sentence_list):
        # the first sentence of each cluster is the reference that every
        # later sentence is compared against
        current = [sentence_list[index]]
        index += 1
        while index < len(sentence_list):
            ed = ed_words(current[0], sentence_list[index])
            if ed < threshold:
                current.append(sentence_list[index])
                index += 1
            else:
                break
        ret.append(current)
    return ret
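# Illustrative example (not part of the original file) with a threshold of 0.1:
#   cluster_by_ed(["the cat sat", "the cat sat down", "a dog ran"], 0.1)
#   -> [['the cat sat', 'the cat sat down'], ['a dog ran']]
# "the cat sat down" stays with "the cat sat" (distance 0.0625 < 0.1), while
# "a dog ran" (distance ~0.27) starts a new cluster.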


if __name__ == "__main__":
    import argparse
    import json
    import pprint

    parser = argparse.ArgumentParser()
    parser.add_argument('threshold', type=float)
    args = parser.parse_args()

    # test.json is expected to contain a JSON list of sentences
    with open('test.json') as file:
        data = json.load(file)

    clusters = cluster_by_ed(data, args.threshold)
    print(len(clusters))
    if input('pprint?: ') == 'y':
        pprint.pprint(clusters)
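# Example invocation (script filename assumed for illustration):
#   python cluster_by_ed.py 0.1
# prints the number of clusters found in test.json and optionally
# pretty-prints them.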