Compare commits

...

2 Commits
master ... test

  1. 50
      editd_words.py
  2. 1
      test.json

50
editd_words.py

@ -0,0 +1,50 @@
import edlib
import re
# Word separator: a single space that is immediately followed by a non-space
# character. Because of the look-ahead, only the LAST space in a run of spaces
# acts as a separator; earlier spaces (and trailing spaces) stay attached to
# the preceding word.
space = re.compile(r'(?: (?=[^ ]))+')


def ed_words(sentence1, sentence2):
    """Return the normalised word-level edit distance between two sentences.

    Both sentences are split into words with ``space``. Every distinct word
    is mapped to a one-byte code so that ``edlib.align`` can compare the two
    sentences word by word instead of character by character.

    Args:
        sentence1: First sentence (a string).
        sentence2: Second sentence (a string).

    Returns:
        The ``editDistance`` reported by ``edlib.align`` divided by the
        character length of the longer sentence, as a float. ``0.0`` when
        both sentences are empty.

    Raises:
        ValueError: If the two sentences together contain more than 256
            distinct words, which cannot be encoded as single bytes.
    """
    # NOTE(review): the distance counts words but the divisor is a character
    # count. Kept as in the original because existing thresholds depend on
    # it -- confirm whether the word count was intended.
    longest = max(len(sentence1), len(sentence2))
    if longest == 0:
        # Two empty sentences are identical; dividing by their length (0)
        # would raise ZeroDivisionError.
        return 0.0

    words1 = space.split(sentence1)
    words2 = space.split(sentence2)

    vocabulary = set(words1).union(words2)
    if len(vocabulary) > 256:
        # bytes() only accepts values 0..255, so fail early with a clear
        # message instead of an opaque "bytes must be in range(0, 256)".
        raise ValueError(
            'cannot compare sentences with more than 256 distinct words '
            '(got %d)' % len(vocabulary)
        )

    # The concrete code given to each word is irrelevant; only equality of
    # codes matters for the edit distance.
    translation = {word: code for code, word in enumerate(vocabulary)}

    distance = edlib.align(
        bytes(translation[word] for word in words1),
        bytes(translation[word] for word in words2),
    )['editDistance']
    return distance / longest
def cluster_by_ed(sentences, threshold):
    """Group consecutive sentences whose word edit distance is below a threshold.

    The sentences are scanned once, in order. Each sentence is compared
    (via ``ed_words``) with the FIRST sentence of the cluster currently being
    built. If the distance is strictly below ``threshold`` the sentence joins
    that cluster; otherwise the cluster is closed and the sentence starts a
    new one. Only neighbouring sentences can therefore share a cluster.

    Args:
        sentences: Iterable of sentences, in the order they should be scanned.
        threshold: Distances strictly below this value keep a sentence in
            the current cluster.

    Returns:
        A list of clusters, each a non-empty list of sentences, in input
        order. An empty input yields an empty list.
    """
    clusters = []
    current = []
    for sentence in sentences:
        # ``current`` is empty only before the very first sentence is seen.
        if current and ed_words(current[0], sentence) < threshold:
            current.append(sentence)
            continue
        if current:
            clusters.append(current)
        current = [sentence]
    if current:
        # Flush the cluster that was still open when the input ran out.
        clusters.append(current)
    return clusters
if __name__ == "__main__":
    # Command-line entry point: cluster the sentences stored in a JSON file
    # and report how many clusters were found.
    import argparse
    import json
    import pprint

    parser = argparse.ArgumentParser(
        description='Cluster consecutive sentences by word edit distance.'
    )
    parser.add_argument(
        'threshold', type=float,
        help='distances strictly below this value stay in the same cluster',
    )
    parser.add_argument(
        '--file', default='test.json',
        help='JSON file holding the sentences (default: test.json)',
    )
    args = parser.parse_args()

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(args.file, encoding='utf-8') as file:
        data = json.load(file)

    clusters = cluster_by_ed(data, args.threshold)
    print(len(clusters))
    # Clusters are only dumped on request.
    if input('pprint?: ') == 'y':
        pprint.pprint(clusters)

1
test.json
File diff suppressed because it is too large
View File

Loading…
Cancel
Save