"""Cluster consecutive sentences by normalised word-level edit distance."""

import re

try:
    import edlib
except ImportError:
    # edlib is only an accelerator; _levenshtein() below computes the same
    # unit-cost edit distance when it is not installed.
    edlib = None

# Word delimiter: a single space that is immediately followed by a non-space
# character.  Because of the lookahead, in a run of several spaces only the
# last one acts as a delimiter (the earlier ones stay attached to the
# preceding word) and trailing spaces are never split off.
space = re.compile('(?: (?=[^ ]))+')

# bytes() can only hold values 0..255, so the byte encoding handed to edlib
# cannot represent more than this many distinct words.
_MAX_BYTE_ALPHABET = 256


def _levenshtein(seq1, seq2):
    """Return the unit-cost edit distance between two sequences."""
    if len(seq1) < len(seq2):
        # Keep the inner row as short as possible; the distance is symmetric.
        seq1, seq2 = seq2, seq1
    previous = list(range(len(seq2) + 1))
    for i, item1 in enumerate(seq1, start=1):
        current = [i]
        for j, item2 in enumerate(seq2, start=1):
            current.append(min(
                previous[j] + 1,                     # deletion
                current[j - 1] + 1,                  # insertion
                previous[j - 1] + (item1 != item2),  # substitution / match
            ))
        previous = current
    return previous[-1]


def ed_words(sentence1, sentence2):
    """Return the normalised word-level edit distance of two sentences.

    Both sentences are split into words with ``space``; the edit distance
    counts whole-word insertions, deletions and substitutions.  The result
    is that count divided by the character length of the longer sentence.

    NOTE(review): the divisor is a length in *characters* although the
    distance is counted in *words*.  This is kept as-is so existing
    thresholds keep their meaning -- confirm whether the number of words
    was intended.

    Args:
        sentence1: First sentence (str).
        sentence2: Second sentence (str).

    Returns:
        The normalised distance as a float; ``0.0`` when both sentences
        are empty.
    """
    longest = max(len(sentence1), len(sentence2))
    if longest == 0:
        # Two empty sentences are identical; avoid dividing by zero.
        return 0.0

    words1 = space.split(sentence1)
    words2 = space.split(sentence2)
    vocabulary = set(words1).union(words2)

    if edlib is not None and len(vocabulary) <= _MAX_BYTE_ALPHABET:
        # Encode every distinct word as one byte so edlib compares whole
        # words instead of characters.
        translation = {word: i for i, word in enumerate(vocabulary)}
        distance = edlib.align(
            bytes(translation[word] for word in words1),
            bytes(translation[word] for word in words2),
        )['editDistance']
    else:
        # Too many distinct words for a one-byte-per-word encoding, or
        # edlib is unavailable: compute the same distance in pure Python.
        distance = _levenshtein(words1, words2)

    return distance / longest


def cluster_by_ed(sentences, threshold):
    """Group consecutive sentences into clusters by word edit distance.

    The sentences are scanned in order.  Each sentence is compared (with
    ``ed_words``) against the *first* sentence of the current cluster: if
    the distance is strictly below ``threshold`` it joins that cluster,
    otherwise it starts a new cluster.

    Args:
        sentences: Iterable of sentence strings.
        threshold: Distances strictly below this value keep a sentence in
            the current cluster.

    Returns:
        A list of clusters, each a non-empty list of sentences in their
        original order.  Empty input yields an empty list.
    """
    clusters = []
    for sentence in sentences:
        if clusters and ed_words(clusters[-1][0], sentence) < threshold:
            clusters[-1].append(sentence)
        else:
            clusters.append([sentence])
    return clusters


if __name__ == "__main__":
    import argparse
    import json
    import pprint

    parser = argparse.ArgumentParser()
    parser.add_argument('threshold', type=float)
    args = parser.parse_args()

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open('test.json', encoding='utf-8') as file:
        data = json.load(file)

    clusters = cluster_by_ed(data, args.threshold)
    print(len(clusters))
    if input('pprint?: ') == 'y':
        pprint.pprint(clusters)