You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
import re

import edlib

# Splits on a single space that is immediately followed by a non-space
# character; extra spaces in a run stay attached to the preceding word.
space = re.compile('(?: (?=[^ ]))+')


def ed_words(sentence1, sentence2):
    """Return a normalized word-level edit distance between two sentences.

    Each sentence is split into words with ``space``; every distinct word is
    mapped to a one-byte symbol and the two symbol sequences are aligned with
    ``edlib.align``.  The resulting edit distance is divided by the length
    *in characters* of the longer sentence.

    NOTE(review): the distance counts words but the divisor counts
    characters, so the result is typically far below 1 - confirm this is the
    intended normalization before relying on absolute threshold values.

    Args:
        sentence1: First sentence (str).
        sentence2: Second sentence (str).

    Returns:
        The edit distance divided by the longer sentence's character count,
        or 0.0 when both sentences are empty.

    Raises:
        ValueError: If the two sentences together contain more than 256
            distinct words (each word must fit in a single byte symbol).
    """
    words1 = space.split(sentence1)
    words2 = space.split(sentence2)

    longest = max(len(sentence1), len(sentence2))
    if longest == 0:
        # Two empty sentences are identical; avoid dividing by zero.
        return 0.0

    vocabulary = set(words1).union(words2)
    if len(vocabulary) > 256:
        # bytes() below can only hold values 0..255; fail with a clear message
        # instead of the generic "bytes must be in range(0, 256)".
        raise ValueError(
            'ed_words supports at most 256 distinct words, got %d'
            % len(vocabulary)
        )

    # The concrete symbol assigned to a word is irrelevant; only equality
    # between symbols matters for the alignment.
    translation = {word: i for i, word in enumerate(vocabulary)}
    ed = edlib.align(
        bytes(translation[word] for word in words1),
        bytes(translation[word] for word in words2),
    )['editDistance']
    return ed / longest
def cluster_by_ed(sentences, threshold):
    """Group consecutive sentences into clusters by word edit distance.

    Sentences are visited in order.  Each sentence is compared, via
    ``ed_words``, with the *first* sentence of the current cluster: while the
    distance is below ``threshold`` the sentence joins that cluster; the
    first sentence at or above ``threshold`` starts a new cluster.

    Args:
        sentences: Iterable of sentences, processed in iteration order.
        threshold: Exclusive upper bound on the ``ed_words`` distance to the
            cluster's first sentence for a sentence to join that cluster.

    Returns:
        A list of clusters, each a non-empty list of sentences in their
        original order.  An empty input yields an empty list.
    """
    clusters = []
    for sentence in sentences:
        # Compare against the head of the latest cluster only, never against
        # earlier clusters or other members.
        if clusters and ed_words(clusters[-1][0], sentence) < threshold:
            clusters[-1].append(sentence)
        else:
            clusters.append([sentence])
    return clusters
if __name__ == "__main__":
    # Command-line driver: cluster the sentences stored in test.json using
    # the threshold given as the single positional argument.
    import argparse
    import json
    import pprint

    parser = argparse.ArgumentParser()
    parser.add_argument('threshold', type=float)
    args = parser.parse_args()

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open('test.json', encoding='utf-8') as file:
        data = json.load(file)

    clusters = cluster_by_ed(data, args.threshold)
    print(len(clusters))
    # Only dump the full clustering when the user asks for it.
    if input('pprint?: ') == 'y':
        pprint.pprint(clusters)
|