# Recovered from git patch ed57fd1f2b45d52f4db6c0011f9b2d88e6fb4b8e
# Author: Raphael Roberts, Mon, 17 Dec 2018 18:26:23 -0600
# Subject: "added clustering algorithm but needs preprocessing"
# The patch creates the new file editd_words.py with the content below.
"""Word-level edit distance between sentences, plus a greedy clustering on it."""
import re

try:
    import edlib
except ImportError:
    # edlib is only an accelerator here; the pure-Python fallback below
    # produces the same distances when it is not installed.
    edlib = None

# Splits on a single space that is immediately followed by a non-space
# character.  In a run of several spaces only the last one is a separator, so
# the earlier ones stay attached to the preceding word ("a  b" -> ["a ", "b"]),
# and trailing spaces are never split off.
space = re.compile('(?: (?=[^ ]))+')

# Word ids are handed to edlib as bytes, so only ids 0..255 can be represented.
_MAX_BYTE_SYMBOLS = 256


def _sequence_edit_distance(seq1, seq2):
    """Return the Levenshtein distance between two sequences.

    Items are compared with ``!=``; insertions, deletions and substitutions
    each cost 1.  Only two rows of the DP table are kept in memory.
    """
    # Keep the shorter sequence on the inner axis so the rows stay small.
    if len(seq1) < len(seq2):
        seq1, seq2 = seq2, seq1
    previous = list(range(len(seq2) + 1))
    for i, item1 in enumerate(seq1, start=1):
        current = [i]
        for j, item2 in enumerate(seq2, start=1):
            current.append(min(
                previous[j] + 1,                    # deletion
                current[j - 1] + 1,                 # insertion
                previous[j - 1] + (item1 != item2),  # substitution / match
            ))
        previous = current
    return previous[-1]


def ed_words(sentence1, sentence2):
    """Return the normalised word-level edit distance between two sentences.

    Both sentences are split into words with the module-level ``space``
    pattern, and the edit distance is computed over whole words rather than
    characters.

    Args:
        sentence1: first sentence (str).
        sentence2: second sentence (str).

    Returns:
        The word edit distance divided by the character length of the longer
        sentence, or ``0.0`` when both sentences are empty.

    NOTE(review): the numerator counts words while the denominator counts
    characters, so the result does not span the full 0..1 range.  This is
    kept as in the original because callers' thresholds depend on the scale;
    confirm whether dividing by the word count was intended.
    """
    longest = max(len(sentence1), len(sentence2))
    if longest == 0:
        # Two empty sentences are identical; avoids a 0/0 division.
        return 0.0

    words1 = space.split(sentence1)
    words2 = space.split(sentence2)

    # Give every distinct word a small integer id.
    vocabulary = set(words1).union(words2)
    translation = {word: i for i, word in enumerate(vocabulary)}

    if edlib is not None and len(translation) <= _MAX_BYTE_SYMBOLS:
        distance = edlib.align(
            bytes(translation[word] for word in words1),
            bytes(translation[word] for word in words2),
        )['editDistance']
    else:
        # More than 256 distinct words cannot be packed into bytes (bytes()
        # would raise ValueError), so compare the word lists directly.
        distance = _sequence_edit_distance(words1, words2)

    return distance / longest


def cluster_by_ed(sentences, threshold):
    """Greedily group consecutive sentences by word edit distance.

    The sentences are visited in order.  Each one is compared, via
    ``ed_words``, with the FIRST sentence of the cluster currently being
    built: if the distance is strictly below ``threshold`` it joins that
    cluster, otherwise it starts a new cluster.  Only adjacent sentences can
    end up together, so the input order matters.

    Args:
        sentences: iterable of sentence strings.
        threshold: upper bound (exclusive) on ``ed_words`` for a sentence to
            join the current cluster.

    Returns:
        A list of clusters, each a non-empty list of sentences in input
        order; an empty list when ``sentences`` is empty.
    """
    clusters = []
    for sentence in sentences:
        if clusters and ed_words(clusters[-1][0], sentence) < threshold:
            clusters[-1].append(sentence)
        else:
            clusters.append([sentence])
    return clusters


if __name__ == "__main__":
    import argparse
    import json
    import pprint

    parser = argparse.ArgumentParser()
    parser.add_argument('threshold', type=float)
    # Optional so that existing invocations keep reading test.json.
    parser.add_argument('--file', default='test.json',
                        help='JSON file holding a list of sentences')
    args = parser.parse_args()
    with open(args.file, encoding='utf-8') as handle:
        data = json.load(handle)

    clusters = cluster_by_ed(data, args.threshold)
    print(len(clusters))
    if input('pprint?: ') == 'y':
        pprint.pprint(clusters)