@@ -0,0 +1,50 @@
import edlib
import re

# splits a sentence into words on spaces; the lookahead stops a trailing
# space from producing an empty final token
space = re.compile('(?: (?=[^ ]))+')


def ed_words(sentence1, sentence2):
    words1 = space.split(sentence1)
    words2 = space.split(sentence2)
    # map every distinct word to a single byte so edlib aligns the sentences
    # as sequences of words rather than characters (this assumes fewer than
    # 256 distinct words across the two sentences)
    vocab = set(words1).union(set(words2))
    translation = {}
    for i, word in enumerate(vocab):
        translation[word] = i
    ed = edlib.align(
        bytes(translation[word] for word in words1),
        bytes(translation[word] for word in words2)
    )['editDistance']
    # normalise by the character length of the longer sentence
    l = max(map(len, (sentence1, sentence2)))
    return ed / l
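# Illustrative example (not part of the original file): with one word inserted,
#   ed_words("the cat sat", "the cat sat down")
# has a word edit distance of 1 and a longest sentence of 16 characters,
# i.e. it returns 1/16 = 0.0625.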


def cluster_by_ed(sentences, threshold):
    '''Greedy single-pass clustering: walk the sentences in order, keeping each
    sentence in the current cluster while its word edit distance to the
    cluster's first sentence is below the threshold; the sentence that reaches
    the threshold closes the cluster and becomes the first sentence of a new
    one.'''
    ret = []
    sentence_list = list(sentences)
    index = 0
    while index < len(sentence_list):
        # the first sentence of each cluster is the reference that every
        # later sentence is compared against
        current = [sentence_list[index]]
        index += 1
        while index < len(sentence_list):
            ed = ed_words(current[0], sentence_list[index])
            if ed < threshold:
                current.append(sentence_list[index])
                index += 1
            else:
                break
        ret.append(current)
    return ret
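# Illustrative example (not part of the original file) with a threshold of 0.1:
#   cluster_by_ed(["the cat sat", "the cat sat down", "a dog ran"], 0.1)
#   -> [['the cat sat', 'the cat sat down'], ['a dog ran']]
# "the cat sat down" stays with "the cat sat" (distance 0.0625 < 0.1), while
# "a dog ran" (distance ~0.27) starts a new cluster.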


if __name__ == "__main__":
    import argparse
    import json
    import pprint

    parser = argparse.ArgumentParser()
    parser.add_argument('threshold', type=float)
    args = parser.parse_args()

    # test.json is expected to contain a JSON list of sentences
    with open('test.json') as file:
        data = json.load(file)

    clusters = cluster_by_ed(data, args.threshold)
    print(len(clusters))
    if input('pprint?: ') == 'y':
        pprint.pprint(clusters)
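# Example invocation (script filename assumed for illustration):
#   python cluster_by_ed.py 0.1
# prints the number of clusters found in test.json and optionally
# pretty-prints them.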