money_tracker/editd_words.py

import edlib
import re
# Word separator: a single space that is immediately followed by a non-space
# character (the lookahead does not consume that character). The outer `+`
# can never repeat, because the character after a match is by definition not
# a space. Consequently, in a run of several spaces only the last one acts as
# a separator; the earlier spaces (and any trailing spaces at the end of the
# string) stay attached to the preceding token.
space = re.compile('(?: (?=[^ ]))+')
def _word_levenshtein(seq1, seq2):
    '''Return the Levenshtein distance between two sequences of comparable items.

    Pure-Python two-row dynamic programme, used when the sequences contain
    too many distinct symbols to be packed into bytes.
    '''
    # Keep the inner row as short as possible.
    if len(seq1) < len(seq2):
        seq1, seq2 = seq2, seq1
    previous = list(range(len(seq2) + 1))
    for i, item1 in enumerate(seq1, 1):
        current = [i]
        for j, item2 in enumerate(seq2, 1):
            current.append(min(
                previous[j] + 1,                     # deletion
                current[j - 1] + 1,                  # insertion
                previous[j - 1] + (item1 != item2),  # substitution / match
            ))
        previous = current
    return previous[-1]

def ed_words(sentence1,sentence2):
    '''Return the word-level edit distance between two sentences, length-scaled.

    Both sentences are tokenised with the module-level ``space`` pattern, every
    distinct token is mapped to an integer code, and the edit distance between
    the two code sequences is computed (one edit = one inserted, deleted or
    substituted token).

    The distance is divided by the length *in characters* of the longer
    sentence, exactly as before.
    NOTE(review): the numerator counts words while the denominator counts
    characters; dividing by the longer word count may have been intended.
    Left unchanged so existing thresholds keep their meaning.

    Args:
        sentence1: first sentence (str).
        sentence2: second sentence (str).

    Returns:
        float: scaled distance; ``0.0`` when both sentences are empty.
    '''
    # Guard first: two empty sentences are identical, and the division below
    # would otherwise raise ZeroDivisionError.
    longest = max(len(sentence1), len(sentence2))
    if longest == 0:
        return 0.0
    words1 = space.split(sentence1)
    words2 = space.split(sentence2)
    # Assign each distinct word a small integer code. Which word gets which
    # code is irrelevant to the distance, only equality of codes matters.
    codes = {}
    for word in words1 + words2:
        codes.setdefault(word, len(codes))
    seq1 = [codes[word] for word in words1]
    seq2 = [codes[word] for word in words2]
    if len(codes) <= 256:
        # Every code fits in one byte, so the sequences can go to edlib as bytes.
        distance = edlib.align(bytes(seq1), bytes(seq2))['editDistance']
    else:
        # bytes() rejects values above 255; fall back to the pure-Python
        # implementation instead of raising ValueError.
        distance = _word_levenshtein(seq1, seq2)
    return distance/longest

def cluster_by_ed(sentences,threshold):
    '''Group consecutive sentences into clusters by word edit distance.

    Sentences are visited in order. Each one is compared, via ``ed_words``,
    with the FIRST sentence of the cluster currently being built: if the
    distance is strictly below ``threshold`` it joins that cluster, otherwise
    it opens a new cluster and becomes that cluster's reference sentence.

    Args:
        sentences: iterable of sentences, consumed once in order.
        threshold: a sentence joins the current cluster only when its distance
            to the cluster's first sentence is strictly less than this value.

    Returns:
        list of lists: the clusters, in input order; empty for empty input.
    '''
    clusters = []
    for sentence in sentences:
        # The very first sentence has no cluster to be compared against.
        if clusters and ed_words(clusters[-1][0], sentence) < threshold:
            clusters[-1].append(sentence)
        else:
            clusters.append([sentence])
    return clusters

if __name__ == "__main__":
    # Command-line driver: read the data in ./test.json, cluster it with the
    # threshold given on the command line, print the number of clusters and
    # optionally pretty-print them.
    import argparse
    import json
    import pprint
    parser = argparse.ArgumentParser()
    parser.add_argument('threshold',type=float)
    args = parser.parse_args()
    # JSON is UTF-8 by specification; name the encoding explicitly so the
    # result does not depend on the platform's locale default.
    with open('test.json', encoding='utf-8') as file:
        data = json.load(file)

    clusters = cluster_by_ed(data,args.threshold)
    print(len(clusters))
    if input('pprint?: ') == 'y':
        pprint.pprint(clusters)