money_tracker/editd_words.py


								import edlib

								import re

								# Word delimiter: a single space that is immediately followed by a non-space
								# character. In a run of several spaces only the last one is matched, so the
								# earlier spaces stay attached to the preceding token, and trailing spaces at
								# the end of a string are never split off.
								space = re.compile('(?: (?=[^ ]))+')

								def ed_words(sentence1, sentence2):
								    """Return the word-level edit distance between two sentences, normalized.

								    Both sentences are split into words with the module-level ``space``
								    pattern, every distinct word is mapped to a one-byte id, and the two
								    byte strings are aligned with ``edlib.align``. The resulting edit
								    distance (counted in words) is divided by the character length of the
								    longer sentence.

								    Args:
								        sentence1: First sentence (string).
								        sentence2: Second sentence (string).

								    Returns:
								        ``editDistance / max(len(sentence1), len(sentence2))`` as a float,
								        or ``0.0`` when both sentences are empty.

								    Raises:
								        ValueError: If the two sentences contain more than 256 distinct
								            words in total, since each word id must fit in a single byte.
								    """
								    # NOTE(review): the distance is counted in words but normalized by
								    # character length. Changing the denominator would shift every
								    # threshold used with this function, so it is left as-is -- confirm
								    # that this is the intended normalization.
								    longest = max(len(sentence1), len(sentence2))
								    if longest == 0:
								        # Two empty sentences are identical; avoid dividing by zero.
								        return 0.0

								    words1 = space.split(sentence1)
								    words2 = space.split(sentence2)

								    # Assign each distinct word a small integer id usable as a byte value.
								    vocabulary = set(words1).union(words2)
								    if len(vocabulary) > 256:
								        raise ValueError(
								            'cannot compare sentences with more than 256 distinct words '
								            '(got %d)' % len(vocabulary)
								        )
								    translation = {word: i for i, word in enumerate(vocabulary)}

								    ed = edlib.align(
								        bytes(translation[word] for word in words1),
								        bytes(translation[word] for word in words2),
								    )['editDistance']
								    return ed / longest


								def cluster_by_ed(sentences, threshold):
								    """Group consecutive sentences into clusters by normalized edit distance.

								    Sentences are visited in order. Each one is compared (via ``ed_words``)
								    with the first sentence of the current cluster: if the distance is below
								    ``threshold`` it joins that cluster, otherwise it starts a new cluster.

								    Args:
								        sentences: Iterable of sentences, consumed once in order.
								        threshold: Exclusive upper bound on the ``ed_words`` distance for a
								            sentence to join the current cluster.

								    Returns:
								        A list of clusters, each a non-empty list of sentences, in input
								        order.
								    """
								    clusters = []
								    for sentence in sentences:
								        # Always compare against the cluster's first sentence (its anchor),
								        # not the most recently added one.
								        if clusters and ed_words(clusters[-1][0], sentence) < threshold:
								            clusters[-1].append(sentence)
								        else:
								            clusters.append([sentence])
								    return clusters


								if __name__ == "__main__":
								    # Command-line entry point: cluster the sentences stored in test.json
								    # using the threshold given as the only positional argument.
								    import argparse
								    import json
								    import pprint

								    parser = argparse.ArgumentParser()
								    parser.add_argument('threshold', type=float)
								    args = parser.parse_args()

								    # JSON text is UTF-8; do not depend on the platform's default encoding.
								    with open('test.json', encoding='utf-8') as file:
								        data = json.load(file)

								    clusters = cluster_by_ed(data, args.threshold)
								    print(len(clusters))
								    # Dumping every cluster can be long, so ask before printing them.
								    if input('pprint?: ') == 'y':
								        pprint.pprint(clusters)