import codecs
import hashlib
import json
import multiprocessing
import os
import pickle
import re
import sys
from string import ascii_lowercase as alphabet

# Pick the BLAKE2 variant tuned for the platform word size:
# blake2b is optimized for 64-bit platforms, blake2s for 32-bit ones.
if sys.maxsize > 2 ** 32:
    HASH_FUNC = hashlib.blake2b
else:
    HASH_FUNC = hashlib.blake2s


def load_words(filename):
    """Load a newline-separated word file as a set of lower-cased words.

    Blank lines are dropped.
    """
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return set(map(str.lower, filter(bool, text.split('\n'))))


def _get_wordlist_hash(word_list_s):
    """Return a raw digest over the sorted words (order-independent)."""
    _hash = HASH_FUNC()
    for word in sorted(word_list_s):
        _hash.update(word.encode())
    return _hash.digest()


def hash_wordlist(word_list, raw=False):
    """Hash a word list, independent of its ordering.

    Returns the raw digest bytes when ``raw`` is true; otherwise a
    filename-safe base64 string ('+' -> '-', '/' -> '_', newlines removed).
    """
    # Bug fix: the old code sorted here AND inside _get_wordlist_hash;
    # one sort is enough.
    fhash = _get_wordlist_hash(word_list)
    if raw:
        return fhash
    illegal_hash = codecs.encode(fhash, 'base64').decode()
    # Group 1 is None when the '\n' alternative matched, mapping it to ''.
    replacements = {'+': '-', '/': '_', None: ''}
    return re.sub(r'(\+|\/)|\n',
                  lambda match: replacements[match.group(1)],
                  illegal_hash)


def load_freq_cache(word_list):
    """Return the cached letter-frequency table for this word list, or None."""
    fname = os.path.join('__hangcache__', hash_wordlist(word_list) + '.pkl')
    if os.path.exists(fname):
        with open(fname, 'rb') as file:
            return pickle.load(file)
    return None


def save_freq_cache(word_list, freq):
    """Pickle ``freq`` under __hangcache__/, keyed by the word list's hash."""
    if not os.path.exists('__hangcache__'):
        os.mkdir('__hangcache__')
    fname = os.path.join('__hangcache__', hash_wordlist(word_list) + '.pkl')
    with open(fname, 'wb') as file:
        pickle.dump(freq, file)


def generate_letter_frequency(word_list):
    """Map each letter to ``[total_occurrences, mean_fraction_of_word]``.

    The second element is a running average, over all words, of
    ``count(letter in word) / len(word)``.  Results are cached on disk via
    save_freq_cache / load_freq_cache.
    """
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached
    ret = {}
    for word_num, word in enumerate(word_list):
        letter_counts = {}
        for letter in word:
            try:
                ret[letter][0] += 1
            except KeyError:
                ret[letter] = [1, 0]
            letter_counts[letter] = letter_counts.get(letter, 0) + 1
        for letter, count in letter_counts.items():
            word_portion = count / len(word)
            # Incremental mean: avg_n = (avg_{n-1} * (n-1) + x_n) / n
            avg = (ret[letter][1] * word_num) + word_portion
            avg /= word_num + 1
            ret[letter][1] = avg
    # Fix: the old `if cached is None` guard here was redundant — any
    # non-None cache already returned above.
    save_freq_cache(word_list, ret)
    return ret


class bool_regex:
    """Picklable callable turning a compiled regex into a bool predicate.

    Needed so ``multiprocessing.Pool.map`` can ship the matcher to worker
    processes (a pattern's bound ``match`` method does not pickle).
    """

    def __init__(self, expr):
        self.expr = expr

    def __call__(self, arg):
        return bool(self.expr.match(arg))


def filter_wordlist(input, remaining_letters, word_list, mp=True):
    """Return the words matching ``input``, where each '.' may stand for
    any letter still in ``remaining_letters``.

    ``mp`` toggles matching through a multiprocessing pool.
    NOTE(review): the parameter name ``input`` shadows the builtin; kept
    unchanged for backward compatibility with keyword callers.
    """
    if remaining_letters:
        char_class = '[{}]'.format(''.join(sorted(remaining_letters)))
    else:
        # Bug fix: an empty set used to yield the invalid pattern '[]'
        # (re.error).  '(?!)' never matches, so '.' slots cannot be filled.
        char_class = '(?!)'
    regex = re.compile(input.replace('.', char_class) + '$')
    if mp:
        matcher = bool_regex(regex)
        with multiprocessing.Pool() as pool:
            matches = pool.map(matcher, word_list)
    else:
        matches = map(regex.match, word_list)
    return [word for matched, word in zip(matches, word_list) if matched]


PROMPT = "Enter word with '.' to represent missing letters ('/' to separate multiple words): "
NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
ALPHABET = set(alphabet)


def shorten(chars, max_length):
    """Lay ``chars`` out column-major into at most ``max_length`` rows,
    separated by four spaces, and return the joined display string."""
    rows = [''] * max_length
    for i, char in enumerate(chars):
        rows[i % max_length] += char + ' ' * 4
    return '\n'.join(map(str.rstrip, rows))


def multi_word(l_words, n=10):
    """Render up to ``n`` words from each list side by side, one numbered
    column per word list; yields the non-empty display rows."""
    rows = [''] * (n + 1)
    first = True
    for count, words in enumerate(l_words):
        offset = max(map(len, rows))
        working_set = words[:min(len(words), n)]
        working_set.insert(0, str(count + 1))  # 1-based column header
        for i, word in enumerate(working_set):
            prev_line = rows[i]
            # Pad shorter rows so every column starts at the same offset.
            if len(prev_line) < offset:
                prev_line += ' ' * (offset - len(prev_line))
            rows[i] = prev_line + ('' if first else ' ' * 4) + word
        first = False
    return filter(bool, map(str.rstrip, rows))


def print_likely_chars(remaining_letters, let_freq):
    """Print the remaining letters ranked by overall and per-word frequency."""
    overall = shorten(sorted(remaining_letters,
                             key=lambda letter: let_freq[letter][0],
                             reverse=True), 5)
    per_word = shorten(sorted(remaining_letters,
                              key=lambda letter: let_freq[letter][1],
                              reverse=True), 5)
    print('Good candidates by overall frequency:', overall, sep='\n')
    print('Good candidates by per-word frequency:', per_word, sep='\n')


def check(prev, new, remaining_letters):
    """Return True iff ``new`` could legally follow ``prev``.

    Both are lists of patterns ('.' marks unknown positions).  Lengths must
    match, every letter already revealed in ``prev`` (and each '/'
    separator) must be unchanged in ``new``, and all revealed letters must
    lie in ``remaining_letters``.
    """
    prev = '/'.join(prev)
    new = '/'.join(new)
    if len(prev) != len(new):
        return False
    good = set(re.findall('[a-z]', prev)) <= remaining_letters
    for p_cur, n_cur in zip(prev, new):
        if p_cur == '.':
            continue  # an unknown position may become anything
        # Bug fix: was `good == p_cur == n_cur` — a comparison whose result
        # was discarded, so changed letters were never rejected.
        good = good and p_cur == n_cur
        if not good:
            return False
    return good


# Letters the user has confirmed absent, accumulated across rounds.
negatives = set()


def iterate(word_list, let_freq, prev_word=None):
    """Run one interactive guessing round.

    Prompts for the current pattern(s) (re-prompting until consistent with
    ``prev_word``, if given) and for newly excluded letters, filters each
    word list accordingly, prints the matches and letter suggestions, and
    returns ``(entered_words, word_list)`` for the next round.
    """
    if prev_word is None:
        entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
    else:
        valid = False
        while not valid:
            entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
            valid = check(prev_word, entered_words, ALPHABET - negatives)
    # First round: word_list is a single flat collection; wrap it so there
    # is one candidate list per entered word.  (Sets raise TypeError on
    # indexing; an empty list raises IndexError.)
    try:
        word_list[0][0]
    except (TypeError, IndexError):
        word_list = [word_list] * len(entered_words)
    negative_letters = re.findall('[a-z]', input(NEG_PROMPT))
    negatives.update(negative_letters)
    entered_letters = set()
    for word in entered_words:
        entered_letters.update(re.findall('[a-z]', word))
    remaining_letters = ((ALPHABET & set(let_freq.keys()))
                         - entered_letters - negatives)
    for i, word in enumerate(entered_words):
        word_list[i] = filter_wordlist(word, remaining_letters,
                                       word_list[i], mp=True)
    print('Matches found:', '\n'.join(multi_word(word_list, 10)), sep='\n')
    print_likely_chars(remaining_letters, let_freq)
    return entered_words, word_list


if __name__ == "__main__":
    # word list source: https://github.com/dwyl/english-words
    words = load_words('words.txt')
    FREQ = generate_letter_frequency(words)
    print_likely_chars(ALPHABET, FREQ)
    last = None
    while True:
        try:
            last, words = iterate(words, FREQ, last)
        except KeyboardInterrupt:
            break