"""Interactive hangman helper.

Loads a word list, builds per-letter frequency statistics (cached on disk
under ``__hangcache__/``), then repeatedly prompts the user for the current
puzzle state and prints candidate letters and matching words.
"""
import codecs
import hashlib
import json  # NOTE(review): appears unused here; kept in case external tooling relies on it
import os
import pickle
import re
import sys
from collections import Counter
from string import ascii_lowercase as alphabet

# Pick the BLAKE2 variant suited to the platform word size:
# blake2b is optimized for 64-bit platforms, blake2s for 32-bit ones.
if sys.maxsize > 2**32:
    HASH_FUNC = hashlib.blake2b
else:
    HASH_FUNC = hashlib.blake2s

# Directory holding pickled frequency-table caches.
CACHE_DIR = '__hangcache__'


def load_words(filename):
    """Return the set of non-empty, lower-cased lines in *filename*."""
    with open(filename) as file:
        text = file.read()
    return {line.lower() for line in text.split('\n') if line}


def _get_wordlist_hash(word_list_s):
    """Return the raw digest of the words in *word_list_s*.

    The words are sorted before hashing so the digest is independent of the
    input collection's iteration order.
    """
    digest = HASH_FUNC()
    for word in sorted(word_list_s):
        digest.update(word.encode())
    return digest.digest()


def hash_wordlist(word_list, raw=False):
    """Hash *word_list* order-independently.

    Returns the raw digest bytes when *raw* is true; otherwise a
    filename-safe base64 string ('+' -> '-', '/' -> '_', newlines removed).
    """
    # _get_wordlist_hash sorts internally, so no need to pre-sort here.
    fhash = _get_wordlist_hash(word_list)
    if raw:
        return fhash
    b64 = codecs.encode(fhash, 'base64').decode()
    # Make the base64 text safe to embed in a filename (same substitutions
    # the URL-safe base64 alphabet uses), and drop the line breaks that the
    # codecs encoder inserts.
    return b64.replace('+', '-').replace('/', '_').replace('\n', '')


def _cache_path(word_list):
    """Return the cache file path for *word_list*'s frequency table."""
    return os.path.join(CACHE_DIR, hash_wordlist(word_list) + '.pkl')


def load_freq_cache(word_list):
    """Return the cached frequency table for *word_list*, or None if absent."""
    fname = _cache_path(word_list)
    if os.path.exists(fname):
        # NOTE: pickle is only safe because this cache is produced locally by
        # save_freq_cache; never point it at untrusted files.
        with open(fname, 'rb') as file:
            return pickle.load(file)
    return None


def save_freq_cache(word_list, freq):
    """Persist *freq* (a frequency table) for *word_list* on disk."""
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(_cache_path(word_list), 'wb') as file:
        pickle.dump(freq, file)


def generate_letter_frequency(word_list):
    """Build ``{letter: [total_count, per_word_share]}`` for *word_list*.

    ``total_count`` counts every occurrence of the letter across all words.
    ``per_word_share`` is a running average of the letter's share of each
    word's length; words that do not contain the letter leave its running
    value unchanged, so this is a ranking heuristic rather than a strict
    mean over all words.

    Results are cached on disk, keyed by an order-independent hash of the
    word list, so repeated runs on the same list skip the computation.
    Raises ZeroDivisionError if *word_list* contains an empty string.
    """
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached
    freq = {}
    for word_num, word in enumerate(word_list):
        letter_counts = Counter(word)
        for letter, count in letter_counts.items():
            if letter in freq:
                freq[letter][0] += count
            else:
                freq[letter] = [count, 0]
            # Fold this word's share into the running per-word average.
            share = count / len(word)
            freq[letter][1] = (freq[letter][1] * word_num + share) / (word_num + 1)
    save_freq_cache(word_list, freq)
    return freq


PROMPT = "Enter word with '.' to represent missing letters: "
NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
ALPHABET = set(alphabet)


def shorten(chars, max_length):
    """Lay *chars* out in *max_length* rows, filling column by column.

    Each entry is padded with four trailing spaces; trailing whitespace is
    stripped from every finished row.
    """
    rows = [''] * max_length
    for i, char in enumerate(chars):
        rows[i % max_length] += char + '    '
    return '\n'.join(row.rstrip() for row in rows)


def print_likely_chars(remaining_letters, let_freq):
    """Print *remaining_letters* ranked by both frequency statistics."""
    by_total = sorted(remaining_letters,
                      key=lambda letter: let_freq[letter][0], reverse=True)
    by_share = sorted(remaining_letters,
                      key=lambda letter: let_freq[letter][1], reverse=True)
    print('Good candidates by overall frequency:\n' + shorten(by_total, 5))
    print('Good candidates by per-word frequency:\n' + shorten(by_share, 5))


# Letters the user has ruled out, accumulated across rounds.
negatives = set()


def iterate(word_list, let_freq):
    """Run one guessing round: prompt the user, filter *word_list*, print hints.

    Returns ``(entered_word, remaining_possibilities)`` so the caller can
    narrow the word list for the next round.
    """
    entered_word = input(PROMPT).replace(' ', '')
    negatives.update(re.findall('[a-z]', input(NEG_PROMPT)))
    entered_letters = set(entered_word.replace('.', ''))
    remaining_letters = (
        {letter for letter in let_freq if letter in ALPHABET}
        - entered_letters
        - negatives
    )
    # Each '.' placeholder may match any letter not yet placed or ruled out.
    # NOTE(review): raises re.error if remaining_letters is empty while the
    # pattern still contains '.', and user-typed regex metacharacters are not
    # escaped — confirm this is acceptable for interactive use.
    pattern = entered_word.replace(
        '.', '[{}]'.format(''.join(remaining_letters))) + '$'
    matches = [word for word in word_list if re.match(pattern, word)]
    print('Matches found:\n' + '\n'.join(matches[:10]))
    print_likely_chars(remaining_letters, let_freq)
    return entered_word, matches


if __name__ == "__main__":
    words = load_words('words.txt')
    FREQ = generate_letter_frequency(words)
    print_likely_chars(ALPHABET, FREQ)
    # Keep narrowing the candidate word list until the user interrupts.
    while True:
        try:
            last, words = iterate(words, FREQ)
        except KeyboardInterrupt:
            break