|
|
import base64
import hashlib
import os
import pickle
import re
from string import ascii_lowercase as alphabet
import sys

# 32 or 64 bit platform?
# Use the 64-bit BLAKE2 variant when the interpreter's native word size
# allows it, and the smaller variant otherwise.
HASH_FUNC = hashlib.blake2b if sys.maxsize > 2**32 else hashlib.blake2s
def load_words(filename):
    """Read a newline-separated word file and return its words as a set.

    Empty lines are dropped and every word is lower-cased, so duplicates
    that differ only by case collapse into one entry.
    """
    with open(filename) as handle:
        lines = handle.read().split('\n')
    return {line.lower() for line in lines if line}
def _get_wordlist_hash(word_list_s):
    """Return the raw digest of a word collection, independent of its order.

    The words are sorted before hashing so that the same collection always
    produces the same digest. Each word is followed by a newline delimiter;
    without it, different lists whose concatenations coincide (for example
    ``['ab', 'c']`` and ``['a', 'bc']``) would share one digest and therefore
    one cache entry.

    Returns:
        bytes: the digest produced by ``HASH_FUNC``.
    """
    _hash = HASH_FUNC()
    for word in sorted(word_list_s):
        _hash.update(word.encode())
        # Delimiter keeps word boundaries part of the hashed data.
        _hash.update(b'\n')
    return _hash.digest()
def hash_wordlist(word_list, raw=False):
    """Return a hash identifying ``word_list``.

    Args:
        word_list: iterable of words.
        raw: when true, return the raw digest bytes.

    Returns:
        The digest as ``bytes`` when ``raw`` is true; otherwise a URL- and
        filename-safe base64 ``str`` of the digest, suitable for building a
        cache file name by string concatenation.
    """
    word_list = sorted(word_list)
    fhash = _get_wordlist_hash(word_list)
    if raw:
        return fhash
    # The digest must be *encoded* (not decoded) to base64, and turned into
    # text so callers can append a file extension to it.
    return base64.urlsafe_b64encode(fhash).decode('ascii')
def load_freq_cache(word_list):
    """Return the cached letter frequencies for ``word_list``, if any.

    The cache file lives in ``__hangcache__`` and is named after the hash of
    the word list. Returns ``None`` when no such file exists.
    """
    cache_name = hash_wordlist(word_list) + '.pkl'
    cache_path = os.path.join('__hangcache__', cache_name)
    if not os.path.exists(cache_path):
        return None
    # NOTE: pickle executes arbitrary code on load; this is only safe as long
    # as the cache directory is written exclusively by this program.
    with open(cache_path, 'rb') as cache_file:
        return pickle.load(cache_file)
def save_freq_cache(word_list, freq):
    """Pickle ``freq`` into the cache file belonging to ``word_list``.

    The ``__hangcache__`` directory is created first when it does not exist
    yet; the file name is the hash of the word list plus ``.pkl``.
    """
    cache_dir = '__hangcache__'
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    cache_name = hash_wordlist(word_list) + '.pkl'
    cache_path = os.path.join(cache_dir, cache_name)
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(freq, cache_file)
def generate_letter_frequency(word_list):
    """Compute per-letter statistics for ``word_list`` (cached on disk).

    Args:
        word_list: iterable of words.

    Returns:
        dict mapping each character that occurs in the words to a two-item
        list ``[total_count, mean_share]``:

        * ``total_count`` - number of occurrences across all words;
        * ``mean_share`` - the fraction of a word's characters that are this
          character, averaged over *all* words (words that do not contain the
          character contribute 0).

        When a cached result exists for this word list it is returned as is;
        otherwise the result is computed and written to the cache.
    """
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached

    ret = {}
    total_words = 0
    for word in word_list:
        total_words += 1
        letter_counts = {}
        for letter in word:
            letter_counts[letter] = letter_counts.get(letter, 0) + 1
        for letter, count in letter_counts.items():
            stats = ret.setdefault(letter, [0, 0])
            stats[0] += count
            # Accumulate the share now and divide once at the end. Updating a
            # running mean only for letters present in the current word skews
            # the value and makes it depend on iteration order.
            stats[1] += count / len(word)

    # ``ret`` is only non-empty when at least one word was seen, so
    # ``total_words`` cannot be zero here.
    for stats in ret.values():
        stats[1] /= total_words

    save_freq_cache(word_list, ret)
    return ret
def filter_wordlist(input, remaining_letters, word_list):
    """Return the words from ``word_list`` that fit the pattern ``input``.

    Args:
        input: pattern in which ``.`` stands for one unknown letter; all
            other characters are used as regular-expression text unchanged.
        remaining_letters: collection of letters an unknown position may
            still hold.
        word_list: iterable of candidate words.

    Returns:
        list of the matching words, in the iteration order of ``word_list``.
        A word matches only when the whole word fits the pattern.
    """
    if remaining_letters:
        wildcard = '[{}]'.format(''.join(map(re.escape, remaining_letters)))
    else:
        # An empty character class ("[]") is not a valid regex; with no
        # letters left, an unknown position can never be filled, so use a
        # sub-pattern that never matches.
        wildcard = '(?!)'
    regex = re.compile(input.replace('.', wildcard))
    # fullmatch anchors both ends; "$" would also accept a trailing newline.
    return [word for word in word_list if regex.fullmatch(word)]
# Shown when asking for the current state of the puzzle.
PROMPT = """Enter word with '.' to represent missing letters
('/' to separate multiple words): """
# Shown when asking for letters that are known not to occur.
NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
# The lowercase ASCII letters as a set, for set arithmetic on letters.
ALPHABET = set(alphabet)
def shorten(chars, max_length):
    """Lay ``chars`` out column-wise in a grid of ``max_length`` rows.

    Item ``i`` goes to row ``i % max_length``; items within a row are
    separated by four spaces. Trailing whitespace is stripped from every row
    and the rows are joined with newlines (empty rows are kept).
    """
    grid = [[] for _ in range(max_length)]
    for position, symbol in enumerate(chars):
        grid[position % max_length].append(symbol + ' ' * 4)
    return '\n'.join(''.join(cells).rstrip() for cells in grid)
def multi_word(l_words, n=10):
    """Format several word lists side by side, one column per list.

    Each column starts with its 1-based number followed by at most ``n``
    words of the corresponding list. A new column begins at the width of the
    longest line so far; every column but the first is preceded by four
    spaces.

    Returns:
        An iterator over the non-empty, right-stripped output lines.
    """
    lines = [''] * (n + 1)
    for column, words in enumerate(l_words):
        width = max(map(len, lines))
        gap = '' if column == 0 else ' ' * 4
        cells = [str(column + 1)] + words[:min(len(words), n)]
        for row, cell in enumerate(cells):
            # Pad shorter lines so this column starts at the same offset.
            lines[row] = lines[row].ljust(width) + gap + cell
    return filter(bool, map(str.rstrip, lines))
def print_likely_chars(remaining_letters, let_freq):
    """Print the remaining letters ranked by two frequency measures.

    ``let_freq`` maps each letter to a sequence whose item 0 is used for the
    "overall" ranking and item 1 for the "per-word" ranking. Both rankings
    are sorted in descending order and printed as five-row grids.
    """
    def ranked_by(stat_index):
        ordering = sorted(
            remaining_letters,
            key=lambda letter: let_freq[letter][stat_index],
            reverse=True,
        )
        return shorten(ordering, 5)

    overall = ranked_by(0)
    per_word = ranked_by(1)
    print('Good candidates by overall frequency:', overall, sep='\n')
    print('Good candidates by per-word frequency:', per_word, sep='\n')
def check(prev, new, remaining_letters):
    """Return whether the entry ``new`` could have evolved from ``prev``.

    Args:
        prev: list of per-word patterns from the previous round.
        new: list of per-word patterns just entered.
        remaining_letters: set of letters not ruled out so far.

    Returns:
        True only when all of the following hold:

        * both entries have the same total length once joined with ``/``;
        * every letter already revealed in ``prev`` is in
          ``remaining_letters``;
        * every letter and every ``/`` separator of ``prev`` is unchanged in
          ``new``;
        * no ``.`` of ``prev`` has turned into a ``/`` separator.
    """
    prev = '/'.join(prev)
    new = '/'.join(new)
    if len(prev) != len(new):
        return False
    if not set(re.findall('[a-z]', prev)) <= remaining_letters:
        return False
    for p_cur, n_cur in zip(prev, new):
        if p_cur == '.':
            # An unknown letter may stay unknown or be revealed, but it can
            # never become a word boundary.
            if n_cur == '/':
                return False
        elif p_cur != n_cur:
            # Revealed letters and separators must stay exactly as they were.
            return False
    return True
# Module-level set of letters entered at NEG_PROMPT so far; iterate() adds to
# it on every call and subtracts it from the letters still considered.
negatives = set()
def _read_pattern():
    """Prompt for the puzzle state and return it as a list of word patterns."""
    # Everything except lowercase letters, '.' and '/' is discarded.
    return re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')


def iterate(word_list, let_freq, prev_word=None):
    """Run one interactive round: read the puzzle state and narrow the words.

    Args:
        word_list: either a flat collection of candidate words (first round)
            or a non-empty list holding one list of candidates per entered
            word (as returned by a previous call).
        let_freq: mapping of letter to frequency statistics, as used by
            ``print_likely_chars``.
        prev_word: the patterns returned by the previous round, or ``None``
            on the first round. When given, the prompt is repeated until the
            new entry passes ``check`` against it.

    Returns:
        tuple ``(entered_words, word_list)`` where ``word_list`` is a list
        with the remaining candidates for each entered word.

    Side effects:
        Reads from stdin, prints the matches and letter suggestions, and adds
        the letters entered at the second prompt to the module-level
        ``negatives`` set.
    """
    entered_words = _read_pattern()
    if prev_word is not None:
        while not check(prev_word, entered_words, ALPHABET - negatives):
            entered_words = _read_pattern()

    # Decide by type whether ``word_list`` already holds one candidate list
    # per word. Probing ``word_list[0][0]`` and catching the exception
    # misfires when the first word has no candidates left, which would wrap
    # the per-word lists a second time.
    already_per_word = (
        isinstance(word_list, list)
        and bool(word_list)
        and all(isinstance(candidates, list) for candidates in word_list)
    )
    if not already_per_word:
        word_list = [word_list] * len(entered_words)

    negative_letters = re.findall('[a-z]', input(NEG_PROMPT))
    negatives.update(negative_letters)

    entered_letters = set()
    for word in entered_words:
        entered_letters.update(re.findall('[a-z]', word))

    # Letters that can still fill a '.': known to the frequency table, not
    # yet revealed, and not ruled out.
    remaining_letters = (ALPHABET & set(let_freq.keys())
                         ) - entered_letters - negatives

    for i, word in enumerate(entered_words):
        word_list[i] = filter_wordlist(word, remaining_letters, word_list[i])

    print('Matches found:', '\n'.join(multi_word(word_list, 10)), sep='\n')
    print_likely_chars(remaining_letters, let_freq)
    return entered_words, word_list
if __name__ == "__main__":
    # src: https://github.com/dwyl/english-words
    words = load_words('words.txt')
    FREQ = generate_letter_frequency(words)
    # Rank only letters that actually occur in the word list; looking up a
    # letter missing from FREQ would raise KeyError. This mirrors the
    # intersection iterate() applies on every round.
    print_likely_chars(ALPHABET & set(FREQ), FREQ)
    last = None
    # Keep prompting until the user interrupts (Ctrl-C) or stdin is closed.
    while True:
        try:
            last, words = iterate(words, FREQ, last)
        except (KeyboardInterrupt, EOFError):
            break
|