You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
195 lines
6.2 KiB
195 lines
6.2 KiB
import base64
|
|
import hashlib
|
|
import os
|
|
import pickle
|
|
import re
|
|
from string import ascii_lowercase as alphabet
|
|
import sys
|
|
# Pick the BLAKE2 variant suited to the interpreter's word size:
# blake2b is optimized for 64-bit platforms, blake2s for 32-bit ones.
HASH_FUNC = hashlib.blake2b if sys.maxsize > 2**32 else hashlib.blake2s
|
def load_words(filename):
    """Read a newline-separated word file into a set of lowercase words.

    Blank lines are skipped; duplicates collapse via the set.
    """
    with open(filename) as handle:
        contents = handle.read()
    return {entry.lower() for entry in contents.split('\n') if entry}
|
def _get_wordlist_hash(word_list_s):
    """Digest the words in sorted order so equal word lists always hash alike.

    Hashing the concatenation of the encoded words is equivalent to the
    sequential update() calls it replaces.
    """
    payload = b''.join(entry.encode() for entry in sorted(word_list_s))
    return HASH_FUNC(payload).digest()
|
def hash_wordlist(word_list, raw=False):
    """Return a stable hash of *word_list* (order-independent).

    Parameters:
        word_list: iterable of words; sorted before hashing.
        raw: when True, return the raw digest bytes.

    Returns:
        Raw digest ``bytes`` when *raw* is true; otherwise a
        URL/filename-safe base64 ``str``, suitable for building cache
        file names (callers append '.pkl').
    """
    fhash = _get_wordlist_hash(sorted(word_list))
    if raw:
        return fhash
    # BUG FIX: the original called urlsafe_b64decode on the raw digest,
    # which raises binascii.Error for most digests and would yield bytes
    # (not str) otherwise -- load/save_freq_cache concatenate the result
    # with the '.pkl' suffix, so a str is required here.
    return base64.urlsafe_b64encode(fhash).decode('ascii')
|
def load_freq_cache(word_list):
    """Return the cached frequency table for *word_list*, or None if absent."""
    cache_path = os.path.join('__hangcache__',
                              hash_wordlist(word_list) + '.pkl')
    if not os.path.exists(cache_path):
        return None
    # NOTE(review): pickle is only safe because this directory is written
    # locally by save_freq_cache; never point it at untrusted files.
    with open(cache_path, 'rb') as cache_file:
        return pickle.load(cache_file)
|
def save_freq_cache(word_list, freq):
    """Pickle *freq* into the cache directory, keyed by the word-list hash."""
    os.makedirs('__hangcache__', exist_ok=True)
    cache_path = os.path.join('__hangcache__',
                              hash_wordlist(word_list) + '.pkl')
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(freq, cache_file)
|
def generate_letter_frequency(word_list):
    """Build a per-letter frequency table for *word_list*.

    Returns a dict mapping each letter to a two-element list:
      [0] total occurrences of the letter across every word, and
      [1] the mean fraction of a word that the letter makes up,
          averaged over ALL words (words without the letter count as 0).

    Results are persisted through save_freq_cache / load_freq_cache.
    """
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached
    ret = {}
    for word_num, word in enumerate(word_list):
        # Occurrences of each letter within this single word only.
        letter_counts = {}
        for i, letter in enumerate(word):
            try:
                # Bump the overall occurrence counter.
                ret[letter][0] += 1
            except KeyError:
                # First sighting of this letter: [overall count, running avg].
                ret[letter] = [1, 0]
            in_word = letter_counts.get(letter, 0) + 1
            letter_counts[letter] = in_word
        for letter, count in letter_counts.items():
            # Incremental mean: previous average weighted by the number of
            # words already folded in, plus this word's contribution.
            # load_words filters out empty strings, so len(word) > 0.
            word_portion = count/len(word)
            avg = (ret[letter][1] * word_num) + word_portion
            avg /= word_num + 1
            ret[letter][1] = avg
    # cached is necessarily None here (the early return handled the hit),
    # so the freshly computed table is always written back to disk.
    if cached is None:
        save_freq_cache(word_list, ret)
    return ret
|
def filter_wordlist(input, remaining_letters, word_list):
    """Return the words in *word_list* that fit the pattern *input*.

    Each '.' in the pattern may stand for any letter still in
    *remaining_letters*; all other characters must match literally.
    """
    wildcard = '[{}]'.format(''.join(remaining_letters))
    pattern = re.compile(input.replace('.', wildcard) + '$')
    # match() anchors at the start and the trailing '$' anchors the end,
    # so only full-length matches survive; input order is preserved.
    return [word for word in word_list if pattern.match(word)]
|
PROMPT = """Enter word with '.' to represent missing letters
('/' to separate multiple words): """
NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
# All 26 lowercase ASCII letters as a set, for fast membership tests
# and set arithmetic against guessed/excluded letters.
ALPHABET = set(alphabet)
|
def shorten(chars, max_length):
    """Lay *chars* out round-robin into *max_length* rows.

    Each character is followed by a 4-space column gap; trailing spaces
    on every row are stripped before the rows are joined with newlines.
    """
    rows = [''] * max_length
    for index, char in enumerate(chars):
        rows[index % max_length] += char + '    '
    return '\n'.join(row.rstrip() for row in rows)
|
def multi_word(l_words, n=10):
    """Format several candidate-word lists side by side as numbered columns.

    Each list in *l_words* becomes one column, headed by its 1-based index
    and truncated to *n* entries; columns are separated by 4 spaces and
    padded so they line up. Yields the non-empty, right-stripped rows.
    """
    lines = [''] * (n + 1)
    is_first_column = True
    for col_idx, words in enumerate(l_words):
        # Pad every shorter row out to the widest row built so far.
        pad_to = max(len(line) for line in lines)
        column = [str(col_idx + 1)] + words[:n]
        for row_idx, entry in enumerate(column):
            line = lines[row_idx]
            if len(line) < pad_to:
                line = line.ljust(pad_to)
            separator = '' if is_first_column else '    '
            lines[row_idx] = line + separator + entry
        is_first_column = False
    return filter(bool, map(str.rstrip, lines))
|
def print_likely_chars(remaining_letters, let_freq):
    """Print the unguessed letters ranked by both frequency metrics.

    let_freq maps letter -> [overall count, per-word average]; the two
    rankings are laid out in 5 rows via shorten().
    """
    by_overall = sorted(remaining_letters,
                        key=lambda letter: let_freq[letter][0],
                        reverse=True)
    by_per_word = sorted(remaining_letters,
                         key=lambda letter: let_freq[letter][1],
                         reverse=True)
    print('Good candidates by overall frequency:', shorten(by_overall, 5), sep='\n')
    print('Good candidates by per-word frequency:', shorten(by_per_word, 5), sep='\n')
|
# ensures that new expression could come from previous entry
|
|
|
|
|
|
def check(prev, new, remaining_letters):
    """Validate that *new* is a legal successor of the *prev* entry.

    Both arguments are lists of pattern words; each is joined with '/'
    and the two strings are compared position by position:
      * a '/' in prev must remain a '/' in new (word layout unchanged),
      * a '.' in prev may become anything (a newly revealed letter),
      * a letter in prev must stay the same letter in new.

    Additionally, every letter already revealed in prev must still be in
    *remaining_letters*. Returns True when new is consistent with prev,
    False otherwise (including any length mismatch).
    """
    prev = '/'.join(prev)
    new = '/'.join(new)
    if len(prev) != len(new):
        return False
    good = set(re.findall('[a-z]', prev)) <= remaining_letters
    for p_cur, n_cur in zip(prev, new):
        if p_cur == '/':
            good = p_cur == n_cur
        elif p_cur == '.':
            # A blank may be filled by anything; skip the validity check
            # for this position (matches the original control flow).
            continue
        else:
            # BUG FIX: the original wrote `good == p_cur == n_cur`, a
            # discarded comparison, so a changed revealed letter was
            # never rejected. Assignment makes the check effective.
            good = p_cur == n_cur
        if not good:
            return False
    return good
|
# Module-level accumulator of letters the user has ruled out so far;
# mutated in place by iterate() across successive rounds.
negatives = set()
|
def iterate(word_list, let_freq, prev_word=None):
    """Run one interactive round: read the pattern(s), filter candidates,
    and print the updated matches and letter suggestions.

    Parameters:
        word_list: a flat word collection on the first round, or a list
            of per-pattern candidate lists on later rounds.
        let_freq: letter-frequency table from generate_letter_frequency.
        prev_word: the previously entered pattern list, used to validate
            the new entry; None on the first round.

    Returns:
        (entered_words, word_list) -- the sanitized patterns entered and
        the per-pattern lists of remaining candidate words.
    """
    if prev_word is None:
        # Strip everything that is not a lowercase letter, '.' or '/'.
        entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
    else:
        # Re-prompt until the new entry is consistent with the previous one.
        valid = False
        while not valid:
            entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
            valid = check(prev_word, entered_words, ALPHABET-negatives)
    try:
        # Duck-typing probe: a list of per-pattern lists supports [0][0];
        # a flat collection (e.g. the initial set of words) raises here.
        word_list[0][0]
    except Exception as e:
        print("Exception:", e)
        # First round: replicate the flat collection once per pattern.
        # The copies alias the same object, but each slot is rebound
        # (not mutated) below, so the aliasing is harmless.
        word_list = [word_list] * len(entered_words)
    negative_letters = re.findall('[a-z]', input(NEG_PROMPT))
    # Accumulates into the module-level `negatives` set across rounds.
    negatives.update(negative_letters)
    entered_letters = set()
    for word in entered_words:
        entered_letters.update(re.findall('[a-z]', word))
    # Letters that could still fill a blank: known to the frequency table,
    # not already placed in a pattern, and not ruled out by the user.
    remaining_letters = (ALPHABET & set(let_freq.keys())
                         ) - entered_letters - negatives
    for i, word in enumerate(entered_words):
        remaining_possibilities = filter_wordlist(
            word, remaining_letters, word_list[i])
        word_list[i] = remaining_possibilities
    print('Matches found:', '\n'.join(multi_word(word_list, 10)), sep='\n')
    print_likely_chars(remaining_letters, let_freq)
    return entered_words, word_list
|
if __name__ == "__main__":
    # Word list source: https://github.com/dwyl/english-words
    dictionary = load_words('words.txt')
    letter_freq = generate_letter_frequency(dictionary)
    print_likely_chars(ALPHABET, letter_freq)
    previous = None
    # Loop one interactive round at a time until the user hits Ctrl-C.
    while True:
        try:
            previous, dictionary = iterate(dictionary, letter_freq, previous)
        except KeyboardInterrupt:
            break