|
|
|
@ -1,21 +1,22 @@ |
|
|
|
import codecs |
|
|
|
import base64 |
|
|
|
import hashlib |
|
|
|
import json |
|
|
|
import os |
|
|
|
import pickle |
|
|
|
import re |
|
|
|
from string import ascii_lowercase as alphabet |
|
|
|
import sys |
|
|
|
# Choose the BLAKE2 variant matching the platform word size:
# blake2b is tuned for 64-bit builds, blake2s for 32-bit builds.
HASH_FUNC = hashlib.blake2b if sys.maxsize > 2**32 else hashlib.blake2s
|
|
|
|
|
|
|
|
|
|
|
def load_words(filename):
    """Read *filename* and return the set of its non-empty lines, lowercased."""
    with open(filename) as handle:
        lines = handle.read().split('\n')
    # Empty lines are dropped; words are normalized to lowercase.
    return {line.lower() for line in lines if line}
|
|
|
|
|
|
|
|
|
|
|
def _get_wordlist_hash(word_list_s): |
|
|
|
_hash = HASH_FUNC() |
|
|
|
@ -24,101 +25,115 @@ def _get_wordlist_hash(word_list_s): |
|
|
|
_hash.update(word_bytes) |
|
|
|
return _hash.digest() |
|
|
|
|
|
|
|
def hash_wordlist(word_list, raw=False):
    """Hash *word_list* order-insensitively (it is sorted first).

    Returns the raw digest bytes when raw=True; otherwise a URL/filename-safe
    base64 string suitable for use as a cache file name.
    """
    word_list = sorted(word_list)
    fhash = _get_wordlist_hash(word_list)
    if raw:
        return fhash
    # Bug fix: the digest must be base64-ENcoded (the previous code called
    # urlsafe_b64decode), and decoded to str so callers can append '.pkl'.
    # urlsafe_b64encode already maps '+'/'/' to '-'/'_' and emits no newlines,
    # replacing the old codecs + re.sub cleanup.
    return base64.urlsafe_b64encode(fhash).decode()
|
|
|
|
|
|
|
|
|
|
|
def load_freq_cache(word_list):
    """Return the cached letter-frequency table for *word_list*, or None."""
    cache_path = os.path.join('__hangcache__', hash_wordlist(word_list) + '.pkl')
    # Missing cache file means no cached result.
    if not os.path.exists(cache_path):
        return None
    with open(cache_path, 'rb') as handle:
        return pickle.load(handle)
|
|
|
|
|
|
|
def save_freq_cache(word_list, freq):
    """Persist *freq* (the letter-frequency table) keyed by word_list's hash."""
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs('__hangcache__', exist_ok=True)
    fname = hash_wordlist(word_list) + '.pkl'
    fname = os.path.join('__hangcache__', fname)
    with open(fname, 'wb') as file:
        pickle.dump(freq, file)
|
|
|
|
|
|
|
|
|
|
|
def generate_letter_frequency(word_list):
    """Build ``{letter: [total_count, avg_fraction_per_word]}`` for *word_list*.

    ``total_count`` is how many times the letter occurs across all words;
    the second slot is a running mean of the letter's share of each word's
    length.  Results are cached on disk, keyed by the word-list hash.
    """
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached
    ret = {}
    for word_num, word in enumerate(word_list):
        letter_counts = {}
        for letter in word:  # the index was unused; plain iteration suffices
            try:
                ret[letter][0] += 1
            except KeyError:
                ret[letter] = [1, 0]
            in_word = letter_counts.get(letter, 0) + 1
            letter_counts[letter] = in_word
        for letter, count in letter_counts.items():
            word_portion = count / len(word)
            # Running mean over word_num + 1 words.
            # NOTE(review): letters absent from this word keep their previous
            # mean untouched, so the effective denominator differs between
            # letters — presumably intentional; confirm before changing.
            avg = (ret[letter][1] * word_num) + word_portion
            avg /= word_num + 1
            ret[letter][1] = avg
    # `cached` is known to be None here (we returned early otherwise), so the
    # redundant `if cached is None` guard was removed: always persist.
    save_freq_cache(word_list, ret)
    return ret
|
|
|
|
|
|
|
def filter_wordlist(input, remaining_letters, word_list):
    """Return the words of *word_list* matching the pattern *input*.

    Each '.' in *input* may stand for any letter in *remaining_letters*;
    the match is anchored at both ends (re.match plus a trailing '$').

    NOTE(review): the parameter name `input` shadows the builtin; it is kept
    unchanged for interface compatibility with existing callers.
    """
    char_class = '[{}]'.format(''.join(remaining_letters))
    regex = re.compile(input.replace('.', char_class) + '$')
    # A comprehension replaces the old map/zip/filter/generator chain; the
    # result was already materialized into a list before being returned.
    return [word for word in word_list if regex.match(word)]
|
|
|
|
|
|
|
|
|
|
|
# User-facing prompts for the interactive loop.
PROMPT = """Enter word with '.' to represent missing letters
('/' to separate multiple words): """
NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
# All candidate letters; set(str) iterates characters directly (the explicit
# generator expression was redundant).
ALPHABET = set(alphabet)
|
|
|
|
|
|
|
def shorten(chars, max_length):
    """Lay *chars* out column-wise across *max_length* rows, four spaces apart."""
    rows = [[] for _ in range(max_length)]
    # Distribute characters round-robin over the rows.
    for index, char in enumerate(chars):
        rows[index % max_length].append(char)
    # Joining with the 4-space gutter leaves no trailing whitespace, matching
    # the append-then-rstrip behavior of the original.
    return '\n'.join('    '.join(row) for row in rows)
|
|
|
|
|
|
|
|
|
|
|
def multi_word(l_words, n=10):
    """Format several word lists side by side: one numbered column per list,
    at most *n* words each, separated by a four-space gutter.

    Returns a lazy iterable of the non-empty, right-stripped rows.
    """
    rows = [''] * (n + 1)  # one header row (the column number) plus n words
    for column, words in enumerate(l_words):
        width = max(len(row) for row in rows)
        column_entries = [str(column + 1)] + words[:n]
        gutter = '' if column == 0 else ' ' * 4
        for row_index, entry in enumerate(column_entries):
            # Pad this row to the current column boundary before appending.
            rows[row_index] = rows[row_index].ljust(width) + gutter + entry
    return filter(bool, map(str.rstrip, rows))
|
|
|
|
|
|
|
|
|
|
|
def print_likely_chars(remaining_letters, let_freq):
    """Print *remaining_letters* ranked by overall and by per-word frequency."""
    def ranked(stat_index):
        # Sort best-first by the chosen statistic, laid out in 5 rows.
        ordered = sorted(remaining_letters,
                         key=lambda letter: let_freq[letter][stat_index],
                         reverse=True)
        return shorten(ordered, 5)

    print('Good candidates by overall frequency:', ranked(0), sep='\n')
    print('Good candidates by per-word frequency:', ranked(1), sep='\n')
|
|
|
|
|
|
|
# ensures that new expression could come from previous entry |
|
|
|
def check(prev,new,remaining_letters): |
|
|
|
|
|
|
|
|
|
|
|
def check(prev, new, remaining_letters): |
|
|
|
prev = '/'.join(prev) |
|
|
|
new = '/'.join(new) |
|
|
|
if len(prev) == len(new): |
|
|
|
good = set(re.findall('[a-z]',prev)) <= remaining_letters |
|
|
|
good = set(re.findall('[a-z]', prev)) <= remaining_letters |
|
|
|
for i in range(len(prev)): |
|
|
|
p_cur = prev[i] |
|
|
|
n_cur = new[i] |
|
|
|
@ -134,42 +149,47 @@ def check(prev,new,remaining_letters): |
|
|
|
else: |
|
|
|
return False |
|
|
|
|
|
|
|
|
|
|
|
# Letters the user has confirmed are absent from the puzzle; module-level
# mutable state shared across successive iterate() calls.
negatives = set()
|
|
|
|
|
|
|
def iterate(word_list, let_freq, prev_word=None):
    """Run one interactive round: read the puzzle state, filter each word's
    candidate list, and print matches plus suggested letters.

    Returns (entered_words, word_list) so the caller can feed them back in
    on the next round.
    """
    if prev_word is None:
        # First round: accept whatever the user types (letters, '.', '/').
        entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
    else:
        # Later rounds: re-prompt until the new entry is consistent with the
        # previous one and with the letters not yet ruled out.
        valid = False
        while not valid:
            entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
            valid = check(prev_word, entered_words, ALPHABET - negatives)
    try:
        # A list-of-lists (one candidate list per word) is subscriptable
        # twice; the initial flat word set is not, so normalize it.  This is
        # expected control flow on the first round, so no debug print here
        # (the previous broad `except Exception` also printed the exception).
        word_list[0][0]
    except (TypeError, IndexError):
        word_list = [word_list] * len(entered_words)
    negative_letters = re.findall('[a-z]', input(NEG_PROMPT))
    negatives.update(negative_letters)
    # (removed: an unused `output = []` local)
    entered_letters = set()
    for word in entered_words:
        entered_letters.update(re.findall('[a-z]', word))
    remaining_letters = (ALPHABET & set(let_freq.keys())) - entered_letters - negatives
    for i, word in enumerate(entered_words):
        word_list[i] = filter_wordlist(word, remaining_letters, word_list[i])
    print('Matches found:', '\n'.join(multi_word(word_list, 10)), sep='\n')
    print_likely_chars(remaining_letters, let_freq)
    return entered_words, word_list
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Word list source: https://github.com/dwyl/english-words
    words = load_words('words.txt')
    FREQ = generate_letter_frequency(words)
    print_likely_chars(ALPHABET, FREQ)
    last = None
    # Keep refining guesses until the user aborts with Ctrl-C.
    while True:
        try:
            last, words = iterate(words, FREQ, last)
        except KeyboardInterrupt:
            break