You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

112 lines
4.1 KiB

import base64
import codecs
import hashlib
import json
import os
import pickle
import re
import sys
from string import ascii_lowercase as alphabet
# Pick the widest BLAKE2 variant the platform supports:
# blake2b is tuned for 64-bit builds, blake2s for 32-bit ones.
HASH_FUNC = hashlib.blake2b if sys.maxsize > 2**32 else hashlib.blake2s
def load_words(filename):
    """Read *filename* and return the set of its non-empty lines, lowercased."""
    with open(filename) as handle:
        contents = handle.read()
    # Splitting on '\n' and dropping falsy entries skips blank lines.
    return {line.lower() for line in contents.split('\n') if line}
def _get_wordlist_hash(word_list_s):
    """Hash the words of *word_list_s* in sorted order; return the raw digest.

    Sorting makes the digest independent of the input's iteration order.
    """
    digest = HASH_FUNC()
    for entry in sorted(word_list_s):
        digest.update(entry.encode())
    return digest.digest()
def hash_wordlist(word_list, raw=False):
    """Return a stable hash of *word_list* (any iterable of strings).

    With raw=True, return the digest as raw bytes.  Otherwise return the
    digest encoded as padded URL-safe base64 ('+' -> '-', '/' -> '_', no
    newlines), suitable for use as a filename.

    Fixes over the original: the redundant pre-sort is gone
    (_get_wordlist_hash sorts already), and the hand-rolled codecs+regex
    base64url encoding is replaced by base64.urlsafe_b64encode, which
    produces byte-identical output.
    """
    fhash = _get_wordlist_hash(word_list)
    if raw:
        return fhash
    return base64.urlsafe_b64encode(fhash).decode()
def load_freq_cache(word_list):
    """Return the cached letter-frequency dict for *word_list*, or None.

    The cache file lives under __hangcache__/ and is named after the
    word-list hash.  NOTE(review): this unpickles whatever is in the cache
    directory — only safe while that directory is trusted.
    """
    fname = os.path.join('__hangcache__', hash_wordlist(word_list) + '.pkl')
    # EAFP: attempt the open instead of racing an exists() check.
    try:
        with open(fname, 'rb') as file:
            return pickle.load(file)
    except FileNotFoundError:
        return None
def save_freq_cache(word_list, freq):
    """Pickle *freq* into __hangcache__/, keyed by the hash of *word_list*."""
    # makedirs(exist_ok=True) avoids the exists()/mkdir() race of the original.
    os.makedirs('__hangcache__', exist_ok=True)
    fname = os.path.join('__hangcache__', hash_wordlist(word_list) + '.pkl')
    with open(fname, 'wb') as file:
        pickle.dump(freq, file)
def generate_letter_frequency(word_list):
    """Return {letter: [total_occurrences, avg_portion]} for *word_list*.

    total_occurrences counts every occurrence of the letter across all
    words.  avg_portion is the mean, over ALL words, of the fraction of a
    word's characters that are this letter (words lacking the letter
    contribute 0 to that mean).

    Results are cached on disk, keyed by a hash of the word list.

    Bug fixes over the original: the incremental running average multiplied
    the previous average by the current word *index* rather than by the
    number of words actually averaged so far, inflating the average for any
    letter absent from intermediate words; and the `if cached is None:`
    guard before saving was dead code (always true after the early return).
    Cache files written by the old version may therefore hold stale values.
    """
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached
    ret = {}
    num_words = 0
    for word in word_list:
        num_words += 1
        letter_counts = {}
        for letter in word:
            letter_counts[letter] = letter_counts.get(letter, 0) + 1
        for letter, count in letter_counts.items():
            stats = ret.setdefault(letter, [0, 0.0])
            stats[0] += count
            # Accumulate this word's portion; divided by num_words below.
            stats[1] += count / len(word)
    if num_words:
        for stats in ret.values():
            stats[1] /= num_words
    save_freq_cache(word_list, ret)
    return ret
PROMPT = "Enter word with '.' to represent missing letters: "
NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
# All lowercase ASCII letters, as a set for fast membership tests.
ALPHABET = set(alphabet)
def shorten(chars, max_length):
    """Lay *chars* out over *max_length* text rows, dealt round-robin.

    Each character is padded with four trailing spaces; trailing whitespace
    is stripped from every row and rows are joined with newlines.
    """
    rows = [''] * max_length
    for position, char in enumerate(chars):
        rows[position % max_length] += char + '    '
    return '\n'.join(row.rstrip() for row in rows)
def print_likely_chars(remaining_letters, let_freq):
    """Print *remaining_letters* ranked by each of the two frequency stats.

    Stat index 0 is the overall occurrence count, index 1 the per-word
    frequency; both rankings are laid out in 5 rows via shorten().
    """
    def ranking(stat_index):
        ordered = sorted(remaining_letters,
                         key=lambda letter: let_freq[letter][stat_index],
                         reverse=True)
        return shorten(ordered, 5)
    print('Good candidates by overall frequency:\n' + ranking(0))
    print('Good candidates by per-word frequency:\n' + ranking(1))
# Letters the user has ruled out, accumulated across rounds of iterate().
negatives = set()

def iterate(word_list, let_freq):
    """Run one solver round: prompt the user, then filter the candidates.

    Reads the partially-known word (with '.' for unknowns) and any newly
    excluded letters from stdin, narrows *word_list* accordingly, prints up
    to 10 matches plus letter-ranking hints, and returns the pair
    (entered_word, remaining_possibilities).  Mutates the module-level
    `negatives` set as a side effect.
    """
    # Sanitise input: keep only lowercase letters and '.' placeholders.
    entered_word = re.sub(r'[^a-z\.]','',input(PROMPT))
    negative_letters = re.findall('[a-z]',input(NEG_PROMPT))
    negatives.update(negative_letters)
    # Letters already placed in the word are no longer candidates.
    entered_letters = set(letter for letter in entered_word.replace('.',''))
    # Candidates = letters seen in the frequency table, minus placed ones,
    # minus everything the user has excluded so far.
    remaining_letters = set(filter(lambda letter: letter in ALPHABET,let_freq.keys())) - entered_letters - negatives
    # Each '.' may match any candidate letter; '$' anchors the end so
    # re.match behaves like a full-word match.
    # NOTE(review): if remaining_letters is ever empty this builds the
    # invalid character class '[]' and re.match raises re.error — confirm
    # whether that state is reachable in practice.
    regex = entered_word.replace('.','[{}]'.format(''.join(remaining_letters))) + '$'
    remaining_possibilities = list(filter(lambda word: re.match(regex,word),word_list))
    # Show at most the first 10 surviving words.
    print('Matches found:\n' + '\n'.join(remaining_possibilities[i] for i in range(min(10,len(remaining_possibilities)))))
    print_likely_chars(remaining_letters,let_freq)
    return entered_word,remaining_possibilities
if __name__ == "__main__":
    # Bootstrap: load the dictionary, build (or load) the frequency table,
    # and show the initial letter rankings for the full alphabet.
    words = load_words('words.txt')
    FREQ = generate_letter_frequency(words)
    print_likely_chars(ALPHABET, FREQ)
    # Keep narrowing the candidate list until the user presses Ctrl-C.
    try:
        while True:
            last, words = iterate(words, FREQ)
    except KeyboardInterrupt:
        pass