You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

195 lines
6.2 KiB

7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
  1. import base64
  2. import hashlib
  3. import os
  4. import pickle
  5. import re
  6. from string import ascii_lowercase as alphabet
  7. import sys
  8. # 32 or 64 bit platform?
  9. if sys.maxsize > 2**32:
  10. HASH_FUNC = hashlib.blake2b
  11. else:
  12. HASH_FUNC = hashlib.blake2s
  13. def load_words(filename):
  14. with open(filename) as file:
  15. text = file.read()
  16. return set(map(str.lower, filter(bool, text.split('\n'))))
  17. def _get_wordlist_hash(word_list_s):
  18. _hash = HASH_FUNC()
  19. for word in sorted(word_list_s):
  20. word_bytes = word.encode()
  21. _hash.update(word_bytes)
  22. return _hash.digest()
  23. def hash_wordlist(word_list, raw=False):
  24. word_list = sorted(word_list)
  25. fhash = _get_wordlist_hash(word_list)
  26. if raw:
  27. return fhash
  28. return base64.urlsafe_b64decode(fhash)
  29. def load_freq_cache(word_list):
  30. fname = hash_wordlist(word_list) + '.pkl'
  31. fname = os.path.join('__hangcache__', fname)
  32. if os.path.exists(fname):
  33. with open(fname, 'rb') as file:
  34. return pickle.load(file)
  35. def save_freq_cache(word_list, freq):
  36. if not os.path.exists('__hangcache__'):
  37. os.mkdir('__hangcache__')
  38. fname = hash_wordlist(word_list) + '.pkl'
  39. fname = os.path.join('__hangcache__', fname)
  40. with open(fname, 'wb') as file:
  41. pickle.dump(freq, file)
def generate_letter_frequency(word_list):
    """Build ``{letter: [total_count, avg_word_fraction]}`` for *word_list*.

    ``total_count`` is how many times the letter occurs across all words
    (repeats counted). ``avg_word_fraction`` is a rolling average of the
    letter's share of each word's length. Results are cached on disk via
    load_freq_cache / save_freq_cache.
    """
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached
    ret = {}
    for word_num, word in enumerate(word_list):
        letter_counts = {}
        # NOTE(review): index `i` is unused; kept for byte-compat.
        for i, letter in enumerate(word):
            try:
                # Seen before: bump the overall occurrence count.
                ret[letter][0] += 1
            except KeyError:
                # First sighting: [count, rolling per-word average].
                ret[letter] = [1, 0]
            in_word = letter_counts.get(letter, 0) + 1
            letter_counts[letter] = in_word
        for letter, count in letter_counts.items():
            # Fraction of this word made up by `letter`.
            word_portion = count/len(word)
            # Rolling average update: old_avg*word_num + new, over word_num+1.
            # NOTE(review): only letters present in the current word are
            # updated, yet the divisor assumes an average over *all* words
            # so far — verify this asymmetry is intentional.
            avg = (ret[letter][1] * word_num) + word_portion
            avg /= word_num + 1
            ret[letter][1] = avg
    # Redundant guard: cached is always None here (early return above).
    if cached is None:
        save_freq_cache(word_list, ret)
    return ret
  64. def filter_wordlist(input, remaining_letters, word_list):
  65. regex = re.compile(input.replace(
  66. '.', '[{}]'.format(''.join(remaining_letters))) + '$')
  67. matches = map(regex.match, word_list)
  68. remaining_words = (group[1] for group in filter(
  69. lambda group: group[0], zip(matches, word_list)))
  70. return list(remaining_words)
  71. PROMPT = """Enter word with '.' to represent missing letters
  72. ('/' to separate multiple words): """
  73. NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
  74. ALPHABET = set(letter for letter in alphabet)
  75. def shorten(chars, max_length):
  76. rows = [''] * max_length
  77. for i, char in enumerate(chars):
  78. row_num = i % max_length
  79. addition = char + ' ' * 4
  80. rows[row_num] += addition
  81. return '\n'.join(map(str.rstrip, rows))
  82. def multi_word(l_words, n=10):
  83. # breakpoint()
  84. rows = [''] * (n+1)
  85. first = True
  86. for count, words in enumerate(l_words):
  87. offset = max(map(len, rows))
  88. working_set = words[:min(len(words), n)]
  89. working_set.insert(0, str(count+1))
  90. for i, word in enumerate(working_set):
  91. prev_line = rows[i]
  92. if len(prev_line) < offset:
  93. prev_line += ' '*(offset-len(prev_line))
  94. rows[i] = prev_line+(' '*4 if not first else '')+word
  95. first = False
  96. return filter(bool, map(str.rstrip, rows))
  97. def print_likely_chars(remaining_letters, let_freq):
  98. overall = shorten(sorted(remaining_letters,
  99. key=lambda letter: let_freq[letter][0],
  100. reverse=True), 5)
  101. per_word = shorten(sorted(remaining_letters,
  102. key=lambda letter: let_freq[letter][1],
  103. reverse=True), 5)
  104. print('Good candidates by overall frequency:', overall, sep='\n')
  105. print('Good candidates by per-word frequency:', per_word, sep='\n')
  106. # ensures that new expression could come from previous entry
  107. def check(prev, new, remaining_letters):
  108. prev = '/'.join(prev)
  109. new = '/'.join(new)
  110. if len(prev) == len(new):
  111. good = set(re.findall('[a-z]', prev)) <= remaining_letters
  112. for i in range(len(prev)):
  113. p_cur = prev[i]
  114. n_cur = new[i]
  115. if p_cur == '/':
  116. good = p_cur == n_cur
  117. elif p_cur == '.':
  118. continue
  119. else:
  120. good == p_cur == n_cur
  121. if not good:
  122. return False
  123. return good
  124. else:
  125. return False
# Module-level accumulator of letters the user has ruled out; mutated by
# iterate() so eliminations persist across successive rounds.
negatives = set()
def iterate(word_list, let_freq, prev_word=None):
    """Run one interactive guessing round.

    Prompts for the current pattern(s) and newly excluded letters, narrows
    each word list, prints the surviving matches and the letter rankings,
    and returns ``(entered_words, word_list)`` for the next round.
    Mutates the module-level ``negatives`` set.
    """
    if prev_word is None:
        # First round: accept any input made of letters, dots and slashes.
        entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
    else:
        # Later rounds: re-prompt until the entry is consistent with the
        # previous one (same length, revealed letters unchanged).
        valid = False
        while not valid:
            entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
            valid = check(prev_word, entered_words, ALPHABET-negatives)
    try:
        # Duck-typed probe: a list-of-lists supports double indexing. On
        # the first call word_list is a set (unsubscriptable), so this
        # raises and we fan it out into one entry per entered word.
        # NOTE(review): the fan-out aliases one object len() times; it
        # works because filter_wordlist returns fresh lists — verify.
        word_list[0][0]
    except Exception as e:
        print("Exception:", e)
        word_list = [word_list] * len(entered_words)
    negative_letters = re.findall('[a-z]', input(NEG_PROMPT))
    negatives.update(negative_letters)
    entered_letters = set()
    for word in entered_words:
        entered_letters.update(re.findall('[a-z]', word))
    # Candidates for '.' slots: known letters minus revealed and excluded.
    remaining_letters = (ALPHABET & set(let_freq.keys())
                         ) - entered_letters - negatives
    for i, word in enumerate(entered_words):
        remaining_possibilities = filter_wordlist(
            word, remaining_letters, word_list[i])
        word_list[i] = remaining_possibilities
    print('Matches found:', '\n'.join(multi_word(word_list, 10)), sep='\n')
    print_likely_chars(remaining_letters, let_freq)
    return entered_words, word_list
if __name__ == "__main__":
    # src: https://github.com/dwyl/english-words
    # Load the dictionary, show the initial letter rankings, then run
    # interactive rounds until the user interrupts with Ctrl-C.
    words = load_words('words.txt')
    FREQ = generate_letter_frequency(words)
    print_likely_chars(ALPHABET, FREQ)
    last = None
    while True:
        try:
            last, words = iterate(words, FREQ, last)
        except KeyboardInterrupt:
            break