You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

195 lines
6.2 KiB

7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
  1. import base64
  2. import hashlib
  3. import os
  4. import pickle
  5. import re
  6. from string import ascii_lowercase as alphabet
  7. import sys
  8. # 32 or 64 bit platform?
  9. if sys.maxsize > 2**32:
  10. HASH_FUNC = hashlib.blake2b
  11. else:
  12. HASH_FUNC = hashlib.blake2s
  13. def load_words(filename):
  14. with open(filename) as file:
  15. text = file.read()
  16. return set(map(str.lower, filter(bool, text.split('\n'))))
  17. def _get_wordlist_hash(word_list_s):
  18. _hash = HASH_FUNC()
  19. for word in sorted(word_list_s):
  20. word_bytes = word.encode()
  21. _hash.update(word_bytes)
  22. return _hash.digest()
  23. def hash_wordlist(word_list, raw=False):
  24. word_list = sorted(word_list)
  25. fhash = _get_wordlist_hash(word_list)
  26. if raw:
  27. return fhash
  28. return base64.urlsafe_b64decode(fhash)
  29. def load_freq_cache(word_list):
  30. fname = hash_wordlist(word_list) + '.pkl'
  31. fname = os.path.join('__hangcache__', fname)
  32. if os.path.exists(fname):
  33. with open(fname, 'rb') as file:
  34. return pickle.load(file)
  35. def save_freq_cache(word_list, freq):
  36. if not os.path.exists('__hangcache__'):
  37. os.mkdir('__hangcache__')
  38. fname = hash_wordlist(word_list) + '.pkl'
  39. fname = os.path.join('__hangcache__', fname)
  40. with open(fname, 'wb') as file:
  41. pickle.dump(freq, file)
def generate_letter_frequency(word_list):
    """Build ``{letter: [total_count, avg_word_fraction]}`` for *word_list*.

    ``total_count`` is how many times the letter occurs across all words
    (repeats counted). ``avg_word_fraction`` is a rolling average of the
    letter's share of each word's length. Results are cached on disk via
    load_freq_cache / save_freq_cache.
    """
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached
    ret = {}
    for word_num, word in enumerate(word_list):
        letter_counts = {}
        # NOTE(review): index `i` is unused; kept for byte-compat.
        for i, letter in enumerate(word):
            try:
                # Seen before: bump the overall occurrence count.
                ret[letter][0] += 1
            except KeyError:
                # First sighting: [count, rolling per-word average].
                ret[letter] = [1, 0]
            in_word = letter_counts.get(letter, 0) + 1
            letter_counts[letter] = in_word
        for letter, count in letter_counts.items():
            # Fraction of this word made up by `letter`.
            word_portion = count/len(word)
            # Rolling average update: old_avg*word_num + new, over word_num+1.
            # NOTE(review): only letters present in the current word are
            # updated, yet the divisor assumes an average over *all* words
            # so far — verify this asymmetry is intentional.
            avg = (ret[letter][1] * word_num) + word_portion
            avg /= word_num + 1
            ret[letter][1] = avg
    # Redundant guard: cached is always None here (early return above).
    if cached is None:
        save_freq_cache(word_list, ret)
    return ret
  64. def filter_wordlist(input, remaining_letters, word_list):
  65. regex = re.compile(input.replace(
  66. '.', '[{}]'.format(''.join(remaining_letters))) + '$')
  67. matches = map(regex.match, word_list)
  68. remaining_words = (group[1] for group in filter(
  69. lambda group: group[0], zip(matches, word_list)))
  70. return list(remaining_words)
  71. PROMPT = """Enter word with '.' to represent missing letters
  72. ('/' to separate multiple words): """
  73. NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
  74. ALPHABET = set(letter for letter in alphabet)
  75. def shorten(chars, max_length):
  76. rows = [''] * max_length
  77. for i, char in enumerate(chars):
  78. row_num = i % max_length
  79. addition = char + ' ' * 4
  80. rows[row_num] += addition
  81. return '\n'.join(map(str.rstrip, rows))
  82. def multi_word(l_words, n=10):
  83. # breakpoint()
  84. rows = [''] * (n+1)
  85. first = True
  86. for count, words in enumerate(l_words):
  87. offset = max(map(len, rows))
  88. working_set = words[:min(len(words), n)]
  89. working_set.insert(0, str(count+1))
  90. for i, word in enumerate(working_set):
  91. prev_line = rows[i]
  92. if len(prev_line) < offset:
  93. prev_line += ' '*(offset-len(prev_line))
  94. rows[i] = prev_line+(' '*4 if not first else '')+word
  95. first = False
  96. return filter(bool, map(str.rstrip, rows))
  97. def print_likely_chars(remaining_letters, let_freq):
  98. overall = shorten(sorted(remaining_letters,
  99. key=lambda letter: let_freq[letter][0],
  100. reverse=True), 5)
  101. per_word = shorten(sorted(remaining_letters,
  102. key=lambda letter: let_freq[letter][1],
  103. reverse=True), 5)
  104. print('Good candidates by overall frequency:', overall, sep='\n')
  105. print('Good candidates by per-word frequency:', per_word, sep='\n')
  106. # ensures that new expression could come from previous entry
  107. def check(prev, new, remaining_letters):
  108. prev = '/'.join(prev)
  109. new = '/'.join(new)
  110. if len(prev) == len(new):
  111. good = set(re.findall('[a-z]', prev)) <= remaining_letters
  112. for i in range(len(prev)):
  113. p_cur = prev[i]
  114. n_cur = new[i]
  115. if p_cur == '/':
  116. good = p_cur == n_cur
  117. elif p_cur == '.':
  118. continue
  119. else:
  120. good == p_cur == n_cur
  121. if not good:
  122. return False
  123. return good
  124. else:
  125. return False
# Module-level accumulator of letters the user has ruled out; mutated by
# iterate() so eliminations persist across successive rounds.
negatives = set()
def iterate(word_list, let_freq, prev_word=None):
    """Run one interactive guessing round.

    Prompts for the current pattern(s) and newly excluded letters, narrows
    each word list, prints the surviving matches and the letter rankings,
    and returns ``(entered_words, word_list)`` for the next round.
    Mutates the module-level ``negatives`` set.
    """
    if prev_word is None:
        # First round: accept any input made of letters, dots and slashes.
        entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
    else:
        # Later rounds: re-prompt until the entry is consistent with the
        # previous one (same length, revealed letters unchanged).
        valid = False
        while not valid:
            entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
            valid = check(prev_word, entered_words, ALPHABET-negatives)
    try:
        # Duck-typed probe: a list-of-lists supports double indexing. On
        # the first call word_list is a set (unsubscriptable), so this
        # raises and we fan it out into one entry per entered word.
        # NOTE(review): the fan-out aliases one object len() times; it
        # works because filter_wordlist returns fresh lists — verify.
        word_list[0][0]
    except Exception as e:
        print("Exception:", e)
        word_list = [word_list] * len(entered_words)
    negative_letters = re.findall('[a-z]', input(NEG_PROMPT))
    negatives.update(negative_letters)
    entered_letters = set()
    for word in entered_words:
        entered_letters.update(re.findall('[a-z]', word))
    # Candidates for '.' slots: known letters minus revealed and excluded.
    remaining_letters = (ALPHABET & set(let_freq.keys())
                         ) - entered_letters - negatives
    for i, word in enumerate(entered_words):
        remaining_possibilities = filter_wordlist(
            word, remaining_letters, word_list[i])
        word_list[i] = remaining_possibilities
    print('Matches found:', '\n'.join(multi_word(word_list, 10)), sep='\n')
    print_likely_chars(remaining_letters, let_freq)
    return entered_words, word_list
if __name__ == "__main__":
    # src: https://github.com/dwyl/english-words
    # Load the dictionary, show the initial letter rankings, then run
    # interactive rounds until the user interrupts with Ctrl-C.
    words = load_words('words.txt')
    FREQ = generate_letter_frequency(words)
    print_likely_chars(ALPHABET, FREQ)
    last = None
    while True:
        try:
            last, words = iterate(words, FREQ, last)
        except KeyboardInterrupt:
            break