import codecs
import hashlib
import json
import os
import pickle
import re
from string import ascii_lowercase as alphabet
import sys

# Pick the hash used for cache-file names: blake2b is optimized for 64-bit
# platforms, blake2s for 32-bit ones.
if sys.maxsize > 2**32:
    HASH_FUNC = hashlib.blake2b
else:
    HASH_FUNC = hashlib.blake2s


def load_words(filename):
    # one word per line; lowercase everything and skip blank lines
    with open(filename) as file:
        text = file.read()
    return set(map(str.lower, filter(bool, text.split('\n'))))


def _get_wordlist_hash(word_list_s):
    _hash = HASH_FUNC()
    for word in sorted(word_list_s):
        word_bytes = word.encode()
        _hash.update(word_bytes)
    return _hash.digest()


def hash_wordlist(word_list, raw=False):
    # stable digest of a word list, used as the cache-file name
    fhash = _get_wordlist_hash(word_list)
    if raw:
        return fhash
    # base64 output may contain '+', '/' and newlines; make it filename-safe
    illegal_hash = codecs.encode(fhash, 'base64').decode()
    replacements = {'+': '-', '/': '_', None: ''}
    return re.sub(r'(\+|\/)|\n', lambda match: replacements[match.group(1)], illegal_hash)


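# A quick illustrative check of the cache-key scheme above (this _demo_hash_wordlist
# helper is a sketch added for clarity, not part of the original module): the digest
# is independent of word order and safe to use as a file name.
def _demo_hash_wordlist():
    tiny_list = {'cat', 'dog', 'fish'}
    key = hash_wordlist(tiny_list)
    # the same words in any order produce the same key
    assert key == hash_wordlist(['fish', 'dog', 'cat'])
    # and the key contains none of the characters stripped out above
    assert not set(key) & {'+', '/', '\n'}
    return key

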
def load_freq_cache(word_list):
    # return the cached frequency table for this word list, or None on a miss
    fname = hash_wordlist(word_list) + '.pkl'
    fname = os.path.join('__hangcache__', fname)
    if os.path.exists(fname):
        with open(fname, 'rb') as file:
            return pickle.load(file)


def save_freq_cache(word_list, freq):
    if not os.path.exists('__hangcache__'):
        os.mkdir('__hangcache__')
    fname = hash_wordlist(word_list) + '.pkl'
    fname = os.path.join('__hangcache__', fname)
    with open(fname, 'wb') as file:
        pickle.dump(freq, file)


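# Illustrative sketch (this _demo_freq_cache helper is not part of the original
# module): the cache simply round-trips a frequency table through a pickle file
# under __hangcache__/.
def _demo_freq_cache():
    tiny_list = ['cat', 'dog']
    table = {'c': [1, 1 / 3], 'a': [1, 1 / 3]}
    save_freq_cache(tiny_list, table)
    assert load_freq_cache(tiny_list) == table

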
def generate_letter_frequency(word_list):
    cached = load_freq_cache(word_list)
    if cached is not None:
        return cached
    # letter -> [total occurrences, running average of the per-word proportion]
    ret = {}
    for word_num, word in enumerate(word_list):
        letter_counts = {}
        for letter in word:
            try:
                ret[letter][0] += 1
            except KeyError:
                ret[letter] = [1, 0]
            letter_counts[letter] = letter_counts.get(letter, 0) + 1
        for letter, count in letter_counts.items():
            word_portion = count / len(word)
            avg = (ret[letter][1] * word_num) + word_portion
            avg /= word_num + 1
            ret[letter][1] = avg
    save_freq_cache(word_list, ret)
    return ret


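# Illustrative sketch (this _demo_letter_frequency helper is not part of the
# original module): each letter maps to [total occurrences across the list,
# approximate running average of the fraction of a word made up by that letter].
def _demo_letter_frequency():
    freq = generate_letter_frequency(['aba', 'cab'])   # note: writes a cache file
    total_a, per_word_a = freq['a']
    assert total_a == 3          # 'a' occurs twice in 'aba' and once in 'cab'
    assert 0 < per_word_a <= 1   # a per-word proportion, not a raw count
    return freq

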
def filter_wordlist(pattern, remaining_letters, word_list):
    # every '.' may only stand for a letter that has not been ruled out yet
    regex = re.compile(pattern.replace('.', '[{}]'.format(''.join(remaining_letters))) + '$')
    return [word for word in word_list if regex.match(word)]


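# Illustrative sketch (this _demo_filter_wordlist helper is not part of the
# original module): ruled-out letters prune the candidate list because '.' only
# matches letters that are still in play.
def _demo_filter_wordlist():
    candidates = ['cat', 'cot', 'cut', 'car']
    # pattern 'c.t' with only 'a' and 'o' still available keeps 'cat' and 'cot'
    return filter_wordlist('c.t', {'a', 'o'}, candidates)

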
PROMPT = "Enter word with '.' to represent missing letters ('/' to separate multiple words): "
NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
ALPHABET = set(alphabet)


def shorten(chars, max_length):
    # lay the characters out column by column over `max_length` rows
    rows = [''] * max_length
    for i, char in enumerate(chars):
        rows[i % max_length] += char + ' ' * 4
    return '\n'.join(map(str.rstrip, rows))


def multi_word(l_words, n=10):
    # build one numbered column of up to `n` candidate words per entered word
    rows = [''] * (n + 1)
    first = True
    for count, words in enumerate(l_words):
        offset = max(map(len, rows))
        working_set = words[:n]
        working_set.insert(0, str(count + 1))
        for i, word in enumerate(working_set):
            prev_line = rows[i]
            if len(prev_line) < offset:
                prev_line += ' ' * (offset - len(prev_line))
            rows[i] = prev_line + ('' if first else ' ' * 4) + word
        first = False
    return filter(bool, map(str.rstrip, rows))


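# Illustrative sketch (this _demo_display helper is not part of the original
# module): shorten() lays characters out column by column over a fixed number of
# rows, and multi_word() builds one numbered column of candidates per entered word.
def _demo_display():
    print(shorten('abcdefg', 3))
    # a    d    g
    # b    e
    # c    f
    print('\n'.join(multi_word([['cat', 'cot', 'cut'], ['dog']], n=2)))
    # column 1 lists up to 2 candidates for the first word, column 2 for the second

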
def print_likely_chars(remaining_letters, let_freq):
    overall = shorten(sorted(remaining_letters, key=lambda letter: let_freq[letter][0], reverse=True), 5)
    per_word = shorten(sorted(remaining_letters, key=lambda letter: let_freq[letter][1], reverse=True), 5)
    print('Good candidates by overall frequency:', overall, sep='\n')
    print('Good candidates by per-word frequency:', per_word, sep='\n')


# ensures that the new expression could have come from the previous entry
def check(prev, new, remaining_letters):
    prev = '/'.join(prev)
    new = '/'.join(new)
    if len(prev) != len(new):
        return False
    good = set(re.findall('[a-z]', prev)) <= remaining_letters
    for i in range(len(prev)):
        p_cur = prev[i]
        n_cur = new[i]
        if p_cur == '/':
            good = p_cur == n_cur
        elif p_cur == '.':
            continue
        else:
            good = p_cur == n_cur
        if not good:
            return False
    return good


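# Illustrative sketch (this _demo_check helper is not part of the original
# module): a previously revealed letter must stay where it was, and the overall
# shape (length and '/' separators) may not change between entries.
def _demo_check():
    available = ALPHABET - {'x', 'z'}              # letters not yet ruled out
    assert check(['.a.'], ['.at'], available)      # new letter filled in, 'a' kept
    assert not check(['.a.'], ['.o.'], available)  # a confirmed letter changed
    assert not check(['.a.'], ['.a'], available)   # the length changed
    return True

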
negatives = set()


def iterate(word_list, let_freq, prev_word=None):
    if prev_word is None:
        entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
    else:
        # keep asking until the new entry is consistent with the previous one
        valid = False
        while not valid:
            entered_words = re.sub(r'[^a-z\./]', '', input(PROMPT)).split('/')
            valid = check(prev_word, entered_words, ALPHABET - negatives)
    try:
        word_list[0][0]
    except (TypeError, IndexError):
        # still the flat starting word set: use it as the starting pool for every entered word
        word_list = [word_list] * len(entered_words)
    negative_letters = re.findall('[a-z]', input(NEG_PROMPT))
    negatives.update(negative_letters)
    entered_letters = set()
    for word in entered_words:
        entered_letters.update(re.findall('[a-z]', word))
    remaining_letters = (ALPHABET & set(let_freq.keys())) - entered_letters - negatives
    for i, word in enumerate(entered_words):
        word_list[i] = filter_wordlist(word, remaining_letters, word_list[i])
    print('Matches found:', '\n'.join(multi_word(word_list, 10)), sep='\n')
    print_likely_chars(remaining_letters, let_freq)
    return entered_words, word_list


if __name__ == "__main__":
    # word list source: https://github.com/dwyl/english-words
    words = load_words('words.txt')
    FREQ = generate_letter_frequency(words)
    print_likely_chars(ALPHABET, FREQ)
    last = None
    while True:
        try:
            last, words = iterate(words, FREQ, last)
        except KeyboardInterrupt:
            break