You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

186 lines
6.5 KiB

7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
  1. import codecs
  2. import hashlib
  3. import json
  4. import os
  5. import multiprocessing
  6. import pickle
  7. import re
  8. from string import ascii_lowercase as alphabet
  9. import sys
  10. #32 or 64 bit platform?
  11. if sys.maxsize > 2**32:
  12. HASH_FUNC = hashlib.blake2b
  13. else:
  14. HASH_FUNC = hashlib.blake2s
  15. def load_words(filename):
  16. with open(filename) as file:
  17. text = file.read()
  18. return set(map(str.lower,filter(bool,text.split('\n'))))
  19. def _get_wordlist_hash(word_list_s):
  20. _hash = HASH_FUNC()
  21. for word in sorted(word_list_s):
  22. word_bytes = word.encode()
  23. _hash.update(word_bytes)
  24. return _hash.digest()
  25. def hash_wordlist(word_list,raw = False):
  26. word_list = sorted(word_list)
  27. fhash = _get_wordlist_hash(word_list)
  28. if raw:
  29. return fhash
  30. illegal_hash = codecs.encode(fhash,'base64').decode()
  31. replacements = {'+':'-','/':'_',None: ''}
  32. return re.sub(r'(\+|\/)|\n',lambda match: replacements[match.group(1)],illegal_hash)
  33. def load_freq_cache(word_list):
  34. fname = hash_wordlist(word_list) + '.pkl'
  35. fname = os.path.join('__hangcache__',fname)
  36. if os.path.exists(fname):
  37. with open(fname,'rb') as file:
  38. return pickle.load(file)
  39. def save_freq_cache(word_list,freq):
  40. if not os.path.exists('__hangcache__'):
  41. os.mkdir('__hangcache__')
  42. fname = hash_wordlist(word_list) + '.pkl'
  43. fname = os.path.join('__hangcache__',fname)
  44. with open(fname,'wb') as file:
  45. pickle.dump(freq,file)
  46. def generate_letter_frequency(word_list):
  47. cached = load_freq_cache(word_list)
  48. if cached is not None:
  49. return cached
  50. ret = {}
  51. for word_num,word in enumerate(word_list):
  52. letter_counts = {}
  53. for i,letter in enumerate(word):
  54. try:
  55. ret[letter][0] += 1
  56. except KeyError:
  57. ret[letter] = [1,0]
  58. in_word = letter_counts.get(letter,0) + 1
  59. letter_counts[letter] = in_word
  60. for letter,count in letter_counts.items():
  61. word_portion = count/len(word)
  62. avg = (ret[letter][1] * word_num) + word_portion
  63. avg /= word_num + 1
  64. ret[letter][1] = avg
  65. if cached is None:
  66. save_freq_cache(word_list,ret)
  67. return ret
  68. class bool_regex:
  69. def __init__(self,expr):
  70. self.expr = expr
  71. def __call__(self,arg):
  72. return bool(self.expr.match(arg))
  73. def filter_wordlist(input,remaining_letters,word_list,mp=True):
  74. regex = re.compile(input.replace('.','[{}]'.format(''.join(remaining_letters))) + '$')
  75. if mp:
  76. regex = bool_regex(regex)
  77. pool = multiprocessing.Pool()
  78. matches = pool.map(regex,word_list)
  79. pool.close()
  80. pool.join()
  81. else:
  82. matches = map(regex.match,word_list)
  83. remaining_words = (group[1] for group in filter(lambda group: group[0],zip(matches,word_list)))
  84. return list(remaining_words)
  85. PROMPT = "Enter word with '.' to represent missing letters ('/' to separate multiple words): "
  86. NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
  87. ALPHABET = set(letter for letter in alphabet)
  88. def shorten(chars,max_length):
  89. rows = [''] * max_length
  90. for i,char in enumerate(chars):
  91. row_num = i%max_length
  92. addition = char + ' ' * 4
  93. rows[row_num] += addition
  94. return '\n'.join(map(str.rstrip,rows))
  95. def multi_word(l_words,n = 10):
  96. # breakpoint()
  97. rows = [''] * (n+1)
  98. first = True
  99. for count,words in enumerate(l_words):
  100. offset = max(map(len,rows))
  101. working_set = words[:min(len(words),n)]
  102. working_set.insert(0,str(count+1))
  103. for i,word in enumerate(working_set):
  104. prev_line = rows[i]
  105. if len(prev_line) < offset:
  106. prev_line += ' '*(offset-len(prev_line))
  107. rows[i] = prev_line+(' '*4 if not first else '' )+word
  108. first = False
  109. return filter(bool,map(str.rstrip,rows))
  110. def print_likely_chars(remaining_letters,let_freq):
  111. overall = shorten(sorted(remaining_letters,key = lambda letter: let_freq[letter][0],reverse = True),5)
  112. per_word = shorten(sorted(remaining_letters,key = lambda letter: let_freq[letter][1],reverse = True),5)
  113. print( 'Good candidates by overall frequency:', overall, sep = '\n')
  114. print('Good candidates by per-word frequency:', per_word, sep = '\n')
  115. # ensures that new expression could come from previous entry
  116. def check(prev,new,remaining_letters):
  117. prev = '/'.join(prev)
  118. new = '/'.join(new)
  119. if len(prev) == len(new):
  120. good = set(re.findall('[a-z]',prev)) <= remaining_letters
  121. for i in range(len(prev)):
  122. p_cur = prev[i]
  123. n_cur = new[i]
  124. if p_cur == '/':
  125. good = p_cur == n_cur
  126. elif p_cur == '.':
  127. continue
  128. else:
  129. good == p_cur == n_cur
  130. if not good:
  131. return False
  132. return good
  133. else:
  134. return False
  135. negatives = set()
  136. def iterate(word_list,let_freq,prev_word = None):
  137. if prev_word is None:
  138. entered_words = re.sub(r'[^a-z\./]','',input(PROMPT)).split('/')
  139. else:
  140. valid = False
  141. while not valid:
  142. entered_words = re.sub(r'[^a-z\./]','',input(PROMPT)).split('/')
  143. valid = check(prev_word,entered_words,ALPHABET-negatives)
  144. try:
  145. word_list[0][0]
  146. except:
  147. word_list = [word_list] * len(entered_words)
  148. negative_letters = re.findall('[a-z]',input(NEG_PROMPT))
  149. negatives.update(negative_letters)
  150. output = []
  151. entered_letters = set()
  152. for word in entered_words:
  153. entered_letters.update(re.findall('[a-z]',word))
  154. remaining_letters = (ALPHABET & set(let_freq.keys())) - entered_letters - negatives
  155. for i,word in enumerate(entered_words):
  156. remaining_possibilities = filter_wordlist(word,remaining_letters,word_list[i],mp=True)
  157. word_list[i] = remaining_possibilities
  158. print('Matches found:', '\n'.join(multi_word(word_list,10)),sep='\n')
  159. print_likely_chars(remaining_letters,let_freq)
  160. return entered_words,word_list
  161. if __name__ == "__main__":
  162. #src: https://github.com/dwyl/english-words
  163. words = load_words('words.txt')
  164. FREQ = generate_letter_frequency(words)
  165. print_likely_chars(ALPHABET,FREQ)
  166. last = None
  167. while True:
  168. try:
  169. last,words = iterate(words,FREQ,last)
  170. except KeyboardInterrupt:
  171. break