You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

112 lines
4.1 KiB

7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
  1. import codecs
  2. import hashlib
  3. import json
  4. import os
  5. import pickle
  6. import re
  7. from string import ascii_lowercase as alphabet
  8. import sys
  # Choose the BLAKE2 variant by platform word size: sys.maxsize exceeds
  # 2**32 only on 64-bit builds. BLAKE2b is the 64-bit-oriented variant,
  # BLAKE2s the one aimed at smaller word sizes.
  if sys.maxsize > 2**32:
      HASH_FUNC = hashlib.blake2b
  else:
      HASH_FUNC = hashlib.blake2s
  def load_words(filename):
      """Read a one-word-per-line file and return the words as a set.

      Each line is stripped of surrounding whitespace and lower-cased.
      Lines that are empty after stripping are skipped, so blank lines,
      stray spaces and trailing carriage returns never become "words".

      Args:
          filename: Path of the word-list file, opened in text mode with
              the platform default encoding.

      Returns:
          A set of lower-cased, non-empty words.
      """
      with open(filename) as file:
          stripped = (line.strip() for line in file)
          return {word.lower() for word in stripped if word}
  def _get_wordlist_hash(word_list_s):
      """Return the raw HASH_FUNC digest of a collection of words.

      The words are hashed in sorted order, so the digest does not depend
      on the iteration order of the input. Every word is followed by a
      newline delimiter; without it, lists such as ['ab', 'c'] and
      ['a', 'bc'] would feed identical bytes to the hash and collide.

      Args:
          word_list_s: Iterable of str words.

      Returns:
          The digest as bytes.
      """
      digest = HASH_FUNC()
      for word in sorted(word_list_s):
          digest.update(word.encode())
          # Delimit entries so word boundaries are part of the hashed data.
          digest.update(b'\n')
      return digest.digest()
  def hash_wordlist(word_list,raw = False):
      """Return a hash identifying the contents of a word list.

      Args:
          word_list: Iterable of str words. Order is irrelevant because
              _get_wordlist_hash sorts the words before hashing.
          raw: If true, return the digest bytes unchanged.

      Returns:
          The raw digest (bytes) when ``raw`` is true. Otherwise a str
          holding the base64 form of the digest made safe for file names:
          '+' becomes '-', '/' becomes '_', and the line breaks inserted
          by the base64 codec are removed.
      """
      fhash = _get_wordlist_hash(word_list)
      if raw:
          return fhash
      encoded = codecs.encode(fhash,'base64').decode()
      # One pass: map '+' -> '-', '/' -> '_' and delete newlines.
      return encoded.translate(str.maketrans('+/','-_','\n'))
  def load_freq_cache(word_list):
      """Load the cached letter-frequency table for a word list, if any.

      The cache file lives in the '__hangcache__' directory and is named
      after hash_wordlist(word_list) with a '.pkl' suffix.

      Args:
          word_list: Iterable of str words identifying the cache entry.

      Returns:
          The unpickled object, or None when no cache file exists or the
          file is truncated or not a valid pickle.
      """
      fname = os.path.join('__hangcache__',hash_wordlist(word_list) + '.pkl')
      try:
          with open(fname,'rb') as file:
              # NOTE(security): pickle can execute arbitrary code while
              # loading. This is only safe as long as the cache directory
              # is written by this program alone.
              return pickle.load(file)
      except FileNotFoundError:
          return None
      except (pickle.UnpicklingError,EOFError):
          # A damaged cache is treated as a miss so the caller recomputes.
          return None
  def save_freq_cache(word_list,freq):
      """Pickle a letter-frequency table into the on-disk cache.

      The file is written to the '__hangcache__' directory (created when
      missing) and named after hash_wordlist(word_list) with a '.pkl'
      suffix.

      Args:
          word_list: Iterable of str words identifying the cache entry.
          freq: The object to store.
      """
      # exist_ok avoids the check-then-create race of exists() + mkdir().
      os.makedirs('__hangcache__',exist_ok = True)
      fname = os.path.join('__hangcache__',hash_wordlist(word_list) + '.pkl')
      with open(fname,'wb') as file:
          pickle.dump(freq,file)
  def generate_letter_frequency(word_list):
      """Compute (or load from cache) letter statistics for a word list.

      A cached table from load_freq_cache is returned unchanged when one
      exists; otherwise the table is computed and stored with
      save_freq_cache.

      Args:
          word_list: Iterable of str words.

      Returns:
          A dict mapping each character that occurs in any word to a
          two-item list ``[occurrences, per_word]``: ``occurrences`` is the
          total number of times the character appears, and ``per_word`` is
          the share of a word's characters it makes up, averaged over all
          words (words without the character contribute 0).
      """
      cached = load_freq_cache(word_list)
      if cached is not None:
          return cached
      ret = {}
      word_count = 0
      for word in word_list:
          word_count += 1
          letter_counts = {}
          for letter in word:
              letter_counts[letter] = letter_counts.get(letter,0) + 1
          for letter,count in letter_counts.items():
              entry = ret.setdefault(letter,[0,0])
              entry[0] += count
              # Accumulate the share for now; it is averaged below. An
              # incremental average would have to be updated for every
              # word, including those that lack the letter.
              entry[1] += count/len(word)
      for entry in ret.values():
          # ret is non-empty only if at least one word was seen.
          entry[1] /= word_count
      save_freq_cache(word_list,ret)
      return ret
  # Prompt for the partially revealed word; '.' marks an unknown position.
  PROMPT = "Enter word with '.' to represent missing letters: "
  # Prompt for letters already ruled out.
  NEG_PROMPT = 'Enter letters which are confirmed not to occur: '
  # The lowercase ASCII letters, as a set for fast membership tests.
  ALPHABET = set(alphabet)
  def shorten(chars,max_length):
      """Lay out items in columns using at most ``max_length`` rows.

      Items are dealt out row by row in turn (item ``i`` goes to row
      ``i % max_length``), and items sharing a row are separated by four
      spaces. Rows that would be empty are omitted, so fewer items than
      rows does not produce blank trailing lines.

      Args:
          chars: Iterable of str items to lay out.
          max_length: Maximum number of rows; must be at least 1.

      Returns:
          The rows joined with newlines ('' when there are no items).

      Raises:
          ValueError: If ``max_length`` is less than 1.
      """
      if max_length < 1:
          raise ValueError('max_length must be at least 1')
      items = list(chars)
      row_count = min(max_length,len(items))
      # A stride slice picks exactly the items that belong to one row.
      rows = (' ' * 4).join(items[start::max_length]) if False else None
      rows = [(' ' * 4).join(items[start::max_length]) for start in range(row_count)]
      return '\n'.join(rows)
  def print_likely_chars(remaining_letters,let_freq):
      """Print the candidate letters ranked by both frequency measures.

      Two listings are printed, each laid out by shorten() in five rows:
      one ordered by item 0 of the letter's ``let_freq`` entry (overall
      frequency) and one by item 1 (per-word frequency), highest first.
      Letters missing from ``let_freq`` are ranked as frequency 0 instead
      of raising KeyError, and ties are broken alphabetically so the
      output is deterministic.

      Args:
          remaining_letters: Iterable of candidate letters.
          let_freq: Mapping of letter to a two-item sequence of
              frequencies.
      """
      def ranked(index):
          # Negating the frequency gives descending order while the letter
          # itself keeps ties in ascending alphabetical order.
          return sorted(remaining_letters,key = lambda letter: (-let_freq.get(letter,(0,0))[index],letter))
      overall = shorten(ranked(0),5)
      per_word = shorten(ranked(1),5)
      print('Good candidates by overall frequency:\n' + overall)
      print('Good candidates by per-word frequency:\n' + per_word)
  # Letters the user has ruled out so far; accumulated across iterate() calls.
  negatives = set()


  def iterate(word_list,let_freq):
      """Run one interactive round: read the pattern, list matching words.

      Prompts for the partially known word ('.' marks an unknown position)
      and then for letters known not to occur, which are added to the
      module-level ``negatives`` set. Input is lower-cased and spaces in
      the pattern are ignored. An unknown position may be any letter that
      is in both ``let_freq`` and ALPHABET and is neither already present
      in the pattern nor ruled out. Up to ten matching words are printed,
      followed by the ranked candidate letters.

      Args:
          word_list: Iterable of candidate words.
          let_freq: Letter-frequency mapping as used by print_likely_chars.

      Returns:
          A tuple ``(entered_word, remaining_possibilities)``: the
          normalised pattern and the list of words from ``word_list`` that
          match it.
      """
      entered_word = input(PROMPT).replace(' ','').lower()
      negatives.update(re.findall('[a-z]',input(NEG_PROMPT).lower()))
      entered_letters = set(entered_word.replace('.',''))
      remaining_letters = {letter for letter in let_freq if letter in ALPHABET} - entered_letters - negatives
      if remaining_letters:
          wildcard = '[{}]'.format(''.join(sorted(remaining_letters)))
      else:
          # An empty character class '[]' is a regex syntax error; use a
          # sub-pattern that can never match instead.
          wildcard = '(?!)'
      # Escape everything except '.' so typed characters are taken literally.
      pattern = re.compile(''.join(wildcard if char == '.' else re.escape(char) for char in entered_word))
      remaining_possibilities = [word for word in word_list if pattern.fullmatch(word)]
      print('Matches found:\n' + '\n'.join(remaining_possibilities[:10]))
      print_likely_chars(remaining_letters,let_freq)
      return entered_word,remaining_possibilities
  if __name__ == "__main__":
      # Load the dictionary, show the initial letter ranking, then keep
      # narrowing the candidate words until the user interrupts.
      words = load_words('words.txt')
      FREQ = generate_letter_frequency(words)
      print_likely_chars(ALPHABET,FREQ)
      while True:
          try:
              last,words = iterate(words,FREQ)
          except (KeyboardInterrupt,EOFError):
              # Ctrl-C or end of input (e.g. Ctrl-D) ends the session
              # without a traceback.
              break