The model I'm building first selects a secret word at random from a list. The model which uses an API then returns a row of underscores (space separated)—one for each letter in the secret word—and asks the user to guess a letter. If the user guesses a letter that is in the word, the word is redisplayed with all instances of that letter shown in the correct positions, along with any letters correctly guessed on previous turns. If the letter does not appear in the word, the user is charged with an incorrect guess. The user keeps guessing letters until either (1) the user has correctly guessed all the letters in the word or (2) the user has made six incorrect guesses.
I'm working on an algorithm which is permitted to use a training set of approximately 250,000 dictionary words.
I have built and providing here with a basic, working algorithm. This algorithm will match the provided masked string (e.g. a _ _ l e) to all possible words in the dictionary, tabulate the frequency of letters appearing in these possible words, and then guess the letter with the highest frequency of appearence that has not already been guessed. If there are no remaining words that match then it will default back to the character frequency distribution of the entire dictionary.
This benchmark strategy is successful approximately 18% of the time. I aim to design an algorithm that significantly outperforms this benchmark.
import random
import string
import time
import re
import collections
import math
import json
import requests
import numpy as np
from urllib.parse import parse_qs, urlencode, urlparse
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
####This is a part of the larger code and this code is written by the according to the API mode here
try:
from urllib.parse import parse_qs, urlencode, urlparse
except ImportError:
from urlparse import parse_qs, urlparse
from urllib import urlencode
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
######Start of the code up for review
class HangmanAPI(object):
def __init__(self, access_token=None, session=None, timeout=None):
self.hangman_url = self.determine_hangman_url()
self.access_token = access_token
self.session = session or requests.Session()
self.timeout = timeout
self.guessed_letters = []
full_dictionary_location = "words_250000_train.txt"
self.full_dictionary = self.build_dictionary(full_dictionary_location)
self.full_dictionary_common_letter_sorted = collections.Counter("".join(self.full_dictionary)).most_common()
self.current_dictionary = []
# Initialize the decision tree model and vectorizer here
self.decision_tree_model = DecisionTreeClassifier()
self.vectorizer = CountVectorizer(analyzer='char', lowercase=False, binary=True)
self.target_labels = [chr(ord('a') + i) for i in range(26)]
# Fit the decision tree model with the full dictionary once during initialization
X = self.vectorizer.fit_transform(self.full_dictionary)
y = np.array([word[-1] for word in self.full_dictionary])
self.decision_tree_model.fit(X, y)
@staticmethod
def determine_hangman_url():
links = ['https://trexsim.com', 'https://sg.trexsim.com']
data = {link: 0 for link in links}
for link in links:
requests.get(link)
for i in range(10):
s = time.time()
requests.get(link)
data[link] = time.time() - s
link = sorted(data.items(), key=lambda x: x[1])[0][0]
link += '/trexsim/hangman'
return link
def guess(self, word):
# Clean the word so that we strip away the space characters
# Replace "_" with "." as "." indicates any character in regular expressions
clean_word = word[::2].replace("_", ".")
# Find length of the passed word
len_word = len(clean_word)
# Grab current dictionary of possible words from self object, initialize a new possible words dictionary to empty
current_dictionary = self.current_dictionary
new_dictionary = []
# Iterate through all of the words in the old plausible dictionary
for dict_word in current_dictionary:
# Continue if the word is not of the appropriate length
if len(dict_word) != len_word:
continue
# If dictionary word is a possible match, then add it to the current dictionary
if re.match(clean_word, dict_word):
new_dictionary.append(dict_word)
# Overwrite old possible words dictionary with the updated version
self.current_dictionary = new_dictionary
# If there are no remaining words that match, default back to the ordering of the full dictionary
if not new_dictionary:
sorted_letter_count = self.full_dictionary_common_letter_sorted
else:
# Update the current dictionary with the new_dictionary
self.current_dictionary = new_dictionary
# Get the count of each letter at each position in the current dictionary
letter_counts = [{letter: sum(1 for word in self.current_dictionary if word[i] == letter) for letter in self.target_labels}
for i in range(len(clean_word))]
# Choose the character with the highest count at the next position
next_position = len(self.guessed_letters)
guessed_letter = max(letter_counts[next_position], key=letter_counts[next_position].get)
# Remove the guessed letter from the possible letters in current_dictionary
self.current_dictionary = [word for word in self.current_dictionary if guessed_letter not in word]
return guessed_letter
# Return the letter with the highest information gain that hasn't been guessed yet
for letter, info_gain in sorted_letter_count:
if letter not in self.guessed_letters:
return letter
# If all letters have been guessed, revert to ordering of full dictionary (fallback)
return self.full_dictionary_common_letter_sorted[0][0]
def make_decision(self, word_pattern):
# Use decision tree model to make a prediction
X = self.vectorizer.transform(self.current_dictionary)
# Get the target labels (last character of each word) for the decision tree model
y = np.array([word[-1] for word in self.current_dictionary])
self.decision_tree_model.fit(X, y)
# Transform the word pattern using the vectorizer
pattern_vectorized = self.vectorizer.transform([word_pattern])
# Make a prediction for the next letter probabilities
next_letter_probabilities = self.decision_tree_model.predict_proba(pattern_vectorized)[0]
# Get the index of the predicted letter (letter with the highest probability)
next_letter_index = np.argmax(next_letter_probabilities)
# Convert the predicted index back to the letter
guessed_letter = self.target_labels[next_letter_index]
return guessed_letter
def compute_conditional_probabilities(self, word_pattern):
# Count the occurrence of each letter in the possible words
full_dict_string = "".join(self.current_dictionary)
c = collections.Counter(full_dict_string)
# Calculate the total count of letters in the possible words
total_letter_count = sum(c.values())
# Calculate the conditional probabilities of each letter given the word pattern
letter_probabilities = {}
for letter in string.ascii_lowercase:
if letter not in self.guessed_letters:
pattern_with_letter = word_pattern.replace(".", letter)
matching_words_count = sum(1 for word in self.current_dictionary if re.match(pattern_with_letter, word))
conditional_probability = matching_words_count / total_letter_count
# Calculate the information gain using entropy (log2)
information_gain = -conditional_probability * math.log2(conditional_probability) if conditional_probability > 0 else 0
letter_probabilities[letter] = information_gain
return letter_probabilities
def build_dictionary(self, dictionary_file_location):
text_file = open(dictionary_file_location, "r")
full_dictionary = text_file.read().splitlines()
text_file.close()
return full_dictionary
####This is a part of the larger code and this code is written by the according to the API mode here
def start_game(self, practice=True, verbose=True):
# reset guessed letters to empty set and current plausible dictionary to the full dictionary
self.guessed_letters = []
self.current_dictionary = self.full_dictionary
response = self.request("/new_game", {"practice":practice})
if response.get('status')=="approved":
game_id = response.get('game_id')
word = response.get('word')
tries_remains = response.get('tries_remains')
if verbose:
print("Successfully start a new game! Game ID: {0}. # of tries remaining: {1}. Word: {2}.".format(game_id, tries_remains, word))
while tries_remains>0:
# get guessed letter from user code
guess_letter = self.guess(word)
# append guessed letter to guessed letters field in hangman object
self.guessed_letters.append(guess_letter)
if verbose:
print("Guessing letter: {0}".format(guess_letter))
try:
res = self.request("/guess_letter", {"request":"guess_letter", "game_id":game_id, "letter":guess_letter})
except HangmanAPIError:
print('HangmanAPIError exception caught on request.')
continue
except Exception as e:
print('Other exception caught on request.')
raise e
if verbose:
print("Sever response: {0}".format(res))
status = res.get('status')
tries_remains = res.get('tries_remains')
if status=="success":
if verbose:
print("Successfully finished game: {0}".format(game_id))
return True
elif status=="failed":
reason = res.get('reason', '# of tries exceeded!')
if verbose:
print("Failed game: {0}. Because of: {1}".format(game_id, reason))
return False
elif status=="ongoing":
word = res.get('word')
else:
if verbose:
print("Failed to start a new game")
return status=="success"
def my_status(self):
return self.request("/my_status", {})
def request(self, path, args=None, post_args=None, method=None):
if args is None:
args = dict()
if post_args is not None:
method = "POST"
# Add `access_token` to post_args or args if it has not already been included.
if self.access_token:
# If post_args exists, we assume that args either does not exist or it does not need `access_token`.
if post_args and "access_token" not in post_args:
post_args["access_token"] = self.access_token
elif "access_token" not in args:
args["access_token"] = self.access_token
time.sleep(0.2)
num_retry, time_sleep = 50, 2
for it in range(num_retry):
try:
response = self.session.request(
method or "GET",
self.hangman_url + path,
timeout=self.timeout,
params=args,
data=post_args,
verify=False
)
break
except requests.HTTPError as e:
response = json.loads(e.read())
raise HangmanAPIError(response)
except requests.exceptions.SSLError as e:
if it + 1 == num_retry:
raise
time.sleep(time_sleep)
headers = response.headers
if 'json' in headers['content-type']:
result = response.json()
elif "access_token" in parse_qs(response.text):
query_str = parse_qs(response.text)
if "access_token" in query_str:
result = {"access_token": query_str["access_token"][0]}
if "expires" in query_str:
result["expires"] = query_str["expires"][0]
else:
raise HangmanAPIError(response.json())
else:
raise HangmanAPIError('Maintype was not text, or querystring')
if result and isinstance(result, dict) and result.get("error"):
raise HangmanAPIError(result)
return result
class HangmanAPIError(Exception):
def __init__(self, result):
self.result = result
self.code = None
try:
self.type = result["error_code"]
except (KeyError, TypeError):
self.type = ""
try:
self.message = result["error_description"]
except (KeyError, TypeError):
try:
self.message = result["error"]["message"]
self.code = result["error"].get("code")
if not self.type:
self.type = result["error"].get("type", "")
except (KeyError, TypeError):
try:
self.message = result["error_msg"]
except (KeyError, TypeError):
self.message = result
Exception.__init__(self, self.message)
I am trying to implement a unique method which is under the "guess" method.
The dataset to try the training can be found: here
start_game()
, and the exception class? \$\endgroup\$