Experiment
Here are some examples of errors in the lookup:
Wrong absorptive found advertisements was absorptive
Wrong abstain found absentmindedness was abstain
Wrong abstained found absentmindedness was abstained
Wrong abstainer found absentmindedness was abstainer
Wrong abstainers found absentmindedness was abstainers
Wrong abstaining found administrating was abstaining
Wrong abstains found absentmindedness was abstains
Wrong abstemious found abstemiousness was abstemious
Wrong abstemiously found abstemiousness was abstemiously
Wrong abstention found absentmindedness was abstention
Wrong abstentions found absentmindedness was abstentions
Wrong abstinence found absentmindedness was abstinence
Wrong abstinent found absentmindedness was abstinent
Wrong abstract found abstracts was abstract
Wrong abstracted found accelerates was abstracted
Wrong abstractedly found aesthetically was abstractedly
I think it was wrong to use a raw count as the input to the node that represented how many times a letter appeared in the word. I am going to change that so there is a node for each count of a letter, with buckets for 0, 1, 2, and more, as sketched below.
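A minimal sketch of the two encodings, assuming a plain Python list for the letter-count section (the helper names are mine and do not appear in the code below):

# old: one node per letter whose value is the raw count
def encode_counts_old(word):
    counts = [0.0] * 26
    for ch in word:
        counts[ord(ch) - ord('a')] += 1
    return counts

# new: four nodes per letter, one-hot over the count buckets 0, 1, 2, more
def encode_counts_new(word):
    nodes = [0.0] * (26 * 4)
    for letter_index in range(26):
        count = word.count(chr(ord('a') + letter_index))
        nodes[letter_index * 4 + min(count, 3)] = 1.0
    return nodes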
Lesson
The new design handles spelling mistakes far better without any training on spelling mistakes. The design should get every word right in the exact-match case, so some refinement is still needed. In addition, I need to get this running on a video card so I can see the effectiveness with the full range of words, since in the last experiment the precision on a small amount of input was out of line with the precision on the larger amount of input.
Data
Train once
Batch size 100
transposition 7644 / 7673
exact 993 / 999
deletion 7139 / 8672
Code
import tensorflow as tf
import numpy as np
"""
For each word there is a length node, a first letter node, and for each letter in the word there is a node with input of 1 * number of letters
"""
import sys
words_file = "words.txt"
if len(sys.argv) > 1:
    words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)
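# usage: python <this script> [words_file] [show_details]
# any argument after the words file turns on per-word mismatch output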
words_txt = open(words_file, "r")
words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()
number_of_letters = 26
number_of_length_nodes = 30
number_of_first_letter_nodes = number_of_letters
number_of_counts_for_letter_nodes = 4
number_of_letters_nodes = number_of_letters * number_of_counts_for_letter_nodes
number_of_positions = 25
size_of_sections = [
    number_of_length_nodes,
    number_of_first_letter_nodes,
    number_of_letters_nodes
]
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
    last_position += size
    position_of_sections.append(last_position)
number_of_inputs = sum(size_of_sections)
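# total inputs: 30 length + 26 first-letter + 26*4 letter-count nodes = 160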
number_of_words = len(words)
number_of_outputs = len(words)
# set up the input nodes for a word
def word_to_train(word, inputs):
    # section 1 - length of word, with partial activation of neighboring lengths
    length = len(word)
    inputs[position_of_sections[0]+length] = 1
    if length > 1:
        inputs[position_of_sections[0]+length-1] = 0.5
    if length > 2:
        inputs[position_of_sections[0]+length-2] = 0.25
    if length+1 < number_of_length_nodes:
        inputs[position_of_sections[0]+length+1] = 0.5
    if length+2 < number_of_length_nodes:
        inputs[position_of_sections[0]+length+2] = 0.25
    # section 2 - first letter of word
    inputs[position_of_sections[1] + ord(word[0]) - ord('a')] = 1
    # section 3 - number of times each letter appears in the word (0, 1, 2, more)
    counts = [0] * number_of_letters
    for ch in word:
        counts[ord(ch) - ord('a')] += 1
    for letter_index, count in enumerate(counts):
        # bucket the count: 0, 1, 2, or "more" in the last node
        count_index = min(count, number_of_counts_for_letter_nodes - 1)
        inputs[position_of_sections[2] + letter_index*number_of_counts_for_letter_nodes + count_index] = 1
# train over all the words; get_batch_size(batch_number) gives the size of each batch
def train(sess, get_batch_size, train_step, x, y_):
    batch_number = 0
    batch_size = get_batch_size(batch_number)
    x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)
    y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)
    index = 0
    for word_number, word in enumerate(words):
        word_to_train(word, x_array[index])
        y_array[index][word_number] = 1
        index += 1
        if index == batch_size or word_number+1 == number_of_words:
            # batch is full (or we are out of words) - run one training step
            batch_number += 1
            batch_size = get_batch_size(batch_number)
            index = 0
            sess.run(train_step, feed_dict={x: x_array, y_: y_array})
            to_do = number_of_words - word_number - 1
            x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
            y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)
def train_twice(sess, get_batch_size, train_step, x, y_):
    train(sess, get_batch_size, train_step, x, y_)
    train(sess, get_batch_size, train_step, x, y_)
def run_test_one_word(sess, x, y, word):
    x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
    word_to_train(word, x_array[0])
    # note: this adds a fresh argmax op to the graph on every call
    prediction = tf.argmax(y, 1)
    prediction = sess.run(prediction, feed_dict={x: x_array})
    return prediction[0]
def word_to_test_words(word, include_original=True, include_remove=True, include_transpose=True):
    words = []
    if include_original:
        words.append(word)
    if include_remove:
        # remove a character
        for i in range(0, len(word)):
            words.append(word[:i] + word[i+1:])
    if include_transpose and len(word) > 1:
        # transpose adjacent characters
        for i in range(0, len(word)-1):
            before = word[:i]
            letter1 = word[i]
            letter2 = word[i+1]
            after = word[i+2:]
            words.append(before + letter2 + letter1 + after)
    return words
def word_to_train_words(word):
    return [word]
def word_to_test_words_exact(word):
    return [word]
def word_to_test_words_deletion(word):
    words = []
    for i in range(0, len(word)):
        words.append(word[:i] + word[i+1:])
    return words
def word_to_test_words_transposition(word):
    words = []
    if len(word) > 1:
        for i in range(0, len(word)-1):
            before = word[:i]
            letter1 = word[i]
            letter2 = word[i+1]
            after = word[i+2:]
            words.append(before + letter2 + letter1 + after)
    return words
def run_test_for_case(sess, x, y, word_to_test_words_function):
    n_right = 0
    n_checked = 0
    for word_number, word in enumerate(words):
        test_words = word_to_test_words_function(word)
        for test_word in test_words:
            prediction = run_test_one_word(sess, x, y, test_word)
            is_right = prediction == word_number
            if show_details and not is_right:
                print "Wrong %s found %s was %s" % (test_word, words[prediction], word)
            if is_right:
                n_right += 1
            n_checked += 1
    return (n_right, n_checked)
def run_test(words, get_batch_size, train):
    x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
    y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])
    W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
    b = tf.Variable(tf.zeros([number_of_outputs]))
    y = tf.nn.softmax(tf.matmul(x, W) + b)
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    train(sess, get_batch_size, train_step, x, y_)
    results = {}
    results["exact"] = run_test_for_case(sess, x, y, word_to_test_words_exact)
    results["deletion"] = run_test_for_case(sess, x, y, word_to_test_words_deletion)
    results["transposition"] = run_test_for_case(sess, x, y, word_to_test_words_transposition)
    return results
def run_tests(get_batch_size, train):
    for batch_size in range(100, 1001, 100):
        results = run_test(words, get_batch_size(batch_size), train)
        print " Batch size %d" % batch_size
        for key in results.keys():
            (n_right, n_checked) = results[key]
            print "%50s %d / %d" % (key, n_right, n_checked)
print "Batches in same order as file, non-full batch appears last"
def get_batch_size(batch_size):
    def get_batch_size_2(batch_number):
        return batch_size
    return get_batch_size_2
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)
print "Batches in same order as file, non-full batch appears first"
def get_batch_size(batch_size):
    def get_batch_size_2(batch_number):
        if batch_number == 0:
            # first batch takes the remainder; fall back to a full batch when
            # the word count divides evenly, to avoid an empty first batch
            return (number_of_words % batch_size) or batch_size
        else:
            return batch_size
    return get_batch_size_2
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)
print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"
def get_batch_size(batch_size):
    def get_batch_size_2(batch_number):
        if batch_number % 2 == 0:
            return batch_size / 2
        else:
            return batch_size
    return get_batch_size_2
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)