Experiment
Using the same design as before, I updated the tests to check spelling mistakes. In the transposition test, two adjacent letters are swapped and the misspelled word is looked up. In the deletion test, one letter is removed and the lookup is done on the shortened word. The data shows that the design is not good enough to handle spelling mistakes. I did not train on the spelling mistakes because I want a design that naturally accommodates them.
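To make the two kinds of test concrete, this is roughly how the misspelled variants are generated (the real generator functions are in the source code below; the helper names here are only for illustration):

def transpositions(word):
    # swap each adjacent pair of letters, e.g. "word" -> ["owrd", "wrod", "wodr"]
    return [word[:i] + word[i+1] + word[i] + word[i+2:] for i in range(len(word) - 1)]

def deletions(word):
    # drop each letter in turn, e.g. "word" -> ["ord", "wrd", "wod", "wor"]
    return [word[:i] + word[i+1:] for i in range(len(word))]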
Lesson
The design needs to be different to handle spelling mistakes.
Data
Train once
Batch size    transposition    exact        deletion
100           6406 / 7673      993 / 999    1279 / 8672
200           6407 / 7673      995 / 999    1279 / 8672
300            975 / 7673      154 / 999     174 / 8672
400           2449 / 7673      411 / 999     477 / 8672
500           6415 / 7673      999 / 999    1258 / 8672
600           4800 / 7673      841 / 999    1011 / 8672
700           2451 / 7673      369 / 999     437 / 8672
800           1402 / 7673      200 / 999     275 / 8672
900            696 / 7673       99 / 999     157 / 8672
1000          6384 / 7673      999 / 999    1234 / 8672
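As a sanity check on the denominators (my arithmetic, not stated in the original write-up): each of the 999 words contributes one exact lookup, one deletion variant per letter and one transposition variant per adjacent pair, so the deletion total should exceed the transposition total by exactly the word count, and it does: 8672 - 7673 = 999. In Python terms, roughly:

# assuming `words` is the 999-word list loaded by the script below
exact_total         = len(words)                                      # 999
deletion_total      = sum(len(w) for w in words)                      # 8672 test words
transposition_total = sum(len(w) - 1 for w in words if len(w) > 1)    # 7673 test words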
Source Code
import tensorflow as tf
import numpy as np
"""
For each position there is a set of 26 letters. Output is one hot vector of words
"""
import sys
words_file = "words.txt"
if len(sys.argv) > 1:
words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)
words_txt = open(words_file, "r")
words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()
number_of_positions = 25
number_of_letters = 26
number_of_inputs = number_of_positions * number_of_letters
number_of_words = len(words)
number_of_outputs = len(words)
#import pdb; pdb.set_trace();
# setup input nodes for a word
def word_to_train(word, inputs):
    for index, ch in enumerate(word):
        inputs[index*number_of_letters + ord(ch) - ord('a')] = 1
# train over the word list, batching according to get_batch_size(batch_number)
def train(sess, get_batch_size, train_step, x, y_):
    batch_number = 0
    batch_size = get_batch_size(batch_number)
    x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)
    y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)
    index = 0
    for word_number, word in enumerate(words):
        word_to_train(word, x_array[index])
        y_array[index][word_number] = 1
        index += 1
        if index == batch_size or word_number+1 == number_of_words:
            # batch is full (or the word list is exhausted): run one training step
            batch_number += 1
            batch_size = get_batch_size(batch_number)
            index = 0
            sess.run(train_step, feed_dict={x: x_array, y_: y_array})
            # allocate the next batch, which may be smaller than batch_size
            to_do = number_of_words - word_number - 1
            x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
            y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)
def train_twice(sess, get_batch_size, train_step, x, y_):
    train(sess, get_batch_size, train_step, x, y_)
    train(sess, get_batch_size, train_step, x, y_)
def run_test_one_word(sess, x, y, word):
    x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
    word_to_train(word, x_array[0])
    prediction = tf.argmax(y, 1)
    prediction = sess.run(prediction, feed_dict={x: x_array})
    return prediction[0]
def word_to_test_words(word, include_original=True, include_remove=True, include_transpose=True):
    words = []
    if include_original:
        # original word
        words.append(word)
    if include_remove:
        # remove a character
        for i in range(0, len(word)):
            words.append(word[:i] + word[i+1:])
    if include_transpose and len(word) > 1:
        # transpose adjacent characters
        for i in range(0, len(word)-1):
            before = word[:i]
            letter1 = word[i]
            letter2 = word[i+1]
            after = word[i+2:]
            words.append(before + letter2 + letter1 + after)
    return words
def word_to_test_words_exact(word):
    return [word]
def word_to_test_words_deletion(word):
    words = []
    for i in range(0, len(word)):
        words.append(word[:i] + word[i+1:])
    return words
def word_to_test_words_transposition(word):
    words = []
    if len(word) > 1:
        for i in range(0, len(word)-1):
            before = word[:i]
            letter1 = word[i]
            letter2 = word[i+1]
            after = word[i+2:]
            words.append(before + letter2 + letter1 + after)
    return words
def run_test_for_case(sess, x, y, word_to_test_words_function):
    n_right = 0
    n_checked = 0
    for word_number, word in enumerate(words):
        test_words = word_to_test_words_function(word)
        for test_word in test_words:
            prediction = run_test_one_word(sess, x, y, test_word)
            is_right = prediction == word_number
            if show_details:
                if is_right:
                    print "Right %s matches %s" % (words[word_number], test_word)
                else:
                    print "Wrong %s found %s" % (test_word, words[prediction])
            if is_right:
                n_right += 1
            n_checked += 1
    return (n_right, n_checked)
def run_test(words, get_batch_size, train):
    # single-layer softmax regression from letter-position inputs to a
    # one-hot word output, trained by gradient descent on cross-entropy
    x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
    y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])
    W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
    b = tf.Variable(tf.zeros([number_of_outputs]))
    y = tf.nn.softmax(tf.matmul(x, W) + b)
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    #import pdb; pdb.set_trace();
    train(sess, get_batch_size, train_step, x, y_)
    results = {}
    results["exact"] = run_test_for_case(sess, x, y, word_to_test_words_exact)
    results["deletion"] = run_test_for_case(sess, x, y, word_to_test_words_deletion)
    results["transposition"] = run_test_for_case(sess, x, y, word_to_test_words_transposition)
    return results
def run_tests(get_batch_size, train):
    for batch_size in range(100, 1001, 100):
        results = run_test(words, get_batch_size(batch_size), train)
        print " Batch size %d" % batch_size
        for key in results.keys():
            (n_right, n_checked) = results[key]
            print "%50s %d / %d" % (key, n_right, n_checked)
print "Batches in same order as file, non-full batch appears last"
def get_batch_size(batch_size):
    def get_batch_size_2(batch_number):
        return batch_size
    return get_batch_size_2
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)
print "Batches in same order as file, non-full batch appears first"
def get_batch_size(batch_size):
    def get_batch_size2(batch_number):
        if batch_number == 0:
            return number_of_words % batch_size
        else:
            return batch_size
    return get_batch_size2
print " Train once"
#import pdb; pdb.set_trace();
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)
print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"
def get_batch_size(batch_size):
    def get_batch_size_2(batch_number):
        if batch_number % 2 == 0:
            return batch_size / 2
        else:
            return batch_size
    return get_batch_size_2
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)