Friday, July 29, 2016

Spelling Mistakes

Experiment


Using the same design as before, I updated the tests to check spelling mistakes. In the transposition test two adjacent letters are swapped and the result is looked up; in the deletion test one letter is removed before the lookup. The data shows that the design is not good enough to handle spelling mistakes: even at the best batch sizes it recovers about 83% of transpositions (6415 / 7673) but only about 15% of deletions (1279 / 8672). I did not train on the misspelled forms, as I want a design that naturally accommodates spelling mistakes.
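For example, for an illustrative word such as "cat", the test generators in the source code below produce:

   word_to_test_words_deletion("cat")       # ['at', 'ct', 'ca']
   word_to_test_words_transposition("cat")  # ['act', 'cta']

A variant only counts as right if the model maps it back to "cat" itself.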

Lesson


The design needs to be different to handle spelling mistakes.

Data


   Train once
   Batch size 100
                                     transposition 6406 / 7673
                                             exact 993 / 999
                                          deletion 1279 / 8672
   Batch size 200
                                     transposition 6407 / 7673
                                             exact 995 / 999
                                          deletion 1279 / 8672
   Batch size 300
                                     transposition 975 / 7673
                                             exact 154 / 999
                                          deletion 174 / 8672
   Batch size 400
                                     transposition 2449 / 7673
                                             exact 411 / 999
                                          deletion 477 / 8672
   Batch size 500
                                     transposition 6415 / 7673
                                             exact 999 / 999
                                          deletion 1258 / 8672
   Batch size 600
                                     transposition 4800 / 7673
                                             exact 841 / 999
                                          deletion 1011 / 8672
   Batch size 700
                                     transposition 2451 / 7673
                                             exact 369 / 999
                                          deletion 437 / 8672
   Batch size 800
                                     transposition 1402 / 7673
                                             exact 200 / 999
                                          deletion 275 / 8672
   Batch size 900
                                     transposition 696 / 7673
                                             exact 99 / 999
                                          deletion 157 / 8672
   Batch size 1000
                                     transposition 6384 / 7673
                                             exact 999 / 999
                                          deletion 1234 / 8672

Source Code


import tensorflow as tf
import numpy as np

"""
For each position there is a set of 26 letters. Output is one hot vector of words
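Example: "ab" sets input 0 (position 0, letter 'a') and input 27
(position 1, letter 'b') to 1; all other inputs stay 0.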
"""

import sys
words_file = "words.txt"
if len(sys.argv) > 1:
  words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)

words_txt = open(words_file, "r")
words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()

number_of_positions = 25
number_of_letters = 26
number_of_inputs = number_of_positions * number_of_letters
number_of_words = len(words)
number_of_outputs = len(words)

# set the one-hot input bits for a word: position*26 + (letter - 'a');
# assumes lowercase a-z and words of at most 25 letters
def word_to_train(word, inputs):
  for index, ch in enumerate(word):
    inputs[index*number_of_letters + ord(ch) - ord('a')] = 1

# run one training pass; each batch's size comes from get_batch_size(batch_number)
def train(sess, get_batch_size, train_step, x, y_):
  batch_number = 0
  batch_size = get_batch_size(batch_number)

  x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)
  y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)

  index = 0
  for word_number, word in enumerate(words):
    word_to_train(word, x_array[index])
    y_array[index][word_number] = 1
    index += 1
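    # flush when the batch is full or the word list is exhausted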
    if index == batch_size or word_number+1 == number_of_words:
      batch_number += 1
      batch_size = get_batch_size(batch_number)

      index = 0
      sess.run(train_step, feed_dict={x:x_array, y_: y_array})
      to_do = number_of_words - word_number - 1
      x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
      y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)

def train_twice(sess, get_batch_size, train_step, x, y_):
  train(sess, get_batch_size, train_step, x, y_)
  train(sess, get_batch_size, train_step, x, y_)

def run_test_one_word(sess, x, y, word):
  x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
  word_to_train(word, x_array[0])
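  # note: tf.argmax builds a new op in the graph on every call here;
  # constructing it once outside the test loop would avoid graph growth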
  prediction = tf.argmax(y,1)
  prediction = sess.run(prediction, feed_dict={x: x_array})
  return prediction[0]

# combined generator; run_test below uses the three specific generators instead
def word_to_test_words(word, include_original=True, include_remove=True, include_transpose=True):
  words = []

  # original word
  if include_original:
    words.append(word)

  # remove a character
  if include_remove:
    for i in range(0, len(word)):
      words.append(word[:i] + word[i+1:])

  # transpose adjacent characters
  if include_transpose and len(word) > 1:
    for i in range(0, len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append(before + letter2 + letter1 + after)

  return words

def word_to_test_words_exact(word):
  return [word]

def word_to_test_words_deletion(word):
  words = []
  for i in range(0,len(word)):
    words.append( word[:i] + word[i+1:] )
  return words

def word_to_test_words_transposition(word):
  words = []
  if len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )
  return words

def run_test_for_case(sess, x, y, word_to_test_words_function):
  n_right = 0
  n_checked = 0
 
  for word_number, word in enumerate(words):
    test_words = word_to_test_words_function(word)

    for test_word in test_words:
      prediction = run_test_one_word(sess, x, y, test_word)
      is_right = prediction == word_number

      if show_details:
        if is_right:
          print "Right %s matches %s" % (words[word_number], test_word)
        else:
          print "Wrong %s found %s" % (test_word, words[prediction])

      if is_right:
        n_right += 1

      n_checked += 1
  return (n_right, n_checked)

def run_test(words, get_batch_size, train):
  x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
  y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])

  W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
  b = tf.Variable(tf.zeros([number_of_outputs]))

  y = tf.nn.softmax(tf.matmul(x,W) + b)
  cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
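  # note: log(softmax(...)) computed directly can be numerically unstable;
  # tf.nn.softmax_cross_entropy_with_logits is the stabler formulation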
  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

  sess = tf.InteractiveSession()
  sess.run(tf.initialize_all_variables())

  train(sess, get_batch_size, train_step, x, y_)

  results = {}
  results["exact"] = run_test_for_case(sess, x, y, word_to_test_words_exact)
  results["deletion"] = run_test_for_case(sess, x, y, word_to_test_words_deletion)
  results["transposition"] = run_test_for_case(sess, x, y, word_to_test_words_transposition)
  return results

def run_tests(get_batch_size, train):
  for batch_size in range(100, 1001, 100):
    results = run_test(words, get_batch_size(batch_size), train)
    print "   Batch size %d" % batch_size
    for key in ("transposition", "exact", "deletion"):
      (n_right, n_checked) = results[key]
      print "%50s %d / %d" % (key, n_right, n_checked)

print "Batches in same order as file, non-full batch appears last"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears first"
def get_batch_size(batch_size):
  def get_batch_size2(batch_number):
    if batch_number == 0:
      return number_of_words % batch_size # assumes batch_size does not evenly divide the word count
    else:
      return batch_size
  return get_batch_size2

print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number % 2 == 0:
      return batch_size / 2
    else:
      return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

Friday, July 8, 2016

Training

Experiment


I updated the code to investigate training, after observing that having a small batch last in training gave far worse performance. One thing I tried was training twice; that had no effect at all. I also checked whether having the non-full batch first or last mattered, and the performance was mostly the same. The final thing I checked was a schedule where the batch size alternates between two sizes (see the sketch below). That had much worse performance than a constant batch size.
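To make the three schedules concrete, here is a minimal sketch (my own illustration, not the experiment's code) that reproduces the effective batch sizes train produces for the 999-word file:

def batch_schedule(get_size, n_words=999):
  # list the batch sizes a schedule yields; like train(), the final
  # batch is capped by however many words remain
  sizes = []
  done = 0
  batch_number = 0
  while done < n_words:
    size = min(get_size(batch_number), n_words - done)
    sizes.append(size)
    done += size
    batch_number += 1
  return sizes

# batch_schedule(lambda n: 100)                        -> [100]*9 + [99]
# batch_schedule(lambda n: 99 if n == 0 else 100)      -> [99] + [100]*9
# batch_schedule(lambda n: 50 if n % 2 == 0 else 100)  -> [50,100]*6 + [50,49]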

Lesson

Make all the batches the same size. I will revisit this if I do some kind of incremental learning.
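One simple way to act on this lesson is to drop the leftover words so every batch is full. A minimal sketch, assuming trimming is acceptable (trim_to_full_batches is my name, not from the post):

def trim_to_full_batches(words, batch_size):
  # keep only a whole number of full batches; the remainder is dropped
  usable = (len(words) // batch_size) * batch_size
  return words[:usable]

# e.g. 999 words with batch_size 100 -> 900 words, nine full batches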

Data

python main1_ordering.py words_999.txt
Batches in same order as file, non-full batch appears last
   Train once
   Batch size 100, Accuracy: 993 / 999
   Batch size 200, Accuracy: 995 / 999
   Batch size 300, Accuracy: 154 / 999
   Batch size 400, Accuracy: 411 / 999
   Batch size 500, Accuracy: 999 / 999
   Batch size 600, Accuracy: 841 / 999
   Batch size 700, Accuracy: 369 / 999
   Batch size 800, Accuracy: 200 / 999
   Batch size 900, Accuracy: 99 / 999
   Batch size 1000, Accuracy: 999 / 999
   Train twice
   Batch size 100, Accuracy: 993 / 999
   Batch size 200, Accuracy: 995 / 999
   Batch size 300, Accuracy: 154 / 999
   Batch size 400, Accuracy: 411 / 999
   Batch size 500, Accuracy: 999 / 999
   Batch size 600, Accuracy: 841 / 999
   Batch size 700, Accuracy: 369 / 999
   Batch size 800, Accuracy: 200 / 999
   Batch size 900, Accuracy: 99 / 999
   Batch size 1000, Accuracy: 999 / 999
Batches in same order as file, non-full batch appears first
   Train once
   Batch size 100, Accuracy: 991 / 999
   Batch size 200, Accuracy: 995 / 999
   Batch size 300, Accuracy: 150 / 999
   Batch size 400, Accuracy: 450 / 999
   Batch size 500, Accuracy: 999 / 999
   Batch size 600, Accuracy: 831 / 999
   Batch size 700, Accuracy: 377 / 999
   Batch size 800, Accuracy: 200 / 999
   Batch size 900, Accuracy: 99 / 999
   Batch size 1000, Accuracy: 999 / 999
   Train twice
   Batch size 100, Accuracy: 991 / 999
   Batch size 200, Accuracy: 995 / 999
   Batch size 300, Accuracy: 150 / 999
   Batch size 400, Accuracy: 450 / 999
   Batch size 500, Accuracy: 999 / 999
   Batch size 600, Accuracy: 831 / 999
   Batch size 700, Accuracy: 377 / 999
   Batch size 800, Accuracy: 200 / 999
   Batch size 900, Accuracy: 99 / 999
   Batch size 1000, Accuracy: 999 / 999
Batches in same order as file, non-full batch appears last, batch size alternate by div 2
   Train once
   Batch size 100, Accuracy: 469 / 999
   Batch size 200, Accuracy: 503 / 999
   Batch size 300, Accuracy: 377 / 999
   Batch size 400, Accuracy: 672 / 999
   Batch size 500, Accuracy: 583 / 999
   Batch size 600, Accuracy: 116 / 999
   Batch size 700, Accuracy: 551 / 999
   Batch size 800, Accuracy: 892 / 999
   Batch size 900, Accuracy: 989 / 999
   Batch size 1000, Accuracy: 999 / 999
   Train twice
   Batch size 100, Accuracy: 475 / 999
   Batch size 200, Accuracy: 505 / 999
   Batch size 300, Accuracy: 378 / 999
   Batch size 400, Accuracy: 674 / 999
   Batch size 500, Accuracy: 583 / 999
   Batch size 600, Accuracy: 116 / 999
   Batch size 700, Accuracy: 551 / 999
   Batch size 800, Accuracy: 892 / 999
   Batch size 900, Accuracy: 989 / 999
   Batch size 1000, Accuracy: 999 / 999


Source Code


import tensorflow as tf
import numpy as np

"""
For each position there is a set of 26 letters. Output is one hot vector of words
"""

import sys
words_file = "words.txt"
if len(sys.argv) > 1:
  words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)

words_txt = open(words_file, "r")
words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()

number_of_positions = 25
number_of_letters = 26
number_of_inputs = number_of_positions * number_of_letters
number_of_words = len(words)
number_of_outputs = len(words)

# set the one-hot input bits for a word: position*26 + (letter - 'a');
# assumes lowercase a-z and words of at most 25 letters
def word_to_train(word, inputs):
  for index, ch in enumerate(word):
    inputs[index*number_of_letters + ord(ch) - ord('a')] = 1

# run one training pass; each batch's size comes from get_batch_size(batch_number)
def train(sess, get_batch_size, train_step, x, y_):
  batch_number = 0
  batch_size = get_batch_size(batch_number)

  x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)
  y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)

  index = 0
  for word_number, word in enumerate(words):
    word_to_train(word, x_array[index])
    y_array[index][word_number] = 1
    index += 1
    if index == batch_size or word_number+1 == number_of_words:
      batch_number += 1
      batch_size = get_batch_size(batch_number)

      index = 0
      sess.run(train_step, feed_dict={x:x_array, y_: y_array})
      to_do = number_of_words - word_number - 1
      x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
      y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)

def train_twice(sess, get_batch_size, train_step, x, y_):
  train(sess, get_batch_size, train_step, x, y_)
  train(sess, get_batch_size, train_step, x, y_)

def run_test(words, get_batch_size, train):
  x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
  y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])

  W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
  b = tf.Variable(tf.zeros([number_of_outputs]))

  y = tf.nn.softmax(tf.matmul(x,W) + b)
  cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

  sess = tf.InteractiveSession()
  sess.run(tf.initialize_all_variables())

  train(sess, get_batch_size, train_step, x, y_)

  n_right = 0
  for word_number, word in enumerate(words):
    x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
    word_to_train(word, x_array[0])
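    # note: tf.argmax builds a new op in the graph on every call here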
    prediction = tf.argmax(y,1)
    prediction = sess.run(prediction, feed_dict={x: x_array})
    is_right = (prediction[0] == word_number)
    if show_details:
      if is_right:
        print "Right %s" % words[word_number]
      else:
        print "Wrong %s found %s" % (words[word_number], words[prediction[0]])
    if is_right:
      n_right += 1
  return n_right


def run_tests(get_batch_size, train):
  for batch_size in range(100, 1001, 100):
    n_right = run_test(words, get_batch_size(batch_size), train)
    print "   Batch size %d, Accuracy: %d / %d" % (batch_size, n_right, len(words))


print "Batches in same order as file, non-full batch appears last"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears first"
def get_batch_size(batch_size):
  def get_batch_size2(batch_number):
    if batch_number == 0:
      return number_of_words % batch_size
    else:
      return batch_size
  return get_batch_size2

print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number % 2 == 0:
      return batch_size / 2
    else:
      return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)