Monday, August 8, 2016

New Design for Spelling Mistakes - Try 1

Experiment


I updated the design of the neural net to handle spelling mistakes better. The input is now split into three sections. In the word-length section, each node represents a possible word length, and the node matching the word's length is hot. In the first-letter section, there is one node per letter, and the node matching the word's first letter is hot. In the letters section, there is one node per letter, and each node's input is the number of times that letter appears in the word.
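
Below is a minimal sketch of the encoding for a single word, assuming the same fixed layout the full script uses (30 length nodes, then 26 first-letter nodes, then 26 letter-count nodes). The full word_to_train below also spreads 0.5 and 0.25 activations to the neighboring length nodes; that part is omitted here for brevity.

import numpy as np

LENGTH_NODES, LETTERS = 30, 26

def encode(word):
  # input layout: [length section | first-letter section | letter-count section]
  inputs = np.zeros(LENGTH_NODES + 2 * LETTERS)
  inputs[len(word)] = 1                                       # length node is hot
  inputs[LENGTH_NODES + ord(word[0]) - ord('a')] = 1          # first-letter node is hot
  for ch in word:
    inputs[LENGTH_NODES + LETTERS + ord(ch) - ord('a')] += 1  # count each letter
  return inputs

v = encode("letter")  # length node 6 is hot, 'l' is hot, 'e' and 't' each count 2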

Lesson

This looked promising on a small run with nine words, where the results were nearly perfect. When I increased the word list to 999 words it fell apart: accuracy dropped sharply and swung widely with batch size.
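
For reference, the three rows in the tables below count predictions over each word itself (exact), every single-character deletion, and every adjacent-letter swap (transposition), generated the same way as the word_to_test_words_* helpers in the code below. A quick sketch with a hypothetical word:

word = "cat"
deletions = [word[:i] + word[i+1:] for i in range(len(word))]
# ['at', 'ct', 'ca']
transpositions = [word[:i] + word[i+1] + word[i] + word[i+2:]
                  for i in range(len(word) - 1)]
# ['act', 'cta']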

Data

Small Batch Run (9 words)

Batches in same order as file, non-full batch appears last
   Train once
   Batch size 100
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 200
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 300
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 400
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 500
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 600
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 700
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 800
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 900
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 1000
                                     transposition 54 / 55
                                             exact 9 / 9

Large Batch Run (999 words)

Batches in same order as file, non-full batch appears last
   Train once
   Batch size 100
                                     transposition 1989 / 7673
                                             exact 194 / 999
                                          deletion 1882 / 8672
   Batch size 200
                                     transposition 1893 / 7673
                                             exact 181 / 999
                                          deletion 1926 / 8672
   Batch size 300
                                     transposition 450 / 7673
                                             exact 52 / 999
                                          deletion 460 / 8672
   Batch size 400
                                     transposition 704 / 7673
                                             exact 76 / 999
                                          deletion 693 / 8672
   Batch size 500
                                     transposition 1832 / 7673
                                             exact 174 / 999
                                          deletion 1947 / 8672
   Batch size 600
                                     transposition 1051 / 7673
                                             exact 110 / 999
                                          deletion 1049 / 8672
   Batch size 700
                                     transposition 871 / 7673
                                             exact 90 / 999
                                          deletion 894 / 8672
   Batch size 800
                                     transposition 704 / 7673
                                             exact 76 / 999
                                          deletion 693 / 8672
   Batch size 900
                                     transposition 450 / 7673
                                             exact 52 / 999
                                          deletion 460 / 8672

Code
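
The script below reads a newline-separated word list (words.txt by default, or the file named by the first command-line argument); passing any second argument makes it print each wrong prediction. It targets the TensorFlow API of the time (Python 2 print statements, tf.initialize_all_variables).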

import tensorflow as tf
import numpy as np

"""
For each word there is a length node, a first letter node, and for each letter in the word there is a node with input of 1 * number of letters
"""

import sys
words_file = "words.txt"
if len(sys.argv) > 1:
  words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)

words_txt = open(words_file, "r")
words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()

number_of_letters = 26
number_of_length_nodes = 30
number_of_first_letter_nodes = number_of_letters
number_of_letters_nodes = number_of_letters
number_of_positions = 25
size_of_sections = [
  number_of_length_nodes,
  number_of_first_letter_nodes,
  number_of_letters_nodes
]
# starting offset of each section within the input vector
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
  last_position += size
  position_of_sections.append(last_position)
number_of_inputs = sum(size_of_sections)

number_of_words = len(words)
number_of_outputs = len(words)

# set up the input nodes for one word (fills one row of the input array)
def word_to_train(word, inputs):
  # section 1: word length, with 0.5/0.25 spread to neighboring lengths
  length = len(word)
  inputs[position_of_sections[0]+length] = 1

  if length > 1:
    inputs[position_of_sections[0]+length-1] = 0.5
  if length > 2:
    inputs[position_of_sections[0]+length-2] = 0.25

  if length+1 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+1] = 0.5
  if length+2 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+2] = 0.25

  # section 2: first letter
  inputs[position_of_sections[1] + ord(word[0]) - ord('a')] = 1

  # section 3: letter counts
  for index, ch in enumerate(word):
    inputs[position_of_sections[2] + ord(ch) - ord('a')] += 1

# train over all the words in batches; get_batch_size(batch_number) gives each batch's size
def train(sess, get_batch_size, train_step, x, y_):
  batch_number = 0
  batch_size = get_batch_size(batch_number)

  x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)
  y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)

  index = 0
  for word_number, word in enumerate(words):
    word_to_train(word, x_array[index])
    y_array[index][word_number] = 1
    index += 1
    if index == batch_size or word_number+1 == number_of_words:
      batch_number += 1
      batch_size = get_batch_size(batch_number)

      index = 0
      sess.run(train_step, feed_dict={x:x_array, y_: y_array})
      to_do = number_of_words - word_number - 1
      x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
      y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)

def train_twice(sess, get_batch_size, train_step, x, y_):
  train(sess, get_batch_size, train_step, x, y_)
  train(sess, get_batch_size, train_step, x, y_)

def run_test_one_word(sess, x, y, word):
  x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
  word_to_train(word, x_array[0])
  # note: this builds a new argmax op on each call; it could be hoisted out of the test loop
  prediction = tf.argmax(y, 1)
  prediction = sess.run(prediction, feed_dict={x: x_array})
  return prediction[0]

def word_to_test_words(word, include_original=True, include_remove=True, include_transpose=True):
  words = []

  # original word
  if include_original:
    words.append(word)

  # remove a character
  if include_remove:
    for i in range(0,len(word)):
      words.append( word[:i] + word[i+1:] )

  # transpose adjacent characters
  if include_transpose and len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )

  return words

def word_to_train_words(word):
  return [word]

def word_to_test_words_exact(word):
  return [word]

def word_to_test_words_deletion(word):
  words = []
  for i in range(0,len(word)):
    words.append( word[:i] + word[i+1:] )
  return words

def word_to_test_words_transposition(word):
  words = []
  if len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )
  return words

def run_test_for_case(sess, x, y, word_to_test_words_function):
  n_right = 0
  n_checked = 0
 
  for word_number, word in enumerate(words):
    test_words = word_to_test_words_function(word)

    for test_word in test_words:
      prediction = run_test_one_word(sess, x, y, test_word)
      is_right = prediction == word_number

      # optionally report each wrong prediction
      if show_details and not is_right:
        print "Wrong %s found %s was %s" % (test_word, words[prediction], word)

      if is_right:
        n_right += 1

      n_checked += 1
  return (n_right, n_checked)

def run_test(words, get_batch_size, train):
  x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
  y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])

  W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
  b = tf.Variable(tf.zeros([number_of_outputs]))

  y = tf.nn.softmax(tf.matmul(x,W) + b)
  cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

  sess = tf.InteractiveSession()
  sess.run(tf.initialize_all_variables())

  train(sess, get_batch_size, train_step, x, y_)

  results = {}
  results["exact"] = run_test_for_case(sess, x, y, word_to_test_words_exact)
  results["deletion"] = run_test_for_case(sess, x, y, word_to_test_words_deletion)
  results["transposition"] = run_test_for_case(sess, x, y, word_to_test_words_transposition)
  return results

def run_tests(get_batch_size, train):
  for batch_size in range(100, 1001, 100):
    results = run_test(words, get_batch_size(batch_size), train)
    print "   Batch size %d" % batch_size
    for key in results.keys():
      (n_right, n_checked) = results[key]
      print "%50s %d / %d" % (key, n_right, n_checked)

print "Batches in same order as file, non-full batch appears last"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears first"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number == 0:
      return number_of_words % batch_size
    else:
      return batch_size
  return get_batch_size_2

print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number % 2 == 0:
      return batch_size / 2
    else:
      return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)
