Thursday, August 11, 2016

New Design for Spelling Mistakes - Try 2

Experiment


Here are some examples of errors in the lookup with the previous design:

Wrong absorptive found advertisements was absorptive
Wrong abstain found absentmindedness was abstain
Wrong abstained found absentmindedness was abstained
Wrong abstainer found absentmindedness was abstainer
Wrong abstainers found absentmindedness was abstainers
Wrong abstaining found administrating was abstaining
Wrong abstains found absentmindedness was abstains
Wrong abstemious found abstemiousness was abstemious
Wrong abstemiously found abstemiousness was abstemiously
Wrong abstention found absentmindedness was abstention
Wrong abstentions found absentmindedness was abstentions
Wrong abstinence found absentmindedness was abstinence
Wrong abstinent found absentmindedness was abstinent
Wrong abstract found abstracts was abstract
Wrong abstracted found accelerates was abstracted
Wrong abstractedly found aesthetically was abstractedly


I think it was wrong to feed the raw count into a single node representing how many times a letter appears in the word. I am going to change that to a group of nodes per letter, one for each count bucket: 0, 1, 2, and more.
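
As a quick standalone sketch of the new encoding (the function name is just for illustration, not from the script below), each letter gets four nodes and exactly one of them is hot per word:

def letter_count_buckets(word, number_of_letters=26):
  # one group of 4 nodes per letter: [count == 0, count == 1, count == 2, count > 2]
  buckets = [0] * (number_of_letters * 4)
  counts = [0] * number_of_letters
  for ch in word:
    counts[ord(ch) - ord('a')] += 1
  for letter_index, count in enumerate(counts):
    buckets[letter_index * 4 + min(count, 3)] = 1
  return buckets

# 'banana' has three a's, so the "more" bucket for 'a' is hot:
# letter_count_buckets('banana')[0 * 4 + 3] == 1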

Lesson

The new design handles spelling mistakes far better without any training on spelling mistakes. It still needs to get all the words right in the exact case (it is at 993/999), so some refinement is needed. In addition, I need to get this running on a video card so I can measure effectiveness on the full range of words, since in the last experiment the precision with a small amount of input was out of line with the precision on the larger input.

Data


   Train once
   Batch size 100
                                     transposition 7644 / 7673
                                             exact 993 / 999
                                          deletion 7139 / 8672

Code

import tensorflow as tf
import numpy as np

"""

For each word there is a length node, a first letter node, and for each letter in the word there is a node with input of 1 * number of letters
"""

import sys

words_file = "words.txt"
if len(sys.argv) > 1:
  words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)
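# (argv[1] names the word-list file, one word per line; the optional second
# argument just turns on the per-word "Wrong ... found ... was ..." output.)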

words_txt = open(words_file, "r")

words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()

number_of_letters = 26

number_of_length_nodes = 30
number_of_first_letter_nodes = number_of_letters
number_of_counts_for_letter_nodes = 4
number_of_letters_nodes = number_of_letters * number_of_counts_for_letter_nodes
number_of_positions = 25 # not used in this version
size_of_sections = [
  number_of_length_nodes,
  number_of_first_letter_nodes,
  number_of_letters_nodes
]
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
  last_position += size
  position_of_sections.append(last_position)
number_of_inputs = sum(size_of_sections)

number_of_words = len(words)

number_of_outputs = len(words)

#import pdb; pdb.set_trace();


# setup input nodes for a word

def word_to_train(word, inputs):
  # section 1 - length of word
  length = len(word)
  inputs[position_of_sections[0]+length] = 1

  # Partially activate the neighboring lengths (0.5, 0.25) so that a word with
  # a letter deleted, whose length is off by one, still overlaps the trained pattern.
  if length > 1:
    inputs[position_of_sections[0]+length-1] = 0.5
  if length > 2:
    inputs[position_of_sections[0]+length-2] = 0.25

  if length+1 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+1] = 0.5
  if length+2 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+2] = 0.25

  # section 2 - first letter of word

  inputs[position_of_sections[1] + ord(word[0]) - ord('a')] = 1

  # section 3 - number of times letter appears in word (0,1,2,more)

  counts = [0] * number_of_letters
  for index, ch in enumerate(word):
    counts[ord(ch) - ord('a')] += 1

  for letter_index, count in enumerate(counts):

    if count == 0:
      count_index = 0
    elif count == 1:
      count_index = 1
    elif count == 2:
      count_index = 2
    else:
      count_index = 3
    inputs[position_of_sections[2] + letter_index*number_of_counts_for_letter_nodes + count_index] = 1

# train one pass over all words; get_batch_size(batch_number) gives each batch's size

def train(sess, get_batch_size, train_step, x, y_):
  batch_number = 0
  batch_size = get_batch_size(batch_number)

  x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)

  y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)

  index = 0

  for word_number, word in enumerate(words):
    word_to_train(word, x_array[index])
    y_array[index][word_number] = 1
    index += 1
    if index == batch_size or word_number+1 == number_of_words:
      batch_number += 1
      batch_size = get_batch_size(batch_number)

      index = 0

      # train on the completed batch, then size the arrays for the next batch
      sess.run(train_step, feed_dict={x:x_array, y_: y_array})
      to_do = number_of_words - word_number - 1
      x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
      y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)

def train_twice(sess, get_batch_size, train_step, x, y_):

  train(sess, get_batch_size, train_step, x, y_)
  train(sess, get_batch_size, train_step, x, y_)

def run_test_one_word(sess, x, y, word):

  x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
  word_to_train(word, x_array[0])
  prediction = tf.argmax(y,1)
  prediction = sess.run(prediction, feed_dict={x: x_array})
  return prediction[0]

def word_to_test_words(word, include_original=True, include_remove=True, include_transpose=True):
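  # Not called in this script; the per-case variants below are used instead,
  # and the include_* flags are accepted but ignored.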

  words = []

  #original word

  words.append(word)

  # remove a character

  for i in range(0,len(word)):
    words.append( word[:i] + word[i+1:] )

  # transpose characters

  if len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )
     
  return words

def word_to_train_words(word):

  return [word]

def word_to_test_words_exact(word):

  return [word]

def word_to_test_words_deletion(word):

  words = []
  for i in range(0,len(word)):
    words.append( word[:i] + word[i+1:] )
  return words

def word_to_test_words_transposition(word):

  words = []
  if len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )
  return words

def run_test_for_case(sess, x, y, word_to_test_words_function):


  n_right = 0

  n_checked = 0
 
  for word_number, word in enumerate(words):
    test_words = word_to_test_words_function(word)

    for test_word in test_words:

      prediction = run_test_one_word(sess, x, y, test_word)
      is_right = prediction == word_number

      if show_details:

        #if is_right:
          #print "Right %s matches %s" % (words[word_number], test_word)
        #else:
          #print "Wrong %s found %s was %s" % (test_word, words[prediction], word)
        if not is_right:
          #import pdb; pdb.set_trace()
          print "Wrong %s found %s was %s" % (test_word, words[prediction], word)

      if is_right:

        n_right += 1

      n_checked += 1

  return (n_right, n_checked)

def run_test(words, get_batch_size, train):

  x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
  y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])

  # single-layer softmax regression: one output per dictionary word
  W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))

  b = tf.Variable(tf.zeros([number_of_outputs]))

  y = tf.nn.softmax(tf.matmul(x,W) + b)


  cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
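  # note: tf.log(y) can underflow to log(0) here; tf.nn.softmax_cross_entropy_with_logits
  # is the numerically safer way to write this loss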

  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

  sess = tf.InteractiveSession()

  sess.run(tf.initialize_all_variables())

  #import pdb; pdb.set_trace();

  train(sess, get_batch_size, train_step, x, y_)

  results = {}

  results["exact"] = run_test_for_case(sess, x, y, word_to_test_words_exact)
  results["deletion"] = run_test_for_case(sess, x, y, word_to_test_words_deletion)
  results["transposition"] = run_test_for_case(sess, x, y, word_to_test_words_transposition)
  return results

def run_tests(get_batch_size, train):

  for batch_size in range(100, 1001, 100):
    results = run_test(words, get_batch_size(batch_size), train)
    print "   Batch size %d" % batch_size
    for key in results.keys():
      (n_right, n_checked) = results[key]
      print "%50s %d / %d" % (key, n_right, n_checked)

print "Batches in same order as file, non-full batch appears last"

def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears first"


def get_batch_size(batch_size):

  def get_batch_size2(batch_number):
    # the first batch takes the remainder so that every later batch is full
    if batch_number == 0:
      return number_of_words % batch_size
    else:
      return batch_size
  return get_batch_size2

print "   Train once"

#import pdb; pdb.set_trace();
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"

def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number % 2 == 0:
      return batch_size / 2
    else:
      return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)
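
To reproduce a run like the one above, pass the word list as the first argument; any second argument turns on the per-word error output (the script name here is just a placeholder):

python spelling_try2.py words.txt details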


Monday, August 8, 2016

New Design for Spelling Mistakes - Try 1

Experiment


I updated the design of the neural net to handle spelling mistakes better. There is a word-length section, where each node represents one word length. There is a first-letter section, with one node for each letter; the node matching the word's first letter is hot. And there is a letters section, with one node for each letter, whose input is the number of times that letter appears in the word.
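
As a quick standalone sketch of that letters section (the function name is just for illustration, not from the script below), each letter's node takes the raw count:

def letter_counts(word, number_of_letters=26):
  # one node per letter; the input is how many times that letter appears
  counts = [0] * number_of_letters
  for ch in word:
    counts[ord(ch) - ord('a')] += 1
  return counts

# letter_counts('banana') gives a=3, b=1, n=2 and 0 for every other letter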

Lesson

This was interesting: when I did a small batch run with nine words, the results looked great. When I increased the number of words to 999, it fell apart.

Data

Small Batch Run (9 words)

Batches in same order as file, non-full batch appears last
   Train once
   Batch size 100
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 200
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 300
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 400
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 500
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 600
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 700
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 800
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 900
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 1000
                                     transposition 54 / 55
                                             exact 9 / 9

Large Batch Run (999 words)

Batches in same order as file, non-full batch appears last
   Train once
   Batch size 100
                                     transposition 1989 / 7673
                                             exact 194 / 999
                                          deletion 1882 / 8672
   Batch size 200
                                     transposition 1893 / 7673
                                             exact 181 / 999
                                          deletion 1926 / 8672
   Batch size 300
                                     transposition 450 / 7673
                                             exact 52 / 999
                                          deletion 460 / 8672
   Batch size 400
                                     transposition 704 / 7673
                                             exact 76 / 999
                                          deletion 693 / 8672
   Batch size 500
                                     transposition 1832 / 7673
                                             exact 174 / 999
                                          deletion 1947 / 8672
   Batch size 600
                                     transposition 1051 / 7673
                                             exact 110 / 999
                                          deletion 1049 / 8672
   Batch size 700
                                     transposition 871 / 7673
                                             exact 90 / 999
                                          deletion 894 / 8672
   Batch size 800
                                     transposition 704 / 7673
                                             exact 76 / 999
                                          deletion 693 / 8672
   Batch size 900
                                     transposition 450 / 7673
                                             exact 52 / 999
                                          deletion 460 / 8672

Code

import tensorflow as tf
import numpy as np

"""
For each word there is a length node, a first letter node, and for each letter in the word there is a node with input of 1 * number of letters
"""

import sys
words_file = "words.txt"
if len(sys.argv) > 1:
  words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)

words_txt = open(words_file, "r")
words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()

number_of_letters = 26
number_of_length_nodes = 30
number_of_first_letter_nodes = number_of_letters
number_of_letters_nodes = number_of_letters
number_of_positions = 25
size_of_sections = [
  number_of_length_nodes,
  number_of_first_letter_nodes,
  number_of_letters_nodes
]
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
  last_position += size
  position_of_sections.append(last_position)
number_of_inputs = sum(size_of_sections)

number_of_words = len(words)
number_of_outputs = len(words)

#import pdb; pdb.set_trace();
# setup input nodes for a word
def word_to_train(word, inputs):
  # section 1
  length = len(word)
  inputs[position_of_sections[0]+length] = 1

  if length > 1:
    inputs[position_of_sections[0]+length-1] = 0.5
  if length > 2:
    inputs[position_of_sections[0]+length-2] = 0.25

  if length+1 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+1] = 0.5
  if length+2 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+2] = 0.25

  # section 2
  inputs[position_of_sections[1] + ord(word[0]) - ord('a')] = 1

  # section 3
  for index, ch in enumerate(word):
    inputs[position_of_sections[2] + ord(ch) - ord('a')] += 1

# train one pass over all words; get_batch_size(batch_number) gives each batch's size
def train(sess, get_batch_size, train_step, x, y_):
  batch_number = 0
  batch_size = get_batch_size(batch_number)

  x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)
  y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)

  index = 0
  for word_number, word in enumerate(words):
    word_to_train(word, x_array[index])
    y_array[index][word_number] = 1
    index += 1
    if index == batch_size or word_number+1 == number_of_words:
      batch_number += 1
      batch_size = get_batch_size(batch_number)

      index = 0
      sess.run(train_step, feed_dict={x:x_array, y_: y_array})
      to_do = number_of_words - word_number - 1
      x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
      y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)

def train_twice(sess, get_batch_size, train_step, x, y_):
  train(sess, get_batch_size, train_step, x, y_)
  train(sess, get_batch_size, train_step, x, y_)

def run_test_one_word(sess, x, y, word):
  x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
  word_to_train(word, x_array[0])
  prediction = tf.argmax(y,1)
  prediction = sess.run(prediction, feed_dict={x: x_array})
  return prediction[0]

def word_to_test_words(word, include_original=True, include_remove=True, include_transpose=True):
  words = []

  #original word
  words.append(word)

  # remove a character
  for i in range(0,len(word)):
    words.append( word[:i] + word[i+1:] )

  # transpose characters
  if len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )
     
  return words

def word_to_train_words(word):
  return [word]

def word_to_test_words_exact(word):
  return [word]

def word_to_test_words_deletion(word):
  words = []
  for i in range(0,len(word)):
    words.append( word[:i] + word[i+1:] )
  return words

def word_to_test_words_transposition(word):
  words = []
  if len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )
  return words

def run_test_for_case(sess, x, y, word_to_test_words_function):
  n_right = 0
  n_checked = 0
 
  for word_number, word in enumerate(words):
    test_words = word_to_test_words_function(word)

    for test_word in test_words:
      prediction = run_test_one_word(sess, x, y, test_word)
      is_right = prediction == word_number

      if show_details:
        #if is_right:
          #print "Right %s matches %s" % (words[word_number], test_word)
        #else:
          #print "Wrong %s found %s was %s" % (test_word, words[prediction], word)
        if not is_right:
          #import pdb; pdb.set_trace()
          print "Wrong %s found %s was %s" % (test_word, words[prediction], word)

      if is_right:
        n_right += 1

      n_checked += 1
  return (n_right, n_checked)

def run_test(words, get_batch_size, train):
  x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
  y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])

  W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
  b = tf.Variable(tf.zeros([number_of_outputs]))

  y = tf.nn.softmax(tf.matmul(x,W) + b)
  cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

  sess = tf.InteractiveSession()
  sess.run(tf.initialize_all_variables())

  #import pdb; pdb.set_trace();
  train(sess, get_batch_size, train_step, x, y_)

  results = {}
  results["exact"] = run_test_for_case(sess, x, y, word_to_test_words_exact)
  results["deletion"] = run_test_for_case(sess, x, y, word_to_test_words_deletion)
  results["transposition"] = run_test_for_case(sess, x, y, word_to_test_words_transposition)
  return results

def run_tests(get_batch_size, train):
  for batch_size in range(100, 1001, 100):
    results = run_test(words, get_batch_size(batch_size), train)
    print "   Batch size %d" % batch_size
    for key in results.keys():
      (n_right, n_checked) = results[key]
      print "%50s %d / %d" % (key, n_right, n_checked)

print "Batches in same order as file, non-full batch appears last"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears first"
def get_batch_size(batch_size):
  def get_batch_size2(batch_number):
    if batch_number == 0:
      return number_of_words % batch_size
    else:
      return batch_size
  return get_batch_size2

print "   Train once"
#import pdb; pdb.set_trace();
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number % 2 == 0:
      return batch_size / 2
    else:
      return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)