Monday, August 8, 2016

New Design for Spelling Mistakes - Try 1

Experiment


I updated the design of the neural net to handle spelling mistakes better. The input is now split into three sections. In the word-length section, each node represents a possible word length, and the node matching the word's length is hot. In the first-letter section, there is one node per letter, and the node matching the word's first letter is hot. In the letters section, there is one node per letter, and each node's input is the number of times that letter appears in the word.
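
Below is a minimal sketch of the encoding for a single word, assuming the same fixed layout the full script uses (30 length nodes, then 26 first-letter nodes, then 26 letter-count nodes). The full word_to_train below also spreads 0.5 and 0.25 activations to the neighboring length nodes; that part is omitted here for brevity.

import numpy as np

LENGTH_NODES, LETTERS = 30, 26

def encode(word):
  # input layout: [length section | first-letter section | letter-count section]
  inputs = np.zeros(LENGTH_NODES + 2 * LETTERS)
  inputs[len(word)] = 1                                       # length node is hot
  inputs[LENGTH_NODES + ord(word[0]) - ord('a')] = 1          # first-letter node is hot
  for ch in word:
    inputs[LENGTH_NODES + LETTERS + ord(ch) - ord('a')] += 1  # count each letter
  return inputs

v = encode("letter")  # length node 6 is hot, 'l' is hot, 'e' and 't' each count 2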

Lesson

This looked promising on a small run with nine words, where the results were nearly perfect. When I increased the word list to 999 words it fell apart: accuracy dropped sharply and swung widely with batch size.
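
For reference, the three rows in the tables below count predictions over each word itself (exact), every single-character deletion, and every adjacent-letter swap (transposition), generated the same way as the word_to_test_words_* helpers in the code below. A quick sketch with a hypothetical word:

word = "cat"
deletions = [word[:i] + word[i+1:] for i in range(len(word))]
# ['at', 'ct', 'ca']
transpositions = [word[:i] + word[i+1] + word[i] + word[i+2:]
                  for i in range(len(word) - 1)]
# ['act', 'cta']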

Data

Small Batch Run (9 words)

Batches in same order as file, non-full batch appears last
   Train once
   Batch size 100
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 200
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 300
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 400
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 500
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 600
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 700
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 800
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 900
                                     transposition 54 / 55
                                             exact 9 / 9
                                          deletion 58 / 64
   Batch size 1000
                                     transposition 54 / 55
                                             exact 9 / 9

Large Batch Run (999 words)

Batches in same order as file, non-full batch appears last
   Train once
   Batch size 100
                                     transposition 1989 / 7673
                                             exact 194 / 999
                                          deletion 1882 / 8672
   Batch size 200
                                     transposition 1893 / 7673
                                             exact 181 / 999
                                          deletion 1926 / 8672
   Batch size 300
                                     transposition 450 / 7673
                                             exact 52 / 999
                                          deletion 460 / 8672
   Batch size 400
                                     transposition 704 / 7673
                                             exact 76 / 999
                                          deletion 693 / 8672
   Batch size 500
                                     transposition 1832 / 7673
                                             exact 174 / 999
                                          deletion 1947 / 8672
   Batch size 600
                                     transposition 1051 / 7673
                                             exact 110 / 999
                                          deletion 1049 / 8672
   Batch size 700
                                     transposition 871 / 7673
                                             exact 90 / 999
                                          deletion 894 / 8672
   Batch size 800
                                     transposition 704 / 7673
                                             exact 76 / 999
                                          deletion 693 / 8672
   Batch size 900
                                     transposition 450 / 7673
                                             exact 52 / 999
                                          deletion 460 / 8672

Code
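
The script below reads a newline-separated word list (words.txt by default, or the file named by the first command-line argument); passing any second argument makes it print each wrong prediction. It targets the TensorFlow API of the time (Python 2 print statements, tf.initialize_all_variables).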

import tensorflow as tf
import numpy as np

"""
For each word there is a length node, a first letter node, and for each letter in the word there is a node with input of 1 * number of letters
"""

import sys
words_file = "words.txt"
if len(sys.argv) > 1:
  words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)

words_txt = open(words_file, "r")
words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()

number_of_letters = 26
number_of_length_nodes = 30
number_of_first_letter_nodes = number_of_letters
number_of_letters_nodes = number_of_letters
number_of_positions = 25
size_of_sections = [
  number_of_length_nodes,
  number_of_first_letter_nodes,
  number_of_letters_nodes
]
# starting offset of each section within the input vector
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
  last_position += size
  position_of_sections.append(last_position)
number_of_inputs = sum(size_of_sections)

number_of_words = len(words)
number_of_outputs = len(words)

# set up the input nodes for one word (fills one row of the input array)
def word_to_train(word, inputs):
  # section 1: word length, with 0.5/0.25 spread to neighboring lengths
  length = len(word)
  inputs[position_of_sections[0]+length] = 1

  if length > 1:
    inputs[position_of_sections[0]+length-1] = 0.5
  if length > 2:
    inputs[position_of_sections[0]+length-2] = 0.25

  if length+1 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+1] = 0.5
  if length+2 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+2] = 0.25

  # section 2: first letter
  inputs[position_of_sections[1] + ord(word[0]) - ord('a')] = 1

  # section 3: letter counts
  for index, ch in enumerate(word):
    inputs[position_of_sections[2] + ord(ch) - ord('a')] += 1

# train over all the words in batches; get_batch_size(batch_number) gives each batch's size
def train(sess, get_batch_size, train_step, x, y_):
  batch_number = 0
  batch_size = get_batch_size(batch_number)

  x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)
  y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)

  index = 0
  for word_number, word in enumerate(words):
    word_to_train(word, x_array[index])
    y_array[index][word_number] = 1
    index += 1
    if index == batch_size or word_number+1 == number_of_words:
      batch_number += 1
      batch_size = get_batch_size(batch_number)

      index = 0
      sess.run(train_step, feed_dict={x:x_array, y_: y_array})
      to_do = number_of_words - word_number - 1
      x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
      y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)

def train_twice(sess, get_batch_size, train_step, x, y_):
  train(sess, get_batch_size, train_step, x, y_)
  train(sess, get_batch_size, train_step, x, y_)

def run_test_one_word(sess, x, y, word):
  x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
  word_to_train(word, x_array[0])
  # note: this builds a new argmax op on each call; it could be hoisted out of the test loop
  prediction = tf.argmax(y, 1)
  prediction = sess.run(prediction, feed_dict={x: x_array})
  return prediction[0]

def word_to_test_words(word, include_original=True, include_remove=True, include_transpose=True):
  words = []

  # original word
  if include_original:
    words.append(word)

  # remove a character
  if include_remove:
    for i in range(0,len(word)):
      words.append( word[:i] + word[i+1:] )

  # transpose adjacent characters
  if include_transpose and len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )

  return words

def word_to_train_words(word):
  return [word]

def word_to_test_words_exact(word):
  return [word]

def word_to_test_words_deletion(word):
  words = []
  for i in range(0,len(word)):
    words.append( word[:i] + word[i+1:] )
  return words

def word_to_test_words_transposition(word):
  words = []
  if len(word) > 1:
    for i in range(0,len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append( before + letter2 + letter1 + after )
  return words

def run_test_for_case(sess, x, y, word_to_test_words_function):
  n_right = 0
  n_checked = 0
 
  for word_number, word in enumerate(words):
    test_words = word_to_test_words_function(word)

    for test_word in test_words:
      prediction = run_test_one_word(sess, x, y, test_word)
      is_right = prediction == word_number

      # optionally report each wrong prediction
      if show_details and not is_right:
        print "Wrong %s found %s was %s" % (test_word, words[prediction], word)

      if is_right:
        n_right += 1

      n_checked += 1
  return (n_right, n_checked)

def run_test(words, get_batch_size, train):
  x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
  y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])

  W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
  b = tf.Variable(tf.zeros([number_of_outputs]))

  y = tf.nn.softmax(tf.matmul(x,W) + b)
  cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

  sess = tf.InteractiveSession()
  sess.run(tf.initialize_all_variables())

  train(sess, get_batch_size, train_step, x, y_)

  results = {}
  results["exact"] = run_test_for_case(sess, x, y, word_to_test_words_exact)
  results["deletion"] = run_test_for_case(sess, x, y, word_to_test_words_deletion)
  results["transposition"] = run_test_for_case(sess, x, y, word_to_test_words_transposition)
  return results

def run_tests(get_batch_size, train):
  for batch_size in range(100, 1001, 100):
    results = run_test(words, get_batch_size(batch_size), train)
    print "   Batch size %d" % batch_size
    for key in results.keys():
      (n_right, n_checked) = results[key]
      print "%50s %d / %d" % (key, n_right, n_checked)

print "Batches in same order as file, non-full batch appears last"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears first"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number == 0:
      return number_of_words % batch_size
    else:
      return batch_size
  return get_batch_size_2

print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number % 2 == 0:
      return batch_size / 2
    else:
      return batch_size
  return get_batch_size_2
print "   Train once"
run_tests(get_batch_size, train)
print "   Train twice"
run_tests(get_batch_size, train_twice)
