Sunday, June 18, 2017
Arithmetic Expressions - Try 2
For try number two I am using an RNN. The inputs will be the operators in the expression, and the output of the RNN will be which operator to process next. The training data will be expressions of shorter lengths and the test data expressions of longer lengths, to see whether the RNN learns a generalization that holds across lengths.
For example, with the five operators +, -, *, / and ^, sample inputs will be
+
*
-
/
^
++
+*
The corresponding outputs will be
+
*
-
/
^
+
*
The code is located at https://github.com/cooledge/nn/tree/master/expressions/math_rnn
The code for training is make_data.py. Its arguments are --train <sizes> and --test <sizes>. For make_data --train=1,2,3 --test=4 the output is right(597), wrong(23), so although the model was trained on shorter sequences there was enough information to handle longer ones. For --train=1,2,3 --test=5 the output is right(2697), wrong(423). For --train=1,2,3 --test=6 the output is right(12202), wrong(3417). The current model does not scale well from training on shorter sequences to applying on longer ones. The earlier model that trained on pairwise ops had a better success rate and trained on much less data.
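For reference, here is a minimal sketch of how such (operator sequence, next operator) pairs could be generated. The actual make_data.py may differ; the leftmost-among-highest-priority rule below is an assumption that matches the examples above.

import itertools

OPS = '+-*/^'
PRIORITY = {'+': 0, '-': 0, '*': 1, '/': 1, '^': 2}

def target(ops):
  # the operator to process next: the leftmost of the highest-priority ops
  best = max(PRIORITY[op] for op in ops)
  return next(op for op in ops if PRIORITY[op] == best)

def make_pairs(lengths):
  pairs = []
  for n in lengths:
    for ops in itertools.product(OPS, repeat=n):
      pairs.append((''.join(ops), target(ops)))
  return pairs

print(make_pairs([1, 2])[:7])
# [('+', '+'), ('-', '-'), ('*', '*'), ('/', '/'), ('^', '^'), ('++', '+'), ('+-', '+')]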
I tried changing the LSTM state size and the number of layers; neither had any effect. I also tried adding another fully connected layer with a ReLU on the output, and that had no effect either.
Sunday, June 11, 2017
Arithmetic Expressions - Try 1
Design a neural net that can be used to evaluate arithmetic expressions of the form 1+2*3-7/4. The first try will be a neural net that takes the operators and says which operator to apply next. The input will be a set of positions, with a node for each operator at each position, similar to the first design of the neural net that looks up words. The output will be one node per position; the position whose operator is to be applied first is selected.
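To make the input layout concrete, here is a small sketch of the position-by-operator encoding; it mirrors the input_to_train function in the code below.

import numpy as np

number_of_operators = 4  # '+', '-', '*', '/'
number_of_positions = 4
op_index = {'+': 0, '-': 1, '*': 2, '/': 3}

def encode(ops):
  # one node per (position, operator) pair, one-hot within each position
  x = np.zeros(number_of_positions * number_of_operators)
  for position, op in enumerate(ops):
    x[position * number_of_operators + op_index[op]] = 1
  return x

print(encode('+*'))  # node 0 (position 0, '+') and node 6 (position 1, '*') are set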
This design does not work well enough. I also tried outputting the operator to be used and making the output structure match the input structure; the results were equally bad.
Data
Number of positions 2
Train once 16 / 20
Train twice 16 / 20
Number of positions 3
Train once 56 / 84
Train twice 56 / 84
Number of positions 4
Train once 200 / 340
Train twice 200 / 340
Code
import tensorflow as tf
import numpy as np
"""
Let's learn how to evaluate an arithmetic expression using '+', '-', '*' and '/'
"""
import itertools
import sys

show_details = (len(sys.argv) > 1)

# no hidden layers or one hidden layer
one_level_version = True
# output layer is the operator that is selected or the position in the input
#output_is = "operator"
output_is = "position"
# length of the expressions
number_of_positions = 4

# the output is the position that should be processed next
def train_data_output_is_position(number_of_positions):
  data = {}
  for length in range(1, number_of_positions+1):
    for input in itertools.product('+-*/', repeat=length):
      select = None
      for index, ch in enumerate(input):
        if ch == '*' or ch == '/':
          select = index
          break
      if select is None:
        select = 0
      data[input] = select
  return data

# the output is the operator that should be processed next
def train_data_output_is_operator(number_of_positions):
  data = {}
  for length in range(1, number_of_positions+1):
    for input in itertools.product('+-*/', repeat=length):
      select = None
      for index, ch in enumerate(input):
        if ch == '*' or ch == '/':
          select = operator_to_position(ch)
          break
      if select is None:
        select = 0
      data[input] = select
  return data

def operator_to_position(operator):
  return {
    "+": 0,
    "-": 1,
    "*": 2,
    "/": 3,
  }[operator]

#number_of_positions = 2
#output_is = "operator"
number_of_operators = 4

if output_is == "operator":
  training_data = train_data_output_is_operator(number_of_positions)
  number_of_outputs = number_of_operators
else:
  training_data = train_data_output_is_position(number_of_positions)
  number_of_outputs = number_of_positions
number_of_training_data = len(training_data)

size_of_sections = [
  number_of_positions * number_of_operators
]
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
  position_of_sections.append(last_position + size)
number_of_inputs = sum(size_of_sections)

# set up the input nodes for one expression
def input_to_train(input, inputs):
  for index, operator in enumerate(input):
    inputs[index*number_of_operators + operator_to_position(operator)] = 1

# do the training, batched according to get_batch_size
def train(sess, get_batch_size, train_step, x, y_):
  batch_number = 0
  batch_size = get_batch_size(batch_number)
  x_array = np.zeros(shape=(min(number_of_training_data, batch_size), number_of_inputs), dtype=float)
  y_array = np.zeros(shape=(min(number_of_training_data, batch_size), number_of_outputs), dtype=float)
  index = 0
  to_do = number_of_training_data
  for input, output in training_data.iteritems():
    input_to_train(input, x_array[index])
    y_array[index][output] = 1
    index += 1
    to_do -= 1
    if index == batch_size or input == training_data.keys()[-1]:
      batch_number += 1
      batch_size = get_batch_size(batch_number)
      index = 0
      sess.run(train_step, feed_dict={x: x_array, y_: y_array})
      x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
      y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)

def train_twice(sess, get_batch_size, train_step, x, y_):
  train(sess, get_batch_size, train_step, x, y_)
  train(sess, get_batch_size, train_step, x, y_)

def run_test_one_word(sess, x, y, word):
  x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
  input_to_train(word, x_array[0])
  prediction = tf.argmax(y, 1)
  prediction = sess.run(prediction, feed_dict={x: x_array})
  return prediction[0]

def run_test_for_case(sess, x, y):
  n_right = 0
  n_checked = 0
  for input, output in training_data.iteritems():
    test_words = [input]
    for test_word in test_words:
      prediction = run_test_one_word(sess, x, y, test_word)
      is_right = prediction == output
      if show_details and not is_right:
        print "Wrong %s found %s was %s" % (test_word, prediction, input)
      if is_right:
        n_right += 1
      n_checked += 1
  return (n_right, n_checked)

def run_test(words, get_batch_size, train):
  x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
  y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])
  if one_level_version:
    W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
    b = tf.Variable(tf.zeros([number_of_outputs]))
    y = tf.nn.softmax(tf.matmul(x, W) + b)
  else:
    h_size = 20
    W = tf.Variable(tf.zeros([number_of_inputs, h_size]))
    H = tf.Variable(tf.zeros([h_size, number_of_outputs]))
    b = tf.Variable(tf.zeros([number_of_outputs]))
    y = tf.nn.softmax(tf.matmul(tf.matmul(x, W), H) + b)
  cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
  sess = tf.InteractiveSession()
  sess.run(tf.initialize_all_variables())
  train(sess, get_batch_size, train_step, x, y_)
  results = {}
  results[""] = run_test_for_case(sess, x, y)
  return results

def run_tests(get_batch_size, train):
  results = run_test(training_data, get_batch_size, train)
  for key in results.keys():
    (n_right, n_checked) = results[key]
    print "%50s %d / %d" % (key, n_right, n_checked)

def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    return number_of_training_data
  return get_batch_size_2

print "Number of positions %s" % number_of_positions
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)
Monday, March 27, 2017
Combining the Expression and Hierarchy Neural Nets
The next move is to combine the hierarchy and expression parser neural nets. For this I make a hierarchy where "move" and "bought" are types of "infix", "to" is a type of "preposition", and "a" and "the" are types of "article".
The expression parser is then set up with the "infix", "preposition" and "article" types, where the priorities are "infix" < "preposition" < "article". I made a new class called Chain that takes the output of the hierarchy neural net and uses it as the input to the expression neural net; a sketch of the idea appears after the setup code below. The main change for that was to add code specifying which nodes are outputs from the hierarchy and which nodes are inputs into the expression neural net. That is pretty much all that I did.
The expression parser is trained with this
priorities = [
('preposition', 'article'),
('infix', 'preposition'),
('constant', 'infix'),
('0', 'constant')
]
The hierarchy neural net is trained with this
words = [
('constant', 'constant'),
('to', 'preposition'),
('from', 'preposition'),
('move', 'infix'),
('bought', 'infix'),
('a', 'article'),
('the', 'article'),
# top off the loop
('article', 'article'),
('preposition', 'preposition'),
('infix', 'infix')
]
The setup is done like so
one_hot_spec = ['constant', 'preposition', 'infix', 'article', '0']
hierarchy_model = HierarchyModel(words, 'constant', one_hot_spec)
parser_model = ParserModel(priorities, 'constant', one_hot_spec)
chain_model = ChainModel(hierarchy_model, parser_model)
chain_model.train(session)
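For illustration, here is a rough sketch of what such a chaining wrapper could look like. This is hypothetical; the real ChainModel in chain1.py may differ, and the train/run method names are assumptions.

class ChainModel(object):
  # hypothetical sketch: run the first model, feed its output to the second
  def __init__(self, first, second):
    self.first = first
    self.second = second

  def train(self, session):
    # each sub-model is trained on its own data
    self.first.train(session)
    self.second.train(session)

  def run(self, session, inputs):
    # the output nodes of the hierarchy net become the inputs of the parser net
    intermediate = self.first.run(session, inputs)
    return self.second.run(session, intermediate)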
Here is some sample usage:
Enter an sentence if you dare: move t1 to b2 dfhj ahadshfa sdhfgfah I bought a car
Input Expression: ['move', 't1', 'to', 'b2', 'dfhj', 'ahadshfa', 'sdhfgfah', 'I', 'bought', 'a', 'car']
Output Expression: {'action': 'move', 'thing': 't1', 'to': 'b2'}
Output Expression: {'buyer': 'I', 'action': 'buy', 'thing': {'thing': 'car', 'determiner': 'a'}}
The source code is in expression5.py, hierarchy3.py and chain1.py, and is invoked by running expression5.py.
Thursday, March 16, 2017
Parsing Natural Languages
I think using grammars to parse language is not useful and a dead end. I think language should be parsed using a generalization of the models we use to parse arithmetic expressions. Here is an example for the sentence "The man bought a car", using these operator definitions:
the - prefix priority(400)
man - constant
bought - infix priority(200)
a - prefix priority(400)
car - constant
The sentence can then be parsed by applying the operators in the normal way. An interesting generalization is to allow operators to evaluate to other operators. For example, prepositions evaluate to postfix operators. Consider the sentence "The person from France bought a car".
from - prefix operator priority(300) -> evaluates to a postfix operator priority(100).
The sentence can now be parsed with the "from France" being applied to "The person".
Another generalization is to regard verbs as infix operators having named arguments. Consider the sentence "The man bought a car on Saturday". Update our definition of "bought"
bought - infix priority(200) named args: on
on - see from
The sentence can now be parsed with the "on Saturday" phase being part of the evaluation of the verb "bought". Interestingly for the sentence "The man on the boat bought a car" with the same definitions the "on the boat" is evaluated as a post-fix operator applied to "the man". No extra work.
What about conjunctions? Consider the sentence "The man and woman bought and sold a car and went on a trip". We need two generalizations to handle this. The first is to make a verb into a prefix operator that evaluates into a postfix operator. The second is to make a conjunction into a list terminator whose priority is not a number; instead, it evaluates as soon as the term after the operator and the term before it have the same type, and the result of the evaluation has that same type. The parsing would look like this:
Start:
"the man and woman bought and sold a car and went on a trip"
The types before and after the "and" are the same.
"the (man and woman) (bought and sold) a car and went on a trip"
Apply determiners
"(the (man and woman)) (bought and sold) (a car) and went on (a trip)"
Prefix prepositions
"(the (man and woman)) (bought and sold) (a car) and went (on (a trip))"
Prefix verbs
"(the (man and woman)) ((bought and sold) (a car)) and (went (on (a trip)))"
Apply conjunctions because types match (both are post fix operators of same priority)
"(the (man and woman)) (((bought and sold) (a car)) and (went (on (a trip))))"
Apply postfix verbs
"((the (man and woman)) (((bought and sold) (a car)) and (went (on (a trip)))))"
This also has a beautiful noise-handling feature that is difficult to obtain with grammar-based parsers. Imagine this as input: "blah blah xhsh the man bought a car tree pick the woman bought a boat aa ashsh1334h". Parsing this would recognize "the man bought a car" and "the woman bought a boat" as phrases in addition to the other single-word parses. The noise is ignored naturally by the parser. Sweet!
Let's take my neural net expression parser and apply that to this problem. The code is at https://github.com/cooledge/nn/blob/master/expressions/expressions2.py
Here is sample output that converts sentences to JSON.
Enter an sentence if you dare: h sadhfashdf asd joe bought a car shdhf
Input Expression: ['h', 'sadhfashdf', 'asd', 'joe', 'bought', 'a', 'car', 'shdhf']
Output Expression: {'buyer': 'joe', 'action': 'buy', 'thing': {'determiner': 'a', 'thing': 'car'}}
Enter an sentence if you dare: h sadhfashdf asd joe bought a car shdhf sally bought a jeep
Input Expression: ['h', 'sadhfashdf', 'asd', 'joe', 'bought', 'a', 'car', 'shdhf', 'sally', 'bought', 'a', 'jeep']
Output Expression: {'buyer': 'joe', 'action': 'buy', 'thing': {'determiner': 'a', 'thing': 'car'}}
Output Expression: {'buyer': 'sally', 'action': 'buy', 'thing': {'determiner': 'a', 'thing': 'jeep'}}
Enter an sentence if you dare: h sadhfashdf asd joe bought a car shdhf sally bought a jeep adsafhsdhdhd ddd move the tank to france
Input Expression: ['h', 'sadhfashdf', 'asd', 'joe', 'bought', 'a', 'car', 'shdhf', 'sally', 'bought', 'a', 'jeep', 'adsafhsdhdhd', 'ddd', 'move', 'the', 'tank', 'to', 'france']
Output Expression: {'buyer': 'joe', 'action': 'buy', 'thing': {'determiner': 'a', 'thing': 'car'}}
Output Expression: {'buyer': 'sally', 'action': 'buy', 'thing': {'determiner': 'a', 'thing': 'jeep'}}
Output Expression: {'to': 'france', 'action': 'move', 'thing': {'determiner': 'the', 'thing': 'tank'}}
Sunday, March 12, 2017
Parsing Expressions with Neural Nets
This post describes the start of using a neural net to evaluate a mathematical expression, such as "1 + 2 * 3" equals "7". The idea is to take a list of tokens and have the neural net determine which operator to evaluate next. The training data will be similar to the hierarchy model.
priorities = [
('+', '*'),
('-', '*'),
('+', '/'),
('-', '/'),
('*', '**'),
('/', '**'),
('**', '0'),
('c', '+'),
('c', '-'),
('0', 'c')
]
The values are (lower_priority_op, higher_priority_op). I am purposely minimizing the data so that more of the intelligence is contained in the model. 'c' will be anything that is not an op. '0' just means nothing. This is a work in progress and I thought this was a good place to start.
I trained a neural net that, given the higher-priority op, returns softmax probabilities for the lower-priority ops as a one-hot vector. That is easy to do. The next step is to generate a model that will take in a sequence of one-hot vectors corresponding to the input expression and return, for each element, the probability that it is the next to process. For example, assuming the ops are c, +, -, *, /, **, 0, for the expression "1 + 2 * 3" the inputs will be
[1,0,0,0,0,0,0] [0,1,0,0,0,0,0] [1,0,0,0,0,0,0] [0,0,0,1,0,0,0] [1,0,0,0,0,0,0]
The output target will be
[0,0,0,1,0]
This indicates that the op to process next is the multiply. For this step I am not training a new model; the model is only trained on pairwise op priorities, and its output is used as input to the parse model. Here is an example of how the parse model works. For each input vector, a prediction is calculated using the standard softmax with the trained model:
[0,0,0,0,0,0,1] [1,0,0,0,0,0,1] [0,0,0,0,0,0,1] [1,1,1,0,0,0,1] [0,0,0,0,0,0,1]
The next step is to sum the columns.
[2,1,1,0,0,0,5]
The next step is to subtract this from the input vectors and apply a ReLU, yielding
[0,0,0,0,0,0,0] [0,0,0,0,0,0,0] [0,0,0,0,0,0,0] [0,0,0,1,0,0,0] [0,0,0,0,0,0,0]
Then sum the vector at each position:
[0,0,0,1,0]
Then select the argmax, and that is the next op to run. The numbers shown here are idealized; the real numbers are more fractional.
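The idealized arithmetic above can be checked with a few lines of NumPy; the hard-coded 0/1 prediction rows stand in for the trained model's fractional softmax outputs.

import numpy as np

ops = ['c', '+', '-', '*', '/', '**', '0']
# one-hot rows for the tokens of "1 + 2 * 3": c + c * c
inputs = np.array([
  [1,0,0,0,0,0,0],  # 1
  [0,1,0,0,0,0,0],  # +
  [1,0,0,0,0,0,0],  # 2
  [0,0,0,1,0,0,0],  # *
  [1,0,0,0,0,0,0],  # 3
], dtype=float)
# idealized pairwise predictions: which ops have lower priority than each token
preds = np.array([
  [0,0,0,0,0,0,1],
  [1,0,0,0,0,0,1],
  [0,0,0,0,0,0,1],
  [1,1,1,0,0,0,1],
  [0,0,0,0,0,0,1],
], dtype=float)
column_sums = preds.sum(axis=0)               # [2,1,1,0,0,0,5]
scores = np.maximum(inputs - column_sums, 0)  # relu(input - column sums)
position_scores = scores.sum(axis=1)          # [0,0,0,1,0]
print(np.argmax(position_scores))             # 3, the position of '*'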
This is a sample of the app running
Enter an arithmetic expression: 1 + 1 + 3 * 4
Input Expression: ['1', '+', '1', '+', '3', '*', '4']
Output Expression: [14.0]
The app is here https://github.com/cooledge/nn/blob/master/expressions/expressions.py
Monday, February 20, 2017
Hierarchies in Neural Nets
I took some time off to learn more about neural nets using TensorFlow. Now I am back. I want to investigate using neural nets to model hierarchies. I will start with a simple hierarchy of animals. This is my input data.
hierarchy = {
"bear": "mammal",
"tiger": "mammal",
"whale": "mammal",
"ostrich": "bird",
"peacock": "bird",
"eagle": "bird",
"salmon": "fish",
"goldfish": "fish",
"guppy" : "fish",
"turtle": "reptile",
"crocodile": "reptile",
"snake": "reptile",
"frog": "amphibian",
"toad": "amphibian",
"newt": "amphibian",
"ant": "invertibrates",
"cockroach": "invertibrates",
"ladybird": "invertibrates",
"mammal": "vertibrates",
"birds": "vertibrates",
"fish": "vertibrates",
"reptiles": "vertibrates",
"amphibians": "vertibrates",
"vertibrates": "animals",
"invertibrates": "animals",
"animals": "animals" # loop for things with no higher class
}
I want a neural net that will take one of the objects in the hierarchy and return a one-hot vector that indicates the nodes in the hierarchy that are its ancestors. For example, when I input "bear", I want to get a one-hot vector that has "bear", "mammal", "vertibrates", and "animals" marked as hot. Then one could imagine feeding that into a neural net that knew about vertebrates and could act on the vertebrate-ness of a bear when being fed the input "bear".
The other thing I want is for the neural net to learn the hierarchy from only minimal information. For example, instead of training the neural net that the input "bear" maps to a one-hot vector of the aforementioned nodes, I want to train it that a "bear" is a "mammal" and "mammals" are "vertibrates". The neural net, because of its structure, would infer that "bears" are "vertibrates".
The reason for doing that is that I don't want to encode the knowledge of the hierarchy in the training data; instead I want to encode it in the neural net. That better models the way humans learn. Humans don't get training data about whole hierarchies; they piece them together. I want the knowledge of hierarchy-ness in the neural net.
The model I came up with is in my github repo, https://github.com/cooledge/nn/commit/ad79bef1af9b3fed40e89e765c6e28ba377d7544. The approach is to train a simple neural net that takes a one-hot vector for the child and predicts a one-hot vector for the parent. For example, when "vertibrates" is input the prediction is "animals". That is trained in the usual way. The result is a W and b, the usual matrices.
Then for the model that predicts the full hierarchy I used this:
l1 = softmax(I*W + b)
l2 = softmax(l1*W + b)
...
lk = softmax(l(k-1)*W + b)
Y = l1 + l2 + ... + lk
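Here is a minimal NumPy sketch of the unrolling, using an idealized 0/1 child-to-parent matrix in place of the learned softmax(I*W + b):

import numpy as np

nodes = ['bear', 'mammal', 'vertibrates', 'animals']
parents = {'bear': 'mammal', 'mammal': 'vertibrates',
           'vertibrates': 'animals', 'animals': 'animals'}
n = len(nodes)
P = np.zeros((n, n))  # stands in for the learned W and b
for child, parent in parents.items():
  P[nodes.index(child), nodes.index(parent)] = 1.0

def ancestors(name, k=4):
  l = np.zeros(n)
  l[nodes.index(name)] = 1.0
  y = np.zeros(n)
  for _ in range(k):  # l1 = I*P, l2 = l1*P, ..., summed into Y
    l = l.dot(P)
    y += l
  return [nodes[i] for i in range(n) if y[i] > 0.5]

print(ancestors('bear'))  # ['mammal', 'vertibrates', 'animals']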
I left out some of the math since this is the basic idea. Turns out this works. Here is some sample output from the program.
Enter type type: salmon
Output (prob > 50%)
fish has prob 1.000000
vertibrates has prob 1.000000
animals has prob 1.000000
salmon has prob 1.000000
Enter type type: bird
Output (prob > 50%)
vertibrates has prob 0.894435
animals has prob 1.000000
bird has prob 1.000000
Cool. I am doing a free neural net course on Udacity. After that I will come back to this and investigate using embeddings instead of one-hot vectors. Also it would be fun to hook up my spelling-mistake code.
Thursday, August 11, 2016
New Design for spelling mistakes - Try 2
Experiment
Here are some examples of errors in the lookup
Wrong absorptive found advertisements was absorptive
Wrong abstain found absentmindedness was abstain
Wrong abstained found absentmindedness was abstained
Wrong abstainer found absentmindedness was abstainer
Wrong abstainers found absentmindedness was abstainers
Wrong abstaining found administrating was abstaining
Wrong abstains found absentmindedness was abstains
Wrong abstemious found abstemiousness was abstemious
Wrong abstemiously found abstemiousness was abstemiously
Wrong abstention found absentmindedness was abstention
Wrong abstentions found absentmindedness was abstentions
Wrong abstinence found absentmindedness was abstinence
Wrong abstinent found absentmindedness was abstinent
Wrong abstract found abstracts was abstract
Wrong abstracted found accelerates was abstracted
Wrong abstractedly found aesthetically was abstractedly
I think it was wrong to use a raw count as the input to the node that represented how many times a letter appeared in the word. I am going to change that to have a node for each count bucket of a letter: 0, 1, 2, and more.
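As a small illustration of the new encoding (it mirrors section 3 of word_to_train in the code below), each letter gets four nodes and exactly one of them fires:

def letter_count_bucket(word, letter):
  # returns which of the four count nodes (0, 1, 2, more) fires for a letter
  count = word.count(letter)
  return min(count, 3)  # 3 means "more than 2"

print(letter_count_bucket('apple', 'p'))  # 2: the "count == 2" node for 'p'
print(letter_count_bucket('apple', 'z'))  # 0: the "count == 0" node for 'z'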
Lesson
The new design handles spelling mistakes far better without any training on spelling mistakes. The design still needs to get all the words right for the exact case, so some refinement is needed. In addition, I need to get this running on a video card so I can check the effectiveness with the full range of words, since in the last experiment the precision with a small amount of input was out of line with the larger amount of input.
Data
Train once
Batch size 100
transposition 7644 / 7673
exact 993 / 999
deletion 7139 / 8672
Code
import tensorflow as tf
import numpy as np
"""
For each word there is a length node, a first letter node, and for each letter in the word there is a node with input of 1 * number of letters
"""
import sys

words_file = "words.txt"
if len(sys.argv) > 1:
  words_file = sys.argv[1]
show_details = (len(sys.argv) > 2)

words_txt = open(words_file, "r")
words = words_txt.read().split('\n')
words.pop() # last is empty string
words_txt.close()

number_of_letters = 26
number_of_length_nodes = 30
number_of_first_letter_nodes = number_of_letters
number_of_counts_for_letter_nodes = 4
number_of_letters_nodes = number_of_letters * number_of_counts_for_letter_nodes
number_of_positions = 25

size_of_sections = [
  number_of_length_nodes,
  number_of_first_letter_nodes,
  number_of_letters_nodes
]
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
  position_of_sections.append(last_position + size)
number_of_inputs = sum(size_of_sections)
number_of_words = len(words)
number_of_outputs = len(words)

# setup input nodes for a word
def word_to_train(word, inputs):
  # section 1 - length of word, smeared over neighbouring lengths
  length = len(word)
  inputs[position_of_sections[0]+length] = 1
  if length > 1:
    inputs[position_of_sections[0]+length-1] = 0.5
  if length > 2:
    inputs[position_of_sections[0]+length-2] = 0.25
  if length+1 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+1] = 0.5
  if length+2 < number_of_length_nodes:
    inputs[position_of_sections[0]+length+2] = 0.25
  # section 2 - first letter of word
  inputs[position_of_sections[1] + ord(word[0]) - ord('a')] = 1
  # section 3 - number of times letter appears in word (0,1,2,more)
  counts = [0] * number_of_letters
  for index, ch in enumerate(word):
    counts[ord(ch) - ord('a')] += 1
  for letter_index, count in enumerate(counts):
    if count == 0:
      count_index = 0
    elif count == 1:
      count_index = 1
    elif count == 2:
      count_index = 2
    else:
      count_index = 3
    inputs[position_of_sections[2] + letter_index*number_of_counts_for_letter_nodes + count_index] = 1

# do the training, batched according to get_batch_size
def train(sess, get_batch_size, train_step, x, y_):
  batch_number = 0
  batch_size = get_batch_size(batch_number)
  x_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_inputs), dtype=float)
  y_array = np.zeros(shape=(min(number_of_words, batch_size), number_of_outputs), dtype=float)
  index = 0
  for word_number, word in enumerate(words):
    word_to_train(word, x_array[index])
    y_array[index][word_number] = 1
    index += 1
    if index == batch_size or word_number+1 == number_of_words:
      batch_number += 1
      batch_size = get_batch_size(batch_number)
      index = 0
      sess.run(train_step, feed_dict={x: x_array, y_: y_array})
      to_do = number_of_words - word_number - 1
      x_array = np.zeros(shape=(min(batch_size, to_do), number_of_inputs), dtype=float)
      y_array = np.zeros(shape=(min(batch_size, to_do), number_of_outputs), dtype=float)

def train_twice(sess, get_batch_size, train_step, x, y_):
  train(sess, get_batch_size, train_step, x, y_)
  train(sess, get_batch_size, train_step, x, y_)

def run_test_one_word(sess, x, y, word):
  x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
  word_to_train(word, x_array[0])
  prediction = tf.argmax(y, 1)
  prediction = sess.run(prediction, feed_dict={x: x_array})
  return prediction[0]

def word_to_test_words(word, include_original=True, include_remove=True, include_transpose=True):
  words = []
  # original word
  words.append(word)
  # remove a character
  for i in range(0, len(word)):
    words.append(word[:i] + word[i+1:])
  # transpose characters
  if len(word) > 1:
    for i in range(0, len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append(before + letter2 + letter1 + after)
  return words

def word_to_train_words(word):
  return [word]

def word_to_test_words_exact(word):
  return [word]

def word_to_test_words_deletion(word):
  words = []
  for i in range(0, len(word)):
    words.append(word[:i] + word[i+1:])
  return words

def word_to_test_words_transposition(word):
  words = []
  if len(word) > 1:
    for i in range(0, len(word)-1):
      before = word[:i]
      letter1 = word[i]
      letter2 = word[i+1]
      after = word[i+2:]
      words.append(before + letter2 + letter1 + after)
  return words

def run_test_for_case(sess, x, y, word_to_test_words_function):
  n_right = 0
  n_checked = 0
  for word_number, word in enumerate(words):
    test_words = word_to_test_words_function(word)
    for test_word in test_words:
      prediction = run_test_one_word(sess, x, y, test_word)
      is_right = prediction == word_number
      if show_details and not is_right:
        print "Wrong %s found %s was %s" % (test_word, words[prediction], word)
      if is_right:
        n_right += 1
      n_checked += 1
  return (n_right, n_checked)

def run_test(words, get_batch_size, train):
  x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
  y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])
  W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
  b = tf.Variable(tf.zeros([number_of_outputs]))
  y = tf.nn.softmax(tf.matmul(x, W) + b)
  cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
  sess = tf.InteractiveSession()
  sess.run(tf.initialize_all_variables())
  train(sess, get_batch_size, train_step, x, y_)
  results = {}
  results["exact"] = run_test_for_case(sess, x, y, word_to_test_words_exact)
  results["deletion"] = run_test_for_case(sess, x, y, word_to_test_words_deletion)
  results["transposition"] = run_test_for_case(sess, x, y, word_to_test_words_transposition)
  return results

def run_tests(get_batch_size, train):
  for batch_size in range(100, 1001, 100):
    results = run_test(words, get_batch_size(batch_size), train)
    print " Batch size %d" % batch_size
    for key in results.keys():
      (n_right, n_checked) = results[key]
      print "%50s %d / %d" % (key, n_right, n_checked)

print "Batches in same order as file, non-full batch appears last"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    return batch_size
  return get_batch_size_2
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears first"
def get_batch_size(batch_size):
  def get_batch_size2(batch_number):
    if batch_number == 0:
      return number_of_words % batch_size
    else:
      return batch_size
  return get_batch_size2
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)

print "Batches in same order as file, non-full batch appears last, batch size alternate by div 2"
def get_batch_size(batch_size):
  def get_batch_size_2(batch_number):
    if batch_number % 2 == 0:
      return batch_size / 2
    else:
      return batch_size
  return get_batch_size_2
print " Train once"
run_tests(get_batch_size, train)
print " Train twice"
run_tests(get_batch_size, train_twice)