Sunday, June 18, 2017
Arithmetic Expressions - Try 2
I am trying an RNN for try number two. The inputs will be the operators in the expression. The output of the RNN will be which operator to process. The training data will be expressions of a shorter length. The test data will be expressions of a longer length, to see whether the RNN learned a generalization that carries over to longer expressions.
For example, for five operators +, -, *, / and ^ sample input will be
+
*
-
/
^
++
+*
The output will be
+
*
-
/
^
+
*
The code is located at https://github.com/cooledge/nn/tree/master/expressions/math_rnn
The code for training is make_data.py. The arguments are --train <sizes> --test <sizes>. For make_data --train=1,2,3 --test=4 the output is right(597), wrong(23). So although the model was trained on shorter sequences, there was enough information for longer sequences. For --train=1,2,3 --test=5 the output is right(2697), wrong(423). For --train=1,2,3 --test=6 the output is right(12202), wrong(3417). The current model does not scale well when it is trained on shorter sequences and applied to longer sequences. The other model that I had, which trained on pairwise ops, had a better success rate and trained on much less data.
I tried changing the LSTM state size and the number of layers; that had no effect. I tried adding another fully connected layer with a ReLU on the output, and that had no effect either.
Sunday, June 11, 2017
Arithmetic Expressions - Try 1
Design a neural net that can be used to evaluate arithmetic expressions of the form 1+2*3-7/4. The first try will be a neural net that takes the operators and says which operator to apply next. The input will be positions, and at each position there will be a node for each operator, similar to the first design of the neural net to look up words. The output will be one node for each position. The position that is to be applied first will be selected.
This design does not work well enough. I also tried outputting the operator to be used and making the output structure match the input structure. The results were equally bad.
Number of positions 2
Train once 16 / 20
Train twice 16 / 20
Number of positions 3
Train once 56 / 84
Train twice 56 / 84
Number of positions 4
Train once 200 / 340
Train twice 200 / 340
"""
Let's learn how to evaluate an arithmetic expression using '+', '-', '*' and '/'
"""
import itertools
import sys

import numpy as np
# The functions below use the ``tf`` name, so TensorFlow must be imported here.
import tensorflow as tf

# Passing any command-line argument turns on reporting of wrong predictions.
show_details = (len(sys.argv) > 1)
# no hidden layers or one hidden layer
one_level_version = True
# The output layer is either the operator that is selected ("operator")
# or the position in the input of that operator ("position").
#output_is = "operator"
output_is = "position"
# Maximum length of the expressions (number of operators).
number_of_positions = 4
# the output is the position that should be processed next
def train_data_output_is_position(number_of_positions):
    """Build the training set whose label is the position to process next.

    Every sequence over '+', '-', '*', '/' of length 1 up to
    ``number_of_positions`` is generated. The label of a sequence is the
    index of its first '*' or '/', or 0 when it contains neither.

    Returns a dict mapping each operator tuple to its integer label.
    """
    data = {}
    for length in range(1, number_of_positions + 1):
        for operators in itertools.product('+-*/', repeat=length):
            # The default of 0 covers sequences made only of '+' and '-'.
            data[operators] = next(
                (index for index, ch in enumerate(operators) if ch in '*/'),
                0,
            )
    return data
# the output is the operator that should be processed next
def train_data_output_is_operator(number_of_positions):
    """Build the training set whose label is the operator to process next.

    Every sequence over '+', '-', '*', '/' of length 1 up to
    ``number_of_positions`` is generated. The label is the index (see
    ``operator_to_position``) of the first '*' or '/' in the sequence; when
    the sequence has neither, it is the index of the leftmost operator.

    Returns a dict mapping each operator tuple to its integer label.
    """
    data = {}
    for length in range(1, number_of_positions + 1):
        for operators in itertools.product('+-*/', repeat=length):
            # Fall back to the leftmost operator rather than a fixed 0, which
            # would label a sequence starting with '-' as '+'.
            chosen = next((ch for ch in operators if ch in '*/'), operators[0])
            data[operators] = operator_to_position(chosen)
    return data


def operator_to_position(operator):
    """Return the index of ``operator`` in the order '+', '-', '*', '/'.

    Raises KeyError for any other value.
    """
    return {
        "+": 0,
        "-": 1,
        "*": 2,
        "/": 3,
    }[operator]
import sys

# Manual overrides that can be uncommented while experimenting.
#number_of_positions = 2
#output_is = "operator"

# Number of distinct operators: '+', '-', '*', '/'.
number_of_operators = 4

# Pick the training set and the width of the output layer for the chosen
# output encoding.
if output_is == "operator":
    training_data = train_data_output_is_operator(number_of_positions)
    number_of_outputs = number_of_operators
else:
    training_data = train_data_output_is_position(number_of_positions)
    number_of_outputs = number_of_positions
number_of_training_data = len(training_data)

# The input vector is a concatenation of sections; the only section holds one
# node per (position, operator) pair.
size_of_sections = [
    number_of_positions * number_of_operators
]
# Running offsets: the start of each section, then the end of the last one.
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
    # Advance the running offset so later sections start after earlier ones.
    last_position += size
    position_of_sections.append(last_position)
number_of_inputs = sum(size_of_sections)
# setup input nodes for a word
def input_to_train(input, inputs):
    """Encode an operator sequence into ``inputs`` in place.

    input: sequence of operator characters.
    inputs: writable, indexable row; for the operator at position ``index``
        the slot ``index * number_of_operators + operator_to_position(op)``
        is set to 1. Other slots are left untouched.
    """
    for index, operator in enumerate(input):
        inputs[index * number_of_operators + operator_to_position(operator)] = 1
# do the training(get_batch_size)
def train(sess, get_batch_size, train_step, x, y_):
    """Make one pass over ``training_data``, running ``train_step`` per batch.

    sess: object whose ``run(train_step, feed_dict=...)`` performs one step.
    get_batch_size: callable taking the batch number and returning the size
        of that batch.
    train_step: the operation handed to ``sess.run``.
    x, y_: feed_dict keys for the input rows and the one-hot label rows.
    """
    batch_number = 0
    batch_size = get_batch_size(batch_number)
    rows = min(number_of_training_data, batch_size)
    x_array = np.zeros(shape=(rows, number_of_inputs), dtype=float)
    y_array = np.zeros(shape=(rows, number_of_outputs), dtype=float)
    index = 0
    to_do = number_of_training_data
    for operators, output in training_data.items():
        input_to_train(operators, x_array[index])
        y_array[index][output] = 1
        index += 1
        to_do -= 1
        # Flush when the batch is full or the data is exhausted. ``to_do == 0``
        # identifies the last example without rebuilding the key list on
        # every iteration.
        if index == batch_size or to_do == 0:
            batch_number += 1
            batch_size = get_batch_size(batch_number)
            index = 0
            sess.run(train_step, feed_dict={x: x_array, y_: y_array})
            # Fresh buffers sized for the next (possibly shorter) batch.
            rows = min(batch_size, to_do)
            x_array = np.zeros(shape=(rows, number_of_inputs), dtype=float)
            y_array = np.zeros(shape=(rows, number_of_outputs), dtype=float)
def train_twice(sess, get_batch_size, train_step, x, y_):
    """Call ``train`` two times in a row with the same arguments."""
    for _ in range(2):
        train(sess, get_batch_size, train_step, x, y_)
def run_test_one_word(sess, x, y, word):
    """Return the index of the highest-scoring output for one operator sequence.

    sess: session used to evaluate ``y``.
    x: feed_dict key for the input row.
    y: the model output evaluated by ``sess.run``.
    word: operator sequence, encoded with ``input_to_train``.
    """
    x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
    input_to_train(word, x_array[0])
    # Take the argmax in NumPy on the evaluated scores; building
    # ``tf.argmax(y, 1)`` here would add a new op to the graph on every call.
    scores = sess.run(y, feed_dict={x: x_array})
    return np.argmax(scores, axis=1)[0]
def run_test_for_case(sess, x, y):
    """Score the model on every entry of ``training_data``.

    Each operator sequence is classified with ``run_test_one_word`` and the
    prediction is compared with its stored label. When ``show_details`` is
    set, every wrong prediction is printed with the expected label.

    Returns a tuple ``(n_right, n_checked)``.
    """
    n_right = 0
    n_checked = 0
    for operators, output in training_data.items():
        prediction = run_test_one_word(sess, x, y, operators)
        is_right = prediction == output
        if show_details and not is_right:
            # Report the expected label; the sequence itself is already the
            # first field. No debugger breakpoint here, so the script can run
            # unattended.
            print("Wrong %s found %s was %s" % (operators, prediction, output))
        if is_right:
            n_right += 1
        n_checked += 1
    return (n_right, n_checked)
def run_test(words, get_batch_size, train):
    """Build the model, train it, and score it on the training data.

    words: unused; kept so existing callers keep working.
    get_batch_size: forwarded unchanged to ``train``.
    train: callable ``(sess, get_batch_size, train_step, x, y_)`` that does
        the training.

    Returns a dict with the single key "" mapped to the
    ``(n_right, n_checked)`` tuple from ``run_test_for_case``.
    """
    # A private graph and a context-managed session keep repeated calls from
    # piling variables into the default graph, and release the session on exit.
    with tf.Graph().as_default(), tf.Session() as sess:
        x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
        y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])
        if one_level_version:
            W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
            b = tf.Variable(tf.zeros([number_of_outputs]))
            logits = tf.matmul(x, W) + b
        else:
            h_size = 20
            # Random start values: with both matrices at zero the gradient of
            # each one is multiplied by the other and stays zero.
            # NOTE(review): there is no activation between the two matmuls,
            # so this variant is still a linear model.
            W = tf.Variable(tf.truncated_normal([number_of_inputs, h_size], stddev=0.1))
            H = tf.Variable(tf.truncated_normal([h_size, number_of_outputs], stddev=0.1))
            b = tf.Variable(tf.zeros([number_of_outputs]))
            logits = tf.matmul(tf.matmul(x, W), H) + b
        y = tf.nn.softmax(logits)
        # Cross-entropy computed from the logits, avoiding log(0) on a
        # saturated softmax.
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
        train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
        sess.run(tf.initialize_all_variables())
        train(sess, get_batch_size, train_step, x, y_)
        return {"": run_test_for_case(sess, x, y)}
def run_tests(get_batch_size, train):
    """Run ``run_test`` on ``training_data`` and print one line per result.

    get_batch_size, train: forwarded unchanged to ``run_test``.
    Each printed line shows the result key followed by "right / checked".
    """
    results = run_test(training_data, get_batch_size, train)
    for key, (n_right, n_checked) in results.items():
        # Single-argument print call: same output on Python 2 and Python 3.
        print("%50s %d / %d" % (key, n_right, n_checked))
def get_batch_size(batch_size=None):
    """Return a function that maps a batch number to the batch size to use.

    batch_size: size returned for every batch. ``None`` means one batch
        holding the whole training set (``number_of_training_data``).
    """
    def get_batch_size_2(batch_number):
        # Honor the requested size; previously the argument was ignored.
        if batch_size is None:
            return number_of_training_data
        return batch_size
    return get_batch_size_2
# Driver: train and score the model, first with one pass and then with two.
# ``get_batch_size`` is a factory, so it is called here to obtain the
# batch-number -> batch-size function that ``train`` expects. The whole
# training set is used as a single batch.
print("Number of positions %s" % number_of_positions)
print(" Train once")
run_tests(get_batch_size(number_of_training_data), train)
print(" Train twice")
run_tests(get_batch_size(number_of_training_data), train_twice)
This design does not work well enough. I also tried outputting the operator to be used and making the output structure match the input structure. The results were equally bad.
Data
Number of positions 2
Train once 16 / 20
Train twice 16 / 20
Number of positions 3
Train once 56 / 84
Train twice 56 / 84
Number of positions 4
Train once 200 / 340
Train twice 200 / 340
Code
"""
Let's learn how to evaluate an arithmetic expression using '+', '-', '*' and '/'
"""
import itertools
import sys

import numpy as np
# The functions below use the ``tf`` name, so TensorFlow must be imported here.
import tensorflow as tf

# Passing any command-line argument turns on reporting of wrong predictions.
show_details = (len(sys.argv) > 1)
# no hidden layers or one hidden layer
one_level_version = True
# The output layer is either the operator that is selected ("operator")
# or the position in the input of that operator ("position").
#output_is = "operator"
output_is = "position"
# Maximum length of the expressions (number of operators).
number_of_positions = 4
# the output is the position that should be processed next
def train_data_output_is_position(number_of_positions):
    """Build the training set whose label is the position to process next.

    Every sequence over '+', '-', '*', '/' of length 1 up to
    ``number_of_positions`` is generated. The label of a sequence is the
    index of its first '*' or '/', or 0 when it contains neither.

    Returns a dict mapping each operator tuple to its integer label.
    """
    data = {}
    for length in range(1, number_of_positions + 1):
        for operators in itertools.product('+-*/', repeat=length):
            # The default of 0 covers sequences made only of '+' and '-'.
            data[operators] = next(
                (index for index, ch in enumerate(operators) if ch in '*/'),
                0,
            )
    return data
# the output is the operator that should be processed next
def train_data_output_is_operator(number_of_positions):
    """Build the training set whose label is the operator to process next.

    Every sequence over '+', '-', '*', '/' of length 1 up to
    ``number_of_positions`` is generated. The label is the index (see
    ``operator_to_position``) of the first '*' or '/' in the sequence; when
    the sequence has neither, it is the index of the leftmost operator.

    Returns a dict mapping each operator tuple to its integer label.
    """
    data = {}
    for length in range(1, number_of_positions + 1):
        for operators in itertools.product('+-*/', repeat=length):
            # Fall back to the leftmost operator rather than a fixed 0, which
            # would label a sequence starting with '-' as '+'.
            chosen = next((ch for ch in operators if ch in '*/'), operators[0])
            data[operators] = operator_to_position(chosen)
    return data


def operator_to_position(operator):
    """Return the index of ``operator`` in the order '+', '-', '*', '/'.

    Raises KeyError for any other value.
    """
    return {
        "+": 0,
        "-": 1,
        "*": 2,
        "/": 3,
    }[operator]
import sys

# Manual overrides that can be uncommented while experimenting.
#number_of_positions = 2
#output_is = "operator"

# Number of distinct operators: '+', '-', '*', '/'.
number_of_operators = 4

# Pick the training set and the width of the output layer for the chosen
# output encoding.
if output_is == "operator":
    training_data = train_data_output_is_operator(number_of_positions)
    number_of_outputs = number_of_operators
else:
    training_data = train_data_output_is_position(number_of_positions)
    number_of_outputs = number_of_positions
number_of_training_data = len(training_data)

# The input vector is a concatenation of sections; the only section holds one
# node per (position, operator) pair.
size_of_sections = [
    number_of_positions * number_of_operators
]
# Running offsets: the start of each section, then the end of the last one.
position_of_sections = [0]
last_position = 0
for size in size_of_sections:
    # Advance the running offset so later sections start after earlier ones.
    last_position += size
    position_of_sections.append(last_position)
number_of_inputs = sum(size_of_sections)
# setup input nodes for a word
def input_to_train(input, inputs):
    """Encode an operator sequence into ``inputs`` in place.

    input: sequence of operator characters.
    inputs: writable, indexable row; for the operator at position ``index``
        the slot ``index * number_of_operators + operator_to_position(op)``
        is set to 1. Other slots are left untouched.
    """
    for index, operator in enumerate(input):
        inputs[index * number_of_operators + operator_to_position(operator)] = 1
# do the training(get_batch_size)
def train(sess, get_batch_size, train_step, x, y_):
    """Make one pass over ``training_data``, running ``train_step`` per batch.

    sess: object whose ``run(train_step, feed_dict=...)`` performs one step.
    get_batch_size: callable taking the batch number and returning the size
        of that batch.
    train_step: the operation handed to ``sess.run``.
    x, y_: feed_dict keys for the input rows and the one-hot label rows.
    """
    batch_number = 0
    batch_size = get_batch_size(batch_number)
    rows = min(number_of_training_data, batch_size)
    x_array = np.zeros(shape=(rows, number_of_inputs), dtype=float)
    y_array = np.zeros(shape=(rows, number_of_outputs), dtype=float)
    index = 0
    to_do = number_of_training_data
    for operators, output in training_data.items():
        input_to_train(operators, x_array[index])
        y_array[index][output] = 1
        index += 1
        to_do -= 1
        # Flush when the batch is full or the data is exhausted. ``to_do == 0``
        # identifies the last example without rebuilding the key list on
        # every iteration.
        if index == batch_size or to_do == 0:
            batch_number += 1
            batch_size = get_batch_size(batch_number)
            index = 0
            sess.run(train_step, feed_dict={x: x_array, y_: y_array})
            # Fresh buffers sized for the next (possibly shorter) batch.
            rows = min(batch_size, to_do)
            x_array = np.zeros(shape=(rows, number_of_inputs), dtype=float)
            y_array = np.zeros(shape=(rows, number_of_outputs), dtype=float)
def train_twice(sess, get_batch_size, train_step, x, y_):
    """Call ``train`` two times in a row with the same arguments."""
    for _ in range(2):
        train(sess, get_batch_size, train_step, x, y_)
def run_test_one_word(sess, x, y, word):
    """Return the index of the highest-scoring output for one operator sequence.

    sess: session used to evaluate ``y``.
    x: feed_dict key for the input row.
    y: the model output evaluated by ``sess.run``.
    word: operator sequence, encoded with ``input_to_train``.
    """
    x_array = np.zeros(shape=(1, number_of_inputs), dtype=float)
    input_to_train(word, x_array[0])
    # Take the argmax in NumPy on the evaluated scores; building
    # ``tf.argmax(y, 1)`` here would add a new op to the graph on every call.
    scores = sess.run(y, feed_dict={x: x_array})
    return np.argmax(scores, axis=1)[0]
def run_test_for_case(sess, x, y):
    """Score the model on every entry of ``training_data``.

    Each operator sequence is classified with ``run_test_one_word`` and the
    prediction is compared with its stored label. When ``show_details`` is
    set, every wrong prediction is printed with the expected label.

    Returns a tuple ``(n_right, n_checked)``.
    """
    n_right = 0
    n_checked = 0
    for operators, output in training_data.items():
        prediction = run_test_one_word(sess, x, y, operators)
        is_right = prediction == output
        if show_details and not is_right:
            # Report the expected label; the sequence itself is already the
            # first field. No debugger breakpoint here, so the script can run
            # unattended.
            print("Wrong %s found %s was %s" % (operators, prediction, output))
        if is_right:
            n_right += 1
        n_checked += 1
    return (n_right, n_checked)
def run_test(words, get_batch_size, train):
    """Build the model, train it, and score it on the training data.

    words: unused; kept so existing callers keep working.
    get_batch_size: forwarded unchanged to ``train``.
    train: callable ``(sess, get_batch_size, train_step, x, y_)`` that does
        the training.

    Returns a dict with the single key "" mapped to the
    ``(n_right, n_checked)`` tuple from ``run_test_for_case``.
    """
    # A private graph and a context-managed session keep repeated calls from
    # piling variables into the default graph, and release the session on exit.
    with tf.Graph().as_default(), tf.Session() as sess:
        x = tf.placeholder(tf.float32, shape=[None, number_of_inputs])
        y_ = tf.placeholder(tf.float32, shape=[None, number_of_outputs])
        if one_level_version:
            W = tf.Variable(tf.zeros([number_of_inputs, number_of_outputs]))
            b = tf.Variable(tf.zeros([number_of_outputs]))
            logits = tf.matmul(x, W) + b
        else:
            h_size = 20
            # Random start values: with both matrices at zero the gradient of
            # each one is multiplied by the other and stays zero.
            # NOTE(review): there is no activation between the two matmuls,
            # so this variant is still a linear model.
            W = tf.Variable(tf.truncated_normal([number_of_inputs, h_size], stddev=0.1))
            H = tf.Variable(tf.truncated_normal([h_size, number_of_outputs], stddev=0.1))
            b = tf.Variable(tf.zeros([number_of_outputs]))
            logits = tf.matmul(tf.matmul(x, W), H) + b
        y = tf.nn.softmax(logits)
        # Cross-entropy computed from the logits, avoiding log(0) on a
        # saturated softmax.
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
        train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
        sess.run(tf.initialize_all_variables())
        train(sess, get_batch_size, train_step, x, y_)
        return {"": run_test_for_case(sess, x, y)}
def run_tests(get_batch_size, train):
    """Run ``run_test`` on ``training_data`` and print one line per result.

    get_batch_size, train: forwarded unchanged to ``run_test``.
    Each printed line shows the result key followed by "right / checked".
    """
    results = run_test(training_data, get_batch_size, train)
    for key, (n_right, n_checked) in results.items():
        # Single-argument print call: same output on Python 2 and Python 3.
        print("%50s %d / %d" % (key, n_right, n_checked))
def get_batch_size(batch_size=None):
    """Return a function that maps a batch number to the batch size to use.

    batch_size: size returned for every batch. ``None`` means one batch
        holding the whole training set (``number_of_training_data``).
    """
    def get_batch_size_2(batch_number):
        # Honor the requested size; previously the argument was ignored.
        if batch_size is None:
            return number_of_training_data
        return batch_size
    return get_batch_size_2
# Driver: train and score the model, first with one pass and then with two.
# ``get_batch_size`` is a factory, so it is called here to obtain the
# batch-number -> batch-size function that ``train`` expects. The whole
# training set is used as a single batch.
print("Number of positions %s" % number_of_positions)
print(" Train once")
run_tests(get_batch_size(number_of_training_data), train)
print(" Train twice")
run_tests(get_batch_size(number_of_training_data), train_twice)
Subscribe to:
Posts (Atom)