IMPLEMENTATION: How to Develop a Neural Machine Translation System from Scratch
This is an implementation of Jason Brownlee's program. You can view the results at the bottom.
Trans.py
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 8 22:37:54 2018
@author: Jason Brownlee
"""
import os
os.environ['KERAS_BACKEND'] = 'theano'
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars from each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')
# spot check
for i in range(10):
    print('[%s] => [%s]' % (clean_pairs[i, 0], clean_pairs[i, 1]))
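A quick sanity check (my addition, not part of the original program) is to reload the saved pickle and confirm the cleaned pairs round-trip; this assumes Trans.py above has already written english-german.pkl to the working directory:

from pickle import load

# reload the cleaned pairs written by Trans.py
pairs = load(open('english-german.pkl', 'rb'))
print(pairs.shape)                     # roughly (number of pairs, 2): [english, german]
print(pairs[0, 0], '=>', pairs[0, 1])  # first cleaned pair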
-----------------------------------------------------------------------------------------------------
Trans2.py
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 8 23:02:11 2018
@author: ars
"""
import os
os.environ['KERAS_BACKEND'] = 'theano'
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')
# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')
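A small check of the split (my addition, assuming the three pickles above were just written) confirms the 9000/1000 partition:

from pickle import load

both = load(open('english-german-both.pkl', 'rb'))
train = load(open('english-german-train.pkl', 'rb'))
test = load(open('english-german-test.pkl', 'rb'))
# expected: (10000, 2) (9000, 2) (1000, 2)
print(both.shape, train.shape, test.shape)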
-------------------------------------------------------------------------
Trans3.py
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 8 23:05:25 2018
@author: ars
"""
import os
os.environ['KERAS_BACKEND'] = 'theano'
#os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
import theano
import theano.tensor
theano.config.cxx = "C:\\Users\\ars\\Anaconda3\\Library\\mingw-w64\\bin\\g++.exe"

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

# define NMT model (Neural Machine Translation)
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % eng_length)
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % ger_length)
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)
# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY),
          callbacks=[checkpoint], verbose=2)
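One practical note: encode_output materializes the targets as a (samples, eng_length, eng_vocab_size) one-hot array, which grows quickly with vocabulary size. An alternative, which is not what the script above does but is sketched here under the assumption that the variables defined above are in scope, is to keep integer targets and train with sparse_categorical_crossentropy:

from numpy import expand_dims

# sketch only: integer-encoded English targets, skipping the one-hot encode_output step
trainY_int = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
testY_int = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
# sparse targets need a trailing axis of size 1: (samples, eng_length, 1)
model.fit(trainX, expand_dims(trainY_int, -1), epochs=30, batch_size=64,
          validation_data=(testX, expand_dims(testY_int, -1)), verbose=2)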
------------------------------------------------------------------
Trans4.py
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 9 20:50:52 2018
@author: ars
"""
import os
os.environ['KERAS_BACKEND'] = 'theano'
from pickle import load
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
import theano
import theano.tensor
theano.config.cxx = "C:\\Users\\ars\\Anaconda3\\Library\\mingw-w64\\bin\\g++.exe"

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)
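As a small usage sketch (my addition, assuming the model and tokenizers above are in scope), a single new German sentence can be translated the same way evaluate_model does it; the input must already be in the cleaned form (lowercase, no punctuation, no umlauts):

# hypothetical example input in cleaned form
sentence = ['ich bin mude']
source = encode_sequences(ger_tokenizer, ger_length, sentence)  # shape (1, ger_length)
print(predict_sequence(model, eng_tokenizer, source))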
-----------------------------------------------------------------------------------------
OUTPUTS:
English Vocabulary Size: 2309
English Max Length: 5
German Vocabulary Size: 3657
German Max Length: 10
Neural Network Structure:
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_2 (Embedding)      (None, 10, 256)           936192
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 5, 256)            0
_________________________________________________________________
lstm_4 (LSTM)                (None, 5, 256)            525312
_________________________________________________________________
time_distributed_2 (TimeDist (None, 5, 2309)           593413
=================================================================
Total params: 2,580,229
Trainable params: 2,580,229
Non-trainable params: 0
_________________________________________________________________
None
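The parameter counts in the summary can be verified by hand from the layer shapes:
Embedding: ger_vocab_size x n_units = 3657 x 256 = 936,192
encoder LSTM: 4 x (n_units x (n_units + input_dim) + n_units) = 4 x (256 x (256 + 256) + 256) = 525,312
decoder LSTM: same formula with input_dim = 256, again 525,312 (RepeatVector adds no parameters)
TimeDistributed Dense: n_units x eng_vocab_size + eng_vocab_size = 256 x 2309 + 2309 = 593,413
Total: 936,192 + 525,312 + 525,312 + 593,413 = 2,580,229, matching the summary.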
Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 76s - loss: 4.2931 - val_loss: 3.5462
Epoch 00001: val_loss improved from inf to 3.54621, saving model to model.h5
Epoch 2/30
 - 85s - loss: 3.3974 - val_loss: 3.4251
Epoch 00002: val_loss improved from 3.54621 to 3.42512, saving model to model.h5
Epoch 3/30
 - 87s - loss: 3.2527 - val_loss: 3.3515
Epoch 00003: val_loss improved from 3.42512 to 3.35154, saving model to model.h5
Epoch 4/30
 - 89s - loss: 3.1077 - val_loss: 3.2153
...
Epoch 27/30
 - 91s - loss: 0.5958 - val_loss: 1.9781
Epoch 00027: val_loss improved from 1.97907 to 1.97811, saving model to model.h5
Epoch 28/30
 - 92s - loss: 0.5441 - val_loss: 1.9675
Epoch 00028: val_loss improved from 1.97811 to 1.96752, saving model to model.h5
Epoch 29/30
 - 93s - loss: 0.4995 - val_loss: 1.9564
Epoch 00029: val_loss improved from 1.96752 to 1.95638, saving model to model.h5
Epoch 30/30
 - 92s - loss: 0.4564 - val_loss: 1.9632
Epoch 00030: val_loss did not improve from 1.95638
PREDICT
runfile('C:/Users/ars/.spyder-py3/trans4.py', wdir='C:/Users/ars/.spyder-py3')
Test with data that has been used in the training phase:
train
src=[ich fuhle mich schuldig], target=[i feel guilty], predicted=[i feel guilty]
src=[tom hat das bewusstsein verloren], target=[tom passed out], predicted=[tom passed out]
src=[ich werde lernen], target=[i will learn], predicted=[i will learn]
src=[mir geht es gut], target=[im doing okay], predicted=[im doing well]
src=[ich bin zimmermann], target=[im a carpenter], predicted=[im a carpenter]
src=[stell es zuruck], target=[put it back], predicted=[put it back]
src=[es ist bewundernswert], target=[its admirable], predicted=[its admirable]
src=[ich habe das interesse verloren], target=[i lost interest], predicted=[i lost interest]
src=[tom hat sich gefugt], target=[tom obeyed], predicted=[tom obeyed]
src=[ich kann gut kochen], target=[im a good cook], predicted=[i may stay stay]
C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  warnings.warn(_msg)
C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  warnings.warn(_msg)
C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  warnings.warn(_msg)
BLEU-1: 0.077905
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
Test with data that has never been introduced to the network before:
test
src=[ich habs im fernsehen gesehen], target=[i saw it on tv], predicted=[i saw it too]
src=[kommt zuruck nach hause], target=[come back home], predicted=[come back home]
src=[entlassen wir tom], target=[lets fire tom], predicted=[let tom tom]
src=[er lie einen drachen steigen], target=[he flew a kite], predicted=[hes seems dead]
src=[ich werde mit dem taxi fahren], target=[ill go by taxi], predicted=[ill get by day]
src=[ist es weit weg], target=[is it far away], predicted=[is it do it]
src=[ich bezahlte die rechnung], target=[i paid the bill], predicted=[i saved you]
src=[tom mag schnee], target=[tom likes snow], predicted=[tom likes snow]
src=[es ist], target=[its], predicted=[its important]
src=[schlafst du], target=[are you asleep], predicted=[do you]
BLEU-1: 0.081552
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
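A note on these scores: corpus_bleu expects, for each hypothesis, a list of reference token lists, i.e. actual.append([raw_target.split()]). Because evaluate_model appends raw_target.split() directly, NLTK treats each word string as a separate reference and compares character n-grams against word n-grams, which is the likely reason BLEU-1 stays near 0.08 even on training sentences that are predicted verbatim, and why the higher-order scores collapse to zero (hence the warnings). A minimal sketch of the corrected scoring, with the smoothing the warnings suggest (my addition; actual and predicted are the lists built inside evaluate_model):

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# wrap each reference token list in a list: one hypothesis may have several references
references = [[ref] for ref in actual]
smoothie = SmoothingFunction().method1  # adds a small epsilon to zero n-gram precisions
print('BLEU-1: %f' % corpus_bleu(references, predicted, weights=(1.0, 0, 0, 0)))
print('BLEU-4: %f' % corpus_bleu(references, predicted,
      weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))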