Monday, 28 January 2019

Implementation - How to Develop a Neural Machine Translation System from Scratch



This is an implementation of Jason Brownlee's program: Trans.py cleans the raw English-German sentence pairs, Trans2.py splits them into train and test sets, Trans3.py trains the encoder-decoder model, and Trans4.py evaluates it with BLEU scores.
You can view the results at the bottom.

Trans.py
# -*- coding: utf-8 -*-
"""
Created on Sat Dec  8 22:37:54 2018

@author: Jason BROWNLEE
"""
import os; os.environ['KERAS_BACKEND'] = 'theano'
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# split a loaded document into English-German pairs (one tab-separated pair per line)
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars from each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-german.pkl')
# spot check
for i in range(10):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))
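
To double-check the saved pickle, it can be reloaded and inspected right away; a minimal sketch, not part of the original script, assuming english-german.pkl sits in the working directory:

from pickle import load

# reload the cleaned pairs and look at the first entry
pairs = load(open('english-german.pkl', 'rb'))
print(pairs.shape)
print(pairs[0, 0], '=>', pairs[0, 1])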
-----------------------------------------------------------------------------------------------------
Trans2.py
# -*- coding: utf-8 -*-
"""
Created on Sat Dec  8 23:02:11 2018

@author: ars
"""
import os; os.environ['KERAS_BACKEND'] = 'theano'
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')
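
Note that shuffle() is not seeded, so the 9000/1000 split changes from run to run. If a reproducible split is wanted, the NumPy random generator can be seeded before the shuffle; a small sketch, not in the original script:

from numpy.random import seed, shuffle

seed(1)            # any fixed value makes the shuffle repeatable
shuffle(dataset)   # now the train/test split is the same on every run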
-------------------------------------------------------------------------
Trans3.py
# -*- coding: utf-8 -*-
"""
Created on Sat Dec  8 23:05:25 2018

@author: ars
"""
import os; os.environ['KERAS_BACKEND'] = 'theano'
#os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector

from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

import theano
#print(theano.config)
import theano.tensor
# point Theano at the MinGW g++ compiler bundled with Anaconda (machine-specific path)
theano.config.cxx = "C:\\Users\\ars\\Anaconda3\\Library\\mingw-w64\\bin\\g++.exe"

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

# define NMT model (Neural Machine Translation)
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)
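
One possible variation: encode_output() builds a large one-hot tensor of shape (samples, timesteps, vocabulary), which can strain memory for bigger vocabularies. A sketch of training on integer targets with sparse_categorical_crossentropy instead is shown below; whether the trailing dimension is needed depends on the Keras version and backend, so treat it as an assumption to verify:

# sketch: integer targets instead of one-hot (assumes this Keras/Theano
# combination accepts sparse targets of shape (samples, timesteps, 1))
from numpy import expand_dims

trainY_int = expand_dims(encode_sequences(eng_tokenizer, eng_length, train[:, 0]), -1)
testY_int = expand_dims(encode_sequences(eng_tokenizer, eng_length, test[:, 0]), -1)

model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit(trainX, trainY_int, epochs=30, batch_size=64,
          validation_data=(testX, testY_int), callbacks=[checkpoint], verbose=2)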
------------------------------------------------------------------
Trans4.py
# -*- coding: utf-8 -*-
"""
Created on Sun Dec  9 20:50:52 2018

@author: ars
"""
import os; os.environ['KERAS_BACKEND'] = 'theano'
#import os
#os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
from pickle import load
#from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
#import theano
#import theano.tensor
#theano.config.cxx=""
import theano
#print(theano.config)
import theano.tensor
# point Theano at the MinGW g++ compiler bundled with Anaconda (machine-specific path)
theano.config.cxx = "C:\\Users\\ars\\Anaconda3\\Library\\mingw-w64\\bin\\g++.exe"

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects, for each hypothesis, a list of reference token lists
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model
model = load_model('model.h5')
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)
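
With the model and tokenizers in memory, a single new sentence can be translated using the same helpers; a minimal sketch (the input must be cleaned the same way as the training data: lowercase, no punctuation, ASCII only):

# sketch: translate one cleaned German sentence (example taken from the spot check above)
line = ['ich werde lernen']
source = encode_sequences(ger_tokenizer, ger_length, line)
print(predict_sequence(model, eng_tokenizer, source))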

-----------------------------------------------------------------------------------------
OUTPUTS:
English Vocabulary Size: 2309
English Max Length: 5
German Vocabulary Size: 3657
German Max Length: 10

Neural Network Structure
_________________________________________________________________
Layer (type)                 Output Shape              Param #  
=================================================================
embedding_2 (Embedding)      (None, 10, 256)           936192   
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312   
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 5, 256)            0        
_________________________________________________________________
lstm_4 (LSTM)                (None, 5, 256)            525312   
_________________________________________________________________
time_distributed_2 (TimeDist (None, 5, 2309)           593413   
=================================================================
Total params: 2,580,229
Trainable params: 2,580,229
Non-trainable params: 0
_________________________________________________________________
None
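
The parameter counts in the summary can be verified by hand; a quick check, using the vocabulary sizes reported above:

# Embedding: 3657 * 256                    = 936,192
# LSTM (x2): 4 * ((256 + 256 + 1) * 256)   = 525,312 each
# TimeDistributed(Dense): (256 + 1) * 2309 = 593,413
print(3657 * 256)                       # 936192
print(4 * ((256 + 256 + 1) * 256))      # 525312
print((256 + 1) * 2309)                 # 593413
print(936192 + 2 * 525312 + 593413)     # 2580229, matching "Total params"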

Train on 9000 samples, validate on 1000 samples

Epoch 1/30
 - 76s - loss: 4.2931 - val_loss: 3.5462

Epoch 00001: val_loss improved from inf to 3.54621, saving model to model.h5
Epoch 2/30
 - 85s - loss: 3.3974 - val_loss: 3.4251

Epoch 00002: val_loss improved from 3.54621 to 3.42512, saving model to model.h5
Epoch 3/30
 - 87s - loss: 3.2527 - val_loss: 3.3515

Epoch 00003: val_loss improved from 3.42512 to 3.35154, saving model to model.h5
Epoch 4/30
 - 89s - loss: 3.1077 - val_loss: 3.2153

...
...
...
Epoch 27/30
 - 91s - loss: 0.5958 - val_loss: 1.9781

Epoch 00027: val_loss improved from 1.97907 to 1.97811, saving model to model.h5
Epoch 28/30
 - 92s - loss: 0.5441 - val_loss: 1.9675

Epoch 00028: val_loss improved from 1.97811 to 1.96752, saving model to model.h5
Epoch 29/30
 - 93s - loss: 0.4995 - val_loss: 1.9564

Epoch 00029: val_loss improved from 1.96752 to 1.95638, saving model to model.h5
Epoch 30/30
 - 92s - loss: 0.4564 - val_loss: 1.9632

Epoch 00030: val_loss did not improve from 1.95638

PREDICT
runfile('C:/Users/ars/.spyder-py3/trans4.py', wdir='C:/Users/ars/.spyder-py3')

Test with data that has been used in the training phase.
train
src=[ich fuhle mich schuldig], target=[i feel guilty], predicted=[i feel guilty]
src=[tom hat das bewusstsein verloren], target=[tom passed out], predicted=[tom passed out]
src=[ich werde lernen], target=[i will learn], predicted=[i will learn]
src=[mir geht es gut], target=[im doing okay], predicted=[im doing well]
src=[ich bin zimmermann], target=[im a carpenter], predicted=[im a carpenter]
src=[stell es zuruck], target=[put it back], predicted=[put it back]
src=[es ist bewundernswert], target=[its admirable], predicted=[its admirable]
src=[ich habe das interesse verloren], target=[i lost interest], predicted=[i lost interest]
src=[tom hat sich gefugt], target=[tom obeyed], predicted=[tom obeyed]
src=[ich kann gut kochen], target=[im a good cook], predicted=[i may stay stay]
C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  warnings.warn(_msg)
C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  warnings.warn(_msg)
C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  warnings.warn(_msg)
BLEU-1: 0.077905
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000

Test with data that has never been introduced to the network before.
test
src=[ich habs im fernsehen gesehen], target=[i saw it on tv], predicted=[i saw it too]
src=[kommt zuruck nach hause], target=[come back home], predicted=[come back home]
src=[entlassen wir tom], target=[lets fire tom], predicted=[let tom tom]
src=[er lie einen drachen steigen], target=[he flew a kite], predicted=[hes seems dead]
src=[ich werde mit dem taxi fahren], target=[ill go by taxi], predicted=[ill get by day]
src=[ist es weit weg], target=[is it far away], predicted=[is it do it]
src=[ich bezahlte die rechnung], target=[i paid the bill], predicted=[i saved you]
src=[tom mag schnee], target=[tom likes snow], predicted=[tom likes snow]
src=[es ist], target=[its], predicted=[its important]
src=[schlafst du], target=[are you asleep], predicted=[do you]
BLEU-1: 0.081552
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
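
As the warnings above explain, BLEU-2/3/4 collapse to zero whenever no higher-order n-gram overlaps are counted. Passing an NLTK SmoothingFunction to corpus_bleu gives smoothed, non-zero scores in that situation; a short sketch of how the BLEU lines inside evaluate_model() could be extended:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# inside evaluate_model(), after actual and predicted are built:
smooth = SmoothingFunction().method1   # adds a small epsilon to zero-count n-gram precisions
print('BLEU-4 (smoothed): %f' % corpus_bleu(actual, predicted,
      weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth))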