TEKNE - TECHNE: January 2019

IMPLEMENTATION: How to Develop a Neural Machine Translation System from Scratch

This is an implementation of Jason BROWNLEE’s program:

https://machinelearningmastery.com/develop-neural-machine-translation-system-keras/

You can view the results at the bottom.

Trans.py

# -*- coding: utf-8 -*-

"""

Created on Sat Dec 8 22:37:54 2018

@author: Jason BROWNLEE

"""

import os; os.environ['KERAS_BACKEND'] = 'theano'

import string

import re

from pickle import dump

from unicodedata import normalize

from numpy import array

# load doc into memory

def load_doc(filename):

# open the file as read only

file = open(filename, mode='rt', encoding='utf-8')

# read all text

text = file.read()

# close the file

file.close()

return text

# split a loaded document into sentences

def to_pairs(doc):

lines = doc.strip().split('\n')

pairs = [line.split('\t') for line in lines]

return pairs

# clean a list of lines

def clean_pairs(lines):

cleaned = list()

# prepare regex for char filtering

re_print = re.compile('[^%s]' % re.escape(string.printable))

# prepare translation table for removing punctuation

table = str.maketrans('', '', string.punctuation)

for pair in lines:

clean_pair = list()

for line in pair:

# normalize unicode characters

line = normalize('NFD', line).encode('ascii', 'ignore')

line = line.decode('UTF-8')

# tokenize on white space

line = line.split()

# convert to lowercase

line = [word.lower() for word in line]

# remove punctuation from each token

line = [word.translate(table) for word in line]

# remove non-printable chars form each token

line = [re_print.sub('', w) for w in line]

# remove tokens with numbers in them

line = [word for word in line if word.isalpha()]

# store as string

clean_pair.append(' '.join(line))

cleaned.append(clean_pair)

return array(cleaned)

# save a list of clean sentences to file

def save_clean_data(sentences, filename):

dump(sentences, open(filename, 'wb'))

print('Saved: %s' % filename)

# load dataset

filename = 'deu.txt'

doc = load_doc(filename)

# split into english-german pairs

pairs = to_pairs(doc)

# clean sentences

clean_pairs = clean_pairs(pairs)

# save clean pairs to file

save_clean_data(clean_pairs, 'english-german.pkl')

# spot check

for i in range(10):

print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

-----------------------------------------------------------------------------------------------------

Trans2.py

# -*- coding: utf-8 -*-

"""

Created on Sat Dec 8 23:02:11 2018

@author: ars

"""

import os; os.environ['KERAS_BACKEND'] = 'theano'

from pickle import load

from pickle import dump

from numpy.random import rand

from numpy.random import shuffle

# load a clean dataset

def load_clean_sentences(filename):

return load(open(filename, 'rb'))

# save a list of clean sentences to file

def save_clean_data(sentences, filename):

dump(sentences, open(filename, 'wb'))

print('Saved: %s' % filename)

# load dataset

raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size

n_sentences = 10000

dataset = raw_dataset[:n_sentences, :]

# random shuffle

shuffle(dataset)

# split into train/test

train, test = dataset[:9000], dataset[9000:]

# save

save_clean_data(dataset, 'english-german-both.pkl')

save_clean_data(train, 'english-german-train.pkl')

save_clean_data(test, 'english-german-test.pkl')

-------------------------------------------------------------------------

Trans3.py

# -*- coding: utf-8 -*-

"""

Created on Sat Dec 8 23:05:25 2018

@author: ars

"""

import os; os.environ['KERAS_BACKEND'] = 'theano'

#os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

from pickle import load

from numpy import array

from keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences

from keras.utils import to_categorical

from keras.utils.vis_utils import plot_model

from keras.models import Sequential

from keras.layers import LSTM

from keras.layers import Dense

from keras.layers import Embedding

from keras.layers import RepeatVector

from keras.layers import TimeDistributed

from keras.callbacks import ModelCheckpoint

import theano;

#print(theano.config)

import theano.tensor;

theano.config.cxx="C:\\Users\\ars\\Anaconda3\\Library\\mingw-w64\\bin\\g++.exe"

# load a clean dataset

def load_clean_sentences(filename):

return load(open(filename, 'rb'))

# fit a tokenizer

def create_tokenizer(lines):

tokenizer = Tokenizer()

tokenizer.fit_on_texts(lines)

return tokenizer

# max sentence length

def max_length(lines):

return max(len(line.split()) for line in lines)

# encode and pad sequences

def encode_sequences(tokenizer, length, lines):

# integer encode sequences

X = tokenizer.texts_to_sequences(lines)

# pad sequences with 0 values

X = pad_sequences(X, maxlen=length, padding='post')

return X

# one hot encode target sequence

def encode_output(sequences, vocab_size):

ylist = list()

for sequence in sequences:

encoded = to_categorical(sequence, num_classes=vocab_size)

ylist.append(encoded)

y = array(ylist)

y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

return y

# define NMT model (Neural Machine Translation)

def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):

model = Sequential()

model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))

model.add(LSTM(n_units))

model.add(RepeatVector(tar_timesteps))

model.add(LSTM(n_units, return_sequences=True))

model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))

return model

# load datasets

dataset = load_clean_sentences('english-german-both.pkl')

train = load_clean_sentences('english-german-train.pkl')

test = load_clean_sentences('english-german-test.pkl')

# prepare english tokenizer

eng_tokenizer = create_tokenizer(dataset[:, 0])

eng_vocab_size = len(eng_tokenizer.word_index) + 1

eng_length = max_length(dataset[:, 0])

print('English Vocabulary Size: %d' % eng_vocab_size)

print('English Max Length: %d' % (eng_length))

# prepare german tokenizer

ger_tokenizer = create_tokenizer(dataset[:, 1])

ger_vocab_size = len(ger_tokenizer.word_index) + 1

ger_length = max_length(dataset[:, 1])

print('German Vocabulary Size: %d' % ger_vocab_size)

print('German Max Length: %d' % (ger_length))

# prepare training data

trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])

trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

trainY = encode_output(trainY, eng_vocab_size)

# prepare validation data

testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

testY = encode_output(testY, eng_vocab_size)

# define model

model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)

model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model

print(model.summary())

plot_model(model, to_file='model.png', show_shapes=True)

# fit model

filename = 'model.h5'

checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

------------------------------------------------------------------

Trans4.py

# -*- coding: utf-8 -*-

"""

Created on Sun Dec 9 20:50:52 2018

@author: ars

"""

import os; os.environ['KERAS_BACKEND'] = 'theano'

#import os

#os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

from pickle import load

#from numpy import array

from numpy import argmax

from keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences

from keras.models import load_model

from nltk.translate.bleu_score import corpus_bleu

#import theano

#import theano.tensor

#theano.config.cxx=""

import theano;

#print(theano.config)

import theano.tensor;

theano.config.cxx="C:\\Users\\ars\\Anaconda3\\Library\\mingw-w64\\bin\\g++.exe"

# load a clean dataset

def load_clean_sentences(filename):

return load(open(filename, 'rb'))

# fit a tokenizer

def create_tokenizer(lines):

tokenizer = Tokenizer()

tokenizer.fit_on_texts(lines)

return tokenizer

# max sentence length

def max_length(lines):

return max(len(line.split()) for line in lines)

# encode and pad sequences

def encode_sequences(tokenizer, length, lines):

# integer encode sequences

X = tokenizer.texts_to_sequences(lines)

# pad sequences with 0 values

X = pad_sequences(X, maxlen=length, padding='post')

return X

# map an integer to a word

def word_for_id(integer, tokenizer):

for word, index in tokenizer.word_index.items():

if index == integer:

return word

return None

# generate target given source sequence

def predict_sequence(model, tokenizer, source):

prediction = model.predict(source, verbose=0)[0]

integers = [argmax(vector) for vector in prediction]

target = list()

for i in integers:

word = word_for_id(i, tokenizer)

if word is None:

break

target.append(word)

return ' '.join(target)

# evaluate the skill of the model

def evaluate_model(model, tokenizer, sources, raw_dataset):

actual, predicted = list(), list()

for i, source in enumerate(sources):

# translate encoded source text

source = source.reshape((1, source.shape[0]))

translation = predict_sequence(model, eng_tokenizer, source)

raw_target, raw_src = raw_dataset[i]

if i < 10:

print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))

actual.append(raw_target.split())

predicted.append(translation.split())

# calculate BLEU score

print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))

print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))

print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets

dataset = load_clean_sentences('english-german-both.pkl')

train = load_clean_sentences('english-german-train.pkl')

test = load_clean_sentences('english-german-test.pkl')

# prepare english tokenizer

eng_tokenizer = create_tokenizer(dataset[:, 0])

eng_vocab_size = len(eng_tokenizer.word_index) + 1

eng_length = max_length(dataset[:, 0])

# prepare german tokenizer

ger_tokenizer = create_tokenizer(dataset[:, 1])

ger_vocab_size = len(ger_tokenizer.word_index) + 1

ger_length = max_length(dataset[:, 1])

# prepare data

trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])

testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# load model

model = load_model('model.h5')

# test on some training sequences

print('train')

evaluate_model(model, eng_tokenizer, trainX, train)

# test on some test sequences

print('test')

evaluate_model(model, eng_tokenizer, testX, test)

-----------------------------------------------------------------------------------------

OUTPUTS:

English Vocabulary Size: 2309

English Max Length: 5

German Vocabulary Size: 3657

German Max Length: 10

Neural Network Structureeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee

_________________________________________________________________

Layer (type) Output Shape Param #

=================================================================

embedding_2 (Embedding) (None, 10, 256) 936192

_________________________________________________________________

lstm_3 (LSTM) (None, 256) 525312

_________________________________________________________________

repeat_vector_2 (RepeatVecto (None, 5, 256) 0

_________________________________________________________________

lstm_4 (LSTM) (None, 5, 256) 525312

_________________________________________________________________

time_distributed_2 (TimeDist (None, 5, 2309) 593413

=================================================================

Total params: 2,580,229

Trainable params: 2,580,229

Non-trainable params: 0

_________________________________________________________________

None

Train on 9000 samples, validate on 1000 samplessssssssssssssssssssssssssssssssssssss

Epoch 1/30

- 76s - loss: 4.2931 - val_loss: 3.5462

Epoch 00001: val_loss improved from inf to 3.54621, saving model to model.h5

Epoch 2/30

- 85s - loss: 3.3974 - val_loss: 3.4251

Epoch 00002: val_loss improved from 3.54621 to 3.42512, saving model to model.h5

Epoch 3/30

- 87s - loss: 3.2527 - val_loss: 3.3515

Epoch 00003: val_loss improved from 3.42512 to 3.35154, saving model to model.h5

Epoch 4/30

- 89s - loss: 3.1077 - val_loss: 3.2153

...

Epoch 27/30

- 91s - loss: 0.5958 - val_loss: 1.9781

Epoch 00027: val_loss improved from 1.97907 to 1.97811, saving model to model.h5

Epoch 28/30

- 92s - loss: 0.5441 - val_loss: 1.9675

Epoch 00028: val_loss improved from 1.97811 to 1.96752, saving model to model.h5

Epoch 29/30

- 93s - loss: 0.4995 - val_loss: 1.9564

Epoch 00029: val_loss improved from 1.96752 to 1.95638, saving model to model.h5

Epoch 30/30

- 92s - loss: 0.4564 - val_loss: 1.9632

Epoch 00030: val_loss did not improve from 1.95638

PREDICTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTt

runfile('C:/Users/ars/.spyder-py3/trans4.py', wdir='C:/Users/ars/.spyder-py3')

test with data that has been used in the training phase.

trainnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn

src=[ich fuhle mich schuldig], target=[i feel guilty], predicted=[i feel guilty]

src=[tom hat das bewusstsein verloren], target=[tom passed out], predicted=[tom passed out]

src=[ich werde lernen], target=[i will learn], predicted=[i will learn]

src=[mir geht es gut], target=[im doing okay], predicted=[im doing well]

src=[ich bin zimmermann], target=[im a carpenter], predicted=[im a carpenter]

src=[stell es zuruck], target=[put it back], predicted=[put it back]

src=[es ist bewundernswert], target=[its admirable], predicted=[its admirable]

src=[ich habe das interesse verloren], target=[i lost interest], predicted=[i lost interest]

src=[tom hat sich gefugt], target=[tom obeyed], predicted=[tom obeyed]

src=[ich kann gut kochen], target=[im a good cook], predicted=[i may stay stay]

C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:

The hypothesis contains 0 counts of 2-gram overlaps.

Therefore the BLEU score evaluates to 0, independently of

how many N-gram overlaps of lower order it contains.

Consider using lower n-gram order or use SmoothingFunction()

warnings.warn(_msg)

C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:

The hypothesis contains 0 counts of 3-gram overlaps.

Therefore the BLEU score evaluates to 0, independently of

how many N-gram overlaps of lower order it contains.

Consider using lower n-gram order or use SmoothingFunction()

warnings.warn(_msg)

C:\Users\ars\Anaconda3\lib\site-packages\nltk\translate\bleu_score.py:503: UserWarning:

The hypothesis contains 0 counts of 4-gram overlaps.

Therefore the BLEU score evaluates to 0, independently of

how many N-gram overlaps of lower order it contains.

Consider using lower n-gram order or use SmoothingFunction()

warnings.warn(_msg)

BLEU-1: 0.077905

BLEU-2: 0.000000

BLEU-3: 0.000000

BLEU-4: 0.000000

test with data that has never been introduced to the network before.

testttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttt

src=[ich habs im fernsehen gesehen], target=[i saw it on tv], predicted=[i saw it too]

src=[kommt zuruck nach hause], target=[come back home], predicted=[come back home]

src=[entlassen wir tom], target=[lets fire tom], predicted=[let tom tom]

src=[er lie einen drachen steigen], target=[he flew a kite], predicted=[hes seems dead]

src=[ich werde mit dem taxi fahren], target=[ill go by taxi], predicted=[ill get by day]

src=[ist es weit weg], target=[is it far away], predicted=[is it do it]

src=[ich bezahlte die rechnung], target=[i paid the bill], predicted=[i saved you]

src=[tom mag schnee], target=[tom likes snow], predicted=[tom likes snow]

src=[es ist], target=[its], predicted=[its important]

src=[schlafst du], target=[are you asleep], predicted=[do you]

BLEU-1: 0.081552

BLEU-2: 0.000000

BLEU-3: 0.000000

BLEU-4: 0.000000

TEKNE - TECHNE

INFORMATION

Blog Archive

About Me

Monday, 28 January 2019

Implementation - How to Develop a Neural Machine Translation System from Scratch

Tuesday, 22 January 2019

Implementing (How to run Keras model on Movidius neural compute stick)

‘Source code for this post available on my GitHub repo - keras_mnist.’

Monday, 21 January 2019

image classifying with Movidius Neural Compute Stick