% pretrainRBM: pre-train the restricted Boltzmann machine (RBM) model by contrastive divergence learning
%
% rbm = pretrainRBM(rbm, V, opts)
%
%
%Output parameters:
% rbm: the restricted boltzmann machine (RBM) model
%
%
%Input parameters:
% rbm: the initial restricted Boltzmann machine (RBM) model
% V: visible (input) variables, where the number of rows is the number of data samples and the number of columns is the number of visible (input) nodes
% opts (optional): options (see the second call in the example below)
%
% options (default value):
% opts.MaxIter: Maximum number of iterations (100)
% opts.InitialMomentum: Initial momentum until InitialMomentumIter (0.5)
% opts.InitialMomentumIter: Iteration number for initial momentum (5)
% opts.FinalMomentum: Final momentum after InitialMomentumIter (0.9)
% opts.WeightCost: Weight cost (0.0002)
% opts.DropOutRate: Dropout rate (0)
% opts.StepRatio: Learning step size (0.01)
% opts.BatchSize: Number of data per mini-batch (default: all data in one batch)
% opts.Verbose: verbose or not (false)
% opts.SparseQ: q parameter of sparse learning (0)
% opts.SparseLambda: lambda parameter (weight) of sparse learning (0)
%
%
%Example:
% datanum = 1024;
% outputnum = 16;
% inputnum = 4;
%
% inputdata = rand(datanum, inputnum);
% outputdata = rand(datanum, outputnum);
%
% rbm = randRBM(inputnum, outputnum);
% rbm = pretrainRBM( rbm, inputdata );
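%
% To override the defaults, pass an options struct as the third argument.
% A sketch of such a call; the option values below are illustrative only,
% not recommended settings:
%
% opts.MaxIter = 200;
% opts.BatchSize = 128;
% opts.StepRatio = 0.05;
% opts.Verbose = true;
% rbm = pretrainRBM( rbm, inputdata, opts );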
%
%
%Reference:
%for details of the dropout
% Hinton et al., Improving neural networks by preventing co-adaptation of feature detectors, 2012.
%for details of the sparse learning
% Lee et al., Sparse deep belief net model for visual area V2, NIPS 2008.
%for implementation of contrastive divergence learning
% http://read.pudn.com/downloads103/sourcecode/math/421402/drtoolbox/techniques/train_rbm.m__.htm
%
%
%Version: 20130727
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Deep Neural Network: %
% %
% Copyright (C) 2013 Masayuki Tanaka. All rights reserved. %
% mtanaka@ctrl.titech.ac.jp %
% %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function rbm = pretrainRBM(rbm, V, opts )
% Important parameters
InitialMomentum = 0.5;     % momentum used for the first InitialMomentumIter iterations
FinalMomentum = 0.9;       % momentum used for the remaining iterations
WeightCost = 0.0002;       % L2 weight-decay coefficient
InitialMomentumIter = 5;
MaxIter = 100;
DropOutRate = 0;
StepRatio = 0.01;
BatchSize = 0;
Verbose = false;
SparseQ = 0;
SparseLambda = 0;
if( exist('opts', 'var') )
    if( isfield(opts,'MaxIter') )
        MaxIter = opts.MaxIter;
    end
    if( isfield(opts,'InitialMomentum') )
        InitialMomentum = opts.InitialMomentum;
    end
    if( isfield(opts,'InitialMomentumIter') )
        InitialMomentumIter = opts.InitialMomentumIter;
    end
    if( isfield(opts,'FinalMomentum') )
        FinalMomentum = opts.FinalMomentum;
    end
    if( isfield(opts,'WeightCost') )
        WeightCost = opts.WeightCost;
    end
    if( isfield(opts,'DropOutRate') )
        DropOutRate = opts.DropOutRate;
    end
    if( isfield(opts,'StepRatio') )
        StepRatio = opts.StepRatio;
    end
    if( isfield(opts,'BatchSize') )
        BatchSize = opts.BatchSize;
    end
    if( isfield(opts,'Verbose') )
        Verbose = opts.Verbose;
    end
    if( isfield(opts,'SparseQ') )
        SparseQ = opts.SparseQ;
    end
    if( isfield(opts,'SparseLambda') )
        SparseLambda = opts.SparseLambda;
    end
else
    opts = [];
end
num = size(V,1);        % number of training samples
dimH = size(rbm.b, 2);  % number of hidden nodes
dimV = size(rbm.c, 2);  % number of visible nodes

if( BatchSize <= 0 )
    BatchSize = num;    % default: a single batch containing all data
end

if( DropOutRate > 0 )
    DropOutNum = round(dimV * DropOutRate);  % number of visible nodes selected by the dropout mask
    DropOutRate = DropOutNum / num;          % from here on, DropOutRate is only tested as a flag (> 0)
end

deltaW = zeros(dimV, dimH);  % momentum-smoothed weight update
deltaB = zeros(1, dimH);     % momentum-smoothed hidden-bias update
deltaC = zeros(1, dimV);     % momentum-smoothed visible-bias update
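
% Main training loop: each iteration optionally accumulates the sparsity
% gradient over the whole data set, then sweeps the shuffled data in
% mini-batches, applying momentum-smoothed CD-1 updates with weight decay.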
for iter=1:MaxIter
    % Set momentum
    if( iter <= InitialMomentumIter )
        momentum = InitialMomentum;
    else
        momentum = FinalMomentum;
    end
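
    % Sparsity regularization (Lee et al., NIPS 2008): penalize the squared
    % difference between the target activation SparseQ and each hidden
    % unit's mean activation over the whole data set; dsW and dsB hold the
    % gradient of that penalty, scaled by SparseLambda.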
    if( SparseLambda > 0 )
        dsW = zeros(dimV, dimH);
        dsB = zeros(1, dimH);

        vis0 = V;                     % use the whole data set
        hid0 = v2h( rbm, vis0 );      % hidden activation probabilities
        dH = hid0 .* ( 1.0 - hid0 );  % derivative of the sigmoid hidden activations
        sH = sum( hid0, 1 );          % per-hidden-unit sum of activations

        mH = sH / num;                % mean activation of each hidden unit
        sdH = sum( dH, 1 );           % per-hidden-unit sum of sigmoid derivatives
        svdH = dH' * vis0;            % sigmoid derivatives weighted by the visible data (dimH x dimV)
        dsW = dsW + SparseLambda * 2.0 * bsxfun(@times, (SparseQ-mH)', svdH)';
        dsB = dsB + SparseLambda * 2.0 * (SparseQ-mH) .* sdH;
    end
    ind = randperm(num);   % shuffle the data order for this iteration
    for batch=1:BatchSize:num
        bind = ind(batch:min([batch + BatchSize - 1, num]));   % indices of the current mini-batch
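
        % Dropout on the visible layer (Hinton et al., 2012): build a diagonal
        % mask that keeps DropOutNum randomly chosen visible nodes for this
        % mini-batch and zeroes out the rest.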
        if( DropOutRate > 0 )
            cMat = zeros(dimV,1);
            p = randperm(dimV, DropOutNum);   % visible nodes to keep
            cMat(p) = 1;
            cMat = diag(cMat);                % diagonal masking matrix
        end

        % Gibbs sampling step 0
        vis0 = double(V(bind,:));    % set values of the visible nodes from the data
        if( DropOutRate > 0 )
            vis0 = vis0 * cMat;      % apply the visible-node mask
        end
        hid0 = v2h( rbm, vis0 );     % compute hidden node probabilities

        % Gibbs sampling step 1
        bhid0 = double( rand(size(hid0)) < hid0 );   % sample binary hidden states
        vis1 = h2v( rbm, bhid0 );    % reconstruct the visible nodes
        if( DropOutRate > 0 )
            vis1 = vis1 * cMat;      % apply the same mask to the reconstruction
        end
        hid1 = v2h( rbm, vis1 );     % recompute the hidden node probabilities

        posprods = hid0' * vis0;     % data-driven (positive phase) correlations
        negprods = hid1' * vis1;     % reconstruction-driven (negative phase) correlations
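
        % CD-1 approximation of the log-likelihood gradient:
        %   dW ~ <v0*h0> - <v1*h1>
        % i.e. the difference between the data-driven and the one-step
        % reconstruction correlations.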
        % Compute the weight updates by contrastive divergence
        dW = (posprods - negprods)';
        dB = (sum(hid0, 1) - sum(hid1, 1));
        dC = (sum(vis0, 1) - sum(vis1, 1));

        if( strcmpi( 'GBRBM', rbm.type ) )   % Gaussian-Bernoulli RBM: rescale by the visible-unit sigma
            dW = bsxfun(@rdivide, dW, rbm.sig');
            dC = bsxfun(@rdivide, dC, rbm.sig .* rbm.sig);
        end

        deltaW = momentum * deltaW + (StepRatio / num) * dW;
        deltaB = momentum * deltaB + (StepRatio / num) * dB;
        deltaC = momentum * deltaC + (StepRatio / num) * dC;

        if( SparseLambda > 0 )
            deltaW = deltaW + numel(bind) / num * dsW;
            deltaB = deltaB + numel(bind) / num * dsB;
        end
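
        % Parameter update: momentum-smoothed gradient step, with L2 weight
        % decay (WeightCost) applied to the weights only.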
        % Update the network weights
        rbm.W = rbm.W + deltaW - WeightCost * rbm.W;
        rbm.b = rbm.b + deltaB;
        rbm.c = rbm.c + deltaC;
    end

    if( SparseLambda > 0 && strcmpi( 'GBRBM', rbm.type ) )
        dsW = bsxfun(@rdivide, dsW, rbm.sig');
    end
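
    % Progress report: reconstruct the whole data set with one up-down pass
    % and print the reconstruction RMSE and the mean hidden activation.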
    if( Verbose )
        H = v2h( rbm, V );                       % hidden activations for all data
        Vr = h2v( rbm, H );                      % reconstruction of all data
        err = power( V - Vr, 2 );                % squared reconstruction error
        rmse = sqrt( sum(err(:)) / numel(err) ); % root-mean-square error
        fprintf( '%3d : %9.4f %9.4f\n', iter, rmse, mean(H(:)) );
    end
end % for iter=1:MaxIter