% Wednesday, 5 December 2018
%
% testDNN analysis 5

% pretrainRBM: pre-training the restricted Boltzmann machine (RBM) model by Contrastive Divergence Learning
%
% rbm = pretrainRBM(rbm, V, opts)
%
%
%Output parameters:
% rbm: the restricted boltzmann machine (RBM) model
%
%
%Input parameters:
% rbm: the initial restricted Boltzmann machine (RBM) model
% V: visible (input) variables, where # of rows is the number of data samples and # of cols is the # of visible (input) nodes
% opts (optional): options
%
% options (default value):
%  opts.MaxIter: Maximum iteration number (100)
%  opts.InitialMomentum: Initial momentum until InitialMomentumIter (0.5)
%  opts.InitialMomentumIter: Iteration number for initial momentum (5)
%  opts.FinalMomentum: Final momentum after InitialMomentumIter (0.9)
%  opts.WeightCost: Weight cost (0.0002)
%  opts.DropOutRate: Dropout rate (0)
%  opts.StepRatio: Learning step size (0.01)
%  opts.BatchSize: # of mini-batch data (# of all data)
%  opts.Verbose: verbose or not (false)
%  opts.SparseQ: q parameter of sparse learning (0)
%  opts.SparseLambda: lambda parameter (weight) of sparse learning (0)
%
%
%Example:
% datanum = 1024;
% outputnum = 16;
% inputnum = 4;
%
% inputdata = rand(datanum, inputnum);
% outputdata = rand(datanum, outputnum);
%
% rbm = randRBM(inputnum, outputnum);
% rbm = pretrainRBM( rbm, inputdata );
%
%
%Reference:
%for details of the dropout
% Hinton et al, Improving neural networks by preventing co-adaptation of feature detectors, 2012.
%for details of the sparse learning
% Lee et al, Sparse deep belief net model for visual area V2, NIPS 2008.
%for implementation of contrastive divergence learning
% http://read.pudn.com/downloads103/sourcecode/math/421402/drtoolbox/techniques/train_rbm.m__.htm
%
%
%Version: 20130727


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Deep Neural Network:                                     %
%                                                          %
% Copyright (C) 2013 Masayuki Tanaka. All rights reserved. %
%                    mtanaka@ctrl.titech.ac.jp             %
%                                                          %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function rbm = pretrainRBM(rbm, V, opts )
% Pre-train one RBM layer with single-step Contrastive Divergence (CD-1),
% using momentum, L2 weight decay, optional dropout of visible units, and an
% optional sparsity penalty on the mean hidden activations.
%
% Fields read from rbm: W (dimV x dimH weights), b (1 x dimH hidden bias),
% c (1 x dimV visible bias), type ('GBRBM' checked explicitly; anything else
% is treated as Bernoulli-Bernoulli), and sig (GBRBM only: per-visible-unit
% standard deviations). V holds one sample per row. See the header comment
% above for the opts fields and their defaults.

% Default hyper-parameters; any of them can be overridden via opts below.
InitialMomentum = 0.5;     % momentum for first InitialMomentumIter iterations
FinalMomentum = 0.9;       % momentum for remaining iterations
WeightCost = 0.0002;       % L2 weight-decay coefficient (applied to W only)
InitialMomentumIter = 5;

MaxIter = 100;
DropOutRate = 0;           % dropout disabled unless overridden
StepRatio = 0.01;          % learning-rate scale
BatchSize = 0;             % <= 0 is replaced below by "all data in one batch"
Verbose = false;

SparseQ = 0;               % target mean hidden activation for sparsity term
SparseLambda = 0;          % sparsity penalty weight; 0 disables the penalty


% Copy any options the caller supplied over the defaults.
% NOTE(review): exist('opts') matches *anything* named 'opts' on the path,
% not only the input argument; exist('opts','var') would be the strict form.
if( exist('opts' ) )
 if( isfield(opts,'MaxIter') )
  MaxIter = opts.MaxIter;
 end
 if( isfield(opts,'InitialMomentum') )
  InitialMomentum = opts.InitialMomentum;
 end
 if( isfield(opts,'InitialMomentumIter') )
  InitialMomentumIter = opts.InitialMomentumIter;
 end
 if( isfield(opts,'FinalMomentum') )
  FinalMomentum = opts.FinalMomentum;
 end
 if( isfield(opts,'WeightCost') )
  WeightCost = opts.WeightCost;
 end
 if( isfield(opts,'DropOutRate') )
  DropOutRate = opts.DropOutRate;
 end
 if( isfield(opts,'StepRatio') )
  StepRatio = opts.StepRatio;
 end
 if( isfield(opts,'BatchSize') )
  BatchSize = opts.BatchSize;
 end
 if( isfield(opts,'Verbose') )
  Verbose = opts.Verbose;
 end
 if( isfield(opts,'SparseQ') )
  SparseQ = opts.SparseQ;
 end
 if( isfield(opts,'SparseLambda') )
  SparseLambda = opts.SparseLambda;
 end

else
 opts = [];
end

num = size(V,1); % number of training samples (rows of V)
dimH = size(rbm.b, 2); % number of hidden units (cols of hidden bias b)
dimV = size(rbm.c, 2); % number of visible units (cols of visible bias c)

if( BatchSize <= 0 ) % default: full-batch training
  BatchSize = num;
end

if( DropOutRate > 0 )
    DropOutNum = round(dimV * DropOutRate); % # of visible units sampled per mini-batch
    % NOTE(review): this divides by the sample count (num) rather than dimV,
    % which looks unintended; harmless in practice because DropOutRate is
    % only used as a "> 0" flag from this point on.
    DropOutRate = DropOutNum / num;  % used only as an on/off flag below
end


% Momentum-smoothed parameter updates, carried across mini-batches and epochs.
deltaW = zeros(dimV, dimH); % dimV x dimH
deltaB = zeros(1, dimH); % 1 x dimH
deltaC = zeros(1, dimV); % 1 x dimV

for iter=1:MaxIter

    % Set momentum: small for the first few epochs, larger afterwards.
if( iter <= InitialMomentumIter )
momentum = InitialMomentum;
else
momentum = FinalMomentum;
    end

    % Sparsity penalty, part 1: hidden activations and logistic derivatives
    % over the FULL data set, computed once per epoch. vis0/hid0 here are
    % temporaries and are overwritten inside the mini-batch loop below.
    if( SparseLambda > 0 )
        dsW = zeros(dimV, dimH); % sparsity gradient w.r.t. W (dimV x dimH)
        dsB = zeros(1, dimH); % sparsity gradient w.r.t. b (1 x dimH)

        vis0 = V; % num x dimV
        hid0 = v2h( rbm, vis0 ); % num x dimH

        dH = hid0 .* ( 1.0 - hid0 ); % derivative of logistic hidden units (num x dimH)
        sH = sum( hid0, 1 ); % per-hidden-unit activation totals (1 x dimH)
    end

    % Sparsity penalty, part 2: gradient of SparseLambda*(SparseQ - mH).^2
    % with respect to W and b, via the chain rule through the logistic.
    if( SparseLambda > 0 )
        mH = sH / num; % mean activation of each hidden unit (1 x dimH)
        sdH = sum( dH, 1 ); % column sums of hidden derivatives (1 x dimH)
        svdH = dH' * vis0;  % derivative-weighted visible statistics (dimH x dimV)

        dsW = dsW + SparseLambda * 2.0 * bsxfun(@times, (SparseQ-mH)', svdH)'; % dimV x dimH
        dsB = dsB + SparseLambda * 2.0 * (SparseQ-mH) .* sdH; % 1 x dimH
    end


% Visit the samples in a fresh random order each epoch.
ind = randperm(num);

for batch=1:BatchSize:num

% Row indices of this mini-batch (the last batch may be smaller).
bind = ind(batch:min([batch + BatchSize - 1, num]));

        % Build a fresh dropout mask for this mini-batch: cMat becomes a
        % diagonal 0/1 matrix selecting visible units.
        % NOTE(review): the mask KEEPS the DropOutNum sampled units and
        % zeroes all the others, so DropOutRate effectively acts as a
        % keep-rate here -- confirm this is the intended convention.
        if( DropOutRate > 0 )
            cMat = zeros(dimV,1); % dimV x 1 indicator vector
            p = randperm(dimV, DropOutNum); % units whose entries are set to 1
            cMat(p) = 1;  % mark the sampled visible units
            cMat = diag(cMat); % dimV x dimV diagonal mask
        end

        % Gibbs sampling step 0
        vis0 = double(V(bind,:)); % Set values of visible nodes (batch x dimV)
        if( DropOutRate > 0 )
            vis0 = vis0 * cMat; % apply the dropout mask to the data
        end
        hid0 = v2h( rbm, vis0 );  % Compute hidden nodes (batch x dimH)

        % Gibbs sampling step 1
        bhid0 = double( rand(size(hid0)) < hid0 ); % stochastic binary hidden states
        vis1 = h2v( rbm, bhid0 );  % Compute visible nodes (reconstruction)
        if( DropOutRate > 0 )  % same mask as step 0
            vis1 = vis1 * cMat;% batch x dimV
        end
        hid1 = v2h( rbm, vis1 );  % Compute hidden nodes (batch x dimH)

% Positive (data) and negative (reconstruction) correlations.
posprods = hid0' * vis0; % dimH x dimV
negprods = hid1' * vis1; % dimH x dimV
% Compute the weights update by contrastive divergence

        dW = (posprods - negprods)'; % dimV x dimH
        dB = (sum(hid0, 1) - sum(hid1, 1)); % 1 x dimH
        dC = (sum(vis0, 1) - sum(vis1, 1)); % 1 x dimV

        % Gaussian-Bernoulli RBM: rescale gradients by the visible st. devs.
        if( strcmpi( 'GBRBM', rbm.type ) )
        dW = bsxfun(@rdivide, dW, rbm.sig');
        dC = bsxfun(@rdivide, dC, rbm.sig .* rbm.sig);
        end

% Momentum smoothing. Note the step is scaled by the FULL data size (num),
% not by the mini-batch size.
deltaW = momentum * deltaW + (StepRatio / num) * dW;
deltaB = momentum * deltaB + (StepRatio / num) * dB;
deltaC = momentum * deltaC + (StepRatio / num) * dC;

        % Add the epoch-level sparsity gradient, pro-rated by batch size.
        if( SparseLambda > 0 )
            deltaW = deltaW + numel(bind) / num * dsW;
            deltaB = deltaB + numel(bind) / num * dsB;
        end

% Update the network weights. Weight decay (WeightCost) is applied to W
% directly here, outside the StepRatio/num scaling, once per mini-batch.
rbm.W = rbm.W + deltaW - WeightCost * rbm.W;
rbm.b = rbm.b + deltaB;
rbm.c = rbm.c + deltaC;

end

% NOTE(review): this rescaling runs AFTER the mini-batch updates that used
% dsW, and dsW is rebuilt at the top of the next epoch, so it appears to
% have no effect -- confirm whether it was meant to run before the loop.
if( SparseLambda > 0 && strcmpi( 'GBRBM', rbm.type ) )
dsW = bsxfun(@rdivide, dsW, rbm.sig');
end

% Progress report: reconstruction RMSE over the whole data set plus the
% mean hidden activation (useful for monitoring the sparsity target).
if( Verbose )
        H = v2h( rbm, V ); % num x dimH
        Vr = h2v( rbm, H ); % num x dimV reconstruction
err = power( V - Vr, 2 ); % element-wise squared error
rmse = sqrt( sum(err(:)) / numel(err) );
fprintf( '%3d : %9.4f %9.4f\n', iter, rmse, mean(H(:)) );
    end

end  %% iter 1 : maxIter