package ai.minxiao.ds4s.core.dl4j.mlnn

import org.deeplearning4j.nn.api.OptimizationAlgorithm
import org.deeplearning4j.nn.conf.inputs.InputType
import org.deeplearning4j.nn.conf.inputs.InputType.InputTypeFeedForward
import org.deeplearning4j.nn.conf.layers.{
  DenseLayer, FeedForwardLayer,
  EmbeddingLayer, EmbeddingSequenceLayer,
  OutputLayer,
  GravesBidirectionalLSTM, GravesLSTM, LSTM, RnnOutputLayer}
import org.deeplearning4j.nn.conf.{BackpropType, GradientNormalization, MultiLayerConfiguration, NeuralNetConfiguration, Updater}
import org.deeplearning4j.nn.conf.NeuralNetConfiguration.{Builder => NNConfBuilder, ListBuilder}
import org.deeplearning4j.nn.conf.preprocessor.{FeedForwardToRnnPreProcessor, RnnToFeedForwardPreProcessor}
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork
import org.deeplearning4j.nn.weights.WeightInit
import org.nd4j.linalg.activations.Activation
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.learning.config.{AdaDelta, AdaGrad, AdaMax, Adam, Nadam, Nesterovs, NoOp, RmsProp, Sgd}
import org.nd4j.linalg.lossfunctions.impl._
import org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction

import ai.minxiao.ds4s.core.dl4j.ui.UIStarter

/**
  * RNNEmbed
  * Recurrent Neural Network with Embeddings
  *
  * @constructor
  * -------------------------------------------------------------------------------------------------------------
  *
  * BASE
  * @param seed random generator seed, default=2018
  *
  * ------------------------------------------------------------------------------------------------------------
  *
  * REGULARIZATION
  * @param l2 l2 regularization, default=0.0
  * @param l1 l1 regularization, default=0.0
  * @param l2Bias l2 bias term, default=0.0
  * @param l1Bias l1 bias term, default=0.0
  * @param weightNoise whether to use weight noise (drop connect), default=false
  * @param weightRetainProbability weight retain probability for the weight noise (drop-connect), default=1 (no drop-connect)
  * @param applyToBiases whether apply to biases for the weight noise (drop-connect), default=false
  *
  * ------------------------------------------------------------------------------------------------------------------
  *
  * OPTIMIZATION
  *
  * @param optimizationAlgo
  *   <a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/api/OptimizationAlgorithm.java">
  *   optimization algorithm</a> (default=STOCHASTIC_GRADIENT_DESCENT)
  * {{{
  * STOCHASTIC_GRADIENT_DESCENT://<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/StochasticGradientDescent.java">StochasticGradientDescent.java</a>
  * LINE_GRADIENT_DESCENT://<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/LineGradientDescent.java">LineGradientDescent.java</a>
  * CONJUGATE_GRADIENT://<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/ConjugateGradient.java">ConjugateGradient.java</a>
  * LBFGS://<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/optimize/solvers/LBFGS.java">LBFGS.java</a>
  * }}}
  * @param miniBatch whether to use mini-batch, default=true
  * @param learningRate learning rate, default=0.1
  * @param beta1 gradient moving avg decay rate, default=0.9
  * @param beta2 gradient sqrt decay rate, default=0.999
  * @param epsilon default=1E-8
  * @param momentum NESTEROVS momentum, default=0.9
  * @param rmsDecay RMSPROP decay rate, default=0.95
  * @param rho ADADELTA decay rate, default=0.95
  * @param updater <a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/Updater.java">
  *   weights updater</a>, (default = NESTEROVS).
  * Options: {{{
  * SGD: //<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Sgd.java">Sgd.java</a>
  *   learningRate: learning rate (default = 1E-3)
  * ADAM: //<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Adam.java">Adam.java</a>
  *   learningRate: learning rate, DEFAULT_ADAM_LEARNING_RATE = 1e-3;
  *   beta1: gradient moving avg decay rate, DEFAULT_ADAM_BETA1_MEAN_DECAY = 0.9;
  *   beta2: gradient sqrt decay rate, DEFAULT_ADAM_BETA2_VAR_DECAY = 0.999;
  *   epsilon: epsilon, DEFAULT_ADAM_EPSILON = 1e-8;
  *   //<a href="http://arxiv.org/abs/1412.6980">Adam: A Method for Stochastic Optimization</a>
  * ADAMAX: //<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaMax.java">AdaMax.java</a>
  *   learningRate: learning rate, DEFAULT_ADAMAX_LEARNING_RATE = 1e-3;
  *   beta1: gradient moving avg decay rate, DEFAULT_ADAMAX_BETA1_MEAN_DECAY = 0.9;
  *   beta2: gradient sqrt decay rate, DEFAULT_ADAMAX_BETA2_VAR_DECAY = 0.999;
  *   epsilon: epsilon, DEFAULT_ADAMAX_EPSILON = 1e-8;
  *   //<a href="http://arxiv.org/abs/1412.6980">Adam: A Method for Stochastic Optimization</a>
  * NADAM://<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Nadam.java">Nadam.java</a>
  *   learningRate: learning rate, DEFAULT_NADAM_LEARNING_RATE = 1e-3;
  *   epsilon: DEFAULT_NADAM_EPSILON = 1e-8;
  *   beta1: gradient moving avg decay rate, DEFAULT_NADAM_BETA1_MEAN_DECAY = 0.9;
  *   beta2: gradient sqrt decay rate, DEFAULT_NADAM_BETA2_VAR_DECAY = 0.999;
  *   //<a href="https://arxiv.org/pdf/1609.04747.pdf">An overview of gradient descent optimization algorithms</a>
  * AMSGRAD: //<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AMSGrad.java">AMSGrad.java</a>
  *   learningRate: learning rate, DEFAULT_AMSGRAD_LEARNING_RATE = 1e-3;
  *   epsilon: DEFAULT_AMSGRAD_EPSILON = 1e-8;
  *   beta1: DEFAULT_AMSGRAD_BETA1_MEAN_DECAY = 0.9;
  *   beta2: DEFAULT_AMSGRAD_BETA2_VAR_DECAY = 0.999;
  * ADAGRAD: Vectorized Learning Rate used per Connection Weight//<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaGrad.java">AdaGrad.java</a>
  *   learningRate: learning rate, DEFAULT_ADAGRAD_LEARNING_RATE = 1e-1;
  *   epsilon: DEFAULT_ADAGRAD_EPSILON = 1e-6;
  *   //<a href="http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf">Adaptive Subgradient Methods for Online Learning and Stochastic Optimization</a>
  *   //<a href="http://xcorr.net/2014/01/23/adagrad-eliminating-learning-rates-in-stochastic-gradient-descent/">Adagrad – eliminating learning rates in stochastic gradient descent</a>
  * NESTEROVS: tracks previous layer's gradient and uses it as a way of updating the gradient //<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Nesterovs.java">Nesterovs.java</a>
  *   learningRate: learning rate, DEFAULT_NESTEROV_LEARNING_RATE = 0.1;
  *   momentum: DEFAULT_NESTEROV_MOMENTUM = 0.9;
  * RMSPROP: //<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/RmsProp.java">RmsProp.java</a>
  *   learningRate: learning rate, DEFAULT_RMSPROP_LEARNING_RATE = 1e-1;
  *   epsilon: DEFAULT_RMSPROP_EPSILON = 1e-8;
  *   rmsDecay: decay rate, DEFAULT_RMSPROP_RMSDECAY = 0.95;
  *   //<a href="http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf">Neural Networks for Machine Learning</a>
  * ADADELTA: //<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaDelta.java">AdaDelta.java</a>
  *   rho: decay rate, controlling the decay of the previous parameter updates, DEFAULT_ADADELTA_RHO = 0.95;
  *   epsilon: DEFAULT_ADADELTA_EPSILON = 1e-6;
  *   (no need to manually set the learning rate)
  *   //<a href="http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf">ADADELTA: AN ADAPTIVE LEARNING RATE METHOD</a>
  * NONE: no updates //<a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/NoOp.java">NoOp.java</a>
  * }}}
  * @param gradientNormalization gradient normalization, default=None
  * Options: GradientNormalization.X
  * {{{
  * ClipElementWiseAbsoluteValue:
  *  g <- sign(g)*min(maxAllowedValue,|g|).
  * ClipL2PerLayer:
  *   GOut = G                             if l2Norm(G) < threshold (i.e., no change)
  *   GOut = threshold * G / l2Norm(G)     otherwise
  * ClipL2PerParamType: conditional renormalization. Very similar to ClipL2PerLayer, however instead of clipping per layer, do clipping on each parameter type separately.
  * None: no gradient normalization
  * RenormalizeL2PerLayer: rescale gradients by dividing by the L2 norm of all gradients for the layer
  * RenormalizeL2PerParamType:
  *  GOut_weight = G_weight / l2(G_weight)
  *  GOut_bias = G_bias / l2(G_bias)
  * }}}
  * @param gradientNormalizationThreshold gradient threshold, default=1.0
  *
  * ------------------------------------------------------------------------------------------------------------------------------------------
  *
  * INPUT LAYER
  *
  * @param inputSize input size, required
  * @param inputType input type, default=null; must be left as null for this class (the constructor asserts inputType == null)
  * Options: {{{
  * InputType.Type.FF: Standard feed-forward (2d minibatch, 1d per example) data
  * InputType.Type.CNN: 2D Convolutional neural network (4d minibatch, [miniBatchSize, channels, height, width])
  * InputType.Type.CNN3D: 3D convolutional neural network (5d minibatch, [miniBatchSize, channels, height, width, channels])
  * InputType.Type.CNNFlat: Flattened 2D conv net data (2d minibatch, [miniBatchSize, height * width * channels])
  * InputType.Type.RNN: Recurrent neural network (3d minibatch) time series data
  * }}}
  * @param height height of input, default=10
  * @param width width of input, default=10
  * @param depth depth of input, default=10
  * @param channels number of channels, default=3
  *
  * ------------------------------------------------------------------------------------------------------------------------------------------
  *
  * OUTPUT LAYER
  *
  * @param outputSize output size, required
  * @param lossFunction <a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/LossFunctions.java">
  *   loss function</a> for the output layer, required
  * Options: y-true, yHat-prediction {{{
  * L2: Sum of Squared Errors//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossL2.java">LossL2.java</a>
  *   L = sum_i (y_i - yHat_i)^2
  * MSE (or SQUARED_LOSS): Mean Squared Error//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossMSE.java">LossMSE.java</a>
  *   L = 1/(2N) sum_i sum_j (y_{i,j} - yHat_{i,j})^2
  * L1: Sum of Absolute Errors//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossL1.java">LossL1.java</a>
  *   L = sum_i |y_i - yHat_i|
  * MEAN_ABSOLUTE_ERROR: Mean Absolute Error//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossMAE.java">LossMAE.java</a>
  *   L = 1/(2N) sum_i sum_j |y_{i,j} - yHat_{i,j}|
  * MEAN_ABSOLUTE_PERCENTAGE_ERROR: Mean Absolute Percentage Error//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossMAPE.java">LossMAPE.java</a>
  *   L = 1/N sum_i |y_i - yHat_i|*100/|y_i|
  * MEAN_SQUARED_LOGARITHMIC_ERROR: Mean Squared Logarithmic Error//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossMSLE.java">LossMSLE.java</a>
  *   L = 1/N sum_i (log(1 + y_i) - log(1 + yHat_i))^2
  * POISSON (or EXPLL): Exponential Log Likelihood Loss (Poisson Loss)//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossPoisson.java">LossPoisson.java</a>
  *   L = 1/N sum_i (yHat_i - y_i * log(yHat_i))
  * XENT: Binary Cross Entropy Loss//<a href=https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossBinaryXENT.java>LossBinaryXENT.java</a>
  *   L = - 1/N (y_i*log(yHat_i) + (1 - y_i)*log(1 - yHat_i))
  *   (label scalar of 0/1 binary classes)
  * MCXENT: Multiclass Cross Entropy Loss//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossMCXENT.java">LossMCXENT.java</a>
  *   L = - 1/N \sum_i \sum_k y_{i,k} * log(yHat_{i, k})
  *   (label vector of 0/1 indicator labels)
  * NEGATIVELOGLIKELIHOOD: Negative Log Likelihood//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossNegativeLogLikelihood.java">LossNegativeLogLikelihood.java</a>
  *   L = - 1/N \sum_i \sum_k y_{i,k} * log(yHat_{i, k})
  *   (*negative log likelihood is equivalent to cross entropy mathematically)
  * KL_DIVERGENCE (or RECONSTRUCTION_CROSSENTROPY): Kullback Leibler Divergence Loss//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossKLD.java">LossKLD.java</a>
  *   L = - 1/N sum_i y_i * log (yHat_i / y_i)
  *     = 1/N sum_i y_i * log (y_i / yHat_i)
  *     = 1/N ( sum_i y_i * log(y_i) - sum_i y_i * log(yHat_i))
  *     =        entropy                    cross-entropy
  * COSINE_PROXIMITY://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossCosineProximity.java">LossCosineProximity.java</a>
  *   L = (sum_i y_i dotprod yHat_i)/(sqrt(sum_i y_i dotprod y_i) * sqrt(sum_i yHat_i dotprod yHat_i))
  * HINGE: Hinge Loss//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossHinge.java">LossHinge.java</a>
  *   L = 1/N sum_i max(0, 1 - yHat_i * y_i)
  *   (*label scalar of -1/+1 labels)
  * SQUARED_HINGE: Squared Hinge Loss//<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/lossfunctions/impl/LossSquaredHinge.java">LossSquaredHinge.java</a>
  *   L = 1/N sum_i (max(0, 1 - yHat_i * y_i))^2
  *   (*label scalar of -1/+1 labels)
  * }}}
  * @param outputLayerActivation output layer <a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/Activation.java">
  *   activation functions</a>, required.
  * Options: {{{
  * Cube://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationCube.java">ActivationCube.java</a>
  *   f(x) = x^3
  * ELU://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationELU.java">ActivationELU.java</a>
  *          ⎧ alpha * (exp(x) - 1.0), x <  0; // alpha defaults to 1, if not specified
  *   f(x) = ⎨
  *          ⎩                      x, x >= 0;
  * HARDSIGMOID://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationHardSigmoid.java">ActivationHardSigmoid.java</a>
  *   f(x) = min(1, max(0, 0.2*x + 0.5))
  * HARDTANH://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationHardTanH.java">ActivationHardTanH.java</a>
  *          ⎧  1, if x >  1
  *   f(x) = ⎨ -1, if x < -1
  *          ⎩  x, otherwise
  * IDENTITY://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationIdentity.java">ActivationIdentity.java</a>
  *   f(x) = x
  * LEAKYRELU://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationLReLU.java">ActivationLReLU.java</a>
  *   f(x) = max(0, x) + alpha * min(0, x) // alpha defaults to 0.01
  * RRELU://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRReLU.java">ActivationRReLU.java</a>
  *   f(x) = max(0,x) + alpha * min(0, x)
  *     // alpha is drawn from uniform(l,u) during training and is set to (l+u)/2 during test
  *     // l and u default to 1/8 and 1/3 respectively
  *   // <a href="http://arxiv.org/abs/1505.00853">Empirical Evaluation of Rectified Activations in Convolutional Network</a>
  * RATIONALTANH://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRationalTanh.java">ActivationRationalTanh.java</a>
  *   f(x) = 1.7159 * tanh(2x/3), where tanh is approximated as tanh(y) ~ sgn(y) * { 1 - 1/(1+|y|+y^2+1.41645*y^4)}
  *   //<a href="https://arxiv.org/pdf/1508.01292v3">Reference</a>
  * RELU://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationReLU.java">ActivationReLU.java</a>
  *   f(x) = max(0, x)
  * //RELU6://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationReLU6.java">ActivationReLU6.java</a>
  * //  f(x) = min(max(x, 0), 6)
  * RECTIFIEDTANH://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRectifiedTanh.java">ActivationRectifiedTanh.java</a>
  *   f(x) = max(0, tanh(x))
  * SELU://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSELU.java">ActivationSELU.java</a>
  *                 ⎧                      x, x > 0
  *   f(x) = lambda ⎨
  *                 ⎩ alpha * exp(x) - alpha, x <= 0
  *   //<a href="https://arxiv.org/pdf/1706.02515.pdf">Reference</a>
  * SIGMOID://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSigmoid.java">ActivationSigmoid.java</a>
  *   f(x) = 1 / (1 + exp(-x))
  * SOFTPLUS://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSoftPlus.java">ActivationSoftPlus.java</a>
  *   f(x) = log(1 + exp(x))
  * SOFTSIGN://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSoftSign.java">ActivationSoftSign.java</a>
  *   f_i(x) = x_i / (1 + |x_i|)
  * SOFTMAX://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSoftmax.java">ActivationSoftmax.java</a>
  *   f_i(x) = exp(x_i - shift) / sum_j exp(x_j - shift), where shift = max_i x_i
  * SWISH://<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSwish.java">ActivationSwish.java</a>
  *   f(x) = x * sigmoid(x)
  * TANH: //<a href="https://github.com/deeplearning4j/nd4j/blob/master/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationTanH.java">ActivationTanH.java</a>
  *   f(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
  * }}}
  * @param outputLayerWeightInit output layer <a href="https://github.com/deeplearning4j/deeplearning4j/blob/master/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInit.java">
  *   weight initialization</a>, default=XAVIER.
  * Options: {{{
  * ZERO: all 0s.
  * ONES: all 1s.
  * SIGMOID_UNIFORM: U(-r,r) with r=4*sqrt(6/(fanIn + fanOut)), A version of XAVIER_UNIFORM for sigmoid activation functions.
  * NORMAL: N(0, sigma^2) with sigma = 1/sqrt(fanIn).
  * LECUN_UNIFORM: U[-a,a] with a=3/sqrt(fanIn).
  * UNIFORM: U[-a,a] with a=1/sqrt(fanIn).
  * XAVIER: N(0, sigma^2) with sigma = sqrt(2.0/(fanIn + fanOut))
  * XAVIER_UNIFORM: U(-s,s) with s = sqrt(6/(fanIn + fanOut))
  * XAVIER_FAN_IN: N(0, sigma^2) with sigma = sqrt(1/fanIn)
  * RELU: N(0, sigma^2) with sigma = sqrt(2.0/nIn)
  * RELU_UNIFORM: U(-s,s) with s = sqrt(6/fanIn)
  * IDENTITY: I_{nIn, nOut} an identity matrix, only applicable to square weight matrices
  * VAR_SCALING_NORMAL_FAN_IN: N(0, sigma^2) with sigma = sqrt(1.0/fanIn)
  * VAR_SCALING_NORMAL_FAN_OUT: N(0, sigma^2) with sigma = sqrt(1.0/fanOut)
  * VAR_SCALING_NORMAL_FAN_AVG: N(0, sigma^2) with sigma = sqrt(1.0/((fanIn + fanOut)/2))
  * VAR_SCALING_UNIFORM_FAN_IN: U[-a,a] with a=3.0/(fanIn)
  * VAR_SCALING_UNIFORM_FAN_OUT: U[-a,a] with a=3.0/(fanOut)
  * VAR_SCALING_UNIFORM_FAN_AVG: U[-a,a] with a=3.0/((fanIn + fanOut)/2)
  * }}}
  * @param outputLayerBiasInit output layer bias initialization, default=0.0
  * @param weights instance weights-based on classes, applicable for weighted classification, default=Array[Double]()
  *
  * --------------------------------------------------------------------------------------------------------------------------------------------------
  *
  * BASE FOR LAYERS
  *
  * @param pretrain whether to pretrain, default=false
  * @param backprop whether to use backprop, default=true
  * @param backpropType backpropagation type (BackpropType), default=BackpropType.TruncatedBPTT,
  * Options {{{
  * Standard
  * TruncatedBPTT
  * }}}
  * @param tBPTTForwardLength forward max length for Truncated BPTT, default=100
  * @param tBPTTBackwardLength backward max length for Truncated BPTT, default=100
  *
  * -------------------------------------------------------------------------------------------------------------------------------------------
  *
  * MONITOR, UI
  *
  * @param setListener whether to set a listener, default=true
  * @param listenType listener type, default="console"
  * Options: {{{
  * console: print in the console
  * ui: display in the UI
  * file: save to a file
  * }}}
  * @param listenFreq listener frequency to track the score, default=1
  * @param storagePath file path for saving the stats if set listenType="file", default="", not used
  * @param enableRemote whether to enable remote listening, default=false
  *
  * --------------------------------------------------------------------------------------------
  *
  * Recurrent Layers
  *
  * @param nRecurLayers number of recurrent layers, default=1
  * @param recurLayerTypes recurrent layer types, default=Array("GLSTM")
  * Options: {{{
  * LSTM
  * GLSTM
  * GBLSTM
  * }}}
  * @param recurLayerSizes recurrent layer sizes, default=Array(1)
  * @param recurLayerActivations recurrent layer activations, default=Array(Activation.TANH)
  * @param recurLayerWeightInits recurrent layer weight initialization, default=Array(WeightInit.XAVIER)
  * @param recurLayerBiasInits recurrent layer bias initialization, default=Array(0.0)
  *
  * ------------------------------------------------------------------------------------------
  *
  * Embedding Layer
  *
  * @param embeddingLayerType embedding layer type, non-seq (EmbeddingLayer) vs seq (EmbeddingSequenceLayer), default="seq"
  * @param embeddingLayerSize embedding layer size, default=100
  *
  * -----------------------------------------------------------------------------------------
  *
  * Dense Layers
  *
  * @param nDenseLayers number of dense layers, default=0 (no dense layers)
  * @param denseLayerSizes sizes of dense layers, default=Array(1), for index beyond the boundary, use the last one
  * @param denseLayerActivations activations of dense layers, default=Array(Activation.RELU)
  * @param denseLayerWeightInits weight initializer of dense layers, default=Array(WeightInit.XAVIER)
  * @param denseLayerBiasInits bias initializer of dense layers, default=Array(0.0)
  * @param denseLayerDropOuts dropouts of dense layers, default=Array(0.0)
  *
  * @author mx
  */
@SerialVersionUID(82787869L)
class RNNEmbed (
  // Base
  seed: Long = 2018L,
  // Regularization
  l2: Double = 0.0,
  l1: Double = 0.0,
  l2Bias: Double = 0.0,
  l1Bias: Double = 0.0,
  weightNoise: Boolean = false,
  weightRetainProbability: Double = 1.0,
  applyToBiases: Boolean = false,
  // Optimization
  optimizationAlgo: OptimizationAlgorithm = OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT,
  miniBatch: Boolean = true,
  learningRate: Double = 0.1,
  beta1: Double = 0.9,
  beta2: Double = 0.999,
  epsilon: Double = 1E-8,
  momentum: Double = 0.9,
  rmsDecay: Double = 0.95,
  rho: Double = 0.95,
  updater: Updater = Updater.NESTEROVS,
  gradientNormalization: GradientNormalization = GradientNormalization.None,
  gradientNormalizationThreshold: Double = 1.0,
  // Input Layer
  inputSize: Int,
  inputType: InputType.Type = null,
  height: Int = 10,
  width: Int = 10,
  depth: Int = 10,
  channels: Int = 3,
  // Output Layer
  outputSize: Int,
  lossFunction: LossFunction,
  outputLayerActivation: Activation,
  outputLayerWeightInit: WeightInit = WeightInit.XAVIER,
  outputLayerBiasInit: Double = 0.0,
  weights: Array[Double] = Array[Double](),
  // Base for Layers
  pretrain: Boolean = false,
  backprop: Boolean = true,
  backpropType: BackpropType = BackpropType.TruncatedBPTT,
  tBPTTForwardLength: Int = 100,
  tBPTTBackwardLength: Int = 100,
  // Monitor, UI
  setListener: Boolean = true,
  listenType: String = "console",
  listenFreq: Int = 1,
  storagePath: String = "",
  enableRemote: Boolean = false,
  // Recurrent Layers
  nRecurLayers: Int = 1,
  recurLayerTypes: Array[String] = Array("GLSTM"),
  recurLayerSizes: Array[Int] = Array(1),
  recurLayerActivations: Array[Activation] = Array(Activation.TANH),
  recurLayerWeightInits: Array[WeightInit] = Array(WeightInit.XAVIER),
  recurLayerBiasInits: Array[Double] = Array(0.0),
  // Embedding Layer
  embeddingLayerType: String = "seq",
  embeddingLayerSize: Int = 100,
  // Fully Connected Dense Layer
  nDenseLayers: Int = 0,
  denseLayerSizes: Array[Int] = Array(1),
  denseLayerActivations: Array[Activation] = Array(Activation.RELU),
  denseLayerWeightInits: Array[WeightInit] = Array(WeightInit.XAVIER),
  denseLayerBiasInits: Array[Double] = Array(0.0),
  denseLayerDropOuts: Array[Double] = Array(0.0)
) extends RNN (
  // Base
  seed = seed,
  // Regularization
  l2 = l2, l1 = l1, l2Bias = l2Bias, l1Bias = l1Bias,
  weightNoise = weightNoise, weightRetainProbability = weightRetainProbability, applyToBiases = applyToBiases,
  // Optimization
  optimizationAlgo = optimizationAlgo, miniBatch = miniBatch,
  learningRate = learningRate, beta1 = beta1, beta2 = beta2, epsilon = epsilon, momentum = momentum, rmsDecay = rmsDecay, rho = rho,
  updater = updater,
  gradientNormalization = gradientNormalization,
  gradientNormalizationThreshold = gradientNormalizationThreshold,
  // Input Layer
  inputSize = inputSize, inputType = inputType,
  height = height, width = width, depth = depth, channels = channels,
  // Output Layer
  outputSize = outputSize, lossFunction = lossFunction,
  outputLayerActivation = outputLayerActivation,
  outputLayerWeightInit = outputLayerWeightInit, outputLayerBiasInit = outputLayerBiasInit,
  weights = weights,
  // Base for Layers
  pretrain = pretrain,
  backprop = backprop,
  backpropType = backpropType, tBPTTForwardLength = tBPTTForwardLength, tBPTTBackwardLength = tBPTTBackwardLength,
  // Monitor, UI
  // listenFreq is forwarded explicitly: it used to be accepted by this class but never
  // handed to the superclass, so a caller-supplied frequency was silently ignored.
  // NOTE(review): assumes RNN declares a listenFreq parameter like the other monitor settings — confirm.
  setListener = setListener, listenType = listenType, listenFreq = listenFreq,
  storagePath = storagePath, enableRemote = enableRemote,
  // Recurrent Layers
  nRecurLayers = nRecurLayers, recurLayerTypes = recurLayerTypes, recurLayerSizes = recurLayerSizes,
  recurLayerActivations = recurLayerActivations, recurLayerWeightInits = recurLayerWeightInits, recurLayerBiasInits = recurLayerBiasInits
) with Serializable {

  // The input type is not set for this network, so the parameter must keep its null default.
  assert(inputType == null, "RNNEmbed does not set an input type: inputType must be left as null")

  /**
    * Embedding Layer Builder Configuration.
    *
    * Registers an EmbeddingLayer ("non-seq") or an EmbeddingSequenceLayer ("seq") at `startIndex`.
    * For the "non-seq" variant an RnnToFeedForwardPreProcessor is attached to the same index.
    *
    * @param listBuilder list builder to add the layer to (updated in place)
    * @param startIndex embedding layer index, fixed to be 0
    * @param startInSize input cardinality
    * @param startOutSize embedding cardinality
    * @return the same `listBuilder` instance
    */
  protected def embeddingLayerConfBuilder(listBuilder: ListBuilder,
    startIndex: Int, startInSize: Int, startOutSize: Int): ListBuilder = {
    assert(Array("non-seq", "seq").contains(embeddingLayerType),
      s"""embeddingLayerType must be "non-seq" or "seq", got: $embeddingLayerType""")
    val nonSequence = embeddingLayerType == "non-seq"
    val embeddingLayer =
      if (nonSequence)
        new EmbeddingLayer.Builder().nIn(startInSize).nOut(startOutSize).build()
      else new EmbeddingSequenceLayer.Builder().nIn(startInSize).nOut(startOutSize).build()

    listBuilder.layer(startIndex, embeddingLayer)
    if (nonSequence) listBuilder.inputPreProcessor(startIndex, new RnnToFeedForwardPreProcessor)
    listBuilder
  }

  /**
    * (Fully Connected) Dense Layer Builder Configuration.
    *
    * Adds `nDenseLayers` dense layers at indices `startIndex`, `startIndex + 1`, ...
    * Every per-layer setting is read from its array at the layer's position; positions past the
    * end of an array reuse that array's last entry.
    * When the embedding layer type is "seq", an RnnToFeedForwardPreProcessor is attached to the
    * first dense layer.
    *
    * @param listBuilder list builder to add the layers to (updated in place)
    * @param startIndex index of the first dense layer
    * @param startInSize input size of the first dense layer
    * @return the same `listBuilder` instance
    */
  protected def denseLayerConfBuilder(listBuilder: ListBuilder,
    startIndex: Int, startInSize: Int): ListBuilder = {
    // Setting for the given layer position, falling back to the array's last entry.
    def settingAt[T](values: Array[T], position: Int): T =
      values(scala.math.min(position, values.size - 1))

    for (i <- 0 until nDenseLayers) {
      // Each layer after the first consumes the output of the previous dense layer.
      val layerInSize = if (i == 0) startInSize else settingAt(denseLayerSizes, i - 1)
      val denseLayer = new DenseLayer.Builder().
        nOut(settingAt(denseLayerSizes, i)).
        nIn(layerInSize).
        activation(settingAt(denseLayerActivations, i)).
        weightInit(settingAt(denseLayerWeightInits, i)).
        biasInit(settingAt(denseLayerBiasInits, i)).
        dropOut(settingAt(denseLayerDropOuts, i)).
        build
      listBuilder.layer(startIndex + i, denseLayer)
      if (i == 0 && embeddingLayerType == "seq")
        listBuilder.inputPreProcessor(startIndex, new RnnToFeedForwardPreProcessor)
    }
    listBuilder
  }

  /**
    * Recurrent Layer Builder Configuration.
    *
    * Delegates to the parent implementation, then attaches a FeedForwardToRnnPreProcessor to the
    * first recurrent layer when the embedding layer type is "non-seq" or at least one dense layer
    * is configured.
    *
    * @param listBuilder list builder to add the layers to (updated in place)
    * @param startIndex index of the first recurrent layer
    * @param startInSize input size of the first recurrent layer
    * @return the same `listBuilder` instance
    */
  protected override def recurLayerConfBuilder(listBuilder: ListBuilder,
    startIndex: Int, startInSize: Int): ListBuilder = {
    super.recurLayerConfBuilder(listBuilder, startIndex, startInSize)
    if (embeddingLayerType == "non-seq" || nDenseLayers > 0)
      listBuilder.inputPreProcessor(startIndex, new FeedForwardToRnnPreProcessor)
    listBuilder
  }

  /**
    * Override Layer Configuration Builder.
    *
    * Layer order: embedding (index 0), dense layers (from index 1), recurrent layers
    * (from index 1 + nDenseLayers), output layer (index 1 + nDenseLayers + nRecurLayers).
    *
    * @param confBuilder network configuration builder
    * @return list builder holding all layers
    */
  protected override def layerConfBuilder(confBuilder: NNConfBuilder): ListBuilder = {
    val withEmbedding = embeddingLayerConfBuilder(
      baseLayerConfBuilder(confBuilder.list),
      startIndex = 0,
      startInSize = inputSize,
      startOutSize = embeddingLayerSize
    )
    val withDense = denseLayerConfBuilder(
      withEmbedding,
      startIndex = 1,  // on top of embedding layer
      startInSize = embeddingLayerSize
    )
    val withRecurrent = recurLayerConfBuilder(
      withDense,
      startIndex = 1 + nDenseLayers,
      // the recurrent stack reads from the last dense layer, or from the embedding if there is none
      startInSize =
        if (nDenseLayers > 0)
          denseLayerSizes(scala.math.min(nDenseLayers - 1, denseLayerSizes.size - 1))
        else embeddingLayerSize
    )
    outputLayerConfBuilder(
      withRecurrent,
      index = 1 + nDenseLayers + nRecurLayers,
      nIn = recurLayerSizes(
        scala.math.min(nRecurLayers - 1, recurLayerSizes.size - 1)
      )
    )
  }

}
