
# -*- coding: utf-8 -*-

"""**RosenPy: An Open Source Python Framework for Complex-Valued Neural Networks**.
*Copyright © A. A. Cruz, K. S. Mayer, D. S. Arantes*.

*License*

This file is part of RosenPy.
RosenPy is an open source framework distributed under the terms of the GNU General 
Public License, as published by the Free Software Foundation, either version 3 of 
the License, or (at your option) any later version. For additional information on 
license terms, please open the Readme.md file.

RosenPy is distributed in the hope that it will be useful to every user, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. 

You should have received a copy of the GNU General Public License
along with RosenPy. If not, see <http://www.gnu.org/licenses/>.
"""

from rosenpy.utils import act_func, init_func

class Optimizer:
    """
    Base class for all optimizers used in the neural network.

    This class defines common parameters and methods that can be used by all
    derived optimizers.
    """

    def __init__(self, beta=100, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Initializes the optimizer with default hyperparameters.

        Parameters:
        -----------
        beta : float, optional
            Smoothing constant for the softplus denominator used by SAMSGrad.
            Default is 100.
        beta1 : float, optional
            Exponential decay rate for the first moment estimates. Default is 0.9.
        beta2 : float, optional
            Exponential decay rate for the second moment estimates. Default is 0.999.
        epsilon : float, optional
            A small constant added to prevent division by zero. Default is 1e-8.
        """
        self.beta = beta
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.vt = None
        self.ut = None
        self.xp = None
        self.optimizer = None

    def set_module(self, xp):
        """
        Sets the backend module (NumPy or CuPy) for matrix operations.

        Parameters:
        -----------
        xp : module
            The backend module (NumPy or CuPy).
        """
        self.xp = xp

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters of the neural network based on the gradients.

        This is a placeholder method that must be implemented by subclasses.

        Parameters:
        -----------
        parameters : tuple
            The parameters of the neural network.
        gradients : tuple
            The gradients of the loss function with respect to the parameters.
        learning_rate : tuple
            The learning rates for updating the parameters.
        epoch : int
            The current epoch number.
        mt : tuple
            The first moment estimates.
        vt : tuple
            The second moment estimates.
        ut : tuple
            The third moment estimates.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        raise NotImplementedError("Subclasses must implement update_parameters method.")
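
# A minimal sketch (illustrative, not part of the original module) of how
# ``set_module`` decouples the optimizers from their numerical backend: the
# same instance runs on NumPy here and would run on the GPU by passing
# ``cupy`` instead.  The ImportError fallback is an assumption for machines
# without CuPy installed.
def _demo_set_module():
    try:
        import cupy as xp  # optional GPU backend
    except ImportError:
        import numpy as xp  # CPU fallback
    opt = Optimizer()
    opt.set_module(xp)  # later calls resolve xp.sqrt, xp.abs, etc. on this module
    return opt.xp.__name__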

class GradientDescent(Optimizer):
    """
    Gradient Descent optimizer.

    This class implements the standard gradient descent optimization algorithm.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the gradient descent optimizer.

        Parameters:
        -----------
        parameters : tuple
            The parameters of the neural network.
        gradients : tuple
            The gradients of the loss function with respect to the parameters.
        learning_rate : tuple
            The learning rates for updating the parameters.
        epoch : int
            The current epoch number.
        mt : tuple
            The first moment estimates (not used in this optimizer).
        vt : tuple
            The second moment estimates (not used in this optimizer).
        ut : tuple
            The third moment estimates (not used in this optimizer).

        Returns:
        --------
        tuple
            The updated parameters, followed by the unchanged moment estimates.
        """
        return tuple(p + lr * g for p, g, lr in zip(parameters, gradients, learning_rate)) + (mt, vt, ut)
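
# A minimal usage sketch (illustrative; shapes, values, and the zero moment
# buffers are assumptions, not RosenPy data).  ``update_parameters`` takes one
# entry per parameter tensor and returns the updated tensors followed by the
# three moment buffers, which plain gradient descent passes through untouched.
# Note the rule is ``p + lr * g``, so the caller is presumably expected to
# supply gradients that already point in the descent direction.
def _demo_gradient_descent():
    import numpy as np
    opt = GradientDescent()
    opt.set_module(np)
    weights = (np.ones((2, 2)), np.zeros(2))              # toy parameters
    grads = (-0.1 * np.ones((2, 2)), -0.1 * np.ones(2))   # toy gradients
    lrs = (0.5, 0.5)                                      # one rate per tensor
    zeros = tuple(np.zeros_like(w) for w in weights)      # unused buffers
    out = opt.update_parameters(weights, grads, lrs, 1, zeros, zeros, zeros)
    new_weights, mt, vt, ut = out[:-3], out[-3], out[-2], out[-1]
    return new_weights  # each tensor moved by lr * grad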

class Adam(Optimizer):
    """
    Adam optimizer.

    This class implements the Adam optimization algorithm, an adaptive
    learning rate method.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the Adam optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt = [], [], []
        for p, g, lr, m, v in zip(parameters, gradients, learning_rate, mt, vt):
            m = self.beta1 * m + (1 - self.beta1) * g
            v = self.beta2 * v + (1 - self.beta2) * (self.xp.abs(g) ** 2)
            mc = m / (1 - self.beta1 ** epoch)
            vc = v / (1 - self.beta2 ** epoch)
            updated_parameters.append(p + lr * (mc / (self.xp.sqrt(vc) + self.epsilon)))
            updated_mt.append(m)
            updated_vt.append(v)
        return tuple(updated_parameters + [updated_mt, updated_vt, ut])
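
# Worked sketch (illustrative): with fresh zero moments at ``epoch=1`` the
# bias corrections cancel exactly (mc = g, vc = |g|**2), so Adam's first step
# is roughly lr * g / (|g| + eps) -- a near unit-magnitude step scaled by the
# learning rate, regardless of the raw gradient's size.  Values below are
# demo assumptions.
def _demo_adam_first_step():
    import numpy as np
    opt = Adam()
    opt.set_module(np)
    zeros = (np.zeros(1),)
    out = opt.update_parameters((np.zeros(1),), (np.array([1e-3]),),
                                (0.01,), 1, zeros, zeros, zeros)
    return out[0]  # ~[0.01]: the learning rate, not the gradient, sets the scale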

class CVAdam(Optimizer):
    """
    Complex-Valued Adam optimizer.

    This class implements the complex-valued version of the Adam
    optimization algorithm.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the complex-valued Adam optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt = [], [], []
        for p, g, lr, m, v in zip(parameters, gradients, learning_rate, mt, vt):
            m = self.beta1 * m + (1 - self.beta1) * g
            v = self.beta2 * v + (1 - self.beta2) * (self.xp.real(g) ** 2 + 1j * self.xp.imag(g) ** 2)
            mc = m / (1 - self.beta1 ** epoch)
            vc = v / (1 - self.beta2 ** epoch)
            r = self.xp.real(mc) / (self.xp.sqrt(self.xp.real(vc)) + self.epsilon)
            i = self.xp.imag(mc) / (self.xp.sqrt(self.xp.imag(vc)) + self.epsilon)
            updated_parameters.append(p + lr * (r + 1j * i))
            updated_mt.append(m)
            updated_vt.append(v)
        return tuple(updated_parameters + [updated_mt, updated_vt, ut])
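
# Illustrative sketch: CVAdam packs separate second-moment statistics for the
# real and imaginary parts into one complex array (``real**2 + 1j * imag**2``),
# so the two parts of a complex gradient are normalised independently.  Values
# below are demo assumptions.
def _demo_cv_adam_split_moments():
    import numpy as np
    opt = CVAdam()
    opt.set_module(np)
    zeros = (np.zeros(1, dtype=complex),)
    g = (np.array([1e-3 + 1.0j]),)  # tiny real part, large imaginary part
    out = opt.update_parameters((np.zeros(1, dtype=complex),), g,
                                (0.01,), 1, zeros, zeros, zeros)
    return out[0]  # ~[0.01 + 0.01j]: each part takes a near lr-sized step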

class AMSGrad(Optimizer):
    """
    AMSGrad optimizer.

    This class implements the AMSGrad optimization algorithm, a variant of
    Adam that improves convergence in certain cases by keeping track of the
    maximum past squared gradient.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the AMSGrad optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt, updated_ut = [], [], [], []
        for p, g, lr, m, v, u in zip(parameters, gradients, learning_rate, mt, vt, ut):
            m = self.beta1 * m + (1 - self.beta1) * g
            v = self.beta2 * v + (1 - self.beta2) * (self.xp.abs(g) ** 2)
            u = self.xp.maximum(u, v)
            updated_parameters.append(p + lr * (m / (self.xp.sqrt(u) + self.epsilon)))
            updated_mt.append(m)
            updated_vt.append(v)
            updated_ut.append(u)
        return tuple(updated_parameters + [updated_mt, updated_vt, updated_ut])
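
# Illustrative sketch: AMSGrad's ``u = max(u, v)`` means the denominator never
# shrinks, so a burst of large gradients permanently caps the effective step.
# Two consecutive calls below show ``u`` holding its step-1 maximum even after
# the gradient collapses.  Values are demo assumptions.
def _demo_amsgrad_monotone_u():
    import numpy as np
    opt = AMSGrad()
    opt.set_module(np)
    p, zeros = (np.zeros(1),), (np.zeros(1),)
    out = opt.update_parameters(p, (np.array([10.0]),), (0.1,), 1,
                                zeros, zeros, zeros)        # large gradient
    out = opt.update_parameters(out[:-3], (np.array([1e-3]),), (0.1,), 2,
                                out[-3], out[-2], out[-1])  # tiny gradient
    return out[-1][0]  # u is still ~0.1, the maximum reached at step 1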

class SAMSGrad(Optimizer):
    """
    SAMSGrad optimizer.

    This class implements a smoothed variant of the AMSGrad optimization
    algorithm in which the square-root denominator is replaced by a softplus
    function controlled by the beta hyperparameter.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the SAMSGrad optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt, updated_ut = [], [], [], []
        for p, g, lr, m, v, u in zip(parameters, gradients, learning_rate, mt, vt, ut):
            m = self.beta1 * m + (1 - self.beta1) * g
            v = self.beta2 * v + (1 - self.beta2) * (self.xp.abs(g) ** 2)
            u = self.xp.maximum(self.xp.abs(u), self.xp.abs(v))
            updated_parameters.append(p + lr * (m / ((1 / self.beta) * self.xp.log(1 + self.xp.exp(self.beta * self.xp.sqrt(u))))))
            updated_mt.append(m)
            updated_vt.append(v)
            updated_ut.append(u)
        return tuple(updated_parameters + [updated_mt, updated_vt, updated_ut])
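
# Illustrative sketch: SAMSGrad swaps AMSGrad's ``sqrt(u) + eps`` denominator
# for the softplus (1/beta) * log(1 + exp(beta * sqrt(u))).  For sizeable u
# the two agree closely, but as u -> 0 the softplus levels off at
# log(2)/beta (~0.0069 for beta=100), bounding the step without epsilon.
# The comparison below uses assumed values.
def _demo_samsgrad_denominator():
    import numpy as np
    beta = 100.0
    u = np.array([1e-12, 1.0])
    softplus = (1.0 / beta) * np.log(1.0 + np.exp(beta * np.sqrt(u)))
    return np.sqrt(u), softplus  # (1e-6, 1.0) vs. (~0.0069, ~1.0)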

class CVAMSGrad(Optimizer):
    """
    Complex-Valued AMSGrad optimizer.

    This class implements the complex-valued version of the AMSGrad
    optimization algorithm, tracking the maximum past squared gradient
    separately for the real and imaginary parts.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the complex-valued AMSGrad optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt, updated_ut = [], [], [], []
        for p, g, lr, m, v, u in zip(parameters, gradients, learning_rate, mt, vt, ut):
            m = self.beta1 * m + (1 - self.beta1) * g
            v = self.beta2 * v + (1 - self.beta2) * (self.xp.real(g) ** 2 + 1j * self.xp.imag(g) ** 2)
            u = (self.xp.maximum(self.xp.abs(self.xp.real(u)), self.xp.abs(self.xp.real(v)))
                 + 1j * self.xp.maximum(self.xp.abs(self.xp.imag(u)), self.xp.abs(self.xp.imag(v))))
            real_part = self.xp.real(m) / (self.xp.sqrt(self.xp.real(u)) + self.epsilon)
            imag_part = self.xp.imag(m) / (self.xp.sqrt(self.xp.imag(u)) + self.epsilon)
            updated_parameters.append(p + lr * (real_part + 1j * imag_part))
            updated_mt.append(m)
            updated_vt.append(v)
            updated_ut.append(u)
        return tuple(updated_parameters + [updated_mt, updated_vt, updated_ut])

class Adamax(Optimizer):
    """
    Adamax optimizer.

    This class implements the Adamax optimization algorithm, a variant of
    Adam based on the infinity norm of the gradients.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the Adamax optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt = [], [], []
        for p, g, lr, m, v in zip(parameters, gradients, learning_rate, mt, vt):
            m = self.beta1 * m + (1 - self.beta1) * g
            v = self.xp.maximum(self.beta2 * v, self.xp.abs(g))
            updated_parameters.append(p + (lr / (1 - self.beta1 ** epoch)) * m / (v + self.epsilon))
            updated_mt.append(m)
            updated_vt.append(v)
        return tuple(updated_parameters + [updated_mt, updated_vt, ut])
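
# Illustrative sketch: Adamax tracks an exponentially decayed infinity norm,
# ``v = max(beta2 * v, |g|)``, instead of a mean of squared gradients, so the
# update needs no square root or second-moment bias correction.  Values are
# demo assumptions.
def _demo_adamax_inf_norm():
    import numpy as np
    opt = Adamax()
    opt.set_module(np)
    zeros = (np.zeros(1),)
    out = opt.update_parameters((np.zeros(1),), (np.array([4.0]),),
                                (0.01,), 1, zeros, zeros, zeros)
    return out[-2][0]  # v == |g| == 4.0 after the first step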

class CVAdamax(Optimizer):
    """
    Complex-Valued Adamax optimizer.

    This class implements the complex-valued version of the Adamax
    optimization algorithm, applying the infinity-norm update separately to
    the real and imaginary parts.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the complex-valued Adamax optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt = [], [], []
        for p, g, lr, m, v in zip(parameters, gradients, learning_rate, mt, vt):
            m = self.beta1 * m + (1 - self.beta1) * g
            v_real = self.xp.maximum(self.beta2 * self.xp.real(v), self.xp.abs(self.xp.real(g)))
            v_imag = self.xp.maximum(self.beta2 * self.xp.imag(v), self.xp.abs(self.xp.imag(g)))
            v = v_real + 1j * v_imag
            real_part = self.xp.real(m) / (self.xp.real(v) + self.epsilon)
            imag_part = self.xp.imag(m) / (self.xp.imag(v) + self.epsilon)
            updated_parameters.append(p + (lr / (1 - self.beta1 ** epoch)) * (real_part + 1j * imag_part))
            updated_mt.append(m)
            updated_vt.append(v)
        return tuple(updated_parameters + [updated_mt, updated_vt, ut])

class CVAdaGrad(Optimizer):
    """
    Complex-Valued AdaGrad optimizer.

    This class implements the complex-valued version of the AdaGrad
    optimization algorithm, accumulating squared gradients separately for the
    real and imaginary parts.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the complex-valued AdaGrad optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt = [], []
        for p, lr, g, m in zip(parameters, learning_rate, gradients, mt):
            m = m + (self.xp.real(g) ** 2 + 1j * self.xp.imag(g) ** 2)
            real_part = self.xp.real(g) / self.xp.sqrt(self.xp.real(m) + self.epsilon)
            imag_part = self.xp.imag(g) / self.xp.sqrt(self.xp.imag(m) + self.epsilon)
            updated_parameters.append(p + lr * (real_part + 1j * imag_part))
            updated_mt.append(m)
        return tuple(updated_parameters + [updated_mt, vt, ut])

class AdaGrad(Optimizer):
    """
    AdaGrad optimizer.

    This class implements the AdaGrad optimization algorithm, which adapts
    the learning rate by accumulating the squared gradient magnitudes.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the AdaGrad optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt = [], []
        for p, lr, g, m in zip(parameters, learning_rate, gradients, mt):
            m = m + (self.xp.abs(g) ** 2)
            updated_parameters.append(p + lr * (g / (self.xp.sqrt(m) + self.epsilon)))
            updated_mt.append(m)
        return tuple(updated_parameters + [updated_mt, vt, ut])
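
# Illustrative sketch: AdaGrad accumulates squared gradient magnitudes in
# ``m`` with no decay, so the effective step lr / sqrt(m) can only shrink.
# Feeding the same gradient repeatedly shows the per-step movement decaying
# as 1/sqrt(t).  Values are demo assumptions.
def _demo_adagrad_decay():
    import numpy as np
    opt = AdaGrad()
    opt.set_module(np)
    p, m, unused = (np.zeros(1),), (np.zeros(1),), (np.zeros(1),)
    steps = []
    for epoch in range(1, 4):
        out = opt.update_parameters(p, (np.ones(1),), (0.1,), epoch,
                                    m, unused, unused)
        steps.append(float(out[0][0] - p[0][0]))
        p, m = out[:-3], out[-3]
    return steps  # ~[0.1, 0.0707, 0.0577]: shrinking steps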

class RMSprop(Optimizer):
    """
    RMSprop optimizer.

    This class implements the RMSprop optimization algorithm, which
    normalizes each update by an exponential moving average of squared
    gradient magnitudes (decayed here by beta1).
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the RMSprop optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt = [], []
        for p, g, lr, m in zip(parameters, gradients, learning_rate, mt):
            m = m * self.beta1 + (1 - self.beta1) * self.xp.abs(g) ** 2
            updated_parameters.append(p + lr * g / (self.xp.sqrt(m) + self.epsilon))
            updated_mt.append(m)
        return tuple(updated_parameters + [updated_mt, vt, ut])

class CVRMSprop(Optimizer):
    """
    Complex-Valued RMSprop optimizer.

    This class implements the complex-valued version of the RMSprop
    optimization algorithm, keeping separate moving averages for the real and
    imaginary parts of the squared gradients.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the complex-valued RMSprop optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt = [], []
        for p, g, lr, m in zip(parameters, gradients, learning_rate, mt):
            m = m * self.beta1 + (1 - self.beta1) * (self.xp.real(g) ** 2 + 1j * self.xp.imag(g) ** 2)
            real_part = self.xp.real(g) / (self.xp.sqrt(self.xp.real(m) + self.epsilon))
            imag_part = self.xp.imag(g) / (self.xp.sqrt(self.xp.imag(m) + self.epsilon))
            updated_parameters.append(p + lr * (real_part + 1j * imag_part))
            updated_mt.append(m)
        return tuple(updated_parameters + [updated_mt, vt, ut])

class Nadam(Optimizer):
    """
    Nadam optimizer.

    This class implements the Nadam optimization algorithm, which combines
    Adam with Nesterov-style momentum in the first moment estimate.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the Nadam optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt = [], [], []
        for p, g, lr, m, v in zip(parameters, gradients, learning_rate, mt, vt):
            m = m * self.beta1 + (1 - self.beta1) * g
            v = self.beta2 * v + (1 - self.beta2) * self.xp.abs(g) ** 2
            mt_hat = (1 - self.beta1) * g / (1 - self.beta1 ** (epoch + 1)) + self.beta1 * m / (1 - self.beta1 ** epoch)
            updated_parameters.append(p + lr * mt_hat / (self.xp.sqrt(v / (1 - self.beta2 ** epoch)) + self.epsilon))
            updated_mt.append(m)
            updated_vt.append(v)
        return tuple(updated_parameters + [updated_mt, updated_vt, ut])

class CVNadam(Nadam):
    """
    Complex-Valued Nadam optimizer.

    This class implements the complex-valued version of the Nadam
    optimization algorithm.
    """

    def update_parameters(self, parameters, gradients, learning_rate, epoch, mt, vt, ut):
        """
        Updates the parameters using the complex-valued Nadam optimizer.

        Parameters:
        -----------
        Same as the parent class.

        Returns:
        --------
        tuple
            The updated parameters along with the updated moment estimates.
        """
        updated_parameters, updated_mt, updated_vt = [], [], []
        for param, grad, lr, m, v in zip(parameters, gradients, learning_rate, mt, vt):
            m = self.beta1 * m + (1 - self.beta1) * grad
            v = self.beta2 * v + (1 - self.beta2) * (self.xp.real(grad) ** 2 + 1j * self.xp.imag(grad) ** 2)
            mt_hat = (1 - self.beta1) * grad / (1 - self.beta1 ** (epoch + 1)) + self.beta1 * m / (1 - self.beta1 ** epoch)
            vc = v / (1 - self.beta2 ** epoch)
            real_update = self.xp.real(mt_hat) / (self.xp.sqrt(self.xp.real(vc)) + self.epsilon)
            imag_update = self.xp.imag(mt_hat) / (self.xp.sqrt(self.xp.imag(vc)) + self.epsilon)
            updated_parameters.append(param + lr * (real_update + 1j * imag_update))
            updated_mt.append(m)
            updated_vt.append(v)
        return tuple(updated_parameters + [updated_mt, updated_vt, ut])