Module nujo.optim.optimizers

Stochastic Gradient Descent (SGD) Optimizers

Check out the following link for more info about the optimizers: http://ruder.io/optimizing-gradient-descent/index.html
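
To make the idea concrete, here is a small, library-free NumPy sketch of the plain gradient step that SGD.update_rule implements (param - lr * grad), applied to a least-squares problem. It does not use nujo at all; it only illustrates the kind of per-parameter iteration the classes below perform.

import numpy as np

# Minimise the mean squared error of a linear model with the plain SGD rule:
#   w <- w - lr * grad
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
true_w = np.array([1.5, -2.0, 0.5])
y = X @ true_w

w = np.zeros(3)
lr = 0.01
for step in range(500):
    grad = 2 * X.T @ (X @ w - y) / len(y)  # gradient of the mean squared error
    w = w - lr * grad                      # the SGD update rule
print(w)  # close to [1.5, -2.0, 0.5]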

Expand source code
''' Stochastic Gradient Descent (SGD) Optimizers

Check out the following link for more info about the optimizers:
http://ruder.io/optimizing-gradient-descent/index.html

'''

from typing import Dict, List

from nujo.autodiff.tensor import Tensor
from nujo.init.basic import zeros_like
from nujo.math.scalar import sqrt
from nujo.optim.optimizer import Optimizer

__all__ = [
    'SGD',
    'Momentum',
    'RMSprop',
    'Adam',
]

# ====================================================================================================


class SGD(Optimizer):
    ''' SGD: Stochastic Gradient Descent

    An iterative method for optimizing an objective function by moving
    each parameter a small step against its gradient.

    Parameters:
    -----------
     - params : list of Tensors, the parameters to update
     - lr : float, the learning rate

    '''
    def __init__(self, params: List[Tensor], lr=0.005):
        super(SGD, self).__init__(params, lr)

    def update_rule(self, param: Tensor, grad: Tensor) -> Tensor:
        return param - self.lr * grad


# ====================================================================================================


class Momentum(Optimizer):
    ''' Momentum

    A method that helps accelerate SGD in the relevant direction and
    dampens oscillations. It does this by adding a fraction of the
    update vector of the past time step to the current update vector.

    Parameters:
    -----------
     - params : list of Tensors, the parameters to update
     - lr : float, the learning rate
     - beta : float, the fraction of the update vector of the past
       time step to be added to the current update vector

    '''
    def __init__(self, params: List[Tensor], lr=0.001, beta=0.9):
        super(Momentum, self).__init__(params, lr)

        self.beta = beta
        self._velocity: Dict[str, Tensor] = {}

    def update_rule(self, param: Tensor, grad: Tensor) -> Tensor:
        # Get the corresponding velocity
        key = param.name
        if key not in self._velocity:
            self._velocity[key] = zeros_like(param)

        # Exponentially Weighted Moving Average
        self._velocity[key] = self.beta * self._velocity[key] +\
            (1 - self.beta) * grad

        # Update rule
        return param - self.lr * self._velocity[key]


# ====================================================================================================


class RMSprop(Optimizer):
    ''' RMSprop

    A gradient-based optimization technique proposed by Geoffrey Hinton
    in his Neural Networks for Machine Learning course on Coursera.
    It uses a moving average
    of squared gradients to normalize the gradient itself.

    Parameters:
    -----------
     - params : list of Tensors, the parameters to update
     - lr : float, the learning rate
     - beta : float, the squared gradient coefficient
     - eps : float, added for numerical stability

    '''
    def __init__(self, params: List[Tensor], lr=0.001, beta=0.999, eps=1e-09):
        super(RMSprop, self).__init__(params, lr)

        self.beta = beta
        self.eps = eps
        self._squared: Dict[str, Tensor] = {}

    def update_rule(self, param: Tensor, grad: Tensor) -> Tensor:
        # Get the corresponding squared gradient
        key = param.name
        if key not in self._squared:
            self._squared[key] = zeros_like(param)

        # Exponentially Weighted Moving Average
        self._squared[key] = self.beta * self._squared[key] +\
            (1 - self.beta) * grad**2

        # Update rule
        return param - self.lr * grad / (sqrt(self._squared[key]) + self.eps)


# ====================================================================================================


class Adam(Optimizer):
    ''' Adam: Adaptive Moment Estimation

    A method that computes adaptive learning rates for each
    parameter by combining Momentum and RMSprop into a single
    update rule.

    Parameters:
    -----------
     - params : list of Tensors, the parameters to update
     - lr : float, the learning rate
     - betas : tuple of 2 floats, the velocity (Momentum) and
       squared gradient (RMSprop) coefficients
     - eps : float, added for numerical stability

    '''
    def __init__(self,
                 params: List[Tensor],
                 lr=0.001,
                 betas=(0.9, 0.999),
                 eps=1e-09):

        super(Adam, self).__init__(params, lr)

        self.betas = betas
        self.eps = eps

        self._velocity: Dict[str, Tensor] = {}
        self._squared: Dict[str, Tensor] = {}
        self._t = 1

    def update_rule(self, param: Tensor, grad: Tensor) -> Tensor:
        # Get the corresponding velocity and squared gradient
        key = param.name
        if key not in self._velocity:
            self._velocity[key] = zeros_like(param)
            self._squared[key] = zeros_like(param)

        # Exponentially Weighted Moving Average
        self._velocity[key] = self.betas[0]*self._velocity[key] +\
            (1 - self.betas[0]) * grad

        self._squared[key] = self.betas[1] * self._squared[key] +\
            (1 - self.betas[1]) * grad**2

        # Bias correction
        v_corrected = self._velocity[key] / (1 - self.betas[0]**self._t)
        s_corrected = self._squared[key] / (1 - self.betas[1]**self._t)
        self._t += 1

        # Update rule
        return param - self.lr * v_corrected / (sqrt(s_corrected) + self.eps)


# ====================================================================================================

Classes

class Adam (params: List[Tensor], lr=0.001, betas=(0.9, 0.999), eps=1e-09)

Adam: Adaptive Moment Estimation

A method that computes adaptive learning rates for each parameter by combining Momentum and RMSprop into a single update rule.

Parameters:

  • params : list of Tensors, the parameters to update
  • lr : float, the learning rate
  • betas : tuple of 2 floats, the velocity (Momentum) and squared gradient (RMSprop) coefficients
  • eps : float, added for numerical stability
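
Written out, the update_rule defined in the source above keeps per-parameter moving averages v (velocity) and s (squared gradients) and, with g_t the gradient of parameter θ at step t, performs:

$$v_t = \beta_1 v_{t-1} + (1 - \beta_1)\, g_t, \qquad s_t = \beta_2 s_{t-1} + (1 - \beta_2)\, g_t^2$$
$$\hat{v}_t = \frac{v_t}{1 - \beta_1^{\,t}}, \qquad \hat{s}_t = \frac{s_t}{1 - \beta_2^{\,t}}, \qquad \theta_{t+1} = \theta_t - \frac{\mathrm{lr}\cdot\hat{v}_t}{\sqrt{\hat{s}_t} + \epsilon}$$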

Ancestors

  • nujo.optim.optimizer.Optimizer

Inherited members

class Momentum (params: List[Tensor], lr=0.001, beta=0.9)

Momentum

A method that helps accelerate SGD in the relevant direction and dampens oscillations. It does this by adding a fraction of the update vector of the past time step to the current update vector.

Parameters:

  • params : list of Tensors, the parameters to update
  • lr : float, the learning rate
  • beta : float, the fraction of the update vector of the past time step to be added to the current update vector
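
Written out, the update_rule defined in the source above keeps an exponentially weighted moving average v of the gradients and steps along it (g_t is the gradient of parameter θ at step t):

$$v_t = \beta\, v_{t-1} + (1 - \beta)\, g_t, \qquad \theta_{t+1} = \theta_t - \mathrm{lr}\cdot v_t$$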

Ancestors

  • nujo.optim.optimizer.Optimizer

Inherited members

class RMSprop (params: List[Tensor], lr=0.001, beta=0.999, eps=1e-09)

RMSprop

A gradient-based optimization technique proposed by Geoffrey Hinton in his Neural Networks for Machine Learning course on Coursera. It uses a moving average of squared gradients to normalize the gradient itself.

Parameters:

  • params : list of Tensors, the parameters to update
  • lr : float, the learning rate
  • beta : float, the squared gradient coefficient
  • eps : float, added for numerical stability
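
Written out, the update_rule defined in the source above keeps a moving average s of the squared gradients and divides each step by its square root (g_t is the gradient of parameter θ at step t):

$$s_t = \beta\, s_{t-1} + (1 - \beta)\, g_t^2, \qquad \theta_{t+1} = \theta_t - \frac{\mathrm{lr}\cdot g_t}{\sqrt{s_t} + \epsilon}$$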

Ancestors

  • nujo.optim.optimizer.Optimizer

Inherited members

class SGD (params: List[Tensor], lr=0.005)

SGD: Stochastic Gradient Descent

An iterative method for optimizing an objective function by moving each parameter a small step against its gradient.

Parameters:

  • params : list of Tensors, the parameters to update
  • lr : float, the learning rate
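
The update_rule defined in the source above is the plain gradient step (g_t is the gradient of parameter θ at step t):

$$\theta_{t+1} = \theta_t - \mathrm{lr}\cdot g_t$$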

Ancestors

  • nujo.optim.optimizer.Optimizer

Inherited members