Module nujo.autodiff
nujo's core Reverse-mode Automatic Differentiation module
Expand source code
''' nujo's core Reverse-mode Automatic Differentiation module
'''
from nujo.autodiff.function import Function
from nujo.autodiff.modes import no_diff
from nujo.autodiff.tensor import Tensor
__all__ = [
'Function',
'no_diff',
'Tensor',
]
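For orientation, here is a minimal usage sketch of the three exported names (illustrative only; printed values and exact shapes depend on the concrete function implementations):

import nujo.autodiff as nj

# Tensors that opt in for differentiation (diff=True) accumulate gradients.
x = nj.Tensor([[1., 2.], [3., 4.]], diff=True, name='x')
w = nj.Tensor([[0.5], [0.5]], diff=True, name='w')

y = x @ w      # applying an operator builds the computation graph dynamically
y.backward()   # traverses the graph and fills `.grad` on differentiable tensors

print(w.grad.value)

# Inside a no_diff block no graph is built and no gradients are recorded.
with nj.no_diff():
    y_eval = x @ w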
Sub-modules
nujo.autodiff.function
nujo.autodiff.modes
nujo.autodiff.tensor
Classes
class Function (*children: Union[nujo.autodiff.tensor.Tensor, numpy.ndarray, List[numbers.Number], numbers.Number], **kwargs)
-
Base Class for functions
Functions are applied to tensors. They take multiple tensors as input and produce a single tensor as output. They do NOT change tensors in-place.
Functions are also written to reuse the input/output tensors whenever possible, which results in a computation graph that is "dynamically defined, statically evaluated", taking the best from both worlds.
Parameters:
- children : varargs, the input tensors
Expand source code
class Function(_Node, metaclass=_FunctionMeta):
    ''' Base Class for functions

    Functions are applied to tensors. They take multiple tensors as input
    and produce a single tensor as output. They do NOT change tensors
    in-place.

    Functions are also written to reuse the input/output tensors whenever
    possible, which results in a computation graph that is:
     - "Dynamically defined, statically evaluated."
    taking the best from both worlds.

    Parameters:
    -----------
     - children : varargs, the input tensors

    '''

    _func_children_lookup_cache: Dict[str, 'Function'] = {}
    ''' Cache used to look up functions that may have already been defined
    in the computation graph.

     - key : hash(FuncType) + (children's identifiers);
       use `_get_function_identifier` to obtain a key
     - value : the already defined function which can be reused

    '''

    T = TypeVar('T', Tensor, ndarray)

    def __init__(self, *children: Union[Tensor, ndarray, List[Number],
                                        Number]):
        super(Function, self).__init__(*_parse_inputs(children),
                                       name=self.__class__.__name__)

        # This output placeholder is reused when possible
        self._output_placeholder = Tensor(
            None,
            diff=any(x.diff for x in self.children) and modes.DIFF_ENABLED,
            creator=self if modes.DIFF_ENABLED else None,
            name=self._generate_tensor_name())

        if modes.DIFF_ENABLED:  # If graph building is enabled.
            # Allocate space for parent's output (output placeholder)
            for child in self.children:
                child.parents_outputs.append(self._output_placeholder)

    def __repr__(self):
        return super(Function, self).__repr__() + f'#{self.id}'

    def _generate_tensor_name(self) -> str:
        return 'Z' + self.__repr__()

    @abstractmethod
    def forward(self) -> ndarray:
        ''' Implement forward pass of the function here.

        Use the `self.children` list to access the inputs.

        '''
        pass

    @abstractmethod
    def backward(self, idx: int, accum_grad: T) -> T:
        ''' Implement backward pass of the function here.

        Compute the gradient of children[idx] w.r.t. the output of the
        computation graph from the accumulated gradient (the gradient of
        the output of the function w.r.t. the output of the graph).

        Parameters:
        -----------
         - idx : int, the index of the child for which to compute the
           gradient w.r.t. the output of the computation graph
         - accum_grad : T (Tensor or ndarray), the accumulated gradient in
           the graph so far; you can think of it as the gradient of the
           output of the function w.r.t. the output of the graph.
            - `accum_grad` is a Tensor if differentiation is enabled
              (`DIFF_ENABLED`) and the child has opted for differentiation
              (`diff` is True); the computations are then recorded in the
              computation graph and higher-order derivatives can be
              computed.
            - otherwise, `accum_grad` is an ndarray and the computations
              are not recorded; ndarrays are used since computations with
              them are more efficient.

        Returns:
        --------
         - grad : T (Tensor or ndarray), the computed gradient of
           `self.children[idx]`

        '''
        pass

    def __call__(self) -> Tensor:
        ''' Executes cached forward pass '''

        # Forward pass
        self._output_placeholder.value = self.forward()
        return self._output_placeholder
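As an illustration, a custom function can be defined by subclassing Function and implementing forward and backward. The `_Square` class below is a hypothetical sketch, not part of nujo; it assumes the children are Tensors whose raw data is available via `.value`, as the source above suggests:

from numpy import ndarray

from nujo.autodiff import Function


class _Square(Function):
    ''' f(x) = x ** 2, applied element-wise (illustrative example) '''

    def forward(self) -> ndarray:
        # `self.children` holds the input Tensors; `.value` is the raw ndarray
        return self.children[0].value ** 2

    def backward(self, idx, accum_grad):
        # d(x ** 2) / dx = 2x; chain it with the accumulated gradient.
        # `accum_grad` may be a Tensor or an ndarray (see `backward` below),
        # so it is kept on the left-hand side of the multiplication.
        return accum_grad * (2 * self.children[0].value)


# Calling the function instance executes the cached forward pass:
# y = _Square(x)()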
Ancestors
- nujo.autodiff._node._Node
Subclasses
- nujo.autodiff._functions._activations._BinaryStep
- nujo.autodiff._functions._activations._LeakyReLU
- nujo.autodiff._functions._activations._ReLU
- nujo.autodiff._functions._activations._Sigmoid
- nujo.autodiff._functions._activations._Softmax
- nujo.autodiff._functions._activations._Swish
- nujo.autodiff._functions._activations._TanH
- nujo.autodiff._functions._aggregate._InnerProd
- nujo.autodiff._functions._aggregate._InnerSum
- nujo.autodiff._functions._elementary._Addition
- nujo.autodiff._functions._elementary._Logarithm
- nujo.autodiff._functions._elementary._MatrixMul
- nujo.autodiff._functions._elementary._Multiplication
- nujo.autodiff._functions._elementary._Negation
- nujo.autodiff._functions._elementary._Power
- nujo.autodiff._functions._elementary._Reciprocal
- nujo.autodiff._functions._transform._ConstPad
- nujo.autodiff._functions._transform._Im2col
- nujo.autodiff._functions._transform._Reshape
- nujo.autodiff._functions._transform._Transpose
Class variables
var T
Methods
def backward(self, idx: int, accum_grad: ~T) -> ~T
-
Implement backward pass of the function here.
Compute the gradient of children[idx] w.r.t. the output of the computation graph from the accumulated gradient (the gradient of the output of the function w.r.t. the output of the graph).
Parameters:
- idx : int, the index of the child for which to compute the gradient w.r.t. the output of the computation graph
- accum_grad : T (Tensor or ndarray), the accumulated gradient in the graph so far; you can think of it as the gradient of the output of the function w.r.t. the output of the graph.
  - accum_grad is a Tensor if differentiation is enabled (DIFF_ENABLED) and the child has opted for differentiation (diff is True); the computations are then recorded in the computation graph and higher-order derivatives can be computed.
  - otherwise, accum_grad is an ndarray and the computations are not recorded; ndarrays are used since computations with them are more efficient.
Returns:
- grad : T (Tensor or ndarray), the computed gradient of self.children[idx]
Expand source code
@abstractmethod
def backward(self, idx: int, accum_grad: T) -> T:
    ''' Implement backward pass of the function here.

    Compute the gradient of children[idx] w.r.t. the output of the
    computation graph from the accumulated gradient (the gradient of the
    output of the function w.r.t. the output of the graph).

    Parameters:
    -----------
     - idx : int, the index of the child for which to compute the gradient
       w.r.t. the output of the computation graph
     - accum_grad : T (Tensor or ndarray), the accumulated gradient in the
       graph so far; you can think of it as the gradient of the output of
       the function w.r.t. the output of the graph.
        - `accum_grad` is a Tensor if differentiation is enabled
          (`DIFF_ENABLED`) and the child has opted for differentiation
          (`diff` is True); the computations are then recorded in the
          computation graph and higher-order derivatives can be computed.
        - otherwise, `accum_grad` is an ndarray and the computations are
          not recorded; ndarrays are used since computations with them are
          more efficient.

    Returns:
    --------
     - grad : T (Tensor or ndarray), the computed gradient of
       `self.children[idx]`

    '''
    pass
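For a function with two inputs, `idx` selects which child's gradient is requested. A hedged sketch of what a multiplication-like backward could return (hypothetical `_Mul`, not nujo's actual `_Multiplication`):

def backward(self, idx, accum_grad):
    # d(a * b)/da = b  and  d(a * b)/db = a
    other = self.children[1 - idx].value
    return accum_grad * other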
def forward(self) -> numpy.ndarray
-
Implement forward pass of the function here.
Use the self.children list to access the inputs.
Expand source code
@abstractmethod
def forward(self) -> ndarray:
    ''' Implement forward pass of the function here.

    Use the `self.children` list to access the inputs.

    '''
    pass
class Tensor (value: Union[ForwardRef('Tensor'), numpy.ndarray, List[numbers.Number], numbers.Number], diff=False, creator=None, name='Tensor')
-
Tensor - a multi-dimensional array
Tensors are the main units of data in nujo. They "flow" in the computation graph. :)
Tensors can be either constants or trainable weights, depending on whether gradients are computed for the given tensor.
Parameters:
- value : value, numerical value of the tensor
- diff : boolean, whether to compute gradients for the tensor
- creator : nujo function that created this tensor; the only child of a tensor
- name : string, representation of the tensor
Expand source code
class Tensor(_Node):
    ''' Tensor - a multi-dimensional array

    Tensors are the main units of data in nujo.
    They "flow" in the computation graph. :)

    Tensors can be either constants or trainable weights, depending on
    whether gradients are computed for the given tensor.

    Parameters:
    -----------
     - value : value, numerical value of the tensor
     - diff : boolean, whether to compute gradients for the tensor
     - creator : nujo function that created this tensor;
       the only child of a tensor
     - name : string, representation of the tensor

    '''

    def __init__(self,
                 value: Union['Tensor', ndarray, List[Number], Number],
                 diff=False,
                 creator=None,
                 name='Tensor'):

        super(Tensor, self).__init__(*_if_not_none(creator), name=name)

        self._value: ndarray = None
        self.value = value  # set value

        self.diff = diff
        self.creator = creator

        # Outputs of the functions the current tensor is input to.
        # Used for backpropagation of the gradients.
        self.parents_outputs: List['Tensor'] = []

        # Gradient of the current tensor
        self._grad: 'Tensor' = None

        # Transposed tensor cache
        self._T: 'Tensor' = None
        self._prev_value: ndarray = None

    @property
    def value(self):
        return self._value

    @value.setter
    def value(self, value: Union['Tensor', ndarray, List[Number], Number]):
        if isinstance(value, Tensor):
            self._value = value.value
        elif isinstance(value, ndarray):
            self._value = value
        else:
            self._value = array(value)

    @value.deleter
    def value(self):
        del self._value

    @property
    def grad(self) -> 'Tensor':
        if self._grad is None:
            self._grad = Tensor(empty(self._value.shape),
                                name=f'grad[{self.name}]')

        return self._grad

    # Shape and shape manipulations

    @property
    def shape(self) -> Tuple[int, ...]:
        return self._value.shape

    @property
    def T(self) -> 'Tensor':
        # Only transpose if something has changed
        if (self._value != self._prev_value).any():
            self._T = self.transpose()
            self._prev_value = self._value

        return self._T

    def transpose(self, *dims: int) -> 'Tensor':
        from nujo.autodiff._functions._transform import _Transpose
        return _Transpose(self, dims)()

    def reshape(self, *shape: int) -> 'Tensor':
        from nujo.autodiff._functions._transform import _Reshape
        return _Reshape(self, shape)()

    def squeeze(self, dim=-1) -> 'Tensor':
        if dim < 0:
            num_dims = len(self._value.shape)

            if dim < -num_dims:
                dim = num_dims
            else:
                dim += num_dims

        return self.reshape(*self._value.shape[:dim],
                            *self._value.shape[dim + 1:])

    def unsqueeze(self, dim=-1) -> 'Tensor':
        if dim < 0:
            num_dims = len(self._value.shape)

            if dim < -num_dims:
                dim = 0
            else:
                if dim == -1:
                    dim += 1
                dim += num_dims

        return self.reshape(*self._value.shape[:dim], 1,
                            *self._value.shape[dim:])

    # Gradient computation

    def _compute_grad_from(self,
                           poutput: 'Tensor') -> Union['Tensor', ndarray]:
        ''' Computes the gradient of `self` w.r.t. the output of the
        computation graph from `poutput` (using the path of computations
        from `poutput`)

        In other words, this function returns:
            (dOutput / dPoutput) * (dPoutput / dSelf)

        '''

        # Find the index of the child whose gradient should be computed
        # (a.k.a. find the index of `self` in `poutput.creator.children`)
        idx = next(i for i, v in enumerate(poutput.creator.children)
                   if v is self)

        if poutput._grad.diff:
            # Pass a diff enabled tensor to the backward call,
            # thus recording grad computations in the computation
            # graph, which enables higher-order differentiation.
            grad = poutput.creator.backward(idx, poutput._grad)

            # Check if `self` is scalar and needs to be averaged
            if self._value.shape != () and\
               self._value.shape[-1] == 1:

                # Record the mean in the computation graph
                from nujo.math.aggregate import mean
                grad = mean(grad, dim=-1, keepdim=True)

        else:
            # Do not leave a trace in the computation graph!
            # Use numpy arrays! :)
            grad = poutput.creator.backward(idx, poutput._grad._value)

            # Check if `self` is scalar and needs to be averaged
            if self._value.shape != () and\
               self._value.shape[-1] == 1:

                grad = grad.mean(axis=-1, keepdims=True)

        return grad

    def compute_grad(self) -> None:
        if modes.DIFF_ENABLED and self.diff:
            # Make sure grad is Tensor (`grad` property call) and init value
            if self._grad is None:
                self.zero_grad(propagate=False)

            # Top-parent grad
            if len(self.parents_outputs) == 0:
                self._grad._value += 1
                return

            for poutput in self.parents_outputs:
                curr_grad = self._compute_grad_from(poutput)

                if self._grad.diff:
                    # Record grad computations in the computation graph
                    self._grad += curr_grad
                else:
                    self._grad._value += curr_grad

    def zero_grad(self, propagate=True) -> None:
        self.grad._value.fill(0)

        if propagate:
            for poutput in self.parents_outputs:
                poutput.zero_grad()

    def backward(self, _debug=False) -> None:
        ''' It uses Breadth First Search to traverse the computation graph
        and compute the gradient for each differentiable Tensor in the
        graph.

        '''

        nodes_to_visit: List['Tensor'] = [self]
        if _debug:
            i = 1

        while nodes_to_visit:
            node = nodes_to_visit.pop()
            node.compute_grad()

            if _debug:
                nstr = f' [{i}]'
                node.name += nstr if nstr not in node.name else ''
                i += 1

            if node.creator:
                for child in node.creator.children:
                    # Avoid visiting the same node twice
                    if all(child is not node for node in nodes_to_visit):
                        nodes_to_visit.insert(0, child)

    # Useful methods

    def all(self) -> ndarray:
        return self._value.all()

    def any(self) -> ndarray:
        return self._value.any()

    def __getitem__(self, position: Union[int, Tuple[int, ...]]):
        return Tensor(self._value[position],
                      diff=self.diff,
                      creator=self.creator,
                      name=f'{self.name}[{position}]')

    def __setitem__(self, position: Union[int, Tuple[int, ...]],
                    value: Union['Tensor', ndarray, List[Number], Number]):
        # TODO: This is a naive implementation. Fix it.
        self._value[position] = value

    def __hash__(self):
        return self.id

    # Static evaluation operator

    def __ilshift__(
            self, other: Union['Tensor', ndarray, List[Number],
                               Number]) -> 'Tensor':
        ''' In-place assignment operator: `<<=`

        Transferring key properties from `other` to `self`.

        Essentially a shortcut for:
        >>> self.children = other.children
        >>> self.creator = other.creator
        >>> self.value = other.value
        >>> self.grad = other.grad

        '''

        self.children = getattr(other, 'children', None)
        if self.children:
            try:
                self.children.remove(self)
            except ValueError:  # self is not in children
                pass

        self.creator = getattr(other, 'creator', None)
        if self.creator:
            try:
                self.creator.children.remove(self)
            except ValueError:  # self is not in children
                pass

        self._value = getattr(other, 'value', other)

        # Transfer the gradient
        self._grad = getattr(other, 'grad', None)

        return self

    # Comparison operations

    def __lt__(self, other):
        return self._value < getattr(other, 'value', other)

    def __le__(self, other):
        return self._value <= getattr(other, 'value', other)

    def __eq__(self, other):
        return self._value == getattr(other, 'value', other)

    def __ne__(self, other):
        return self._value != getattr(other, 'value', other)

    def __gt__(self, other):
        return self._value > getattr(other, 'value', other)

    def __ge__(self, other):
        return self._value >= getattr(other, 'value', other)

    # Arithmetic operations

    def __add__(self, other):
        from nujo.autodiff._functions._elementary import _Addition
        return _Addition(self, other)()

    def __radd__(self, other):
        return self.__add__(other)

    def __neg__(self):
        from nujo.autodiff._functions._elementary import _Negation
        return _Negation(self)()

    def __sub__(self, other):
        return self.__add__(other.__neg__())

    def __rsub__(self, other):
        return self.__neg__().__add__(other)

    def __mul__(self, other):
        from nujo.autodiff._functions._elementary import _Multiplication
        return _Multiplication(self, other)()

    def __rmul__(self, other):
        return self.__mul__(other)

    def __truediv__(self, other):
        from nujo.autodiff._functions._elementary import _Reciprocal
        return self.__mul__(_Reciprocal(other)())

    def __rtruediv__(self, other):
        from nujo.autodiff._functions._elementary import _Reciprocal
        return _Reciprocal(self)().__mul__(other)

    def __pow__(self, other):
        from nujo.autodiff._functions._elementary import _Power
        return _Power(self, other)()

    def __rpow__(self, other):
        from nujo.autodiff._functions._elementary import _Power
        return _Power(other, self)()

    # More complex arithmetic operations

    def __matmul__(self, other):
        from nujo.autodiff._functions._elementary import _MatrixMul
        return _MatrixMul(self, other)()

    def __rmatmul__(self, other):
        from nujo.autodiff._functions._elementary import _MatrixMul
        return _MatrixMul(other, self)()

    # Representations

    def __str__(self):
        # TODO: Come up with a better representation
        return self.__repr__() + '\n' + '-' * 32 + '\n' + str(self._value)
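A short usage sketch of the Tensor API defined above (values shown are illustrative; operator behaviour ultimately depends on the underlying function implementations):

from nujo.autodiff import Tensor

a = Tensor([[1., 2.], [3., 4.]], diff=True, name='a')
b = Tensor(2., name='b')    # scalars, lists and ndarrays are wrapped into numpy arrays

c = a * b + 1               # every operator call builds a Function node in the graph
print(c.shape)              # delegated to the underlying ndarray
print(c.value)              # the raw numpy value

# `<<=` is the static (in-place) evaluation operator: it transfers the
# children, creator, value and grad of the right-hand side onto `c`.
c <<= a @ Tensor([[1.], [1.]])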
Ancestors
- nujo.autodiff._node._Node
Instance variables
var T : Tensor
-
Expand source code
@property
def T(self) -> 'Tensor':
    # Only transpose if something has changed
    if (self._value != self._prev_value).any():
        self._T = self.transpose()
        self._prev_value = self._value

    return self._T
var grad : Tensor
-
Expand source code
@property
def grad(self) -> 'Tensor':
    if self._grad is None:
        self._grad = Tensor(empty(self._value.shape),
                            name=f'grad[{self.name}]')

    return self._grad
var shape : Tuple[int, ...]
-
Expand source code
@property
def shape(self) -> Tuple[int, ...]:
    return self._value.shape
var value
-
Expand source code
@property
def value(self):
    return self._value
Methods
def all(self) -> numpy.ndarray
-
Expand source code
def all(self) -> ndarray:
    return self._value.all()
def any(self) -> numpy.ndarray
-
Expand source code
def any(self) -> ndarray:
    return self._value.any()
def backward(self) -> NoneType
-
It uses Breadth First Search to traverse the computation graph and compute the gradient for each differentiable Tensor in the graph.
Expand source code
def backward(self, _debug=False) -> None:
    ''' It uses Breadth First Search to traverse the computation graph
    and compute the gradient for each differentiable Tensor in the graph.

    '''

    nodes_to_visit: List['Tensor'] = [self]
    if _debug:
        i = 1

    while nodes_to_visit:
        node = nodes_to_visit.pop()
        node.compute_grad()

        if _debug:
            nstr = f' [{i}]'
            node.name += nstr if nstr not in node.name else ''
            i += 1

        if node.creator:
            for child in node.creator.children:
                # Avoid visiting the same node twice
                if all(child is not node for node in nodes_to_visit):
                    nodes_to_visit.insert(0, child)
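For example, a hedged sketch of a full backward pass (the gradient value is what the chain rule predicts; exact numeric formatting may differ):

from nujo.autodiff import Tensor

x = Tensor(3., diff=True, name='x')
y = x * x + 1          # y = x^2 + 1
y.backward()

print(x.grad.value)    # expected to hold dy/dx = 2 * x = 6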
def compute_grad(self) -> NoneType
-
Expand source code
def compute_grad(self) -> None:
    if modes.DIFF_ENABLED and self.diff:
        # Make sure grad is Tensor (`grad` property call) and init value
        if self._grad is None:
            self.zero_grad(propagate=False)

        # Top-parent grad
        if len(self.parents_outputs) == 0:
            self._grad._value += 1
            return

        for poutput in self.parents_outputs:
            curr_grad = self._compute_grad_from(poutput)

            if self._grad.diff:
                # Record grad computations in the computation graph
                self._grad += curr_grad
            else:
                self._grad._value += curr_grad
def reshape(self, *shape: int) -> Tensor
-
Expand source code
def reshape(self, *shape: int) -> 'Tensor':
    from nujo.autodiff._functions._transform import _Reshape
    return _Reshape(self, shape)()
def squeeze(self, dim=-1) -> Tensor
-
Expand source code
def squeeze(self, dim=-1) -> 'Tensor':
    if dim < 0:
        num_dims = len(self._value.shape)

        if dim < -num_dims:
            dim = num_dims
        else:
            dim += num_dims

    return self.reshape(*self._value.shape[:dim],
                        *self._value.shape[dim + 1:])
def transpose(self, *dims: int) -> Tensor
-
Expand source code
def transpose(self, *dims: int) -> 'Tensor':
    from nujo.autodiff._functions._transform import _Transpose
    return _Transpose(self, dims)()
def unsqueeze(self, dim=-1) -> Tensor
-
Expand source code
def unsqueeze(self, dim=-1) -> 'Tensor':
    if dim < 0:
        num_dims = len(self._value.shape)

        if dim < -num_dims:
            dim = 0
        else:
            if dim == -1:
                dim += 1
            dim += num_dims

    return self.reshape(*self._value.shape[:dim], 1,
                        *self._value.shape[dim:])
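The shape-manipulation helpers compose as in the sketch below (expected shapes, assuming numpy-style semantics in the underlying _Reshape/_Transpose functions):

from nujo.autodiff import Tensor

t = Tensor([[1., 2., 3., 4.]])          # shape (1, 4)

print(t.reshape(2, 2).shape)            # (2, 2)
print(t.transpose().shape)              # (4, 1); also available, cached, as `t.T`
print(t.transpose().squeeze().shape)    # (4,)     -- drops the trailing dim of size 1
print(t.unsqueeze().shape)              # (1, 4, 1) -- inserts a new trailing dim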
def zero_grad(self, propagate=True) -> NoneType
-
Expand source code
def zero_grad(self, propagate=True) -> None:
    self.grad._value.fill(0)

    if propagate:
        for poutput in self.parents_outputs:
            poutput.zero_grad()
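In a training loop, zero_grad is typically called after each parameter update so gradients do not accumulate across iterations. A rough, hypothetical sketch (the learning rate and data are made up; graph reuse between iterations is handled by the Function cache):

from nujo.autodiff import Tensor

w = Tensor([[0.1], [0.1]], diff=True, name='w')
x = Tensor([[1., 2.], [3., 4.]])
target = Tensor([[1.], [2.]])

for _ in range(10):
    loss = (x @ w - target) ** 2     # forward pass
    loss.backward()                  # accumulate gradients into `w.grad`

    # Plain gradient-descent step on the raw values (not recorded in the graph)
    w.value = w.value - 0.01 * w.grad.value

    w.zero_grad()                    # reset before the next iteration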
class no_diff
-
No Differentiation block
Creates a block of code in which no differentiation is done, i.e. no gradients are computed for any tensor.
Expand source code
class no_diff():
    ''' No Differentiation block

    Creates a block of code in which no differentiation is done,
    i.e. no gradients are computed for any tensor.

    '''

    def __enter__(self):
        global DIFF_ENABLED
        DIFF_ENABLED = False

    def __exit__(self, type, value, traceback):
        global DIFF_ENABLED
        DIFF_ENABLED = True
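Usage sketch: wrapping evaluation code in a no_diff block, e.g. when running inference, skips graph building entirely:

from nujo.autodiff import Tensor, no_diff

x = Tensor(2., diff=True)

with no_diff():
    y = x * 3    # no computation graph is recorded inside the block,
                 # so no gradients will be accumulated for `x`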