Source code for markovflow.ssm_natgrad

#
# Copyright (c) 2021 The Markovflow Contributors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Module containing a natural gradient optimiser."""

from typing import Callable, List, Optional

import tensorflow as tf

from markovflow.ssm_gaussian_transformations import (
    expectations_to_ssm_params,
    naturals_to_ssm_params,
    ssm_to_expectations,
    ssm_to_naturals,
)
from markovflow.state_space_model import StateSpaceModel
from markovflow.utils import tf_scope_class_decorator


[docs]@tf_scope_class_decorator
class SSMNaturalGradient(tf.optimizers.Optimizer):
    r"""
    Represents a natural gradient optimiser. It is also capable of updating
    parameters with momentum, as per the :class:`~tf.keras.optimizers.Adam` optimiser.

    To account for momentum we keep track of a running moving average for :math:`g̃` and
    the Fisher norm of :math:`g̃`, where :math:`g̃ = F⁻¹g` is the natural gradient. This is
    defined as the Euclidean gradient preconditioned by the inverse Fisher information matrix.

    The Fisher norm of the natural gradient is given by:

    .. math:: |g̃|_F = g̃ᵀFg̃ = gᵀF⁻¹Fg̃ = gᵀg̃

    ...which is the inner product between the natural gradient and the Euclidean gradient.

    The moving average for the natural gradient and its norm are given by:

    .. math::
        &mₖ₊₁ = β₁ mₖ + (1 - β₁)g̃ₖ\\
        &vₖ₊₁ = β₂ vₖ + (1 - β₂)|g̃|ₖ

    The final update is given by:

    .. math::
        &θ̃ₖ₊₁ = θ̃ₖ - γ mₖ / (√vₖ + ε) \verb|(in the momentum case)|\\
        &θ̃ₖ₊₁ = θ̃ₖ - γ g̃ₖ \verb|(if we don't have momentum)|
    """

    def __init__(
        self,
        gamma: float = 0.1,
        momentum: bool = True,
        beta1: float = 0.9,
        beta2: float = 0.99,
        epsilon: float = 1e-08,
        name: Optional[str] = None,
    ) -> None:
        """
        :param gamma: The learning rate of the optimiser.
        :param momentum: Whether to update with momentum or not.
        :param beta1: The momentum parameter for the moving average of the natural gradient.
        :param beta2: The momentum parameter for the moving average of the norm of natural gradient.
        :param epsilon: A small constant to make sure we do not divide by :math:`0`
            in the momentum term.
        :param name: Optional name to give the optimiser.
        """
        name = self.__class__.__name__ if name is None else name
        super().__init__(name)

        self.gamma = gamma
        self._momentum = momentum
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon

        # Create the moving average buffers
        self._ms: List[tf.Variable] = []
        self._v = tf.Variable(0.0, dtype=tf.float64, trainable=False)
        self._step_counter = tf.Variable(1, dtype=tf.float64, trainable=False)
        self._effective_lr = tf.Variable(gamma, trainable=False)

    # pylint: disable=arguments-differ
[docs]    def minimize(self, loss_fn: Callable, ssm: StateSpaceModel) -> None:
        """
        Minimise the objective function of the model.

        Note the natural gradient optimiser works with variational parameters only.

        :param loss_fn: The Loss function.
        :param ssm: A state space model that represents our variational posterior.
        """
        if not self._ms:
            self._ms = [
                tf.Variable(tf.zeros(tf.shape(x), dtype=tf.float64), trainable=False)
                for x in ssm_to_expectations(ssm)
            ]
        self._natgrad_steps(loss_fn, ssm)

[docs]    def _natgrad_steps(self, loss_fn: Callable, ssm: StateSpaceModel):
        """
        Call a natgrad step after wrapping it in a name scope.

        :param loss_fn: A Loss function.
        :param ssm: A state space model that represents our variational posterior.
        """
        with tf.name_scope(f"{self._name}/natural_gradient_steps"):
            self._natgrad_step(loss_fn, ssm)

[docs]    def _natgrad_step(self, loss_fn: Callable, ssm: StateSpaceModel):
        """
        Implements equation [10] from

        @inproceedings{salimbeni18,
            title={Natural Gradients in Practice: Non-Conjugate  Variational Inference in
                   Gaussian Process Models},
            author={Salimbeni, Hugh and Eleftheriadis, Stefanos and Hensman, James},
            booktitle={AISTATS},
            year={2018}

        In addition, for convenience with the rest of MarkovFlow, this code computes ∂L/∂η using
        the chain rule:

        g̃ = ∂L/∂η = [(∂L / ∂[ssm_variables])(∂[ssm_variables] / ∂η)]ᵀ

        In the code η = eta, θ = theta.

        :param loss_fn: Loss function.
        :param ssm: A state space model that represents our variational posterior.
        """
        with tf.GradientTape(persistent=True, watch_accessed_variables=False) as tape:
            tape.watch(ssm.trainable_variables)

            loss = loss_fn()
            # the expectation parameterization as function of the ssm parameters
            etas = ssm_to_expectations(ssm)
            # we need these to calculate the relevant gradients
            ssm_params = expectations_to_ssm_params(*etas)

            # the natural parameterization as function of the ssm parameters
            thetas = ssm_to_naturals(ssm)

            if self._momentum:
                # Get ∂L/∂θ via the chain rule. We need that to compute the norm of the natgrad
                ssm_params_2 = naturals_to_ssm_params(*thetas)

        # 1) the oridinary gradient
        dL_dAs, dL_doffsets, dL_dchol_P0, dL_dchol_Qs, dL_dmu0 = tape.gradient(
            loss, ssm.trainable_variables
        )
        dL_dchol_P0 = ssm.cholesky_initial_covariance.transform.forward(dL_dchol_P0)
        dL_dchol_Qs = ssm.cholesky_process_covariances.transform.forward(dL_dchol_Qs)

        dL_dssm = (dL_dAs, dL_doffsets, dL_dchol_P0, dL_dchol_Qs, dL_dmu0)

        # 2) the chain rule to get ∂L/∂η, where η (eta) are the expectation parameters
        dL_detas = tape.gradient(ssm_params, etas, output_gradients=dL_dssm)

        if self._momentum:
            # the chain rule to get ∂L/∂θ, where θ (theta) are the natural parameters
            dL_dthetas = tape.gradient(ssm_params_2, thetas, output_gradients=dL_dssm)

        del tape  # Remove "persitent" tape

        # momentum
        if self._momentum:

            # adjust learning rate to debias momemtum...
            lr = (
                self.gamma
                * tf.sqrt(1.0 - self._beta2 ** self._step_counter)
                / (1.0 - self._beta1 ** self._step_counter)
            )

            # get the moving average for the natural gradients
            ms_new = [m * self._beta1 + (1.0 - self._beta1) * g for m, g in zip(self._ms, dL_detas)]

            # get moving average for the norm of the natural gradients
            natgrad_norm_components = [tf.reduce_sum(g * gt) for g, gt in zip(dL_detas, dL_dthetas)]
            natgrad_norm_components[-1] = natgrad_norm_components[-1] * 2.0
            natgrad_norm = tf.reduce_sum(natgrad_norm_components)

            v_new = self._v * self._beta2 + (1.0 - self._beta2) * natgrad_norm

            # perform natural gradient descent on the θ parameters
            thetas_new = [
                theta - lr * m / (tf.sqrt(v_new) + self._epsilon)
                for theta, m in zip(thetas, ms_new)
            ]

            _ = [
                mov.assign(mov_new) for mov, mov_new in zip(self._ms + [self._v], ms_new + [v_new])
            ]
            self._step_counter.assign_add(1)

            effective_lr = lr / (tf.sqrt(v_new) + self._epsilon)
            self._effective_lr = effective_lr
        else:
            thetas_new = [theta - self.gamma * dL_deta for theta, dL_deta in zip(thetas, dL_detas)]

        # Transform back to the model ssm parameters
        As_new, offsets_new, chol_P0_new, chol_Qs_new, mu0_new = naturals_to_ssm_params(*thetas_new)
        chol_P0_new = ssm.cholesky_initial_covariance.transform.inverse(chol_P0_new)
        chol_Qs_new = ssm.cholesky_process_covariances.transform.inverse(chol_Qs_new)
        ssm_params_new = (As_new, offsets_new, chol_P0_new, chol_Qs_new, mu0_new)

        _ = [v.assign(var_new) for v, var_new in zip(ssm.trainable_variables, ssm_params_new)]

[docs]    def get_config(self):
        """Return a Python dictionary containing the configuration of the optimiser."""
        config = super().get_config()
        config.update(
            {
                "gamma": self._serialize_hyperparameter("gamma"),
                "beta1": self._serialize_hyperparameter("beta1"),
                "beta2": self._serialize_hyperparameter("beta2"),
                "epsilon": self._serialize_hyperparameter("epsilon"),
                "momentum": self._serialize_hyperparameter("momentum"),
            }
        )
        return config

[docs]    def _resource_apply_dense(self, grad, handle, apply_state):
        # After bumping to TF2.2, PyLint started thinking that this method was abstract in the
        # Optimizers class (despite the fact that it has a trivial implementation). We reproduce
        # the trivial implementation to get around this.
        raise NotImplementedError()

[docs]    def _resource_apply_sparse(self, grad, handle, indices, apply_state):
        # After bumping to TF2.2, PyLint started thinking that this method was abstract in the
        # Optimizers class (despite the fact that it has a trivial implementation). We reproduce
        # the trivial implementation to get around this.
        raise NotImplementedError()