Source code for markovflow.models.variational

#
# Copyright (c) 2021 The Markovflow Contributors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Module containing a model for variational inference, for GP classification."""
from typing import Optional, Tuple

import tensorflow as tf
from gpflow.likelihoods import Likelihood

from markovflow.gauss_markov import GaussMarkovDistribution
from markovflow.kernels import SDEKernel
from markovflow.mean_function import MeanFunction, ZeroMeanFunction
from markovflow.models.models import MarkovFlowModel
from markovflow.posterior import AnalyticPosteriorProcess, PosteriorProcess


class VariationalGaussianProcess(MarkovFlowModel):
    """
    Approximates a :class:`~markovflow.gauss_markov.GaussMarkovDistribution` with a general
    likelihood using a Gaussian posterior.

    The following notation is used:

        * :math:`x` - the time points of the training data
        * :math:`y` - observations corresponding to time points :math:`x`
        * :math:`s(.)` - the latent state of the Markov chain
        * :math:`f(.)` - the noise free predictions of the model
        * :math:`p(y | f)` - the likelihood
        * :math:`p(.)` - the true distribution
        * :math:`q(.)` - the variational distribution

    Subscript is used to denote dependence for notational convenience, for example
    :math:`fₖ === f(k)`.

    The prior generative model comprises a Gauss-Markov distribution, an emission model and an
    arbitrary likelihood on the emitted variables. Together these define:

        * :math:`p(xₖ₊₁ | xₖ)`
        * :math:`fₖ = H xₖ`
        * :math:`p(yₖ | fₖ)`

    We would like to approximate the posterior of this generative model with a parametric
    model :math:`q`, comprising the same distribution as the prior.

    To approximate the posterior, we maximise the evidence lower bound (ELBO) :math:`ℒ` with
    respect to the parameters of the variational distribution, since:

    .. math:: log p(y) = ℒ(q) + KL[q ‖ p(f | y)]

    ...where:

    .. math:: ℒ(q) = ∫ log(p(f, y) / q(f)) q(f) df

    Since the KL term is non-negative, the ELBO provides a lower bound to the log-likelihood
    of the model. The bound is exact when :math:`KL[q ‖ p(f | y)] = 0`; that is, when the
    approximation is sufficiently flexible to capture the true posterior.

    This turns inference into an optimisation problem: find the optimal :math:`q`.

    To calculate the ELBO, we rewrite it as:

    .. math:: ℒ(q) = Σᵢ ∫ log(p(yᵢ | f)) q(f) df - KL[q(f) ‖ p(f)]

    The first term is the 'variational expectation' of the model likelihood; the second is the
    KL divergence between the approximation and the prior.
    """

    def __init__(
        self,
        input_data: Tuple[tf.Tensor, tf.Tensor],
        kernel: SDEKernel,
        likelihood: Likelihood,
        mean_function: Optional[MeanFunction] = None,
        initial_distribution: Optional[GaussMarkovDistribution] = None,
    ) -> None:
        """
        :param input_data: A tuple of ``(time_points, observations)`` containing the observed
            data: time points of observations, with shape ``batch_shape + [num_data]``;
            observations, with shape ``batch_shape + [num_data, observation_dim]``.
        :param kernel: A kernel that defines a prior over functions.
        :param likelihood: A likelihood.
        :param mean_function: The mean function for the GP. Defaults to no mean function.
        :param initial_distribution: An initial configuration for the variational distribution,
            with shape ``batch_shape + [num_inducing]``.
        """
        super().__init__(self.__class__.__name__)
        time_points, observations = input_data

        # Store as instance attributes so that tf.Module collects the kernel and mean function
        # trainable_variables.
        self._kernel = kernel
        if mean_function is None:
            mean_function = ZeroMeanFunction(obs_dim=1)
        self._mean_function = mean_function
        self._likelihood = likelihood
        self._time_points = time_points
        self._observations = observations

        if initial_distribution is None:
            initial_distribution = kernel.build_finite_distribution(time_points)

        # q will approximate the posterior after optimisation.
        # This needs to be an instance attribute to provide trainable variables
        # when calling tf.Module trainable_variables. This is fine though, since
        # StateSpaceModel doesn't do any computation in its initialiser.
        self._dist_q = initial_distribution.create_trainable_copy()

        self._posterior = AnalyticPosteriorProcess(
            posterior_dist=self._dist_q,
            kernel=self._kernel,
            conditioning_time_points=self._time_points,
            likelihood=self._likelihood,
            mean_function=self._mean_function,
        )

    def elbo(self) -> tf.Tensor:
        """
        Calculate the evidence lower bound (ELBO), a lower bound on :math:`log p(y)`.

        We rewrite the ELBO as:

        .. math:: ℒ(q(x)) = Σᵢ ∫ log(p(yᵢ | fₓ)) q(fₓ) dfₓ - KL[q(sₓ) ‖ p(sₓ)]

        The first term is the 'variational expectation' (VE); the second is the KL divergence
        between the approximation and the prior.

        :return: A scalar tensor (summed over the batch_shape dimension) representing the ELBO.
        """
        # s ~ q(s) = N(μ, P)
        # Project to function space, fₓ = H*s ~ q(fₓ)
        fx_mus, fx_covs = self.posterior.predict_f(self._time_points)

        # VE(fₓ) = Σᵢ ∫ log(p(yᵢ | fₓ)) q(fₓ) dfₓ
        ve_fx = tf.reduce_sum(
            input_tensor=self._likelihood.variational_expectations(
                fx_mus, fx_covs, self._observations
            )
        )
        # KL[q(sₓ) || p(sₓ)]
        kl_fx = tf.reduce_sum(self.dist_q.kl_divergence(self.dist_p))

        # ELBO(fₓ) = VE(fₓ) - KL[q(sₓ) || p(sₓ)]
        return ve_fx - kl_fx

    @property
    def time_points(self) -> tf.Tensor:
        """
        Return the time points of our observations.

        :return: A tensor with shape ``batch_shape + [num_data]``.
        """
        return self._time_points

    @property
    def observations(self) -> tf.Tensor:
        """
        Return the observations.

        :return: A tensor with shape ``batch_shape + [num_data, observation_dim]``.
        """
        return self._observations

    @property
    def kernel(self) -> SDEKernel:
        """
        Return the kernel of the GP.
        """
        return self._kernel

    @property
    def likelihood(self) -> Likelihood:
        """
        Return the likelihood of the GP.
        """
        return self._likelihood

    @property
    def mean_function(self) -> MeanFunction:
        """
        Return the mean function of the GP.
        """
        return self._mean_function

    @property
    def dist_p(self) -> GaussMarkovDistribution:
        """
        Return the prior Gauss-Markov distribution.
        """
        return self._kernel.build_finite_distribution(self._time_points)

    @property
    def dist_q(self) -> GaussMarkovDistribution:
        """
        Return the variational distribution as a Gauss-Markov distribution.
        """
        return self._dist_q

    @property
    def posterior(self) -> PosteriorProcess:
        """
        Obtain a posterior process for inference.

        For this class, this is the :class:`~markovflow.posterior.AnalyticPosteriorProcess`
        built from the variational distribution. This will be a locally optimal variational
        approximation of the posterior after optimisation.
        """
        return self._posterior

    def loss(self) -> tf.Tensor:
        """
        Return the loss, which is the negative ELBO.
        """
        return -self.elbo()
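

# A minimal usage sketch for the model above. It assumes a concrete SDEKernel such as
# ``Matern32`` is exported by ``markovflow.kernels`` and that ``gpflow.likelihoods.Bernoulli``
# is available; the toy data and hyperparameters are purely illustrative. The sketch builds
# the model on 1D classification data, minimises the negative ELBO with Adam, and then
# queries the approximate posterior over the latent function.
if __name__ == "__main__":
    import numpy as np
    from gpflow.likelihoods import Bernoulli
    from markovflow.kernels import Matern32

    # Toy 1D classification data: binary labels following the sign of a sinusoid.
    x = np.linspace(0.0, 10.0, 100)
    y = (np.sin(x) > 0.0).astype(np.float64)[:, None]
    time_points = tf.constant(x)          # shape [num_data]
    observations = tf.constant(y)         # shape [num_data, 1]

    vgp = VariationalGaussianProcess(
        input_data=(time_points, observations),
        kernel=Matern32(lengthscale=1.0, variance=1.0),
        likelihood=Bernoulli(),
    )

    # Maximise the ELBO (minimise the loss) with respect to the variational parameters and
    # any trainable kernel hyperparameters collected via tf.Module.
    optimiser = tf.optimizers.Adam(learning_rate=0.01)
    for _ in range(100):
        optimiser.minimize(vgp.loss, vgp.trainable_variables)

    # Approximate posterior mean and variance of f at the training inputs.
    mean, variance = vgp.posterior.predict_f(vgp.time_points)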