Source code for markovflow.models.iwvi

#
# Copyright (c) 2021 The Markovflow Contributors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Module containing a model for importance-weighted variational inference."""
from typing import Optional, Tuple

import tensorflow as tf
from gpflow.likelihoods import Likelihood

from markovflow.kernels import SDEKernel
from markovflow.mean_function import MeanFunction
from markovflow.models.sparse_variational import SparseVariationalGaussianProcess
from markovflow.posterior import ImportanceWeightedPosteriorProcess
from markovflow.state_space_model import StateSpaceModel


class ImportanceWeightedVI(SparseVariationalGaussianProcess):
    """
    Performs importance-weighted variational inference (IWVI).

    The `key reference
    <https://papers.nips.cc/paper/7699-importance-weighting-and-variational-inference.pdf>`_ is::

        @inproceedings{domke2018importance,
            title={Importance weighting and variational inference},
            author={Domke, Justin and Sheldon, Daniel R},
            booktitle={Advances in neural information processing systems},
            pages={4470--4479},
            year={2018}
        }

    The idea is based on the observation that an estimator of the evidence lower bound (ELBO)
    can be obtained from an importance weight :math:`w`:

    .. math:: L₁ = log w(x₁), x₁ ~ q(x)

    ...where :math:`x` is the latent variable of the model (a GP, or set of GPs in our case)
    and the function :math:`w` is:

    .. math:: w(x) = p(y | x) p(x) / q(x)

    It follows that:

    .. math:: ELBO = 𝔼ₓ₁[ L₁ ]

    ...and:

    .. math:: log p(y) = log 𝔼ₓ₁[ w(x₁) ]

    It turns out that there are a series of lower bounds given by taking multiple importance
    samples:

    .. math:: Lₙ = log (1/n) Σᵢⁿ w(xᵢ), xᵢ ~ q(x)

    And we have the relation:

    .. math:: log p(y) >= 𝔼[Lₙ] >= 𝔼[Lₙ₋₁] >= ... >= 𝔼[L₁] = ELBO

    This means that we can improve the tightness of the ELBO to the log marginal likelihood by
    increasing :math:`n`, which we refer to in this class as `num_importance_samples`.
    The trade-offs are:

    * The objective function is now always stochastic, even for cases where the ELBO of the
      parent class is non-stochastic
    * We have to do more computations (evaluate the weights :math:`n` times)
    """

    def __init__(
        self,
        kernel: SDEKernel,
        inducing_points: tf.Tensor,
        likelihood: Likelihood,
        num_importance_samples: int,
        initial_distribution: Optional[StateSpaceModel] = None,
        mean_function: Optional[MeanFunction] = None,
    ) -> None:
        """
        :param kernel: A kernel that defines a prior over functions.
        :param inducing_points: The points in time on which inference should be performed,
            with shape ``batch_shape + [num_inducing]``.
        :param likelihood: A likelihood.
        :param num_importance_samples: The number of samples for the importance-weighted
            estimator.
        :param initial_distribution: An initial configuration for the variational distribution,
            with shape ``batch_shape + [num_inducing]``.
        :param mean_function: The mean function for the GP. Defaults to no mean function.
        """
        super().__init__(kernel, likelihood, inducing_points, mean_function, initial_distribution)
        self._posterior = ImportanceWeightedPosteriorProcess(
            num_importance_samples,
            self._dist_q,
            self._kernel,
            inducing_points,
            self._likelihood,
            self._mean_function,
        )
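
    # Editor's note (not part of the original source): the nested bounds Lₙ described in
    # the class docstring are estimated by `elbo` below as a log-space average of
    # importance weights, Lₙ ≈ logsumexp(log w₁, ..., log wₙ) - log n. A small
    # self-contained numeric sketch of that estimator (and of `dregs_objective`) is
    # given after the class definition.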

    def elbo(self, input_data: Tuple[tf.Tensor, tf.Tensor]) -> tf.Tensor:
        r"""
        Compute the importance-weighted ELBO using K samples. The procedure is::

            for k=1...K:
                uₖ ~ q(u)
                sₖ ~ p(s | u)
                wₖ = p(y | sₖ)p(uₖ) / q(uₖ)

            ELBO = log (1/K) Σₖwₖ

        Everything is computed in log-space for stability.

        Note that gradients of this ELBO may have high variance with regard to the variational
        parameters; see the DREGS gradient estimator method.

        :param input_data: A tuple of time points and observations containing the data at which
            to calculate the loss for training the model:

            * A tensor of inputs with shape ``batch_shape + [num_data]``
            * A tensor of observations with shape ``batch_shape + [num_data, observation_dim]``

        :return: A scalar tensor.
        """
        num_importance_samples = self.posterior.num_importance_samples
        time_points, _ = input_data
        log_num_samples = tf.math.log(tf.cast(num_importance_samples, dtype=tf.float64))

        # draw samples from the proposal distribution q(s, u) = q(u) p(s | u)
        sample_shape = (num_importance_samples,)
        samples_s, samples_u = self.posterior.proposal_process.sample_state_trajectories(
            time_points, sample_shape=sample_shape
        )

        log_weights = self.posterior.log_importance_weights(samples_s, samples_u, input_data)
        return tf.math.reduce_logsumexp(log_weights) - log_num_samples
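
    # Editor's note (not part of the original source): the return value above relies on the
    # identity log((1/K) Σₖ wₖ) = logsumexp(log w₁, ..., log w_K) - log K, so the potentially
    # very negative log-weights are never exponentiated directly.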

    def dregs_objective(self, input_data: Tuple[tf.Tensor, tf.Tensor]) -> tf.Tensor:
        """
        Compute a scalar tensor that, when differentiated using `tf.gradients`, produces the
        DREGS variance controlled gradient. See `"Doubly Reparameterized Gradient Estimators For
        Monte Carlo Objectives" <https://openreview.net/pdf?id=HkG3e205K7>`_ for a derivation.

        We recommend using these gradients for training variational parameters and gradients of
        the importance-weighted ELBO for training hyperparameters.

        :param input_data: A tuple of time points and observations containing the data at which
            to calculate the loss for training the model:

            * A tensor of inputs with shape ``batch_shape + [num_data]``
            * A tensor of observations with shape ``batch_shape + [num_data, observation_dim]``

        :return: A scalar tensor.
        """
        num_importance_samples = self.posterior.num_importance_samples
        time_points, _ = input_data

        # draw samples from the proposal distribution q(s, u) = q(u) p(s | u)
        sample_shape = (num_importance_samples,)
        samples_s, samples_u = self.posterior.proposal_process.sample_state_trajectories(
            time_points, sample_shape=sample_shape
        )

        log_weights = self.posterior.log_importance_weights(
            samples_s, samples_u, input_data, stop_gradient=True
        )
        normalized_weights = tf.stop_gradient(tf.nn.softmax(log_weights))
        return tf.reduce_sum(tf.square(normalized_weights) * log_weights)
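

# -----------------------------------------------------------------------------
# Editor's sketch (not part of the original module): a minimal, self-contained
# illustration of the arithmetic behind the two objectives above, using
# synthetic log-weights in place of the output of
# `posterior.log_importance_weights`. The names `num_samples` and
# `toy_log_weights` are hypothetical and exist only for this example; `tf` is
# the TensorFlow module imported at the top of this file.

num_samples = 8
toy_log_weights = tf.random.normal([num_samples], dtype=tf.float64)

# Importance-weighted ELBO estimate, log (1/K) Σₖ wₖ, computed in log-space,
# mirroring the computation in `ImportanceWeightedVI.elbo`:
iw_elbo = tf.reduce_logsumexp(toy_log_weights) - tf.math.log(
    tf.cast(num_samples, tf.float64)
)

# DREGS surrogate, mirroring `ImportanceWeightedVI.dregs_objective`: squared
# self-normalised weights (with gradients stopped) multiplied by the
# log-weights. In the real model, differentiating this quantity with respect to
# the variational parameters yields the variance-reduced gradient.
normalised_weights = tf.stop_gradient(tf.nn.softmax(toy_log_weights))
dregs_surrogate = tf.reduce_sum(tf.square(normalised_weights) * toy_log_weights)
# -----------------------------------------------------------------------------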