Source code for markovflow.kalman_filter

#
# Copyright (c) 2021 The Markovflow Contributors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Module containing a Kalman filter."""

from abc import ABC, abstractmethod

import numpy as np
import tensorflow as tf
from gpflow import default_float
from gpflow.base import TensorType

from markovflow.block_tri_diag import SymmetricBlockTriDiagonal
from markovflow.emission_model import EmissionModel
from markovflow.state_space_model import StateSpaceModel
from markovflow.utils import tf_scope_class_decorator


class BaseKalmanFilter(tf.Module, ABC):
    r"""
    Performs a Kalman filter on a :class:`~markovflow.state_space_model.StateSpaceModel` and
    :class:`~markovflow.emission_model.EmissionModel`, with given observations.

    The key reference is::

        @inproceedings{grigorievskiy2017parallelizable,
            title={Parallelizable sparse inverse formulation Gaussian processes (SpInGP)},
            author={Grigorievskiy, Alexander and Lawrence, Neil and S{\"a}rkk{\"a}, Simo},
            booktitle={Int'l Workshop on Machine Learning for Signal Processing (MLSP)},
            pages={1--6},
            year={2017},
            organization={IEEE}
        }

    The following notation from the above paper is used:

        * :math:`G = I_N ⊗ H`, where :math:`⊗` is the Kronecker product
        * :math:`R` is the observation covariance
        * :math:`Σ = I_N ⊗ R`
        * :math:`K⁻¹ = A⁻ᵀQ⁻¹A⁻¹` is the precision, where :math:`A⁻ᵀ = [Aᵀ]⁻¹ = [A⁻¹]ᵀ`
        * :math:`L` is the Cholesky factor of :math:`K⁻¹ + GᵀΣ⁻¹G`,
          that is :math:`LLᵀ = K⁻¹ + GᵀΣ⁻¹G`
        * :math:`y` is the observation matrix
    """

    def __init__(
        self, state_space_model: StateSpaceModel, emission_model: EmissionModel,
    ) -> None:
        """
        :param state_space_model: Parametrises the latent chain.
        :param emission_model: Maps the latent chain to the observations.
        """
        super().__init__(self.__class__.__name__)
        self.prior_ssm = state_space_model
        self.emission = emission_model

    @property
    @abstractmethod
    def _r_inv(self):
        """ Precision of the observation model """
        raise NotImplementedError

    @property
    @abstractmethod
    def observations(self):
        """ Observation vector """
        raise NotImplementedError

    @property
    def _k_inv_prior(self):
        """ Prior precision """
        return self.prior_ssm.precision

    @property
    def _k_inv_post(self):
        """ Posterior precision """
        # construct the likelihood precision: GᵀΣ⁻¹G
        # HᵀR⁻¹H [state_dim, state_dim]
        h_t_r_h = tf.einsum(
            "...ji,...jk,...kl->...il",
            self.emission.emission_matrix,
            self._r_inv,
            self.emission.emission_matrix,
        )
        # The emission matrix is tiled across the time_points, so for a time-invariant matrix
        # this is equivalent to GᵀΣ⁻¹G = (I_N ⊗ HᵀR⁻¹H)
        likelihood_precision = SymmetricBlockTriDiagonal(h_t_r_h)

        # K⁻¹ + GᵀΣ⁻¹G
        return self._k_inv_prior + likelihood_precision

    @property
    def _log_det_observation_precision(self):
        """ Sum of the log determinants of the precisions of the observation model """
        num_data = self.prior_ssm.num_transitions + 1
        return tf.cast(num_data, default_float()) * tf.linalg.logdet(self._r_inv)

    def posterior_state_space_model(self) -> StateSpaceModel:
        r"""
        Return the posterior as a state space model.

        The marginal means and covariances are given by:

        .. math::
            &μ(Χ) = (K⁻¹ + GᵀΣ⁻¹G)⁻¹[GᵀΣ⁻¹y + K⁻¹μ]\\
            &P(X) = K⁻¹ + GᵀΣ⁻¹G

        ...where :math:`μ` is a block vector of the marginal means.

        We can derive the state transitions :math:`aₖ` and process noise covariances :math:`qₖ`
        from the block tridiagonal matrix (see
        :meth:`~markovflow.block_tri_diag.SymmetricBlockTriDiagonal.upper_diagonal_lower`).
        Lower case is used to distinguish the posterior parameters from the prior ones.

        We then need to calculate :math:`μ₀` and :math:`bₖ` (this is what most of the code in
        this function does). This can be calculated from:

        .. math:: K⁻¹ₚₒₛₜμₚₒₛₜ = GᵀΣ⁻¹y + K⁻¹ₚᵣᵢₒᵣμₚᵣᵢₒᵣ

        Firstly, we use that for any :class:`~markovflow.state_space_model.StateSpaceModel`:

        .. math:: K⁻¹μ = A⁻ᵀ Q⁻¹ m

        ...where :math:`m = [μ₀, b₁,... bₙ]` and::

            A⁻¹ =  [ I             ]      Q⁻¹ =  [ P₀⁻¹          ]
                   [-A₁, I         ]             [     Q₁⁻¹      ]
                   [    -A₂, I     ]             [          ᨞    ]
                   [         ᨞  ᨞  ]             [           ᨞   ]
                   [         -Aₙ, I]             [           Qₙ⁻¹]

        So:

        .. math:: mₚₒₛₜ = Qₚₒₛₜ Aₚₒₛₜᵀ [GᵀΣ⁻¹y + Kₚᵣᵢₒᵣ⁻¹mₚᵣᵢₒᵣ]

        :return: The posterior as a state space model.
        """
        a_inv_post, chol_q_inv_post = self._k_inv_post.upper_diagonal_lower()
        assert a_inv_post.block_sub_diagonal is not None

        # (GᵀΣ⁻¹)y [..., num_transitions + 1, state_dim]
        obs_proj = self._back_project_y_to_state(self.observations)

        # Kₚᵣᵢₒᵣ⁻¹μₚᵣᵢₒᵣ (prior parameters) [..., num_transitions + 1, state_dim]
        k_inv_mu_prior = self._k_inv_prior.dense_mult(self.prior_ssm.marginal_means)

        # mₚₒₛₜ = Qₚₒₛₜ Aₚₒₛₜᵀ [GᵀΣ⁻¹y + Kₚᵣᵢₒᵣ⁻¹mₚᵣᵢₒᵣ] [..., num_transitions + 1, state_dim]
        m_post = chol_q_inv_post.solve(
            chol_q_inv_post.solve(
                a_inv_post.solve(obs_proj + k_inv_mu_prior, transpose_left=True)
            ),
            transpose_left=True,
        )

        # [..., num_transitions + 1, state_dim, state_dim]
        batch_shape = tf.concat(
            [self.prior_ssm.batch_shape, tf.TensorShape([self.prior_ssm.num_transitions + 1])],
            axis=0,
        )
        identities = tf.eye(
            self.prior_ssm.state_dim, dtype=m_post.dtype, batch_shape=batch_shape
        )

        # cholesky of [P₀, Q₁, Q₂, ....] [..., num_transitions + 1, state_dim, state_dim]
        concatted_qs = tf.linalg.cholesky(
            tf.linalg.cholesky_solve(chol_q_inv_post.block_diagonal, identities)
        )

        return StateSpaceModel(
            initial_mean=m_post[..., 0, :],
            chol_initial_covariance=concatted_qs[..., 0, :, :],
            state_transitions=-a_inv_post.block_sub_diagonal,
            state_offsets=m_post[..., 1:, :],
            chol_process_covariances=concatted_qs[..., 1:, :, :],
        )

    def log_likelihood(self) -> tf.Tensor:
        r"""
        Construct a TensorFlow function to compute the likelihood.

        We set :math:`y = obs - Hμ` (where :math:`μ` is the vector of marginal state means):

        .. math::
            log p(obs|params) = &- ᴺ⁄₂log(2π) - ½(log |K⁻¹ + GᵀΣ⁻¹G| - log |K⁻¹| - log |Σ⁻¹|)\\
                    &- ½ yᵀ(Σ⁻¹ - Σ⁻¹G(K⁻¹ + GᵀΣ⁻¹G)⁻¹GᵀΣ⁻¹)y

        ...where :math:`N` is the dimensionality of the precision object, that is
        ``state_dim * (num_transitions + 1)``.

        We break up the log likelihood as: cst + term1 + term2 + term3. That is, as:

            * cst: :math:`- ᴺ⁄₂log(2π)`
            * term 1: :math:`- ½ yᵀΣ⁻¹y`
            * term 2:

              .. math::
                 ½ yᵀΣ⁻¹G(K⁻¹ + GᵀΣ⁻¹G)⁻¹GᵀΣ⁻¹y = ½ yᵀΣ⁻¹G(LLᵀ)⁻¹GᵀΣ⁻¹y = ½|L⁻¹(GᵀΣ⁻¹)y|²

            * term 3:

              .. math::
                 - ½(log |K⁻¹ + GᵀΣ⁻¹G| - log |K⁻¹| - log |Σ⁻¹|)
                 = ½log |K⁻¹| - log |L| + ½log |Σ⁻¹|

        Note that there are a couple of mistakes in formula (18) of the SpInGP paper:

            * They have :math:`- ½(... + log |Σ⁻¹|)`. It should be :math:`- ½(... - log |Σ⁻¹|)`
            * They have :math:`- ½ yᵀ(... Σ⁻¹G(K⁻¹ + GᵀΣ⁻¹G)⁻¹)y`. It should be
              :math:`- ½ yᵀ(... Σ⁻¹G(K⁻¹ + GᵀΣ⁻¹G)⁻¹GᵀΣ⁻¹)y`

        :return: The likelihood as a scalar tensor (we sum over the `batch_shape`).
        """
        # K⁻¹ + GᵀΣ⁻¹G = LLᵀ
        l_post = self._k_inv_post.cholesky
        num_data = self.prior_ssm.num_transitions + 1

        # Hμ [..., num_transitions + 1, output_dim]
        marginal = self.emission.project_state_to_f(self.prior_ssm.marginal_means)

        # y = obs - Hμ [..., num_transitions + 1, output_dim]
        disp = self.observations - marginal

        # cst is the constant term for a Gaussian log likelihood
        cst = (
            -0.5
            * np.log(2 * np.pi)
            * tf.cast(self.emission.output_dim * num_data, default_float())
        )

        # term1 is: - ½ yᵀΣ⁻¹y shape [...]
        term1 = -0.5 * tf.reduce_sum(
            input_tensor=tf.einsum("...op,...p,...o->...o", self._r_inv, disp, disp),
            axis=[-1, -2],
        )

        # term 2 is: ½|L⁻¹(GᵀΣ⁻¹)y|²
        # (GᵀΣ⁻¹)y [..., num_transitions + 1, state_dim]
        obs_proj = self._back_project_y_to_state(disp)

        # ½|L⁻¹(GᵀΣ⁻¹)y|² [...]
        term2 = 0.5 * tf.reduce_sum(
            input_tensor=tf.square(l_post.solve(obs_proj, transpose_left=False)), axis=[-1, -2]
        )

        ## term 3 is: ½log |K⁻¹| - log |L| + ½ log |Σ⁻¹|
        # where log |Σ⁻¹| = num_data * log|R⁻¹|
        term3 = (
            0.5 * self.prior_ssm.log_det_precision()
            - l_post.abs_log_det()
            + 0.5 * self._log_det_observation_precision
        )

        return tf.reduce_sum(cst + term1 + term2 + term3)

    def _back_project_y_to_state(self, observations: tf.Tensor) -> tf.Tensor:
        """
        Back project from the observation space to the state space; that is, calculate
        :math:`(GᵀΣ⁻¹)y`.

        :param observations: A tensor `y` of shape ``batch_shape + [num_data, output_dim]``.
        :return: A tensor :math:`(GᵀΣ⁻¹)y` of shape ``batch_shape + [num_data, state_dim]``.
        """
        # GᵀΣ⁻¹, batch_shape + [num_data, output_dim, state_dim]
        back_projection = tf.einsum(
            "...ij,...ki->...kj", self.emission.emission_matrix, self._r_inv
        )
        # (GᵀΣ⁻¹)y
        return tf.einsum("...ij,...i->...j", back_projection, observations)
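# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the library): numerically checks the identity
# K⁻¹μ = A⁻ᵀQ⁻¹m used in `posterior_state_space_model` above, for a toy two-step chain with
# made-up values. All names below (`a1`, `p0`, `q1`, `mu0`, `b1`) are hypothetical.
def _check_precision_mean_identity():
    rng = np.random.default_rng(0)
    d = 2  # state dimension
    a1 = rng.standard_normal((d, d))  # state transition A₁
    p0 = np.eye(d) * 0.5  # initial covariance P₀
    q1 = np.eye(d) * 0.1  # process noise covariance Q₁
    mu0 = rng.standard_normal(d)  # initial mean μ₀
    b1 = rng.standard_normal(d)  # state offset b₁

    # block matrices A⁻¹ and Q⁻¹ as drawn in the docstring of `posterior_state_space_model`
    a_inv = np.block([[np.eye(d), np.zeros((d, d))], [-a1, np.eye(d)]])
    q_inv = np.block(
        [[np.linalg.inv(p0), np.zeros((d, d))], [np.zeros((d, d)), np.linalg.inv(q1)]]
    )

    # marginal means μ = [μ₀, A₁μ₀ + b₁] and m = [μ₀, b₁]
    mu = np.concatenate([mu0, a1 @ mu0 + b1])
    m = np.concatenate([mu0, b1])

    k_inv = a_inv.T @ q_inv @ a_inv  # K⁻¹ = A⁻ᵀQ⁻¹A⁻¹
    np.testing.assert_allclose(k_inv @ mu, a_inv.T @ q_inv @ m)
# --------------------------------------------------------------------------------------------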
@tf_scope_class_decorator
class KalmanFilter(BaseKalmanFilter):
    r"""
    Performs a Kalman filter on a :class:`~markovflow.state_space_model.StateSpaceModel` and
    :class:`~markovflow.emission_model.EmissionModel`, with given observations.

    The key reference is::

        @inproceedings{grigorievskiy2017parallelizable,
            title={Parallelizable sparse inverse formulation Gaussian processes (SpInGP)},
            author={Grigorievskiy, Alexander and Lawrence, Neil and S{\"a}rkk{\"a}, Simo},
            booktitle={Int'l Workshop on Machine Learning for Signal Processing (MLSP)},
            pages={1--6},
            year={2017},
            organization={IEEE}
        }

    The following notation from the above paper is used:

        * :math:`G = I_N ⊗ H`, where :math:`⊗` is the Kronecker product
        * :math:`R` is the observation covariance
        * :math:`Σ = I_N ⊗ R`
        * :math:`K⁻¹ = A⁻ᵀQ⁻¹A⁻¹` is the precision, where :math:`A⁻ᵀ = [Aᵀ]⁻¹ = [A⁻¹]ᵀ`
        * :math:`L` is the Cholesky factor of :math:`K⁻¹ + GᵀΣ⁻¹G`,
          that is :math:`LLᵀ = K⁻¹ + GᵀΣ⁻¹G`
        * :math:`y` is the observation matrix
    """

    def __init__(
        self,
        state_space_model: StateSpaceModel,
        emission_model: EmissionModel,
        observations: tf.Tensor,
        chol_obs_covariance: TensorType,
    ) -> None:
        """
        :param state_space_model: Parametrises the latent chain.
        :param emission_model: Maps the latent chain to the observations.
        :param observations: Data with shape ``[num_transitions + 1, output_dim]``.
        :param chol_obs_covariance: A :data:`~markovflow.base.TensorType` with shape
            ``[output_dim, output_dim]`` for the Cholesky factor of the covariance to be
            applied to :math:`f` from `emission_model`.
        """
        super().__init__(state_space_model, emission_model)
        assert isinstance(observations, tf.Tensor)

        # verify observation covariance shape
        shape = tf.convert_to_tensor([emission_model.output_dim, emission_model.output_dim])
        message = """The shape of the observation covariance matrix and the emission
                     matrix are not compatible"""
        tf.debugging.assert_equal(tf.shape(chol_obs_covariance), shape, message=message)

        # verify observation shape
        message = """The shape of the observations and the state-space-model parameters
                     are not compatible"""
        shape = tf.concat(
            [
                state_space_model.batch_shape,
                [state_space_model.num_transitions + 1, emission_model.output_dim],
            ],
            axis=0,
        )
        tf.debugging.assert_equal(tf.shape(observations), shape, message=message)

        self._chol_obs_covariance = chol_obs_covariance  # To collect tf.Module trainables
        self._observations = observations  # batch_shape + [num_transitions + 1, output_dim]

    @property
    def _r_inv(self):
        """ Precision of the observation model """
        # [output_dim, output_dim]
        return tf.linalg.cholesky_solve(
            self._chol_obs_covariance,
            tf.eye(self.emission.output_dim, dtype=self._chol_obs_covariance.dtype),
        )

    @property
    def observations(self):
        """ Observation vector """
        return self._observations
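# --------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the library). Assuming `ssm` is a StateSpaceModel,
# `emission` is an EmissionModel with output_dim = 1, and `obs` is a tensor of shape
# [num_transitions + 1, 1], a filter might be built as follows; `ssm`, `emission` and `obs`
# are placeholders here, not defined in this module.
#
#     chol_r = tf.constant([[0.1]], dtype=default_float())  # Cholesky of observation covariance
#     kf = KalmanFilter(ssm, emission, observations=obs, chol_obs_covariance=chol_r)
#     log_lik = kf.log_likelihood()                  # scalar marginal log likelihood
#     posterior = kf.posterior_state_space_model()   # posterior as a StateSpaceModel
# --------------------------------------------------------------------------------------------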
class GaussianSites(tf.Module, ABC):
    """
    This class is a wrapper around the parameters specifying multiple independent
    Gaussian distributions.
    """

    @property
    def means(self):
        """ Return the means of the Gaussians. """
        raise NotImplementedError

    @property
    def precisions(self):
        """ Return the precisions of the Gaussians. """
        raise NotImplementedError

    @property
    def log_det_precisions(self):
        """ Return the sum of the log determinants of the observation precisions. """
        raise NotImplementedError
class UnivariateGaussianSitesNat(GaussianSites):
    """
    This class is a wrapper around the parameters of univariate Gaussian distributions in
    natural form. That is:

    .. math:: p(f) = exp(𝞰ᵀφ(f) - A(𝞰))

    ...where :math:`𝞰 = [η₁, η₂]` and :math:`𝛗(f) = [f, f²]`.

    The mean :math:`μ` and variance :math:`σ²` parameterization is such that:

    .. math:: μ = -½η₁/η₂, σ² = -½η₂⁻¹
    """

    def __init__(self, nat1, nat2, log_norm=None):
        """
        :param nat1: first natural parameter [N, D]
        :param nat2: second natural parameter [N, D, D]
        :param log_norm: normalizer parameter [N, D]
        """
        super().__init__()

        shape_constraints = [
            (nat1, ["N", 1]),
            (nat2, ["N", 1, 1]),
        ]
        if log_norm is not None:
            shape_constraints += [(log_norm, ["N", 1])]
        tf.debugging.assert_shapes(shape_constraints)

        self.num_data, self.output_dim = nat1.shape
        self.nat1 = nat1
        self.nat2 = nat2
        self.log_norm = log_norm

    @property
    def means(self):
        """ Return the means of the Gaussians. """
        return -0.5 * self.nat1 / self.nat2[..., 0]

    @property
    def precisions(self):
        """ Return the precisions of the Gaussians. """
        return -2 * self.nat2

    @property
    def log_det_precisions(self):
        """ Return the sum of the log determinants of the observation precisions. """
        return tf.math.log(-2 * self.nat2)
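# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the library): builds `UnivariateGaussianSitesNat` from a
# mean/variance parameterization. Inverting μ = -½η₁/η₂ and σ² = -½η₂⁻¹ from the docstring
# gives η₁ = μ/σ² and η₂ = -1/(2σ²). The helper name below is hypothetical.
def _sites_from_mean_and_variance(
    mean: tf.Tensor, variance: tf.Tensor
) -> UnivariateGaussianSitesNat:
    # mean, variance: [N, 1]
    nat1 = mean / variance  # η₁ = μ/σ², shape [N, 1]
    nat2 = (-0.5 / variance)[..., None]  # η₂ = -1/(2σ²), shape [N, 1, 1]
    return UnivariateGaussianSitesNat(nat1=nat1, nat2=nat2)
# --------------------------------------------------------------------------------------------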
@tf_scope_class_decorator
class KalmanFilterWithSites(BaseKalmanFilter):
    r"""
    Performs a Kalman filter on a :class:`~markovflow.state_space_model.StateSpaceModel` and
    :class:`~markovflow.emission_model.EmissionModel`, with Gaussian sites; that is,
    time-dependent Gaussian likelihood terms.

    The key reference is::

        @inproceedings{grigorievskiy2017parallelizable,
            title={Parallelizable sparse inverse formulation Gaussian processes (SpInGP)},
            author={Grigorievskiy, Alexander and Lawrence, Neil and S{\"a}rkk{\"a}, Simo},
            booktitle={Int'l Workshop on Machine Learning for Signal Processing (MLSP)},
            pages={1--6},
            year={2017},
            organization={IEEE}
        }

    The following notation from the above paper is used:

        * :math:`G = I_N ⊗ H`, where :math:`⊗` is the Kronecker product
        * :math:`R = [R₁, R₂, ... Rₙ]` is the observation covariance
        * :math:`Σ = blockdiag[R]`
        * :math:`K⁻¹ = A⁻ᵀQ⁻¹A⁻¹` is the precision, where :math:`A⁻ᵀ = [Aᵀ]⁻¹ = [A⁻¹]ᵀ`
        * :math:`L` is the Cholesky factor of :math:`K⁻¹ + GᵀΣ⁻¹G`,
          that is :math:`LLᵀ = K⁻¹ + GᵀΣ⁻¹G`
        * :math:`y` is the observation matrix
    """

    def __init__(
        self,
        state_space_model: StateSpaceModel,
        emission_model: EmissionModel,
        sites: GaussianSites,
    ) -> None:
        """
        :param state_space_model: Parametrises the latent chain.
        :param emission_model: Maps the latent chain to the observations.
        :param sites: Gaussian sites parameterizing the Gaussian likelihoods.
        """
        # verify site shape
        message = """The shape of the site matrices and the emission matrix are not compatible"""
        tf.debugging.assert_equal(sites.output_dim, emission_model.output_dim, message=message)

        self.sites = sites
        super().__init__(state_space_model, emission_model)

    @property
    def _r_inv(self):
        """ Precisions of the observation model """
        return self.sites.precisions

    @property
    def _log_det_observation_precision(self):
        """ Sum of the log determinants of the precisions of the observation model """
        return tf.reduce_sum(tf.linalg.logdet(self._r_inv), axis=-1)

    @property
    def observations(self):
        """ Observation vector """
        return self.sites.means
@tf_scope_class_decorator
class KalmanFilterWithSparseSites(BaseKalmanFilter):
    r"""
    Performs a Kalman filter on a :class:`~markovflow.state_space_model.StateSpaceModel` and
    :class:`~markovflow.emission_model.EmissionModel`, with Gaussian sites, over a time grid.
    """

    def __init__(
        self,
        state_space_model: StateSpaceModel,
        emission_model: EmissionModel,
        sites: GaussianSites,
        num_grid_points: int,
        observations_index: tf.Tensor,
        observations: tf.Tensor,
    ):
        """
        :param state_space_model: Parametrises the latent chain.
        :param emission_model: Maps the latent chain to the observations.
        :param sites: Gaussian sites over the observations.
        :param num_grid_points: Number of grid points.
        :param observations_index: Indices of the observations in the time grid, with shape (N,).
        :param observations: Sparse observations with shape [n_batch] + (N, output_dim).
        """
        self.sites = sites
        self.observations_index = observations_index
        self.sparse_observations = self._drop_batch_shape(observations)
        self.grid_shape = tf.TensorShape((num_grid_points, 1))
        super().__init__(state_space_model, emission_model)

    @property
    def _r_inv(self):
        """ Precisions of the observation model over the time grid. """
        data_sites_precision = self.sites.precisions
        return self.sparse_to_dense(data_sites_precision, output_shape=self.grid_shape + (1,))

    def _drop_batch_shape(self, tensor: tf.Tensor):
        """ Check that the batch dimension, if present, is equal to 1, and drop it. """
        tensor_shape = tensor._shape_as_list()
        if len(tensor_shape) < 3:
            return tensor
        if tensor_shape[0] != 1:
            raise Exception("KalmanFilterWithSparseSites doesn't support batches")
        return tf.squeeze(tensor, axis=0)

    @property
    def _log_det_observation_precision(self):
        """
        Sum of the log determinants of the precisions of the observation model. This is
        computed for the data sites only, as the precision of all other sites is zero.
        """
        return tf.reduce_sum(tf.linalg.logdet(self._r_inv_data), axis=-1)

    @property
    def observations(self):
        """ Sparse observation vector """
        return self.sparse_to_dense(self.sparse_observations, self.grid_shape)

    @property
    def _r_inv_data(self):
        """ Precisions of the observation model for the data sites only. """
        return self.sites.precisions

    def sparse_to_dense(self, tensor: tf.Tensor, output_shape: tf.TensorShape) -> tf.Tensor:
        """
        Convert a sparse tensor to a dense one on the basis of the observations index; the
        output tensor has shape `output_shape`.
        """
        return tf.scatter_nd(self.observations_index, tensor, output_shape)

    def dense_to_sparse(self, tensor: tf.Tensor) -> tf.Tensor:
        """ Convert a dense tensor to a sparse one on the basis of the observations index. """
        tensor_shape = tensor.shape
        expand_dims = len(tensor_shape) == 3
        tensor = tf.gather_nd(tf.reshape(tensor, (-1, 1)), self.observations_index)
        if expand_dims:
            tensor = tf.expand_dims(tensor, axis=-1)
        return tensor

    def log_likelihood(self) -> tf.Tensor:
        r"""
        Construct a TensorFlow function to compute the likelihood.

        For the mathematical details, see the :meth:`log_likelihood` method of the parent
        class. The main difference from the parent class is that the vector of observations
        is now sparse.

        :return: The likelihood as a scalar tensor (we sum over the `batch_shape`).
        """
        # K⁻¹ + GᵀΣ⁻¹G = LLᵀ
        l_post = self._k_inv_post.cholesky
        num_data = self.observations_index.shape[0]

        # Hμ [..., num_transitions + 1, output_dim]
        marginal = self.emission.project_state_to_f(self.prior_ssm.marginal_means)
        marginal = self._drop_batch_shape(marginal)

        # y = obs - Hμ [..., num_transitions + 1, output_dim]
        disp = self.observations - marginal
        disp_data = self.sparse_observations - self.dense_to_sparse(marginal)

        # cst is the constant term for a Gaussian log likelihood
        cst = (
            -0.5
            * np.log(2 * np.pi)
            * tf.cast(self.emission.output_dim * num_data, default_float())
        )

        # term1 is: - ½ yᵀΣ⁻¹y
        term1 = -0.5 * tf.reduce_sum(
            input_tensor=tf.einsum(
                "...op,...p,...o->...o", self._r_inv_data, disp_data, disp_data
            ),
            axis=[-1, -2],
        )

        # term 2 is: ½|L⁻¹(GᵀΣ⁻¹)y|²
        # (GᵀΣ⁻¹)y [..., num_transitions + 1, state_dim]
        obs_proj = self._back_project_y_to_state(disp)

        # ½|L⁻¹(GᵀΣ⁻¹)y|² [...]
        term2 = 0.5 * tf.reduce_sum(
            input_tensor=tf.square(l_post.solve(obs_proj, transpose_left=False)), axis=[-1, -2]
        )

        ## term 3 is: ½log |K⁻¹| - log |L| + ½ log |Σ⁻¹|
        # where log |Σ⁻¹| = num_data * log|R⁻¹|
        term3 = (
            0.5 * self.prior_ssm.log_det_precision()
            - l_post.abs_log_det()
            + 0.5 * self._log_det_observation_precision
        )

        return tf.reduce_sum(cst + term1 + term2 + term3)
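# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the library): shows what `sparse_to_dense` does via
# tf.scatter_nd. With a grid of 5 points and observations at indices 1 and 3, the site values
# are scattered onto the dense grid and every other entry is zero. The numbers are made up.
#
#     obs_index = tf.constant([[1], [3]])                           # shape (N, 1)
#     values = tf.constant([[2.0], [7.0]], dtype=default_float())   # shape (N, 1)
#     dense = tf.scatter_nd(obs_index, values, shape=(5, 1))
#     # dense == [[0.], [2.], [0.], [7.], [0.]]
# --------------------------------------------------------------------------------------------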