Source code for trieste.acquisition.function.active_learning

# Copyright 2021 The Trieste Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module contains acquisition function builders and acquisition functions for Bayesian active
learning.
"""

from __future__ import annotations

import math
from typing import Optional, Sequence, Union

import tensorflow as tf
import tensorflow_probability as tfp

from ...data import Dataset
from ...models import ProbabilisticModel
from ...models.interfaces import FastUpdateModel, SupportsPredictJoint
from ...types import TensorType
from ...utils import DEFAULTS
from ..interface import AcquisitionFunction, AcquisitionFunctionClass, SingleModelAcquisitionBuilder


class PredictiveVariance(SingleModelAcquisitionBuilder[SupportsPredictJoint]):
    """
    Builder for the determinant of the predictive covariance matrix over the batch points.
    For a batch of size 1 it is the same as maximizing the predictive variance.
    """

    def __init__(self, jitter: float = DEFAULTS.JITTER) -> None:
        """
        :param jitter: The size of the jitter to use when stabilising the Cholesky decomposition
            of the covariance matrix.
        """
        self._jitter = jitter

    def __repr__(self) -> str:
        """"""
        return f"PredictiveVariance(jitter={self._jitter!r})"

    def prepare_acquisition_function(
        self,
        model: SupportsPredictJoint,
        dataset: Optional[Dataset] = None,
    ) -> AcquisitionFunction:
        """
        :param model: The model.
        :param dataset: Unused.
        :return: The predictive variance acquisition function (the determinant of the
            predictive covariance).
        """
        if not isinstance(model, SupportsPredictJoint):
            raise NotImplementedError(
                f"PredictiveVariance only works with models that support "
                f"predict_joint; received {model!r}"
            )

        return predictive_variance(model, self._jitter)

    def update_acquisition_function(
        self,
        function: AcquisitionFunction,
        model: SupportsPredictJoint,
        dataset: Optional[Dataset] = None,
    ) -> AcquisitionFunction:
        """
        :param function: The acquisition function to update.
        :param model: The model.
        :param dataset: Unused.
        """
        return function  # no need to update anything


def predictive_variance(model: SupportsPredictJoint, jitter: float) -> AcquisitionFunction:
    """
    The predictive variance acquisition function for active learning, based on the determinant
    of the covariance (see :cite:`MacKay1992` for details). Note that the model needs to supply
    the covariance of the joint marginal distribution, which can be expensive to compute.

    :param model: The model of the objective function.
    :param jitter: The size of the jitter to use when stabilising the Cholesky decomposition
        of the covariance matrix.
    """

    @tf.function
    def acquisition(x: TensorType) -> TensorType:
        try:
            _, covariance = model.predict_joint(x)
        except NotImplementedError:
            raise ValueError(
                """
                PredictiveVariance only supports models with a predict_joint method.
                """
            )
        return tf.exp(tf.linalg.logdet(covariance + jitter))

    return acquisition
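

# --- Usage sketch (illustrative, not part of the trieste API) ---------------------
# A minimal stand-in model, here a hypothetical prior-only toy with a unit
# squared-exponential kernel, shows the contract ``predictive_variance`` relies on:
# ``predict_joint`` maps query points of shape [..., B, D] to a mean of shape
# [..., B, L] and a covariance of shape [..., L, B, B].
class _ToyJointModel:
    def predict_joint(self, query_points: TensorType) -> tuple[TensorType, TensorType]:
        # pairwise squared distances between the B points of each batch
        sq_dists = tf.reduce_sum(
            (query_points[..., :, None, :] - query_points[..., None, :, :]) ** 2, axis=-1
        )
        mean = tf.zeros_like(query_points[..., :1])  # [..., B, 1]
        covariance = tf.exp(-0.5 * sq_dists)[..., None, :, :]  # [..., 1, B, B]
        return mean, covariance


# For example, for one batch of two one-dimensional points:
#     acq = predictive_variance(_ToyJointModel(), jitter=DEFAULTS.JITTER)
#     acq(tf.constant([[[0.0], [0.5]]], dtype=tf.float64))  # returns shape [1, 1]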


class ExpectedFeasibility(SingleModelAcquisitionBuilder[ProbabilisticModel]):
    """
    Builder for the expected feasibility acquisition function for identifying a failure or
    feasibility region. It implements two related sampling strategies called the *bichon*
    criterion (:cite:`bichon2008efficient`) and the *ranjan* criterion
    (:cite:`ranjan2008sequential`). The goal of both criteria is to sample points with a mean
    close to the threshold and a high variance.
    """

    def __init__(self, threshold: float, alpha: float = 1, delta: int = 1) -> None:
        """
        :param threshold: The failure or feasibility threshold.
        :param alpha: The parameter which determines the neighbourhood around the estimated
            contour line, as a percentage of the posterior standard deviation, in which to
            allocate new points. Defaults to 1.
        :param delta: The parameter identifying which criterion is used, *bichon* for a value
            of 1 (default) and *ranjan* for a value of 2.
        :raise ValueError (or InvalidArgumentError): If the arguments are not scalars, or
            ``alpha`` is not positive, or ``delta`` is neither 1 nor 2.
        """
        tf.debugging.assert_scalar(threshold)
        tf.debugging.assert_scalar(alpha)
        tf.debugging.assert_positive(alpha, message="Parameter alpha must be positive.")
        tf.debugging.assert_scalar(delta)
        tf.debugging.Assert(delta in [1, 2], [delta])

        self._threshold = threshold
        self._alpha = alpha
        self._delta = delta

    def __repr__(self) -> str:
        """"""
        return (
            f"ExpectedFeasibility(threshold={self._threshold!r}, alpha={self._alpha!r},"
            f" delta={self._delta!r})"
        )

    def prepare_acquisition_function(
        self,
        model: ProbabilisticModel,
        dataset: Optional[Dataset] = None,
    ) -> AcquisitionFunction:
        """
        :param model: The model.
        :param dataset: Unused.
        :return: The expected feasibility function. This function will raise
            :exc:`ValueError` or :exc:`~tf.errors.InvalidArgumentError` if used with a batch
            size greater than one.
        """
        return bichon_ranjan_criterion(model, self._threshold, self._alpha, self._delta)

    def update_acquisition_function(
        self,
        function: AcquisitionFunction,
        model: ProbabilisticModel,
        dataset: Optional[Dataset] = None,
    ) -> AcquisitionFunction:
        return function  # no need to update anything
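

# --- Usage sketch (illustrative, not part of the trieste API) ---------------------
# A hypothetical feasibility-learning setup, estimating the region where the
# objective exceeds a threshold of 80.0 using the default *bichon* criterion:
#
#     from trieste.acquisition.rule import EfficientGlobalOptimization
#
#     acq_rule = EfficientGlobalOptimization(builder=ExpectedFeasibility(threshold=80.0))
#
# The resulting acquisition function only supports batch size 1, so the rule's
# num_query_points is left at its default of 1.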


def bichon_ranjan_criterion(
    model: ProbabilisticModel,
    threshold: float,
    alpha: float,
    delta: int,
) -> AcquisitionFunction:
    r"""
    Return the *bichon* criterion (:cite:`bichon2008efficient`) or *ranjan* criterion
    (:cite:`ranjan2008sequential`) used in the expected feasibility acquisition function for
    active learning of failure or feasibility regions.

    The problem of identifying a failure or feasibility region of a function :math:`f` can be
    formalized as estimating the excursion set, :math:`\Gamma^* = \{ x \in X: f(x) \ge T\}`, or
    estimating the contour line, :math:`C^* = \{ x \in X: f(x) = T\}`, for some threshold
    :math:`T` (see :cite:`bect2012sequential` for more details).

    It turns out that probabilistic models can be used as classifiers for identifying where the
    excursion probability is larger than 1/2, and this idea is used to build many sequential
    sampling strategies. We follow :cite:`bect2012sequential` and use a formulation which
    provides a common expression for these two criteria:

    .. math:: \mathbb{E}[\max(0, (\alpha s(x))^\delta - |T - m(x)|^\delta)]

    Here :math:`m(x)` and :math:`s(x)` are the mean and standard deviation of the predictive
    posterior of the probabilistic model. The *bichon* criterion is obtained when
    :math:`\delta = 1`, while the *ranjan* criterion is obtained when :math:`\delta = 2`.
    :math:`\alpha > 0` is another parameter that acts as a percentage of the standard deviation
    of the posterior around the current boundary estimate where we want to sample. The goal is
    to sample a point with a mean close to the threshold :math:`T` and a high variance, so that
    the positive difference in the equation above is as large as possible.

    Note that only batches of size 1 are allowed.

    :param model: The probabilistic model of the objective function.
    :param threshold: The failure or feasibility threshold.
    :param alpha: The parameter which determines the neighbourhood around the estimated contour
        line, as a percentage of the posterior standard deviation, in which to allocate new
        points.
    :param delta: The parameter identifying which criterion is used, *bichon* for a value of 1
        and *ranjan* for a value of 2.
    """

    @tf.function
    def acquisition(x: TensorType) -> TensorType:
        tf.debugging.assert_shapes(
            [(x, [..., 1, None])],
            message="This acquisition function only supports batch sizes of one.",
        )
        mean, variance = model.predict(tf.squeeze(x, -2))
        stdev = tf.sqrt(variance)

        t = (threshold - mean) / stdev
        t_plus = t + alpha
        t_minus = t - alpha
        normal = tfp.distributions.Normal(tf.constant(0, x.dtype), tf.constant(1, x.dtype))

        if delta == 1:
            G = (
                alpha * (normal.cdf(t_plus) - normal.cdf(t_minus))
                - t * (2 * normal.cdf(t) - normal.cdf(t_plus) - normal.cdf(t_minus))
                - (2 * normal.prob(t) - normal.prob(t_plus) - normal.prob(t_minus))
            )
            tf.debugging.check_numerics(G, "NaN or Inf values encountered in criterion")
            criterion = G * stdev
        elif delta == 2:
            G = (
                (alpha**2 - 1 - t**2) * (normal.cdf(t_plus) - normal.cdf(t_minus))
                - 2 * t * (normal.prob(t_plus) - normal.prob(t_minus))
                + t_plus * normal.prob(t_plus)
                - t_minus * normal.prob(t_minus)
            )
            tf.debugging.check_numerics(G, "NaN or Inf values encountered in criterion")
            criterion = G * variance

        return criterion

    return acquisition
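

# --- Numerical sanity check (illustrative, not part of the trieste API) -----------
# The closed-form expressions for G above should agree, up to Monte Carlo error,
# with a direct estimate of E[max(0, (alpha * s)^delta - |T - f|^delta)] for
# f ~ N(m, s^2). The helper below is a hypothetical verification utility.
def _mc_expected_feasibility(
    mean: float,
    stdev: float,
    threshold: float,
    alpha: float,
    delta: int,
    n_samples: int = 100_000,
) -> float:
    samples = tfp.distributions.Normal(
        tf.constant(mean, tf.float64), tf.constant(stdev, tf.float64)
    ).sample(n_samples)
    integrand = tf.maximum(0.0, (alpha * stdev) ** delta - tf.abs(threshold - samples) ** delta)
    return float(tf.reduce_mean(integrand))


# For delta=1 this should match G * stdev as computed in ``acquisition`` above, e.g.
#     _mc_expected_feasibility(mean=0.3, stdev=0.5, threshold=0.0, alpha=1.0, delta=1)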


class IntegratedVarianceReduction(SingleModelAcquisitionBuilder[FastUpdateModel]):
    """
    Builder for the reduction of the integral of the predicted variance over the search space
    given a batch of query points.
    """

    def __init__(
        self,
        integration_points: TensorType,
        threshold: Optional[Union[float, Sequence[float], TensorType]] = None,
    ) -> None:
        """
        :param integration_points: Set of points over which to integrate the prediction
            variance.
        :param threshold: Either None, a float or a sequence of 1 or 2 float values.
        """
        self._integration_points = integration_points
        self._threshold = threshold

    def __repr__(self) -> str:
        """"""
        return f"IntegratedVarianceReduction(threshold={self._threshold!r})"

    def prepare_acquisition_function(
        self,
        model: FastUpdateModel,
        dataset: Optional[Dataset] = None,
    ) -> AcquisitionFunction:
        """
        :param model: The model.
        :param dataset: Unused.
        :return: The integral of the predictive variance.
        """
        if not isinstance(model, FastUpdateModel):
            raise NotImplementedError(
                f"IntegratedVarianceReduction only works with FastUpdateModel models; "
                f"received {model!r}"
            )

        return integrated_variance_reduction(model, self._integration_points, self._threshold)

    def update_acquisition_function(
        self,
        function: AcquisitionFunction,
        model: FastUpdateModel,
        dataset: Optional[Dataset] = None,
    ) -> AcquisitionFunction:
        """
        :param function: The acquisition function to update.
        :param model: The model.
        :param dataset: Unused.
        """
        return function  # no need to update anything
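

# --- Usage sketch (illustrative, not part of the trieste API) ---------------------
# A hypothetical setup: integration points are drawn from the search space and the
# builder is passed to an optimization rule. GaussianProcessRegression from
# trieste.models.gpflow is one model implementing FastUpdateModel.
#
#     from trieste.acquisition.rule import EfficientGlobalOptimization
#     from trieste.space import Box
#
#     search_space = Box([0.0, 0.0], [1.0, 1.0])
#     integration_points = search_space.sample_sobol(512)
#
#     # global accuracy (threshold=None), with joint batches of 4 query points
#     acq_rule = EfficientGlobalOptimization(
#         builder=IntegratedVarianceReduction(integration_points), num_query_points=4
#     )
#
#     # or 'targeted' learning around the contour f(x) = 0.5
#     targeted = IntegratedVarianceReduction(integration_points, threshold=0.5)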


class integrated_variance_reduction(AcquisitionFunctionClass):
    r"""
    The reduction of the (weighted) average of the predicted variance over the integration
    points (a.k.a. the Integrated Mean Square Error or IMSE criterion).
    See :cite:`Picheny2010` for details.

    The criterion (to maximise) writes as:

    .. math:: \int_x (v_{old}(x) - v_{new}(x)) * weights(x),

    where :math:`v_{old}(x)` is the predictive variance of the model at :math:`x`, and
    :math:`v_{new}(x)` is the updated predictive variance, given that the GP is further
    conditioned on the query points.

    Note that since :math:`v_{old}(x)` is constant w.r.t. the query points, this function
    only returns :math:`-\int_x v_{new}(x) * weights(x)`.

    If no threshold is provided, the goal is to learn a globally accurate model, and the
    predictive variance (:math:`v_{new}`) is used. Otherwise, learning is 'targeted' towards
    regions where the GP is close to particular values, and the variance is weighted by the
    posterior GP pdf evaluated at the threshold T (if a single value is given) or by the
    probability that the GP posterior belongs to the interval between the two thresholds T1
    and T2 (note the slightly different parametrisation compared to :cite:`Picheny2010` in
    that case).

    This criterion allows batch sizes greater than one; note that the computational cost
    grows cubically with the batch size.

    This criterion requires a method (conditional_predict_f) to compute the new predictive
    variance given that query points are added to the data.
    """

    def __init__(
        self,
        model: FastUpdateModel,
        integration_points: TensorType,
        threshold: Optional[Union[float, Sequence[float], TensorType]] = None,
    ):
        """
        :param model: The model of the objective function.
        :param integration_points: Points over which to integrate the objective prediction
            variance.
        :param threshold: Either None, a float or a sequence of 1 or 2 float values. See class
            docs for details.
        :raise ValueError (or InvalidArgumentError): If ``threshold`` has more than 2 values.
        """
        self._model = model

        tf.debugging.assert_equal(
            len(tf.shape(integration_points)),
            2,
            message="integration_points must be of shape [N, D]",
        )
        tf.debugging.assert_positive(
            tf.shape(integration_points)[0],
            message="integration_points should contain at least one point",
        )
        self._integration_points = integration_points

        if threshold is None:
            self._weights = tf.constant(1.0, integration_points.dtype)
        else:
            if isinstance(threshold, float):
                t_threshold = tf.constant([threshold], integration_points.dtype)
            else:
                t_threshold = tf.cast(threshold, integration_points.dtype)

            tf.debugging.assert_rank(
                t_threshold,
                1,
                message=f"threshold should be a float, a sequence "
                f"or a rank 1 tensor, received {tf.shape(t_threshold)}",
            )
            tf.debugging.assert_less_equal(
                tf.size(t_threshold),
                2,
                message=f"threshold should have one or two values,"
                f" received {tf.size(t_threshold)}",
            )
            tf.debugging.assert_greater_equal(
                tf.size(t_threshold),
                1,
                message=f"threshold should have one or two values,"
                f" received {tf.size(t_threshold)}",
            )
            if tf.size(t_threshold) > 1:
                tf.debugging.assert_greater_equal(
                    t_threshold[1],
                    t_threshold[0],
                    message=f"threshold values should be in increasing order,"
                    f" received {t_threshold}",
                )

            mean_old, var_old = self._model.predict(query_points=integration_points)
            distr = tfp.distributions.Normal(mean_old, tf.sqrt(var_old))
            if tf.size(t_threshold) == 1:
                self._weights = distr.prob(t_threshold[0])
            else:
                self._weights = distr.cdf(t_threshold[1]) - distr.cdf(t_threshold[0])

    @tf.function
    def __call__(self, x: TensorType) -> TensorType:
        additional_data = Dataset(x, tf.ones_like(x[..., 0:1]))
        _, variance = self._model.conditional_predict_f(
            query_points=self._integration_points, additional_data=additional_data
        )
        return -tf.reduce_mean(variance * self._weights, axis=-2)
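

# --- Note on the weights (illustrative) --------------------------------------------
# With an interval threshold [T1, T2], each integration point is weighted in
# ``__init__`` above by the posterior mass P(T1 <= f(x) <= T2). For instance, for a
# point with posterior mean 0 and unit variance, threshold=[-1.0, 1.0] yields
#
#     distr = tfp.distributions.Normal(0.0, 1.0)
#     float(distr.cdf(1.0) - distr.cdf(-1.0))  # ~0.6827, the one-sigma mass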


class BayesianActiveLearningByDisagreement(SingleModelAcquisitionBuilder[ProbabilisticModel]):
    """
    Builder for the *Bayesian Active Learning By Disagreement* acquisition function defined in
    :cite:`houlsby2011bayesian`.
    """

    def __init__(self, jitter: float = DEFAULTS.JITTER) -> None:
        """
        :param jitter: The size of the jitter to avoid numerical problems caused by the log
            operation if the variance is close to zero.
        """
        self._jitter = jitter

    def __repr__(self) -> str:
        """"""
        return f"BayesianActiveLearningByDisagreement(jitter={self._jitter!r})"

    def prepare_acquisition_function(
        self,
        model: ProbabilisticModel,
        dataset: Optional[Dataset] = None,
    ) -> AcquisitionFunction:
        """
        :param model: The model.
        :param dataset: Unused.
        :return: The Bayesian Active Learning By Disagreement acquisition function.
        """
        return bayesian_active_learning_by_disagreement(model, self._jitter)

    def update_acquisition_function(
        self,
        function: AcquisitionFunction,
        model: ProbabilisticModel,
        dataset: Optional[Dataset] = None,
    ) -> AcquisitionFunction:
        """
        :param function: The acquisition function to update.
        :param model: The model.
        :param dataset: Unused.
        """
        return function  # no need to update anything


class bayesian_active_learning_by_disagreement(AcquisitionFunctionClass):
    def __init__(self, model: ProbabilisticModel, jitter: float):
        r"""
        The Bayesian active learning by disagreement acquisition function computes the
        information gain of the predictive entropy :cite:`houlsby2011bayesian`. The
        acquisition function is calculated as:

        .. math::
            \mathrm{h}\left(\Phi\left(\frac{\mu_{\boldsymbol{x}, \mathcal{D}}}
            {\sqrt{\sigma_{\boldsymbol{x}, \mathcal{D}}^{2}+1}}\right)\right)
            -\frac{C \exp \left(-\frac{\mu_{\boldsymbol{x}, \mathcal{D}}^{2}}
            {2\left(\sigma_{\boldsymbol{x}, \mathcal{D}}^{2}+C^{2}\right)}\right)}
            {\sqrt{\sigma_{\boldsymbol{x}, \mathcal{D}}^{2}+C^{2}}}

        Here :math:`\mathrm{h}(p)` is the binary entropy, defined as:

        .. math:: \mathrm{h}(p)=-p \log p-(1-p) \log (1-p)

        This acquisition function is intended for use with binary Gaussian process
        classification models with a Bernoulli likelihood. It is designed for VGP, but other
        Gaussian approximations of the posterior can be used: SVGP, for instance, or some
        other model that is not currently supported by Trieste. Integrating over nuisance
        parameters is currently not supported (see equation 6 of the paper).

        :param model: The model of the objective function.
        :param jitter: The size of the jitter to avoid numerical problems caused by the log
            operation if the variance is close to zero.
        :return: The Bayesian Active Learning By Disagreement acquisition function.
        """
        tf.debugging.assert_positive(jitter, message="Jitter must be positive.")

        self._model = model
        self._jitter = jitter

    @tf.function
    def __call__(self, x: TensorType) -> TensorType:
        tf.debugging.assert_shapes(
            [(x, [..., 1, None])],
            message="This acquisition function only supports batch sizes of one.",
        )
        mean, variance = self._model.predict(tf.squeeze(x, -2))
        variance = tf.maximum(variance, self._jitter)

        normal = tfp.distributions.Normal(tf.constant(0, mean.dtype), tf.constant(1, mean.dtype))
        p = normal.cdf(mean / tf.sqrt(variance + 1))

        C2 = (math.pi * tf.math.log(tf.constant(2, mean.dtype))) / 2
        Ef = (tf.sqrt(C2) / tf.sqrt(variance + C2)) * tf.exp(-(mean**2) / (2 * (variance + C2)))

        return -p * tf.math.log(p + self._jitter) - (1 - p) * tf.math.log(1 - p + self._jitter) - Ef
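

# --- Formula note (illustrative, not part of the trieste API) ----------------------
# Above, p = Phi(mu / sqrt(sigma^2 + 1)) is the approximate predictive probability of
# the positive class, and C2 = pi * ln(2) / 2 is the squared constant from the
# approximation in :cite:`houlsby2011bayesian`; the returned value is the binary
# entropy h(p) minus the approximated expected entropy term Ef. A hypothetical
# standalone helper for the entropy term:
def _binary_entropy(p: TensorType, jitter: float = DEFAULTS.JITTER) -> TensorType:
    # h(p) = -p log p - (1 - p) log(1 - p), with jitter guarding against log(0)
    return -p * tf.math.log(p + jitter) - (1 - p) * tf.math.log(1 - p + jitter)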