# Source code for dmgp.models.dmgp_variational

# Copyright (c) 2024 Wenyuan Zhao, Haoyuan Chen
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# @authors: Wenyuan Zhao, Haoyuan Chen.
#
# ===============================================================================================


from __future__ import print_function
import torch
import torch.nn as nn

from dmgp.layers.linear import LinearFlipout, LinearReparameterization
from dmgp.layers.activation import TMK, AMK
from dmgp.kernels.laplace_kernel import LaplaceProductKernel
from dmgp.utils.sparse_design.design_class import HyperbolicCrossDesign

__all__ = [
    'GPLayer',
    'DMGP',
]


class GPLayer(nn.Module):
    r"""
    Represents a layer in DMGP where inference is performed via finite-rank approximation,
    which can be represented as a one-layer neural network with correlated Gaussian
    distributed weights:

    .. math::

        \begin{align*}
            \hat{\mathcal{G}}^{(i)}(\cdot) := & \mu + k(\cdot, \mathbf{U}) [ k(\mathbf{U}, \mathbf{U})]^{-1} \mathcal{G}^{(i)}(\mathbf{U}), \\
            = & \mu + k(\cdot, \mathbf{U}) R^{-1}_{\mathbf{U}} \mathbf{Z} \\
            = & \mu + \phi^{T}(\cdot) \mathbf{Z}
        \end{align*}

    A GP Layer consists of a GP activation :math:`\phi(\cdot) = k(\cdot, \mathbf{U}) R^{-1}_{\mathbf{U}}`
    and a linear layer with Gaussian weights :math:`\mathbf{Z} = [R^{T}_{\mathbf{U}}]^{-1} \mathcal{G}(\mathbf{U})`.
    :math:`\mathbf{U}=\{ \mathbf{u}_i \}_{i=1}^{m}` are the inducing points for approximating GP.
    :math:`R_{\mathbf{U}}` is the Cholesky decomposition of the kernel matrix :math:`k(\mathbf{U}, \mathbf{U})`.
    The algorithm of Cholesky decomposition in DMGP can be found in `dmgp.utils.operators.chol_inv`.

    Concretely, the layer chains three sub-modules: a ``BatchNorm1d`` over the input
    features (``self.norm``), the GP activation (``self.gp``) and the dense layer
    (``self.dense``) whose input width is ``self.gp.out_features``.

    :param in_features: Input features of :math:`\mathbf x_1`.
    :type in_features: int
    :param out_features: Output features of GP layer.
    :type out_features: int
    :param num_inducing: Level of inducing points for approximating GP. It is forwarded to
        the GP activation as ``n_level``. Default: `2`.
    :type num_inducing: int, optional
    :param input_lb: Lower bound of the input space. You can choose any bound you want and
        apply normalization in the front. Default: `-2`.
    :type input_lb: float, optional
    :param input_ub: Upper bound of the input space. You can choose any bound you want and
        apply normalization in the front. Default: `2`.
    :type input_ub: float, optional
    :param dense: The dense linear layer for Gaussian weights. Default: `LinearFlipout`.
    :type dense: class, dmgp.layers.linear, optional
    :param gp_activation: The GP activation layer. Default: `AMK`.
    :type gp_activation: class, dmgp.layers.activation, optional
    :param kernel: The GP kernel instance. Default: `LaplaceProductKernel(lengthscale=1.)`.
    :type kernel: dmgp.kernels instance, optional
    :param design_class: The design class of choosing inducing points for approximating GP.
        Default: `HyperbolicCrossDesign`.
    :type design_class: class, dmgp.utils.sparse_design.design_class, optional
    """

    # NOTE(review): the default ``kernel`` below is instantiated once, when this class is
    # defined, so every layer that relies on the default receives the very same kernel
    # object. Pass an explicit kernel if layers must not share it -- whether the kernel
    # carries mutable/learnable state is not visible from this file; confirm.
    def __init__(self,
                 in_features,
                 out_features,
                 num_inducing=2,
                 input_lb=-2,
                 input_ub=2,
                 dense=LinearFlipout,
                 gp_activation=AMK,
                 kernel=LaplaceProductKernel(lengthscale=1.),
                 design_class=HyperbolicCrossDesign):
        super(GPLayer, self).__init__()

        # Optional input normalisation, applied in ``forward`` when ``normalize`` is true.
        self.norm = nn.BatchNorm1d(in_features, affine=True)

        # GP activation phi(.) built on the inducing-point design.
        self.gp = gp_activation(
            in_features=in_features,
            n_level=num_inducing,
            input_lb=input_lb,
            input_ub=input_ub,
            kernel=kernel,
            design_class=design_class,
        )

        # Linear layer with Gaussian weights; its input width is whatever the
        # activation reports as its output width.
        self.dense = dense(in_features=self.gp.out_features, out_features=out_features)

    def forward(self, x, normalize=True, return_kl=True):
        r"""
        Forward the GP inference :math:`\mu + \phi^{T}(x) \mathbf{Z}`.

        :param x: Training data of shape :math:`(n,d)`.
        :type x: torch.Tensor.float
        :param normalize: Apply normalization to fit induced points. Default: `True`.
        :type normalize: bool, optional
        :param return_kl: Return KL-divergence. Default: `True`.
        :type return_kl: bool, optional

        :return: ``(out, kl)`` when ``return_kl`` is true, otherwise only ``out``, where
            ``kl`` is the KL term reported by the dense layer.
        """
        if normalize:
            x = self.norm(x)

        features = self.gp(x)
        # The dense layer returns both its output and its KL-divergence term.
        out, kl = self.dense(features)

        if return_kl:
            return out, kl
        return out


class DMGP(nn.Module):
    r"""
    A container module to build a Deep GP. This module should contain GPLayer modules,
    and can also contain other modules as well.

    The network built here is: a ``LinearFlipout`` embedding
    (``in_features -> hidden_dim``), followed by ``num_layers`` :class:`GPLayer` modules
    (``hidden_dim -> hidden_dim``), followed by a ``LinearFlipout`` classifier
    (``hidden_dim -> out_features``).

    :param in_features: Input features of :math:`\mathbf x_1` (after flattening every
        dimension except the first).
    :type in_features: int
    :param out_features: Output features of the final (classifier) layer.
    :type out_features: int
    :param num_layers: Number of hidden GP layers in DMGP model. Default: `2`.
    :type num_layers: int
    :param hidden_dim: Dimension of hidden layers in DMGP model. Default: `8`.
    :type hidden_dim: int
    :param num_inducing: Level of inducing points for approximating GP. For "sparse grid"
        design, we recommend low level of inducing. Default: `2`.
    :type num_inducing: int, optional
    :param input_lb: Lower bound of the input space. You can choose any bound you want and
        apply normalization in the front. Default: `-2`.
    :type input_lb: float, optional
    :param input_ub: Upper bound of the input space. You can choose any bound you want and
        apply normalization in the front. Default: `2`.
    :type input_ub: float, optional
    :param kernel: The GP kernel instance; the same object is handed to every GP layer.
        Default: `LaplaceProductKernel(lengthscale=1.)`.
    :type kernel: dmgp.kernels instance, optional
    :param design_class: The design class of choosing inducing points for approximating GP.
        Default: `HyperbolicCrossDesign`.
    :type design_class: class, dmgp.utils.sparse_design.design_class, optional
    :param linear_layer: The dense linear layer for Gaussian weights used inside each GP
        layer. The embedding and classifier layers always use `LinearFlipout` regardless
        of this argument. Default: `LinearFlipout`.
    :type linear_layer: class, dmgp.layers.linear, optional
    :param option: The option of DMGP architecture: use sparse grid or additive structure.
        ``'additive'`` selects the `AMK` activation; any other value selects `TMK`.
        Default: `additive`.
    :type option: str, optional
    """

    # NOTE(review): the default ``kernel`` below is instantiated once, when this class is
    # defined, so every model relying on the default receives the very same kernel object.
    def __init__(self,
                 in_features,
                 out_features,
                 num_layers=2,
                 hidden_dim=8,
                 num_inducing=2,
                 input_lb=-2,
                 input_ub=2,
                 kernel=LaplaceProductKernel(lengthscale=1.),
                 design_class=HyperbolicCrossDesign,
                 linear_layer=LinearFlipout,
                 option='additive'):
        super(DMGP, self).__init__()

        # Projects the flattened input to the hidden width.
        self.embedding = LinearFlipout(in_features, hidden_dim)

        # Only the exact string 'additive' picks AMK; everything else falls back to TMK.
        activation = AMK if option == 'additive' else TMK

        self.gp_layers = nn.ModuleList(
            [
                GPLayer(
                    hidden_dim,
                    hidden_dim,
                    num_inducing,
                    input_lb,
                    input_ub,
                    dense=linear_layer,
                    gp_activation=activation,
                    kernel=kernel,
                    design_class=design_class,
                )
                for _ in range(num_layers)
            ]
        )

        # Maps the last hidden representation to the requested output width.
        self.classifier = LinearFlipout(hidden_dim, out_features)

    def forward(self, x, normalize=True, return_kl=True):
        r"""
        Forward the DMGP inference.

        :param x: Training data of shape :math:`(n,d)`; any trailing dimensions are
            flattened into one before the embedding.
        :type x: torch.Tensor.float
        :param normalize: Apply normalization to fit induced points in every GP layer.
            Default: `True`.
        :type normalize: bool, optional
        :param return_kl: Return KL-divergence. Default: `True`.
        :type return_kl: bool, optional

        :return: ``(out, kl_sum)`` when ``return_kl`` is true, otherwise only ``out``.
            ``kl_sum`` is the sum of the KL terms of the embedding, every GP layer and
            the classifier.
        """
        kl_sum = 0

        out = torch.flatten(x, 1)
        out, kl = self.embedding(out)
        kl_sum += kl

        for gp_layer in self.gp_layers:
            # Each GP layer is always asked for its KL term so it can be accumulated,
            # independently of what the caller requested via ``return_kl``.
            out, kl = gp_layer(out, normalize=normalize, return_kl=True)
            kl_sum += kl

        out, kl = self.classifier(out)
        kl_sum += kl

        if return_kl:
            return out, kl_sum
        return out