# Source code for atomai.models.dklgp.dklgpr

"""
dklgpr.py
=========

Deep kernel learning (DKL)-based Gaussian process regression (GPR)

Created by Maxim Ziatdinov (email: maxim.ziatdinov@ai4microscopy.com)
"""

import warnings
from typing import Tuple, Type, Union, List

import gpytorch
import numpy as np
import torch

from ...trainers import dklGPTrainer
from ...utils import init_dataloader

# Short alias for gpytorch's multivariate normal distribution class;
# used in the return-type annotation of dklGPR._compute_posterior.
mvn_ = gpytorch.distributions.MultivariateNormal


class dklGPR(dklGPTrainer):
    """
    Deep kernel learning (DKL)-based Gaussian process regression (GPR)

    Args:
        indim: input feature dimension
        embedim: embedding dimension (determines dimensionality of kernel space)
        shared_embedding_space: use one embedding space for all target outputs

    Keyword Args:
        device:
            Sets device to which model and data will be moved.
            Defaults to 'cuda:0' if a GPU is available and to CPU otherwise.
        precision:
            Sets tensor types for 'single' (torch.float32)
            or 'double' (torch.float64) precision
        seed:
            Seed for enforcing reproducibility

    Examples:

        Train a DKL-GPR model with high-dimensional inputs X and outputs y:

        >>> data_dim = X.shape[-1]  # X dimensions are n_samples x d
        >>> dklgp = aoi.models.dklGPR(data_dim, embedim=2, precision="double")
        >>> dklgp.fit(X, y, training_cycles=100, lr=1e-2)

        Make a prediction on new data (mean and variance for each 'test' point):

        >>> mean, var = dklgp.predict(X_test, batch_size=len(X_test))

        Alternatively, one can obtain a prediction as follows:

        >>> samples = dklgp.sample_from_posterior(X_test, num_samples=1000)
        >>> mean, var = samples.mean(0), samples.var(0)
    """

    def __init__(self,
                 indim: int,
                 embedim: int = 2,
                 shared_embedding_space: bool = True,
                 **kwargs: Union[str, int]) -> None:
        """
        Initializes DKL-GPR model by forwarding all arguments
        to the parent trainer class
        """
        super().__init__(indim, embedim, shared_embedding_space, **kwargs)

    def fit(self, X: Union[torch.Tensor, np.ndarray],
            y: Union[torch.Tensor, np.ndarray],
            training_cycles: int = 1,
            **kwargs: Union[Type[torch.nn.Module], bool, float]
            ) -> None:
        """
        Initializes and trains a deep kernel GP model

        Args:
            X: Input training data (aka features) of N x input_dim dimensions
            y: Output targets of batch_size x N or N (if batch_size=1) dimensions
            training_cycles: Number of training epochs

        Keyword Args:
            feature_extractor:
                (Optional) Custom neural network for feature extractor.
                Must take input/feature dims and embedding dims as its arguments.
            freeze_weights:
                Freezes weights of feature extractor, that is, they are not
                passed to the optimizer. Used for a transfer learning.
            lr: learning rate (Default: 0.01)
            print_loss: print loss at every n-th training cycle (epoch)
        """
        # The value returned by the trainer's run() is intentionally discarded
        _ = self.run(X, y, training_cycles, **kwargs)

    def fit_ensemble(self, X: Union[torch.Tensor, np.ndarray],
                     y: Union[torch.Tensor, np.ndarray],
                     training_cycles: int = 1,
                     n_models: int = 5,
                     **kwargs: Union[Type[torch.nn.Module], bool, float]
                     ) -> None:
        """
        Initializes and trains an ensemble of deep kernel GP models

        The scalar target is replicated n_models times, so that the ensemble
        is trained as a model with n_models independent outputs.

        Args:
            X: Input training data (aka features) of N x input_dim dimensions
            y: Output targets of 1 x N or N dimensions (scalar targets only)
            training_cycles: Number of training epochs
            n_models: Number of models in ensemble (must be >= 1)

        Keyword Args:
            feature_extractor:
                (Optional) Custom neural network for feature extractor.
                Must take input/feature dims and embedding dims as its arguments.
            freeze_weights:
                Freezes weights of feature extractor, that is, they are not
                passed to the optimizer. Used for a transfer learning.
            lr: learning rate (Default: 0.01)
            print_loss: print loss at every n-th training cycle (epoch)

        Raises:
            ValueError: if n_models is smaller than 1
            NotImplementedError: if y contains more than one target row
        """
        if n_models < 1:
            # Repeating the targets zero (or a negative number of) times
            # would silently produce an empty training set
            raise ValueError("n_models must be a positive integer")
        if y.ndim == 1:
            y = y[None]
        if y.shape[0] > 1:
            raise NotImplementedError(
                "The ensemble training is currently supported only for scalar targets")
        # numpy's repeat takes (repeats, axis) whereas torch's repeat takes
        # per-dimension tile counts; both calls yield an n_models x N array
        if isinstance(y, np.ndarray):
            y = y.repeat(n_models, 0)
        else:
            y = y.repeat(n_models, 1)
        if self.correlated_output:
            msg = ("Replacing a single shared embedding space with" +
                   " {} independent ones").format(n_models)
            warnings.warn(msg)
            self.correlated_output = False
        self.ensemble = True
        _ = self.run(X, y, training_cycles, **kwargs)

    def _compute_posterior(self, X: torch.Tensor) -> Union[mvn_, List[mvn_]]:
        """
        Computes the posterior over model outputs at the provided points (X).
        For a model with multiple independent outputs, it returns a list of
        posteriors computed for each independent model.

        Raises:
            ValueError: if the outputs are independent and X is not a
                q x n x d tensor with q equal to the number of outputs
        """
        independent_outputs = not self.correlated_output
        if independent_outputs:
            n_outputs = len(self.gp_model.train_targets)
            if X.ndim != 3 or X.shape[0] != n_outputs:
                raise ValueError(
                    "The input data must have q x n x d dimensionality " +
                    "where the first dimension (q) must be equal to the " +
                    "number of independent outputs")
        self.gp_model.eval()
        self.likelihood.eval()
        wrn = gpytorch.models.exact_gp.GPInputWarning
        with warnings.catch_warnings():
            # Silence gpytorch's input warning for the duration of the call
            warnings.filterwarnings("ignore", category=wrn)
            with torch.no_grad(), gpytorch.settings.use_toeplitz(False), gpytorch.settings.fast_pred_var():
                X = X.to(self.device)
                if independent_outputs:
                    # One 1 x n x d slice is passed per independent model
                    posterior = self.gp_model(*X.split(1))
                else:
                    posterior = self.gp_model(X)
        return posterior

    def sample_from_posterior(self,
                              X: Union[torch.Tensor, np.ndarray],
                              num_samples: int = 1000) -> np.ndarray:
        """
        Computes the posterior over model outputs at the provided points (X)
        and samples from it

        Args:
            X: Points at which the posterior is evaluated
            num_samples: Number of samples to draw from the posterior

        Returns:
            Array of samples whose first axis has length num_samples
        """
        X, _ = self.set_data(X)
        gp_batch_dim = len(self.gp_model.train_targets)
        # Replicate the inputs along a new leading axis, one copy per output
        X = X.expand(gp_batch_dim, *X.shape)
        posterior = self._compute_posterior(X)
        sample_shape = torch.Size([num_samples, ])
        if self.correlated_output:
            samples = posterior.rsample(sample_shape)
        else:
            # Per-model samples are joined along the output axis (dim 1)
            samples = torch.cat([p.rsample(sample_shape) for p in posterior], 1)
        return samples.cpu().numpy()

    def thompson(self,
                 X_cand: Union[torch.Tensor, np.ndarray],
                 scalarize_func = None,
                 maximize: bool = True) -> Tuple[np.ndarray, np.ndarray]:
        """
        Thompson sampling for selecting the next measurement point

        Args:
            X_cand: Candidate points
            scalarize_func:
                (Optional) Function applied to the drawn sample when the
                sample has more than one dimension; its result is given
                a new leading axis before the arg-max/arg-min is taken.
            maximize:
                Selects the index of the largest (True) or smallest (False)
                sampled value

        Returns:
            The drawn sample and the index (indices) of the selected
            candidate, both as numpy arrays
        """
        X_cand, _ = self.set_data(X_cand)
        gp_batch_dim = len(self.gp_model.train_targets)
        # Keep the expanded shape intact (as in sample_from_posterior and
        # predict). Squeezing here would drop any size-1 axis -- a single
        # candidate, a 1-D feature space or a single output -- and break the
        # q x n x d layout validated in _compute_posterior.
        X_cand = X_cand.expand(gp_batch_dim, *X_cand.shape)
        posterior = self._compute_posterior(X_cand)
        if self.correlated_output:
            tsample = posterior.rsample()
        else:
            tsample = torch.cat([p.rsample() for p in posterior])
        if tsample.ndim > 1 and scalarize_func is not None:
            tsample = scalarize_func(tsample).unsqueeze(0)
        idx = tsample.argmax(1) if maximize else tsample.argmin(1)
        return tsample.cpu().numpy(), idx.cpu().numpy()

    def _predict(self, x_new: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns posterior mean and variance (as CPU tensors) for a single
        batch of already expanded inputs
        """
        posterior = self._compute_posterior(x_new)
        if self.correlated_output:
            return posterior.mean.cpu(), posterior.variance.cpu()
        # Independent outputs: join the per-model statistics along dim 0
        means_all = torch.cat([p.mean for p in posterior])
        vars_all = torch.cat([p.variance for p in posterior])
        return means_all.cpu(), vars_all.cpu()

    def predict(self, x_new: Union[torch.Tensor, np.ndarray],
                **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """
        Prediction of mean and variance using the trained model

        Args:
            x_new: New ('test') inputs

        Keyword Args:
            Passed on to the data loader initializer
            (e.g. batch_size, as in the class-level example).

        Returns:
            Predicted mean and variance as numpy arrays with all
            singleton dimensions removed
        """
        gp_batch_dim = len(self.gp_model.train_targets)
        # Data stays on CPU here; each batch is moved to the model's device
        # inside _compute_posterior
        x_new, _ = self.set_data(x_new, device='cpu')
        data_loader = init_dataloader(x_new, shuffle=False, **kwargs)
        predicted_mean, predicted_var = [], []
        for (x,) in data_loader:
            x = x.expand(gp_batch_dim, *x.shape)
            mean, var = self._predict(x)
            predicted_mean.append(mean)
            predicted_var.append(var)
        # Batches are stitched back together along the data-point axis (dim 1)
        return (torch.cat(predicted_mean, 1).numpy().squeeze(),
                torch.cat(predicted_var, 1).numpy().squeeze())

    def _embed(self, x_new: torch.Tensor) -> torch.Tensor:
        """
        Passes a single batch through the feature extractor(s) and the
        scaling layer, and returns the result as a CPU tensor. For
        independent outputs the per-model embeddings are stacked along
        a new last axis.
        """
        self.gp_model.eval()
        with torch.no_grad():
            if self.correlated_output:
                embeded = self.gp_model.feature_extractor(x_new)
                embeded = self.gp_model.scale_to_bounds(embeded)
            else:
                embeded = [m.scale_to_bounds(m.feature_extractor(x_new))[..., None]
                           for m in self.gp_model.models]
                embeded = torch.cat(embeded, -1)
        return embeded.cpu()

    def embed(self, x_new: Union[torch.Tensor, np.ndarray],
              **kwargs: int) -> np.ndarray:
        """
        Embeds the input data to a "latent" space using a trained feature extractor NN.

        Args:
            x_new: Input data to embed

        Keyword Args:
            Passed on to the data loader initializer.

        Returns:
            Embedded data as a numpy array. For a non-ensemble model with
            independent outputs, the per-model axis is moved to the front.
        """
        x_new, _ = self.set_data(x_new, device='cpu')
        data_loader = init_dataloader(x_new, shuffle=False, **kwargs)
        embeded = torch.cat([self._embed(x.to(self.device)) for (x,) in data_loader], 0)
        # Within this class `ensemble` is assigned only in fit_ensemble;
        # fall back to False in case the base class does not initialize it
        is_ensemble = getattr(self, "ensemble", False)
        if not self.correlated_output and not is_ensemble:
            embeded = embeded.permute(-1, 0, 1)
        return embeded.numpy()