Source code for typhon.retrieval.bmci.bmci

"""
Bayesian Monte Carlo Integration.

Contains the `BMCI` class which implements the Bayesian Monte Carlo
Integration (BMCI) method as proposed by Evans et al. in [Evans]_.

.. [Evans] Evans, K. F. et al. Submillimeter-Wave Cloud Ice Radiometer: Simulations
   of retrieval algorithm performance. Journal of Geophysical Research 107, 2002
"""
import numpy as np

class BMCI:
    r"""
    Bayesian Monte Carlo Integration

    This class implements methods for solving Bayesian inverse problems
    using Monte Carlo integration. The method uses a database of
    atmospheric states :math:`x_i` and corresponding observations
    :math:`\mathbf{y}_i`, which are used to compute integrals of the
    posterior distribution by means of importance sampling:

    .. math::
        \int_a^b f(x') p(x' | \mathbf{y}) \: dx' \approx
        \sum_{a \leq x_i \leq b}
        \frac{w_i(\mathbf{y}) f(x_i)}{\sum_j w_j(\mathbf{y})}.

    The method assumes that measurement uncertainties can be described
    by a multivariate Gaussian distribution with covariance matrix
    :math:`\mathbf{S}_o`, so that the weights :math:`w_i(\mathbf{y})`
    are given by

    .. math::
        w_i(\mathbf{y}) = \exp \left\{ -\frac{1}{2}
        (\mathbf{y} - \mathbf{y}_i)^T \mathbf{S}_o^{-1}
        (\mathbf{y} - \mathbf{y}_i) \right\}.

    The measurements in the database :code:`y` are assumed to be given
    as an array of shape :code:`(n, m)`, where `n` is the number of
    cases in the database and `m` the number of channels. Currently
    only scalar retrievals are supported, which means that :code:`x` is
    assumed to be an `n`-element vector containing the retrieval
    quantities corresponding to the observations in :math:`\mathbf{y}`.

    Attributes:

        x: numpy.array, shape = (n,)
            The retrieval quantity corresponding to the atmospheric
            states represented in the database.
        y: numpy.array, shape = (n, m)
            The measured or simulated brightness temperatures
            corresponding to the atmospheric states represented in the
            database.
        s_o: numpy.array, shape = (m, m)
            The covariance matrix describing the measurement
            uncertainty.
        n: int
            The number of entries in the retrieval database.
        m: int
            The number of channels in a single measurement
            :math:`\mathbf{y}_i`.
        pc1: numpy.array, shape = (m,)
            The eigenvector corresponding to the smallest eigenvalue of
            the observation uncertainty covariance matrix. The entries
            in the database are ordered along this vector, which can be
            used to accelerate the retrieval.
        pc1_e: float
            The reciprocal of the smallest eigenvalue of the
            observation uncertainty covariance matrix.
        pc1_proj: numpy.array, shape = (n,)
            The projections of the measurements in `y` onto `pc1`,
            along which the database entries are ordered.
    """
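
    # The importance-sampling estimate above is straightforward to write
    # out directly in NumPy. A minimal sketch for the posterior-mean case
    # f(x) = x, using hypothetical names (`y_db`, `x_db`, `s_o_inv`,
    # `y_obs`) that are not part of this module:
    #
    #     dy = y_db - y_obs                  # (n, m) deviations
    #     ws = np.exp(-0.5 * np.sum((dy @ s_o_inv) * dy, axis=1))
    #     x_mean = np.sum(ws * x_db) / np.sum(ws)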

    def __init__(self, y, x, s_o):
        """
        Create a BMCI instance from a given retrieval database `y, x`
        and a measurement uncertainty given by the covariance matrix
        `s_o`.

        Args:
            y(numpy.array): 2D array containing the measured or
                simulated brightness temperatures corresponding to the
                atmospheric states represented in the database. These
                will be sorted along the eigenvector of the observation
                uncertainty covariance matrix with the smallest
                eigenvalue.
            x(numpy.array): 1D array containing the retrieval quantity
                corresponding to the atmospheric states represented in
                the database. These will be sorted along the
                eigenvector of the observation uncertainty covariance
                matrix with the smallest eigenvalue.
            s_o(numpy.array): 2D array containing the covariance matrix
                describing the measurement uncertainty.
        """
        self.n = y.shape[0]
        self.m = y.shape[1]

        if not s_o.ndim == 2:
            raise Exception("Covariance matrix must be a 2D array.")
        if not self.m == s_o.shape[0] or not self.m == s_o.shape[1]:
            raise Exception("Covariance matrix must be square and agree "
                            "with the dimensions of the measurement "
                            "vector.")

        # Covariance matrix and its inverse.
        self.s_o = s_o
        self.s_o_inv = np.linalg.inv(self.s_o)

        # Eigendecomposition of s_o: pc1 is the eigenvector with the
        # smallest eigenvalue, pc1_e the reciprocal of that eigenvalue.
        self.y_mean = np.mean(y, axis=0)
        w, v = np.linalg.eig(self.s_o)
        inds = np.argsort(w)
        self.pc1_e = 1.0 / w[inds[0]]
        self.pc1 = v[:, inds[0]]

        # Sort the database entries along their projection onto pc1 so
        # that candidate ranges can be found by binary search.
        self.pc1_proj = np.dot(y - self.y_mean, self.pc1)
        indices = np.argsort(self.pc1_proj)
        self.pc1_proj = self.pc1_proj[indices]
        self.x = x[indices]
        self.y = y[indices, :]

        self.x_sorted_inds = np.argsort(self.x)
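
    # Construction sketch with made-up synthetic data. The import path
    # assumes `BMCI` is exposed under typhon.retrieval.bmci as documented;
    # all names and sizes below are illustrative only:
    #
    #     import numpy as np
    #     from typhon.retrieval.bmci import BMCI
    #
    #     n, m = 10000, 5                    # database size, channels
    #     x_db = np.random.rand(n)           # scalar retrieval quantity
    #     y_db = np.random.randn(n, m)       # simulated observations
    #     s_o = 1.0 * np.eye(m)              # observation uncertainty
    #     bmci = BMCI(y_db, x_db, s_o)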

    def __find_hits(self, y_obs, x2_max=10.0):
        r"""
        Finds the range of indices of database entries that cannot be
        guaranteed to have a :math:`\chi^2` value greater than
        `x2_max`; all entries outside this range are guaranteed to
        exceed it.

        Args:
            y_obs: 1D array containing the measurement for which to
                compute the range of elements to include in the weight
                calculation.

        Returns:
            Tuple of integers `(i_l, i_u, n_hits)`:

            i_l: int
                The lower index of the range of elements to include in
                the weight calculation.
            i_u: int
                The upper index of the range of elements to include in
                the weight calculation.
            n_hits: int
                The number of hits in the database that are within the
                specified :math:`\chi^2` limits.
        """
        # Entries whose projection onto pc1 lies outside [s_l, s_u] are
        # guaranteed to exceed the cutoff, so a binary search over the
        # sorted projections is sufficient to find all candidates.
        y_proj = np.dot(self.pc1, (y_obs - self.y_mean).ravel())
        s_l = y_proj - np.sqrt(2.0 * x2_max / self.pc1_e)
        s_u = y_proj + np.sqrt(2.0 * x2_max / self.pc1_e)
        inds = np.searchsorted(self.pc1_proj, np.array([s_l, s_u]))
        return inds[0], inds[1], inds[1] - inds[0]

    def __gauss_prob(self, y_obs, y_database):
        # Unnormalized Gaussian weights
        # w_i = exp(-0.5 dy^T S_o^{-1} dy) for each database entry.
        dy = y_database - y_obs.reshape(1, -1)
        ws = dy * np.dot(dy, self.s_o_inv)
        ws = np.exp(-0.5 * ws.sum(axis=1, keepdims=True))
        return ws
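
    # Why the projection bound works, in brief: for a unit eigenvector v
    # of S_o with eigenvalue l, chi^2 = dy^T S_o^{-1} dy >= (v^T dy)^2 / l.
    # Choosing the eigenvector with the smallest eigenvalue (stored here
    # as pc1, with pc1_e = 1 / l) makes this lower bound, and thus the
    # search interval above, as tight as possible.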

    def weights(self, y_obs, x2_max=-1.0):
        r"""
        Compute the importance sampling weights for a given observation
        `y_obs`. If `x2_max` is non-negative, it is used to reduce the
        lookup scope in order to improve computational performance:
        only database entries that cannot be guaranteed to have a
        :math:`\chi^2` value greater than `x2_max` are considered.

        The weight :math:`w_i` for database entry :math:`i` and given
        observation :math:`\mathbf{y}` is computed as:

        .. math::
            w_i = \exp \left\{ -\frac{1}{2}
            (\mathbf{y} - \mathbf{y}_i)^T \mathbf{S}_o^{-1}
            (\mathbf{y} - \mathbf{y}_i) \right\}

        Args:
            y_obs(numpy.array): The observation for which to compute
                the weights.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.

        Returns:
            A tuple `(i_l, i_u, ws)` containing the lower and upper
            index of the range of database entries for which the
            weights were computed, together with a 1D array `ws`
            containing the importance sampling weights.
        """
        if x2_max < 0.0:
            # No cutoff: compute the weights for the full database.
            ws = self.__gauss_prob(y_obs, self.y)
            return 0, self.n, ws
        else:
            # Restrict the weight computation to the entries that may
            # lie within the chi^2 cutoff.
            i_l, i_u, n_hits = self.__find_hits(y_obs, x2_max)
            ws = self.__gauss_prob(y_obs, self.y[i_l:i_u, :])
            return i_l, i_u, ws
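
    # Usage sketch, continuing the hypothetical `bmci` object from the
    # constructor example above:
    #
    #     y_obs = np.random.randn(m)         # a single observation
    #     i_l, i_u, ws = bmci.weights(y_obs, x2_max=10.0)
    #     # ws holds the unnormalized weights of the candidate entries
    #     # bmci.x[i_l:i_u]; normalize by ws.sum() before use.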

    def predict(self, y_obs, x2_max=-1.0):
        r"""
        This performs the BMCI integration to approximate the mean and
        variance of the posterior distribution:

        .. math::
            \bar{x} = \int x \: p(x | \mathbf{y}) \: dx \approx
            \sum_i \frac{w_i(\mathbf{y}) x_i}{\sum_j w_j(\mathbf{y})}

        .. math::
            \text{var}(x) = \int (x - \bar{x})^2 p(x | \mathbf{y})
            \: dx \approx \sum_i
            \frac{w_i(\mathbf{y}) (x_i - \bar{x})^2}
                 {\sum_j w_j(\mathbf{y})}

        By default this method computes weights for all entries in the
        database, which is slow for large databases. This can be
        avoided by setting the :code:`x2_max` keyword to a non-negative
        value, which is used to exclude entries that can be guaranteed
        to have a :math:`\chi^2` value larger than :code:`x2_max`.

        Arguments:
            y_obs(numpy.ndarray): Array of shape `(n, m)` containing
                the `n` observations for which to perform the
                retrieval.
            x2_max(float): If non-negative, database elements that can
                be guaranteed to have a higher :math:`\chi^2` value are
                excluded.

        Returns:
            A tuple :code:`(xs, sigmas)` containing the retrieved means
            (:code:`xs`) and the corresponding standard deviations
            (:code:`sigmas`).
        """
        xs = np.zeros(y_obs.shape[0])
        sigmas = np.zeros(y_obs.shape[0])
        for i in range(y_obs.shape[0]):
            i_l, i_u, ws = self.weights(y_obs[i, :], x2_max)
            c = ws.sum()
            if c > 0.0:
                # Weighted posterior mean and standard deviation.
                xs[i] = np.sum(self.x[i_l:i_u].ravel() * ws.ravel() / c)
                sigmas[i] = np.sqrt(np.sum(
                    (self.x[i_l:i_u].ravel() - xs[i]) ** 2.0
                    * ws.ravel() / c))
            else:
                xs[i] = np.nan
                sigmas[i] = np.nan
        return xs, sigmas
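
    # Usage sketch with hypothetical test observations:
    #
    #     y_test = np.random.randn(100, m)
    #     x_mean, x_std = bmci.predict(y_test, x2_max=10.0)
    #     # Entries of x_mean and x_std are NaN wherever no database
    #     # entry passed the chi^2 cutoff.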

    def crps(self, y_obs, x_true, x2_max=-1.0):
        r"""
        Compute the Continuous Ranked Probability Score (CRPS).

        This function approximates the cumulative probability density
        of the posterior distribution for the observations in
        :code:`y_obs` and computes the continuous ranked probability
        score (CRPS):

        .. math::
            CRPS(\mathbf{y}, x) = \int_{-\infty}^\infty
            (F_{x | \mathbf{y}}(x') - \mathrm{1}_{x < x'})^2 \: dx'

        Arguments:
            y_obs(numpy.ndarray): Array of shape `(n, m)` containing
                the `n` observations for which to perform the
                retrieval.
            x_true(numpy.ndarray): 1D array containing the `n` x values
                to test the predictions against.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.

        Returns:
            1D array containing the CRPS value for each of the `n`
            observations.
        """
        n = y_obs.shape[0]
        scores = np.zeros(n)
        for i in range(n):
            i_l, i_u, ws = self.weights(y_obs[i, :], x2_max)

            # Reorder the hits in [i_l, i_u) by ascending x value.
            inds = np.where((i_l <= self.x_sorted_inds)
                            * (self.x_sorted_inds < i_u))
            inds = self.x_sorted_inds[inds] - i_l
            ws = ws[inds]
            xs = self.x[i_l:i_u][inds]

            # Indicator function 1_{x_true < x'}.
            indicator = np.zeros(i_u - i_l)
            indicator[xs > x_true[i]] = 1.0

            ws_cum = ws.cumsum()
            if ws_cum[-1] > 0.0:
                ws_cum /= ws_cum[-1]
                scores[i] = np.trapz((ws_cum - indicator) ** 2.0, xs)
            else:
                scores[i] = np.nan
        return scores
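
    # Usage sketch with hypothetical held-out pairs `y_test`, `x_test`:
    #
    #     scores = bmci.crps(y_test, x_test, x2_max=10.0)
    #     print(scores.mean())               # lower is better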

    def cdf(self, y_obs, x2_max=-1.0):
        r"""
        A posteriori cumulative distribution function (CDF).

        This function approximates the cumulative posterior
        distribution :math:`F(x | \mathbf{y})` for the given
        observation `y_obs` using

        .. math::
            F(x | \mathbf{y}) = \int_{-\infty}^{x} p(x' | \mathbf{y})
            \: dx' \approx \sum_{x_i < x}
            \frac{w_i(\mathbf{y})}{\sum_j w_j(\mathbf{y})}

        Args:
            y_obs(numpy.array): `m`-element array containing the
                observation for which to compute the posterior CDF.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.

        Returns:
            A tuple `(xs, ys)` containing the estimated values of the
            posterior CDF :math:`F(x | \mathbf{y})` evaluated at the
            :math:`x` values corresponding to the hits in the database.

        Raises:
            ValueError: If the number of channels in the observation is
                different from the database.
        """
        try:
            y_obs = y_obs.reshape(1, self.m)
        except ValueError:
            raise ValueError("The observation vector is inconsistent "
                             "with the database.")

        i_l, i_u, ws = self.weights(y_obs, x2_max)

        # Reorder the hits in [i_l, i_u) by ascending x value.
        inds = np.where((i_l <= self.x_sorted_inds)
                        * (self.x_sorted_inds < i_u))
        inds = self.x_sorted_inds[inds] - i_l
        ws = ws[inds]
        xs = self.x[i_l:i_u][inds]

        ws_cum = ws.cumsum()
        if ws_cum[-1] > 0.0:
            ws_cum /= ws_cum[-1]
        else:
            ws_cum = np.full(ws_cum.shape, np.nan)
        return xs, ws_cum
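
    # Usage sketch for a single hypothetical observation:
    #
    #     xs, Fs = bmci.cdf(np.random.randn(m), x2_max=10.0)
    #     # (xs, Fs) trace the approximate posterior CDF and can be
    #     # plotted directly, e.g. plt.plot(xs, Fs) with matplotlib.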

    def pdf(self, y_obs, x2_max=-1.0, n_points=21):
        r"""
        A posteriori probability density function (PDF).

        This function approximates the a posteriori PDF
        :math:`p(x | \mathbf{y})`. The PDF is a weighted histogram of
        the retrieval values :math:`x` in the database that are below
        the :math:`\chi^2` cutoff, weighted by the corresponding
        weights :math:`w_i(\mathbf{y})`.

        Args:
            y_obs(numpy.array): `m`-element array containing the
                observation for which to compute the posterior PDF.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.
            n_points(int): The number of points at which to estimate
                the PDF.

        Returns:
            A tuple `(xs, ys)` describing the estimated PDF.

        Raises:
            ValueError: If the number of channels in the observation is
                different from the database.
        """
        try:
            y_obs = y_obs.reshape(1, self.m)
        except ValueError:
            raise ValueError("The observation vector is inconsistent "
                             "with the database.")

        i_l, i_u, ws = self.weights(y_obs, x2_max)

        # Reorder the hits in [i_l, i_u) by ascending x value.
        inds = np.where((i_l <= self.x_sorted_inds)
                        * (self.x_sorted_inds < i_u))
        inds = self.x_sorted_inds[inds] - i_l
        ws = ws[inds]
        xs = self.x[i_l:i_u][inds]

        # Restrict the histogram to the central 98% of the cumulative
        # weight to avoid long, sparsely populated tails.
        ws_cum = np.cumsum(ws)
        ws_cum /= ws_cum[-1]
        i_s = np.where(ws_cum > 0.01)[0][0]
        i_e = np.where(ws_cum > 0.99)[0][0]

        x_t = np.linspace(xs[i_s], xs[i_e], n_points - 1)
        y = np.zeros(n_points)
        y[1:-1], _ = np.histogram(xs, weights=ws.ravel(),
                                  density=True, bins=x_t)
        x = np.zeros(n_points)
        x[0] = xs[max(i_s, 0)]
        x[-1] = xs[min(i_e, xs.size - 1)]
        x[1:-1] = 0.5 * (x_t[1:] + x_t[:-1])
        return x, y
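
    # Usage sketch for a single hypothetical observation:
    #
    #     xs, ps = bmci.pdf(np.random.randn(m), x2_max=10.0, n_points=41)
    #     # plt.plot(xs, ps) shows the weighted-histogram estimate of
    #     # the posterior density p(x | y).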

    def predict_quantiles(self, y_obs, quantiles, x2_max=-1.0):
        r"""
        This estimates the quantiles given in `quantiles` by
        approximating the CDF of the posterior as

        .. math::
            F(x | \mathbf{y}) = \int_{-\infty}^{x} p(x' | \mathbf{y})
            \: dx' \approx \sum_{x_i < x}
            \frac{w_i(\mathbf{y})}{\sum_j w_j(\mathbf{y})}

        and then interpolating :math:`F^{-1}` to obtain the desired
        quantiles.

        Args:
            y_obs(numpy.array): `n`-times-`m` matrix containing the `n`
                observations with `m` channels for which to compute the
                quantiles.
            quantiles(numpy.array): 1D array containing the `k`
                quantiles :math:`\tau \in [0, 1]` to compute.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.

        Returns:
            A 2D numpy.array of shape `(n, k)` containing the estimated
            quantiles, or `NaN` if no database entries were found in
            the :math:`\chi^2` search region.

        Raises:
            ValueError: If the number of channels in the observations
                is different from the database.
            ValueError: If any of the quantiles lies outside the
                interval [0, 1].
        """
        taus = np.asarray(quantiles).reshape((-1,))
        n = y_obs.shape[0]
        m = y_obs.shape[1]
        k = taus.size

        if not m == self.m:
            raise ValueError("Number of channels is inconsistent with "
                             "database.")
        if np.any((taus < 0.0) + (taus > 1.0)):
            raise ValueError("Quantiles must be in [0.0, 1.0].")

        qs = np.zeros((n, k))
        for i in range(n):
            i_l, i_u, ws = self.weights(y_obs[i, :], x2_max)

            # Reorder the hits in [i_l, i_u) by ascending x value.
            inds = np.where((i_l <= self.x_sorted_inds)
                            * (self.x_sorted_inds < i_u))
            inds = self.x_sorted_inds[inds] - i_l
            ws = ws[inds]
            xs = self.x[i_l:i_u][inds]

            ws_cum = ws.cumsum()
            if ws_cum[-1] > 0.0:
                # Invert the empirical CDF by linear interpolation.
                ws_cum /= ws_cum[-1]
                qs[i, :] = np.interp(taus, ws_cum, xs)
            else:
                qs[i, :] = np.nan
        return qs
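

# A minimal end-to-end sketch of the class on synthetic data. Everything
# below is illustrative only and not part of the BMCI method itself: the
# toy linear forward model, the database size and the chi^2 cutoff are
# made-up assumptions.
if __name__ == "__main__":
    m = 5                                   # number of channels
    n = 10000                               # database size
    s_o = 0.1 * np.eye(m)                   # observation uncertainty

    # Synthetic database: a trivial linear forward model plus Gaussian
    # noise drawn from s_o.
    x_db = np.random.rand(n)
    y_db = (x_db[:, np.newaxis] * np.ones((1, m))
            + np.random.multivariate_normal(np.zeros(m), s_o, size=n))
    bmci = BMCI(y_db, x_db, s_o)

    # Retrieve posterior means, standard deviations and quantiles for a
    # batch of new observations.
    x_true = np.random.rand(10)
    y_obs = (x_true[:, np.newaxis] * np.ones((1, m))
             + np.random.multivariate_normal(np.zeros(m), s_o, size=10))
    x_mean, x_std = bmci.predict(y_obs, x2_max=20.0)
    qs = bmci.predict_quantiles(y_obs, np.array([0.16, 0.5, 0.84]),
                                x2_max=20.0)
    print("truth:", x_true)
    print("mean :", x_mean, "+/-", x_std)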