Source code for typhon.retrieval.bmci.bmci

"""
Bayesian Monte Carlo Integration.

Contains the `BMCI` class which implements the Bayesian Monte Carlo
Integration (BMCI) method as proposed by Evans et al. in [Evans]_.

.. [Evans] Evans, K. F. et al. Submillimeter-Wave Cloud Ice Radiometer: Simulations
   of retrieval algorithm performance. Journal of Geophysical Research 107, 2002
"""
import numpy as np

class BMCI:
    r"""
    Bayesian Monte Carlo Integration

    This class implements methods for solving Bayesian inverse problems
    using Monte Carlo integration. The method uses a database of
    atmospheric states :math:`x_i` and corresponding observations
    :math:`\mathbf{y}_i`, which are used to compute integrals of the
    posterior distribution by means of importance sampling:

    .. math::
        \int_a^b f(x') p(x' | \mathbf{y}) \: dx' \approx
        \sum_{a \leq x_i \leq b}
        \frac{w_i(\mathbf{y}) f(x_i)}{\sum_j w_j(\mathbf{y})}.

    The method assumes that measurement uncertainties can be described
    by a multivariate Gaussian distribution with covariance matrix
    :math:`\mathbf{S}_o`, so that the weights :math:`w_i(\mathbf{y})`
    are given by

    .. math::
        w_i(\mathbf{y}) = \exp \left\{ -\frac{1}{2}
        (\mathbf{y} - \mathbf{y}_i)^T \mathbf{S}_o^{-1}
        (\mathbf{y} - \mathbf{y}_i) \right\}.

    The measurements in the database :code:`y` are assumed to be given
    as an array of shape :code:`(n, m)`, where `n` is the number of
    cases in the database and `m` the number of channels. Currently
    only scalar retrievals are supported, which means that :code:`x` is
    assumed to be an `n`-element vector containing the retrieval
    quantities corresponding to the observations in :math:`\mathbf{y}`.

    Attributes:

        x: numpy.array, shape = (n,)
            The retrieval quantity corresponding to the atmospheric
            states represented in the database.
        y: numpy.array, shape = (n, m)
            The measured or simulated brightness temperatures
            corresponding to the atmospheric states represented in the
            database.
        s_o: numpy.array, shape = (m, m)
            The covariance matrix describing the measurement
            uncertainty.
        n: int
            The number of entries in the retrieval database.
        m: int
            The number of channels in a single measurement
            :math:`\mathbf{y}_i`.
        pc1: numpy.array, shape = (m,)
            The eigenvector corresponding to the smallest eigenvalue of
            the observation uncertainty covariance matrix. The entries
            in the database are ordered along this vector, which can be
            used to accelerate the retrieval.
        pc1_e: float
            The reciprocal of the smallest eigenvalue of the
            observation uncertainty covariance matrix.
        pc1_proj: numpy.array, shape = (n,)
            The projections of the measurements in `y` onto `pc1`,
            along which the database entries are ordered.
    """
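
    # The importance-sampling estimate above is straightforward to write
    # out directly in NumPy. A minimal sketch for the posterior-mean case
    # f(x) = x, using hypothetical names (`y_db`, `x_db`, `s_o_inv`,
    # `y_obs`) that are not part of this module:
    #
    #     dy = y_db - y_obs                  # (n, m) deviations
    #     ws = np.exp(-0.5 * np.sum((dy @ s_o_inv) * dy, axis=1))
    #     x_mean = np.sum(ws * x_db) / np.sum(ws)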

    def __init__(self, y, x, s_o):
        """
        Create a BMCI instance from a given retrieval database `y, x`
        and a measurement uncertainty given by the covariance matrix
        `s_o`.

        Args:
            y(numpy.array): 2D array containing the measured or
                simulated brightness temperatures corresponding to the
                atmospheric states represented in the database. These
                will be sorted along the eigenvector of the observation
                uncertainty covariance matrix with the smallest
                eigenvalue.
            x(numpy.array): 1D array containing the retrieval quantity
                corresponding to the atmospheric states represented in
                the database. These will be sorted along the
                eigenvector of the observation uncertainty covariance
                matrix with the smallest eigenvalue.
            s_o(numpy.array): 2D array containing the covariance matrix
                describing the measurement uncertainty.
        """
        self.n = y.shape[0]
        self.m = y.shape[1]

        if not s_o.ndim == 2:
            raise Exception("Covariance matrix must be a 2D array.")
        if not self.m == s_o.shape[0] or not self.m == s_o.shape[1]:
            raise Exception("Covariance matrix must be square and agree "
                            "with the dimensions of the measurement "
                            "vector.")

        # Covariance matrix and its inverse.
        self.s_o = s_o
        self.s_o_inv = np.linalg.inv(self.s_o)

        # Eigendecomposition of s_o: pc1 is the eigenvector with the
        # smallest eigenvalue, pc1_e the reciprocal of that eigenvalue.
        self.y_mean = np.mean(y, axis=0)
        w, v = np.linalg.eig(self.s_o)
        inds = np.argsort(w)
        self.pc1_e = 1.0 / w[inds[0]]
        self.pc1 = v[:, inds[0]]

        # Sort the database entries along their projection onto pc1 so
        # that candidate ranges can be found by binary search.
        self.pc1_proj = np.dot(y - self.y_mean, self.pc1)
        indices = np.argsort(self.pc1_proj)
        self.pc1_proj = self.pc1_proj[indices]
        self.x = x[indices]
        self.y = y[indices, :]

        self.x_sorted_inds = np.argsort(self.x)
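
    # Construction sketch with made-up synthetic data. The import path
    # assumes `BMCI` is exposed under typhon.retrieval.bmci as documented;
    # all names and sizes below are illustrative only:
    #
    #     import numpy as np
    #     from typhon.retrieval.bmci import BMCI
    #
    #     n, m = 10000, 5                    # database size, channels
    #     x_db = np.random.rand(n)           # scalar retrieval quantity
    #     y_db = np.random.randn(n, m)       # simulated observations
    #     s_o = 1.0 * np.eye(m)              # observation uncertainty
    #     bmci = BMCI(y_db, x_db, s_o)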

    def __find_hits(self, y_obs, x2_max=10.0):
        r"""
        Finds the range of indices of database entries that cannot be
        guaranteed to have a :math:`\chi^2` value greater than
        `x2_max`; all entries outside this range are guaranteed to
        exceed it.

        Args:
            y_obs: 1D array containing the measurement for which to
                compute the range of elements to include in the weight
                calculation.

        Returns:
            Tuple of integers `(i_l, i_u, n_hits)`:

            i_l: int
                The lower index of the range of elements to include in
                the weight calculation.
            i_u: int
                The upper index of the range of elements to include in
                the weight calculation.
            n_hits: int
                The number of hits in the database that are within the
                specified :math:`\chi^2` limits.
        """
        # Entries whose projection onto pc1 lies outside [s_l, s_u] are
        # guaranteed to exceed the cutoff, so a binary search over the
        # sorted projections is sufficient to find all candidates.
        y_proj = np.dot(self.pc1, (y_obs - self.y_mean).ravel())
        s_l = y_proj - np.sqrt(2.0 * x2_max / self.pc1_e)
        s_u = y_proj + np.sqrt(2.0 * x2_max / self.pc1_e)
        inds = np.searchsorted(self.pc1_proj, np.array([s_l, s_u]))
        return inds[0], inds[1], inds[1] - inds[0]

    def __gauss_prob(self, y_obs, y_database):
        # Unnormalized Gaussian weights
        # w_i = exp(-0.5 dy^T S_o^{-1} dy) for each database entry.
        dy = y_database - y_obs.reshape(1, -1)
        ws = dy * np.dot(dy, self.s_o_inv)
        ws = np.exp(-0.5 * ws.sum(axis=1, keepdims=True))
        return ws
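
    # Why the projection bound works, in brief: for a unit eigenvector v
    # of S_o with eigenvalue l, chi^2 = dy^T S_o^{-1} dy >= (v^T dy)^2 / l.
    # Choosing the eigenvector with the smallest eigenvalue (stored here
    # as pc1, with pc1_e = 1 / l) makes this lower bound, and thus the
    # search interval above, as tight as possible.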

    def weights(self, y_obs, x2_max=-1.0):
        r"""
        Compute the importance sampling weights for a given observation
        `y_obs`. If `x2_max` is non-negative, it is used to reduce the
        lookup scope in order to improve computational performance:
        only database entries that cannot be guaranteed to have a
        :math:`\chi^2` value greater than `x2_max` are considered.

        The weight :math:`w_i` for database entry :math:`i` and given
        observation :math:`\mathbf{y}` is computed as:

        .. math::
            w_i = \exp \left\{ -\frac{1}{2}
            (\mathbf{y} - \mathbf{y}_i)^T \mathbf{S}_o^{-1}
            (\mathbf{y} - \mathbf{y}_i) \right\}

        Args:
            y_obs(numpy.array): The observation for which to compute
                the weights.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.

        Returns:
            A tuple `(i_l, i_u, ws)` containing the lower and upper
            index of the range of database entries for which the
            weights were computed, together with a 1D array `ws`
            containing the importance sampling weights.
        """
        if x2_max < 0.0:
            # No cutoff: compute the weights for the full database.
            ws = self.__gauss_prob(y_obs, self.y)
            return 0, self.n, ws
        else:
            # Restrict the weight computation to the entries that may
            # lie within the chi^2 cutoff.
            i_l, i_u, n_hits = self.__find_hits(y_obs, x2_max)
            ws = self.__gauss_prob(y_obs, self.y[i_l:i_u, :])
            return i_l, i_u, ws
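
    # Usage sketch, continuing the hypothetical `bmci` object from the
    # constructor example above:
    #
    #     y_obs = np.random.randn(m)         # a single observation
    #     i_l, i_u, ws = bmci.weights(y_obs, x2_max=10.0)
    #     # ws holds the unnormalized weights of the candidate entries
    #     # bmci.x[i_l:i_u]; normalize by ws.sum() before use.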

    def predict(self, y_obs, x2_max=-1.0):
        r"""
        This performs the BMCI integration to approximate the mean and
        variance of the posterior distribution:

        .. math::
            \bar{x} = \int x \: p(x | \mathbf{y}) \: dx \approx
            \sum_i \frac{w_i(\mathbf{y}) x_i}{\sum_j w_j(\mathbf{y})}

        .. math::
            \text{var}(x) = \int (x - \bar{x})^2 p(x | \mathbf{y})
            \: dx \approx \sum_i
            \frac{w_i(\mathbf{y}) (x_i - \bar{x})^2}
                 {\sum_j w_j(\mathbf{y})}

        By default this method computes weights for all entries in the
        database, which is slow for large databases. This can be
        avoided by setting the :code:`x2_max` keyword to a non-negative
        value, which is used to exclude entries that can be guaranteed
        to have a :math:`\chi^2` value larger than :code:`x2_max`.

        Arguments:
            y_obs(numpy.ndarray): Array of shape `(n, m)` containing
                the `n` observations for which to perform the
                retrieval.
            x2_max(float): If non-negative, database elements that can
                be guaranteed to have a higher :math:`\chi^2` value are
                excluded.

        Returns:
            A tuple :code:`(xs, sigmas)` containing the retrieved means
            (:code:`xs`) and the corresponding standard deviations
            (:code:`sigmas`).
        """
        xs = np.zeros(y_obs.shape[0])
        sigmas = np.zeros(y_obs.shape[0])
        for i in range(y_obs.shape[0]):
            i_l, i_u, ws = self.weights(y_obs[i, :], x2_max)
            c = ws.sum()
            if c > 0.0:
                # Weighted posterior mean and standard deviation.
                xs[i] = np.sum(self.x[i_l:i_u].ravel() * ws.ravel() / c)
                sigmas[i] = np.sqrt(np.sum(
                    (self.x[i_l:i_u].ravel() - xs[i]) ** 2.0
                    * ws.ravel() / c))
            else:
                xs[i] = np.nan
                sigmas[i] = np.nan
        return xs, sigmas
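
    # Usage sketch with hypothetical test observations:
    #
    #     y_test = np.random.randn(100, m)
    #     x_mean, x_std = bmci.predict(y_test, x2_max=10.0)
    #     # Entries of x_mean and x_std are NaN wherever no database
    #     # entry passed the chi^2 cutoff.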

    def crps(self, y_obs, x_true, x2_max=-1.0):
        r"""
        Compute the Continuous Ranked Probability Score (CRPS).

        This function approximates the cumulative probability density
        of the posterior distribution for the observations in
        :code:`y_obs` and computes the continuous ranked probability
        score (CRPS):

        .. math::
            CRPS(\mathbf{y}, x) = \int_{-\infty}^\infty
            (F_{x | \mathbf{y}}(x') - \mathrm{1}_{x < x'})^2 \: dx'

        Arguments:
            y_obs(numpy.ndarray): Array of shape `(n, m)` containing
                the `n` observations for which to perform the
                retrieval.
            x_true(numpy.ndarray): 1D array containing the `n` x values
                to test the predictions against.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.

        Returns:
            1D array containing the CRPS value for each of the `n`
            observations.
        """
        n = y_obs.shape[0]
        scores = np.zeros(n)
        for i in range(n):
            i_l, i_u, ws = self.weights(y_obs[i, :], x2_max)

            # Reorder the hits in [i_l, i_u) by ascending x value.
            inds = np.where((i_l <= self.x_sorted_inds)
                            * (self.x_sorted_inds < i_u))
            inds = self.x_sorted_inds[inds] - i_l
            ws = ws[inds]
            xs = self.x[i_l:i_u][inds]

            # Indicator function 1_{x_true < x'}.
            indicator = np.zeros(i_u - i_l)
            indicator[xs > x_true[i]] = 1.0

            ws_cum = ws.cumsum()
            if ws_cum[-1] > 0.0:
                ws_cum /= ws_cum[-1]
                scores[i] = np.trapz((ws_cum - indicator) ** 2.0, xs)
            else:
                scores[i] = np.nan
        return scores
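
    # Usage sketch with hypothetical held-out pairs `y_test`, `x_test`:
    #
    #     scores = bmci.crps(y_test, x_test, x2_max=10.0)
    #     print(scores.mean())               # lower is better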

    def cdf(self, y_obs, x2_max=-1.0):
        r"""
        A posteriori cumulative distribution function (CDF).

        This function approximates the cumulative posterior
        distribution :math:`F(x | \mathbf{y})` for the given
        observation `y_obs` using

        .. math::
            F(x | \mathbf{y}) = \int_{-\infty}^{x} p(x' | \mathbf{y})
            \: dx' \approx \sum_{x_i < x}
            \frac{w_i(\mathbf{y})}{\sum_j w_j(\mathbf{y})}

        Args:
            y_obs(numpy.array): `m`-element array containing the
                observation for which to compute the posterior CDF.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.

        Returns:
            A tuple `(xs, ys)` containing the estimated values of the
            posterior CDF :math:`F(x | \mathbf{y})` evaluated at the
            :math:`x` values corresponding to the hits in the database.

        Raises:
            ValueError: If the number of channels in the observation is
                different from the database.
        """
        try:
            y_obs = y_obs.reshape(1, self.m)
        except ValueError:
            raise ValueError("The observation vector is inconsistent "
                             "with the database.")

        i_l, i_u, ws = self.weights(y_obs, x2_max)

        # Reorder the hits in [i_l, i_u) by ascending x value.
        inds = np.where((i_l <= self.x_sorted_inds)
                        * (self.x_sorted_inds < i_u))
        inds = self.x_sorted_inds[inds] - i_l
        ws = ws[inds]
        xs = self.x[i_l:i_u][inds]

        ws_cum = ws.cumsum()
        if ws_cum[-1] > 0.0:
            ws_cum /= ws_cum[-1]
        else:
            ws_cum = np.full(ws_cum.shape, np.nan)
        return xs, ws_cum
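
    # Usage sketch for a single hypothetical observation:
    #
    #     xs, Fs = bmci.cdf(np.random.randn(m), x2_max=10.0)
    #     # (xs, Fs) trace the approximate posterior CDF and can be
    #     # plotted directly, e.g. plt.plot(xs, Fs) with matplotlib.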

    def pdf(self, y_obs, x2_max=-1.0, n_points=21):
        r"""
        A posteriori probability density function (PDF).

        This function approximates the a posteriori PDF
        :math:`p(x | \mathbf{y})`. The PDF is a weighted histogram of
        the retrieval values :math:`x` in the database that are below
        the :math:`\chi^2` cutoff, weighted by the corresponding
        weights :math:`w_i(\mathbf{y})`.

        Args:
            y_obs(numpy.array): `m`-element array containing the
                observation for which to compute the posterior PDF.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.
            n_points(int): The number of points at which to estimate
                the PDF.

        Returns:
            A tuple `(xs, ys)` describing the estimated PDF.

        Raises:
            ValueError: If the number of channels in the observation is
                different from the database.
        """
        try:
            y_obs = y_obs.reshape(1, self.m)
        except ValueError:
            raise ValueError("The observation vector is inconsistent "
                             "with the database.")

        i_l, i_u, ws = self.weights(y_obs, x2_max)

        # Reorder the hits in [i_l, i_u) by ascending x value.
        inds = np.where((i_l <= self.x_sorted_inds)
                        * (self.x_sorted_inds < i_u))
        inds = self.x_sorted_inds[inds] - i_l
        ws = ws[inds]
        xs = self.x[i_l:i_u][inds]

        # Restrict the histogram to the central 98% of the cumulative
        # weight to avoid long, sparsely populated tails.
        ws_cum = np.cumsum(ws)
        ws_cum /= ws_cum[-1]
        i_s = np.where(ws_cum > 0.01)[0][0]
        i_e = np.where(ws_cum > 0.99)[0][0]

        x_t = np.linspace(xs[i_s], xs[i_e], n_points - 1)
        y = np.zeros(n_points)
        y[1:-1], _ = np.histogram(xs, weights=ws.ravel(),
                                  density=True, bins=x_t)
        x = np.zeros(n_points)
        x[0] = xs[max(i_s, 0)]
        x[-1] = xs[min(i_e, xs.size - 1)]
        x[1:-1] = 0.5 * (x_t[1:] + x_t[:-1])
        return x, y
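
    # Usage sketch for a single hypothetical observation:
    #
    #     xs, ps = bmci.pdf(np.random.randn(m), x2_max=10.0, n_points=41)
    #     # plt.plot(xs, ps) shows the weighted-histogram estimate of
    #     # the posterior density p(x | y).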

    def predict_quantiles(self, y_obs, quantiles, x2_max=-1.0):
        r"""
        This estimates the quantiles given in `quantiles` by
        approximating the CDF of the posterior as

        .. math::
            F(x | \mathbf{y}) = \int_{-\infty}^{x} p(x' | \mathbf{y})
            \: dx' \approx \sum_{x_i < x}
            \frac{w_i(\mathbf{y})}{\sum_j w_j(\mathbf{y})}

        and then interpolating :math:`F^{-1}` to obtain the desired
        quantiles.

        Args:
            y_obs(numpy.array): `n`-times-`m` matrix containing the `n`
                observations with `m` channels for which to compute the
                quantiles.
            quantiles(numpy.array): 1D array containing the `k`
                quantiles :math:`\tau \in [0, 1]` to compute.
            x2_max(float): The :math:`\chi^2` cutoff to apply to
                elements in the database. Ignored if less than zero.

        Returns:
            A 2D numpy.array of shape `(n, k)` containing the estimated
            quantiles, or `NaN` if no database entries were found in
            the :math:`\chi^2` search region.

        Raises:
            ValueError: If the number of channels in the observations
                is different from the database.
            ValueError: If any of the quantiles lies outside the
                interval [0, 1].
        """
        taus = np.asarray(quantiles).reshape((-1,))
        n = y_obs.shape[0]
        m = y_obs.shape[1]
        k = taus.size

        if not m == self.m:
            raise ValueError("Number of channels is inconsistent with "
                             "database.")
        if np.any((taus < 0.0) + (taus > 1.0)):
            raise ValueError("Quantiles must be in [0.0, 1.0].")

        qs = np.zeros((n, k))
        for i in range(n):
            i_l, i_u, ws = self.weights(y_obs[i, :], x2_max)

            # Reorder the hits in [i_l, i_u) by ascending x value.
            inds = np.where((i_l <= self.x_sorted_inds)
                            * (self.x_sorted_inds < i_u))
            inds = self.x_sorted_inds[inds] - i_l
            ws = ws[inds]
            xs = self.x[i_l:i_u][inds]

            ws_cum = ws.cumsum()
            if ws_cum[-1] > 0.0:
                # Invert the empirical CDF by linear interpolation.
                ws_cum /= ws_cum[-1]
                qs[i, :] = np.interp(taus, ws_cum, xs)
            else:
                qs[i, :] = np.nan
        return qs
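

# A minimal end-to-end sketch of the class on synthetic data. Everything
# below is illustrative only and not part of the BMCI method itself: the
# toy linear forward model, the database size and the chi^2 cutoff are
# made-up assumptions.
if __name__ == "__main__":
    m = 5                                   # number of channels
    n = 10000                               # database size
    s_o = 0.1 * np.eye(m)                   # observation uncertainty

    # Synthetic database: a trivial linear forward model plus Gaussian
    # noise drawn from s_o.
    x_db = np.random.rand(n)
    y_db = (x_db[:, np.newaxis] * np.ones((1, m))
            + np.random.multivariate_normal(np.zeros(m), s_o, size=n))
    bmci = BMCI(y_db, x_db, s_o)

    # Retrieve posterior means, standard deviations and quantiles for a
    # batch of new observations.
    x_true = np.random.rand(10)
    y_obs = (x_true[:, np.newaxis] * np.ones((1, m))
             + np.random.multivariate_normal(np.zeros(m), s_o, size=10))
    x_mean, x_std = bmci.predict(y_obs, x2_max=20.0)
    qs = bmci.predict_quantiles(y_obs, np.array([0.16, 0.5, 0.84]),
                                x2_max=20.0)
    print("truth:", x_true)
    print("mean :", x_mean, "+/-", x_std)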