diff --git a/.zenodo.json b/.zenodo.json index e329952de..697422fc3 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -140,6 +140,10 @@ "affiliation": "Santander Meteorology Group, Instituto de Física de Cantabria (CSIC-UC), Santander, Spain", "orcid": "0000-0001-9053-2542" }, + { + "name": "Lamarche, Adrien", + "orcid": "0009-0002-9078-8358" + }, { "name": "Wang, Hui-Min", "affiliation": "National University of Singapore, Singapore, Singapore", diff --git a/AUTHORS.rst b/AUTHORS.rst index c83977550..23b2d52bf 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -44,4 +44,5 @@ Contributors * Dante Castro `@profesorpaiche <https://github.com/profesorpaiche>`_ * Sascha Hofmann `@saschahofmann <https://github.com/saschahofmann>`_ * Javier Diez-Sierra `@JavierDiezSierra <https://github.com/JavierDiezSierra>`_ -* Hui-Min Wang `@Hem-W <https://github.com/Hem-W>` +* Hui-Min Wang `@Hem-W <https://github.com/Hem-W>`_ +* Adrien Lamarche `@LamAdr <https://github.com/LamAdr>`_ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6452d02c1..db050ad56 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,7 +4,7 @@ Changelog v0.52.0 (unreleased) -------------------- -Contributors to this version: David Huard (:user:`huard`), Trevor James Smith (:user:`Zeitsperre`), Hui-Min Wang (:user:`Hem-W`), Éric Dupuis (:user:`coxipi`), Sarah Gammon (:user:`SarahG-579462`), Pascal Bourgault (:user:`aulemahal`), Juliette Lavoie (:user:`juliettelavoie`). +Contributors to this version: David Huard (:user:`huard`), Trevor James Smith (:user:`Zeitsperre`), Hui-Min Wang (:user:`Hem-W`), Éric Dupuis (:user:`coxipi`), Sarah Gammon (:user:`SarahG-579462`), Pascal Bourgault (:user:`aulemahal`), Juliette Lavoie (:user:`juliettelavoie`), Adrien Lamarche (:user:`LamAdr`). Announcements ^^^^^^^^^^^^^ @@ -15,6 +15,7 @@ New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * ``xclim.sdba.nbutils.quantile`` and its child functions are now faster. If the module `fastnanquantile` is installed, it is used as the backend for the computation of quantiles and yields even faster results. This dependency is now listed in the `xclim[extras]` recipe. (:issue:`1255`, :pull:`1513`). * New multivariate bias adjustment class ``MBCn``, giving a faster and more accurate implementation of the ``MBCn`` algorithm. (:issue:`1551`, :pull:`1580`). +* New multivariate bias adjustment classes ``OTC`` and ``dOTC``. (:pull:`1787`). * `xclim` is now compatible with `pytest` versions `>=8.0.0`. (:pull:`1632`). Breaking changes diff --git a/docs/notebooks/sdba.ipynb b/docs/notebooks/sdba.ipynb index 6bcc5c4f8..345d5c446 100644 --- a/docs/notebooks/sdba.ipynb +++ b/docs/notebooks/sdba.ipynb @@ -344,6 +344,7 @@ "metadata": {}, "source": [ "### Third example : Multi-method protocol - Hnilica et al. 2017\n", + "\n", "In [their paper of 2017](https://doi.org/10.1002/joc.4890), Hnilica, Hanel and Puš present a bias-adjustment method based on the principles of Principal Components Analysis.\n", "\n", "The idea is simple: use principal components to define coordinates on the reference and on the simulation, and then transform the simulation data from the latter to the former. Spatial correlation can thus be conserved by taking different points as the dimensions of the transform space. The method was demonstrated in the article by bias-adjusting precipitation over different drainage basins.\n", @@ -439,9 +440,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Fourth example : Multivariate bias-adjustment (Cannon, 2018)\n", + "### Fourth example : Dynamical Optimal Transport Correction - Robin et al. 2019\n", + "Robin, Vrac, Naveau and Yiou presented the dOTC multivariate bias correction method in a [2019 paper](https://hess.copernicus.org/articles/23/773/2019/).\n", "\n", - "This section replicates the \"MBCn\" algorithm described by [Cannon (2018)](https://doi.org/10.1007/s00382-017-3580-6). The method relies on some univariate algorithm, an adaption of the N-pdf transform of [Pitié et al. (2005)](https://ieeexplore.ieee.org/document/1544887/) and a final reordering step.\n", + "Here, we use optimal transport to find mappings between reference, simulated historical and simulated future data. Following these mappings, the future simulation is corrected by applying the temporal evolution of the model data to the reference.\n", "\n", "In the following, we use the Adjusted and Homogenized Canadian Climate Dataset ([AHCCD](https://open.canada.ca/data/en/dataset/9c4ebc00-3ea4-4fe0-8bf2-66cfe1cddd1d)) and CanESM2 data as reference and simulation, respectively, and correct both `pr` and `tasmax` together." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "import numpy as np\n", + "\n", "from xclim.core.units import convert_units_to\n", "from xclim.testing import open_dataset\n", "\n", - "dref = open_dataset(\n", - " \"sdba/ahccd_1950-2013.nc\", chunks={\"location\": 1}, drop_variables=[\"lat\", \"lon\"]\n", - ").sel(time=slice(\"1981\", \"2010\"))\n", + "dref = open_dataset(\"sdba/ahccd_1950-2013.nc\", drop_variables=[\"lat\", \"lon\"]).sel(\n", + " time=slice(\"1981\", \"2010\")\n", + ")\n", "\n", "# Fix the standard name of the `pr` variable.\n", "# This allows the convert_units_to below to infer the correct CF transformation (precip rate to flux)\n", " tasmax=convert_units_to(dref.tasmax, \"K\"),\n", " pr=convert_units_to(dref.pr, \"kg m-2 s-1\"),\n", ")\n", - "dsim = open_dataset(\n", - " \"sdba/CanESM2_1950-2100.nc\", chunks={\"location\": 1}, drop_variables=[\"lat\", \"lon\"]\n", - ")\n", + "dsim = open_dataset(\"sdba/CanESM2_1950-2100.nc\", drop_variables=[\"lat\", \"lon\"])\n", "\n", "dhist = dsim.sel(time=slice(\"1981\", \"2010\"))\n", "dsim = dsim.sel(time=slice(\"2041\", \"2070\"))\n", + "dref" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "Here we are going to correct precipitation multiplicatively to make sure it doesn't become negative. In this context, small precipitation values can lead to huge aberrations. This problem can be mitigated with `adapt_freq_thresh`. We also need to stack our variables into a `DataArray` before feeding them to `dOTC`.\n", + "\n", + "Since precipitation is treated multiplicatively, we have no choice but to use \"std\" for the `cov_factor` argument (the default), which means the rescaling of model data to the observed data scale is done independently for every variable. When one only has additive variables, it is recommended to use the \"cholesky\" `cov_factor`, in which case the rescaling is done in a multivariate fashion."
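(Editor's aside, not part of the patch: to make the `kind` and `cov_factor` choices above concrete, here is a minimal NumPy sketch of the evolution-and-rescaling step that `dOTC` performs internally — compare `_dotc_adjust` further down in this diff. All values are made up.)

```python
import numpy as np

# Toy stand-ins: rows are points already paired by the optimal transport
# mappings, columns are variables (0: tasmax, additive; 1: pr, multiplicative).
Y0 = np.array([[290.0, 1e-4], [295.0, 2e-4]])   # reference, mapped onto hist
yX0 = np.array([[288.0, 2e-4], [294.0, 3e-4]])  # historical simulation
yX1 = np.array([[289.0, 4e-4], [296.0, 3e-4]])  # future simulation, mapped from hist

kind = {1: "*"}  # column 1 (pr) is multiplicative; unspecified columns are "+"

# Temporal evolution of the model, computed per variable
motion = np.empty_like(yX0)
for j in range(yX0.shape[1]):
    if kind.get(j) == "*":
        motion[:, j] = yX1[:, j] / yX0[:, j]
    else:
        motion[:, j] = yX1[:, j] - yX0[:, j]

# cov_factor="std": rescale each variable's evolution independently by the
# ratio of standard deviations between reference and historical simulation
motion = motion @ np.diag(Y0.std(axis=0) / yX0.std(axis=0))

# Apply the rescaled evolution to the reference
is_mult = np.array([kind.get(j) == "*" for j in range(Y0.shape[1])])
Y1 = np.where(is_mult, Y0 * motion, Y0 + motion)
```

With "cholesky", the rescaling instead mixes variables through the Cholesky factors of the covariance matrices (see `_dotc_adjust` below), which is why it cannot be combined with multiplicative kinds.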
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ref = dref.where(dref.location == \"Amos\", drop=True).squeeze()\n", + "hist = dhist.where(dhist.location == \"Amos\", drop=True).squeeze()\n", + "sim = dsim.where(dsim.location == \"Amos\", drop=True).squeeze()\n", + "\n", + "ref = sdba.processing.stack_variables(ref)\n", + "hist = sdba.processing.stack_variables(hist)\n", + "sim = sdba.processing.stack_variables(sim)\n", + "\n", + "# This function has random components\n", + "np.random.seed(0)\n", + "\n", + "# Contrary to most algorithms in sdba, dOTC has no `train` method\n", + "scen = sdba.adjustment.dOTC.adjust(\n", + " ref,\n", + " hist,\n", + " sim,\n", + " kind={\n", + " \"pr\": \"*\"\n", + " }, # Since this bias correction method is multivariate, `kind` must be specified per variable\n", + " adapt_freq_thresh={\"pr\": \"3.5e-4 kg m-2 s-1\"}, # Idem\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Some analysis figures\n", + "\n", + "# Unstack variables and select a location\n", + "ref = sdba.processing.unstack_variables(ref)\n", + "hist = sdba.processing.unstack_variables(hist)\n", + "sim = sdba.processing.unstack_variables(sim)\n", + "scen = sdba.processing.unstack_variables(scen)\n", + "\n", + "fig = plt.figure(figsize=(10, 10))\n", + "gs = plt.matplotlib.gridspec.GridSpec(2, 2, fig)\n", + "ax_pr = plt.subplot(gs[0, 0])\n", + "ax_tasmax = plt.subplot(gs[0, 1])\n", + "ax_scatter = plt.subplot(gs[1, :])\n", + "\n", + "# Precipitation\n", + "hist.pr.plot(ax=ax_pr, color=\"c\", label=\"Simulation (past)\")\n", + "ref.pr.plot(ax=ax_pr, color=\"b\", label=\"Reference\", alpha=0.5)\n", + "sim.pr.plot(ax=ax_pr, color=\"y\", label=\"Simulation (future)\")\n", + "scen.pr.plot(ax=ax_pr, color=\"r\", label=\"Corrected\", alpha=0.5)\n", + "ax_pr.set_title(\"Precipitation\")\n", + "\n", + "# Maximum temperature\n", + "hist.tasmax.plot(ax=ax_tasmax, color=\"c\")\n", + "ref.tasmax.plot(ax=ax_tasmax, color=\"b\", alpha=0.5)\n", + "sim.tasmax.plot(ax=ax_tasmax, color=\"y\")\n", + "scen.tasmax.plot(ax=ax_tasmax, color=\"r\", alpha=0.5)\n", + "ax_tasmax.set_title(\"Maximum temperature\")\n", + "\n", + "# Scatter\n", + "ref.plot.scatter(x=\"tasmax\", y=\"pr\", ax=ax_scatter, color=\"b\", edgecolors=\"k\", s=20)\n", + "scen.plot.scatter(x=\"tasmax\", y=\"pr\", ax=ax_scatter, color=\"r\", edgecolors=\"k\", s=20)\n", + "sim.plot.scatter(x=\"tasmax\", y=\"pr\", ax=ax_scatter, color=\"y\", edgecolors=\"k\", s=20)\n", + "hist.plot.scatter(x=\"tasmax\", y=\"pr\", ax=ax_scatter, color=\"c\", edgecolors=\"k\", s=20)\n", + "ax_scatter.set_title(\"Variables distribution\")\n", + "\n", + "# Example mapping\n", + "max_time = scen.pr.idxmax().data\n", + "max_idx = np.where(scen.time.data == max_time)[0][0]\n", + "\n", + "scen_x = scen.tasmax.isel(time=max_idx)\n", + "scen_y = scen.pr.isel(time=max_idx)\n", + "sim_x = sim.tasmax.isel(time=max_idx)\n", + "sim_y = sim.pr.isel(time=max_idx)\n", + "\n", + "ax_scatter.scatter(scen_x, scen_y, color=\"r\", edgecolors=\"k\", s=30, linewidth=1)\n", + "ax_scatter.scatter(sim_x, sim_y, color=\"y\", edgecolors=\"k\", s=30, linewidth=1)\n", + "\n", + "prop = dict(arrowstyle=\"-|>,head_width=0.3,head_length=0.8\", facecolor=\"black\", lw=1)\n", + "ax_scatter.annotate(\"\", xy=(scen_x, scen_y), xytext=(sim_x, sim_y), arrowprops=prop)\n", + "\n", + "ax_pr.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{}, + "outputs": [], + "source": [ + "from scipy.stats import gaussian_kde\n", + "\n", + "fig = plt.figure(figsize=(10, 5))\n", + "gs = plt.matplotlib.gridspec.GridSpec(1, 2, fig)\n", + "\n", + "tasmax = plt.subplot(gs[0, 0])\n", + "pr = plt.subplot(gs[0, 1])\n", + "\n", + "sim_t = sim.tasmax.to_numpy()\n", + "scen_t = scen.tasmax.to_numpy()\n", + "stack = np.vstack([sim_t, scen_t])\n", + "z = gaussian_kde(stack)(stack)\n", + "idx = z.argsort()\n", + "sim_t, scen_t, z = sim_t[idx], scen_t[idx], z[idx]\n", + "tasmax.scatter(sim_t, scen_t, c=z, s=1, cmap=\"viridis\")\n", + "tasmax.set_title(\"Tasmax\")\n", + "tasmax.set_ylabel(\"scen tasmax\")\n", + "tasmax.set_xlabel(\"sim tasmax\")\n", + "\n", + "sim_p = sim.pr.to_numpy()\n", + "scen_p = scen.pr.to_numpy()\n", + "stack = np.vstack([sim_p, scen_p])\n", + "z = gaussian_kde(stack)(stack)\n", + "idx = z.argsort()\n", + "sim_p, scen_p, z = sim_p[idx], scen_p[idx], z[idx]\n", + "pr.scatter(sim_p, scen_p, c=z, s=1, cmap=\"viridis\")\n", + "pr.set_title(\"Pr\")\n", + "pr.set_ylabel(\"scen pr\")\n", + "pr.set_xlabel(\"sim pr\")\n", + "\n", + "fig.suptitle(\"Correlations between input and output per variable\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "This last plot shows the correlation between input and output per variable. Here we see a relatively strong correlation for all variables, meaning they are all taken into account when finding the optimal transport mappings. This is because we're using the (by default) `transform = 'max_distance'` argument. Were the data not transformed, the distances along the precipitation dimension would be very small relative to the temperature distances. Precipitation values would then be spread around at very low cost and have virtually no effect on the result. See this in action with `transform = None`.\n", + "\n", + "The chunks we see in the tasmax data are artefacts of the `bin_width` discretization." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "### Fifth example : Multivariate bias-adjustment with multiple steps (Cannon, 2018)\n", + "\n", + "This section replicates the \"MBCn\" algorithm described by [Cannon (2018)](https://doi.org/10.1007/s00382-017-3580-6). The method relies on some univariate algorithm, an adaptation of the N-pdf transform of [Pitié et al. (2005)](https://ieeexplore.ieee.org/document/1544887/) and a final reordering step.\n", + "\n", + "As in the dOTC example, we use the Adjusted and Homogenized Canadian Climate Dataset ([AHCCD](https://open.canada.ca/data/en/dataset/9c4ebc00-3ea4-4fe0-8bf2-66cfe1cddd1d)) and CanESM2 data as reference and simulation, respectively, and correct both `pr` and `tasmax` together. This time, we chunk our data with Dask." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "dref[\"pr\"] = dref.pr.chunk({\"location\": 1})\n", + "dref[\"tasmax\"] = dref.tasmax.chunk({\"location\": 1})\n", + "\n", + "dhist[\"pr\"] = dhist.pr.chunk({\"location\": 1})\n", + "dhist[\"tasmax\"] = dhist.tasmax.chunk({\"location\": 1})\n", + "\n", + "dsim[\"pr\"] = dsim.pr.chunk({\"location\": 1})\n", + "dsim[\"tasmax\"] = dsim.tasmax.chunk({\"location\": 1})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "##### Perform an initial univariate adjustment."
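(Editor's aside, not part of the patch: before the notebook's univariate step below, a rough, self-contained sketch of the rotation idea behind the N-pdf transform of Pitié et al. mentioned above — toy data and a naive rank-based quantile mapping, not xclim's actual `MBCn` implementation.)

```python
import numpy as np

rng = np.random.default_rng(0)
ref = rng.normal(size=(1000, 2))            # toy reference
sim = rng.normal(2.0, 1.5, size=(1000, 2))  # toy (biased) simulation

def qm_1d(x, target):
    # Naive rank-based quantile mapping of x onto target (same length)
    ranks = np.argsort(np.argsort(x))
    return np.sort(target)[ranks]

# One N-pdf iteration: rotate randomly, correct each rotated axis with a
# univariate mapping, rotate back. MBCn repeats this until the multivariate
# distributions match, then applies a final reordering step.
A = np.linalg.qr(rng.normal(size=(2, 2)))[0]  # random orthogonal rotation
ref_r, sim_r = ref @ A, sim @ A
sim_r = np.column_stack([qm_1d(sim_r[:, j], ref_r[:, j]) for j in range(2)])
sim = sim_r @ A.T
```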
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# additive for tasmax\n", + "QDMtx = sdba.QuantileDeltaMapping.train(\n", + " dref.tasmax, dhist.tasmax, nquantiles=20, kind=\"+\", group=\"time\"\n", + ")\n", + "# Adjust both hist and sim, we'll feed both to the Npdf transform.\n", + "scenh_tx = QDMtx.adjust(dhist.tasmax)\n", + "scens_tx = QDMtx.adjust(dsim.tasmax)\n", + "\n", + "# remove == 0 values in pr:\n", + "dref[\"pr\"] = sdba.processing.jitter_under_thresh(dref.pr, \"0.01 mm d-1\")\n", + "dhist[\"pr\"] = sdba.processing.jitter_under_thresh(dhist.pr, \"0.01 mm d-1\")\n", + "dsim[\"pr\"] = sdba.processing.jitter_under_thresh(dsim.pr, \"0.01 mm d-1\")\n", + "\n", + "# multiplicative for pr\n", + "QDMpr = sdba.QuantileDeltaMapping.train(\n", + " dref.pr, dhist.pr, nquantiles=20, kind=\"*\", group=\"time\"\n", + ")\n", + "# Adjust both hist and sim, we'll feed both to the Npdf transform.\n", + "scenh_pr = QDMpr.adjust(dhist.pr)\n", + "scens_pr = QDMpr.adjust(dsim.pr)\n", "\n", "# Stack variables : Dataset -> DataArray with `multivar` dimension\n", "dref, dhist, dsim = (sdba.stack_variables(da) for da in (dref, dhist, dsim))" diff --git a/docs/references.bib b/docs/references.bib index b0d868e69..9f2419dd9 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -2152,3 +2152,54 @@ @article{droogers2002 url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-0036464359&doi=10.1023%2fA%3a1015508322413&partnerID=40&md5=7322aaa4c6874878f5b1dab3c73c1718}, type = {Article} } + +@Article{robin_2019, + author = {Robin, Y. and Vrac, M. and Naveau, P. and Yiou, P.}, + title = {Multivariate stochastic bias corrections with optimal transport}, + journal = {Hydrology and Earth System Sciences}, + volume = {23}, + year = {2019}, + number = {2}, + pages = {773--786}, + url = {https://hess.copernicus.org/articles/23/773/2019/}, + doi = {10.5194/hess-23-773-2019} +} + +@misc{robin_2021, + title = {{SBCK}: {Statistical} {Bias} {Correction} {Kit}}, + copyright = {GPL-3}, + shorttitle = {{SBCK}}, + url = {https://github.com/yrobink/SBCK-python}, + urldate = {2024-07-03}, + author = {Robin, Yoann}, + year = {2021}, +} + +@article{higham_1988, + title = {Computing a nearest symmetric positive semidefinite matrix}, + journal = {Linear Algebra and its Applications}, + volume = {103}, + pages = {103-118}, + year = {1988}, + issn = {0024-3795}, + doi = {https://doi.org/10.1016/0024-3795(88)90223-6}, + url = {https://www.sciencedirect.com/science/article/pii/0024379588902236}, + author = {Nicholas J. Higham}, + abstract = {The nearest symmetric positive semidefinite matrix in the Frobenius norm to an arbitrary real matrix A is shown to be (B + H)/2, where H is the symmetric polar factor of B=(A + AT)/2. In the 2-norm a nearest symmetric positive semidefinite matrix, and its distance δ2(A) from A, are given by a computationally challenging formula due to Halmos. We show how the bisection method can be applied to this formula to compute upper and lower bounds for δ2(A) differing by no more than a given amount. A key ingredient is a stable and efficient test for positive definiteness, based on an attempted Choleski decomposition. For accurate computation of δ2(A) we formulate the problem as one of zero finding and apply a hybrid Newton-bisection algorithm. 
Some numerical difficulties are discussed and illustrated by example.} +} + +@article{knol_1989, + title = "Least-squares approximation of an improper correlation matrix by a proper one", + abstract = "An algorithm is presented for the best least-squares fitting correlation matrix approximating a given missing value or improper correlation matrix. The proposed algorithm is based upon a solution for Mosier's oblique Procrustes rotation problem offered by ten Berge and Nevels. A necessary and sufficient condition is given for a solution to yield the unique global minimum of the least-squares function. Empirical verification of the condition indicates that the occurrence of non-optimal solutions with the proposed algorithm is very unlikely. A possible drawback of the optimal solution is that it is a singular matrix of necessity. In cases where singularity is undesirable, one may impose the additional nonsingularity constraint that the smallest eigenvalue of the solution be δ, where δ is an arbitrary small positive constant. Finally, it may be desirable to weight the squared errors of estimation differentially. A generalized solution is derived which satisfies the additional nonsingularity constraint and also allows for weighting. The generalized solution can readily be obtained from the standard “unweighted singular” solution by transforming the observed improper correlation matrix in a suitable way.", + keywords = "Missing value correlation, indefinite correlation matrix, IR-85889, tetrachoric correlation, constrained least-squares approximation", + author = "Knol, {Dirk L.} and {ten Berge}, {Jos M.F.}", + year = "1989", + doi = "10.1007/BF02294448", + language = "Undefined", + volume = "54", + pages = "53--61", + journal = "Psychometrika", + issn = "0033-3123", + publisher = "Springer", + number = "1", +} diff --git a/pyproject.toml b/pyproject.toml index 380714ed7..4e37b1b54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,7 +114,7 @@ docs = [ "sphinxcontrib-bibtex", "sphinxcontrib-svg2pdfconverter[Cairosvg]" ] -extras = ["fastnanquantile >=0.0.2"] +extras = ["fastnanquantile >=0.0.2", "POT"] all = ["xclim[dev]", "xclim[docs]", "xclim[extras]"] [project.scripts] @@ -166,7 +166,7 @@ values = [ [tool.codespell] skip = 'xclim/data/*.json,docs/_build,docs/notebooks/xclim_training/*.ipynb,docs/references.bib,__pycache__,*.gz,*.nc,*.png,*.svg,*.whl' -ignore-words-list = "absolue,astroid,bloc,bui,callendar,degreee,environnement,hanel,inferrable,lond,nam,nd,ressources,socio-economic,sie,vas" +ignore-words-list = "absolue,astroid,bloc,bui,callendar,degreee,environnement,hanel,inferrable,lond,nam,nd,ot,ressources,socio-economic,sie,vas" [tool.coverage.run] relative_files = true @@ -180,6 +180,7 @@ pep621_dev_dependency_groups = ["all", "dev", "docs"] [tool.deptry.package_module_name_map] "scikit-learn" = "sklearn" "pyyaml" = "yaml" +"POT" = "ot" [tool.deptry.per_rule_ignores] DEP001 = ["SBCK"] diff --git a/tests/test_sdba/test_adjustment.py b/tests/test_sdba/test_adjustment.py index 0fdb354d2..6bdf292da 100644 --- a/tests/test_sdba/test_adjustment.py +++ b/tests/test_sdba/test_adjustment.py @@ -12,6 +12,7 @@ from xclim.sdba import adjustment from xclim.sdba.adjustment import ( LOCI, + OTC, BaseAdjustment, DetrendedQuantileMapping, EmpiricalQuantileMapping, @@ -20,6 +21,7 @@ PrincipalComponents, QuantileDeltaMapping, Scaling, + dOTC, ) from xclim.sdba.base import Grouper from xclim.sdba.processing import ( @@ -798,6 +800,252 @@ def test_real_data(self, open_dataset): new_scen.load() +class 
TestOTC: + def test_compare_sbck(self, random, series): + pytest.importorskip("ot") + pytest.importorskip("SBCK", minversion="0.4.0") + ns = 1000 + u = random.random(ns) + + ref_xd = uniform(loc=1000, scale=100) + ref_yd = norm(loc=0, scale=100) + hist_xd = norm(loc=-500, scale=100) + hist_yd = uniform(loc=-1000, scale=100) + + ref_x = ref_xd.ppf(u) + ref_y = ref_yd.ppf(u) + hist_x = hist_xd.ppf(u) + hist_y = hist_yd.ppf(u) + + # Constructing a histogram such that every bin contains + # at most 1 point should ensure that ot is deterministic + dx_ref = np.diff(np.sort(ref_x)).min() + dx_hist = np.diff(np.sort(hist_x)).min() + dx = min(dx_ref, dx_hist) * 9 / 10 + + dy_ref = np.diff(np.sort(ref_y)).min() + dy_hist = np.diff(np.sort(hist_y)).min() + dy = min(dy_ref, dy_hist) * 9 / 10 + + bin_width = [dx, dy] + + ref_tas = series(ref_x, "tas") + ref_pr = series(ref_y, "pr") + ref = xr.merge([ref_tas, ref_pr]) + ref = stack_variables(ref) + + hist_tas = series(hist_x, "tas") + hist_pr = series(hist_y, "pr") + hist = xr.merge([hist_tas, hist_pr]) + hist = stack_variables(hist) + + scen = OTC.adjust(ref, hist, bin_width=bin_width, jitter_inside_bins=False) + + otc_sbck = adjustment.SBCK_OTC + scen_sbck = otc_sbck.adjust( + ref, hist, hist, multi_dim="multivar", bin_width=bin_width + ) + + scen = scen.to_numpy().T + scen_sbck = scen_sbck.to_numpy() + assert np.allclose(scen, scen_sbck) + + def test_shape(self, random, series): + pytest.importorskip("ot") + pytest.importorskip("SBCK", minversion="0.4.0") + ref_ns = 300 + hist_ns = 200 + ref_u = random.random(ref_ns) + hist_u = random.random(hist_ns) + + ref_xd = uniform(loc=1000, scale=100) + ref_yd = norm(loc=0, scale=100) + ref_zd = norm(loc=500, scale=100) + hist_xd = norm(loc=-500, scale=100) + hist_yd = uniform(loc=-1000, scale=100) + hist_zd = uniform(loc=-10, scale=100) + + ref_x = ref_xd.ppf(ref_u) + ref_y = ref_yd.ppf(ref_u) + ref_z = ref_zd.ppf(ref_u) + hist_x = hist_xd.ppf(hist_u) + hist_y = hist_yd.ppf(hist_u) + hist_z = hist_zd.ppf(hist_u) + + ref_na = 10 + hist_na = 15 + ref_idx = random.choice(range(ref_ns), size=ref_na, replace=False) + ref_x[ref_idx] = None + hist_idx = random.choice(range(hist_ns), size=hist_na, replace=False) + hist_x[hist_idx] = None + + ref_x = series(ref_x, "tas").rename("x") + ref_y = series(ref_y, "tas").rename("y") + ref_z = series(ref_z, "tas").rename("z") + ref = xr.merge([ref_x, ref_y, ref_z]) + ref = stack_variables(ref) + + hist_x = series(hist_x, "tas").rename("x") + hist_y = series(hist_y, "tas").rename("y") + hist_z = series(hist_z, "tas").rename("z") + hist = xr.merge([hist_x, hist_y, hist_z]) + hist = stack_variables(hist) + + scen = OTC.adjust(ref, hist) + + assert scen.shape == (3, hist_ns - hist_na) + hist = unstack_variables(hist) + assert not np.isin(hist.x[hist.x.isnull()].time.values, scen.time.values).any() + + +class TestdOTC: + @pytest.mark.parametrize("use_dask", [True, False]) + @pytest.mark.parametrize("cov_factor", ["std", "cholesky"]) + def test_compare_sbck(self, random, series, use_dask, cov_factor): + pytest.importorskip("ot") + pytest.importorskip("SBCK", minversion="0.4.0") + ns = 1000 + u = random.random(ns) + + ref_xd = uniform(loc=1000, scale=100) + ref_yd = norm(loc=0, scale=100) + hist_xd = norm(loc=-500, scale=100) + hist_yd = uniform(loc=-1000, scale=100) + sim_xd = norm(loc=0, scale=100) + sim_yd = uniform(loc=0, scale=100) + + ref_x = ref_xd.ppf(u) + ref_y = ref_yd.ppf(u) + hist_x = hist_xd.ppf(u) + hist_y = hist_yd.ppf(u) + sim_x = sim_xd.ppf(u) + sim_y = sim_yd.ppf(u) + + # Constructing a histogram such that every bin contains + # at most 1 point should ensure that ot is deterministic + dx_ref = np.diff(np.sort(ref_x)).min() + dx_hist = np.diff(np.sort(hist_x)).min() + dx_sim = np.diff(np.sort(sim_x)).min() + dx = min(dx_ref, dx_hist, dx_sim) * 9 / 10 + + dy_ref = np.diff(np.sort(ref_y)).min() + dy_hist = np.diff(np.sort(hist_y)).min() + dy_sim = np.diff(np.sort(sim_y)).min() + dy = min(dy_ref, dy_hist, dy_sim) * 9 / 10 + + bin_width = [dx, dy] + + ref_tas = series(ref_x, "tas") + ref_pr = series(ref_y, "pr") + hist_tas = series(hist_x, "tas") + hist_pr = series(hist_y, "pr") + sim_tas = series(sim_x, "tas") + sim_pr = series(sim_y, "pr") + + if use_dask: + ref_tas = ref_tas.chunk({"time": -1}) + ref_pr = ref_pr.chunk({"time": -1}) + hist_tas = hist_tas.chunk({"time": -1}) + hist_pr = hist_pr.chunk({"time": -1}) + sim_tas = sim_tas.chunk({"time": -1}) + sim_pr = sim_pr.chunk({"time": -1}) + + ref = xr.merge([ref_tas, ref_pr]) + hist = xr.merge([hist_tas, hist_pr]) + sim = xr.merge([sim_tas, sim_pr]) + + ref = stack_variables(ref) + hist = stack_variables(hist) + sim = stack_variables(sim) + + scen = dOTC.adjust( + ref, + hist, + sim, + bin_width=bin_width, + jitter_inside_bins=False, + cov_factor=cov_factor, + ) + + dotc_sbck = adjustment.SBCK_dOTC + scen_sbck = dotc_sbck.adjust( + ref, + hist, + sim, + multi_dim="multivar", + bin_width=bin_width, + cov_factor=cov_factor, + ) + + scen = scen.to_numpy().T + scen_sbck = scen_sbck.to_numpy() + assert np.allclose(scen, scen_sbck) + + def test_shape(self, random, series): + pytest.importorskip("ot") + pytest.importorskip("SBCK", minversion="0.4.0") + ref_ns = 300 + hist_ns = 200 + sim_ns = 400 + ref_u = random.random(ref_ns) + hist_u = random.random(hist_ns) + sim_u = random.random(sim_ns) + + ref_xd = uniform(loc=1000, scale=100) + ref_yd = norm(loc=0, scale=100) + ref_zd = norm(loc=500, scale=100) + hist_xd = norm(loc=-500, scale=100) + hist_yd = uniform(loc=-1000, scale=100) + hist_zd = uniform(loc=-10, scale=100) + sim_xd = norm(loc=0, scale=100) + sim_yd = uniform(loc=0, scale=100) + sim_zd = uniform(loc=10, scale=100) + + ref_x = ref_xd.ppf(ref_u) + ref_y = ref_yd.ppf(ref_u) + ref_z = ref_zd.ppf(ref_u) + hist_x = hist_xd.ppf(hist_u) + hist_y = hist_yd.ppf(hist_u) + hist_z = hist_zd.ppf(hist_u) + sim_x = sim_xd.ppf(sim_u) + sim_y = sim_yd.ppf(sim_u) + sim_z = sim_zd.ppf(sim_u) + + ref_na = 10 + hist_na = 15 + sim_na = 20 + ref_idx = random.choice(range(ref_ns), size=ref_na, replace=False) + ref_x[ref_idx] = None + hist_idx = random.choice(range(hist_ns), size=hist_na, replace=False) + hist_x[hist_idx] = None + sim_idx = random.choice(range(sim_ns), size=sim_na, replace=False) + sim_x[sim_idx] = None + + ref_x = series(ref_x, "tas").rename("x") + ref_y = series(ref_y, "tas").rename("y") + ref_z = series(ref_z, "tas").rename("z") + ref = xr.merge([ref_x, ref_y, ref_z]) + ref = stack_variables(ref) + + hist_x = series(hist_x, "tas").rename("x") + hist_y = series(hist_y, "tas").rename("y") + hist_z = series(hist_z, "tas").rename("z") + hist = xr.merge([hist_x, hist_y, hist_z]) + hist = stack_variables(hist) + + sim_x = series(sim_x, "tas").rename("x") + sim_y = series(sim_y, "tas").rename("y") + sim_z = series(sim_z, "tas").rename("z") + sim = xr.merge([sim_x, sim_y, sim_z]) + sim = stack_variables(sim) + + scen = dOTC.adjust(ref, hist, sim) + + assert scen.shape == (3, sim_ns - sim_na) + sim = unstack_variables(sim) + assert not np.isin(sim.x[sim.x.isnull()].time.values,
scen.time.values).any() + + def test_raise_on_multiple_chunks(tas_series): ref = tas_series(np.arange(730).astype(float)).chunk({"time": 365}) with pytest.raises(ValueError): diff --git a/xclim/sdba/_adjustment.py b/xclim/sdba/_adjustment.py index f6df322d6..303d4a3e3 100644 --- a/xclim/sdba/_adjustment.py +++ b/xclim/sdba/_adjustment.py @@ -944,3 +944,450 @@ def extremes_adjust( adjusted: xr.DataArray = (transition * scen) + ((1 - transition) * ds.scen) out = adjusted.rename("scen").squeeze("group", drop=True).to_dataset() return out + + +def _otc_adjust( + X: np.ndarray, + Y: np.ndarray, + bin_width: dict | float | np.ndarray | None = None, + bin_origin: dict | float | np.ndarray | None = None, + num_iter_max: int | None = 100_000_000, + jitter_inside_bins: bool = True, + transform: str | None = "max_distance", +): + """Optimal Transport Correction of the bias of X with respect to Y. + + Parameters + ---------- + X : np.ndarray + Historical data to be corrected. + Y : np.ndarray + Bias correction reference, target of optimal transport. + bin_width : dict or float or np.ndarray, optional + Bin widths for specified dimensions. + bin_origin : dict or float or np.ndarray, optional + Bin origins for specified dimensions. + num_iter_max : int, optional + Maximum number of iterations used in the earth mover distance algorithm. + jitter_inside_bins : bool + If `False`, output points are located at the center of their bin. + If `True`, a random location is picked uniformly inside their bin. Default is `True`. + transform : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated. + + Returns + ------- + np.ndarray + Adjusted data + + References + ---------- + :cite:cts:`sdba-robin_2021` + """ + # Initialize parameters + if bin_width is None: + bin_width = u.bin_width_estimator([Y, X]) + elif isinstance(bin_width, dict): + _bin_width = u.bin_width_estimator([Y, X]) + for k, v in bin_width.items(): + _bin_width[k] = v + bin_width = _bin_width + elif isinstance(bin_width, (float, int)): + bin_width = np.ones(X.shape[1]) * bin_width + + if bin_origin is None: + bin_origin = np.zeros(X.shape[1]) + elif isinstance(bin_origin, dict): + _bin_origin = np.zeros(X.shape[1]) + for k, v in bin_origin.items(): + _bin_origin[k] = v + bin_origin = _bin_origin + elif isinstance(bin_origin, (float, int)): + bin_origin = np.ones(X.shape[1]) * bin_origin + + num_iter_max = 100_000_000 if num_iter_max is None else num_iter_max + + # Get the bin positions and frequencies of X and Y, and for all Xs the bin to which they belong + gridX, muX, binX = u.histogram(X, bin_width, bin_origin) + gridY, muY, _ = u.histogram(Y, bin_width, bin_origin) + + # Compute the optimal transportation plan + plan = u.optimal_transport(gridX, gridY, muX, muY, num_iter_max, transform) + + gridX = np.floor((gridX - bin_origin) / bin_width) + gridY = np.floor((gridY - bin_origin) / bin_width) + + # Regroup the indices of all the points belonging to the same bin + binX_sort = np.lexsort(binX[:, ::-1].T) + sorted_bins = binX[binX_sort] + _, binX_start, binX_count = np.unique( + sorted_bins, return_index=True, return_counts=True, axis=0 + ) + binX_start_sort = np.sort(binX_start) + binX_groups = np.split(binX_sort, binX_start_sort[1:]) + + out = np.empty(X.shape) + rng = np.random.default_rng() + # The plan row corresponding to a source bin indicates its probabilities to be transported to every target bin + for i, binX_group in enumerate(binX_groups): + # Get the plan row of this bin + pi = np.where((binX[binX_group[0]] == gridX).all(1))[0][0] + # Pick as many target bins for this source bin as there are points in the source bin + choice = rng.choice(range(muY.size), p=plan[pi, :], size=binX_count[i]) + out[binX_group] = (gridY[choice] + 1 / 2) * bin_width + bin_origin + + if jitter_inside_bins: + out += np.random.uniform(low=-bin_width / 2, high=bin_width / 2, size=out.shape) + + return out + + +@map_groups(scen=[Grouper.DIM]) +def otc_adjust( + ds: xr.Dataset, + dim: list, + pts_dim: str, + bin_width: dict | float | None = None, + bin_origin: dict | float | None = None, + num_iter_max: int | None = 100_000_000, + jitter_inside_bins: bool = True, + adapt_freq_thresh: dict | None = None, + transform: str | None = "max_distance", +): + """Optimal Transport Correction of the bias of `hist` with respect to `ref`. + + Parameters + ---------- + ds : xr.Dataset + Dataset variables: + ref : training target + hist : training data + dim : list + The dimensions defining the distribution on which optimal transport is performed. + pts_dim : str + The dimension defining the multivariate components of the distribution. + bin_width : dict or float, optional + Bin widths for specified dimensions. + bin_origin : dict or float, optional + Bin origins for specified dimensions. + num_iter_max : int, optional + Maximum number of iterations used in the earth mover distance algorithm. + jitter_inside_bins : bool + If `False`, output points are located at the center of their bin. + If `True`, a random location is picked uniformly inside their bin. Default is `True`. + adapt_freq_thresh : dict, optional + Threshold for frequency adaptation per variable. + transform : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated.
+ + Returns + ------- + xr.Dataset + Adjusted data + """ + ref = ds.ref + hist = ds.hist + + if adapt_freq_thresh is not None: + for var, thresh in adapt_freq_thresh.items(): + hist.loc[var] = _adapt_freq_hist( + xr.Dataset( + {"ref": ref.sel({pts_dim: var}), "hist": hist.sel({pts_dim: var})} + ), + thresh, + ) + + ref_map = {d: f"ref_{d}" for d in dim} + ref = ref.rename(ref_map).stack(dim_ref=ref_map.values()).dropna(dim="dim_ref") + + hist = hist.stack(dim_hist=dim).dropna(dim="dim_hist") + + if isinstance(bin_width, dict): + bin_width = { + np.where(ref[pts_dim].values == var)[0][0]: op + for var, op in bin_width.items() + } + if isinstance(bin_origin, dict): + bin_origin = { + np.where(ref[pts_dim].values == var)[0][0]: op + for var, op in bin_origin.items() + } + + scen = xr.apply_ufunc( + _otc_adjust, + hist, + ref, + kwargs=dict( + bin_width=bin_width, + bin_origin=bin_origin, + num_iter_max=num_iter_max, + jitter_inside_bins=jitter_inside_bins, + transform=transform, + ), + input_core_dims=[["dim_hist", pts_dim], ["dim_ref", pts_dim]], + output_core_dims=[["dim_hist", pts_dim]], + keep_attrs=True, + vectorize=True, + ) + + # Pad dim differences with NA to please map_blocks + ref = ref.unstack().rename({v: k for k, v in ref_map.items()}) + scen = scen.unstack().rename("scen") + for d in dim: + full_d = xr.concat([ref[d], scen[d]], dim=d).drop_duplicates(d) + scen = scen.reindex({d: full_d}) + + return scen.to_dataset() + + +def _dotc_adjust( + X1: np.ndarray, + Y0: np.ndarray, + X0: np.ndarray, + bin_width: dict | float | None = None, + bin_origin: dict | float | None = None, + num_iter_max: int | None = 100_000_000, + cov_factor: str | None = "std", + jitter_inside_bins: bool = True, + kind: dict | None = None, + transform: str | None = "max_distance", +): + """Dynamical Optimal Transport Correction of the bias of X with respect to Y. + + Parameters + ---------- + X1 : np.ndarray + Simulation data to adjust. + Y0 : np.ndarray + Bias correction reference. + X0 : np.ndarray + Historical simulation data. + bin_width : dict or float, optional + Bin widths for specified dimensions. + bin_origin : dict or float, optional + Bin origins for specified dimensions. + num_iter_max : int, optional + Maximum number of iterations used in the earth mover distance algorithm. + cov_factor : str, optional + Rescaling factor. + jitter_inside_bins : bool + If `False`, output points are located at the center of their bin. + If `True`, a random location is picked uniformly inside their bin. Default is `True`. + kind : dict, optional + Keys are variable names and values are adjustment kinds, either additive or multiplicative. + Unspecified dimensions are treated as "+". + transform : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated. 
+ + Returns + ------- + np.ndarray + Adjusted data + + References + ---------- + :cite:cts:`sdba-robin_2021` + """ + # Initialize parameters + if isinstance(bin_width, dict): + _bin_width = u.bin_width_estimator([Y0, X0, X1]) + for k, v in bin_width.items(): + _bin_width[k] = v + bin_width = _bin_width + elif isinstance(bin_width, (float, int)): + bin_width = np.ones(X0.shape[1]) * bin_width + + if isinstance(bin_origin, dict): + _bin_origin = np.zeros(X0.shape[1]) + for k, v in bin_origin.items(): + _bin_origin[k] = v + bin_origin = _bin_origin + elif isinstance(bin_origin, (float, int)): + bin_origin = np.ones(X0.shape[1]) * bin_origin + + # Map ref to hist + yX0 = _otc_adjust( + Y0, + X0, + bin_width=bin_width, + bin_origin=bin_origin, + num_iter_max=num_iter_max, + jitter_inside_bins=False, + transform=transform, + ) + + # Map hist to sim + yX1 = _otc_adjust( + yX0, + X1, + bin_width=bin_width, + bin_origin=bin_origin, + num_iter_max=num_iter_max, + jitter_inside_bins=False, + transform=transform, + ) + + # Temporal evolution + motion = np.empty(yX0.shape) + for j in range(yX0.shape[1]): + if kind is not None and j in kind and kind[j] == "*": + motion[:, j] = yX1[:, j] / yX0[:, j] + else: + motion[:, j] = yX1[:, j] - yX0[:, j] + + # Apply a variance-dependent rescaling factor + if cov_factor == "cholesky": + fact0 = u.eps_cholesky(np.cov(Y0, rowvar=False)) + fact1 = u.eps_cholesky(np.cov(X0, rowvar=False)) + motion = (fact0 @ np.linalg.inv(fact1) @ motion.T).T + elif cov_factor == "std": + fact0 = np.std(Y0, axis=0) + fact1 = np.std(X0, axis=0) + motion = motion @ np.diag(fact0 / fact1) + + # Apply the evolution to ref + Y1 = np.empty(yX0.shape) + for j in range(yX0.shape[1]): + if kind is not None and j in kind and kind[j] == "*": + Y1[:, j] = Y0[:, j] * motion[:, j] + else: + Y1[:, j] = Y0[:, j] + motion[:, j] + + # Map sim to the evolution of ref + Z1 = _otc_adjust( + X1, + Y1, + bin_width=bin_width, + bin_origin=bin_origin, + num_iter_max=num_iter_max, + jitter_inside_bins=jitter_inside_bins, + transform=transform, + ) + + return Z1 + + +@map_groups(scen=[Grouper.DIM]) +def dotc_adjust( + ds: xr.Dataset, + dim: list, + pts_dim: str, + bin_width: dict | float | None = None, + bin_origin: dict | float | None = None, + num_iter_max: int | None = 100_000_000, + cov_factor: str | None = "std", + jitter_inside_bins: bool = True, + kind: dict | None = None, + adapt_freq_thresh: dict | None = None, + transform: str | None = "max_distance", +): + """Dynamical Optimal Transport Correction of the bias of `sim` with respect to `ref`. + + Parameters + ---------- + ds : xr.Dataset + Dataset variables: + ref : training target + hist : training data + sim : simulated data + dim : list + The dimensions defining the distribution on which optimal transport is performed. + pts_dim : str + The dimension defining the multivariate components of the distribution. + bin_width : dict or float, optional + Bin widths for specified dimensions. + bin_origin : dict or float, optional + Bin origins for specified dimensions. + num_iter_max : int, optional + Maximum number of iterations used in the earth mover distance algorithm. + cov_factor : str, optional + Rescaling factor. + jitter_inside_bins : bool + If `False`, output points are located at the center of their bin. + If `True`, a random location is picked uniformly inside their bin. Default is `True`. + kind : dict, optional + Keys are variable names and values are adjustment kinds, either additive or multiplicative.
+ Unspecified dimensions are treated as "+". + adapt_freq_thresh : dict, optional + Threshold for frequency adaptation per variable. + transform : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated. + + Returns + ------- + xr.Dataset + Adjusted data + """ + hist = ds.hist + sim = ds.sim + ref = ds.ref + + if adapt_freq_thresh is not None: + for var, thresh in adapt_freq_thresh.items(): + hist.loc[var] = _adapt_freq_hist( + xr.Dataset( + {"ref": ref.sel({pts_dim: var}), "hist": hist.sel({pts_dim: var})} + ), + thresh, + ) + + # Drop data added by map_blocks and prepare for apply_ufunc + hist_map = {d: f"hist_{d}" for d in dim} + hist = ( + hist.rename(hist_map).stack(dim_hist=hist_map.values()).dropna(dim="dim_hist") + ) + + ref_map = {d: f"ref_{d}" for d in dim} + ref = ref.rename(ref_map).stack(dim_ref=ref_map.values()).dropna(dim="dim_ref") + + sim = sim.stack(dim_sim=dim).dropna(dim="dim_sim") + + if kind is not None: + kind = { + np.where(ref[pts_dim].values == var)[0][0]: op for var, op in kind.items() + } + if isinstance(bin_width, dict): + bin_width = { + np.where(ref[pts_dim].values == var)[0][0]: op + for var, op in bin_width.items() + } + if isinstance(bin_origin, dict): + bin_origin = { + np.where(ref[pts_dim].values == var)[0][0]: op + for var, op in bin_origin.items() + } + + scen = xr.apply_ufunc( + _dotc_adjust, + sim, + ref, + hist, + kwargs=dict( + bin_width=bin_width, + bin_origin=bin_origin, + num_iter_max=num_iter_max, + cov_factor=cov_factor, + jitter_inside_bins=jitter_inside_bins, + kind=kind, + transform=transform, + ), + input_core_dims=[ + ["dim_sim", pts_dim], + ["dim_ref", pts_dim], + ["dim_hist", pts_dim], + ], + output_core_dims=[["dim_sim", pts_dim]], + keep_attrs=True, + vectorize=True, + ) + + # Pad dim differences with NA to please map_blocks + hist = hist.unstack().rename({v: k for k, v in hist_map.items()}) + ref = ref.unstack().rename({v: k for k, v in ref_map.items()}) + scen = scen.unstack().rename("scen") + for d in dim: + full_d = xr.concat([hist[d], ref[d], scen[d]], dim=d).drop_duplicates(d) + scen = scen.reindex({d: full_d}) + + return scen.to_dataset() diff --git a/xclim/sdba/adjustment.py b/xclim/sdba/adjustment.py index 90e95ceb4..e8c4190b9 100644 --- a/xclim/sdba/adjustment.py +++ b/xclim/sdba/adjustment.py @@ -5,6 +5,7 @@ """ from __future__ import annotations +from importlib.util import find_spec from inspect import signature from typing import Any from warnings import warn @@ -21,6 +22,7 @@ from xclim.indices import stats from ._adjustment import ( + dotc_adjust, dqm_adjust, dqm_train, eqm_train, @@ -31,6 +33,7 @@ mbcn_adjust, mbcn_train, npdf_transform, + otc_adjust, qdm_adjust, qm_adjust, scaling_adjust, @@ -49,6 +52,7 @@ __all__ = [ "LOCI", + "OTC", "BaseAdjustment", "DetrendedQuantileMapping", "EmpiricalQuantileMapping", @@ -58,6 +62,7 @@ "PrincipalComponents", "QuantileDeltaMapping", "Scaling", + "dOTC", ] @@ -324,7 +329,7 @@ def adjust( cls, ref: xr.DataArray, hist: xr.DataArray, - sim: xr.DataArray, + sim: xr.DataArray | None = None, **kwargs, ) -> xr.Dataset: r"""Return bias-adjusted data. Refer to the class documentation for the algorithm details. @@ -345,6 +350,10 @@ def adjust( xr.Dataset The bias-adjusted Dataset. 
""" + if sim is None: + sim = hist.copy() + sim.attrs["_is_hist"] = True + kwargs = parse_group(cls._adjust, kwargs) skip_checks = kwargs.pop("skip_input_checks", False) @@ -354,7 +363,7 @@ def adjust( (ref, hist, sim), _ = cls._harmonize_units(ref, hist, sim) - out: xr.Dataset | xr.DataArray = cls._adjust(ref, hist, sim, **kwargs) + out: xr.Dataset | xr.DataArray = cls._adjust(ref, hist, sim=sim, **kwargs) if isinstance(out, xr.DataArray): out = out.rename("scen").to_dataset() @@ -1267,6 +1276,334 @@ def _adjust( return out +class OTC(Adjust): + r"""Optimal Transport Correction. + + Following :cite:t:`sdba-robin_2019`, this multivariate bias correction method finds the optimal transport + mapping between simulated and observed data. The correction of every simulated data point is the observed + point it is mapped to. + + See notes for an explanation of the algorithm. + + Parameters + ---------- + bin_width : dict or float, optional + Bin widths for specified dimensions if is dict. + For all dimensions if float. + Will be estimated with Freedman-Diaconis rule by default. + bin_origin : dict or float, optional + Bin origins for specified dimensions if is dict. + For all dimensions if float. + Default is 0. + num_iter_max : int, optional + Maximum number of iterations used in the earth mover distance algorithm. + Default is 100_000_000. + jitter_inside_bins : bool + If `False`, output points are located at the center of their bin. + If `True`, a random location is picked uniformly inside their bin. Default is `True`. + adapt_freq_thresh : dict or str, optional + Threshold for frequency adaptation per variable. + See :py:class:`xclim.sdba.processing.adapt_freq` for details. + Frequency adaptation is not applied to missing variables if is dict. + Applied to all variables if is string. + transform : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated. + Default is "max_distance". + See notes for details. + group : Union[str, Grouper] + The grouping information. See :py:class:`xclim.sdba.base.Grouper` for details. + Default is "time", meaning a single adjustment group along dimension "time". + pts_dim : str + The name of the "multivariate" dimension. Defaults to "multivar", which is the + normal case when using :py:func:`xclim.sdba.base.stack_variables`. + + Notes + ----- + The simulated and observed data sets :math:`X` and :math:`Y` are discretized and standardized using histograms + whose bin length along dimension `v` is given by `bin_width[v]`. An optimal transport plan :math:`P^*` is found by solving + the linear program + + .. math:: + + \mathop{\arg\!\min}_{P} \langle P,C\rangle \\ + s.t. P\mathbf{1} = X \\ + P^T\mathbf{1} = Y \\ + P \geq 0 + + where :math:`C_{ij}` is the squared euclidean distance between the bin at position :math:`i` of :math:`X`'s histogram and + the bin at position :math:`j` of :math:`Y`'s. + + All data points belonging to input bin at position :math:`i` are then separately assigned to output bin at position :math:`j` + with probability :math:`P_{ij}`. A transformation of bin positions can be applied before computing the distances :math:`C_{ij}` + to make variables on different scales more evenly taken into consideration by the optimization step. Available transformations are + + - `transform = 'standardize'` : + .. math:: + + i_v' = \frac{i_v - mean(i_v)}{std(i_v)} \quad\quad\quad j_v' = \frac{j_v - mean(j_v)}{std(j_v)} + + - `transform = 'max_distance'` : + .. 
math:: + + i_v' = \frac{i_v}{max \{|i_v - j_v|\}} \quad\quad\quad j_v' = \frac{j_v}{max \{|i_v - j_v|\}} + + such that + .. math:: + + max \{|i_v' - j_v'|\} = max \{|i_w' - j_w'|\} = 1 + + - `transform = 'max_value'` : + .. math:: + + i_v' = \frac{i_v}{max\{i_v\}} \quad\quad\quad j_v' = \frac{j_v}{max\{j_v\}} + + for variables :math:`v, w`. Default is `'max_distance'`. + + Note that `POT <https://pythonot.github.io/>`__ must be installed to use this method. + + This implementation is strongly inspired by :cite:t:`sdba-robin_2021`. + The differences from that reference are: + + - `bin_width` and `bin_origin` are dictionaries or float + - Freedman-Diaconis rule is used to find the bin width when unspecified, and falls back to Scott's rule when 0 is obtained + - `jitter_inside_bins` argument + - `adapt_freq_thresh` argument + - `transform` argument + - `group` argument + - `pts_dim` argument + + References + ---------- + :cite:cts:`sdba-robin_2019,sdba-robin_2021` + """ + + @classmethod + def _adjust( + cls, + ref: xr.DataArray, + hist: xr.DataArray, + *, + bin_width: dict | float | None = None, + bin_origin: dict | float | None = None, + num_iter_max: int | None = 100_000_000, + jitter_inside_bins: bool = True, + adapt_freq_thresh: dict | str | None = None, + transform: str | None = "max_distance", + group: str | Grouper = "time", + pts_dim: str = "multivar", + **kwargs, + ) -> xr.DataArray: + if find_spec("ot") is None: + raise ImportError( + "POT is required for OTC and dOTC. Please install with `pip install POT`." + ) + + if transform not in [None, "standardize", "max_distance", "max_value"]: + raise ValueError( + "`transform` should be in [None, 'standardize', 'max_distance', 'max_value']." + ) + + sim = kwargs.pop("sim") + if "_is_hist" not in sim.attrs: + raise ValueError("OTC does not take a `sim` argument.") + + if isinstance(adapt_freq_thresh, str): + adapt_freq_thresh = {v: adapt_freq_thresh for v in hist[pts_dim].values} + if adapt_freq_thresh is not None: + _, units = cls._harmonize_units(sim) + for var, thresh in adapt_freq_thresh.items(): + adapt_freq_thresh[var] = str( + convert_units_to(thresh, units[var], context="hydro") + ) + + scen = otc_adjust( + xr.Dataset({"ref": ref, "hist": hist}), + bin_width=bin_width, + bin_origin=bin_origin, + num_iter_max=num_iter_max, + jitter_inside_bins=jitter_inside_bins, + adapt_freq_thresh=adapt_freq_thresh, + transform=transform, + group=group, + pts_dim=pts_dim, + ).scen + + if adapt_freq_thresh is not None: + for var in adapt_freq_thresh.keys(): + adapt_freq_thresh[var] = adapt_freq_thresh[var] + " " + units[var] + + for d in scen.dims: + if d != pts_dim: + scen = scen.dropna(dim=d) + + return scen + + +class dOTC(Adjust): + r"""dynamical Optimal Transport Correction. + + This method is the dynamical version of :py:class:`~xclim.sdba.adjustment.OTC`, as presented by :cite:t:`sdba-robin_2019`. + The temporal evolution of the model is found for every point by mapping the historical to the future dataset with + optimal transport. A mapping between historical and reference data is found in the same way, and the temporal evolution + of the model data is applied to the assigned reference points. + + See notes for an explanation of the algorithm. + + This implementation is strongly inspired by :cite:t:`sdba-robin_2021`. + + Parameters + ---------- + bin_width : dict or float, optional + Bin widths for specified dimensions if is dict. + For all dimensions if float. + Will be estimated with Freedman-Diaconis rule by default.
+ bin_origin : dict or float, optional + Bin origins for specified dimensions if is dict. + For all dimensions if float. + Default is 0. + num_iter_max : int, optional + Maximum number of iterations used in the network simplex algorithm. + cov_factor : {None, 'std', 'cholesky'} + A rescaling of the temporal evolution before it is applied to the reference. + Note that "cholesky" cannot be used if some variables are multiplicative. + See notes for details. + jitter_inside_bins : bool + If `False`, output points are located at the center of their bin. + If `True`, a random location is picked uniformly inside their bin. Default is `True`. + kind : dict or str, optional + Keys are variable names and values are adjustment kinds, either additive or multiplicative. + Unspecified dimensions are treated as "+". + Applied to all variables if is string. + adapt_freq_thresh : dict or str, optional + Threshold for frequency adaptation per variable. + See :py:class:`xclim.sdba.processing.adapt_freq` for details. + Frequency adaptation is not applied to missing variables if is dict. + Applied to all variables if is string. + transform : {None, 'standardize', 'max_distance', 'max_value'} + Per-variable transformation applied before the distances are calculated. + Default is "max_distance". + See :py:class:`~xclim.sdba.adjustment.OTC` for details. + group : Union[str, Grouper] + The grouping information. See :py:class:`xclim.sdba.base.Grouper` for details. + Default is "time", meaning a single adjustment group along dimension "time". + pts_dim : str + The name of the "multivariate" dimension. Defaults to "multivar", which is the + normal case when using :py:func:`xclim.sdba.base.stack_variables`. + + Notes + ----- + The simulated historical, simulated future and observed data sets :math:`X0`, :math:`X1` and :math:`Y0` are + discretized and standardized using histograms whose bin length along dimension `k` is given by `bin_width[k]`. + Mappings between :math:`Y0` and :math:`X0` on the one hand and between :math:`X0` and :math:`X1` on the other + are found by optimal transport (see :py:class:`~xclim.sdba.adjustment.OTC`). The latter mapping is used to + compute the temporal evolution of model data. This evolution is computed additively or multiplicatively for + each variable depending on its `kind`, and is applied to observed data with + + .. math:: + + Y1_i & := Y0_i + D \cdot v_i \;\; or \\ + Y1_i & := Y0_i * D \cdot v_i + + where + - :math:`v_i` is the temporal evolution of historical simulated point :math:`i \in X0` to :math:`j \in X1` + - :math:`Y0_i` is the observed data mapped to :math:`i` + - :math:`D` is a correction factor given by + - :math:`I` if `cov_factor is None` + - :math:`diag(\frac{\sigma_{Y0}}{\sigma_{X0}})` if `cov_factor = "std"` + - :math:`Chol(Y0) \cdot Chol(X0)^{-1}` where :math:`Chol` is the Cholesky decomposition if `cov_factor = "cholesky"` + - :math:`Y1_i` is the correction of the future simulated data mapped to :math:`i`. + + Note that `POT <https://pythonot.github.io/>`__ must be installed to use this method. + + This implementation is strongly inspired by :cite:t:`sdba-robin_2021`. + The differences from that reference are: + + - `bin_width` and `bin_origin` are dictionaries or float. + - Freedman-Diaconis rule is used to find the bin width when unspecified, and falls back to Scott's rule when 0 is obtained.
+ - `jitter_inside_bins` argument + - `adapt_freq_thresh` argument + - `transform` argument + - `group` argument + - `pts_dim` argument + - `kind` argument + + References + ---------- + :cite:cts:`sdba-robin_2019,sdba-robin_2021` + """ + + @classmethod + def _adjust( + cls, + ref: xr.DataArray, + hist: xr.DataArray, + sim: xr.DataArray, + *, + bin_width: dict | float | None = None, + bin_origin: dict | float | None = None, + num_iter_max: int | None = 100_000_000, + cov_factor: str | None = "std", + jitter_inside_bins: bool = True, + kind: dict | str | None = None, + adapt_freq_thresh: dict | str | None = None, + transform: str | None = "max_distance", + group: str | Grouper = "time", + pts_dim: str = "multivar", + ) -> xr.DataArray: + if find_spec("ot") is None: + raise ImportError( + "POT is required for OTC and dOTC. Please install with `pip install POT`." + ) + + if isinstance(kind, str): + kind = {v: kind for v in hist[pts_dim].values} + if kind is not None and "*" in kind.values() and cov_factor == "cholesky": + raise ValueError( + "Multiplicative correction is not supported with `cov_factor` = 'cholesky'." + ) + + if cov_factor not in [None, "std", "cholesky"]: + raise ValueError("`cov_factor` should be in [None, 'std', 'cholesky'].") + + if transform not in [None, "standardize", "max_distance", "max_value"]: + raise ValueError( + "`transform` should be in [None, 'standardize', 'max_distance', 'max_value']." + ) + + if isinstance(adapt_freq_thresh, str): + adapt_freq_thresh = {v: adapt_freq_thresh for v in hist[pts_dim].values} + if adapt_freq_thresh is not None: + _, units = cls._harmonize_units(sim) + for var, thresh in adapt_freq_thresh.items(): + adapt_freq_thresh[var] = str( + convert_units_to(thresh, units[var], context="hydro") + ) + + scen = dotc_adjust( + xr.Dataset({"ref": ref, "hist": hist, "sim": sim}), + bin_width=bin_width, + bin_origin=bin_origin, + num_iter_max=num_iter_max, + cov_factor=cov_factor, + jitter_inside_bins=jitter_inside_bins, + kind=kind, + adapt_freq_thresh=adapt_freq_thresh, + transform=transform, + group=group, + pts_dim=pts_dim, + ).scen + + if adapt_freq_thresh is not None: + for var in adapt_freq_thresh.keys(): + adapt_freq_thresh[var] = adapt_freq_thresh[var] + " " + units[var] + + for d in scen.dims: + if d != pts_dim: + scen = scen.dropna(dim=d, how="all") + + return scen + + class MBCn(TrainAdjust): r"""Multivariate bias correction function using the N-dimensional probability density function transform. diff --git a/xclim/sdba/utils.py b/xclim/sdba/utils.py index 153243ea4..1e805a6cb 100644 --- a/xclim/sdba/utils.py +++ b/xclim/sdba/utils.py @@ -15,6 +15,7 @@ from boltons.funcutils import wraps from dask import array as dsk from scipy.interpolate import griddata, interp1d +from scipy.spatial import distance from scipy.stats import spearmanr from xarray.core.utils import get_temp_dimname @@ -967,3 +968,120 @@ def _skipna_correlation(data): "allow_rechunk": True, }, ).rename("correlation") + + +def bin_width_estimator(X): + """Estimate the bin width of a histogram. + + References + ---------- + :cite:cts:`sdba-robin_2021` + """ + if isinstance(X, list): + return np.min([bin_width_estimator(x) for x in X], axis=0) + + if X.ndim == 1: + X = X.reshape(-1, 1) + + # Freedman-Diaconis + bin_width = ( + 2.0 + * (np.percentile(X, q=75, axis=0) - np.percentile(X, q=25, axis=0)) + / np.power(X.shape[0], 1.0 / 3.0) + ) + bin_width = np.where( + bin_width == 0, + # Scott + 3.49 * np.std(X, axis=0) / np.power(X.shape[0], 1.0 / 3.0), + bin_width, + ) + + return bin_width + + +def histogram(data, bin_width, bin_origin): + """Construct a histogram of the data. + + Returns the position of the center of bins, their corresponding frequency and the bin of every data point. + """ + # Find bin indices of data points + idx_bin = np.floor((data - bin_origin) / bin_width) + + # Associate unique values with frequencies + grid, mu = np.unique(idx_bin, return_counts=True, axis=0) + + # Normalise frequencies + mu = np.divide(mu, sum(mu)) + + grid = (grid + 0.5) * bin_width + bin_origin + + return grid, mu, idx_bin + + +def optimal_transport(gridX, gridY, muX, muY, numItermax, transform): + """Compute the optimal transport plan between (transformations of) X and Y. + + References + ---------- + :cite:cts:`sdba-robin_2021` + """ + from ot import emd + + if transform == "standardize": + gridX = (gridX - gridX.mean()) / gridX.std() + gridY = (gridY - gridY.mean()) / gridY.std() + + elif transform == "max_distance": + max1 = np.abs(gridX.max(axis=0) - gridY.min(axis=0)) + max2 = np.abs(gridY.max(axis=0) - gridX.min(axis=0)) + max_dist = np.maximum(max1, max2) + gridX = gridX / max_dist + gridY = gridY / max_dist + + elif transform == "max_value": + max_value = np.maximum(gridX.max(axis=0), gridY.max(axis=0)) + gridX = gridX / max_value + gridY = gridY / max_value + + # Compute the distances from every X bin to every Y bin + C = distance.cdist(gridX, gridY, "sqeuclidean") + + # Compute the optimal transportation plan + gamma = emd(muX, muY, C, numItermax=numItermax) + plan = (gamma.T / gamma.sum(axis=1)).T + + return plan + + +def eps_cholesky(M, nit=26): + """Cholesky decomposition, with an iterative diagonal perturbation fallback for matrices that are not positive-definite. + + References + ---------- + :cite:cts:`sdba-robin_2021,sdba-higham_1988,sdba-knol_1989` + """ + MC = None + try: + MC = np.linalg.cholesky(M) + except np.linalg.LinAlgError: + MC = None + + if MC is None: + # Introduce small perturbations until M is positive-definite + eps = min(1e-9, np.abs(np.diagonal(M)).min()) + if eps == 0: + eps = 1e-9 + it = 0 + while MC is None: + if it == nit: + raise ValueError( + "The vcov matrix is far from positive-definite. Please use `cov_factor = 'std'`" + ) + perturb = np.identity(M.shape[0]) * eps + try: + MC = np.linalg.cholesky(M + perturb) + except np.linalg.LinAlgError: + MC = None + eps = 2 * eps + it += 1 + return MC
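(Editor's aside, not part of the patch: a self-contained toy run of the helpers above — `histogram`-style binning, the 'max_distance' transform and POT's `emd` — showing how the transport plan used by `_otc_adjust` is obtained. Toy data; requires POT, the optional dependency added by this PR.)

```python
import numpy as np
from scipy.spatial import distance
from ot import emd

rng = np.random.default_rng(0)
X = rng.normal(0.0, 1.0, size=(500, 2))  # plays the role of hist
Y = rng.normal(1.0, 2.0, size=(500, 2))  # plays the role of ref

def hist2d(data, bin_width):
    # Same logic as `histogram` above, with bin_origin = 0
    idx = np.floor(data / bin_width)
    grid, mu = np.unique(idx, return_counts=True, axis=0)
    return (grid + 0.5) * bin_width, mu / mu.sum()

bin_width = np.array([0.5, 0.5])
gridX, muX = hist2d(X, bin_width)
gridY, muY = hist2d(Y, bin_width)

# 'max_distance' transform: divide each variable by the largest possible
# inter-bin distance so that all variables span comparable ranges and
# contribute comparably to the squared euclidean cost
max_dist = np.maximum(
    np.abs(gridX.max(axis=0) - gridY.min(axis=0)),
    np.abs(gridY.max(axis=0) - gridX.min(axis=0)),
)
C = distance.cdist(gridX / max_dist, gridY / max_dist, "sqeuclidean")

gamma = emd(muX, muY, C, numItermax=100_000)  # optimal coupling
plan = (gamma.T / gamma.sum(axis=1)).T        # row i: P(target bin | source bin i)
assert np.allclose(plan.sum(axis=1), 1.0)
```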