Extract a selected area from the DestinE Climate Twin (HEALPix) to Zarr#

The goal of this notebook is to read 2D and 3D variables from the GFTS S3 bucket, subset them to a small geographical area, and store the results as Zarr.
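The GRIB files live on S3 and are read through fsspec's simplecache protocol, which downloads each file once and serves later reads from local disk. A minimal sketch of that read pattern (the file name follows the naming scheme used by the processing loops below, so adjust it to a file that actually exists on the bucket):

import fsspec
import xarray as xr

uri = "simplecache::s3://gfts-reference-data/ClimateDT/raw/avg_sos_ifs-nemo_20200601-20200615.grib"
local_file = fsspec.open_local(uri, s3={"anon": False})  # cached local copy
ds = xr.open_dataset(local_file, engine="cfgrib")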

%pip install xdggs  # xarray extension for discrete global grids such as HEALPix
import xarray as xr
import healpy as hp

import fsspec
import datetime
import os
import s3fs
import hvplot.pandas  # noqa
import hvplot.xarray  # noqa
import cartopy.crs as ccrs

2D variables to process#

variables2D = ["avg_sos", "avg_hc300m", "avg_hc700m", "avg_zos"]
years = [2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030]
months = [6]
models = ["ifs-nemo"]
# Maximum number of vertical levels per model
maxlevels = {"ifs-nemo": 75, "icon": 72}

Define the geographical area#

bbox = {"latitude": [46, 51], "longitude": [-8, -1]}
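Because the data sits on a HEALPix grid, the bounding box becomes a mask over cell coordinates rather than a slice. As a sanity check, one can count how many HEALPix cells fall inside the box at a given resolution (nside=512 here is a hypothetical value; the loops below derive the real one from each file with hp.npix2nside):

import numpy as np

nside = 512  # hypothetical resolution for illustration
ids = np.arange(12 * nside**2)
lon, lat = hp.pix2ang(nside, ids, nest=True, lonlat=True)
lon = ((lon + 180) % 360) - 180  # wrap longitudes to [-180, 180] as done below
inside = (
    (lat >= bbox["latitude"][0])
    & (lat <= bbox["latitude"][1])
    & (lon >= bbox["longitude"][0])
    & (lon <= bbox["longitude"][1])
)
print(inside.sum(), "cells inside the bbox")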

Processing#

%%time
for model in models:
    for var in variables2D:
        for year in years:
            for month in months:
                for p in range(2):
                    start_date = datetime.datetime(
                        year, month, 1
                    ) + p * datetime.timedelta(days=15)
                    end_date = start_date + datetime.timedelta(days=14)
                    uri = (
                        "simplecache::s3://gfts-reference-data/ClimateDT/raw/"
                        f"{var}_{model}_{start_date:%Y%m%d}-{end_date:%Y%m%d}.grib"
                    )
                    zarr_path = "small/" + os.path.basename(uri).replace(
                        ".grib", ".zarr"
                    )
                    print(uri)
                    try:
                        if not os.path.isdir(zarr_path):
                            file = fsspec.open_local(
                                uri,
                                s3={"anon": False},
                                filecache={
                                    "cache_storage": "/home/jovyan/cache_storage"
                                },
                            )
                            dset = xr.open_dataset(file, engine="cfgrib")
                            npix = dset.sizes["values"]
                            nest = True
                            nside = hp.npix2nside(npix)
                            # HEALPix cell ids run from 0 to 12 * nside**2 - 1
                            cell_ids = range(0, 12 * nside**2)
                            dset = dset.assign_coords(
                                {"cell_ids": ("values", cell_ids)}
                            ).swap_dims({"values": "cell_ids"})
                            dset.cell_ids.attrs = {
                                "grid_name": "healpix",
                                "nside": nside,
                                "nest": nest,
                            }
                            dset["longitude"] = ((dset.longitude + 180) % 360) - 180

                            dset.sortby("cell_ids").where(
                                (dset.latitude >= bbox["latitude"][0])
                                & (dset.latitude <= bbox["latitude"][1])
                                & (dset.longitude >= bbox["longitude"][0])
                                & (dset.longitude <= bbox["longitude"][1]),
                                drop=True,
                            ).to_zarr(zarr_path)
                        else:
                            print("zarr file exists for ", uri)
                    except Exception as e:
                        print("not processing file", uri, ":", e)

Save the geographical area cell_ids#

var  # the last variable processed in the 2D loop above
if not os.path.isdir("cell_ids.zarr") and "dset" in locals():
    dset.sortby("cell_ids").isel(time=0).reset_coords("time", drop=True).where(
        (dset.latitude >= bbox["latitude"][0])
        & (dset.latitude <= bbox["latitude"][1])
        & (dset.longitude >= bbox["longitude"][0])
        & (dset.longitude <= bbox["longitude"][1]),
        drop=True,
    ).drop_vars(var).to_zarr("cell_ids.zarr")
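Reopening the store confirms it contains only the grid description (the cell_ids index with latitude and longitude), since the data variable was dropped:

xr.open_dataset("cell_ids.zarr", engine="zarr")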

3D variables to process#

variables3D = ["avg_thetao", "avg_so", "avg_von", "avg_uoe", "avg_wo"]
for model in models:
    for var in variables3D:
        for year in years:
            for month in months:
                for p in range(2):
                    start_date = datetime.datetime(
                        year, month, 1
                    ) + p * datetime.timedelta(days=15)
                    end_date = start_date + datetime.timedelta(days=14)
                    uri = (
                        "simplecache::s3://gfts-reference-data/ClimateDT/raw/"
                        f"{var}_{model}_{start_date:%Y%m%d}-{end_date:%Y%m%d}.grib"
                    )
                    zarr_path = "small/" + os.path.basename(uri).replace(
                        ".grib", ".zarr"
                    )
                    print(uri)
                    try:
                        if not os.path.isdir(zarr_path):
                            file = fsspec.open_local(
                                uri,
                                s3={"anon": False},
                                filecache={
                                    "cache_storage": "/home/jovyan/cache_storage"
                                },
                            )
                            dset = xr.open_dataset(
                                file, engine="cfgrib", chunks={"time": 1}
                            )
                            npix = dset.sizes["values"]
                            nest = True
                            nside = hp.npix2nside(npix)
                            # HEALPix cell ids run from 0 to 12 * nside**2 - 1
                            cell_ids = range(0, 12 * nside**2)
                            dset = dset.assign_coords(
                                {"cell_ids": ("values", cell_ids)}
                            ).swap_dims({"values": "cell_ids"})
                            dset.cell_ids.attrs = {
                                "grid_name": "healpix",
                                "nside": nside,
                                "nest": nest,
                            }
                            dset["longitude"] = ((dset.longitude + 180) % 360) - 180
                            # Keep only the cells saved in cell_ids.zarr;
                            # xarray aligns both datasets on the cell_ids index
                            dcell_ids = xr.open_dataset("cell_ids.zarr", engine="zarr")
                            dcell_ids = dcell_ids.expand_dims(
                                dim={"time": dset.time.size}
                            )
                            dset = dset.sortby("cell_ids")
                            dset.where(
                                dset.cell_ids == dcell_ids.cell_ids, drop=True
                            ).to_zarr(zarr_path)
                        else:
                            print("zarr file exists for ", uri)
                    except Exception as e:
                        print("not processing file", uri, ":", e)

Open and consolidate 2D datasets#

list_files = []
for model in models:
    for var in variables2D:
        for year in years:
            for month in months:
                for p in range(2):
                    start_date = datetime.datetime(
                        year, month, 1
                    ) + p * datetime.timedelta(days=15)
                    end_date = start_date + datetime.timedelta(days=14)
                    zarrfile = (
                        f"{var}_{model}_{start_date:%Y%m%d}-{end_date:%Y%m%d}.zarr"
                    )
                    print(zarrfile)

                    if os.path.isdir("small/" + zarrfile):
                        list_files.append("small/" + zarrfile)
dset = xr.open_mfdataset(list_files, engine="zarr")
dset
dset.sortby("cell_ids").chunk("auto").to_zarr(
    "d2D_consolidated.zarr", mode="w", consolidated=True, compute=True
)
dset = xr.open_mfdataset(["d2D_consolidated.zarr"], engine="zarr")
dset

Set the path to the remote location#

target2D = fsspec.get_mapper(
    "s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_2D_sorted.zarr",
    client_kwargs={
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
        "region_name": "gra",
    },
)
dset.sortby("cell_ids").chunk("auto").to_zarr(
    store=target2D, mode="w", consolidated=True, compute=True
)
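fsspec.get_mapper is only one way to point xarray at the remote store; passing the URL and storage_options directly to to_zarr should be equivalent (a sketch using the same endpoint):

dset.sortby("cell_ids").chunk("auto").to_zarr(
    "s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_2D_sorted.zarr",
    mode="w",
    consolidated=True,
    storage_options={
        "client_kwargs": {
            "endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
            "region_name": "gra",
        }
    },
)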

Open and consolidate 3D datasets#

list_files = []
for model in models:
    for var in variables3D:
        for year in years:
            for month in months:
                for p in range(2):
                    start_date = datetime.datetime(
                        year, month, 1
                    ) + p * datetime.timedelta(days=15)
                    end_date = start_date + datetime.timedelta(days=14)
                    zarrfile = (
                        f"{var}_{model}_{start_date:%Y%m%d}-{end_date:%Y%m%d}.zarr"
                    )
                    print(zarrfile)

                    if os.path.isdir("small/" + zarrfile):
                        list_files.append("small/" + zarrfile)
list_files
dset = xr.open_mfdataset(list_files, engine="zarr")
dset

Set the path to the remote location#

target3D = fsspec.get_mapper(
    "s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_3D.zarr",
    client_kwargs={
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
        "region_name": "gra",
    },
)
dset.sortby("cell_ids").chunk("auto").to_zarr(
    store=target3D, mode="w", consolidated=True, compute=True
)

Load the remote Zarr stores#

fsg = s3fs.S3FileSystem(
    anon=False,
    client_kwargs={
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
        "region_name": "gra",
    },
)
store = s3fs.S3Map(
    root="s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_2D.zarr",
    s3=fsg,
    check=False,
)
d2D = xr.open_zarr(store=store, consolidated=True)
d2D
store = s3fs.S3Map(
    root="s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_3D.zarr",
    s3=fsg,
    check=False,
)
d3D = xr.open_zarr(store=store, consolidated=True)
d3D
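A quick consistency check: if both stores were subset with the same bounding box, they should cover the same number of cells.

print(d2D.sizes["cell_ids"], d3D.sizes["cell_ids"])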

Basic visualization#

# hvplot.xarray has trouble with a one-dimensional index,
# so as a workaround we convert the data to a pandas DataFrame;
# that conversion in turn requires resetting the cell_ids index first.
df = d3D.isel(time=0, oceanModelLayer=0).reset_index("cell_ids").to_dataframe()
df[df.avg_thetao.notna()].hvplot.scatter(
    x="longitude",
    y="latitude",
    c="avg_thetao",
    s=5,
    geo=True,
    global_extent=True,
    frame_height=450,
    projection=ccrs.Orthographic(0, 30),
    coastline=True,
)
d2D.avg_zos.isel(cell_ids=1000).to_dataframe()["avg_zos"].hvplot()
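Rather than an arbitrary cell index, the cell nearest a point of interest can be picked out; a sketch, using 48.5°N, 5°W as an arbitrary point inside the bbox:

# Squared angular distance to the target point (adequate at this scale)
dist = (d2D.latitude - 48.5) ** 2 + (d2D.longitude + 5.0) ** 2
nearest = int(dist.argmin())
d2D.avg_zos.isel(cell_ids=nearest).to_dataframe()["avg_zos"].hvplot()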