Extract selected area from DestinE Climate Twin in HEALPix to Zarr#
The goal of this notebook is to read 2D and 3D variables from the GFTS bucket, select a small geographical area, and store the results in Zarr.
pip install xdggs
import xarray as xr
import healpy as hp
import fsspec
import datetime
import os
import s3fs
import hvplot.pandas # noqa
import hvplot.xarray # noqa
import cartopy.crs as ccrs
2D variables to process#
variables2D = ["avg_sos", "avg_hc300m", "avg_hc700m", "avg_zos"]
years = [2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030]
months = [6]
models = ["ifs-nemo"]
maxlevels = {}
maxlevels["ifs-nemo"] = 75
maxlevels["icon"] = 72
Define Geographical area#
bbox = {"latitude": [46, 51], "longitude": [-8, -1]}
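This box covers roughly the western English Channel and the northern Bay of Biscay. The selection is applied inline in the processing cells below; purely as an illustration, it is equivalent to the following helper (a sketch; select_bbox is not used elsewhere in this notebook, and longitudes are assumed to be already wrapped to [-180, 180)):
def select_bbox(ds, bbox=bbox):
    # keep only the cells whose coordinates fall inside the bounding box
    return ds.where(
        (ds.latitude >= bbox["latitude"][0])
        & (ds.latitude <= bbox["latitude"][1])
        & (ds.longitude >= bbox["longitude"][0])
        & (ds.longitude <= bbox["longitude"][1]),
        drop=True,
    )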
Processing#
%%time
for model in models:
for var in variables2D:
for year in years:
for month in months:
for p in range(2):
start_date = datetime.datetime(
year, month, 1
) + p * datetime.timedelta(days=15)
end_date = start_date + datetime.timedelta(days=14)
uri = (
"simplecache::s3://gfts-reference-data/ClimateDT/raw/"
+ var
+ "_"
+ model
+ "_"
+ start_date.strftime("%Y%m%d")
+ "-"
+ end_date.strftime("%Y%m%d")
+ ".grib"
)
print(uri)
try:
if not os.path.isdir(
"small/" + os.path.basename(uri).split(".grib")[0] + ".zarr"
):
file = fsspec.open_local(
uri,
s3={"anon": False},
filecache={
"cache_storage": "/home/jovyan/cache_storage"
},
)
dset = xr.open_dataset(file, engine="cfgrib")
npix = dset.sizes["values"]
nest = True
nside = hp.npix2nside(npix)
                            # assign one HEALPix cell id per spatial point
                            # (note: ids run in descending order, from 12 * nside**2 down to 1)
                            cell_ids = range(12 * nside**2, 0, -1)
dset = dset.assign_coords(
{"cell_ids": ("values", cell_ids)}
).swap_dims({"values": "cell_ids"})
dset.cell_ids.attrs = {
"grid_name": "healpix",
"nside": nside,
"nest": nest,
}
dset["longitude"] = ((dset.longitude + 180) % 360) - 180
dset.sortby("cell_ids").where(
(dset.latitude >= bbox["latitude"][0])
& (dset.latitude <= bbox["latitude"][1])
& (dset.longitude >= bbox["longitude"][0])
& (dset.longitude <= bbox["longitude"][1]),
drop=True,
).to_zarr(
"small/"
+ os.path.basename(uri).split(".grib")[0]
+ ".zarr"
)
else:
print("zarr file exists for ", uri)
                    except Exception as e:
                        print("not processing file ", uri, ":", e)
Save geographical area cell_ids#
var
if not os.path.isdir("cell_ids.zarr") and "dset" in locals():
dset.sortby("cell_ids").isel(time=0).reset_coords("time", drop=True).where(
(dset.latitude >= bbox["latitude"][0])
& (dset.latitude <= bbox["latitude"][1])
& (dset.longitude >= bbox["longitude"][0])
& (dset.longitude <= bbox["longitude"][1]),
drop=True,
).drop_vars(var).to_zarr("cell_ids.zarr")
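A quick check that the reference subset was written, and how many HEALPix cells fall inside the bounding box (a small sketch):
if os.path.isdir("cell_ids.zarr"):
    ref = xr.open_dataset("cell_ids.zarr", engine="zarr")
    print(ref.sizes["cell_ids"], "cells inside the bounding box")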
3D variables to process#
variables3D = ["avg_thetao", "avg_so", "avg_von", "avg_uoe", "avg_wo"]
for model in models:
for var in variables3D:
for year in years:
for month in months:
for p in range(2):
start_date = datetime.datetime(
year, month, 1
) + p * datetime.timedelta(days=15)
end_date = start_date + datetime.timedelta(days=14)
uri = (
"simplecache::s3://gfts-reference-data/ClimateDT/raw/"
+ var
+ "_"
+ model
+ "_"
+ start_date.strftime("%Y%m%d")
+ "-"
+ end_date.strftime("%Y%m%d")
+ ".grib"
)
print(uri)
try:
if not os.path.isdir(
"small/" + os.path.basename(uri).split(".grib")[0] + ".zarr"
):
file = fsspec.open_local(
uri,
s3={"anon": False},
filecache={
"cache_storage": "/home/jovyan/cache_storage"
},
)
dset = xr.open_dataset(
file, engine="cfgrib", chunks={"time": 1}
)
npix = dset.sizes["values"]
nest = True
nside = hp.npix2nside(npix)
                            # same descending cell id assignment as for the 2D variables,
                            # so that the ids line up with cell_ids.zarr
                            cell_ids = range(12 * nside**2, 0, -1)
dset = dset.assign_coords(
{"cell_ids": ("values", cell_ids)}
).swap_dims({"values": "cell_ids"})
dset.cell_ids.attrs = {
"grid_name": "healpix",
"nside": nside,
"nest": nest,
}
dset["longitude"] = ((dset.longitude + 180) % 360) - 180
dcell_ids = xr.open_dataset("cell_ids.zarr", engine="zarr")
dcell_ids = dcell_ids.expand_dims(
dim={"time": dset.time.size}
)
dset = dset.sortby("cell_ids")
dset.where(
dset.cell_ids == dcell_ids.cell_ids, drop=True
).to_zarr(
"small/"
+ os.path.basename(uri).split(".grib")[0]
+ ".zarr"
)
else:
print("zarr file exists for ", uri)
                    except Exception as e:
                        print("not processing file ", uri, ":", e)
Open and consolidate 2D datasets#
list_files = []
for model in models:
for var in variables2D:
for year in years:
for month in months:
for p in range(2):
start_date = datetime.datetime(
year, month, 1
) + p * datetime.timedelta(days=15)
end_date = start_date + datetime.timedelta(days=14)
zarrfile = (
var
+ "_"
+ model
+ "_"
+ start_date.strftime("%Y%m%d")
+ "-"
+ end_date.strftime("%Y%m%d")
+ ".zarr"
)
print(zarrfile)
if os.path.isdir("small/" + os.path.basename(zarrfile)):
list_files.append("small/" + zarrfile)
dset = xr.open_mfdataset(list_files, engine="zarr")
dset
dset.sortby("cell_ids").chunk("auto").to_zarr(
"d2D_consolidated.zarr", mode="w", consolidated=True, compute=True
)
dset = xr.open_mfdataset(["d2D_consolidated.zarr"], engine="zarr")
dset
Set the path to the remote location#
target2D = fsspec.get_mapper(
"s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_2D_sorted.zarr",
client_kwargs={
"endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
"region_name": "gra",
},
)
dset.sortby("cell_ids").chunk("auto").to_zarr(
store=target2D, mode="w", consolidated=True, compute=True
)
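The mapper above is expected to pick up S3 credentials for the GFTS bucket from the environment (for example ~/.aws/credentials or AWS_* variables). They can also be passed explicitly; in the sketch below, the key and secret are hypothetical placeholders:
target2D_explicit = fsspec.get_mapper(
    "s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_2D_sorted.zarr",
    key="ACCESS_KEY_PLACEHOLDER",  # placeholder, replace with a real access key
    secret="SECRET_KEY_PLACEHOLDER",  # placeholder, replace with a real secret key
    client_kwargs={
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
        "region_name": "gra",
    },
)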
Open and consolidate 3D datasets#
list_files = []
for model in models:
for var in variables3D:
for year in years:
for month in months:
for p in range(2):
start_date = datetime.datetime(
year, month, 1
) + p * datetime.timedelta(days=15)
end_date = start_date + datetime.timedelta(days=14)
zarrfile = (
var
+ "_"
+ model
+ "_"
+ start_date.strftime("%Y%m%d")
+ "-"
+ end_date.strftime("%Y%m%d")
+ ".zarr"
)
print(zarrfile)
if os.path.isdir("small/" + os.path.basename(zarrfile)):
list_files.append("small/" + zarrfile)
list_files
dset = xr.open_mfdataset(list_files, engine="zarr")
dset
Set the path to the remote location#
target3D = fsspec.get_mapper(
"s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_3D.zarr",
client_kwargs={
"endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
"region_name": "gra",
},
)
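Before pushing the 3D data to the remote store, it can help to inspect the chunk layout that chunk("auto") will produce; a quick sketch:
chunked = dset.sortby("cell_ids").chunk("auto")
# chunk sizes per variable (dask chunks), before writing to the remote Zarr store
print({name: da.chunks for name, da in chunked.data_vars.items()})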
dset.sortby("cell_ids").chunk("auto").to_zarr(
store=target3D, mode="w", consolidated=True, compute=True
)
Loading remote zarr#
fsg = s3fs.S3FileSystem(
anon=False,
client_kwargs={
"endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
"region_name": "gra",
},
)
store = s3fs.S3Map(
root="s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_2D.zarr",
s3=fsg,
check=False,
)
d2D = xr.open_zarr(store=store, consolidated=True)
d2D
store = s3fs.S3Map(
root="s3://gfts-reference-data/ClimateDT/bbox_area1/climateDT_3D.zarr",
s3=fsg,
check=False,
)
d3D = xr.open_zarr(store=store, consolidated=True)
d3D
Basic visualization#
# hvplot.xarray has trouble with the 1D cell_ids index.
# As a workaround, convert the selection to a pandas DataFrame,
# resetting the cell_ids index (reset_index) before plotting.
df = d3D.isel(time=0, oceanModelLayer=0).reset_index("cell_ids").to_dataframe()
df[df.avg_thetao.notna()].hvplot.scatter(
x="longitude",
y="latitude",
c="avg_thetao",
s=5,
geo=True,
global_extent=True,
frame_height=450, # , tiles=True
projection=ccrs.Orthographic(0, 30),
# , marker='h', size=20
coastline=True,
)
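If the hvplot/pandas workaround is not convenient, a plain matplotlib scatter of the same selection gives a comparable quick look; a minimal sketch:
import matplotlib.pyplot as plt
sub = d3D.isel(time=0, oceanModelLayer=0)
sub = sub.where(sub.avg_thetao.notnull(), drop=True)  # drop empty cells, as in the hvplot version
plt.scatter(sub.longitude, sub.latitude, c=sub.avg_thetao, s=5)
plt.colorbar(label="avg_thetao")
plt.xlabel("longitude")
plt.ylabel("latitude")
plt.show()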
d2D.avg_zos.isel(cell_ids=1000).to_dataframe()["avg_zos"].hvplot()