Read parquet kerchunk catalog

Contents

Read parquet kerchunk catalog#

import fsspec
import xarray as xr
%%time
# Kerchunk reference catalog (parquet) for the combined 2D fields.
url = "s3://gfts-reference-data/CMEMS_v6r1_NWS_PHY_NRT_NL_01hav_AN_2D_combined.parq/"

# target_opts is used when reading the reference catalog itself;
# remote_opts is used when reading the netCDF files the references point at.
# NOTE(review): anon=False makes s3fs authenticate with configured credentials
# instead of connecting anonymously -- confirm whether these buckets really
# need credentials or whether anonymous access was intended.
target_opts = dict(anon=False)
remote_opts = dict(anon=False)

# Gather the keyword arguments first so the filesystem call stays on one line.
reference_kwargs = dict(
    fo=url,
    remote_protocol="s3",
    remote_options=remote_opts,
    target_options=target_opts,
)
fs = fsspec.filesystem("reference", **reference_kwargs)

# Key/value mapper rooted at the top of the reference filesystem.
m = fs.get_mapper("")
CPU times: user 286 ms, sys: 30.5 ms, total: 316 ms
Wall time: 421 ms
%%time
# Open the reference mapper through the zarr engine. chunks={} keeps the
# variables as dask arrays (see the repr below); consolidated=False tells the
# backend not to look for consolidated zarr metadata.
zarr_backend_options = {"consolidated": False}
ds = xr.open_dataset(m, engine="zarr", chunks={}, backend_kwargs=zarr_backend_options)
CPU times: user 2.1 s, sys: 191 ms, total: 2.29 s
Wall time: 2.63 s
ds
<xarray.Dataset> Size: 521GB
Dimensions:    (latitude: 551, longitude: 936, time: 18048)
Coordinates:
  * latitude   (latitude) float32 2kB 46.0 46.03 46.06 ... 61.23 61.25 61.28
  * longitude  (longitude) float32 4kB -16.0 -15.97 -15.94 ... 9.921 9.949 9.977
  * time       (time) datetime64[ns] 144kB 2022-04-02T00:30:00 ... 2024-04-22...
Data variables:
    mlotst     (time, latitude, longitude) float64 74GB dask.array<chunksize=(1, 551, 936), meta=np.ndarray>
    thetao     (time, latitude, longitude) float64 74GB dask.array<chunksize=(1, 551, 936), meta=np.ndarray>
    ubar       (time, latitude, longitude) float64 74GB dask.array<chunksize=(1, 551, 936), meta=np.ndarray>
    uo         (time, latitude, longitude) float64 74GB dask.array<chunksize=(1, 551, 936), meta=np.ndarray>
    vbar       (time, latitude, longitude) float64 74GB dask.array<chunksize=(1, 551, 936), meta=np.ndarray>
    vo         (time, latitude, longitude) float64 74GB dask.array<chunksize=(1, 551, 936), meta=np.ndarray>
    zos        (time, latitude, longitude) float64 74GB dask.array<chunksize=(1, 551, 936), meta=np.ndarray>
Attributes: (12/13)
    Conventions:     CF-1.8
    comment:         
    contact:         https://marine.copernicus.eu/contact
    domain_name:     NWS36
    field_date:      20220402
    field_type:      mean
    ...              ...
    forecast_type:   analysis
    institution:     Nologin (Spain)
    licence:         https://marine.copernicus.eu/user-corner/service-commitm...
    references:      http://marine.copernicus.eu/
    source:          NEMO3.6
    title:           Ocean surface hourly mean fields for the North West Shel...
ds["thetao"]
<xarray.DataArray 'thetao' (time: 18048, latitude: 551, longitude: 936)> Size: 74GB
dask.array<open_dataset-thetao, shape=(18048, 551, 936), dtype=float64, chunksize=(1, 551, 936), chunktype=numpy.ndarray>
Coordinates:
  * latitude   (latitude) float32 2kB 46.0 46.03 46.06 ... 61.23 61.25 61.28
  * longitude  (longitude) float32 4kB -16.0 -15.97 -15.94 ... 9.921 9.949 9.977
  * time       (time) datetime64[ns] 144kB 2022-04-02T00:30:00 ... 2024-04-22...
Attributes: (12/14)
    easting:        longitude
    latitude_max:   61.2819f
    latitude_min:   46.0036f
    long_name:      Temperature
    longitude_max:  9.977f
    longitude_min:  -15.996f
    ...             ...
    unit_long:      degrees_C
    units:          degrees_C
    valid_max:      22000
    valid_min:      -12000
    z_max:          0.494025f
    z_min:          0.494025f
# Take the temperature variable, pick the time step closest to the requested
# instant and pull that single 2D slice into memory. The time axis sits on the
# half hour, so the nearest match here is 2022-04-02T00:30.
surface_temperature = ds["thetao"]
da = surface_temperature.sel(time="2022-04-02 00:00", method="nearest").load()
da
<xarray.DataArray 'thetao' (latitude: 551, longitude: 936)> Size: 4MB
array([[13.31200016, 13.32400016, 13.33900016, ...,         nan,
                nan,         nan],
       [13.32600016, 13.34100016, 13.34700016, ...,         nan,
                nan,         nan],
       [13.34300016, 13.35300016, 13.34600016, ...,         nan,
                nan,         nan],
       ...,
       [ 8.45599993,  8.49499993,  8.53999993, ...,         nan,
                nan,         nan],
       [ 8.44899993,  8.47899993,  8.53999993, ...,         nan,
                nan,         nan],
       [ 8.45999993,  8.47099993,  8.53399993, ...,         nan,
                nan,         nan]])
Coordinates:
  * latitude   (latitude) float32 2kB 46.0 46.03 46.06 ... 61.23 61.25 61.28
  * longitude  (longitude) float32 4kB -16.0 -15.97 -15.94 ... 9.921 9.949 9.977
    time       datetime64[ns] 8B 2022-04-02T00:30:00
Attributes: (12/14)
    easting:        longitude
    latitude_max:   61.2819f
    latitude_min:   46.0036f
    long_name:      Temperature
    longitude_max:  9.977f
    longitude_min:  -15.996f
    ...             ...
    unit_long:      degrees_C
    units:          degrees_C
    valid_max:      22000
    valid_min:      -12000
    z_max:          0.494025f
    z_min:          0.494025f
# Plot the loaded 2D slice as a rasterized quadmesh with longitude on x and
# latitude on y.
# NOTE(review): the `.hvplot` accessor only exists after `import hvplot.xarray`;
# that import is not visible in this section -- confirm it runs earlier.
da.hvplot.quadmesh(x="longitude", y="latitude", rasterize=True, data_aspect=1)

Read 3D data#

%%time
# Kerchunk reference catalog (parquet) for the combined 3D fields.
url = "s3://gfts-reference-data/CMEMS_v6r1_NWS_PHY_NRT_NL_3D_combined.parq/"

# target_opts is used when reading the reference catalog itself;
# remote_opts is used when reading the netCDF files the references point at.
# NOTE(review): anon=False makes s3fs authenticate with configured credentials
# instead of connecting anonymously -- confirm whether these buckets really
# need credentials or whether anonymous access was intended.
target_opts = dict(anon=False)
remote_opts = dict(anon=False)

# Gather the keyword arguments first so the filesystem call stays on one line.
reference_kwargs_3d = dict(
    fo=url,
    remote_protocol="s3",
    remote_options=remote_opts,
    target_options=target_opts,
)
fs3D = fsspec.filesystem("reference", **reference_kwargs_3d)

# Key/value mapper rooted at the top of the 3D reference filesystem.
m3D = fs3D.get_mapper("")
CPU times: user 20.6 ms, sys: 1.34 ms, total: 21.9 ms
Wall time: 67.7 ms
%%time
# Open the 3D reference mapper through the zarr engine. chunks={} keeps the
# variables as dask arrays (see the repr below); consolidated=False tells the
# backend not to look for consolidated zarr metadata.
zarr_backend_options_3d = {"consolidated": False}
ds3D = xr.open_dataset(
    m3D, engine="zarr", chunks={}, backend_kwargs=zarr_backend_options_3d
)
CPU times: user 102 ms, sys: 620 µs, total: 102 ms
Wall time: 413 ms
ds3D
<xarray.Dataset> Size: 6TB
Dimensions:    (depth: 50, latitude: 551, longitude: 936, time: 7104)
Coordinates:
  * depth      (depth) float32 200B 0.494 1.541 2.646 ... 5.275e+03 5.728e+03
  * latitude   (latitude) float32 2kB 46.0 46.03 46.06 ... 61.23 61.25 61.28
  * longitude  (longitude) float32 4kB -16.0 -15.97 -15.94 ... 9.921 9.949 9.977
  * time       (time) datetime64[ns] 57kB 2023-07-02T00:30:00 ... 2024-04-22T...
Data variables:
    so         (time, depth, latitude, longitude) float64 1TB dask.array<chunksize=(1, 1, 551, 936), meta=np.ndarray>
    thetao     (time, depth, latitude, longitude) float64 1TB dask.array<chunksize=(1, 1, 551, 936), meta=np.ndarray>
    uo         (time, depth, latitude, longitude) float64 1TB dask.array<chunksize=(1, 1, 551, 936), meta=np.ndarray>
    vo         (time, depth, latitude, longitude) float64 1TB dask.array<chunksize=(1, 1, 551, 936), meta=np.ndarray>
Attributes: (12/13)
    Conventions:     CF-1.8
    comment:         
    contact:         https://marine.copernicus.eu/contact
    domain_name:     NWS36
    field_date:      20230702
    field_type:      mean
    ...              ...
    forecast_type:   hindcast
    institution:     Nologin (Spain)
    licence:         https://marine.copernicus.eu/user-corner/service-commitm...
    references:      http://marine.copernicus.eu/
    source:          NEMO3.6
    title:           Ocean 3D hourly mean fields for the North West Shelf (NW...
# Load one 2D slice of the 3D temperature field: first depth level, at the
# time step closest to the requested instant.
# The 3D time axis starts at 2023-07-02T00:30 (see the ds3D repr above). With
# method="nearest", any earlier timestamp silently resolves to that first
# step, so request an instant inside the range to keep the label honest about
# what is actually plotted.
da3D = (
    ds3D["thetao"].isel(depth=0).sel(time="2023-07-02 00:00", method="nearest").load()
)
# Plot the slice as a rasterized quadmesh with longitude on x and latitude on y.
# NOTE(review): the `.hvplot` accessor only exists after `import hvplot.xarray`;
# that import is not visible in this section -- confirm it runs earlier.
da3D.hvplot.quadmesh(x="longitude", y="latitude", rasterize=True, data_aspect=1)