Data Reduction: Quarters

In the cells below, remember to check the definitions of the constant values (and update them if needed!).

0. Initialization

!pip install xdggs
!pip install xarray --upgrade
!pip install --upgrade "cf_xarray>=0.10.4"

import os
import json
import sys
from pathlib import Path

# --- Constants: review them (and update them if needed) before running ---

# Profile name, exported below as the PROFILE environment variable.
PROFILE = "gfts"

# Buckets the data is read from / written to.
SOURCE_BUCKET = "gfts-ifremer"
TARGET_BUCKET = "destine-gfts-visualisation-data"

# Root URL of the tag data, and the storage options that go with it.
TAG_ROOT = "https://data-taos.ifremer.fr/data_tmp/cleaned/tag/"
TAG_ROOT_STORAGE_OPTIONS = {}
# The inner quotes are single quotes on purpose: reusing the enclosing double
# quotes inside an f-string is a syntax error before Python 3.12.
# NOTE: if JUPYTERHUB_USER is unset, `os.getenv` returns None and the prefix
# becomes "kbatch_papermill/None/".
SOURCE_PREFIX = f"kbatch_papermill/{os.getenv('JUPYTERHUB_USER')}/"
SOURCE_SUFFIX = ""
TARGET_PREFIX = "taos_pollock/"

# set the constant values as environment variables
# (presumably read by the helper scripts imported further down -- TODO confirm)
os.environ["PROFILE"] = PROFILE
os.environ["SOURCE_BUCKET"] = SOURCE_BUCKET
os.environ["TARGET_BUCKET"] = TARGET_BUCKET
os.environ["TAG_ROOT"] = TAG_ROOT
# environment values must be strings, hence the JSON serialization of the dict
os.environ["TAG_ROOT_STORAGE_OPTIONS"] = json.dumps(TAG_ROOT_STORAGE_OPTIONS)
os.environ["SOURCE_PREFIX"] = SOURCE_PREFIX
os.environ["SOURCE_SUFFIX"] = SOURCE_SUFFIX
os.environ["TARGET_PREFIX"] = TARGET_PREFIX

# make the `scripts` directory of the local gfts checkout importable; the
# `groups` and `simplify` imports below are expected to resolve from there

path_to_local_gfts = "gfts"
scripts_dir = str(Path.home() / path_to_local_gfts / "scripts")
if scripts_dir not in sys.path:  # do not pile up duplicates when the cell is re-run
    sys.path.append(scripts_dir)
from groups import create_groups, rotate_group, convert_to_parquet  # noqa: E402
from simplify import list_tags  # noqa: E402

1. Execution

# collect the tags; the bare name on the following line displays the result
# when this is run as a notebook cell
tag_list = list_tags()
tag_list

# possibly, filter the tags to only select some of them
# ...
# NOTE(review): as written, this slice drops the LAST entry of `tag_list`
# (assuming it is a sequence). Adapt or remove it so that it matches the
# selection you actually want.
tag_list = tag_list[:-1]

# group the selected tags (no `method` / `cell_ids` arguments are passed here)
groups = create_groups(tag_list)
groups

If your study involves different areas (e.g., the tags were processed with different bounding boxes), you need to specify over which area the data is regrouped: either the intersection or the union of the individual areas.

Below, we illustrate how to regroup the data over the whole area covered by the tags (i.e., the union).

NB: this feature currently supports only HEALPix data.

from groups import compute_cell_ids, open_dataset  # noqa: E402

# names of the tags whose datasets define the area of interest
# (placeholders: replace them with real tag names)
tag_names = ["tag_that_went_to_the_west", "tag_that_went_to_the_north"]

# open one dataset per tag name, then merge their cells with the "union" method
tags = list(map(open_dataset, tag_names))
cell_ids = compute_cell_ids(tags, method="union")
print(f"Found a total of {len(cell_ids)} cells.")

# same grouping call as before, now passing the union specification as well
groups = create_groups(tag_list, cell_ids=cell_ids, method="union")
groups

# rebind `groups` to the output of `rotate_group`, then display it
groups = rotate_group(groups)
groups

# the result of this call is not stored: it is made for its side effects
# (presumably writing parquet files under TARGET_BUCKET/TARGET_PREFIX, the
# location listed in the inspection step -- TODO confirm in `groups.py`)
convert_to_parquet(groups)

Optionally, you can inspect the results:

import s3fs  # noqa: E402

# Non-anonymous access. The profile name is taken from the PROFILE constant
# set in the initialization step rather than repeated here, so that updating
# the constant also applies to this inspection.
storage_options = {
    "anon": False,
    "profile": PROFILE,
    "client_kwargs": {
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net/",
        "region_name": "gra",
    },
}

s3 = s3fs.S3FileSystem(**storage_options)
# list the entries found under the target prefix of the target bucket
s3.ls(f"{TARGET_BUCKET}/{TARGET_PREFIX}")