Data Reduction: Quarters
In the cells below, remember to check the definitions of the constant values (and update them if needed!).
0. Initialization
!pip install xdggs
!pip install xarray --upgrade
!pip install --upgrade "cf_xarray>=0.10.4"
import os
import json
import sys
from pathlib import Path
# Constants of the notebook: check them (and adapt them if needed) before
# running the cells below. Each one is exported as an environment variable of
# the same name right after this block.
PROFILE = "gfts"
SOURCE_BUCKET = "gfts-ifremer"
TARGET_BUCKET = "destine-gfts-visualisation-data"
TAG_ROOT = "https://data-taos.ifremer.fr/data_tmp/cleaned/tag/"
# exported JSON-encoded, so it must stay JSON-serializable
TAG_ROOT_STORAGE_OPTIONS = {}
# Per-user prefix built from the JUPYTERHUB_USER environment variable.
# The inner quotes are single quotes on purpose: reusing the outer quote type
# inside an f-string is a SyntaxError before Python 3.12.
# NOTE: if JUPYTERHUB_USER is unset, `os.getenv` returns None and the prefix
# becomes "kbatch_papermill/None/".
SOURCE_PREFIX = f"kbatch_papermill/{os.getenv('JUPYTERHUB_USER')}/"
SOURCE_SUFFIX = ""
TARGET_PREFIX = "taos_pollock/"
# Export every constant as an environment variable of the same name
# (presumably read by the helper scripts imported below -- confirm there).
# Environment values must be strings, hence the JSON encoding of the
# storage-options dict.
os.environ.update(
    {
        "PROFILE": PROFILE,
        "SOURCE_BUCKET": SOURCE_BUCKET,
        "TARGET_BUCKET": TARGET_BUCKET,
        "TAG_ROOT": TAG_ROOT,
        "TAG_ROOT_STORAGE_OPTIONS": json.dumps(TAG_ROOT_STORAGE_OPTIONS),
        "SOURCE_PREFIX": SOURCE_PREFIX,
        "SOURCE_SUFFIX": SOURCE_SUFFIX,
        "TARGET_PREFIX": TARGET_PREFIX,
    }
)
# Make the `scripts` directory of the local `gfts` folder (looked up under the
# home directory) importable; the `groups` and `simplify` imports that follow
# are presumably resolved from there.
path_to_local_gfts = "gfts"
sys.path.append(str(Path.home().joinpath(path_to_local_gfts, "scripts")))
from groups import create_groups, rotate_group, convert_to_parquet # noqa: E402
from simplify import list_tags # noqa: E402
1. Execution
# collect the tags with `list_tags` (imported from the local `simplify` module)
tag_list = list_tags()
tag_list
# possibly, filter the tags to only select some of them
# ...
# NOTE: as written, the slice below unconditionally drops the last entry of
# `tag_list`; adapt or remove it so that it matches the tags you want to keep
tag_list = tag_list[:-1]
# build the groups from the (filtered) tags
groups = create_groups(tag_list)
groups
If your study involves different areas (e.g., the tags were processed with different bounding boxes), you need to specify how to regroup the data: either by the intersection or by the union of these areas.
Below, we illustrate how to regroup the data over the whole area covered by the tags (i.e., the union).
NB: this feature currently supports HEALPix data only.
from groups import open_dataset, compute_cell_ids # noqa: E402
# names of the tags whose areas determine the region to cover
tag_names = ["tag_that_went_to_the_west", "tag_that_went_to_the_north"]
# open one dataset per selected tag, in the order listed above
tags = list(map(open_dataset, tag_names))
# combine the cells of the selected tags using the "union" method
cell_ids = compute_cell_ids(tags, method="union")
print(f"Found a total of {len(cell_ids)} cells.")
# regroup as before, now passing the union method and the computed cells
groups = create_groups(tag_list, method="union", cell_ids=cell_ids)
groups
# rebind `groups` to the output of `rotate_group` (from the local `groups` module)
groups = rotate_group(groups)
groups
# presumably writes the parquet output under TARGET_BUCKET/TARGET_PREFIX, which
# is the location listed at the end of the notebook -- confirm in `groups.py`
convert_to_parquet(groups)
Optionally, you can inspect the results:
import s3fs # noqa: E402
# S3 connection settings used to list what ended up in the target location.
storage_options = {
    "anon": False,
    # reuse the PROFILE constant defined at the top of the notebook instead of
    # repeating the literal, so that updating PROFILE also applies to this cell
    "profile": PROFILE,
    "client_kwargs": {
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net/",
        "region_name": "gra",
    },
}
s3 = s3fs.S3FileSystem(**storage_options)
# list the objects found under the target bucket / prefix
s3.ls(f"{TARGET_BUCKET}/{TARGET_PREFIX}")