Tool: How to summarize the tag data#

This notebook provides simple functions for getting an overview of your biologging data#

As such, we expect you to have already pre-processed the raw data of your biologging campaign, as previously introduced.

0. Notebook Initialization#

# needed for now...
!pip install rich zstandard
!pip install "xarray-healpy @ git+https://github.com/iaocea/xarray-healpy.git@0ffca6058f4008f4f22f076e2d60787fcf32ac82"
!pip install movingpandas more_itertools
!pip install xarray --upgrade
!pip install xdggs
!pip install healpix-convolution
# Python imports
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import s3fs
import tqdm
import xarray as xr

Update the variables below with your data.

TAG_ROOT = "path_to_your_formatted_tag_folder/"
# TAG_ROOT = "gfts-ifremer/bargip/tag/formatted/"
storage_options = {
    "anon": False,
    "profile": "gfts",
    "client_kwargs": {
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net/",
        "region_name": "gra",
    },
}
s3 = s3fs.S3FileSystem(**storage_options)
tag_names = [
    tn.replace(TAG_ROOT, "") for tn in s3.ls(TAG_ROOT) if not tn.endswith(".csv")
]
print(f'Found {len(tag_names)} tag folder(s) in: "{TAG_ROOT}".')
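If you want to quickly check the result, you can print the first few entries of tag_names (this simply slices the list built above):

# quick sanity check: display the first few tag names found under TAG_ROOT
print(tag_names[:5])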

1. Release Locations Overview#

In this first section, we visualize the release locations. Both a static and a dynamic visualization are implemented (the latter can be useful if you want to interact with the map).

def aggregate_release_locations(
    paths: list[str], labels: list[str], storage_options: dict = None
):
    """
    Aggregates the release locations from ``.csv`` files.

    Parameters
    ----------
    - paths : list of str
        The paths of the ``.csv`` files.
        They must have a column ``event_name`` (with a ``release`` entry) as well as the columns ``longitude`` and ``latitude``.
    - labels : list of str
        Names of the tags associated with the ``.csv`` files. **In other words, ``paths`` and ``labels`` must have the same length.**
    - storage_options : mapping, optional
        The storage options passed to ``pandas.read_csv()`` required to open the ``.csv`` files.

    Returns
    -------
    df : pandas.DataFrame
        A dataframe of all the release locations.
    """
    if len(paths) != len(labels):
        raise ValueError(
            "the number of labels provided is different than the number of paths."
        )

    pbar = tqdm.tqdm(paths, file=sys.stdout)
    lons, lats = [], []
    aws_prefix = "s3://"
    for p in pbar:
        if (storage_options is not None) and (not p.startswith(aws_prefix)):
            p = aws_prefix + p
        df = pd.read_csv(
            p, storage_options=storage_options, index_col="event_name"
        )  # .assign(time=lambda df: pd.to_datetime(df["time"]))
        s = df.loc["release"]
        lon = s["longitude"]
        lat = s["latitude"]
        lons.append(lon)
        lats.append(lat)
    data = {"tag_name": labels, "longitude": lons, "latitude": lats}
    pbar.close()
    return pd.DataFrame.from_dict(data, orient="columns")
df_paths = s3.glob(TAG_ROOT + "*/tagging_events.csv")
df = aggregate_release_locations(df_paths, tag_names, storage_options=storage_options)
df.head(3)

a. Static Visualization#

import cartopy.crs as ccrs
import cartopy.feature as cf
import matplotlib.pyplot as plt
import pandas as pd  # noqa: F811


def static_plot(df: pd.DataFrame, margin=1.0, label_col=None):
    """Static visualization of data points.

    Parameters
    ----------
    - df : pandas.DataFrame
        A dataframe that must have the columns ``longitude`` and ``latitude``.
    - margin : float, default: 1.0
        Value to extend the longitude and latitude ranges of the figure.
    - label_col : str, optional
        Name of the column to label the data points.

    Returns
    -------
    figure : matplotlib.figure.Figure
        The static figure.
    """

    ccr = ccrs.PlateCarree()

    fig, ax = plt.subplots(figsize=(8, 6), subplot_kw={"projection": ccr})

    gridlines_kwargs = {
        "crs": ccr,
        "draw_labels": True,
        "linewidth": 0.6,
        "color": "gray",
        "alpha": 0.5,
        "linestyle": "-.",
    }
    # coastlines
    ax.add_feature(cf.COASTLINE.with_scale("10m"), lw=0.5)
    # gridlines
    _gl = ax.gridlines(**gridlines_kwargs)

    ax.scatter(df["longitude"], df["latitude"], s=20)

    ax.set_xlim([df["longitude"].min() - margin, df["longitude"].max() + margin])
    ax.set_ylim([df["latitude"].min() - margin, df["latitude"].max() + margin])

    if label_col is not None:
        for i, row in df.iterrows():
            ax.text(
                row["longitude"],
                row["latitude"],
                row[label_col],
                fontsize=10,
                ha="right",
                transform=ccr,
            )

    ax.set_title("")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")

    return fig
figure = static_plot(df, margin=1.0)
figure.savefig("release_locations.png")
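The static_plot function also accepts a label_col argument; for example, you can annotate each release location with its tag name (the tag_name column created by aggregate_release_locations above):

# annotate each release location with its tag name
labeled_figure = static_plot(df, margin=1.0, label_col="tag_name")
labeled_figure.savefig("release_locations_labeled.png")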

b. Dynamic Visualization#

import pandas as pd  # noqa: F811
import geoviews as gv
import hvplot.pandas  # noqa: F401 (registers the ``.hvplot`` accessor used below)
import cartopy.crs as ccrs  # noqa: F811


def dynamic_plot(df: pd.DataFrame, margin=1.0, **points_kwargs):
    """Wrapper around ``pandas.DataFrame.hvplot.points``, with different defaults (for dynamic visualization of data points).

    Parameters
    ----------
    - df : pandas.DataFrame
        A dataframe that must have the columns ``longitude`` and ``latitude``.
    - margin : float, default: 1.0
        Value to extend the longitude and latitude ranges of the figure.

    Returns
    -------
    plot : holoviews.Overlay
        The combined plot of the points and the coastlines.

    Other Parameters
    ----------------
    points_kwargs : dict
        Additional arguments passed to df.hvplot.points.
        See its documentation for more information.
    """

    margin_kwargs = {
        "xlim": [df["longitude"].min() - margin, df["longitude"].max() + margin],
        "ylim": [df["latitude"].min() - margin, df["latitude"].max() + margin],
    }

    points = df.hvplot.points(
        x="longitude",
        y="latitude",
        geo=True,
        projection=ccrs.PlateCarree(),
        **(points_kwargs | margin_kwargs),
    )
    coastlines = gv.feature.coastline()
    return coastlines * points
plot = dynamic_plot(df, size=20, title="Release Locations", color="blue", margin=2.0)
plot
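Any extra keyword argument is forwarded to df.hvplot.points, so you can, for instance, add the tag names to the hover tooltip (hover_cols is a standard hvplot option; adapt it to your needs):

# show the tag name when hovering over a point
plot = dynamic_plot(
    df,
    size=20,
    title="Release Locations",
    color="blue",
    margin=2.0,
    hover_cols=["tag_name"],
)
plot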

2. Summary Table#

In this second section, we create a table (using pandas) that summarizes the following information for each tag:

  • The name/id of the tag, defined by the name of the tag folder.

  • Whether it has acoustic detections.

  • Whether it has a final position.

  • Start and end times.

  • Duration in days.

NB: this section relies (and thus depends) on the pangeo-fish package:

import sys  # noqa: F811

pangeo_fish_path = Path().home() / "pangeo-fish"
print(pangeo_fish_path)
sys.path.append(str(pangeo_fish_path))

# or %pip install pangeo-fish
from pangeo_fish.io import open_tag
import numpy as np  # noqa: F811
import xarray as xr  # noqa: F811


def summarize_tag(tag: xr.DataTree):
    """Summarizes DST's information.

    Parameters
    ----------
    - tag : xarray.DataTree
        A loaded DST.

    Returns
    -------
    has_acoustic_detection : bool
        Whether ``tag`` has acoustic detections.
    has_final_position : bool
        Whether ``tag`` has a final location.
    start, end : numpy.datetime64
        Start and end times of ``tag``.
    duration : int
        Duration of ``tag`` in days.
    """

    # TODO: improve acoustic boolean assignment
    has_acoustic_detection = len(tag.groups) == 5
    has_final_position = not bool(
        np.isnan(tag["tagging_events"].isel(event_name=1)["longitude"].to_numpy())
        or np.isnan(tag["tagging_events"].isel(event_name=1)["latitude"].to_numpy())
    )
    start = np.datetime64(tag["dst"].isel(time=0)["time"].to_numpy())
    end = np.datetime64(tag["dst"].isel(time=-1)["time"].to_numpy())
    delta = end - start
    duration_in_days = int(delta / np.timedelta64(1, "D"))
    return has_acoustic_detection, has_final_position, start, end, duration_in_days
pbar = tqdm.tqdm(tag_names, file=sys.stdout)
summary = {}
failed_tags = []

for tag_name in pbar:
    try:
        summary[tag_name] = summarize_tag(
            open_tag("s3://" + TAG_ROOT, tag_name, storage_options)
        )
    except Exception as e:
        print(f"EXCEPTION {tag_name}: {str(e)}.")
        failed_tags.append(tag_name)

pbar.close()
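If some tags could not be opened, their names are collected in failed_tags; you can quickly report them:

# report the tags that could not be summarized
if failed_tags:
    print(f"{len(failed_tags)} tag(s) failed: {failed_tags}")
else:
    print("All tags were summarized successfully.")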
import pandas as pd  # noqa: F811

columns = [
    "Acoustic Detection",
    "Has Final Position",
    "Start",
    "End",
    "Duration [days]",
]
df = pd.DataFrame.from_dict(summary, orient="index", columns=columns)
df.head(3)
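For a quick overview, you can also count how many tags have acoustic detections or a known final position (summing a boolean column counts its True entries):

# count the tags with acoustic detections and those with a known final position
print(f"Tags with acoustic detections: {df['Acoustic Detection'].sum()} / {len(df)}")
print(f"Tags with a final position: {df['Has Final Position'].sum()} / {len(df)}")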

You can sort df by the tags' duration, name, etc.

df.sort_values(by=[columns[-1]], inplace=True)  # by durations
df.head(3)
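You can also filter the table, for example to keep the tags that have acoustic detections and lasted more than 30 days (the threshold is arbitrary; adjust it to your campaign):

# example filter: tags with acoustic detections and a duration above 30 days
df[df["Acoustic Detection"] & (df["Duration [days]"] > 30)]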
output_folder = Path(".")
table_fn = "csv_filename.csv"
output_folder.mkdir(parents=True, exist_ok=True)
df.to_csv(output_folder / table_fn, index_label="Tag Name")
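To reload the summary table later (e.g., in another notebook), you can read the .csv file back with pandas:

# read the summary table back, parsing the timestamp columns
summary_df = pd.read_csv(
    output_folder / table_fn, index_col="Tag Name", parse_dates=["Start", "End"]
)
summary_df.head(3)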