Fish track visualisation#


This notebook is used to visualize computed fish-track for the GFTS project.#

The original code used Panel to make the visualization interactive. However, since the panne_plot_s3.ipynb is very slow, we created an HTML version of each tag result using this notebook and uploaded them to S3, allowing biologists to validate the data in batches, such as 10 at a time.

Please refer to the original code (panne_plot_s3.ipynb) for explanations of each function used, the experimental configurations performed, and the parameters applied in this notebook.

How to use this notebook.#

Please execute the whole notebook to visualise the first 10 HTML files. In the last cell, please modify the parameters `id_start` and `id_end` to display other batches of results.

To create/upload new html files, you will need to activate ‘raw’ cells, and update parameters.


# Import necessary libraries and modules.
import s3fs
from IPython.display import HTML, IFrame, display

Update the following with each experiment you will examine#

We define the name of the experiments and their related parameters in the next cell.

# The name of the experiment (S3 bucket/prefix holding all data for this run).
remote_path = "gfts-ifremer/tags/bargip"

# tag_storage_path: folder (under remote_path) holding the cleaned tag data.
# Only one assignment must be active; switch by (un)commenting.
# NOTE(review): the original code assigned "cleaned" and then immediately
# overwrote it with "clean_demo" — the first assignment was dead code.
# tag_storage_path = "cleaned"
tag_storage_path = "clean_demo"

# The name of the folder where the results are stored
generation_name = "tracks_4"

# bbox, bounding box, defines the latitude and longitude range for the analysis area.
bbox = {"latitude": [40, 56], "longitude": [-13, 5]}

The next cell contains parameters used to access data in GFTS; these are static.

# track_modes are the two types of track that have been computed for GFTS.
track_modes = ["mean", "mode"]

# Full S3 URL of the experiment root folder.
cloud_root = f"s3://{remote_path}"

# tag_root specifies the root URL for tag data used for this computation.
tag_root = f"{cloud_root}/{tag_storage_path}"

# The OVH object-store endpoint is used by both the direct s3fs client and the
# fsspec storage_options below; define it once so the two cannot drift apart.
S3_ENDPOINT = "https://s3.gra.perf.cloud.ovh.net"

# Authenticated S3 filesystem used to list tags and copy HTML files.
s3 = s3fs.S3FileSystem(
    anon=False,
    client_kwargs={
        "endpoint_url": S3_ENDPOINT,
    },
)


# storage_options specifies options for the filesystem storing and/or opening output files.
storage_options = {
    "anon": False,
    # 'profile' : "gfts",
    "client_kwargs": {
        "endpoint_url": S3_ENDPOINT,
        "region_name": "gra",
    },
}

The next step will list all tags which contain computed results.

# Tag list is the list of available tags

# List the result folder and strip the common prefix to recover the tag ids;
# empty names (e.g. the folder entry itself) are discarded.
prefix = f"{remote_path}/{generation_name}/"
tag_list_ = s3.ls(f"{remote_path}/{generation_name}")
tag_list = [name for name in (entry.replace(prefix, "") for entry in tag_list_) if name]


# scratch_root specifies the root directory where are GFTS computation data stored.
scratch_root = f"{cloud_root}/{generation_name}"

Define Plotting functions#

Functions to plot the different visualization for a given tag id

def get_traj(tag_id="CB_A11071"):
    """Load the computed mean/mode trajectories of ``tag_id`` as pandas DataFrames."""
    from pangeo_fish.io import read_trajectories

    # Fetch both track types ("mean" and "mode") from the experiment's S3 folder.
    trajectories = read_trajectories(
        track_modes, f"{scratch_root}/{tag_id}", storage_options, format="parquet"
    )

    # Expose the two trajectories as plain DataFrames so the data is easy to access.
    mean_df, mode_df = (traj.df for traj in trajectories.trajectories[:2])
    return mean_df, mode_df
def plot_time_series(mean_df, mode_df, tag_id="CB_A11071"):
    """Plot the tag's temperature and depth logs plus the estimated latitude
    and longitude over time.

    Parameters
    ----------
    mean_df, mode_df : pandas.DataFrame
        Mean and mode trajectories as returned by ``get_traj``.
    tag_id : str
        Tag identifier; used to open its DST log and to label the plots.

    Returns
    -------
    A holoviews layout with four stacked panels: temperature, depth,
    latitude, longitude.
    """
    import hvplot.xarray  # noqa
    import pandas as pd
    import xarray as xr
    from pangeo_fish.io import open_tag
    from pangeo_fish.tags import to_time_slice

    tag = open_tag(tag_root, tag_id)
    time_slice = to_time_slice(tag["tagging_events/time"])

    # Restrict the DST log to the tagging-event time window.
    tag_log = tag["dst"].ds.sel(time=time_slice)

    # Following part is not optimal, need optimisation
    #
    # Creating pandas series for xarray dataset
    mean_lon_ = pd.Series(mean_df.geometry.x, name="longitude")
    mean_lat_ = pd.Series(mean_df.geometry.y, name="latitude")
    mode_lon_ = pd.Series(mode_df.geometry.x, name="longitude")
    mode_lat_ = pd.Series(mode_df.geometry.y, name="latitude")

    # Creating xarray datasets
    mean_coords = xr.Dataset(pd.concat([mean_lon_, mean_lat_], axis=1))
    mode_coords = xr.Dataset(pd.concat([mode_lon_, mode_lat_], axis=1))

    # Assigning dataarrays to variables
    mean_lon = mean_coords["longitude"]
    mean_lat = mean_coords["latitude"]
    mode_lon = mode_coords["longitude"]
    mode_lat = mode_coords["latitude"]

    tag_log["depth"] = tag_log["pressure"]
    temp_plot = tag_log["temperature"].hvplot(
        color="Red",
        title=f"{tag_id} , Temperature (°C)",
        grid=True,
        height=200,
        width=600,
    )
    depth_plot = (-tag_log["depth"]).hvplot(
        color="Blue", title="Depth (m)", grid=True, height=200, width=600
    )
    # BUG FIX: each clim now uses its own series' min/max (previously the upper
    # bound of the mode-latitude, mean-longitude and mode-longitude plots was
    # taken from mean_lat_), and the lat/lon plot variables are no longer
    # swapped relative to their titles.
    lat_plot = (
        mean_lat.hvplot(
            label="mean", clim=[mean_lat_.min(), mean_lat_.max()], dynamic=True
        )
        * mode_lat.hvplot(
            label="mode", clim=[mode_lat_.min(), mode_lat_.max()], dynamic=True
        )
    ).opts(height=200, width=600, show_grid=True, title="Fish latitude over time")
    lon_plot = (
        mean_lon.hvplot(
            label="mean", clim=[mean_lon_.min(), mean_lon_.max()], dynamic=True
        )
        * mode_lon.hvplot(
            label="mode", clim=[mode_lon_.min(), mode_lon_.max()], dynamic=True
        )
    ).opts(height=200, width=600, show_grid=True, title="Fish longitude over time")
    print("fini time series plot", tag_id)
    # Same visual order as before: temperature, depth, latitude, longitude.
    return (temp_plot + depth_plot + lat_plot + lon_plot).cols(1)
def plot_track(mean_df, mode_df, tag_id="CB_A11071"):
    """Plot the mean and mode tracks of ``tag_id`` on a map, coloured by month."""
    import hvplot.pandas  # noqa
    import movingpandas as mpd
    import pandas as pd

    # sigma parameter of this run, read from the stored parameters file and
    # shown in the plot titles.
    run_params = pd.read_json(f"{scratch_root}/{tag_id}/parameters.json").to_dict()[0]
    sigma = run_params["sigma"]

    ## Following part is not optimal, need some optimisation.
    # Annotate each position with its month so the colormap encodes seasonality.
    for frame in (mean_df, mode_df):
        frame["month"] = frame.index.month

    # Rebuild MovingPandas trajectories from the annotated DataFrames.
    trajectories = mpd.TrajectoryCollection(
        [
            mpd.Trajectory(frame, traj_id=frame.traj_id.drop_duplicates().values[0])
            for frame in (mean_df, mode_df)
        ]
    )
    print("updated trajectories")
    traj_plots = [
        traj.hvplot(
            c="month",
            tiles="CartoLight",
            cmap="rainbow",
            title=f"{tag_id} , {traj.id}, {sigma}",
            width=375,
            height=375,
            dynamic=True,
        )
        for traj in trajectories.trajectories
    ]

    return (traj_plots[0] + traj_plots[1]).cols(1)
def plot_emission(tag_id="CB_A11071", time_range=("2015-09-04", "2015-09-10")):
    """Plot the state and emission probability maps for ``tag_id``.

    Parameters
    ----------
    tag_id : str
        Tag identifier whose computed results are loaded from ``scratch_root``.
    time_range : tuple of str
        (start, end) dates selecting the period to display. Previously this
        window was hard-coded; the default keeps the original behaviour.
    """
    import xarray as xr
    from pangeo_fish import visualization

    ## Might not work if dask involved or slider involved
    emission = xr.open_dataset(
        f"{scratch_root}/{tag_id}/combined.zarr",
        engine="zarr",
        chunks={},
        inline_array=True,
        storage_options=storage_options,
    ).rename_vars({"pdf": "emission"})

    # Mask the states with the same land/ocean mask as the emission data.
    states = xr.open_dataset(
        f"{scratch_root}/{tag_id}/states.zarr",
        engine="zarr",
        chunks={},
        inline_array=True,
        storage_options=storage_options,
    ).where(emission["mask"])

    data = xr.merge([states, emission.drop_vars(["mask"])])
    time_sel = slice(*time_range)
    plot1 = visualization.plot_map(
        data["states"].sel(time=time_sel), bbox, cmap="cool"
    ).opts(height=350, width=600)
    plot2 = visualization.plot_map(
        data["emission"].sel(time=time_sel), bbox, cmap="cool"
    ).opts(height=350, width=600)
    return (plot1 + plot2).cols(1)
def get_plot(tag_id="CB_A11071"):
    """Build the combined time-series + track figure for ``tag_id``."""
    mean_df, mode_df = get_traj(tag_id)
    panels = [
        plot_time_series(mean_df, mode_df, tag_id),
        plot_track(mean_df, mode_df, tag_id),
    ]
    # Emission maps are intentionally left out; see plot_emission.
    return (panels[0] + panels[1]).cols(2)
def get_plot_in_html(tag_id="CB_A11071", make_html=True):
    """Render the figure for ``tag_id`` to a local HTML file, then upload it to S3.

    When ``make_html`` is False, skip rendering and only upload the existing file.
    """
    import hvplot
    from bokeh.resources import INLINE

    fname = f"{generation_name}/{tag_id}.html"
    if make_html:
        mean_df, mode_df = get_traj(tag_id)
        fig = (
            plot_time_series(mean_df, mode_df, tag_id)
            + plot_track(mean_df, mode_df, tag_id)
        ).cols(2)
        print(fname)
        # INLINE resources make the exported HTML self-contained (no CDN needed).
        hvplot.save(fig, fname, resources=INLINE)
    # Copy the html to s3 here
    s3.put(fname, (f"{remote_path}/{generation_name}/{tag_id}/track.html"))
    return  # fig
def process_tag(tag_id):
    """Build and upload the HTML page for one tag; return True on success, False on error."""
    print(tag_id)
    try:
        get_plot_in_html(tag_id)  # ,make_html=False)
    except KeyError as e:
        # Typically a missing variable in the stored results.
        print(f"KeyError encountered for tag {tag_id}: {e}")
        return False
    except Exception as e:
        # Best-effort batch processing: report and move on.
        print(f"Other error encountered for tag {tag_id}: {e}")
        return False
    return True

Create html file for each tag_id, and list failed tag_id’s#

Activate the next ‘raw’ cell to create new html files. Please verify the parameters so that you do not overwrite the html files already created!

Visualise diag using html files created.#

First, list all the uploaded tag_ids in the GFTS.

# List every tag_id available in the GFTS tag storage (last path component).
print(tag_root)
tag_ids = [entry.rsplit("/", 1)[-1] for entry in s3.ls(tag_root)]

Plot 10 by 10#

# Summarise how many tags have computed results and explain how to page
# through them using id_start/id_end.
print(
    "You have in total ",
    len(tag_ids),
    "tag_id's stored.  In next cell, we will plot",
    "result from ",
    remote_path,
    generation_name,
)
print(
    'Next cell, update the parameter "id_start" and "id_end" from 0 to ',
    len(tag_ids),
    "so you can plot 10 by 10",
)
# Inclusive indices of the batch of tags displayed by the last cell.
id_start = 0
id_end = 9
print(
    "plot in total",
    len(tag_ids[id_start : id_end + 1]),
    "next cell will plot computed result of tag_id's",
    tag_ids[id_start : id_end + 1],
)

If you see an error message when fetching the html file, it means that tag_id does not have a computed result.

generation_name = "tracks_4"
local = False
for tag_id in tag_ids[id_start : id_end + 1]:
    if local:
        fname = f"{generation_name}/{tag_id}.html"
        display(IFrame(src=fname, width=1000, height=1000))
    else:
        fname = f"{remote_path}/{generation_name}/{tag_id}/track.html"
        print("visualising", fname)
        # Attempt to read the HTML file content from S3 (cached in memory)
        try:
            with s3.open(fname, "r") as f:
                html_content = f.read()  # Read the HTML content as a string
            print("File fetched and cached successfully.")

            # Display the HTML content in Jupyter Notebook
            try:
                display(HTML(html_content))  # Display the HTML content in the notebook
            except Exception as e:
                print(f"Error displaying HTML: {e}")

        except Exception as e:
            print(f"Error fetching file: {e}")
            continue  # Continue with the next iteration of the loop