Source code for ctdfjorder.CTD

from ctdfjorder.exceptions.exceptions import CTDError, Critical
from ctdfjorder.exceptions.exceptions import raise_warning_calculatuion
from ctdfjorder.metadata.master_sheet import MasterSheet
from ctdfjorder.constants.constants import *
from ctdfjorder.loadctd.rsk import load_file_rsk
from ctdfjorder.loadctd.castaway import load_file_castaway
from ctdfjorder.utils import utils
from ctdfjorder.ai import ai
import pandas as pd
from sqlite3 import OperationalError
from os import path
import numpy as np
import polars as pl
import gsw
from typing import Any, Union
import logging
import warnings

warnings.filterwarnings("ignore")
logger = logging.getLogger("ctdfjorder")
logger.propagate = 0



[docs]
class CTD:
    """
    Object representing a single profile.

    Parameters
    ----------

    ctd_file_path : str
        The file path to the RSK or Castaway file.
    cached_master_sheet : masterSheet, default None
        CTDFjorder's internal representation of a master sheet.
    master_sheet_path : str, default None
        Path to a master sheet.
    plot : bool, default False
        If true saves plots to 'plots' folder in working directory.

    Examples
    --------
    Castaway CTD profile with valid data

    >>> ctd_data = CTD('CC1531002_20181225_114931.csv')
    >>> output = ctd_data.get_df()
    >>> print(output.head(3))
    shape: (3, 13)
    ┌──────────────┬──────────┬─────────────┬──────────────┬───┬────────────┬───────────────────────────────┬────────────┬────────────┐
    │ sea_pressure ┆ depth    ┆ temperature ┆ conductivity ┆ … ┆ profile_id ┆ filename                      ┆ latitude   ┆ longitude  │
    │ ---          ┆ ---      ┆ ---         ┆ ---          ┆   ┆ ---        ┆ ---                           ┆ ---        ┆ ---        │
    │ f64          ┆ f64      ┆ f64         ┆ f64          ┆   ┆ i32        ┆ str                           ┆ f64        ┆ f64        │
    ╞══════════════╪══════════╪═════════════╪══════════════╪═══╪════════════╪═══════════════════════════════╪════════════╪════════════╡
    │ 0.15         ┆ 0.148676 ┆ 0.32895     ┆ 28413.735648 ┆ … ┆ 0          ┆ CC1531002_20181225_114931.csv ┆ -64.668455 ┆ -62.641775 │
    │ 0.45         ┆ 0.446022 ┆ 0.316492    ┆ 28392.966662 ┆ … ┆ 0          ┆ CC1531002_20181225_114931.csv ┆ -64.668455 ┆ -62.641775 │
    │ 0.75         ┆ 0.743371 ┆ 0.310613    ┆ 28386.78011  ┆ … ┆ 0          ┆ CC1531002_20181225_114931.csv ┆ -64.668455 ┆ -62.641775 │
    └──────────────┴──────────┴─────────────┴──────────────┴───┴────────────┴───────────────────────────────┴────────────┴────────────┘

    Castaway CTD profile with no data

    >>> ctd_data = CTD('CC1627007_20191220_195931.csv') # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    ctdfjorder.CTDError: CC1627007_20191220_195931.csv - No samples in file

    Raises
    ------
    CTDError
        For ctdfjorder related errors

    """

    # Initialization Constants
    _data: pl.DataFrame = pl.DataFrame()
    _cached_master_sheet: MasterSheet = None
    _filename: str = None
    _filepath: str = None
    _cwd: str = None
    master_sheet_path: str = None
    _add_unique_id: bool = False
    _num_profiles: int = 0
    _mld_col_labels: list[str] = []
    _plot: bool = False

    def __init__(
        self,
        ctd_file_path: str,
        cached_master_sheet: MasterSheet = None,
        master_sheet_path=None,
        plot=False,
    ):
        # Define instance vars, load master sheet if path given and master sheet is not cached
        self._filename = path.basename(ctd_file_path)
        if type(self._cached_master_sheet) is type(None) and master_sheet_path:
            try:
                self._cached_master_sheet = MasterSheet(master_sheet_path)
            except Critical:
                pass
        else:
            self._cached_master_sheet = cached_master_sheet
        self.master_sheet_path = master_sheet_path
        self._cwd = utils.get_cwd()
        self._plot = plot

        # Processing RSK Files
        if RSK_FILE_MARKER in ctd_file_path:
            try:
                self._data = load_file_rsk(ctd_file_path)
            except OperationalError:
                raise CTDError(filename=self._filename, message=ERROR_RSK_CORRUPT)
        # Processing Castaway Files
        elif CASTAWAY_FILE_MARKER in ctd_file_path:
            self._data = load_file_castaway(ctd_file_path)
        else:
            raise CTDError(filename=self._filename, message=ERROR_CTD_FILENAME_ENDING)

        # Checking if data is empty
        if self._data.is_empty():
            raise CTDError(filename=self._filename, message=ERROR_NO_SAMPLES)

        # Adding year and month columns
        self._data = self._data.with_columns(
            pl.col(TIMESTAMP_LABEL)
            .dt.convert_time_zone(TIME_ZONE)
            .cast(pl.Datetime(time_unit=TIME_UNIT, time_zone=TIME_ZONE))
        )
        self._data = self._data.with_columns(
            pl.col(TIMESTAMP_LABEL).dt.year().alias(YEAR_LABEL),
            pl.col(TIMESTAMP_LABEL).dt.month().alias(MONTH_LABEL),
        )

        # If master sheet or cached master sheet is present, find the matching information and correct missing location
        if self._cached_master_sheet:
            self._data = self._data.with_columns(
                pl.lit(None, dtype=pl.String).alias(UNIQUE_ID_LABEL),
                pl.lit(None, dtype=pl.Float32).alias(SECCHI_DEPTH_LABEL),
                pl.lit(None, dtype=pl.String).alias(SITE_NAME_LABEL),
                pl.lit(None, dtype=pl.String).alias(SITE_ID_LABEL),
            )
            for profile_id in (
                self._data.select(PROFILE_ID_LABEL)
                .unique(keep="first")
                .to_series()
                .to_list()
            ):
                profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)

                # Find master sheet match
                master_sheet_match = self._cached_master_sheet.find_match(profile)
                # Add secchi depth and Unique ID
                profile = profile.with_columns(
                    pl.lit(master_sheet_match.unique_id, dtype=pl.String).alias(
                        UNIQUE_ID_LABEL
                    ),
                    pl.lit(master_sheet_match.secchi_depth)
                    .cast(pl.Float32)
                    .alias(SECCHI_DEPTH_LABEL),
                    pl.lit(master_sheet_match.site_name)
                    .cast(pl.String)
                    .alias(SITE_NAME_LABEL),
                    pl.lit(master_sheet_match.site_id)
                    .cast(pl.String)
                    .alias(SITE_ID_LABEL),
                )
                # Add location data if not present
                if (
                    LATITUDE_LABEL not in profile.columns
                    or (
                        profile.select(pl.col(LATITUDE_LABEL).has_nulls()).item()
                        or profile.select(pl.col(LATITUDE_LABEL).is_nan().any()).item()
                    )
                    or profile.select(pl.col(LATITUDE_LABEL)).is_empty()
                    or profile.select(pl.col(LATITUDE_LABEL)).limit(1).item() is None
                ):
                    self._data = self._data.with_columns(
                        pl.lit(None, dtype=pl.Float64).alias(LATITUDE_LABEL),
                        pl.lit(None, dtype=pl.Float64).alias(LONGITUDE_LABEL),
                    )
                    latitude = master_sheet_match.latitude
                    longitude = master_sheet_match.longitude
                    if (
                        latitude is None
                        or longitude is None
                        or np.isnan(latitude)
                        or np.isnan(longitude)
                    ):
                        latitude = None
                        longitude = None
                    new_fname = self._filename + "cm"
                    profile = profile.with_columns(
                        pl.lit(latitude).cast(pl.Float64).alias(LATITUDE_LABEL),
                        pl.lit(longitude).cast(pl.Float64).alias(LONGITUDE_LABEL),
                        pl.lit(new_fname, pl.String).alias("filename"),
                    )
                self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
                self._data = self._data.vstack(profile)


[docs]
    def get_df(self, pandas=False) -> pl.DataFrame | Any:
        """
        Returns the dataframe of the CTD object for integration with custom pipelines.

        Parameters
        ----------
        pandas : bool, default False
            If True returns a pandas df, if False returns a polars DataFrame. Defaults to False.

        Examples
        --------
        Accessing CTD data as a polars dataframe

        >>> from ctdfjorder import CTD
        >>> ctd_data = CTD('CC1531002_20181225_114931.csv')
        >>> ctd_data.remove_non_positive_samples()
        >>> output = ctd_data.get_df()
        >>> print(output.head(3))
        shape: (3, 13)
        ┌──────────────┬──────────┬─────────────┬──────────────┬───┬────────────┬───────────────────────────────┬────────────┬────────────┐
        │ sea_pressure ┆ depth    ┆ temperature ┆ conductivity ┆ … ┆ profile_id ┆ filename                      ┆ latitude   ┆ longitude  │
        │ ---          ┆ ---      ┆ ---         ┆ ---          ┆   ┆ ---        ┆ ---                           ┆ ---        ┆ ---        │
        │ f64          ┆ f64      ┆ f64         ┆ f64          ┆   ┆ i32        ┆ str                           ┆ f64        ┆ f64        │
        ╞══════════════╪══════════╪═════════════╪══════════════╪═══╪════════════╪═══════════════════════════════╪════════════╪════════════╡
        │ 0.15         ┆ 0.148676 ┆ 0.32895     ┆ 28413.735648 ┆ … ┆ 0          ┆ CC1531002_20181225_114931.csv ┆ -64.668455 ┆ -62.641775 │
        │ 0.45         ┆ 0.446022 ┆ 0.316492    ┆ 28392.966662 ┆ … ┆ 0          ┆ CC1531002_20181225_114931.csv ┆ -64.668455 ┆ -62.641775 │
        │ 0.75         ┆ 0.743371 ┆ 0.310613    ┆ 28386.78011  ┆ … ┆ 0          ┆ CC1531002_20181225_114931.csv ┆ -64.668455 ┆ -62.641775 │
        └──────────────┴──────────┴─────────────┴──────────────┴───┴────────────┴───────────────────────────────┴────────────┴────────────┘

        Accessing CTD data as a pandas dataframe

        >>> from ctdfjorder import CTD
        >>> ctd_data = CTD('CC1531002_20181225_114931.csv')
        >>> ctd_data.remove_non_positive_samples()
        >>> output = ctd_data.get_df(pandas=True)
        >>> print(output.head(3))
           sea_pressure     depth  temperature  conductivity  specific_conductivity  ...  pressure  profile_id                       filename   latitude  longitude
        0          0.15  0.148676      0.32895  28413.735648           56089.447456  ...   10.2825           0  CC1531002_20181225_114931.csv -64.668455 -62.641775
        1          0.45  0.446022     0.316492  28392.966662           56076.028991  ...   10.5825           0  CC1531002_20181225_114931.csv -64.668455 -62.641775
        2          0.75  0.743371     0.310613   28386.78011           56076.832208  ...   10.8825           0  CC1531002_20181225_114931.csv -64.668455 -62.641775
        [3 rows x 13 columns]

        Returns
        -------
        pl.DataFrame | pd.DataFrame
            CTD data in pandas when pandas=True, polars when pandas=False.

        Notes
        -----
        There is no supported method to reinsert the dataframe back into the :class:`CTD` object. Any changes made on
        this dataframe will not be reflected in the :class:`CTD` objects internal data.

        """
        # Convert each DataFrame to a DataFrame and collect them in a list
        if pandas:
            return self._data.to_pandas(use_pyarrow_extension_array=True)
        else:
            return self._data


    def _is_empty(self, func: str) -> bool:
        if self._data.is_empty():
            raise CTDError(
                filename=self._filename,
                message=f"No valid samples in file after running {func}",
            )
        return True


[docs]
    def remove_upcasts(self) -> None:
        r"""
        Removes upcasts by dropping rows where pressure decreases from one sampling event to the next.

        Notes
        -----
        An upcast in CTD (Conductivity, Temperature, Depth) profiles occurs when the sensor package is
        moved upward through the water column, causing a decrease in pressure readings. This method
        identifies and removes such events by ensuring pressure monotonically increases within each profile.

        The procedure is as follows:

        1. For each unique profile identified by `profile_id`, the method extracts the profile's data.
        2. It then computes the difference in pressure between consecutive samples within the profile.
        3. Rows where the pressure difference is not positive are removed, indicating a non-increasing
           pressure (i.e., an upcast or no movement).
        4. The cleaned profile is then reintegrated into the main dataset, replacing the original data.

        Let :math:`p_i` be the pressure at the :math:`i`-th sampling event. The condition for retaining a
        data point is given by:

        .. math::

            \Delta p_i = p_{i} - p_{i-1} > 0 \quad \text{for} \quad i = 1, 2, \ldots, N

        where :math:`N` is the total number of sampling events in the profile. Rows not satisfying this
        condition are considered upcasts and are removed.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.remove_upcasts()
        >>> # This will clean the dataset by removing upcasts, ensuring all profiles have monotonically
        >>> # increasing pressure readings.

        See Also
        --------
        CTD : To initialize an object

        """
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            profile = profile.filter((pl.col(PRESSURE_LABEL).diff()) > 0.0)
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.remove_upcasts.__name__)



[docs]
    def filter_columns_by_range(
        self,
        filters: zip = None,
        columns: list[str] = None,
        upper_bounds: list[float | int] = None,
        lower_bounds: list[float | int] = None,
    ):
        """
        Filters columns of the dataset based on specified upper and lower bounds.

        This method allows filtering of a dataset by applying specified upper and lower bounds
        on the columns. It processes the data profile by profile and updates the dataset accordingly.

        Parameters
        ----------
        filters : zip, optional
            An iterable of tuples, where each tuple contains a column name, an upper bound, and a lower bound.
            If provided, this takes precedence over the individual `columns`, `upper_bounds`, and `lower_bounds` parameters.
        columns : list of str, optional
            A list of column names to be filtered. Must be provided along with `upper_bounds` and `lower_bounds`.
        upper_bounds : list of float or int, optional
            A list of upper bound values corresponding to each column in `columns`.
        lower_bounds : list of float or int, optional
            A list of lower bound values corresponding to each column in `columns`.

        Notes
        -----
        The method performs the following steps for each unique profile identified by `profile_id`:

        1. Extracts the data for the profile using the `profile_id`.
        2. If `filters` is provided, it iterates over each filter tuple and applies the specified bounds
           to the relevant column, filtering out data that does not meet the criteria.
        3. If `columns`, `upper_bounds`, and `lower_bounds` are provided, it iterates over each column
           and applies the corresponding upper and lower bounds to filter the data.
        4. Updates the dataset by removing the original profile data and reintegrating the filtered profile data.
        5. Checks if the dataset is empty after filtering using the `_is_empty` method.

        The method is designed to be flexible, allowing filtering through either a comprehensive set of filters
        or by specifying individual columns and their bounds directly. This makes it adaptable to various
        dataset structures and filtering requirements.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> filters = zip(['temperature', 'salinity'], [20.0, 35.0], [10.0, 30.0])
        >>> ctd_data.filter_columns_by_range(filters=filters)
        >>> # This will filter the `temperature` column to be between 10.0 and 20.0, and `salinity` to be between 30.0 and 35.0.

        >>> columns = ['temperature', 'salinity']
        >>> upper_bounds = [20.0, 35.0]
        >>> lower_bounds = [10.0, 30.0]
        >>> ctd_data.filter_columns_by_range(columns=columns, upper_bounds=upper_bounds, lower_bounds=lower_bounds)
        >>> # This will filter the `temperature` column to be between 10.0 and 20.0, and `salinity` to be between 30.0 and 35.0.

        See Also
        --------
        remove_non_positive_samples : Method to remove rows with non-positive values for specific parameters.
        """

        def apply_bounds(profile, column, upper_bound, lower_bound):
            if upper_bound is not None:
                profile = profile.filter(pl.col(column) <= upper_bound)
            if lower_bound is not None:
                profile = profile.filter(pl.col(column) >= lower_bound)
            return profile

        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)

            if filters:
                for column, upper_bound, lower_bound in filters:
                    if column in profile.columns:
                        profile = apply_bounds(
                            profile, column, upper_bound, lower_bound
                        )
            elif columns and upper_bounds and lower_bounds:
                for column, upper_bound, lower_bound in zip(
                    columns, upper_bounds, lower_bounds
                ):
                    profile = apply_bounds(profile, column, upper_bound, lower_bound)

            self._data = self._data.filter(
                pl.col(PROFILE_ID_LABEL) != profile_id
            ).vstack(profile)

        self._is_empty(CTD.filter_columns_by_range.__name__)



[docs]
    def remove_non_positive_samples(self) -> None:
        r"""
        Removes rows with non-positive values for depth, pressure, practical salinity, absolute salinity, or density.

        Notes
        -----
        This method cleans the CTD (Conductivity, Temperature, Depth) dataset by removing any samples
        that have non-positive values for key parameters. Non-positive values in these parameters
        are generally invalid and indicate erroneous measurements.

        The procedure is as follows:

        1. For each unique profile identified by `profile_id`, the method extracts the profile's data.
        2. It then iterates over a predefined set of key parameters: depth, pressure, practical salinity,
           absolute salinity, and density.
        3. For each parameter present in the profile, it filters out rows where the parameter's value
           is non-positive, null, or NaN.
        4. The cleaned profile is then reintegrated into the main dataset, replacing the original data.

        Let :math:`( x_i )` represent the value of a parameter (depth, pressure, practical salinity, absolute salinity,
        or density) at the :math:`( i )`-th sampling event. The condition for retaining a data point is given by:

        .. math::

            x_i > 0 \quad \text{and} \quad x_i \neq \text{NaN} \quad \text{and} \quad x_i \neq \text{null}

        Rows not satisfying this condition for any of the parameters are removed.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.remove_non_positive_samples()
        >>> # This will clean the dataset by removing samples with non-positive, null, or NaN values
        >>> # for the specified key parameters.

        See Also
        --------
        remove_invalid_salinity_values : method to remove salinity values < 10 PSU.

        """
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            cols = list(
                {
                    DEPTH_LABEL,
                    PRESSURE_LABEL,
                    SALINITY_LABEL,
                    SALINITY_ABS_LABEL,
                    DENSITY_LABEL,
                }.intersection(profile.collect_schema().names())
            )
            for col in cols:
                profile = profile.filter(
                    pl.col(col) > 0.0, ~pl.col(col).is_null(), pl.col(col).is_not_nan()
                )
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.remove_non_positive_samples.__name__)



[docs]
    def clean(self, method) -> None:
        r"""
        Applies data cleaning methods to the specified feature using the selected method.
        Supports cleaning practical salinity using the 'clean_salinity_ai' method.

        Parameters
        ----------
        method : str, default 'clean_salinity_ai'
            The cleaning method to apply. Currently, only 'clean_salinity_ai' is supported,
            which uses a GRU-based machine learning model to clean the salinity values.

        Raises
        ------
        CTDError
            When the cleaning method is invalid.

        Notes
        -----
        The 'clean_salinity_ai' method uses a Gated Recurrent Unit (GRU) model to correct
        salinity measurements. This model is designed to smooth out unrealistic fluctuations
        in salinity with respect to pressure.

        The procedure for 'clean_salinity_ai' is as follows:

        1. For each unique profile identified by `profile_id`, extract the profile's data.
        2. Bin the data every 0.5 dbar of pressure.
        3. Use a GRU model with a loss function that penalizes non-monotonic salinity increases with decreasing pressure.
        4. Train the model on the binned data to predict clean salinity values.
        5. Replace the original salinity values in the profile with the predicted clean values.
        6. Reintegration of the cleaned profile into the main dataset.

        The loss function :math:`( L )` used in training is defined as:

        .. math::

            L = \text{MAE}(y_{\text{true}}, y_{\text{pred}}) + \lambda \cdot \text{mean}(P)

        where :math:`( P )` are penalties for predicted salinity increases with decreasing pressure,
        and :math:`( \lambda )` is a weighting factor.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.clean('clean_salinity_ai')
        >>> # This will clean the salinity data using the AI-based method, correcting any unrealistic
        >>> # values and ensuring smoother transitions with respect to pressure.

        See Also
        --------
        _AI.clean_salinity_ai : Method used to clean salinity with 'clean_salinity_ai' option.

        """
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            if method == "clean_salinity_ai":
                profile = ai.AI.clean_salinity_ai(profile, profile_id)
            else:
                raise CTDError(
                    message="Method invalid for clean.", filename=self._filename
                )
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = pl.concat([self._data, profile], how=CONCAT_HOW, rechunk=True)
        self._is_empty(CTD.clean.__name__)



[docs]
    def add_absolute_salinity(self) -> None:
        r"""
        Calculates and adds absolute salinity to the CTD data using the TEOS-10 salinity conversion formula.

        Notes
        -----
        The `gsw.SA_from_SP` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_SA_from_SP.html>`__.

        This method computes the absolute salinity from practical salinity for each profile in the dataset
        using the TEOS-10 standard. Absolute salinity provides a more accurate representation of salinity
        by accounting for the variations in seawater composition.

        The procedure is as follows:

        1. Initialize a new column for absolute salinity in the dataset.
        2. For each unique profile identified by `profile_id`, extract the profile's data.
        3. Use the `gsw.conversions.SA_from_SP` function to compute absolute salinity from practical salinity.
        4. Update the profile with the computed absolute salinity values.
        5. Reintegration of the updated profile into the main dataset.

        The TEOS-10 formula for converting practical salinity :math:`( S_P )` to absolute salinity :math:`( S_A )` is used:

        .. math::

            S_A = f(S_P, p, \phi, \lambda)

        where :math:`( p )` is the sea pressure, :math:`( \phi )` is the latitude, and :math:`( \lambda )` is the longitude.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_absolute_salinity()
        >>> # This will add a new column with absolute salinity values to the dataset, calculated using the
        >>> # TEOS-10 formula.

        See Also
        --------
        gsw.conversions.SA_from_SP : Function used for the conversion from practical salinity to absolute salinity.

        """
        self._data = self._data.with_columns(
            pl.lit(None, dtype=pl.Float64).alias(SALINITY_ABS_LABEL)
        )
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            s = profile.select(pl.col(SALINITY_LABEL)).to_numpy()
            p = profile.select(pl.col(SEA_PRESSURE_LABEL)).to_numpy()
            lat = profile.select(pl.col(LATITUDE_LABEL)).to_numpy()
            long = profile.select(pl.col(LONGITUDE_LABEL)).to_numpy()
            salinity_abs_list = gsw.conversions.SA_from_SP(s, p, lat, long)
            salinity_abs = pl.Series(
                np.array(salinity_abs_list).flatten(), dtype=pl.Float64, strict=False
            ).to_frame(SALINITY_ABS_LABEL)
            profile = profile.with_columns(salinity_abs)
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.add_absolute_salinity.__name__)



[docs]
    def add_density(self):
        r"""
        Calculates and adds density to CTD data using the TEOS-10 formula.
        If absolute salinity is not present, it is calculated first.


        Notes
        -----
        The `gsw.rho_t_exact` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_rho_t_exact.html>`__.

        This method computes the density of seawater from absolute salinity, in-situ temperature,
        and sea pressure using the TEOS-10 standard. The density is a critical parameter for
        understanding the physical properties of seawater and its buoyancy characteristics.

        The procedure is as follows:

        1. Check if absolute salinity is already present in the dataset. If not, calculate it using `add_absolute_salinity()`.
        2. Initialize a new column for density in the dataset.
        3. For each unique profile identified by `profile_id`, extract the profile's data.
        4. Use the `gsw.density.rho_t_exact` function to compute density from absolute salinity, temperature, and pressure.
        5. Update the profile with the computed density values.
        6. Reintegration of the updated profile into the main dataset.

        The TEOS-10 formula for calculating density :math:`( \rho )` is used:

        .. math::

            \rho = f(S_A, T, p)

        where :math:`( S_A )` is the absolute salinity, :math:`( T )` is the in-situ temperature, and :math:`( p )` is the sea pressure.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_density()
        >>> # This will add a new column with density values to the dataset, calculated using the
        >>> # TEOS-10 formula.

        See Also
        --------
        gsw.density.rho_t_exact : Function used for the calculation of density.
        add_absolute_salinity : Method to add absolute salinity if it is not already present in the dataset.

        """
        if SALINITY_ABS_LABEL not in self._data.columns:
            self.add_absolute_salinity()
        self._data = self._data.with_columns(
            pl.lit(None).cast(pl.Float64).alias(DENSITY_LABEL)
        )
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            sa = profile.select(pl.col(SALINITY_ABS_LABEL)).to_numpy()
            t = profile.select(pl.col(TEMPERATURE_LABEL)).to_numpy()
            p = profile.select(pl.col(SEA_PRESSURE_LABEL)).to_numpy()
            density = pl.Series(
                np.array(gsw.density.rho_t_exact(sa, t, p)).flatten(),
                dtype=pl.Float64,
                strict=True,
            ).to_frame(DENSITY_LABEL)
            profile = profile.with_columns(density)
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.add_density.__name__)



[docs]
    def add_potential_density(self):
        r"""
        Calculates and adds potential density to the CTD data using the TEOS-10 formula.
        If absolute salinity is not present, it is calculated first.

        Notes
        -----
        The `gsw.sigma0` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_sigma0.html>`__.

        This method computes the potential density of seawater from absolute salinity and in-situ temperature
        using the TEOS-10 standard. Potential density is the density a parcel of seawater would have if
        it were adiabatically brought to the sea surface, which helps in understanding the stability and
        stratification of the water column.

        The procedure is as follows:

        1. Check if absolute salinity is already present in the dataset. If not, calculate it using `add_absolute_salinity()`.
        2. Initialize a new column for potential density in the dataset.
        3. For each unique profile identified by `profile_id`, extract the profile's data.
        4. Use the `gsw.sigma0` function to compute potential density from absolute salinity and temperature.
        5. Update the profile with the computed potential density values.
        6. Reintegration of the updated profile into the main dataset.

        The TEOS-10 formula for calculating potential density :math:`( \sigma_0 )` is used:

        .. math::

            \sigma_0 = f(S_A, T)

        where :math:`( S_A )` is the absolute salinity and :math:`( T )` is the in-situ temperature.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_potential_density()
        >>> # This will add a new column with potential density values to the dataset, calculated using the
        >>> # TEOS-10 formula.

        See Also
        --------
        gsw.sigma0 : Function used for the calculation of potential density.
        add_absolute_salinity : Method to add absolute salinity if it is not already present in the dataset.

        """
        self._data = self._data.with_columns(
            pl.lit(None, dtype=pl.Float64).alias(POTENTIAL_DENSITY_LABEL)
        )
        if SALINITY_ABS_LABEL not in self._data.columns:
            self.add_absolute_salinity()
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            sa = profile.select(pl.col(SALINITY_ABS_LABEL)).to_numpy()
            t = profile.select(pl.col(TEMPERATURE_LABEL)).to_numpy()
            potential_density = pl.Series(
                np.array(gsw.sigma0(sa, t)).flatten()
            ).to_frame(POTENTIAL_DENSITY_LABEL)
            profile = profile.with_columns(pl.Series(potential_density))
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.add_potential_density.__name__)



[docs]
    def add_surface_salinity_temp_meltwater(self, start=10.1325, end=12.1325):
        r"""
        Calculates the surface salinity, surface temperature, and meltwater fraction of a CTD profile.
        Adds these values to the CTD data.

        Parameters
        ----------
        start : float, default 10.1325
            Upper bound of surface pressure.
        end : float, default 12.1325
            Lower bound of surface pressure.

        Notes
        -----
        This method adds three new columns to the dataset: surface salinity, surface temperature, and
        meltwater fraction. The values are calculated as follows:

        - Surface temperature is the mean temperature from pressure `start` to `end`.
        - Surface salinity is the salinity value at the lowest pressure within the range from `start` to `end`.
        - Meltwater fraction is calculated using the formula from `Pan et. al 2019 <https://doi.org/10.1371/journal.pone.0211107>`__:

        .. math::

            \text{Meltwater fraction} = (-0.021406 \cdot S_0 + 0.740392) \cdot 100

        where :math:`( S_0 )` is the surface salinity.

        The procedure is as follows:

        1. Initialize new columns for surface salinity, surface temperature, and meltwater fraction in the dataset.
        2. For each unique profile identified by `profile_id`, extract the profile's data.
        3. Filter the data to include only the samples within the specified pressure range (`start` to `end`).
        4. Calculate the mean surface temperature, surface salinity, and meltwater fraction based on the filtered data.
        5. Update the profile with the computed values.
        6. Reintegration of the updated profile into the main dataset.

        Raises
        ------
        CTDError
            When there are no measurements within the specified pressure range for a profile.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_surface_salinity_temp_meltwater(start=10.1325, end=12.1325)
        >>> # This will add new columns with surface salinity, surface temperature, and meltwater fraction
        >>> # values to the dataset, calculated using the specified pressure range.

        See Also
        --------
        add_density : method to calculate derived density.
        add_potential_density : method to calculate derived potential density.

        """
        self._data = self._data.with_columns(
            pl.lit(None, dtype=pl.Float64).alias(SURFACE_SALINITY_LABEL),
            pl.lit(None, dtype=pl.Float64).alias(SURFACE_TEMPERATURE_LABEL),
            pl.lit(None, dtype=pl.Float64).alias(MELTWATER_FRACTION_LABEL),
        )
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            surface_data = profile.filter(
                pl.col(PRESSURE_LABEL) > start, pl.col(PRESSURE_LABEL) < end
            )
            if surface_data.is_empty():
                raise_warning_calculatuion(
                    filename=self._filename,
                    message=WARNING_CTD_SURFACE_MEASUREMENT,
                )
                self._is_empty(CTD.add_surface_salinity_temp_meltwater.__name__)
                continue
            surface_salinity = np.array(
                surface_data.select(pl.col(SALINITY_LABEL)).to_numpy()
            )
            surface_salinity = surface_salinity.item(0)
            surface_temperature = np.array(
                surface_data.select(pl.col(TEMPERATURE_LABEL).mean()).to_numpy()
            ).item(0)
            mwf = (-0.021406 * surface_salinity + 0.740392) * 100
            profile = profile.with_columns(
                pl.lit(surface_salinity).alias(SURFACE_SALINITY_LABEL),
                pl.lit(surface_temperature).alias(SURFACE_TEMPERATURE_LABEL),
                pl.lit(mwf).alias(MELTWATER_FRACTION_LABEL),
            )
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.add_surface_salinity_temp_meltwater.__name__)


    def add_speed_of_sound(self) -> None:
        """
        Calculates and adds sound speed to the CTD data using the TEOS-10 formula.

        This method computes the speed of sound in seawater, which is influenced by factors such as
        salinity, temperature, and pressure. The sound speed is a critical parameter for various
        oceanographic studies, particularly in understanding acoustic propagation.

        Notes
        -----
        The `gsw.sound_speed` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_sound_speed.html>`__.

        The speed of sound, :math:`c`, in seawater is calculated using the TEOS-10 equation:

        .. math::

            c = f(S_A, T, p)

        where:
        - :math:`S_A` is the absolute salinity,
        - :math:`T` is the in-situ temperature,
        - :math:`p` is the sea pressure.

        This method adds a new column for sound speed to the dataset, and it will overwrite any existing
        column for speed of sound.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_speed_of_sound()
        >>> # This will add a new column with sound speed values to the dataset, calculated using the TEOS-10 formula.
        """
        if SALINITY_ABS_LABEL not in self._data.columns:
            self.add_absolute_salinity()
        self._data = self._data.with_columns(
            pl.lit(None).cast(pl.Float64).alias(SPEED_OF_SOUND_LABEL)
        )
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            sa = profile.select(pl.col(SALINITY_ABS_LABEL)).to_numpy()
            t = profile.select(pl.col(TEMPERATURE_LABEL)).to_numpy()
            p = profile.select(pl.col(SEA_PRESSURE_LABEL)).to_numpy()
            sound_speed = pl.Series(
                np.array(gsw.sound_speed(sa, t, p)).flatten(),
                dtype=pl.Float64,
                strict=True,
            ).to_frame(SPEED_OF_SOUND_LABEL)
            profile = profile.with_columns(sound_speed)
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.add_speed_of_sound.__name__)


[docs]
    def add_potential_temperature(self, p_ref: Union[float, np.ndarray] = 0) -> None:
        r"""
        Calculates and adds potential temperature to the CTD data using the TEOS-10 formula.

        This method computes the potential temperature of seawater, which is the temperature
        a parcel of water would have if moved adiabatically to the sea surface pressure.

        Notes
        -----
        The `gsw.pt_from_t` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_pt_from_t.html>`__.

        This method adds a new column for potential temperature in the dataset.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_potential_temperature()
        >>> # This will add a new column with potential temperature values to the dataset, calculated using the TEOS-10 formula.
        """
        if SALINITY_ABS_LABEL not in self._data.columns:
            self.add_absolute_salinity()

        # Compute potential temperature across all profiles in one go
        sa = self._data.select(pl.col(SALINITY_ABS_LABEL)).to_numpy().flatten()
        t = self._data.select(pl.col(TEMPERATURE_LABEL)).to_numpy().flatten()
        p = self._data.select(pl.col(SEA_PRESSURE_LABEL)).to_numpy().flatten()

        # Calculate potential temperature for all data points
        potential_temperature_values = gsw.pt_from_t(sa, t, p, p_ref)

        # Add the calculated potential temperature to the dataframe
        self._data = self._data.with_columns(
            pl.Series(potential_temperature_values, dtype=pl.Float64).alias(
                "potential_temperature"
            )
        )

        self._is_empty(CTD.add_potential_temperature.__name__)



[docs]
    def add_mean_surface_density(self, start=10.1325, end=12.1325):
        """
        Calculates the mean surface density from the density values and adds it as a new column
        to the CTD data table. Requires absolute salinity and absolute density to be calculated first.

        Parameters
        ----------
        start : float, default 10.1325
            Upper bound of surface pressure.
        end : float, default 12.1325
            Lower bound of surface pressure.

        Notes
        -----
        This method adds a new column for the mean surface density to the dataset. The values are calculated as follows:

        - Mean surface density is computed as the mean of density values within the specified pressure range (`start` to `end`).

        The procedure is as follows:

        1. For each unique profile identified by `profile_id`, extract the profile's data.
        2. Filter the data to include only the samples within the specified pressure range (`start` to `end`).
        3. Calculate the mean surface density based on the filtered data.
        4. Update the profile with the computed mean surface density value.
        5. Reintegration of the updated profile into the main dataset.

        Raises
        ------
        CTDError
            When there are no measurements within the specified pressure range for a profile.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_mean_surface_density(start=10.1325, end=12.1325)
        >>> # This will add a new column with mean surface density values to the dataset, calculated using the
        >>> # specified pressure range.

        See Also
        --------
        add_absolute_salinity : Method to add absolute salinity if it is not already present in the dataset.
        add_density : Method to add density if it is not already present in the dataset.

        """
        # Filtering data within the specified pressure range
        self._data = self._data.with_columns(
            pl.lit(None).cast(pl.Float64).alias(SURFACE_DENSITY_LABEL)
        )
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            surface_data = profile.filter(
                pl.col(PRESSURE_LABEL) > start, pl.col(PRESSURE_LABEL) < end
            )
            surface_density = surface_data.select(pl.col(DENSITY_LABEL).mean()).item()
            profile = profile.with_columns(
                pl.lit(surface_density).alias(SURFACE_DENSITY_LABEL)
            )
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.add_mean_surface_density.__name__)



[docs]
    def add_conservative_temperature(self) -> None:
        """
        Calculates and adds conservative temperature to the CTD data using the TEOS-10 formula.

        This method computes the conservative temperature, which is a more accurate measure of heat content
        in seawater compared to potential temperature.

        Notes
        -----
        The `gsw.CT_from_t` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_CT_from_t.html>`__.

        This method adds a new column for conservative temperature in the dataset.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_conservative_temperature()
        >>> # This will add a new column with conservative temperature values to the dataset, calculated using the TEOS-10 formula.

        """
        if SALINITY_ABS_LABEL not in self._data.columns:
            self.add_absolute_salinity()

        sa = self._data.select(pl.col(SALINITY_ABS_LABEL)).to_numpy().flatten()
        t = self._data.select(pl.col(TEMPERATURE_LABEL)).to_numpy().flatten()

        conservative_temperature_values = gsw.CT_from_t(sa, t, 0)

        self._data = self._data.with_columns(
            pl.Series(conservative_temperature_values, dtype=pl.Float64).alias(
                "conservative_temperature"
            )
        )

        self._is_empty(CTD.add_conservative_temperature.__name__)



[docs]
    def add_dynamic_height(self, p_ref: Union[float, np.ndarray] = 0) -> None:
        r"""
        Calculates and adds dynamic height anomaly to the CTD data using the TEOS-10 formula.

        This method computes the dynamic height anomaly, which represents the geostrophic streamfunction
        that indicates the difference in horizontal velocity between the pressure at the measurement
        point (p) and a reference pressure (p_ref).

        Parameters
        ----------
        p_ref : Union[float, np.ndarray], optional
            Reference pressure, in dbar. The default is 0 dbar, corresponding to the sea surface.

        Notes
        -----
        The `gsw.geo_strf_dyn_height` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/pdf/geo_strf_dyn_height.pdf>`__.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_dynamic_height()
        >>> # This will add a new column with dynamic height values to the dataset, calculated using the TEOS-10 formula.
        """
        if SALINITY_ABS_LABEL not in self._data.columns:
            self.add_absolute_salinity()

        sa = self._data.select(pl.col(SALINITY_ABS_LABEL)).to_numpy().flatten()
        ct = (
            self._data.select(pl.col(CONSERVATIVE_TEMPERATURE_LABEL))
            .to_numpy()
            .flatten()
        )
        p = self._data.select(pl.col(SEA_PRESSURE_LABEL)).to_numpy().flatten()

        dynamic_height = gsw.geo_strf_dyn_height(sa, ct, p, p_ref)

        self._data = self._data.with_columns(
            pl.Series(dynamic_height, dtype=pl.Float64).alias("dynamic_height")
        )

        self._is_empty(CTD.add_dynamic_height.__name__)



[docs]
    def add_thermal_expansion_coefficient(self) -> None:
        r"""
        Calculates and adds thermal expansion coefficient to the CTD data using the TEOS-10 formula.

        The thermal expansion coefficient, :math:`\alpha`, is important for understanding how the volume of seawater
        changes with temperature at constant pressure. It is derived from absolute salinity, conservative temperature,
        and sea pressure.

        Notes
        -----
        The `gsw.alpha` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_alpha.html>`__.

        The thermal expansion coefficient is calculated using the following equation from TEOS-10:

        .. math::

            \alpha^\theta = -\frac{1}{\rho} \frac{\partial \rho}{\partial \theta} \bigg|_{S_A, P}

        where:
        - :math:`\rho` is the in-situ density of seawater,
        - :math:`\theta` is the conservative temperature,
        - :math:`S_{A}` is the absolute salinity,
        - :math:`P` is the sea pressure.

        This method adds a new column for the thermal expansion coefficient in the dataset. It ensures that absolute salinity
        and conservative temperature are present in the data, calculating them if necessary, before computing the thermal expansion coefficient.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_thermal_expansion_coefficient()
        >>> # This will add a new column with thermal expansion coefficient values to the dataset, calculated using the TEOS-10 formula.

        """
        if SALINITY_ABS_LABEL not in self._data.columns:
            self.add_absolute_salinity()
        if CONSERVATIVE_TEMPERATURE_LABEL not in self._data.columns:
            self.add_conservative_temperature()

        sa = self._data.select(pl.col(SALINITY_ABS_LABEL)).to_numpy().flatten()
        ct = (
            self._data.select(pl.col(CONSERVATIVE_TEMPERATURE_LABEL))
            .to_numpy()
            .flatten()
        )
        p = self._data.select(pl.col(SEA_PRESSURE_LABEL)).to_numpy().flatten()

        thermal_expansion_coefficient_values = gsw.alpha(sa, ct, p)

        self._data = self._data.with_columns(
            pl.Series(thermal_expansion_coefficient_values, dtype=pl.Float64).alias(
                "thermal_expansion_coefficient"
            )
        )

        self._is_empty(CTD.add_thermal_expansion_coefficient.__name__)



[docs]
    def add_haline_contraction_coefficient(self) -> None:
        r"""
        Calculates and adds haline contraction coefficient to the CTD data using the TEOS-10 formula.

        The haline contraction coefficient, :math:`\beta`, is important for understanding how the volume of seawater
        changes with salinity at constant temperature. It is derived from absolute salinity, conservative temperature,
        and sea pressure.

        Notes
        -----
        The `gsw.beta` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_beta.html>`__.

        The haline contraction coefficient is calculated using the following equation from TEOS-10:

        .. math::

            \beta^\theta = \frac{1}{\rho} \frac{\partial \rho}{\partial S_A} \bigg|_{\theta, P}

        where:
        - :math:`\rho` is the in-situ density of seawater,
        - :math:`S_A` is the absolute salinity,
        - :math:`\theta` is the conservative temperature,
        - :math:`P` is the sea pressure.

        This method adds a new column for the haline contraction coefficient in the dataset. It ensures that absolute salinity
        and conservative temperature are present in the data, calculating them if necessary, before computing the haline contraction coefficient.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_haline_contraction_coefficient()
        >>> # This will add a new column with haline contraction coefficient values to the dataset, calculated using the TEOS-10 formula.

        """
        if SALINITY_ABS_LABEL not in self._data.columns:
            self.add_absolute_salinity()
        if CONSERVATIVE_TEMPERATURE_LABEL not in self._data.columns:
            self.add_conservative_temperature()

        sa = self._data.select(pl.col(SALINITY_ABS_LABEL)).to_numpy().flatten()
        ct = (
            self._data.select(pl.col(CONSERVATIVE_TEMPERATURE_LABEL))
            .to_numpy()
            .flatten()
        )
        p = self._data.select(pl.col(SEA_PRESSURE_LABEL)).to_numpy().flatten()

        haline_contraction_coefficient_values = gsw.beta(sa, ct, p)

        self._data = self._data.with_columns(
            pl.Series(haline_contraction_coefficient_values, dtype=pl.Float64).alias(
                "haline_contraction_coefficient"
            )
        )

        self._is_empty(CTD.add_haline_contraction_coefficient.__name__)



[docs]
    def add_mld(
        self, reference: int, method: str = "potential_density_avg", delta: float = 0.05
    ):
        r"""
        Calculates and adds the mixed layer depth (MLD) using the density threshold method.

        Parameters
        ----------
        reference : int
            The reference depth for MLD calculation.
        method : str, default "potential_density_avg"
            The MLD calculation method. Options are "abs_density_avg" or "potential_density_avg".
        delta : float, default 0.05
            The change in density or potential density from the reference that would define the MLD in units of
            :math:`\frac{kg}{m^3}`.

        Notes
        -----
        The mixed layer depth (MLD) is calculated using the density threshold method, defined as the depth
        at which the density increases by a specified amount (delta) from the reference density. The reference
        density is calculated as the mean density up to the reference depth.

        The procedure is as follows:

        1. Initialize a new column for MLD in the dataset.
        2. For each unique profile identified by `profile_id`, extract the profile's data.
        3. Filter the data to include only the samples up to the reference depth.
        4. Calculate the reference density based on the chosen method:
           - "abs_density_avg": mean absolute density up to the reference depth.
           - "potential_density_avg": mean potential density up to the reference depth.
        5. Identify the MLD as the shallowest depth where the density exceeds the reference density by the specified delta.
        6. Update the profile with the calculated MLD value.
        7. Reintegration of the updated profile into the main dataset.

        The MLD equation is given by:

        .. math::

            \text{MLD} = \min(D + \Delta > D_r)

        where:

        * :math:`( D_r )` is the reference density, defined as the mean density up to the reference depth.
        * :math:`( D )` represents all densities in the profile.
        * :math:`( \Delta )` is the specified change in density (delta).

        Raises
        ------
        CTDError
            When the specified method is invalid.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_mld(reference=10, method="potential_density_avg", delta=0.05)
        >>> # This will add a new column with MLD values to the dataset, calculated using the specified method
        >>> # and parameters.

        See Also
        --------
        add_density : method to calculate derived density.
        add_potential_density : method to calculate derived potential density.

        """
        supported_methods = ["abs_density_avg", "potential_density_avg"]
        self._mld_col_labels.append(f"MLD_Ref:{reference}_(m)_Thresh_{delta}_(kg/m^3)")
        self._data = self._data.with_columns(
            pl.lit(None, dtype=pl.Float64).alias(self._mld_col_labels[-1])
        )
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            df_filtered = profile.filter(pl.col(DEPTH_LABEL) <= reference)
            if method == supported_methods[0]:
                reference_density = df_filtered.select(
                    pl.col(DENSITY_LABEL).mean()
                ).item()
                df_filtered = profile.filter(
                    pl.col(DENSITY_LABEL) >= reference_density + delta
                )
            elif method == supported_methods[1]:
                reference_density = df_filtered.select(
                    pl.col(POTENTIAL_DENSITY_LABEL).mean()
                ).item()
                df_filtered = profile.filter(
                    pl.col(POTENTIAL_DENSITY_LABEL) >= reference_density + delta
                )
            else:
                raise CTDError(
                    message=f'add_mld: Invalid method "{method}" not in {supported_methods}',
                    filename=self._filename,
                )
            mld = df_filtered.select(pl.col(DEPTH_LABEL).first()).item()
            profile = profile.with_columns(pl.lit(mld).alias(self._mld_col_labels[-1]))
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.add_mld.__name__)



[docs]
    def add_brunt_vaisala_squared(self):
        r"""
        Calculates buoyancy frequency squared and adds it to the CTD data.
        Requires potential density to be calculated first.

        This method computes the buoyancy frequency squared (also known as the Brunt-Väisälä frequency squared)
        for each profile in the dataset using the TEOS-10 standard. This parameter is essential for understanding
        the stability of the water column and its propensity to mix vertically.

        The procedure is as follows:

        1. Initialize new columns for buoyancy frequency squared and the mid-pressure values in the dataset.
        2. For each unique profile identified by `profile_id`, extract the profile's data.
        3. Use the `gsw.Nsquared` function to compute buoyancy frequency squared and mid-pressure values from
           absolute salinity, conservative temperature, pressure, and latitude.
        4. Update the profile with the computed buoyancy frequency squared and mid-pressure values.
        5. Reintegration of the updated profile into the main dataset.

        Notes
        -----
        The `gsw.Nsquared` function from the Gibbs SeaWater (GSW) Oceanographic Toolbox is utilized
        for this calculation. More information about this function can be found at the
        `TEOS-10 website <https://www.teos-10.org/pubs/gsw/html/gsw_Nsquared.html>`__.

        The buoyancy frequency squared :math:`( N^2 )` is calculated using the formula:

        .. math::

            N_2 = g_2 \cdot \frac{\beta \cdot d(SA) - \alpha \cdot d(CT)}{\text{specvol_local} \cdot dP}

        Note. This routine uses rho from "gsw_specvol", which is the
          computationally efficient 75-term expression for specific volume in
          terms of SA, CT and p (Roquet et al., 2015).
        Note also that the pressure increment, dP, in the above formula is in
          Pa, so that it is 104 times the pressure increment dp in dbar.

        Raises
        ------
        CTDError
            When buoyancy frequency could not be calculated due to a ValueError.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.add_brunt_vaisala_squared()
        >>> # This will add new columns with buoyancy frequency squared values and mid-pressure values to the dataset,
        >>> # calculated using the TEOS-10 formula.

        See Also
        --------
        gsw.Nsquared : Function used for the calculation of buoyancy frequency squared.

        """
        self._data = self._data.with_columns(
            pl.lit(None, dtype=pl.Float64).alias(BV_LABEL),
            pl.lit(None, dtype=pl.Float64).alias(P_MID_LABEL),
        )
        for profile_id in (
            self._data.select(PROFILE_ID_LABEL)
            .unique(keep="first")
            .to_series()
            .to_list()
        ):
            profile = self._data.filter(pl.col(PROFILE_ID_LABEL) == profile_id)
            sa = profile.select(pl.col(SALINITY_ABS_LABEL)).to_numpy().flatten()
            t = profile.select(pl.col(TEMPERATURE_LABEL)).to_numpy().flatten()
            p = profile.select(pl.col(SEA_PRESSURE_LABEL)).to_numpy().flatten()
            lat = profile.select(pl.col(LATITUDE_LABEL)).to_numpy().flatten()
            ct = gsw.CT_from_t(sa, t, p).flatten()
            try:
                n_2, p_mid = gsw.Nsquared(SA=sa, CT=ct, p=p, lat=lat)
            except ValueError:
                raise CTDError(
                    filename=self._filename,
                    message=f"Unable to calculate buoyancy frequency, likely due to lat = {lat}",
                )
            buoyancy_frequency = (
                pl.Series(np.array(n_2).flatten())
                .extend_constant(None, n=1)
                .to_frame(BV_LABEL)
            )
            p_mid = pl.Series(p_mid).extend_constant(None, n=1).to_frame(P_MID_LABEL)
            profile = profile.with_columns(
                pl.Series(buoyancy_frequency), pl.Series(p_mid)
            )
            self._data = self._data.filter(pl.col(PROFILE_ID_LABEL) != profile_id)
            self._data = self._data.vstack(profile)
        self._is_empty(CTD.add_brunt_vaisala_squared.__name__)



[docs]
    def save_to_csv(self, output_file: str, null_value: str | None):
        """
        Renames the columns of the CTD data table based on a predefined mapping and saves the
        data to the specified CSV file.

        Parameters
        ----------
        output_file : str
            The output CSV file path.
        null_value : str or None
            The value to represent null cells in the saved file.

        Notes
        -----
        This method will rename the columns of the CTD dataset based on a predefined mapping. After renaming,
        the dataset is saved to the specified CSV file. If a file with the same name already exists at the
        specified path, it will be overwritten.

        The procedure is as follows:

        1. Rename the columns of the CTD data table using a predefined mapping.
        2. Save the modified data table to the specified CSV file path.

        The predefined column mapping ensures that the column names in the output CSV file adhere to a specific
        naming convention or format required for further analysis or sharing.

        Raises
        ------
        IOError
            If there is an error in writing to the specified file path.

        Examples
        --------
        >>> ctd_data = CTD('example.csv')
        >>> ctd_data.save_to_csv(output_file='path/to/output.csv')
        >>> # This will rename the columns of the CTD dataset and save it to 'path/to/output.csv'.
        >>> # Any existing file with the same name at that location will be overwritten.

        See Also
        --------
        utils.save_to_csv : Utility function used to save the data to a CSV file.

        """
        utils.save_to_csv(self._data, output_file=output_file, null_value=null_value)