Source code for ctdfjorder.loadctd.aml

import polars as pl
from datetime import datetime
from os import path
from ctdfjorder.constants.constants import *
from ctdfjorder.exceptions.exceptions import CTDCorruptError


def _read_aml_sections(aml_file_path):
    """
    Split an AML CTD file into header, measurement metadata, and data rows.

    Parameters
    ----------
    aml_file_path : str
        The file path to the AML CTD file.

    Returns
    -------
    tuple
        ``(header, measurement_metadata, data)``. ``header`` maps every
        ``key=value`` line found before ``[MeasurementMetadata]`` to its
        stripped value string. ``measurement_metadata`` maps every
        ``key=value`` line found before ``[MeasurementData]`` to the list of
        its comma-separated, stripped values. ``data`` holds one list of
        stripped string fields per non-empty line after ``[MeasurementData]``.
    """
    header = {}
    measurement_metadata = {}
    data = []

    with open(aml_file_path, 'r') as file:
        # The three loops share a single file iterator, so each one resumes
        # on the line right after the section marker that ended the previous.
        for line in file:
            line = line.strip().rstrip(',')
            if line == '[MeasurementMetadata]':
                break
            if '=' in line:
                key, value = line.split('=', 1)
                header[key.strip()] = value.strip()

        for line in file:
            line = line.strip().rstrip(',')
            if line == '[MeasurementData]':
                break
            if '=' in line:
                key, value = line.split('=', 1)
                measurement_metadata[key.strip()] = [v.strip() for v in value.split(',')]

        for line in file:
            line = line.strip().rstrip(',')
            # Blank or comma-only lines (e.g. a trailing newline) carry no
            # measurement; keeping them would yield rows too short to index.
            if not line:
                continue
            data.append([v.strip() for v in line.split(',')])

    return header, measurement_metadata, data


def load_file_aml(aml_file_path) -> "pl.DataFrame":
    """
    Loads and processes an AML CTD file.

    The file is read as three consecutive sections: ``key=value`` header
    lines, a ``[MeasurementMetadata]`` section whose ``Columns`` entry names
    the data fields, and a ``[MeasurementData]`` section of comma-separated
    rows. Only the Date, Time, Conductivity, Temperature, Pressure,
    Sound Velocity and Depth fields are kept.

    Parameters
    ----------
    aml_file_path : str
        The file path to the AML CTD file.

    Returns
    -------
    pl.DataFrame
        The processed AML CTD data. Measurement columns are cast to Float64
        and renamed to the labels of the project constants (``CONDUCTIVITY``,
        ``TEMPERATURE``, ``PRESSURE``, ``SPEED_OF_SOUND``, ``DEPTH``); Date and
        Time are merged into ``TIMESTAMP``; ``SEA_PRESSURE``, ``FILENAME``,
        ``PROFILE_ID`` (always 0), ``LATITUDE`` and ``LONGITUDE`` are added.
        Latitude and longitude are null when absent from the header.

    Raises
    ------
    CTDCorruptError
        If the AML file cannot be opened or read, if any of the required
        columns is not listed under ``Columns`` in the measurement metadata,
        or if a data row has fewer fields than the required columns need.

    Notes
    -----
    Errors raised while parsing values (dates, times, numeric casts,
    latitude/longitude) are not wrapped and propagate to the caller.
    """
    filename = path.basename(aml_file_path)
    try:
        header, measurement_metadata, data = _read_aml_sections(aml_file_path)
    except Exception as e:
        raise CTDCorruptError(filename=filename) from e

    # Define the columns to keep
    desired_columns = ['Date', 'Time', 'Conductivity', 'Temperature', 'Pressure', 'Sound Velocity', 'Depth']

    # Locate the desired columns among those announced by the file,
    # preserving the file's own column order.
    columns = measurement_metadata.get('Columns', [])
    valid_indices = [i for i, col in enumerate(columns) if col in desired_columns]
    valid_columns = [columns[i] for i in valid_indices]

    # Every desired column is referenced by name further down, so a file
    # that lacks one cannot be processed at all.
    if any(col not in valid_columns for col in desired_columns):
        raise CTDCorruptError(filename=filename)

    # Filter out undesired columns from data
    try:
        filtered_data = [[row[i] for i in valid_indices] for row in data]
    except IndexError as e:
        # A row shorter than the announced column list is a truncated record.
        raise CTDCorruptError(filename=filename) from e

    # Create DataFrame with valid columns only
    df = pl.DataFrame(filtered_data, schema=valid_columns, orient='row')

    # Detect date format and parse accordingly
    if df['Date'].str.contains(r'^\d{1,2}/\d{1,2}/\d{4}$').all():
        df = df.with_columns(
            pl.col('Date').str.strptime(pl.Date, "%m/%d/%Y")
        )
    else:
        df = df.with_columns(
            pl.col('Date').str.strptime(pl.Date, "%Y-%m-%d")
        )

    # Check if 'Time' column is in 'mm:ss.s' format
    if df['Time'].str.contains(r'^\d{1,2}:\d{2}\.\d$').all():
        # Convert 'mm:ss.s' to 'hh:mm:ss.s'
        df = df.with_columns(
            (pl.lit("00:") + pl.col('Time')).alias('Time')
        )

    # Convert time and other columns
    df = df.with_columns([
        pl.col('Time').str.strptime(pl.Time, "%H:%M:%S.%f"),
        pl.col('Conductivity').alias(CONDUCTIVITY.label).cast(pl.Float64),
        pl.col('Temperature').alias(TEMPERATURE.label).cast(pl.Float64),
        pl.col('Pressure').alias(PRESSURE.label).cast(pl.Float64),
        pl.col('Sound Velocity').alias(SPEED_OF_SOUND.label).cast(pl.Float64),
        pl.col('Depth').alias(DEPTH.label).cast(pl.Float64)
    ]).drop(['Conductivity', 'Temperature', 'Pressure', 'Sound Velocity', 'Depth'])

    # Combine Date and Time into a single Timestamp column
    df = df.with_columns([
        (pl.col('Date').cast(pl.Datetime) + pl.col('Time').cast(pl.Duration)).alias(TIMESTAMP.label)
    ]).drop(['Date', 'Time'])

    # Convert conductivity from mS/cm to µS/cm
    df = df.with_columns([
        (pl.col(CONDUCTIVITY.label) * 1000).alias(CONDUCTIVITY.label)
    ])

    # Keep the recorded pressure as sea pressure, then derive the pressure
    # column by adding an assumed atmospheric pressure of 10.1325 dbar.
    df = df.with_columns([
        (pl.col(PRESSURE.label).alias(SEA_PRESSURE.label))
    ])
    df = df.with_columns((pl.col(SEA_PRESSURE.label) + 10.1325).alias(PRESSURE.label))

    # Handle missing latitude and longitude
    latitude = pl.lit(float(header['Latitude'])) if 'Latitude' in header else pl.lit(None)
    longitude = pl.lit(float(header['Longitude'])) if 'Longitude' in header else pl.lit(None)

    # Add metadata columns
    df = df.with_columns([
        pl.lit(filename).alias(FILENAME.label),
        pl.lit(0).alias(PROFILE_ID.label),
        latitude.alias(LATITUDE.label),
        longitude.alias(LONGITUDE.label),
    ])

    return df
def is_aml_file(filename):
    """
    Checks if the given filename follows the AML naming convention.

    A name matches when its first ten characters parse as a calendar date
    written as ``YYYY-MM-DD``; whatever follows those characters is ignored.

    Parameters
    ----------
    filename : str
        The filename to check.

    Returns
    -------
    bool
        True if the filename follows the AML naming convention, False otherwise.
    """
    date_format = "%Y-%m-%d"
    try:
        # strptime raises ValueError for anything that is not a valid date
        # in the expected layout, including names shorter than ten characters.
        datetime.strptime(filename[:10], date_format)
    except ValueError:
        return False
    else:
        return True