Garmin_Analyser/parsers/file_parser.py

"""File parser for various workout formats (FIT, TCX, GPX)."""
import logging
from pathlib import Path
from typing import Dict, Any, Optional, List
import pandas as pd
import numpy as np
try:
from fitparse import FitFile
except ImportError:
raise ImportError("fitparse package required. Install with: pip install fitparse")
from models.workout import WorkoutData, WorkoutMetadata, PowerData, HeartRateData, SpeedData, ElevationData, GearData
from config.settings import SUPPORTED_FORMATS, BikeConfig, INDOOR_KEYWORDS
from utils.gear_estimation import estimate_gear_series, compute_gear_summary
logger = logging.getLogger(__name__)


class FileParser:
    """Parser for workout files in various formats."""

    def __init__(self):
        """Initialize file parser."""
        pass

    def parse_file(self, file_path: Path) -> Optional[WorkoutData]:
        """Parse a workout file and return structured data.

        Args:
            file_path: Path to the workout file

        Returns:
            WorkoutData object or None if parsing failed
        """
        if not file_path.exists():
            logger.error(f"File not found: {file_path}")
            return None

        file_extension = file_path.suffix.lower()
        if file_extension not in SUPPORTED_FORMATS:
            logger.error(f"Unsupported file format: {file_extension}")
            return None

        try:
            if file_extension == '.fit':
                return self._parse_fit(file_path)
            elif file_extension == '.tcx':
                return self._parse_tcx(file_path)
            elif file_extension == '.gpx':
                return self._parse_gpx(file_path)
            else:
                logger.error(f"Parser not implemented for format: {file_extension}")
                return None
        except Exception as e:
            logger.error(f"Failed to parse file {file_path}: {e}")
            return None

    def _parse_fit(self, file_path: Path) -> Optional[WorkoutData]:
        """Parse FIT file format.

        Args:
            file_path: Path to FIT file

        Returns:
            WorkoutData object or None if parsing failed
        """
        try:
            fit_file = FitFile(str(file_path))

            # Extract session data
            session_data = self._extract_fit_session(fit_file)
            if not session_data:
                logger.error("No session data found in FIT file")
                return None

            # Extract record data (timestamp-based data)
            records = list(fit_file.get_messages('record'))
            if not records:
                logger.error("No record data found in FIT file")
                return None

            # Create DataFrame from records
            df = self._fit_records_to_dataframe(records)
            if df.empty:
                logger.error("No valid data extracted from FIT records")
                return None

            # Create metadata
            metadata = WorkoutMetadata(
                activity_id=str(session_data.get('activity_id', 'unknown')),
                activity_name=session_data.get('activity_name', 'Workout'),
                start_time=session_data.get('start_time', pd.Timestamp.now()),
                duration_seconds=session_data.get('total_timer_time', 0),
                distance_meters=session_data.get('total_distance'),
                avg_heart_rate=session_data.get('avg_heart_rate'),
                max_heart_rate=session_data.get('max_heart_rate'),
                avg_power=session_data.get('avg_power'),
                max_power=session_data.get('max_power'),
                avg_speed=session_data.get('avg_speed'),
                max_speed=session_data.get('max_speed'),
                elevation_gain=session_data.get('total_ascent'),
                elevation_loss=session_data.get('total_descent'),
                calories=session_data.get('total_calories'),
                sport=session_data.get('sport', 'cycling'),
                sub_sport=session_data.get('sub_sport'),
                is_indoor=session_data.get('is_indoor', False)
            )

            if not metadata.is_indoor and metadata.activity_name:
                metadata.is_indoor = any(
                    keyword in metadata.activity_name.lower() for keyword in INDOOR_KEYWORDS
                )

            # Create workout data
            workout_data = WorkoutData(
                metadata=metadata,
                raw_data=df
            )

            # Add processed data if available
            if not df.empty:
                workout_data.power = self._extract_power_data(df)
                workout_data.heart_rate = self._extract_heart_rate_data(df)
                workout_data.speed = self._extract_speed_data(df)
                workout_data.elevation = self._extract_elevation_data(df)
                workout_data.gear = self._extract_gear_data(df)

            return workout_data
        except Exception as e:
            logger.error(f"Failed to parse FIT file {file_path}: {e}")
            return None

    def _extract_fit_session(self, fit_file) -> Optional[Dict[str, Any]]:
        """Extract session data from FIT file.

        Args:
            fit_file: FIT file object

        Returns:
            Dictionary with session data
        """
        try:
            sessions = list(fit_file.get_messages('session'))
            if not sessions:
                return None

            session = sessions[0]
            data = {}
            for field in session:
                if field.name and field.value is not None:
                    data[field.name] = field.value
            return data
        except Exception as e:
            logger.error(f"Failed to extract session data: {e}")
            return None

    def _fit_records_to_dataframe(self, records) -> pd.DataFrame:
        """Convert FIT records to pandas DataFrame.

        Args:
            records: List of FIT record messages

        Returns:
            DataFrame with workout data
        """
        data = []
        for record in records:
            record_data = {}
            for field in record:
                if field.name and field.value is not None:
                    record_data[field.name] = field.value
            data.append(record_data)

        if not data:
            return pd.DataFrame()

        df = pd.DataFrame(data)

        # Convert timestamp to datetime
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.sort_values('timestamp')
            df = df.reset_index(drop=True)

        return df
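
    # The DataFrame built above keeps the raw FIT field names. The extractors
    # below look for 'power', 'heart_rate', 'speed', 'distance', and
    # 'altitude'/'elevation' columns; a minimal sketch of the expected shape
    # (illustrative values, not taken from a real file):
    #
    #   timestamp             power  heart_rate  speed  distance  altitude
    #   2025-01-01 10:00:00     180         140    8.3     120.0     210.5
    #   2025-01-01 10:00:01     185         141    8.4     128.4     210.6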

    def _extract_power_data(self, df: pd.DataFrame) -> Optional[PowerData]:
        """Extract power data from DataFrame.

        Args:
            df: DataFrame with workout data

        Returns:
            PowerData object or None
        """
        if 'power' not in df.columns:
            return None

        power_values = df['power'].dropna().tolist()
        if not power_values:
            return None

        return PowerData(
            power_values=power_values,
            estimated_power=[],  # Will be calculated later
            power_zones={}
        )

    def _extract_heart_rate_data(self, df: pd.DataFrame) -> Optional[HeartRateData]:
        """Extract heart rate data from DataFrame.

        Args:
            df: DataFrame with workout data

        Returns:
            HeartRateData object or None
        """
        if 'heart_rate' not in df.columns:
            return None

        hr_values = df['heart_rate'].dropna().tolist()
        if not hr_values:
            return None

        return HeartRateData(
            heart_rate_values=hr_values,
            hr_zones={},
            avg_hr=np.mean(hr_values),
            max_hr=np.max(hr_values)
        )

    def _extract_speed_data(self, df: pd.DataFrame) -> Optional[SpeedData]:
        """Extract speed data from DataFrame.

        Args:
            df: DataFrame with workout data

        Returns:
            SpeedData object or None
        """
        if 'speed' not in df.columns:
            return None

        speed_values = df['speed'].dropna().tolist()
        if not speed_values:
            return None

        # Convert m/s to km/h if needed
        if max(speed_values) < 50:  # Likely m/s
            speed_values = [s * 3.6 for s in speed_values]

        # Calculate distance if available
        distance_values = []
        if 'distance' in df.columns:
            distance_values = df['distance'].dropna().tolist()
            # Convert to km if in meters
            if distance_values and max(distance_values) > 1000:
                distance_values = [d / 1000 for d in distance_values]

        return SpeedData(
            speed_values=speed_values,
            distance_values=distance_values,
            avg_speed=np.mean(speed_values),
            max_speed=np.max(speed_values),
            total_distance=distance_values[-1] if distance_values else None
        )
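
    # Quick check of the unit heuristic above: FIT record speeds are typically
    # reported in m/s, so a series whose maximum is below 50 is assumed to be
    # m/s and converted, e.g. 8.3 m/s * 3.6 = 29.88 km/h. A series whose
    # maximum is 50 or more is treated as already being in km/h.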

    def _extract_elevation_data(self, df: pd.DataFrame) -> Optional[ElevationData]:
        """Extract elevation data from DataFrame.

        Args:
            df: DataFrame with workout data

        Returns:
            ElevationData object or None
        """
        if 'altitude' not in df.columns and 'elevation' not in df.columns:
            return None

        # Use 'altitude' or 'elevation' column
        elevation_col = 'altitude' if 'altitude' in df.columns else 'elevation'
        elevation_values = df[elevation_col].dropna().tolist()
        if not elevation_values:
            return None

        # Calculate gradients
        gradient_values = self._calculate_gradients(df)

        # Add gradient column to DataFrame
        df['gradient_percent'] = gradient_values

        return ElevationData(
            elevation_values=elevation_values,
            gradient_values=gradient_values,
            elevation_gain=max(elevation_values) - min(elevation_values),
            elevation_loss=0,  # Will be calculated more accurately
            # Gradients may contain NaN by design; use NaN-aware aggregations
            max_gradient=np.nanmax(gradient_values),
            min_gradient=np.nanmin(gradient_values)
        )
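
    # Note: elevation_gain above is the span (max - min) of the recorded
    # elevations, not cumulative ascent. A minimal sketch of a cumulative
    # computation, not wired in here (elevation_values is already NaN-free
    # after dropna()):
    #
    #   deltas = np.diff(elevation_values)
    #   cumulative_gain = float(np.sum(deltas[deltas > 0]))
    #   cumulative_loss = float(-np.sum(deltas[deltas < 0]))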

    def _extract_gear_data(self, df: pd.DataFrame) -> Optional[GearData]:
        """Extract gear data from DataFrame.

        Args:
            df: DataFrame with workout data

        Returns:
            GearData object or None
        """
        if 'cadence_rpm' not in df.columns or 'speed_mps' not in df.columns:
            logger.info("Gear estimation skipped: missing speed_mps or cadence_rpm columns")
            return None

        # Estimate gear series
        gear_series = estimate_gear_series(
            df,
            wheel_circumference_m=BikeConfig.TIRE_CIRCUMFERENCE_M,
            valid_configurations=BikeConfig.VALID_CONFIGURATIONS
        )
        if gear_series.empty:
            logger.info("Gear estimation skipped: no valid data for estimation")
            return None

        # Compute summary
        summary = compute_gear_summary(gear_series)

        return GearData(
            series=gear_series,
            summary=summary
        )

    def _distance_window_indices(self, distance: np.ndarray, half_window_m: float) -> tuple[np.ndarray, np.ndarray]:
        """Compute backward and forward indices for distance-based windowing.

        For each sample i, find the closest indices j <= i and k >= i such that
        distance[i] - distance[j] >= half_window_m and
        distance[k] - distance[i] >= half_window_m.

        Args:
            distance: Monotonic array of cumulative distances in meters
            half_window_m: Half window size in meters

        Returns:
            Tuple of (j_indices, k_indices) arrays
        """
        n = len(distance)
        j_indices = np.full(n, -1, dtype=int)
        k_indices = np.full(n, -1, dtype=int)

        for i in range(n):
            # Find largest j <= i where distance[i] - distance[j] >= half_window_m
            j = i
            while j >= 0 and distance[i] - distance[j] < half_window_m:
                j -= 1
            j_indices[i] = max(j, 0)

            # Find smallest k >= i where distance[k] - distance[i] >= half_window_m
            k = i
            while k < n and distance[k] - distance[i] < half_window_m:
                k += 1
            k_indices[i] = min(k, n - 1)

        return j_indices, k_indices
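
    # A tiny illustration of the distance windowing above (illustrative values,
    # assuming `parser` is a FileParser instance):
    #
    #   distance = np.array([0.0, 5.0, 10.0, 15.0, 20.0])
    #   j_idx, k_idx = parser._distance_window_indices(distance, half_window_m=10.0)
    #   # j_idx[2] == 0 (10 - 0 >= 10) and k_idx[2] == 4 (20 - 10 >= 10), so the
    #   # gradient at sample 2 spans the elevations recorded at 0 m and 20 m.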

    def _calculate_gradients(self, df: pd.DataFrame) -> List[float]:
        """Calculate smoothed, distance-referenced gradients from elevation data.

        Computes gradients using a distance-based smoothing window, handling missing
        distance/speed/elevation data gracefully. Assumes 1 Hz sampling for distance
        derivation if speed is available but distance is not.

        Args:
            df: DataFrame containing elevation, distance, and speed columns

        Returns:
            List of gradient values in percent, with NaN for invalid computations
        """
        from config.settings import SMOOTHING_WINDOW

        n = len(df)
        if n < 2:
            return [np.nan] * n

        # Derive distance array
        if 'distance' in df.columns:
            distance = df['distance'].values.astype(float)
            if not np.all(distance[1:] >= distance[:-1]):
                logger.warning("Distance not monotonic, deriving from speed")
                distance = None  # Fall through to speed derivation
        else:
            distance = None

        if distance is None:
            if 'speed' in df.columns:
                speed = df['speed'].values.astype(float)
                distance = np.cumsum(speed)  # dt=1 assumed
            else:
                logger.warning("No distance or speed available, cannot compute gradients")
                return [np.nan] * n

        # Get elevation
        elevation_col = 'altitude' if 'altitude' in df.columns else 'elevation'
        elevation = df[elevation_col].values.astype(float)

        half_window = SMOOTHING_WINDOW / 2
        j_arr, k_arr = self._distance_window_indices(distance, half_window)

        gradients = []
        for i in range(n):
            j, k = j_arr[i], k_arr[i]
            if distance[k] - distance[j] >= 1 and not (pd.isna(elevation[j]) or pd.isna(elevation[k])):
                delta_elev = elevation[k] - elevation[j]
                delta_dist = distance[k] - distance[j]
                grad = 100 * delta_elev / delta_dist
                grad = np.clip(grad, -30, 30)
                gradients.append(grad)
            else:
                gradients.append(np.nan)

        # Light smoothing: rolling median over 5 samples, interpolate isolated NaNs
        grad_series = pd.Series(gradients)
        smoothed = grad_series.rolling(5, center=True, min_periods=1).median()
        smoothed = smoothed.interpolate(limit=3, limit_direction='both')
        return smoothed.tolist()
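
    # Worked example of the gradient formula above (illustrative numbers): a
    # 2 m rise over a 40 m window gives 100 * 2 / 40 = 5 %, comfortably inside
    # the +/-30 % clip applied before the rolling-median smoothing.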

    def _parse_tcx(self, file_path: Path) -> Optional[WorkoutData]:
        """Parse TCX file format.

        Args:
            file_path: Path to TCX file

        Returns:
            WorkoutData object or None if parsing failed
        """
        raise NotImplementedError("TCX file parsing is not yet implemented.")

    def _parse_gpx(self, file_path: Path) -> Optional[WorkoutData]:
        """Parse GPX file format.

        Args:
            file_path: Path to GPX file

        Returns:
            WorkoutData object or None if parsing failed
        """
        raise NotImplementedError("GPX file parsing is not yet implemented.")