This commit is contained in:
2025-09-26 07:30:15 -07:00
parent 9d7e855713
commit 478d1bb06c
30 changed files with 5584 additions and 31 deletions

371
parsers/file_parser.py Normal file
View File

@@ -0,0 +1,371 @@
"""File parser for various workout formats (FIT, TCX, GPX)."""
import logging
from pathlib import Path
from typing import Dict, Any, Optional, List
import pandas as pd
import numpy as np
try:
from fitparse import FitFile
except ImportError:
raise ImportError("fitparse package required. Install with: pip install fitparse")
from ..models.workout import WorkoutData, WorkoutMetadata, PowerData, HeartRateData, SpeedData, ElevationData, GearData
from ..config.settings import SUPPORTED_FORMATS
logger = logging.getLogger(__name__)
class FileParser:
"""Parser for workout files in various formats."""
def __init__(self):
"""Initialize file parser."""
pass
def parse_file(self, file_path: Path) -> Optional[WorkoutData]:
"""Parse a workout file and return structured data.
Args:
file_path: Path to the workout file
Returns:
WorkoutData object or None if parsing failed
"""
if not file_path.exists():
logger.error(f"File not found: {file_path}")
return None
file_extension = file_path.suffix.lower()
if file_extension not in SUPPORTED_FORMATS:
logger.error(f"Unsupported file format: {file_extension}")
return None
try:
if file_extension == '.fit':
return self._parse_fit(file_path)
elif file_extension == '.tcx':
return self._parse_tcx(file_path)
elif file_extension == '.gpx':
return self._parse_gpx(file_path)
else:
logger.error(f"Parser not implemented for format: {file_extension}")
return None
except Exception as e:
logger.error(f"Failed to parse file {file_path}: {e}")
return None
def _parse_fit(self, file_path: Path) -> Optional[WorkoutData]:
"""Parse FIT file format.
Args:
file_path: Path to FIT file
Returns:
WorkoutData object or None if parsing failed
"""
try:
fit_file = FitFile(str(file_path))
# Extract session data
session_data = self._extract_fit_session(fit_file)
if not session_data:
logger.error("No session data found in FIT file")
return None
# Extract record data (timestamp-based data)
records = list(fit_file.get_messages('record'))
if not records:
logger.error("No record data found in FIT file")
return None
# Create DataFrame from records
df = self._fit_records_to_dataframe(records)
if df.empty:
logger.error("No valid data extracted from FIT records")
return None
# Create metadata
metadata = WorkoutMetadata(
activity_id=str(session_data.get('activity_id', 'unknown')),
activity_name=session_data.get('activity_name', 'Workout'),
start_time=session_data.get('start_time', pd.Timestamp.now()),
duration_seconds=session_data.get('total_timer_time', 0),
distance_meters=session_data.get('total_distance'),
avg_heart_rate=session_data.get('avg_heart_rate'),
max_heart_rate=session_data.get('max_heart_rate'),
avg_power=session_data.get('avg_power'),
max_power=session_data.get('max_power'),
avg_speed=session_data.get('avg_speed'),
max_speed=session_data.get('max_speed'),
elevation_gain=session_data.get('total_ascent'),
elevation_loss=session_data.get('total_descent'),
calories=session_data.get('total_calories'),
sport=session_data.get('sport', 'cycling'),
sub_sport=session_data.get('sub_sport'),
is_indoor=session_data.get('is_indoor', False)
)
# Create workout data
workout_data = WorkoutData(
metadata=metadata,
raw_data=df
)
# Add processed data if available
if not df.empty:
workout_data.power = self._extract_power_data(df)
workout_data.heart_rate = self._extract_heart_rate_data(df)
workout_data.speed = self._extract_speed_data(df)
workout_data.elevation = self._extract_elevation_data(df)
workout_data.gear = self._extract_gear_data(df)
return workout_data
except Exception as e:
logger.error(f"Failed to parse FIT file {file_path}: {e}")
return None
def _extract_fit_session(self, fit_file) -> Optional[Dict[str, Any]]:
"""Extract session data from FIT file.
Args:
fit_file: FIT file object
Returns:
Dictionary with session data
"""
try:
sessions = list(fit_file.get_messages('session'))
if not sessions:
return None
session = sessions[0]
data = {}
for field in session:
if field.name and field.value is not None:
data[field.name] = field.value
return data
except Exception as e:
logger.error(f"Failed to extract session data: {e}")
return None
def _fit_records_to_dataframe(self, records) -> pd.DataFrame:
"""Convert FIT records to pandas DataFrame.
Args:
records: List of FIT record messages
Returns:
DataFrame with workout data
"""
data = []
for record in records:
record_data = {}
for field in record:
if field.name and field.value is not None:
record_data[field.name] = field.value
data.append(record_data)
if not data:
return pd.DataFrame()
df = pd.DataFrame(data)
# Convert timestamp to datetime
if 'timestamp' in df.columns:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp')
df = df.reset_index(drop=True)
return df
def _extract_power_data(self, df: pd.DataFrame) -> Optional[PowerData]:
"""Extract power data from DataFrame.
Args:
df: DataFrame with workout data
Returns:
PowerData object or None
"""
if 'power' not in df.columns:
return None
power_values = df['power'].dropna().tolist()
if not power_values:
return None
return PowerData(
power_values=power_values,
estimated_power=[], # Will be calculated later
power_zones={}
)
def _extract_heart_rate_data(self, df: pd.DataFrame) -> Optional[HeartRateData]:
"""Extract heart rate data from DataFrame.
Args:
df: DataFrame with workout data
Returns:
HeartRateData object or None
"""
if 'heart_rate' not in df.columns:
return None
hr_values = df['heart_rate'].dropna().tolist()
if not hr_values:
return None
return HeartRateData(
heart_rate_values=hr_values,
hr_zones={},
avg_hr=np.mean(hr_values),
max_hr=np.max(hr_values)
)
def _extract_speed_data(self, df: pd.DataFrame) -> Optional[SpeedData]:
"""Extract speed data from DataFrame.
Args:
df: DataFrame with workout data
Returns:
SpeedData object or None
"""
if 'speed' not in df.columns:
return None
speed_values = df['speed'].dropna().tolist()
if not speed_values:
return None
# Convert m/s to km/h if needed
if max(speed_values) < 50: # Likely m/s
speed_values = [s * 3.6 for s in speed_values]
# Calculate distance if available
distance_values = []
if 'distance' in df.columns:
distance_values = df['distance'].dropna().tolist()
# Convert to km if in meters
if distance_values and max(distance_values) > 1000:
distance_values = [d / 1000 for d in distance_values]
return SpeedData(
speed_values=speed_values,
distance_values=distance_values,
avg_speed=np.mean(speed_values),
max_speed=np.max(speed_values),
total_distance=distance_values[-1] if distance_values else None
)
def _extract_elevation_data(self, df: pd.DataFrame) -> Optional[ElevationData]:
"""Extract elevation data from DataFrame.
Args:
df: DataFrame with workout data
Returns:
ElevationData object or None
"""
if 'altitude' not in df.columns and 'elevation' not in df.columns:
return None
# Use 'altitude' or 'elevation' column
elevation_col = 'altitude' if 'altitude' in df.columns else 'elevation'
elevation_values = df[elevation_col].dropna().tolist()
if not elevation_values:
return None
# Calculate gradients
gradient_values = self._calculate_gradients(elevation_values)
return ElevationData(
elevation_values=elevation_values,
gradient_values=gradient_values,
elevation_gain=max(elevation_values) - min(elevation_values),
elevation_loss=0, # Will be calculated more accurately
max_gradient=np.max(gradient_values),
min_gradient=np.min(gradient_values)
)
def _extract_gear_data(self, df: pd.DataFrame) -> Optional[GearData]:
"""Extract gear data from DataFrame.
Args:
df: DataFrame with workout data
Returns:
GearData object or None
"""
if 'cadence' not in df.columns:
return None
cadence_values = df['cadence'].dropna().tolist()
if not cadence_values:
return None
return GearData(
gear_ratios=[],
cadence_values=cadence_values,
estimated_gear=[],
chainring_teeth=38, # Default
cassette_teeth=[14, 16, 18, 20]
)
def _calculate_gradients(self, elevation_values: List[float]) -> List[float]:
"""Calculate gradients from elevation data.
Args:
elevation_values: List of elevation values in meters
Returns:
List of gradient values in percent
"""
if len(elevation_values) < 2:
return [0.0] * len(elevation_values)
gradients = [0.0] # First point has no gradient
for i in range(1, len(elevation_values)):
elevation_diff = elevation_values[i] - elevation_values[i-1]
# Assume 10m distance between points for gradient calculation
distance = 10.0
gradient = (elevation_diff / distance) * 100
gradients.append(gradient)
return gradients
def _parse_tcx(self, file_path: Path) -> Optional[WorkoutData]:
"""Parse TCX file format.
Args:
file_path: Path to TCX file
Returns:
WorkoutData object or None if parsing failed
"""
logger.warning("TCX parser not implemented yet")
return None
def _parse_gpx(self, file_path: Path) -> Optional[WorkoutData]:
"""Parse GPX file format.
Args:
file_path: Path to GPX file
Returns:
WorkoutData object or None if parsing failed
"""
logger.warning("GPX parser not implemented yet")
return None