# Source code for src.preprocess

import os
from pathlib import Path

import pandas as pd

# File locations. Each falls back to a repository-relative default when the
# corresponding environment variable is not set.
INPUT_PATH = os.environ.get("RAW_CSV_PATH", "data/raw/hydraulic_data.csv")
OUTPUT_PATH = os.environ.get(
    "PROCESSED_CSV_PATH", "data/processed/hydraulic_clean.csv"
)

# Feature columns retained by preprocess(), grouped here by name prefix.
# The order matters: it determines the column order of the written CSV.
SENSORS = [
    "PS1", "PS2", "PS3", "PS4", "PS5", "PS6",
    "EPS1",
    "FS1", "FS2",
    "TS1", "TS2", "TS3", "TS4",
    "VS1",
    "CE", "CP", "SE",
]

# Label columns retained by preprocess(); written after the sensor columns.
TARGETS = [
    "cooler_condition",
    "valve_condition",
    "pump_leakage",
    "accumulator_pressure",
]



# [docs]
def preprocess() -> None:
    """Clean the raw CSV and write the result to ``OUTPUT_PATH``.

    Steps, in order:

    1. Load the CSV found at ``INPUT_PATH``.
    2. Keep only rows whose ``stable_flag`` column equals 0.
    3. Restrict the frame to the ``SENSORS`` columns followed by the
       ``TARGETS`` columns, then drop every row that has a missing value
       in any of them.
    4. Create the parent directory of ``OUTPUT_PATH`` if needed and write
       the frame there as CSV without the index column.
    5. Print the row count, feature count, target names, the per-target
       value counts, and the output location.

    Raises:
        KeyError: If ``stable_flag`` or any of the selected columns is
            absent from the input file (raised by pandas indexing).
    """
    raw = pd.read_csv(INPUT_PATH)

    # Remove unstable cycles: only rows flagged 0 survive.
    stable_rows = raw.loc[raw["stable_flag"] == 0]

    # Narrow to the modelled columns, then discard incomplete rows.
    selected_columns = SENSORS + TARGETS
    clean = stable_rows.loc[:, selected_columns].dropna()

    out_dir = Path(OUTPUT_PATH).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    clean.to_csv(OUTPUT_PATH, index=False)

    n_rows = len(clean)
    n_features = len(SENSORS)
    print(f"Rows: {n_rows} | Features: {n_features} | Targets: {TARGETS}")
    for target in TARGETS:
        counts = clean[target].value_counts().to_dict()
        print(f"  {target}: {counts}")
    print(f"Saved to: {OUTPUT_PATH}")



if __name__ == "__main__":
    preprocess()