Actionable Rules for high-quality, reproducible feature-engineering code in Python ML projects.
Stop spending 80% of your time on repetitive feature transformations. These Cursor Rules turn feature engineering from a tedious bottleneck into a streamlined, reproducible pipeline that scales with your ML projects.
You know the drill: every ML project starts with promising data, then you spend weeks writing the same transformation code, debugging data leaks, and rebuilding features because someone changed the schema. Your notebooks become unmaintainable, your transformations aren't reproducible, and half your time goes to fixing broken pipelines instead of improving models.
The core problems killing your productivity: repetitive transformation code, silent data leakage, schema drift, unmaintainable notebooks, and brittle pipelines.
These Cursor Rules establish battle-tested patterns that eliminate common pitfalls while maximizing code reusability. You'll build transformations as composable, testable pipeline components that work consistently across projects.
What you get:
Build once, reuse everywhere. Every transformer follows the same pattern:
```python
class AgeBucketizer(BaseEstimator, TransformerMixin):
    def __init__(self, bins=(0, 18, 35, 55, 120)):
        self.bins = bins

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame):
        return pd.cut(X["age"], bins=self.bins, labels=False)
```
All transformations are fit once on training data, then applied consistently:
```python
pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("select", SelectKBest(k=10)),
])
# Fit once, transform many times - no leakage possible
```
Vectorized operations replace slow row-by-row processing:
```python
# Instead of: df.apply(lambda x: complex_transform(x), axis=1)
# Use: vectorized_transform(df["column"])  # 10x faster
```
```python
# Scattered exploration across multiple notebooks
df.hist()  # Which columns? What insights?
df.corr()  # No systematic approach
# Features built ad-hoc, no reusability
```
```python
from typing import Any, Dict

import numpy as np
import pandas as pd

def explore_numerical_features(df: pd.DataFrame) -> Dict[str, Any]:
    """Domain-driven exploration with actionable insights."""
    insights = {}
    for col in df.select_dtypes(include=[np.number]).columns:
        insights[col] = {
            "skewness": df[col].skew(),
            "outliers": detect_outliers(df[col]),  # project-specific outlier helper
            "missing_rate": df[col].isnull().mean(),
        }
    return insights
```
```python
# Transformations scattered across notebooks
df["age_scaled"] = (df["age"] - df["age"].mean()) / df["age"].std()
df["income_log"] = np.log(df["income"] + 1)
# Schema change? Everything breaks
```
```python
feature_pipeline = Pipeline([
    ("validate", DataFrameValidator(expected_columns=["age", "income"])),
    ("impute", ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), ["age", "income"]),
        ("cat", SimpleImputer(strategy="constant", fill_value="unknown"), ["category"]),
    ])),
    ("engineer", FeatureEngineer()),
])
```
```python
# Manually calculating rolling statistics
df["sales_7d_mean"] = (
    df.groupby("store_id")["sales"].rolling(7).mean().reset_index(level=0, drop=True)
)
df["sales_trend"] = df["sales"].diff()
# Hundreds of lines for basic time series features
```
```python
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

# Extract 700+ features automatically
features = extract_features(
    timeseries_container=df,
    column_id="store_id",
    column_sort="date",
    n_jobs=-1,
)

# Replace NaN/inf from extraction, then keep only predictive features
features = impute(features)
relevant_features = select_features(features, target)
```
```python
# feature_engineering/base.py
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class BaseFeatureTransformer(BaseEstimator, TransformerMixin):
    """Base class ensuring consistent transformer behavior."""

    def fit(self, X: pd.DataFrame, y=None):
        self._validate_input(X)
        return self

    def transform(self, X: pd.DataFrame):
        self._validate_input(X)
        return self._transform_impl(X)

    def _validate_input(self, X: pd.DataFrame):
        """Override with specific validation logic."""
        pass

    def _transform_impl(self, X: pd.DataFrame):
        """Override with transformation logic."""
        raise NotImplementedError
```
```python
# feature_engineering/transformers.py
class CustomerFeatureEngineer(BaseFeatureTransformer):
    """Customer-specific feature transformations."""

    def _transform_impl(self, X: pd.DataFrame):
        X = X.copy()
        # Recency, Frequency, Monetary features
        X["days_since_last_purchase"] = (pd.Timestamp.now() - X["last_purchase_date"]).dt.days
        X["purchase_frequency"] = X["total_purchases"] / X["customer_lifetime_days"]
        X["avg_order_value"] = X["total_spent"] / X["total_purchases"]
        return X[["days_since_last_purchase", "purchase_frequency", "avg_order_value"]]
```
```python
# For Featuretools
from typing import Dict

import featuretools as ft
import pandas as pd

def build_feature_definitions(entities: Dict[str, pd.DataFrame]):
    """Create an entity set with proper normalization."""
    es = ft.EntitySet(id="customer_data")

    # Add each dataframe with its index (Featuretools 1.x API)
    es.add_dataframe(
        dataframe_name="customers",
        dataframe=entities["customers"],
        index="customer_id",
    )

    # Limit depth to prevent feature explosion
    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_dataframe_name="customers",
        max_depth=2,
        trans_primitives=["add_numeric", "multiply_numeric"],
        agg_primitives=["mean", "sum", "count"],
    )
    return feature_matrix
```
```python
# tests/test_transformers.py
import pandas as pd
import pytest

def test_customer_feature_engineer():
    """Ensure transformer handles edge cases."""
    transformer = CustomerFeatureEngineer()

    # Test normal case
    df = pd.DataFrame({
        "last_purchase_date": [pd.Timestamp("2023-01-01")],
        "total_purchases": [5],
        "customer_lifetime_days": [365],
        "total_spent": [500.0],
    })
    result = transformer.fit_transform(df)
    assert result.shape[1] == 3
    assert not result.isnull().any().any()

    # Test edge cases: validation should reject an empty frame
    empty_df = pd.DataFrame()
    with pytest.raises(ValidationError):  # ValidationError comes from the project's validation layer
        transformer.transform(empty_df)
```
Transform your feature engineering from a time sink into a competitive advantage. These rules eliminate the repetitive work while ensuring your transformations are robust, reusable, and ready for production scale.
Your ML projects deserve better than scattered notebook cells and brittle transformation code. Start building features the right way.
You are an expert in Python, Pandas, NumPy, Scikit-learn, Featuretools, Feature-engine, TSFresh, Feast.
Key Principles
- Start with domain-driven data exploration (histograms, scatter/box plots, corr matrix) before coding transformations.
- Keep every transformation deterministic, idempotent, and traceable.
- Build features in composable, testable pipeline steps (Scikit-learn `Pipeline` or custom callable).
- Prefer column-wise vectorised operations over row loops; avoid `apply` unless unavoidable (see the sketch after this list).
- Store raw data → staging → features as separate, immutable artefacts; never overwrite.
- Use snake_case, verbs for functions (`encode_gender`), nouns for transformers (`AgeBucketizer`).
- Version control the full feature definition (code + metadata) alongside the model.
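A minimal sketch of the vectorised-over-`apply` principle; the `income` and `age` columns are hypothetical:
```python
import numpy as np
import pandas as pd

def add_income_features(df: pd.DataFrame) -> pd.DataFrame:
    """Column-wise, vectorised feature construction (no row loops)."""
    out = df.copy()
    # Row-wise anti-pattern (slow): df.apply(lambda r: np.log1p(r["income"]), axis=1)
    out["income_log"] = np.log1p(out["income"])                        # vectorised equivalent
    out["income_per_year"] = out["income"] / out["age"].clip(lower=1)  # guard against division by zero
    return out
```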
Python
- Use type hints and NumPy typing (`pd.DataFrame`, `pd.Series`, `npt.NDArray`); run `mypy` in CI.
- Write pure functions or `sklearn`-style transformers (inherit `BaseEstimator`, `TransformerMixin`).
- One public function per module: `build_features.py` exposes `build(df: pd.DataFrame) -> pd.DataFrame` (sketched after this list).
- Default to Pandas; if dataset >10 M rows, switch to Polars, Dask, or PySpark.
- Avoid global state; pass feature params explicitly (e.g., `rare_threshold: float = 0.01`).
- Use f-strings for logging; format floats with `:.4f`.
- Commit notebooks only as executed HTML or plain `.py` via `jupyter nbconvert`.
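A sketch of the one-public-function-per-module rule; the `city` column and the `rare_threshold` default are illustrative:
```python
# build_features.py
import pandas as pd

def build(df: pd.DataFrame, rare_threshold: float = 0.01) -> pd.DataFrame:
    """Build model-ready features from a staged frame; parameters are explicit, no global state."""
    features = df.copy()
    freq = features["city"].value_counts(normalize=True)  # hypothetical categorical column
    rare = freq[freq < rare_threshold].index
    features["city"] = features["city"].where(~features["city"].isin(rare), "other")
    return features
```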
Error Handling and Validation
- Validate inputs at function start using `pydantic` or `pandera` schemas (see the sketch after this list).
- Missing-value policy must be explicit: raise if unexpected nulls, else impute.
- Use early returns; depth ≤2 nested blocks.
- Wrap external I/O in try/except; re-raise as `FeatureEngineeringError` with context.
- Log sample counts before/after each filter; abort if ≥30 % rows removed unless `force=True`.
- Detect data drift by storing training feature stats in `features.yaml`; compare at inference.
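A sketch combining schema validation, an explicit missing-value policy, row-count logging, and the `FeatureEngineeringError` wrapper; the `age`/`income` schema and the filter are illustrative:
```python
import logging

import pandas as pd
import pandera as pa

logger = logging.getLogger(__name__)

class FeatureEngineeringError(Exception):
    """Raised when a feature-engineering step fails, with context attached."""

INPUT_SCHEMA = pa.DataFrameSchema({
    "age": pa.Column(int, pa.Check.in_range(0, 120), nullable=False),  # unexpected nulls -> raise
    "income": pa.Column(float, nullable=True),                         # nulls imputed downstream
})

def filter_adults(df: pd.DataFrame, force: bool = False) -> pd.DataFrame:
    try:
        df = INPUT_SCHEMA.validate(df)
    except pa.errors.SchemaError as exc:
        raise FeatureEngineeringError(f"input validation failed: {exc}") from exc

    before = len(df)
    out = df[df["age"] >= 18]
    removed = 1 - len(out) / max(before, 1)
    logger.info(f"filter_adults kept {len(out)}/{before} rows ({removed:.4f} removed)")
    if removed >= 0.30 and not force:
        raise FeatureEngineeringError("filter removed >= 30% of rows; pass force=True to override")
    return out
```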
Framework-Specific Rules
Scikit-learn
- Always encapsulate preprocessing in a `ColumnTransformer`; never leak fit data.
- Serialise the pipeline with `joblib` and checksum the file (see the sketch after this list).
- Use `FunctionTransformer` for one-off lambdas, but move to named class when reused twice.
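A sketch of the serialise-and-checksum rule; the artifact path is an assumption:
```python
import hashlib
from pathlib import Path

import joblib

def save_pipeline(pipeline, path: str = "artifacts/feature_pipeline.joblib") -> str:
    """Persist a fitted pipeline and write its SHA-256 checksum next to it."""
    joblib.dump(pipeline, path)
    digest = hashlib.sha256(Path(path).read_bytes()).hexdigest()
    Path(path + ".sha256").write_text(digest)
    return digest
```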
Featuretools
- Use Deep Feature Synthesis only on normalised (one-to-many) entity sets (see the sketch after this list).
- Limit `max_depth` ≤2 for tabular data to prevent feature explosion.
- Disable primitives that produce high-cardinality categoricals unless explicitly whitelisted.
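A sketch of the normalisation and depth rules, assuming the Featuretools 1.x API (`add_dataframe`, `normalize_dataframe`, `target_dataframe_name`) and a toy transactions frame:
```python
import featuretools as ft
import pandas as pd

transactions = pd.DataFrame({
    "transaction_id": [1, 2, 3],
    "customer_id": ["a", "a", "b"],
    "amount": [10.0, 25.0, 7.5],
    "timestamp": pd.to_datetime(["2023-01-01", "2023-01-05", "2023-01-02"]),
})

es = ft.EntitySet(id="retail")
es.add_dataframe(dataframe_name="transactions", dataframe=transactions,
                 index="transaction_id", time_index="timestamp")
# Normalise into a one-to-many parent/child pair before running DFS
es.normalize_dataframe(base_dataframe_name="transactions",
                       new_dataframe_name="customers", index="customer_id")

feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    agg_primitives=["mean", "sum", "count"],  # explicit, low-cardinality whitelist
    max_depth=2,                              # cap depth to avoid feature explosion
)
```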
Feature-engine
- Prefer `SmartCorrelatedSelection` to drop correlated vars; set `method="pearson"`, `threshold=0.9`.
- Use `RareLabelEncoder(tol=0.01, n_categories=10)` before `OneHotEncoder`.
TSFresh (time-series)
- Run `extract_features` with `n_jobs=-1`, `chunksize` ≈ 50 k rows.
- Apply `select_features` using target for supervised ranking; ensure no leakage.
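A sketch tying the two TSFresh rules together; `train_df` and `y_train` are assumed to be the training split only, and the `impute` step is there because `select_features` requires a NaN-free matrix:
```python
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

features = extract_features(
    timeseries_container=train_df,  # long format: id, sort and value columns
    column_id="store_id",
    column_sort="date",
    n_jobs=-1,
    chunksize=50_000,               # per the chunk-size guidance above
)
features = impute(features)         # replace NaN/inf before supervised selection
selected = select_features(features, y_train)  # ranked against the training target only
```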
Feast (online serving)
- Store feature values in batch `parquet`; online layer Redis/BigTable only.
- Define feature views with explicit `ttl`; default 30 d.
- Keep entity IDs consistent; never embed business meaning in the key.
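A sketch of a feature view with an explicit `ttl`, assuming a recent Feast release where views take `schema=[Field(...)]`; the names and parquet path are illustrative:
```python
from datetime import timedelta

from feast import Entity, FeatureView, Field, FileSource
from feast.types import Float32, Int64

customer = Entity(name="customer", join_keys=["customer_id"])  # key carries no business meaning

batch_source = FileSource(
    path="data/customer_features.parquet",  # batch values live in parquet
    timestamp_field="event_timestamp",
)

customer_features = FeatureView(
    name="customer_features",
    entities=[customer],
    ttl=timedelta(days=30),  # explicit ttl, 30 d default
    schema=[
        Field(name="purchase_frequency", dtype=Float32),
        Field(name="days_since_last_purchase", dtype=Int64),
    ],
    source=batch_source,
)
```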
Additional Sections
Testing
- Write unit tests for every transformer: assert shape, dtypes, null counts.
- Use `pytest` parametrisation to test edge cases (empty df, all nulls); see the sketch after this list.
- Add property-based tests: `pytest --hypothesis-show-statistics` with `hypothesis.extra.pandas`.
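A sketch of the shape/null assertions and parametrised edge cases, using the `AgeBucketizer` from the example skeleton below:
```python
import pandas as pd
import pytest

@pytest.mark.parametrize("ages", [[1, 18, 35], [90, 119, 120]])
def test_age_bucketizer_shape_and_nulls(ages):
    df = pd.DataFrame({"age": ages, "salary": [1.0] * len(ages)})
    out = AgeBucketizer().fit_transform(df)
    assert len(out) == len(df)           # row count preserved
    assert out["age_bin"].notna().all()  # every age fell into a bin

def test_age_bucketizer_rejects_missing_column():
    with pytest.raises(KeyError):  # project transformers should raise a ValidationError instead
        AgeBucketizer().fit_transform(pd.DataFrame())
```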
Performance
- Profile with `%%timeit` or `perf_counter`; budget: feature build ≤30 % of total training time.
- Cache costly steps with `joblib.Memory`; the cache is invalidated when the input hash changes (see the sketch after this list).
- Chunk large CSVs via `pd.read_csv(chunksize=1_000_000)` and `concat`.
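A sketch of the caching and chunked-read rules; the cache directory, file path, and rolling feature are illustrative:
```python
import pandas as pd
from joblib import Memory

memory = Memory("cache/feature_cache", verbose=0)

@memory.cache  # recomputed only when the arguments (or the function's code) change
def build_rolling_features(sales: pd.DataFrame) -> pd.DataFrame:
    out = sales.copy()
    out["sales_7d_mean"] = (
        out.groupby("store_id")["sales"].rolling(7).mean().reset_index(level=0, drop=True)
    )
    return out

# Stream a large CSV in 1M-row chunks instead of loading it in one go
chunks = pd.read_csv("data/raw/sales.csv", chunksize=1_000_000)
sales = pd.concat(chunks, ignore_index=True)
features = build_rolling_features(sales)
```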
Security & Compliance
- Mask PII before logging (`name`, `email`, etc.); see the sketch after this list.
- Encrypt intermediate feature files at rest (AES-256) when containing sensitive data.
- Record data lineage and transformation hashes for auditability.
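A sketch of PII masking and transformation hashing for lineage; the PII column list is illustrative:
```python
import hashlib
import inspect

import pandas as pd

PII_COLUMNS = ["name", "email"]  # hypothetical PII columns

def mask_pii(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy that is safe to log: PII columns replaced with '***'."""
    masked = df.copy()
    for col in PII_COLUMNS:
        if col in masked.columns:
            masked[col] = "***"
    return masked

def transformation_hash(func) -> str:
    """SHA-256 of a transformation's source code, recorded for audit and lineage."""
    return hashlib.sha256(inspect.getsource(func).encode()).hexdigest()
```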
Documentation
- Auto-generate feature catalogue markdown via `feast registry build && feast registry json`.
- Each transformer class requires a docstring with: Purpose, Inputs, Outputs, Formula, Example.
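A sketch of that docstring layout on a hypothetical `IncomeLogTransformer`, built on the base class shown earlier:
```python
import numpy as np
import pandas as pd

class IncomeLogTransformer(BaseFeatureTransformer):
    """Log-transform skewed income values.

    Purpose: reduce right skew in `income` before scaling.
    Inputs:  DataFrame with a non-negative numeric `income` column.
    Outputs: the same frame with an added `income_log` column.
    Formula: income_log = log(1 + income)
    Example: IncomeLogTransformer().fit_transform(df)
    """

    def _transform_impl(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X.copy()
        X["income_log"] = np.log1p(X["income"])
        return X
```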
Example Skeleton
```python
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

class AgeBucketizer(BaseEstimator, TransformerMixin):
    """Bucketises age into categorical bins."""

    def __init__(self, bins=(0, 18, 35, 55, 120)):
        self.bins = bins

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        X["age_bin"] = pd.cut(X["age"], bins=self.bins, labels=False)
        return X.drop(columns=["age"])  # keep the remaining columns for downstream steps

num_cols = ["salary", "expenses"]
cat_cols = ["gender", "city"]

preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
], remainder="passthrough")  # pass the engineered age_bin through

pipeline = Pipeline([
    ("age_bucket", AgeBucketizer()),
    ("pre", preprocess),
])
```
Never embed feature generation inside model training loops; fit once, reuse.