
Feature Engineering


Data Scientists Spend 80% of Their Time Here

Raw data is messy. Models need clean, meaningful numbers. Feature engineering is the art of transforming raw data into features that help models learn.

The House Price Example

Raw data:
Address: "123 Main St, New York, NY 10001"
Built: "March 15, 1995"  
Description: "Cozy 3BR, renovated kitchen, near subway"
Price: $850,000
What a model needs:
{
    'bedrooms': 3,
    'city_encoded': 45,        # New York
    'year_built': 1995,
    'building_age': 30,
    'is_renovated': 1,
    'near_transit': 1,
    'zip_price_tier': 3        # Expensive area
}
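To connect the two, here is a minimal sketch of how such features might be parsed out of the raw strings (the regex, date format, and keyword list are illustrative assumptions, not a fixed recipe):
import re
from datetime import datetime

# Hypothetical raw listing, matching the example above
raw = {
    'built': 'March 15, 1995',
    'description': 'Cozy 3BR, renovated kitchen, near subway',
}

features = {}

# Bedrooms: pull the number in front of "BR"
match = re.search(r'(\d+)\s*BR', raw['description'], flags=re.IGNORECASE)
features['bedrooms'] = int(match.group(1)) if match else None

# Building age from the listed construction date
year_built = datetime.strptime(raw['built'], '%B %d, %Y').year
features['year_built'] = year_built
features['building_age'] = datetime.now().year - year_built

# Keyword flags from the free-text description
desc = raw['description'].lower()
features['is_renovated'] = int('renovated' in desc)
features['near_transit'] = int(any(w in desc for w in ('subway', 'metro', 'bus', 'train')))

print(features)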

Handling Missing Values

import pandas as pd
import numpy as np

# Sample data with missing values
df = pd.DataFrame({
    'age': [25, np.nan, 35, 40, np.nan],
    'income': [50000, 60000, np.nan, 80000, 90000],
    'education': ['Bachelor', 'Master', np.nan, 'PhD', 'Bachelor']
})

print("Missing values:")
print(df.isnull().sum())

Strategy 1: Drop Missing Values

# Drop rows with ANY missing values
df_clean = df.dropna()

# Drop rows with missing values in specific columns
df_clean = df.dropna(subset=['age'])
Only use when you have lots of data and missingness is random.

Strategy 2: Imputation

from sklearn.impute import SimpleImputer

# Numeric: fill with mean, median, or constant
numeric_imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent'
df['age'] = numeric_imputer.fit_transform(df[['age']])

# Categorical: fill with mode or 'Unknown'
categorical_imputer = SimpleImputer(strategy='most_frequent')
df['education'] = categorical_imputer.fit_transform(df[['education']])

Strategy 3: Indicator Variables

# Create a flag for missing values (can be informative!)
df['age_missing'] = df['age'].isnull().astype(int)
df['age'] = df['age'].fillna(df['age'].median())

Encoding Categorical Variables

Label Encoding (for ordinal categories)

# sklearn's LabelEncoder assigns codes alphabetically, so for ordered
# categories an explicit mapping that preserves the order is safer:
df['education_encoded'] = df['education'].map({
    'High School': 0,
    'Bachelor': 1,
    'Master': 2,
    'PhD': 3
})

One-Hot Encoding (for nominal categories)

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Using pandas
df_encoded = pd.get_dummies(df, columns=['color'], prefix='color')

# Using sklearn
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[['color']])
Before:
id  color
1   red
2   blue
3   green
After:
id  color_red  color_blue  color_green
1   1          0           0
2   0          1           0
3   0          0           1

Target Encoding (for high-cardinality categories)

# Replace category with mean target value
city_means = df.groupby('city')['price'].mean()
df['city_encoded'] = df['city'].map(city_means)
Use target encoding carefully to avoid data leakage. Always compute means on training data only.
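For example, a minimal leakage-safe sketch (train_df and test_df are assumed to already exist from a prior split):
# Fit the encoding on the training split only
city_means = train_df.groupby('city')['price'].mean()
global_mean = train_df['price'].mean()

train_df['city_encoded'] = train_df['city'].map(city_means)
# Cities unseen during training fall back to the global training mean
test_df['city_encoded'] = test_df['city'].map(city_means).fillna(global_mean)
The cross-validated version in the Going Deeper section below goes one step further and keeps each row's own target out of its encoding.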

Scaling Numerical Features

Why Scale?

Many algorithms (SVM, KNN, neural networks) are sensitive to scale:
  • Age: 0-100
  • Income: 0-1,000,000
Without scaling, income would dominate!
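A quick, made-up check of that effect on Euclidean distance (the numbers are purely illustrative):
import numpy as np
from sklearn.preprocessing import StandardScaler

# Two people: very different ages, almost identical incomes (made-up values)
a = np.array([25, 50_000])   # [age, income]
b = np.array([60, 51_000])

# Raw distance is driven almost entirely by the income column
print(np.linalg.norm(a - b))   # ≈ 1000.6; the 35-year age gap barely registers

# After standardizing over a small sample, both features contribute
X = np.array([[25, 50_000], [60, 51_000], [40, 80_000], [35, 30_000]])
X_scaled = StandardScaler().fit_transform(X)
print(np.linalg.norm(X_scaled[0] - X_scaled[1]))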

StandardScaler (Z-score normalization)

x_{scaled} = \frac{x - \mu}{\sigma}
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Result: mean=0, std=1
print(f"Mean: {X_scaled.mean():.4f}")
print(f"Std:  {X_scaled.std():.4f}")

MinMaxScaler (0-1 normalization)

x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Result: values between 0 and 1
print(f"Min: {X_scaled.min():.4f}")
print(f"Max: {X_scaled.max():.4f}")

RobustScaler (for outliers)

Uses median and IQR instead of mean and std:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

Creating New Features

Mathematical Transformations

# For skewed distributions
df['income_log'] = np.log1p(df['income'])  # log(1+x) to handle zeros

# Square root
df['rooms_sqrt'] = np.sqrt(df['rooms'])

# Polynomial features
df['age_squared'] = df['age'] ** 2

Interaction Features

# Combine features
df['price_per_sqft'] = df['price'] / df['sqft']
df['income_per_person'] = df['household_income'] / df['household_size']
df['age_income_ratio'] = df['age'] / df['income']

Date Features

df['date'] = pd.to_datetime(df['date'])

# Extract components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['quarter'] = df['date'].dt.quarter

# Time since event
df['days_since_purchase'] = (pd.Timestamp.now() - df['date']).dt.days

Text Features

# Length
df['description_length'] = df['description'].str.len()
df['word_count'] = df['description'].str.split().str.len()

# Contains specific words
df['has_discount'] = df['description'].str.contains('discount|sale|offer', case=False).astype(int)

Binning Continuous Variables

# Age groups
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 35, 50, 65, 100],
    labels=['youth', 'young_adult', 'middle_age', 'senior', 'elderly']
)

# Equal-frequency binning (quantiles)
df['income_quantile'] = pd.qcut(
    df['income'],
    q=4,
    labels=['low', 'medium', 'high', 'very_high']
)

Handling Outliers

import numpy as np

def detect_outliers_iqr(data):
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return (data < lower) | (data > upper)

# Cap outliers (winsorization)
def cap_outliers(data, lower_percentile=1, upper_percentile=99):
    lower = np.percentile(data, lower_percentile)
    upper = np.percentile(data, upper_percentile)
    return np.clip(data, lower, upper)

df['income_capped'] = cap_outliers(df['income'])

Feature Selection

Correlation Analysis

import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix
corr = df.corr(numeric_only=True)

# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlations')
plt.show()

# Drop highly correlated features
def drop_correlated_features(df, threshold=0.9):
    corr_matrix = df.corr(numeric_only=True).abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop)

Model-Based Selection

from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier

# Select top k features by ANOVA F-score
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print("Selected features:", list(selected_features))

Recursive Feature Elimination

from sklearn.feature_selection import RFE

model = RandomForestClassifier(n_estimators=50, random_state=42)
rfe = RFE(model, n_features_to_select=10)
rfe.fit(X, y)

# Get rankings
for name, rank in zip(X.columns, rfe.ranking_):
    print(f"{name}: Rank {rank}")

Feature Engineering Pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Define column types
numeric_features = ['age', 'income', 'credit_score']
categorical_features = ['education', 'occupation', 'city']

# Create transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Full pipeline with model
from sklearn.ensemble import RandomForestClassifier

full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Use it
full_pipeline.fit(X_train, y_train)
predictions = full_pipeline.predict(X_test)

Common Mistakes

Data Leakage

Problem: Using test data information during training.
Fix: Always fit transformers on the training data only.
# Wrong
scaler.fit(X)  # Uses all data

# Right
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Scaling Before the Split

Problem: Scaling before the train-test split.
Fix: Split first, then scale.
# Right order:
# 1. Train-test split
# 2. Fit scaler on train
# 3. Transform both
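A concrete version of that order, assuming X and y already exist:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Train-test split first
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2. Fit the scaler on the training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# 3. Apply the same, train-fitted transformation to the test data
X_test_scaled = scaler.transform(X_test)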

🚀 Mini Projects


Project 1: E-commerce Feature Engineer

Transform raw e-commerce transaction data into features that predict customer churn.
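One possible starting point is to aggregate a raw transactions table into customer-level recency/frequency/monetary features; the sketch below assumes hypothetical columns customer_id, timestamp, and amount:
import pandas as pd

# transactions_df is assumed to have: customer_id, timestamp, amount
transactions_df['timestamp'] = pd.to_datetime(transactions_df['timestamp'])
snapshot = transactions_df['timestamp'].max()

customer_features = transactions_df.groupby('customer_id').agg(
    n_orders=('timestamp', 'count'),
    total_spent=('amount', 'sum'),
    avg_order_value=('amount', 'mean'),
    last_purchase=('timestamp', 'max'),
)

# Recency: days since the customer's most recent purchase
customer_features['days_since_last_purchase'] = (
    snapshot - customer_features['last_purchase']
).dt.days
customer_features = customer_features.drop(columns='last_purchase')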

Project 2: Date-Time Feature Factory

Extract powerful temporal features from timestamp data.

Project 3: Text Feature Extractor

Convert text data into numerical features for machine learning.

Project 4: Automated Feature Pipeline

Build an end-to-end feature engineering pipeline that handles multiple data types.

Key Takeaways

Handle Missing Data

Impute or create indicator variables

Encode Categories

One-hot for nominal, ordinal for ordered

Scale Features

StandardScaler or MinMaxScaler for most algorithms

Create Features

Domain knowledge creates the best features

🧹 Real-World Messy Data: Complete Guide

Missing Values Decision Tree

Is data missing?
├── < 5% missing
│   └── Safe to drop rows OR simple imputation (mean/median)
├── 5-30% missing
│   ├── Is missingness random?
│   │   ├── Yes → Impute with mean/median/mode
│   │   └── No (informative) → Create "is_missing" indicator + impute
│   └── Consider multiple imputation for important analyses
└── > 30% missing
    ├── Is the feature important?
    │   ├── Yes → Advanced imputation (KNN, iterative)
    │   └── No → Consider dropping the feature
    └── Investigate WHY data is missing
# Production-ready missing value handler
def handle_missing_values(df, missing_threshold=0.3):
    """
    Handle missing values with best practices.
    """
    report = []
    
    for col in df.columns:
        missing_pct = df[col].isnull().mean()
        
        if missing_pct == 0:
            continue
        elif missing_pct > missing_threshold:
            report.append(f"⚠️ {col}: {missing_pct:.1%} missing - consider dropping")
        elif pd.api.types.is_numeric_dtype(df[col]):
            # Numeric: impute with median (robust to outliers)
            df[f'{col}_missing'] = df[col].isnull().astype(int)
            df[col] = df[col].fillna(df[col].median())
            report.append(f"✓ {col}: imputed with median, created indicator")
        else:
            # Categorical: impute with the mode, falling back to 'Unknown'
            fill_value = df[col].mode()[0] if not df[col].mode().empty else 'Unknown'
            df[col] = df[col].fillna(fill_value)
            report.append(f"✓ {col}: imputed with mode/Unknown")
    
    return df, report

Outlier Detection & Treatment

import numpy as np
from scipy import stats

def detect_and_handle_outliers(df, columns, method='iqr', action='cap'):
    """
    Detect outliers using IQR or Z-score, then handle them.
    
    Parameters:
    - method: 'iqr' (robust) or 'zscore' (assumes normality)
    - action: 'cap' (winsorize), 'remove', or 'flag'
    """
    for col in columns:
        if method == 'iqr':
            Q1, Q3 = df[col].quantile([0.25, 0.75])
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
        else:  # zscore: |z| > 3, i.e. more than 3 standard deviations from the mean
            lower = df[col].mean() - 3 * df[col].std()
            upper = df[col].mean() + 3 * df[col].std()
        
        outliers = (df[col] < lower) | (df[col] > upper)
        n_outliers = outliers.sum()
        
        if n_outliers > 0:
            if action == 'cap':
                df[col] = df[col].clip(lower=lower, upper=upper)
                print(f"  {col}: capped {n_outliers} outliers to [{lower:.2f}, {upper:.2f}]")
            elif action == 'remove':
                df = df[~outliers]
                print(f"  {col}: removed {n_outliers} outlier rows")
            else:  # flag
                df[f'{col}_is_outlier'] = outliers.astype(int)
                print(f"  {col}: flagged {n_outliers} outliers")
    
    return df

Handling Skewed Distributions

from sklearn.preprocessing import PowerTransformer

def handle_skewness(df, columns, threshold=1.0):
    """
    Apply log or Yeo-Johnson transform to skewed features.
    """
    from scipy.stats import skew
    
    for col in columns:
        col_skew = skew(df[col].dropna())
        
        if abs(col_skew) > threshold:
            if (df[col] >= 0).all():
                # log1p handles zeros but needs non-negative data
                df[f'{col}_log'] = np.log1p(df[col])
                print(f"  {col}: skew={col_skew:.2f} → log transform applied")
            else:
                # Yeo-Johnson for any data
                pt = PowerTransformer(method='yeo-johnson')
                df[f'{col}_transformed'] = pt.fit_transform(df[[col]])
                print(f"  {col}: skew={col_skew:.2f} → Yeo-Johnson applied")
    
    return df

🔗 Math → ML Connection

Feature engineering connects to these mathematical concepts:
  • Standardization (Z-scores from statistics): makes gradient descent converge faster
  • One-hot encoding (orthogonal basis vectors): each category becomes its own dimension
  • Log transforms (properties of logarithms): linearizes exponential relationships
  • Polynomial features (polynomial functions): captures nonlinear patterns
  • PCA features (eigenvalue decomposition): finds directions of maximum variance
  • Interaction terms (cross-products): models combined effects
The Linear Algebra course covers why these transformations work geometrically.
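As a small illustration of the PCA row above, a minimal sketch (assuming X_scaled is a numeric matrix that has already been standardized):
from sklearn.decomposition import PCA

# Keep the two directions of maximum variance as new features
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# How much variance each new feature explains
print(pca.explained_variance_ratio_)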

🚀 Going Deeper (Optional)

Target Encoding (for High-Cardinality Categoricals)

When a categorical has 1000+ unique values, one-hot encoding creates too many features:
from sklearn.model_selection import KFold

def target_encode(df, cat_col, target_col, n_splits=5):
    """
    Replace category with mean target value (using cross-validation to prevent leakage).
    """
    df[f'{cat_col}_target_enc'] = np.nan
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    for train_idx, val_idx in kf.split(df):
        # Calculate means only on training fold
        means = df.iloc[train_idx].groupby(cat_col)[target_col].mean()
        # Apply to validation fold
        df.loc[df.index[val_idx], f'{cat_col}_target_enc'] = \
            df.iloc[val_idx][cat_col].map(means)
    
    # Fill any remaining NaN (e.g. unseen categories) with the global mean
    df[f'{cat_col}_target_enc'] = df[f'{cat_col}_target_enc'].fillna(df[target_col].mean())
    
    return df

Time-Based Features

def create_time_features(df, date_col):
    """
    Extract rich features from datetime columns.
    """
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Basic extractions
    df['year'] = df[date_col].dt.year
    df['month'] = df[date_col].dt.month
    df['day'] = df[date_col].dt.day
    df['dayofweek'] = df[date_col].dt.dayofweek  # 0=Monday
    df['hour'] = df[date_col].dt.hour
    
    # Cyclical encoding (preserves continuity: Dec → Jan)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    
    # Business features
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_start'] = df[date_col].dt.is_month_start.astype(int)
    df['is_month_end'] = df[date_col].dt.is_month_end.astype(int)
    
    return df

Automated Feature Engineering

# Using featuretools for automated feature generation
import featuretools as ft

# Define entity set
es = ft.EntitySet(id='customers')
es.add_dataframe(dataframe_name='transactions', dataframe=transactions_df,
                 index='transaction_id', time_index='timestamp')
es.add_dataframe(dataframe_name='customers', dataframe=customers_df,
                 index='customer_id')

# Create relationship
es.add_relationship('customers', 'customer_id', 'transactions', 'customer_id')

# Automatically generate features
features, feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name='customers',
    agg_primitives=['sum', 'mean', 'count', 'max', 'min', 'std'],
    trans_primitives=['month', 'year', 'weekday'],
    max_depth=2
)

What’s Next?

Now you know how to prepare data. But how do you find the best hyperparameters?

Continue to Module 9: Hyperparameter Tuning

Learn Grid Search, Random Search, and Bayesian Optimization