Feature Engineering
Data scientists reportedly spend 80% of their time here
Raw data is messy. Models need clean, meaningful numbers.
Feature engineering is the art of transforming raw data into features that help models learn.
The House Price Example
Raw data:
Address: "123 Main St, New York, NY 10001"
Built: "March 15, 1995"
Description: "Cozy 3BR, renovated kitchen, near subway"
Price: $850,000
What a model needs:
{
    'bedrooms': 3,
    'city_encoded': 45,    # New York
    'year_built': 1995,
    'building_age': 30,
    'is_renovated': 1,
    'near_transit': 1,
    'zip_price_tier': 3    # Expensive area
}
Handling Missing Values
import pandas as pd
import numpy as np

# Sample data with missing values
df = pd.DataFrame({
    'age': [25, np.nan, 35, 40, np.nan],
    'income': [50000, 60000, np.nan, 80000, 90000],
    'education': ['Bachelor', 'Master', np.nan, 'PhD', 'Bachelor']
})

print("Missing values:")
print(df.isnull().sum())
Strategy 1: Drop Missing Values
# Drop rows with ANY missing values
df_clean = df.dropna()

# Drop rows with missing values in specific columns
df_clean = df.dropna(subset=['age'])
Only use this strategy when you have plenty of data and the missingness is random.
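A quick sanity check before dropping, as a minimal sketch using the df defined above: count how many rows you would actually lose.

# How much data would dropna() cost?
rows_before = len(df)
rows_after = len(df.dropna())
lost = rows_before - rows_after
print(f"Dropping rows with any NaN removes {lost} of {rows_before} rows ({lost / rows_before:.0%})")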
Strategy 2: Imputation
from sklearn.impute import SimpleImputer

# Numeric: fill with mean, median, or a constant
numeric_imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent'
df[['age']] = numeric_imputer.fit_transform(df[['age']])

# Categorical: fill with the mode (or a constant such as 'Unknown')
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[['education']] = categorical_imputer.fit_transform(df[['education']])
Strategy 3: Indicator Variables
# Create a flag for missing values (missingness itself can be informative!)
df['age_missing'] = df['age'].isnull().astype(int)
df['age'] = df['age'].fillna(df['age'].median())
Encoding Categorical Variables
Label Encoding (for ordinal categories)
# LabelEncoder assigns codes in alphabetical order, so for ordinal data map the order explicitly
education_order = ['High School', 'Bachelor', 'Master', 'PhD']
df['education_encoded'] = df['education'].map({
    level: rank for rank, level in enumerate(education_order)
})
One-Hot Encoding (for nominal categories)
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Using pandas
df_encoded = pd.get_dummies(df, columns=['color'], prefix='color')

# Using sklearn
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(df[['color']])
Before:

id  color
1   red
2   blue
3   green

After:

id  color_red  color_blue  color_green
1   1          0           0
2   0          1           0
3   0          0           1
Target Encoding (for high-cardinality categories)
# Replace each category with the mean target value
city_means = df.groupby('city')['price'].mean()
df['city_encoded'] = df['city'].map(city_means)
Use target encoding carefully to avoid data leakage: always compute the means on the training data only, as sketched below.
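A minimal sketch of the leak-free version, assuming hypothetical train_df and test_df splits that both contain 'city' and 'price' columns:

# Fit the encoding on the training split only
city_means = train_df.groupby('city')['price'].mean()
global_mean = train_df['price'].mean()

train_df['city_encoded'] = train_df['city'].map(city_means)
# Cities unseen in training fall back to the global training mean
test_df['city_encoded'] = test_df['city'].map(city_means).fillna(global_mean)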
Scaling Numerical Features
Why Scale?
Many algorithms (SVM, KNN, neural networks) are sensitive to scale:
Age: 0-100
Income: 0-1,000,000
Without scaling, income would dominate!
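A tiny illustration, as a sketch with made-up numbers: the raw Euclidean distance between two people is driven almost entirely by income, while the standardized distance weighs both features.

import numpy as np
from sklearn.preprocessing import StandardScaler

# Two people: similar income, very different ages
a = np.array([25.0, 50_000.0])   # [age, income]
b = np.array([60.0, 51_000.0])

# Unscaled distance is dominated by the income gap
print(np.linalg.norm(a - b))     # ≈ 1000.6

# After standardization, both features contribute comparably
X_pair = np.vstack([a, b])
X_pair_scaled = StandardScaler().fit_transform(X_pair)
print(np.linalg.norm(X_pair_scaled[0] - X_pair_scaled[1]))  # ≈ 2.83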
StandardScaler (Z-score normalization)
x_{scaled} = \frac{x - \mu}{\sigma}
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Result: mean=0, std=1
print(f"Mean: {X_scaled.mean():.4f}")
print(f"Std: {X_scaled.std():.4f}")
MinMaxScaler (0-1 normalization)
x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Result: values between 0 and 1
print(f"Min: {X_scaled.min():.4f}")
print(f"Max: {X_scaled.max():.4f}")
RobustScaler (for outliers)
Uses median and IQR instead of mean and std:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
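To see why the median and IQR help, here is a minimal sketch with one extreme outlier; the comments show roughly what to expect:

import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler

X_demo = np.array([[1.0], [2.0], [3.0], [4.0], [1000.0]])  # one extreme outlier

print(StandardScaler().fit_transform(X_demo).ravel())  # normal points squashed near -0.5
print(RobustScaler().fit_transform(X_demo).ravel())    # normal points keep their spread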
Creating New Features
# For skewed distributions
df['income_log'] = np.log1p(df['income'])  # log(1+x) handles zeros

# Square root
df['rooms_sqrt'] = np.sqrt(df['rooms'])

# Polynomial features
df['age_squared'] = df['age'] ** 2
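To generate squares and interaction terms for many columns at once, scikit-learn's PolynomialFeatures can help; a minimal sketch, assuming the age and income columns contain no missing values:

from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(df[['age', 'income']])
# Columns: age, income, age^2, age*income, income^2
X_poly = pd.DataFrame(X_poly,
                      columns=poly.get_feature_names_out(['age', 'income']),
                      index=df.index)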
Interaction Features
# Combine features
df['price_per_sqft'] = df['price'] / df['sqft']
df['income_per_person'] = df['household_income'] / df['household_size']
df['age_income_ratio'] = df['age'] / df['income']
Date Features
df['date'] = pd.to_datetime(df['date'])

# Extract components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['quarter'] = df['date'].dt.quarter

# Time since event
df['days_since_purchase'] = (pd.Timestamp.now() - df['date']).dt.days
Text Features
# Length
df['description_length'] = df['description'].str.len()
df['word_count'] = df['description'].str.split().str.len()

# Contains specific words (na=False treats missing descriptions as "no match")
df['has_discount'] = df['description'].str.contains('discount|sale|offer', case=False, na=False).astype(int)
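Beyond these handcrafted signals, bag-of-words counts are a common next step; a minimal sketch with scikit-learn's TfidfVectorizer:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
tfidf = vectorizer.fit_transform(df['description'].fillna(''))  # sparse matrix, one column per term
print(vectorizer.get_feature_names_out()[:10])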
Binning Continuous Variables
# Age groups
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 35, 50, 65, 100],
    labels=['youth', 'young_adult', 'middle_age', 'senior', 'elderly']
)

# Equal-frequency binning (quantiles)
df['income_quantile'] = pd.qcut(
    df['income'],
    q=4,
    labels=['low', 'medium', 'high', 'very_high']
)
Handling Outliers
import numpy as np

def detect_outliers_iqr(data):
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return (data < lower) | (data > upper)

# Cap outliers (winsorization)
def cap_outliers(data, lower_percentile=1, upper_percentile=99):
    lower = np.percentile(data, lower_percentile)
    upper = np.percentile(data, upper_percentile)
    return np.clip(data, lower, upper)

df['income_capped'] = cap_outliers(df['income'])
Feature Selection
Correlation Analysis
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix (numeric columns only)
corr = df.corr(numeric_only=True)

# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlations')
plt.show()

# Drop highly correlated features
def drop_correlated_features(df, threshold=0.9):
    corr_matrix = df.corr(numeric_only=True).abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop)
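Hypothetical usage of the helper above (X_features stands in for your numeric feature DataFrame):

X_reduced = drop_correlated_features(X_features, threshold=0.9)
print(f"Dropped {X_features.shape[1] - X_reduced.shape[1]} highly correlated columns")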
Model-Based Selection
from sklearn.feature_selection import SelectKBest, f_classif

# Select the top k features by ANOVA F-score
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_features = X.columns[selector.get_support()]
print("Selected features:", list(selected_features))
Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=50, random_state=42)
rfe = RFE(model, n_features_to_select=10)
rfe.fit(X, y)

# Get rankings (1 = selected)
for name, rank in zip(X.columns, rfe.ranking_):
    print(f"{name}: Rank {rank}")
Feature Engineering Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Define column types
numeric_features = ['age', 'income', 'credit_score']
categorical_features = ['education', 'occupation', 'city']

# Create transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Full pipeline with model
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Use it
full_pipeline.fit(X_train, y_train)
predictions = full_pipeline.predict(X_test)
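Because preprocessing lives inside the pipeline, it is re-fit on each training fold, so cross-validation stays leakage-free; a minimal sketch:

from sklearn.model_selection import cross_val_score

# The preprocessor is refit on every training fold; nothing leaks from the held-out fold
scores = cross_val_score(full_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"CV accuracy: {scores.mean():.3f} ± {scores.std():.3f}")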
Common Mistakes
Data Leakage
Problem: Using information from the test data during training.
Fix: Always fit transformers on the training data only.

# Wrong
scaler.fit(X)  # Uses all data

# Right
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
Scaling Before the Split
Problem: Fitting the scaler before the train-test split.
Fix: Split first, then scale.

# Right order:
# 1. Train-test split
# 2. Fit scaler on train
# 3. Transform both
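Putting those three steps into code, a minimal sketch:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Split first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Fit the scaler on the training data only
scaler = StandardScaler().fit(X_train)

# 3. Transform both splits with the same fitted scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)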
🚀 Mini Projects
Project 1: E-commerce Feature Engineer. Transform raw transaction data into predictive features.
Project 2: Date-Time Feature Factory. Extract powerful temporal features from timestamps.
Project 3: Text Feature Extractor. Convert text data into numerical features.
Project 4: Automated Feature Pipeline. Build an end-to-end feature engineering pipeline.
Project 1: E-commerce Feature Engineer
Transform raw e-commerce transaction data into features that predict customer churn.
Project 2: Date-Time Feature Factory
Extract powerful temporal features from timestamp data.
Project 3: Text Feature Extractor
Convert text data into numerical features for machine learning.
Project 4: Automated Feature Pipeline
Build an end-to-end feature engineering pipeline that handles multiple data types.
Key Takeaways
Handle Missing Data: impute or create indicator variables
Encode Categories: one-hot for nominal, ordinal mapping for ordered
Scale Features: StandardScaler or MinMaxScaler for most algorithms
Create Features: domain knowledge creates the best features
🧹 Real-World Messy Data: Complete Guide
Handling Every Type of Data Problem
Missing Values Decision Tree

Is data missing?
├── < 5% missing
│ └── Safe to drop rows OR simple imputation (mean/median)
├── 5-30% missing
│ ├── Is missingness random?
│ │ ├── Yes → Impute with mean/median/mode
│ │ └── No (informative) → Create "is_missing" indicator + impute
│ └── Consider multiple imputation for important analyses
└── > 30% missing
├── Is the feature important?
│ ├── Yes → Advanced imputation (KNN, iterative)
│ └── No → Consider dropping the feature
└── Investigate WHY data is missing
# Production-ready missing value handler
def handle_missing_values(df, missing_threshold=0.3):
    """
    Handle missing values with best practices.
    """
    report = []
    for col in df.columns:
        missing_pct = df[col].isnull().mean()
        if missing_pct == 0:
            continue
        elif missing_pct > missing_threshold:
            report.append(f"⚠️ {col}: {missing_pct:.1%} missing - consider dropping")
        elif df[col].dtype in ['float64', 'int64']:
            # Numeric: impute with median (robust to outliers), plus an indicator
            df[f'{col}_missing'] = df[col].isnull().astype(int)
            df[col] = df[col].fillna(df[col].median())
            report.append(f"✓ {col}: imputed with median, created indicator")
        else:
            # Categorical: impute with mode or 'Unknown'
            fill_value = df[col].mode()[0] if len(df[col].mode()) > 0 else 'Unknown'
            df[col] = df[col].fillna(fill_value)
            report.append(f"✓ {col}: imputed with mode/Unknown")
    return df, report
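For the "> 30% missing but important" branch of the decision tree, a minimal sketch of KNN imputation, assuming a hypothetical numeric-only DataFrame numeric_df:

from sklearn.impute import KNNImputer
import pandas as pd

# Each missing value is filled from the 5 most similar rows (based on the other features)
imputer = KNNImputer(n_neighbors=5)
numeric_df_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index
)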
Outlier Detection & Treatment

import numpy as np
def detect_and_handle_outliers(df, columns, method='iqr', action='cap'):
    """
    Detect outliers using IQR or Z-score bounds, then handle them.

    Parameters:
    - method: 'iqr' (robust) or 'zscore' (assumes normality)
    - action: 'cap' (winsorize), 'remove', or 'flag'
    """
    for col in columns:
        if method == 'iqr':
            Q1, Q3 = df[col].quantile([0.25, 0.75])
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
        else:  # zscore: mean +/- 3 standard deviations
            lower = df[col].mean() - 3 * df[col].std()
            upper = df[col].mean() + 3 * df[col].std()

        outliers = (df[col] < lower) | (df[col] > upper)
        n_outliers = outliers.sum()

        if n_outliers > 0:
            if action == 'cap':
                df[col] = df[col].clip(lower=lower, upper=upper)
                print(f"{col}: capped {n_outliers} outliers to [{lower:.2f}, {upper:.2f}]")
            elif action == 'remove':
                df = df[~outliers]
                print(f"{col}: removed {n_outliers} outlier rows")
            else:  # flag
                df[f'{col}_is_outlier'] = outliers.astype(int)
                print(f"{col}: flagged {n_outliers} outliers")
    return df
Handling Skewed Distributions

from sklearn.preprocessing import PowerTransformer
def handle_skewness(df, columns, threshold=1.0):
    """
    Apply a log or Yeo-Johnson transform to skewed features.
    """
    from scipy.stats import skew

    for col in columns:
        col_skew = skew(df[col].dropna())
        if abs(col_skew) > threshold:
            if (df[col] > 0).all():
                # Log transform for strictly positive data
                df[f'{col}_log'] = np.log1p(df[col])
                print(f"{col}: skew={col_skew:.2f} → log transform applied")
            else:
                # Yeo-Johnson works for zero and negative values too
                pt = PowerTransformer(method='yeo-johnson')
                df[f'{col}_transformed'] = pt.fit_transform(df[[col]])
                print(f"{col}: skew={col_skew:.2f} → Yeo-Johnson applied")
    return df
🔗 Math → ML Connection
Feature engineering connects to these mathematical concepts:

Technique            | Math Concept               | Why It Works
Standardization      | Z-scores from statistics   | Makes gradient descent converge faster
One-hot encoding     | Orthogonal basis vectors   | Each category becomes a dimension
Log transforms       | Properties of logarithms   | Linearizes exponential relationships
Polynomial features  | Polynomial functions       | Captures nonlinear patterns
PCA features         | Eigenvalue decomposition   | Finds directions of max variance
Interaction terms    | Cross-products             | Models combined effects
The Linear Algebra course covers why these transformations work geometrically.
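As one concrete example from the table, a minimal sketch of PCA-derived features, assuming a standardized numeric matrix X_scaled:

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)    # new features along the directions of max variance
print(pca.explained_variance_ratio_)   # fraction of variance captured by each component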
🚀 Going Deeper (Optional)
Advanced Feature Engineering Techniques
Target Encoding (for High-Cardinality Categoricals)

When a categorical feature has 1,000+ unique values, one-hot encoding creates far too many columns:

from sklearn.model_selection import KFold
def target_encode(df, cat_col, target_col, n_splits=5):
    """
    Replace a category with the mean target value, using cross-validation to prevent leakage.
    """
    df[f'{cat_col}_target_enc'] = np.nan
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, val_idx in kf.split(df):
        # Calculate means only on the training fold
        means = df.iloc[train_idx].groupby(cat_col)[target_col].mean()
        # Apply them to the validation fold
        df.loc[df.index[val_idx], f'{cat_col}_target_enc'] = \
            df.iloc[val_idx][cat_col].map(means)

    # Fill any remaining NaN with the global mean
    df[f'{cat_col}_target_enc'] = df[f'{cat_col}_target_enc'].fillna(df[target_col].mean())
    return df
Time-Based Features

def create_time_features(df, date_col):
    """
    Extract rich features from datetime columns.
    """
    df[date_col] = pd.to_datetime(df[date_col])

    # Basic extractions
    df['year'] = df[date_col].dt.year
    df['month'] = df[date_col].dt.month
    df['day'] = df[date_col].dt.day
    df['dayofweek'] = df[date_col].dt.dayofweek  # 0 = Monday
    df['hour'] = df[date_col].dt.hour

    # Cyclical encoding (preserves continuity: Dec → Jan)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    # Business features
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_start'] = df[date_col].dt.is_month_start.astype(int)
    df['is_month_end'] = df[date_col].dt.is_month_end.astype(int)

    return df
Automated Feature Engineering

# Using featuretools for automated feature generation
import featuretools as ft

# Define entity set
es = ft.EntitySet(id='customers')
es.add_dataframe(dataframe_name='transactions', dataframe=transactions_df,
                 index='transaction_id', time_index='timestamp')
es.add_dataframe(dataframe_name='customers', dataframe=customers_df,
                 index='customer_id')

# Create relationship
es.add_relationship('customers', 'customer_id', 'transactions', 'customer_id')

# Automatically generate features
features, feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name='customers',
    agg_primitives=['sum', 'mean', 'count', 'max', 'min', 'std'],
    trans_primitives=['month', 'year', 'weekday'],
    max_depth=2
)
What’s Next?
Now you know how to prepare data. But how do you find the best hyperparameters?
Continue to Module 9: Hyperparameter Tuning, where you'll learn Grid Search, Random Search, and Bayesian Optimization.