Skip to content

Commit 34ad969

Browse files
authored
Merge pull request #140 from Data-Simply/feature/rfm-segmentation
RFM Segmentation
2 parents e846b30 + b0effc0 commit 34ad969

File tree

4 files changed

+318
-1
lines changed

4 files changed

+318
-1
lines changed

docs/analysis_modules.md

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,55 @@ segment_stats.df
791791
| Total | 4604.28 | 150 | 50 | 92.0856 | 30.6952 | 3 | 1 |
792792
<!-- markdownlint-enable MD013 -->
793793

794+
795+
### RFM Segmentation
796+
797+
<div class="clear" markdown>
798+
799+
![RFM Segmentation Distribution](assets/images/analysis_modules/rfm_segmentation.svg){ align=right loading=lazy width="50%"}
800+
801+
**Recency, Frequency, Monetary (RFM) segmentation** categorizes customers based on their purchasing behavior:
802+
803+
- **Recency (R)**: How recently a customer made a purchase
804+
- **Frequency (F)**: How often a customer makes purchases
805+
- **Monetary (M)**: How much a customer spends
806+
807+
Each metric is scored by ranking customers into up to ten bins (0-9), and the combined RFM score helps businesses identify **loyal customers, at-risk customers, and high-value buyers**.
808+
809+
RFM segmentation helps answer questions such as:
810+
811+
- Who are your most valuable customers?
812+
- Which customers are at risk of churn?
813+
- Which customers should be targeted for re-engagement?
814+
815+
</div>
816+
817+
Example:
818+
819+
```python
820+
import pandas as pd
821+
from pyretailscience.analysis.segmentation import RFMSegmentation
822+
823+
data = pd.DataFrame({
824+
"customer_id": [1, 1, 2, 2, 3, 3, 3],
825+
"transaction_id": [101, 102, 201, 202, 301, 302, 303],
826+
"transaction_date": ["2024-03-01", "2024-03-10", "2024-02-20", "2024-02-25", "2024-01-15", "2024-01-20", "2024-02-05"],
827+
"unit_spend": [50, 75, 100, 150, 200, 250, 300]
828+
})
829+
830+
data["transaction_date"] = pd.to_datetime(data["transaction_date"])
831+
current_date = "2024-07-01"
832+
833+
rfm_segmenter = RFMSegmentation(df=data, current_date=current_date)
834+
rfm_results = rfm_segmenter.df
835+
```
836+
837+
| customer_id | recency_days | frequency | monetary | r_score | f_score | m_score | rfm_segment | fm_segment |
838+
|-------------|--------------|-----------|----------|---------|---------|---------|-------------|------------|
839+
| 1 | 113 | 2 | 125 | 0 | 0 | 0 | 0 | 0 |
840+
| 2 | 127 | 2 | 250 | 1 | 1 | 1 | 111 | 11 |
841+
| 3 | 147 | 3 | 750 | 2 | 2 | 2 | 222 | 22 |
842+
794843
### Purchases Per Customer
795844

796845
<div class="clear" markdown>

pyretailscience/analysis/segmentation.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""This module contains classes for segmenting customers based on their spend and transaction statistics by segment."""
22

3+
import datetime
34
from typing import Literal
45

56
import ibis
@@ -452,3 +453,115 @@ def plot(
452453
gu.standard_tick_styles(ax)
453454

454455
return ax
456+
457+
458+
class RFMSegmentation:
    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.

    Customers are measured on three dimensions:
    - Recency (R): Days between the reference date and the customer's last transaction.
    - Frequency (F): Number of unique transactions.
    - Monetary (M): Total amount spent.

    Each metric is ranked into at most 10 bins (0-9) using NTILE(10). Customers are ordered by the
    metric in ascending order, with ties broken by customer id, so:
    - f_score / m_score: higher scores mean more transactions / more spend.
    - r_score: 0 is assigned to the smallest recency_days (the most recent purchasers) and the
      highest score to the customers who purchased longest ago.
      NOTE(review): this means a *low* r_score is the "good" end, unlike f_score and m_score.
      Confirm this is intended before relying on rfm_segment as a single "higher is better" value.

    With fewer than 10 customers only the scores 0..(n - 1) are used.

    rfm_segment is computed as R*100 + F*10 + M and fm_segment as F*10 + M.
    """

    # Cache for the executed pandas result; populated lazily by the `df` property.
    _df: pd.DataFrame | None = None

    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | datetime.date | None = None) -> None:
        """Initializes the RFM segmentation process.

        Args:
            df (pd.DataFrame | ibis.Table): A DataFrame or Ibis table containing transaction data.
                Must include the following columns:
                - customer_id
                - transaction_date
                - unit_spend
                - transaction_id
            current_date (Optional[Union[str, datetime.date]]): The reference date for calculating recency.
                Can be a string (format: "YYYY-MM-DD"), a date object, or None (defaults to the current
                UTC date). A datetime.datetime is accepted and truncated to its date part.

        Raises:
            ValueError: If the dataframe is missing required columns.
            TypeError: If the input data is not a pandas DataFrame or an Ibis Table, or if current_date
                is not a string, a date, or None.
        """
        cols = ColumnHelper()
        required_cols = [
            cols.customer_id,
            cols.transaction_date,
            cols.unit_spend,
            cols.transaction_id,
        ]
        if isinstance(df, pd.DataFrame):
            df = ibis.memtable(df)
        elif not isinstance(df, ibis.Table):
            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")

        missing_cols = set(required_cols) - set(df.columns)
        if missing_cols:
            error_message = f"Missing required columns: {missing_cols}"
            raise ValueError(error_message)

        if isinstance(current_date, str):
            current_date = datetime.date.fromisoformat(current_date)
        elif current_date is None:
            current_date = datetime.datetime.now(datetime.UTC).date()
        elif isinstance(current_date, datetime.datetime):
            # datetime.datetime subclasses datetime.date, so it would pass the check below and
            # reach ibis as a timestamp literal; recency is a date difference, so drop the time part.
            current_date = current_date.date()
        elif not isinstance(current_date, datetime.date):
            raise TypeError("current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None")

        self.table = self._compute_rfm(df, current_date)

    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
        """Computes the RFM metrics and segments customers accordingly.

        Args:
            df (ibis.Table): The transaction data table.
            current_date (datetime.date): The reference date for calculating recency.

        Returns:
            ibis.Table: One row per customer with recency_days, frequency, monetary, r_score, f_score,
                m_score, rfm_segment and fm_segment columns.
        """
        cols = ColumnHelper()
        current_date_expr = ibis.literal(current_date)

        # Collapse transactions to one row per customer.
        customer_metrics = df.group_by(cols.customer_id).aggregate(
            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
            frequency=df[cols.transaction_id].nunique(),
            monetary=df[cols.unit_spend].sum(),
        )

        # customer_id is a secondary sort key so customers with equal metric values are binned
        # deterministically.
        window_recency = ibis.window(
            order_by=[ibis.asc(customer_metrics.recency_days), ibis.asc(customer_metrics.customer_id)],
        )
        window_frequency = ibis.window(
            order_by=[ibis.asc(customer_metrics.frequency), ibis.asc(customer_metrics.customer_id)],
        )
        window_monetary = ibis.window(
            order_by=[ibis.asc(customer_metrics.monetary), ibis.asc(customer_metrics.customer_id)],
        )

        rfm_scores = customer_metrics.mutate(
            r_score=(ibis.ntile(10).over(window_recency)),
            f_score=(ibis.ntile(10).over(window_frequency)),
            m_score=(ibis.ntile(10).over(window_monetary)),
        )

        return rfm_scores.mutate(
            rfm_segment=(rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score),
            fm_segment=(rfm_scores.f_score * 10 + rfm_scores.m_score),
        )

    @property
    def df(self) -> pd.DataFrame:
        """Returns the RFM metrics, scores and segments as a pandas DataFrame indexed by customer id.

        The Ibis expression is executed on first access and the result is cached on the instance.
        """
        if self._df is None:
            self._df = self.table.execute().set_index(get_option("column.customer_id"))
        return self._df

    @property
    def ibis_table(self) -> ibis.Table:
        """Returns the computed Ibis table with RFM segmentation."""
        return self.table

tests/analysis/test_segmentation.py

Lines changed: 155 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
"""Tests for the SegTransactionStats class."""
22

3+
import ibis
34
import numpy as np
45
import pandas as pd
56
import pytest
67

7-
from pyretailscience.analysis.segmentation import HMLSegmentation, SegTransactionStats, ThresholdSegmentation
8+
from pyretailscience.analysis.segmentation import (
9+
HMLSegmentation,
10+
RFMSegmentation,
11+
SegTransactionStats,
12+
ThresholdSegmentation,
13+
)
814
from pyretailscience.options import ColumnHelper, get_option
915

1016
cols = ColumnHelper()
@@ -545,3 +551,151 @@ def test_alternate_value_col(self, base_df):
545551
assert result_df.loc[2, "segment_name"] == "Light"
546552
assert result_df.loc[4, "segment_name"] == "Medium"
547553
assert result_df.loc[5, "segment_name"] == "Light"
554+
555+
556+
class TestRFMSegmentation:
    """Tests for the RFMSegmentation class."""

    @pytest.fixture
    def base_df(self):
        """Return a base DataFrame with one transaction for each of five customers."""
        return pd.DataFrame(
            {
                cols.customer_id: [1, 2, 3, 4, 5],
                cols.transaction_id: [101, 102, 103, 104, 105],
                cols.unit_spend: [100.0, 200.0, 150.0, 300.0, 250.0],
                cols.transaction_date: [
                    "2025-03-01",
                    "2025-02-15",
                    "2025-01-30",
                    "2025-03-10",
                    "2025-02-20",
                ],
            },
        )

    @pytest.fixture
    def expected_df(self):
        """Returns the expected scores and segments for base_df (recency_days is added per test).

        The scores depend only on the relative ordering of customers, not on the reference date.
        """
        return pd.DataFrame(
            {
                "customer_id": [1, 2, 3, 4, 5],
                "frequency": [1, 1, 1, 1, 1],
                "monetary": [100.0, 200.0, 150.0, 300.0, 250.0],
                "r_score": [1, 3, 4, 0, 2],
                "f_score": [0, 1, 2, 3, 4],
                "m_score": [0, 2, 1, 4, 3],
                "rfm_segment": [100, 312, 421, 34, 243],
                "fm_segment": [0, 12, 21, 34, 43],
            },
        ).set_index("customer_id")

    def test_correct_rfm_segmentation(self, base_df, expected_df):
        """Test that the RFM segmentation correctly calculates the RFM scores and segments."""
        current_date = "2025-03-17"
        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
        result_df = rfm_segmentation.df
        expected_df["recency_days"] = [16, 30, 46, 7, 25]
        expected_df["recency_days"] = expected_df["recency_days"].astype(result_df["recency_days"].dtype)

        pd.testing.assert_frame_equal(
            result_df.sort_index(),
            expected_df.sort_index(),
            check_like=True,
        )

    def test_handles_dataframe_with_missing_columns(self):
        """Test that the method raises an error when required columns are missing."""
        base_df = pd.DataFrame(
            {
                cols.customer_id: [1, 2, 3],
                cols.unit_spend: [100.0, 200.0, 150.0],
                cols.transaction_id: [101, 102, 103],
            },
        )

        # Match on the message so an unrelated ValueError cannot satisfy the test.
        with pytest.raises(ValueError, match="Missing required columns"):
            RFMSegmentation(df=base_df, current_date="2025-03-17")

    def test_single_customer(self):
        """Test that the method correctly calculates RFM segmentation for a single customer."""
        df_single_customer = pd.DataFrame(
            {
                cols.customer_id: [1],
                cols.transaction_id: [101],
                cols.unit_spend: [200.0],
                cols.transaction_date: ["2025-03-01"],
            },
        )
        current_date = "2025-03-17"
        rfm_segmentation = RFMSegmentation(df=df_single_customer, current_date=current_date)
        result_df = rfm_segmentation.df
        assert result_df.loc[1, "rfm_segment"] == 0

    def test_multiple_transactions_per_customer(self):
        """Test that the method correctly handles multiple transactions for the same customer."""
        df_multiple_transactions = pd.DataFrame(
            {
                cols.customer_id: [1, 1, 1, 1, 1],
                cols.transaction_id: [101, 102, 103, 104, 105],
                cols.unit_spend: [120.0, 250.0, 180.0, 300.0, 220.0],
                cols.transaction_date: [
                    "2025-03-01",
                    "2025-02-15",
                    "2025-01-10",
                    "2025-03-10",
                    "2025-02-25",
                ],
            },
        )
        current_date = "2025-03-17"
        rfm_segmentation = RFMSegmentation(df=df_multiple_transactions, current_date=current_date)
        result_df = rfm_segmentation.df

        assert result_df.loc[1, "rfm_segment"] == 0

    def test_calculates_rfm_correctly_for_all_customers(self, base_df):
        """Test that every customer in the input gets a row with an rfm_segment."""
        current_date = "2025-03-17"
        expected_customer_count = 5
        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
        result_df = rfm_segmentation.df

        assert len(result_df) == expected_customer_count
        assert "rfm_segment" in result_df.columns

    def test_rfm_segmentation_with_no_date(self, base_df, expected_df):
        """Test that recency is measured from the current UTC date when current_date is omitted."""
        # Derive the expectation from today's UTC date rather than hard-coding day counts, which
        # would only be correct on the single day the test was written.
        today = pd.Timestamp.now(tz="UTC").tz_localize(None).normalize()
        rfm_segmentation = RFMSegmentation(df=base_df)
        result_df = rfm_segmentation.df

        # base_df has one row per customer in customer_id order, matching expected_df's index order.
        last_purchase = pd.to_datetime(base_df[cols.transaction_date])
        expected_df["recency_days"] = (today - last_purchase).dt.days.to_numpy()
        expected_df["recency_days"] = expected_df["recency_days"].astype(result_df["recency_days"].dtype)

        pd.testing.assert_frame_equal(
            result_df.sort_index(),
            expected_df.sort_index(),
            check_like=True,
        )

    def test_invalid_current_date_type(self, base_df):
        """Test that RFMSegmentation raises a TypeError when an invalid current_date is provided."""
        with pytest.raises(
            TypeError,
            match="current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None",
        ):
            RFMSegmentation(base_df, current_date=12345)

    def test_invalid_df_type(self):
        """Test that RFMSegmentation raises a TypeError when df is neither a DataFrame nor an Ibis Table."""
        invalid_df = "this is not a dataframe"

        with pytest.raises(TypeError, match="df must be either a pandas DataFrame or an Ibis Table"):
            RFMSegmentation(df=invalid_df, current_date="2025-03-17")

    def test_ibis_table_property(self, base_df):
        """Test that ibis_table property returns an Ibis Table."""
        segmentation = RFMSegmentation(df=base_df, current_date="2025-03-17")

        result = segmentation.ibis_table

        assert isinstance(result, ibis.Table), "Expected ibis.Table but got a different type"

uv.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)