Merge pull request #140 from Data-Simply/feature/rfm-segmentation

mayurkmmt · web-flow · commit 34ad9699b653 · 2025-03-19T18:09:24.000+05:30
RFM Segmentation
diff --git a/docs/analysis_modules.md b/docs/analysis_modules.md
@@ -791,6 +791,55 @@ segment_stats.df
 | Total          | 4604.28  |            150 |          50 |              92.0856 |                30.6952  |                           3 |             1   |
 <!-- markdownlint-enable MD013 -->
 
+
+### RFM Segmentation
+
+<div class="clear" markdown>
+
+![RFM Segmentation Distribution](assets/images/analysis_modules/rfm_segmentation.svg){ align=right loading=lazy width="50%"}
+
+**Recency, Frequency, Monetary (RFM) segmentation** categorizes customers based on their purchasing behavior:
+
+- **Recency (R)**: How recently a customer made a purchase
+- **Frequency (F)**: How often a customer makes purchases
+- **Monetary (M)**: How much a customer spends
+
+Each metric is scored from 0 to 9 by ranking customers into ten bins, and the combined RFM score helps businesses identify **loyal customers, at-risk customers, and high-value buyers**.
+
+RFM segmentation helps answer questions such as:
+
+- Who are your most valuable customers?
+- Which customers are at risk of churn?
+- Which customers should be targeted for re-engagement?
+
+</div>
+
+Example (the table after the code shows the resulting `rfm_results`):
+
+```python
+import pandas as pd
+from pyretailscience.analysis.segmentation import RFMSegmentation
+
+data = pd.DataFrame({
+    "customer_id": [1, 1, 2, 2, 3, 3, 3],
+    "transaction_id": [101, 102, 201, 202, 301, 302, 303],
+    "transaction_date": ["2024-03-01", "2024-03-10", "2024-02-20", "2024-02-25", "2024-01-15", "2024-01-20", "2024-02-05"],
+    "unit_spend": [50, 75, 100, 150, 200, 250, 300]
+})
+
+data["transaction_date"] = pd.to_datetime(data["transaction_date"])
+current_date = "2024-07-01"
+
+rfm_segmenter = RFMSegmentation(df=data, current_date=current_date)
+rfm_results = rfm_segmenter.df
+```
+
+| customer_id | recency_days | frequency | monetary | r_score | f_score | m_score | rfm_segment | fm_segment |
+|-------------|--------------|-----------|----------|---------|---------|---------|-------------|------------|
+| 1           | 113          | 2         | 125      | 0       | 0       | 0       | 0           | 0          |
+| 2           | 127          | 2         | 250      | 1       | 1       | 1       | 111         | 11         |
+| 3           | 147          | 3         | 750      | 2       | 2       | 2       | 222         | 22         |
+
 ### Purchases Per Customer
 
 <div class="clear" markdown>
diff --git a/pyretailscience/analysis/segmentation.py b/pyretailscience/analysis/segmentation.py
@@ -1,5 +1,6 @@
 """This module contains classes for segmenting customers based on their spend and transaction statistics by segment."""
 
+import datetime
 from typing import Literal
 
 import ibis
@@ -452,3 +453,115 @@ def plot(
         gu.standard_tick_styles(ax)
 
         return ax
+
+
+class RFMSegmentation:
+    """Segments customers using the RFM (Recency, Frequency, Monetary) methodology.
+
+    Customers are scored on three dimensions:
+    - Recency (R): Days since the last transaction (lower is better).
+    - Frequency (F): Number of unique transactions (higher is better).
+    - Monetary (M): Total amount spent (higher is better).
+
+    Each metric is ranked into 10 bins (0-9) using NTILE(10) where,
+    - 9 represents the best score (top 10% of customers).
+    - 0 represents the lowest score (bottom 10% of customers).
+    The RFM segment is a 3-digit number (R*100 + F*10 + M), representing customer value.
+    """
+
+    _df: pd.DataFrame | None = None
+
+    def __init__(self, df: pd.DataFrame | ibis.Table, current_date: str | datetime.date | None = None) -> None:
+        """Initializes the RFM segmentation process.
+
+        Args:
+            df (pd.DataFrame | ibis.Table): A DataFrame or Ibis table containing transaction data.
+                Must include the following columns:
+                - customer_id
+                - transaction_date
+                - unit_spend
+                - transaction_id
+            current_date (Optional[Union[str, datetime.date]]): The reference date for calculating recency.
+                Can be a string (format: "YYYY-MM-DD"), a date object, or None (defaults to the current system date).
+
+        Raises:
+            ValueError: If the dataframe is missing required columns.
+            TypeError: If the input data is not a pandas DataFrame or an Ibis Table.
+        """
+        cols = ColumnHelper()
+        required_cols = [
+            cols.customer_id,
+            cols.transaction_date,
+            cols.unit_spend,
+            cols.transaction_id,
+        ]
+        if isinstance(df, pd.DataFrame):
+            df = ibis.memtable(df)
+        elif not isinstance(df, ibis.Table):
+            raise TypeError("df must be either a pandas DataFrame or an Ibis Table")
+
+        missing_cols = set(required_cols) - set(df.columns)
+        if missing_cols:
+            error_message = f"Missing required columns: {missing_cols}"
+            raise ValueError(error_message)
+
+        if isinstance(current_date, str):
+            current_date = datetime.date.fromisoformat(current_date)
+        elif current_date is None:
+            current_date = datetime.datetime.now(datetime.UTC).date()
+        elif not isinstance(current_date, datetime.date):
+            raise TypeError("current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None")
+
+        self.table = self._compute_rfm(df, current_date)
+
+    def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Table:
+        """Computes the RFM metrics and segments customers accordingly.
+
+        Args:
+            df (ibis.Table): The transaction data table.
+            current_date (datetime.date): The reference date for calculating recency.
+
+        Returns:
+            ibis.Table: A table with RFM scores and segment values.
+        """
+        cols = ColumnHelper()
+        current_date_expr = ibis.literal(current_date)
+
+        customer_metrics = df.group_by(cols.customer_id).aggregate(
+            recency_days=(current_date_expr - df[cols.transaction_date].max().cast("date")).cast("int32"),
+            frequency=df[cols.transaction_id].nunique(),
+            monetary=df[cols.unit_spend].sum(),
+        )
+
+        window_recency = ibis.window(
+            order_by=[ibis.asc(customer_metrics.recency_days), ibis.asc(customer_metrics.customer_id)],
+        )
+        window_frequency = ibis.window(
+            order_by=[ibis.asc(customer_metrics.frequency), ibis.asc(customer_metrics.customer_id)],
+        )
+        window_monetary = ibis.window(
+            order_by=[ibis.asc(customer_metrics.monetary), ibis.asc(customer_metrics.customer_id)],
+        )
+
+        rfm_scores = customer_metrics.mutate(
+            r_score=(ibis.ntile(10).over(window_recency)),
+            f_score=(ibis.ntile(10).over(window_frequency)),
+            m_score=(ibis.ntile(10).over(window_monetary)),
+        )
+
+        return rfm_scores.mutate(
+            rfm_segment=(rfm_scores.r_score * 100 + rfm_scores.f_score * 10 + rfm_scores.m_score),
+            fm_segment=(rfm_scores.f_score * 10 + rfm_scores.m_score),
+        )
+
+    @property
+    def df(self) -> pd.DataFrame:
+        """Returns the dataframe with the segment names."""
+        if self._df is None:
+            self._df = self.table.execute().set_index(get_option("column.customer_id"))
+        return self._df
+
+    @property
+    def ibis_table(self) -> ibis.Table:
+        """Returns the computed Ibis table with RFM segmentation."""
+        return self.table
diff --git a/tests/analysis/test_segmentation.py b/tests/analysis/test_segmentation.py
@@ -1,10 +1,16 @@
 """Tests for the SegTransactionStats class."""
 
+import ibis
 import numpy as np
 import pandas as pd
 import pytest
 
-from pyretailscience.analysis.segmentation import HMLSegmentation, SegTransactionStats, ThresholdSegmentation
+from pyretailscience.analysis.segmentation import (
+    HMLSegmentation,
+    RFMSegmentation,
+    SegTransactionStats,
+    ThresholdSegmentation,
+)
 from pyretailscience.options import ColumnHelper, get_option
 
 cols = ColumnHelper()
@@ -545,3 +551,151 @@ def test_alternate_value_col(self, base_df):
         assert result_df.loc[2, "segment_name"] == "Light"
         assert result_df.loc[4, "segment_name"] == "Medium"
         assert result_df.loc[5, "segment_name"] == "Light"
+
+
+class TestRFMSegmentation:
+    """Tests for the RFMSegmentation class."""
+
+    @pytest.fixture
+    def base_df(self):
+        """Return a base DataFrame for testing."""
+        return pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3, 4, 5],
+                cols.transaction_id: [101, 102, 103, 104, 105],
+                cols.unit_spend: [100.0, 200.0, 150.0, 300.0, 250.0],
+                cols.transaction_date: [
+                    "2025-03-01",
+                    "2025-02-15",
+                    "2025-01-30",
+                    "2025-03-10",
+                    "2025-02-20",
+                ],
+            },
+        )
+
+    @pytest.fixture
+    def expected_df(self):
+        """Returns the expected DataFrame for testing segmentation."""
+        return pd.DataFrame(
+            {
+                "customer_id": [1, 2, 3, 4, 5],
+                "frequency": [1, 1, 1, 1, 1],
+                "monetary": [100.0, 200.0, 150.0, 300.0, 250.0],
+                "r_score": [1, 3, 4, 0, 2],
+                "f_score": [0, 1, 2, 3, 4],
+                "m_score": [0, 2, 1, 4, 3],
+                "rfm_segment": [100, 312, 421, 34, 243],
+                "fm_segment": [0, 12, 21, 34, 43],
+            },
+        ).set_index("customer_id")
+
+    def test_correct_rfm_segmentation(self, base_df, expected_df):
+        """Test that the RFM segmentation correctly calculates the RFM scores and segments."""
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
+        result_df = rfm_segmentation.df
+        expected_df["recency_days"] = [16, 30, 46, 7, 25]
+        expected_df["recency_days"] = expected_df["recency_days"].astype(result_df["recency_days"].dtype)
+
+        pd.testing.assert_frame_equal(
+            result_df.sort_index(),
+            expected_df.sort_index(),
+            check_like=True,
+        )
+
+    def test_handles_dataframe_with_missing_columns(self):
+        """Test that the method raises an error when required columns are missing."""
+        base_df = pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3],
+                cols.unit_spend: [100.0, 200.0, 150.0],
+                cols.transaction_id: [101, 102, 103],
+            },
+        )
+
+        with pytest.raises(ValueError):
+            RFMSegmentation(df=base_df, current_date="2025-03-17")
+
+    def test_single_customer(self):
+        """Test that the method correctly calculates RFM segmentation for a single customer."""
+        df_single_customer = pd.DataFrame(
+            {
+                cols.customer_id: [1],
+                cols.transaction_id: [101],
+                cols.unit_spend: [200.0],
+                cols.transaction_date: ["2025-03-01"],
+            },
+        )
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=df_single_customer, current_date=current_date)
+        result_df = rfm_segmentation.df
+        assert result_df.loc[1, "rfm_segment"] == 0
+
+    def test_multiple_transactions_per_customer(self):
+        """Test that the method correctly handles multiple transactions for the same customer."""
+        df_multiple_transactions = pd.DataFrame(
+            {
+                cols.customer_id: [1, 1, 1, 1, 1],
+                cols.transaction_id: [101, 102, 103, 104, 105],
+                cols.unit_spend: [120.0, 250.0, 180.0, 300.0, 220.0],
+                cols.transaction_date: [
+                    "2025-03-01",
+                    "2025-02-15",
+                    "2025-01-10",
+                    "2025-03-10",
+                    "2025-02-25",
+                ],
+            },
+        )
+        current_date = "2025-03-17"
+        rfm_segmentation = RFMSegmentation(df=df_multiple_transactions, current_date=current_date)
+        result_df = rfm_segmentation.df
+
+        assert result_df.loc[1, "rfm_segment"] == 0
+
+    def test_calculates_rfm_correctly_for_all_customers(self, base_df):
+        """Test that RFM scores are calculated correctly for all customers."""
+        current_date = "2025-03-17"
+        expected_customer_count = 5
+        rfm_segmentation = RFMSegmentation(df=base_df, current_date=current_date)
+        result_df = rfm_segmentation.df
+
+        assert len(result_df) == expected_customer_count
+        assert "rfm_segment" in result_df.columns
+
+    def test_rfm_segmentation_with_no_date(self, base_df, expected_df):
+        """Test that the RFM segmentation correctly calculates the RFM scores and segments."""
+        rfm_segmentation = RFMSegmentation(df=base_df)
+        result_df = rfm_segmentation.df
+        expected_df["recency_days"] = [18, 32, 48, 9, 27]
+        expected_df["recency_days"] = expected_df["recency_days"].astype(result_df["recency_days"].dtype)
+
+        pd.testing.assert_frame_equal(
+            result_df.sort_index(),
+            expected_df.sort_index(),
+            check_like=True,
+        )
+
+    def test_invalid_current_date_type(self, base_df):
+        """Test that RFMSegmentation raises a TypeError when an invalid current_date is provided."""
+        with pytest.raises(
+            TypeError,
+            match="current_date must be a string in 'YYYY-MM-DD' format, a datetime.date object, or None",
+        ):
+            RFMSegmentation(base_df, current_date=12345)
+
+    def test_invalid_df_type(self):
+        """Test that RFMSegmentation raises a TypeError when df is neither a DataFrame nor an Ibis Table."""
+        invalid_df = "this is not a dataframe"
+
+        with pytest.raises(TypeError, match="df must be either a pandas DataFrame or an Ibis Table"):
+            RFMSegmentation(df=invalid_df, current_date="2025-03-17")
+
+    def test_ibis_table_property(self, base_df):
+        """Test that ibis_table property returns an Ibis Table."""
+        segmentation = RFMSegmentation(df=base_df, current_date="2025-03-17")
+
+        result = segmentation.ibis_table
+
+        assert isinstance(result, ibis.Table), "Expected ibis.Table but got a different type"
diff --git a/uv.lock b/uv.lock