
Commit 4f77514

Merge branch 'main' of github.com:data-simply/pyretailscience into feature/rfm-segmentation

2 parents: 314ea0a + 5ed5feb

File tree: 4 files changed, +150 -99 lines


pyproject.toml

Lines changed: 13 additions & 91 deletions
@@ -1,129 +1,51 @@
 [project]
 name = "pyretailscience"
-version = "0.9.0"
+version = "0.10.0"
 description = "Retail Data Science Tools"
 requires-python = ">=3.10,<3.13"
 readme = "README.md"
 license = "Elastic-2.0"
-dependencies = [
-    "pandas>=2.1.4,<3",
-    "pyarrow>=14.0.2,<15",
-    "matplotlib>=3.9.1,<4",
-    "numpy>=1.26.3,<2",
-    "loguru>=0.7.2,<0.8",
-    "tqdm>=4.66.1,<5",
-    "scipy>=1.13.0,<2",
-    "scikit-learn>=1.4.2,<2",
-    "matplotlib-set-diagrams~=0.0.2",
-    "toml>=0.10.2,<0.11",
-    "duckdb>=1.0.0,<2",
-    "graphviz>=0.20.3,<0.21",
-    "ibis-framework[duckdb]>=9.5.0,<10",
-]
+dependencies = [ "pandas>=2.1.4,<3", "pyarrow>=14.0.2,<15", "matplotlib>=3.9.1,<4", "numpy>=1.26.3,<2", "loguru>=0.7.2,<0.8", "tqdm>=4.66.1,<5", "scipy>=1.13.0,<2", "scikit-learn>=1.4.2,<2", "matplotlib-set-diagrams~=0.0.2", "toml>=0.10.2,<0.11", "duckdb>=1.0.0,<2", "graphviz>=0.20.3,<0.21", "ibis-framework[duckdb]>=9.5.0,<10",]
 [[project.authors]]
 name = "Murray Vanwyk"
 
 
 [dependency-groups]
-dev = [
-    "pytest>=8.0.0,<9",
-    "pytest-cov>=4.1.0,<5",
-    "nbstripout>=0.7.1,<0.8",
-    "ruff>=0.9,<0.10",
-    "pre-commit>=3.6.2,<4",
-    "pytest-mock>=3.14.0,<4",
-]
-examples = ["jupyterlab>=4.2.5,<5", "tqdm>=4.66.1,<5"]
-docs = [
-    "mkdocs-material>=9.5.4,<10",
-    "mkdocstrings[python]>=0.24.0,<0.25",
-    "mkdocs>=1.5.3,<2",
-    "mkdocs-jupyter>=0.24.6,<0.25",
-]
+dev = [ "pytest>=8.0.0,<9", "pytest-cov>=4.1.0,<5", "nbstripout>=0.7.1,<0.8", "ruff>=0.9,<0.10", "pre-commit>=3.6.2,<4", "pytest-mock>=3.14.0,<4",]
+examples = [ "jupyterlab>=4.2.5,<5", "tqdm>=4.66.1,<5",]
+docs = [ "mkdocs-material>=9.5.4,<10", "mkdocstrings[python]>=0.24.0,<0.25", "mkdocs>=1.5.3,<2", "mkdocs-jupyter>=0.24.6,<0.25",]
 
 [build-system]
-requires = ["hatchling"]
+requires = [ "hatchling",]
 build-backend = "hatchling.build"
 
 [tool.uv]
-default-groups = ["dev", "examples", "docs"]
+default-groups = [ "dev", "examples", "docs",]
 
 [tool.ruff]
 target-version = "py310"
 line-length = 120
 show-fixes = true
 
 [tool.ruff.lint]
-ignore = ["ANN101", "ANN102", "EM101", "TRY003", "PT011", "PTH123", "SLF001"]
-select = [
-    "A",
-    "ANN",
-    "ARG",
-    "B",
-    "BLE",
-    "C4",
-    "C90",
-    "COM",
-    "D",
-    "D1",
-    "D2",
-    "D3",
-    "D4",
-    "DTZ",
-    "EM",
-    "ERA",
-    "EXE",
-    "F",
-    "FA",
-    "FLY",
-    "G",
-    "I",
-    "ICN",
-    "INP",
-    "INT",
-    "ISC",
-    "N",
-    "NPY",
-    "PERF",
-    "PGH",
-    "PIE",
-    "PL",
-    "PT",
-    "PTH",
-    "PYI",
-    "Q",
-    "RET",
-    "RUF",
-    "RSE",
-    "S",
-    "SIM",
-    "SLF",
-    "SLOT",
-    "T10",
-    "T20",
-    "TCH",
-    "TID",
-    "TRY",
-    "UP",
-    "W",
-    "YTT",
-]
+ignore = [ "ANN101", "ANN102", "EM101", "TRY003", "PT011", "PTH123", "SLF001",]
+select = [ "A", "ANN", "ARG", "B", "BLE", "C4", "C90", "COM", "D", "D1", "D2", "D3", "D4", "DTZ", "EM", "ERA", "EXE", "F", "FA", "FLY", "G", "I", "ICN", "INP", "INT", "ISC", "N", "NPY", "PERF", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "Q", "RET", "RUF", "RSE", "S", "SIM", "SLF", "SLOT", "T10", "T20", "TCH", "TID", "TRY", "UP", "W", "YTT",]
 
 [tool.pytest.ini_options]
 addopts = "--cov=pyretailscience --cov-report=term-missing --cov-branch"
 
 [tool.coverage.run]
 branch = true
-source = ["pyretailscience"]
+source = [ "pyretailscience",]
 
 [tool.coverage.report]
 show_missing = true
 skip_covered = true
 
 [tool.ruff.lint.per-file-ignores]
-"__init__.py" = ["F401", "F403", "F405", "D104"]
-"tests/*" = ["ANN", "ARG", "INP001", "S101", "SLF001"]
-"*.ipynb" = ["T201"]
+"__init__.py" = [ "F401", "F403", "F405", "D104",]
+"tests/*" = [ "ANN", "ARG", "INP001", "S101", "SLF001",]
+"*.ipynb" = [ "T201",]
 
 [tool.ruff.lint.pylint]
 max-args = 15

pyretailscience/analysis/segmentation.py

Lines changed: 46 additions & 5 deletions
@@ -190,7 +190,12 @@ class SegTransactionStats:
 
     _df: pd.DataFrame | None = None
 
-    def __init__(self, data: pd.DataFrame | ibis.Table, segment_col: str = "segment_name") -> None:
+    def __init__(
+        self,
+        data: pd.DataFrame | ibis.Table,
+        segment_col: str = "segment_name",
+        extra_aggs: dict[str, tuple[str, str]] | None = None,
+    ) -> None:
         """Calculates transaction statistics by segment.
 
         Args:
@@ -199,6 +204,12 @@ def __init__(self, data: pd.DataFrame | ibis.Table, segment_col: str = "segment_
                 the columns unit_spend and unit_quantity are used to calculate the price_per_unit and
                 units_per_transaction.
             segment_col (str, optional): The column to use for the segmentation. Defaults to "segment_name".
+            extra_aggs (dict[str, tuple[str, str]], optional): Additional aggregations to perform.
+                The keys in the dictionary will be the column names for the aggregation results.
+                The values are tuples with (column_name, aggregation_function), where:
+                - column_name is the name of the column to aggregate
+                - aggregation_function is a string name of an Ibis aggregation function (e.g., "nunique", "sum")
+                Example: {"stores": ("store_id", "nunique")} would count unique store_ids.
         """
         cols = ColumnHelper()
         required_cols = [
@@ -215,9 +226,21 @@ def __init__(self, data: pd.DataFrame | ibis.Table, segment_col: str = "segment_
             msg = f"The following columns are required but missing: {missing_cols}"
             raise ValueError(msg)
 
+        # Validate extra_aggs if provided
+        if extra_aggs:
+            for col_tuple in extra_aggs.values():
+                col, func = col_tuple
+                if col not in data.columns:
+                    msg = f"Column '{col}' specified in extra_aggs does not exist in the data"
+                    raise ValueError(msg)
+                if not hasattr(data[col], func):
+                    msg = f"Aggregation function '{func}' not available for column '{col}'"
+                    raise ValueError(msg)
+
         self.segment_col = segment_col
+        self.extra_aggs = {} if extra_aggs is None else extra_aggs
 
-        self.table = self._calc_seg_stats(data, segment_col)
+        self.table = self._calc_seg_stats(data, segment_col, self.extra_aggs)
 
     @staticmethod
     def _get_col_order(include_quantity: bool) -> list[str]:
@@ -249,12 +272,19 @@ def _get_col_order(include_quantity: bool) -> list[str]:
         return col_order
 
     @staticmethod
-    def _calc_seg_stats(data: pd.DataFrame | ibis.Table, segment_col: str) -> ibis.Table:
+    def _calc_seg_stats(
+        data: pd.DataFrame | ibis.Table,
+        segment_col: str,
+        extra_aggs: dict[str, tuple[str, str]] | None = None,
+    ) -> ibis.Table:
         """Calculates the transaction statistics by segment.
 
         Args:
             data (pd.DataFrame | ibis.Table): The transaction data.
             segment_col (str): The column to use for the segmentation.
+            extra_aggs (dict[str, tuple[str, str]], optional): Additional aggregations to perform.
+                The keys in the dictionary will be the column names for the aggregation results.
+                The values are tuples with (column_name, aggregation_function).
 
         Returns:
             pd.DataFrame: The transaction statistics by segment.
@@ -277,6 +307,12 @@ def _calc_seg_stats(data: pd.DataFrame | ibis.Table, segment_col: str) -> ibis.T
         if cols.unit_qty in data.columns:
             aggs[cols.agg_unit_qty] = data[cols.unit_qty].sum()
 
+        # Add extra aggregations if provided
+        if extra_aggs:
+            for agg_name, col_tuple in extra_aggs.items():
+                col, func = col_tuple
+                aggs[agg_name] = getattr(data[col], func)()
+
         # Calculate metrics for segments and total
         segment_metrics = data.group_by(segment_col).aggregate(**aggs)
         total_metrics = data.aggregate(**aggs).mutate(segment_name=ibis.literal("Total"))
@@ -311,6 +347,11 @@ def df(self) -> pd.DataFrame:
             self.segment_col,
             *SegTransactionStats._get_col_order(include_quantity=cols.agg_unit_qty in self.table.columns),
         ]
+
+        # Add any extra aggregation columns to the column order
+        if hasattr(self, "extra_aggs") and self.extra_aggs:
+            col_order.extend(self.extra_aggs.keys())
+
         self._df = self.table.execute()[col_order]
         return self._df
 
@@ -484,10 +525,10 @@ def _compute_rfm(self, df: ibis.Table, current_date: datetime.date) -> ibis.Tabl
             order_by=[ibis.asc(customer_metrics.recency_days), ibis.asc(customer_metrics.customer_id)],
         )
         window_frequency = ibis.window(
-            order_by=[ibis.desc(customer_metrics.frequency), ibis.asc(customer_metrics.customer_id)],
+            order_by=[ibis.asc(customer_metrics.frequency), ibis.asc(customer_metrics.customer_id)],
        )
         window_monetary = ibis.window(
-            order_by=[ibis.desc(customer_metrics.monetary), ibis.asc(customer_metrics.customer_id)],
+            order_by=[ibis.asc(customer_metrics.monetary), ibis.asc(customer_metrics.customer_id)],
         )
 
         rfm_scores = customer_metrics.mutate(
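
The extra_aggs plumbing above simply merges user-supplied aggregations into the same aggs dict that feeds group_by().aggregate(). A minimal usage sketch, assuming the default column names "customer_id", "unit_spend", and "transaction_id" (what ColumnHelper appears to resolve to); the "store_id" column and "distinct_stores" output name are illustrative, mirroring the tests below:

import pandas as pd

from pyretailscience.analysis.segmentation import SegTransactionStats

df = pd.DataFrame(
    {
        "customer_id": [1, 1, 2, 2, 3, 3],
        "unit_spend": [100.0, 150.0, 200.0, 250.0, 300.0, 350.0],
        "transaction_id": [101, 102, 103, 104, 105, 106],
        "segment_name": ["A", "A", "B", "B", "A", "A"],
        "store_id": [1, 2, 1, 3, 2, 4],
    },
)

# Each extra_aggs entry maps an output column name to a
# (source_column, ibis_aggregation_name) tuple.
seg_stats = SegTransactionStats(
    df,
    segment_col="segment_name",
    extra_aggs={"distinct_stores": ("store_id", "nunique")},
)

# The result gains a distinct_stores column for each segment plus the Total row.
print(seg_stats.df[["segment_name", "distinct_stores"]])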

tests/analysis/test_segmentation.py

Lines changed: 89 additions & 2 deletions
@@ -319,6 +319,93 @@ def test_handles_empty_dataframe_with_errors(self):
         with pytest.raises(ValueError):
             SegTransactionStats(df, "segment_name")
 
+    def test_extra_aggs_functionality(self):
+        """Test that the extra_aggs parameter works correctly."""
+        # Constants for expected values
+        segment_a_store_count = 3  # Segment A has stores 1, 2, 4
+        segment_b_store_count = 2  # Segment B has stores 1, 3
+        total_store_count = 4  # Total has stores 1, 2, 3, 4
+
+        segment_a_product_count = 3  # Segment A has products 10, 20, 40
+        segment_b_product_count = 2  # Segment B has products 10, 30
+        total_product_count = 4  # Total has products 10, 20, 30, 40
+        df = pd.DataFrame(
+            {
+                cols.customer_id: [1, 1, 2, 2, 3, 3],
+                cols.unit_spend: [100.0, 150.0, 200.0, 250.0, 300.0, 350.0],
+                cols.transaction_id: [101, 102, 103, 104, 105, 106],
+                "segment_name": ["A", "A", "B", "B", "A", "A"],
+                "store_id": [1, 2, 1, 3, 2, 4],
+                "product_id": [10, 20, 10, 30, 20, 40],
+            },
+        )
+
+        # Test with a single extra aggregation
+        seg_stats = SegTransactionStats(df, "segment_name", extra_aggs={"distinct_stores": ("store_id", "nunique")})
+
+        # Verify the extra column exists and has correct values
+        assert "distinct_stores" in seg_stats.df.columns
+
+        # Sort by segment_name to ensure consistent order
+        result_df = seg_stats.df.sort_values("segment_name").reset_index(drop=True)
+
+        assert result_df.loc[0, "distinct_stores"] == segment_a_store_count  # Segment A
+        assert result_df.loc[1, "distinct_stores"] == segment_b_store_count  # Segment B
+        assert result_df.loc[2, "distinct_stores"] == total_store_count  # Total
+
+        # Test with multiple extra aggregations
+        seg_stats_multi = SegTransactionStats(
+            df,
+            "segment_name",
+            extra_aggs={
+                "distinct_stores": ("store_id", "nunique"),
+                "distinct_products": ("product_id", "nunique"),
+            },
+        )
+
+        # Verify both extra columns exist
+        assert "distinct_stores" in seg_stats_multi.df.columns
+        assert "distinct_products" in seg_stats_multi.df.columns
+
+        # Sort by segment_name to ensure consistent order
+        result_df_multi = seg_stats_multi.df.sort_values("segment_name").reset_index(drop=True)
+
+        assert result_df_multi.loc[0, "distinct_products"] == segment_a_product_count  # Segment A
+        assert result_df_multi.loc[1, "distinct_products"] == segment_b_product_count  # Segment B
+        assert result_df_multi.loc[2, "distinct_products"] == total_product_count  # Total
+
+    def test_extra_aggs_with_invalid_column(self):
+        """Test that an error is raised when an invalid column is specified in extra_aggs."""
+        df = pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3],
+                cols.unit_spend: [100.0, 200.0, 300.0],
+                cols.transaction_id: [101, 102, 103],
+                "segment_name": ["A", "B", "A"],
+            },
+        )
+
+        with pytest.raises(ValueError) as excinfo:
+            SegTransactionStats(df, "segment_name", extra_aggs={"invalid_agg": ("nonexistent_column", "nunique")})
+
+        assert "does not exist in the data" in str(excinfo.value)
+
+    def test_extra_aggs_with_invalid_function(self):
+        """Test that an error is raised when an invalid function is specified in extra_aggs."""
+        df = pd.DataFrame(
+            {
+                cols.customer_id: [1, 2, 3],
+                cols.unit_spend: [100.0, 200.0, 300.0],
+                cols.transaction_id: [101, 102, 103],
+                "segment_name": ["A", "B", "A"],
+            },
+        )
+
+        with pytest.raises(ValueError) as excinfo:
+            SegTransactionStats(df, "segment_name", extra_aggs={"invalid_agg": (cols.customer_id, "invalid_function")})
+
+        assert "not available for column" in str(excinfo.value)
+
 
 class TestHMLSegmentation:
     """Tests for the HMLSegmentation class."""
@@ -428,7 +515,7 @@ def test_correct_rfm_segmentation(self, base_df):
         expected_df = pd.DataFrame(
             {
                 "customer_id": [1, 2, 3, 4, 5],
-                "rfm_segment": [104, 312, 423, 30, 241],
+                "rfm_segment": [100, 312, 421, 34, 243],
             },
         ).set_index("customer_id")
 
@@ -505,7 +592,7 @@ def test_rfm_segmentation_with_no_date(self, base_df):
         expected_df = pd.DataFrame(
             {
                 "customer_id": [1, 2, 3, 4, 5],
-                "rfm_segment": [104, 312, 423, 30, 241],
+                "rfm_segment": [100, 312, 421, 34, 243],
             },
        ).set_index("customer_id")
 
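
The revised rfm_segment expectations follow from the window change in segmentation.py: frequency and monetary are now ordered ascending, like recency_days, so a higher raw value maps to a higher score digit. A rough sketch of that direction flip, assuming ntile-style bucketing over the ascending order; the monetary values here are illustrative, not the fixture's base_df:

import pandas as pd

monetary = pd.Series({1: 100.0, 2: 250.0, 3: 400.0, 4: 175.0, 5: 325.0})

# Ascending order: rank 1 is the smallest spend. With five customers and
# five buckets, each gets a distinct 0-4 score; the top spender scores 4.
score = pd.qcut(monetary.rank(method="first"), q=5, labels=False)
print(score.sort_index())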

uv.lock

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default.
