From d9427f5daeea17b746a00382d07023ddc02ca922 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 10:52:59 -0300 Subject: [PATCH 01/22] add custom mask functionalities --- .../utilities/data_masking/base.py | 129 ++++++++++++++++-- .../utilities/data_masking/provider/base.py | 73 +++++++++- 2 files changed, 186 insertions(+), 16 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 9b80e50bd58..4cebcef37cb 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast import functools import logging import warnings @@ -94,8 +95,41 @@ def erase(self, data: tuple, fields: list[str]) -> tuple[str]: ... @overload def erase(self, data: dict, fields: list[str]) -> dict: ... - def erase(self, data: Sequence | Mapping, fields: list[str] | None = None) -> str | list[str] | tuple[str] | dict: - return self._apply_action(data=data, fields=fields, action=self.provider.erase) + @overload + def erase( + self, + data: dict, + fields: list[str], + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + ) -> dict: ... 
+ + def erase( + self, + data: Sequence | Mapping, + fields: list[str] | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + masking_rules: dict | None = None, + ) -> str | list[str] | tuple[str] | dict: + if not data: + return data + if masking_rules: + return self._apply_masking_rules(data, masking_rules) + else: + return self._apply_action( + data=data, + fields=fields, + action=self.provider.erase, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + ) def _apply_action( self, @@ -103,6 +137,10 @@ def _apply_action( fields: list[str] | None, action: Callable, provider_options: dict | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context: str, ): """ @@ -136,11 +174,23 @@ def _apply_action( fields=fields, action=action, provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, **encryption_context, ) else: logger.debug(f"Running action {action.__name__} with the entire data") - return action(data=data, provider_options=provider_options, **encryption_context) + return action( + data=data, + provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + **encryption_context, + ) def _apply_action_to_fields( self, @@ -148,6 +198,10 @@ def _apply_action_to_fields( fields: list, action: Callable, provider_options: dict | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context: str, ) -> dict | str: """ @@ -194,6 +248,8 @@ def _apply_action_to_fields( new_dict = {'a': {'b': {'c': '*****'}}, 'x': 
{'y': '*****'}} ``` """ + if not fields: + raise ValueError("Fields parameter cannot be empty") data_parsed: dict = self._normalize_data_to_parse(fields, data) @@ -204,6 +260,10 @@ def _apply_action_to_fields( self._call_action, action=action, provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, **encryption_context, # type: ignore[arg-type] ) @@ -225,12 +285,6 @@ def _apply_action_to_fields( # For in-place updates, json_parse accepts a callback function # that receives 3 args: field_value, fields, field_name # We create a partial callback to pre-populate known provider options (action, provider opts, enc ctx) - update_callback = functools.partial( - self._call_action, - action=action, - provider_options=provider_options, - **encryption_context, # type: ignore[arg-type] - ) json_parse.update( data_parsed, @@ -239,6 +293,49 @@ def _apply_action_to_fields( return data_parsed + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + """ + Apply masking rules to data, supporting different rules for each field. 
+ """ + result = data.copy() + + for path, rule in masking_rules.items(): + try: + # Handle nested paths (e.g., 'address.street') + parts = path.split(".") + current = result + + for part in parts[:-1]: + if isinstance(current[part], str) and current[part].startswith("{"): + try: + current[part] = ast.literal_eval(current[part]) + except (ValueError, SyntaxError): + continue + current = current[part] + + final_field = parts[-1] + + # Apply masking rule to the target field + if final_field in current: + current[final_field] = self.provider.erase(str(current[final_field]), **rule) + + except (KeyError, TypeError, AttributeError): + # Log warning if field not found or invalid path + warnings.warn(f"Could not apply masking rule for path: {path}", stacklevel=2) + continue + + return result + + def _mask_nested_field(self, data: dict, field_path: str, mask_function): + keys = field_path.split(".") + current = data + for key in keys[:-1]: + current = current.get(key, {}) + if not isinstance(current, dict): + return # Caminho inválido + if keys[-1] in current: + current[keys[-1]] = mask_function(current[keys[-1]]) + @staticmethod def _call_action( field_value: Any, @@ -246,6 +343,10 @@ def _call_action( field_name: str, action: Callable, provider_options: dict[str, Any] | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context, ) -> None: """ @@ -263,7 +364,15 @@ def _call_action( Returns: - fields[field_name]: Returns the processed field value """ - fields[field_name] = action(field_value, provider_options=provider_options, **encryption_context) + fields[field_name] = action( + field_value, + provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + **encryption_context, + ) return fields[field_name] def _normalize_data_to_parse(self, fields: list, data: str | 
dict) -> dict: diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 28bc8384f8d..6a5d806f056 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -2,10 +2,14 @@ import functools import json +import re from typing import Any, Callable, Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING +PRESERVE_CHARS = set("-_. ") +_regex_cache = {} + class BaseProvider: """ @@ -63,7 +67,16 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte """ raise NotImplementedError("Subclasses must implement decrypt()") - def erase(self, data, **kwargs) -> Iterable[str]: + def erase( + self, + data, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + masking_rules: dict | None = None, + **kwargs, + ) -> Iterable[str]: """ This method irreversibly erases data. @@ -72,10 +85,58 @@ def erase(self, data, **kwargs) -> Iterable[str]: If the data to be erased is of an iterable type like `list`, `tuple`, or `set`, this method will return a new object of the same type as the - input data but with each element replaced by the string "*****". + input data but with each element replaced by the string "*****" or following one of the custom masks. 
""" - if isinstance(data, (str, dict, bytes)): + result = DATA_MASKING_STRING + + if data: + if isinstance(data, str): + if custom_mask: + if mask_pattern: + result = self._pattern_mask(data, mask_pattern) + elif regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) + else: + result = self._custom_erase(data, **kwargs) + elif isinstance(data, dict): + if masking_rules: + result = self._apply_masking_rules(data, masking_rules) + elif isinstance(data, (list, tuple, set)): + result = type(data)( + self.erase( + item, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) + for item in data + ) + + return result + + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + return { + key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) + for key, value in data.items() + } + + def _pattern_mask(self, data: str, pattern: str) -> str: + return pattern[: len(data)] if len(pattern) >= len(data) else pattern + + def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: + try: + if regex_pattern not in _regex_cache: + _regex_cache[regex_pattern] = re.compile(regex_pattern) + return _regex_cache[regex_pattern].sub(mask_format, data) + except re.error: return DATA_MASKING_STRING - elif isinstance(data, (list, tuple, set)): - return type(data)([DATA_MASKING_STRING] * len(data)) - return DATA_MASKING_STRING + + def _custom_erase(self, data: str, **kwargs) -> str: + if not data: + return "" + + # Use join with list comprehension instead of building list incrementally + return "".join("*" if char not in PRESERVE_CHARS else char for char in data) From 796bd898de8e95c8342994449e157ccad15d1ce7 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 14:05:39 -0300 Subject: [PATCH 02/22] change flags name to more intuitive --- .../utilities/data_masking/base.py | 
30 +++++++++---------- .../utilities/data_masking/provider/base.py | 17 +++++------ 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 4cebcef37cb..f08e10371f7 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -100,8 +100,8 @@ def erase( self, data: dict, fields: list[str], - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, ) -> dict: ... @@ -110,8 +110,8 @@ def erase( self, data: Sequence | Mapping, fields: list[str] | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, @@ -125,8 +125,8 @@ def erase( data=data, fields=fields, action=self.provider.erase, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, ) @@ -137,8 +137,8 @@ def _apply_action( fields: list[str] | None, action: Callable, provider_options: dict | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context: str, @@ -174,8 +174,8 @@ def _apply_action( fields=fields, action=action, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, @@ -185,8 +185,8 @@ def _apply_action( return action( data=data, provider_options=provider_options, + 
dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, @@ -198,8 +198,8 @@ def _apply_action_to_fields( fields: list, action: Callable, provider_options: dict | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context: str, @@ -260,8 +260,8 @@ def _apply_action_to_fields( self._call_action, action=action, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, # type: ignore[arg-type] @@ -343,8 +343,8 @@ def _call_action( field_name: str, action: Callable, provider_options: dict[str, Any] | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context, @@ -367,8 +367,8 @@ def _call_action( fields[field_name] = action( field_value, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 6a5d806f056..4337a0e6502 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -70,8 +70,8 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte def erase( self, data, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | 
None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, @@ -91,13 +91,12 @@ def erase( if data: if isinstance(data, str): + if dynamic_mask: + result = self._custom_erase(data, **kwargs) if custom_mask: - if mask_pattern: - result = self._pattern_mask(data, mask_pattern) - elif regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - else: - result = self._custom_erase(data, **kwargs) + result = self._pattern_mask(data, custom_mask) + if regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) elif isinstance(data, dict): if masking_rules: result = self._apply_masking_rules(data, masking_rules) @@ -105,8 +104,8 @@ def erase( result = type(data)( self.erase( item, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, masking_rules=masking_rules, From d9319179fa9bacde7f10e0e301a980b4c4ebf0c1 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 18:50:30 -0300 Subject: [PATCH 03/22] fix type check error --- .../utilities/data_masking/provider/base.py | 81 ++++++++++++------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 4337a0e6502..382264c220e 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -3,8 +3,9 @@ import functools import json import re -from typing import Any, Callable, Iterable +from typing import Any, Callable +# , Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING PRESERVE_CHARS = set("-_. 
") @@ -69,14 +70,14 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte def erase( self, - data, + data: Any, dynamic_mask: bool | None = None, custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, **kwargs, - ) -> Iterable[str]: + ) -> str | dict | list | tuple | set: """ This method irreversibly erases data. @@ -85,47 +86,68 @@ def erase( If the data to be erased is of an iterable type like `list`, `tuple`, or `set`, this method will return a new object of the same type as the - input data but with each element replaced by the string "*****" or following one of the custom masks. + input data but with each element masked according to the specified rules. """ - result = DATA_MASKING_STRING - - if data: - if isinstance(data, str): - if dynamic_mask: - result = self._custom_erase(data, **kwargs) - if custom_mask: - result = self._pattern_mask(data, custom_mask) - if regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - elif isinstance(data, dict): - if masking_rules: - result = self._apply_masking_rules(data, masking_rules) - elif isinstance(data, (list, tuple, set)): - result = type(data)( - self.erase( - item, - dynamic_mask=dynamic_mask, - custom_mask=custom_mask, - regex_pattern=regex_pattern, - mask_format=mask_format, - masking_rules=masking_rules, - **kwargs, - ) - for item in data + result = None + + # Handle empty or None data + if not data: + result = DATA_MASKING_STRING if isinstance(data, (str, bytes)) else data + + # Handle string data + elif isinstance(data, str): + if regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) + elif custom_mask: + result = self._pattern_mask(data, custom_mask) + elif dynamic_mask: + result = self._custom_erase(data, **kwargs) + else: + result = DATA_MASKING_STRING + + # Handle dictionary data + elif isinstance(data, dict): + result = ( 
+ self._apply_masking_rules(data, masking_rules) + if masking_rules + else {k: DATA_MASKING_STRING for k in data} + ) + + # Handle iterable data (list, tuple, set) + elif isinstance(data, (list, tuple, set)): + masked_data = ( + self.erase( + item, + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, ) + for item in data + ) + result = type(data)(masked_data) + + # Default case + else: + result = DATA_MASKING_STRING return result def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + """Apply masking rules to dictionary data.""" return { key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) for key, value in data.items() } def _pattern_mask(self, data: str, pattern: str) -> str: + """Apply pattern masking to string data.""" return pattern[: len(data)] if len(pattern) >= len(data) else pattern def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: + """Apply regex masking to string data.""" try: if regex_pattern not in _regex_cache: _regex_cache[regex_pattern] = re.compile(regex_pattern) @@ -137,5 +159,4 @@ def _custom_erase(self, data: str, **kwargs) -> str: if not data: return "" - # Use join with list comprehension instead of building list incrementally return "".join("*" if char not in PRESERVE_CHARS else char for char in data) From 4c0070c30050749d4b570f0fc10916624ee52081 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 8 Jan 2025 13:55:54 -0300 Subject: [PATCH 04/22] add draft documentation --- docs/utilities/data_masking.md | 21 +++++++++- .../data_masking/src/custom_data_masking.py | 38 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 examples/data_masking/src/custom_data_masking.py diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index 162292e79a0..b1485dac6df 100644 --- a/docs/utilities/data_masking.md +++ 
b/docs/utilities/data_masking.md @@ -43,7 +43,7 @@ stateDiagram-v2 ## Terminology -**Erasing** replaces sensitive information **irreversibly** with a non-sensitive placeholder _(`*****`)_. This operation replaces data in-memory, making it a one-way action. +**Erasing** replaces sensitive information **irreversibly** with a non-sensitive placeholder _(`*****`)_, or with a customized mask. This operation replaces data in-memory, making it a one-way action. **Encrypting** transforms plaintext into ciphertext using an encryption algorithm and a cryptographic key. It allows you to encrypt any sensitive data, so only allowed personnel to decrypt it. Learn more about encryption [here](https://aws.amazon.com/blogs/security/importance-of-encryption-and-how-aws-can-help/){target="_blank"}. @@ -117,6 +117,25 @@ Erasing will remove the original data and replace it with a `*****`. This means --8<-- "examples/data_masking/src/getting_started_erase_data_output.json" ``` +The `erase` method also supports additional flags for more advanced and flexible masking: + +| Flag | Behavior | +| ---------------- | ----------------------------------------------------------| +| `dynamic_mask`(bool) | When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking.| +| `custom_mask`(str) | Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX".| +| `regex_pattern`(str) | Defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`.| +| `mask_format`(str) | Specifies the format to use when replacing parts of the string matched by `regex_pattern`. 
It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved.| +| `masking_rules`(dict) | Allows you to apply different masking rules (flags) for each data field.| + +=== "custom_data_masking.py" + ```python hl_lines="13 17 21 25 36" + --8<-- "examples/data_masking/src/custom_data_masking.py" + ``` +=== "generic_data_input.json" + ```json hl_lines="6 7 9 12" + --8<-- "examples/data_masking/src/generic_data_input.json" + ``` + ### Encrypting data ???+ note "About static typing and encryption" diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py new file mode 100644 index 00000000000..a99b9045cac --- /dev/null +++ b/examples/data_masking/src/custom_data_masking.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from aws_lambda_powertools.utilities.data_masking import DataMasking +from aws_lambda_powertools.utilities.typing import LambdaContext + +data_masker = DataMasking() + + +def lambda_handler(event: dict, context: LambdaContext) -> dict: + data: dict = event.get("body", {}) + + # Default erase (*****) + default_erased = data_masker.erase(data, fields=["address.zip"]) + # 'street': '*****' + + # dynamic_mask + dynamic_mask = data_masker.erase(data, fields=["address.zip"], dynamic_mask=True) + #'street': '*** **** **' + + # custom_mask + custom_mask = data_masker.erase(data, fields=["address.zip"], custom_mask="XX") + #'zip': 'XX' + + # regex_pattern and mask_format + regex_pattern = data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3") + #'email': 'j****@example.com' + + # Masking rules for each field + masking_rules = { + "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, + "age": {"dynamic_mask": True}, + "address.zip": {"dynamic_mask": True, "custom_mask": "xxx"}, + "address.street": {"dynamic_mask": False}, + } + + masking_rules_erase = 
data_masker.erase(data, masking_rules=masking_rules) + + return default_erased, dynamic_mask, custom_mask, regex_pattern, masking_rules_erase From ae81dce2d47967145b9f2a223356996a986ea0fb Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 8 Jan 2025 16:43:16 -0300 Subject: [PATCH 05/22] change doc examples --- .../utilities/data_masking/base.py | 3 ++ docs/utilities/data_masking.md | 43 +++++++++++++------ .../data_masking/src/custom_data_masking.py | 20 +-------- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index f08e10371f7..7695b41bd6b 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -95,6 +95,9 @@ def erase(self, data: tuple, fields: list[str]) -> tuple[str]: ... @overload def erase(self, data: dict, fields: list[str]) -> dict: ... + @overload + def erase(self, data: dict[Any, Any], *, masking_rules: dict[str, object]) -> dict[Any, Any]: ... + @overload def erase( self, diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index b1485dac6df..c90abfc236e 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -119,22 +119,37 @@ Erasing will remove the original data and replace it with a `*****`. This means The `erase` method also supports additional flags for more advanced and flexible masking: -| Flag | Behavior | -| ---------------- | ----------------------------------------------------------| -| `dynamic_mask`(bool) | When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking.| -| `custom_mask`(str) | Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. 
For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX".| -| `regex_pattern`(str) | Defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`.| -| `mask_format`(str) | Specifies the format to use when replacing parts of the string matched by `regex_pattern`. It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved.| -| `masking_rules`(dict) | Allows you to apply different masking rules (flags) for each data field.| - -=== "custom_data_masking.py" - ```python hl_lines="13 17 21 25 36" +=== "dynamic_mask" + + (bool) When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking. + + > Expression: `data_masker.erase(data, fields=["address.zip"], dynamic_mask=True)` + + > Field result: `'street': '*** **** **'` + +=== "custom_mask" + + (str) Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX". + + > Expression: `data_masker.erase(data, fields=["address.zip"], custom_mask="XX")` + + > Field result: `'zip': 'XX'` + +=== "regex_pattern & mask_format" + + (str) `regex_pattern` defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`. + `mask_format` specifies the format to use when replacing parts of the string matched by `regex_pattern`. 
It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved. + + > Expression: `data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3")` + + > Field result: `'email': 'j****@example.com'` + +=== "masking_rules" + + (dict) Allows you to apply different masking rules (flags) for each data field. + ```python hl_lines="20" --8<-- "examples/data_masking/src/custom_data_masking.py" ``` -=== "generic_data_input.json" - ```json hl_lines="6 7 9 12" - --8<-- "examples/data_masking/src/generic_data_input.json" - ``` ### Encrypting data diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py index a99b9045cac..24a5d51bc81 100644 --- a/examples/data_masking/src/custom_data_masking.py +++ b/examples/data_masking/src/custom_data_masking.py @@ -9,22 +9,6 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: data: dict = event.get("body", {}) - # Default erase (*****) - default_erased = data_masker.erase(data, fields=["address.zip"]) - # 'street': '*****' - - # dynamic_mask - dynamic_mask = data_masker.erase(data, fields=["address.zip"], dynamic_mask=True) - #'street': '*** **** **' - - # custom_mask - custom_mask = data_masker.erase(data, fields=["address.zip"], custom_mask="XX") - #'zip': 'XX' - - # regex_pattern and mask_format - regex_pattern = data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3") - #'email': 'j****@example.com' - # Masking rules for each field masking_rules = { "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, @@ -33,6 +17,6 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: "address.street": {"dynamic_mask": False}, } - masking_rules_erase = data_masker.erase(data, masking_rules=masking_rules) + result = data_masker.erase(data, masking_rules=masking_rules) - return 
default_erased, dynamic_mask, custom_mask, regex_pattern, masking_rules_erase + return result From 7630b068ed0f9bffa2894f7dc8f0c86c04bba134 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 08:30:26 -0300 Subject: [PATCH 06/22] style: format code with black --- .../utilities/data_masking/base.py | 2 +- .../utilities/data_masking/provider/base.py | 35 ++++++++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 7695b41bd6b..8136c8bcaaf 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -335,7 +335,7 @@ def _mask_nested_field(self, data: dict, field_path: str, mask_function): for key in keys[:-1]: current = current.get(key, {}) if not isinstance(current, dict): - return # Caminho inválido + return if keys[-1] in current: current[keys[-1]] = mask_function(current[keys[-1]]) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 382264c220e..6fa5648e7bc 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -91,11 +91,11 @@ def erase( result = None # Handle empty or None data - if not data: - result = DATA_MASKING_STRING if isinstance(data, (str, bytes)) else data + if data is None or (isinstance(data, (str, list, dict)) and not data): + return data # Handle string data - elif isinstance(data, str): + if isinstance(data, str): if regex_pattern and mask_format: result = self._regex_mask(data, regex_pattern, mask_format) elif custom_mask: @@ -107,15 +107,24 @@ def erase( # Handle dictionary data elif isinstance(data, dict): - result = ( - self._apply_masking_rules(data, masking_rules) - if masking_rules - else {k: DATA_MASKING_STRING for k in data} - ) + if 
masking_rules: + result = self._apply_masking_rules(data, masking_rules) + else: + result = {} + for k, v in data.items(): + result[str(k)] = self.erase( + str(v), + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) # Handle iterable data (list, tuple, set) elif isinstance(data, (list, tuple, set)): - masked_data = ( + masked_data = [ self.erase( item, dynamic_mask=dynamic_mask, @@ -126,16 +135,16 @@ def erase( **kwargs, ) for item in data - ) + ] result = type(data)(masked_data) - # Default case + # Handle other types (int, float, bool, etc.) else: - result = DATA_MASKING_STRING + result = str(data) return result - def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> Any: """Apply masking rules to dictionary data.""" return { key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) From 6e2ec354612b44ceede4c56bc4d56bb602ecc7e4 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 08:49:17 -0300 Subject: [PATCH 07/22] fix format base --- .../utilities/data_masking/provider/base.py | 138 +++++++++++------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 6fa5648e7bc..47079c42484 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -5,7 +5,6 @@ import re from typing import Any, Callable -# , Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING PRESERVE_CHARS = set("-_. 
") @@ -77,56 +76,72 @@ def erase( mask_format: str | None = None, masking_rules: dict | None = None, **kwargs, - ) -> str | dict | list | tuple | set: - """ - This method irreversibly erases data. - - If the data to be erased is of type `str`, `dict`, or `bytes`, - this method will return an erased string, i.e. "*****". - - If the data to be erased is of an iterable type like `list`, `tuple`, - or `set`, this method will return a new object of the same type as the - input data but with each element masked according to the specified rules. - """ - result = None - + ) -> Any: # Handle empty or None data if data is None or (isinstance(data, (str, list, dict)) and not data): return data - # Handle string data - if isinstance(data, str): - if regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - elif custom_mask: - result = self._pattern_mask(data, custom_mask) - elif dynamic_mask: - result = self._custom_erase(data, **kwargs) - else: - result = DATA_MASKING_STRING - - # Handle dictionary data + result = data # Default to returning the original data + + if isinstance(data, (str, int, float)): + result = self._mask_primitive(str(data), dynamic_mask, custom_mask, regex_pattern, mask_format, **kwargs) elif isinstance(data, dict): - if masking_rules: - result = self._apply_masking_rules(data, masking_rules) - else: - result = {} - for k, v in data.items(): - result[str(k)] = self.erase( - str(v), - dynamic_mask=dynamic_mask, - custom_mask=custom_mask, - regex_pattern=regex_pattern, - mask_format=mask_format, - masking_rules=masking_rules, - **kwargs, - ) - - # Handle iterable data (list, tuple, set) + result = self._mask_dict( + data, + dynamic_mask, + custom_mask, + regex_pattern, + mask_format, + masking_rules, + **kwargs, + ) elif isinstance(data, (list, tuple, set)): - masked_data = [ - self.erase( - item, + result = self._mask_iterable( + data, + dynamic_mask, + custom_mask, + regex_pattern, + mask_format, + masking_rules, + 
**kwargs, + ) + + return result + + def _mask_primitive( + self, + data: str, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + **kwargs, + ) -> str: + if regex_pattern and mask_format: + return self._regex_mask(data, regex_pattern, mask_format) + elif custom_mask: + return self._pattern_mask(data, custom_mask) + elif dynamic_mask: + return self._custom_erase(data, **kwargs) + else: + return DATA_MASKING_STRING + + def _mask_dict( + self, + data: dict, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + masking_rules: dict | None, + **kwargs, + ) -> dict: + if masking_rules: + return self._apply_masking_rules(data, masking_rules) + else: + return { + k: self.erase( + v, dynamic_mask=dynamic_mask, custom_mask=custom_mask, regex_pattern=regex_pattern, @@ -134,15 +149,32 @@ def erase( masking_rules=masking_rules, **kwargs, ) - for item in data - ] - result = type(data)(masked_data) + for k, v in data.items() + } - # Handle other types (int, float, bool, etc.) 
- else: - result = str(data) - - return result + def _mask_iterable( + self, + data: list | tuple | set, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + masking_rules: dict | None, + **kwargs, + ) -> list | tuple | set: + masked_data = [ + self.erase( + item, + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) + for item in data + ] + return type(data)(masked_data) def _apply_masking_rules(self, data: dict, masking_rules: dict) -> Any: """Apply masking rules to dictionary data.""" From 93c1544fd31cb282ba15fc86de3b76fd53fbf02c Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 11:39:39 -0300 Subject: [PATCH 08/22] add tests for new masks --- .../utilities/data_masking/base.py | 2 - .../utilities/data_masking/provider/base.py | 15 ++++--- .../test_unit_data_masking.py | 43 +++++++++++++++++++ 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 8136c8bcaaf..23b7a684dde 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -119,8 +119,6 @@ def erase( mask_format: str | None = None, masking_rules: dict | None = None, ) -> str | list[str] | tuple[str] | dict: - if not data: - return data if masking_rules: return self._apply_masking_rules(data, masking_rules) else: diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 47079c42484..02e6406b862 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -77,11 +77,16 @@ def erase( masking_rules: dict | None = None, **kwargs, ) -> Any: - # Handle empty or None data - if 
data is None or (isinstance(data, (str, list, dict)) and not data): - return data - result = data # Default to returning the original data + result = DATA_MASKING_STRING + + if not any([dynamic_mask, custom_mask, regex_pattern, mask_format, masking_rules]): + if isinstance(data, (str, int, float, dict, bytes)): + return DATA_MASKING_STRING + elif isinstance(data, (list, tuple, set)): + return type(data)([DATA_MASKING_STRING] * len(data)) + else: + return DATA_MASKING_STRING if isinstance(data, (str, int, float)): result = self._mask_primitive(str(data), dynamic_mask, custom_mask, regex_pattern, mask_format, **kwargs) @@ -194,7 +199,7 @@ def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: _regex_cache[regex_pattern] = re.compile(regex_pattern) return _regex_cache[regex_pattern].sub(mask_format, data) except re.error: - return DATA_MASKING_STRING + return data def _custom_erase(self, data: str, **kwargs) -> str: if not data: diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index 4fbbc188ceb..cd728904cc7 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -25,6 +25,16 @@ def test_erase_int(data_masker): assert erased_string == DATA_MASKING_STRING +def test_erase_int_custom_mask(data_masker): + # GIVEN an int data type + + # WHEN erase is called with no fields argument + erased_string = data_masker.erase(42, custom_mask="XX") + + # THEN the result is the data masked + assert erased_string == "XX" + + def test_erase_float(data_masker): # GIVEN a float data type @@ -205,3 +215,36 @@ def test_parsing_nonexistent_fields_warning_on_missing_field(): # THEN the "erased" payload is the same of the original assert masked_json_string == data + + +def test_regex_mask(data_masker): + data = "Hello! 
My name is Fulano Ciclano" + regex_pattern = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b" + mask_format = "XXXX XXXX" + + result = data_masker.erase(data, regex_pattern=regex_pattern, mask_format=mask_format) + + assert result == "Hello! My name is XXXX XXXX" + + +def test_erase_json_dict_with_fields_and_masks(data_masker): + # GIVEN the data type is a json representation of a dictionary + data = json.dumps( + { + "a": { + "1": {"None": "hello", "four": "world"}, + "b": {"3": {"4": "goodbye", "e": "world"}}, + }, + }, + ) + + # WHEN erase is called with a list of fields specified + masked_json_string = data_masker.erase(data, fields=["a.'1'.None", "a..'4'"], dynamic_mask=True) + + # THEN the result is only the specified fields are erased + assert masked_json_string == { + "a": { + "1": {"None": "*****", "four": "world"}, + "b": {"3": {"4": "*******", "e": "world"}}, + }, + } From 92d474020b65100fb23d4903fd51e42a5f38ce0c Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 11:22:39 -0300 Subject: [PATCH 09/22] sub header for custom mask in docs --- docs/utilities/data_masking.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index c90abfc236e..596fa2c3fa3 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -117,6 +117,8 @@ Erasing will remove the original data and replace it with a `*****`. 
This means --8<-- "examples/data_masking/src/getting_started_erase_data_output.json" ``` +#### Custom masking + The `erase` method also supports additional flags for more advanced and flexible masking: === "dynamic_mask" From d9535d6ff78ee638c5ea9b4813c16b64c8dd8bdc Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 13:03:04 -0300 Subject: [PATCH 10/22] masking rules to handle complex nest --- .../utilities/data_masking/base.py | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 23b7a684dde..0dd41522d61 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -1,6 +1,5 @@ from __future__ import annotations -import ast import functools import logging import warnings @@ -296,33 +295,55 @@ def _apply_action_to_fields( def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: """ - Apply masking rules to data, supporting different rules for each field. + Apply masking rules to data, supporting both simple field names and complex path expressions. 
+ + Args: + data: The dictionary containing data to mask + masking_rules: Dictionary mapping field names or path expressions to masking rules + + Returns: + dict: The masked data dictionary """ result = data.copy() for path, rule in masking_rules.items(): try: - # Handle nested paths (e.g., 'address.street') - parts = path.split(".") - current = result - - for part in parts[:-1]: - if isinstance(current[part], str) and current[part].startswith("{"): - try: - current[part] = ast.literal_eval(current[part]) - except (ValueError, SyntaxError): - continue - current = current[part] - - final_field = parts[-1] - - # Apply masking rule to the target field - if final_field in current: - current[final_field] = self.provider.erase(str(current[final_field]), **rule) - - except (KeyError, TypeError, AttributeError): - # Log warning if field not found or invalid path - warnings.warn(f"Could not apply masking rule for path: {path}", stacklevel=2) + if ".." in path: + # Handle recursive descent paths (e.g., "address..name") + base_path, field = path.split("..") + jsonpath_expr = parse(f"$.{base_path}..{field}") + elif "[" in path: + # Handle array notation paths (e.g., "address[*].street") + jsonpath_expr = parse(f"$.{path}") + else: + # Handle simple field names (e.g., "email") + jsonpath_expr = parse(f"$.{path}") + + matches = jsonpath_expr.find(result) + + if not matches: + warnings.warn(f"No matches found for path: {path}", stacklevel=2) + continue + + for match in matches: + try: + value = match.value + if value is not None: + if isinstance(value, dict): + # Handle dictionary values by masking each field + for k, v in value.items(): + if v is not None: + value[k] = self.provider.erase(str(v), **rule) + else: + masked_value = self.provider.erase(str(value), **rule) + match.full_path.update(result, masked_value) + + except Exception as e: + warnings.warn(f"Error masking value for path {path}: {str(e)}", stacklevel=2) + continue + + except Exception as e: + 
warnings.warn(f"Error processing path {path}: {str(e)}", stacklevel=2) continue return result From 9dc2b562e685d1ec1f37e691403c32eb7644f8c0 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 13:18:25 -0300 Subject: [PATCH 11/22] add test for masking rules --- .../test_unit_data_masking.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index cd728904cc7..8eb0f955958 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -248,3 +248,37 @@ def test_erase_json_dict_with_fields_and_masks(data_masker): "b": {"3": {"4": "*******", "e": "world"}}, }, } + + +def test_erase_json_dict_with_complex_masking_rules(data_masker): + # GIVEN the data type is a json representation of a dictionary with nested and filtered paths + data = json.dumps( + { + "email": "john.doe@example.com", + "age": 30, + "addres": [ + {"postcode": 13000, "street": "123 Main St", "details": {"name": "Home", "type": "Primary"}}, + {"postcode": 14000, "street": "456 Other Street", "details": {"name": "Office", "type": "Secondary"}}, + ], + }, + ) + + # WHEN erase is called with complex masking rules + masking_rules = { + "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, + "age": {"dynamic_mask": True}, + "addres..name": {"custom_mask": "xxx"}, + "addres[?(@.postcode > 12000)]": {"dynamic_mask": True}, + } + + masked_json_string = data_masker.erase(data, masking_rules=masking_rules) + + # THEN the result should have all specified fields masked according to their rules + assert masked_json_string == { + "email": "j****@example.com", + "age": "*****", + "addres": [ + {"postcode": "*****", "street": "*** *** **", "details": {"name": "xxx", "type": "*******"}}, + {"postcode": "*****", "street": "*** ***** 
******", "details": {"name": "xxx", "type": "********"}}, + ], + } From 63c7918876c596c1b1d890885b2cee19a9d80a29 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Fri, 31 Jan 2025 09:33:55 -0300 Subject: [PATCH 12/22] modifications based on the feedback --- .../utilities/data_masking/base.py | 25 +++----------- docs/utilities/data_masking.md | 12 ++++++- .../data_masking/src/custom_data_masking.py | 4 +-- .../src/output_custom_masking.json | 29 ++++++++++++++++ .../src/payload_custom_masking.json | 34 +++++++++++++++++++ .../test_unit_data_masking.py | 6 +++- 6 files changed, 86 insertions(+), 24 deletions(-) create mode 100644 examples/data_masking/src/output_custom_masking.json create mode 100644 examples/data_masking/src/payload_custom_masking.json diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 0dd41522d61..00650789696 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -3,6 +3,7 @@ import functools import logging import warnings +from copy import deepcopy from typing import TYPE_CHECKING, Any, Callable, Mapping, Sequence, overload from jsonpath_ng.ext import parse @@ -304,21 +305,11 @@ def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: Returns: dict: The masked data dictionary """ - result = data.copy() + result = deepcopy(data) for path, rule in masking_rules.items(): try: - if ".." 
in path: - # Handle recursive descent paths (e.g., "address..name") - base_path, field = path.split("..") - jsonpath_expr = parse(f"$.{base_path}..{field}") - elif "[" in path: - # Handle array notation paths (e.g., "address[*].street") - jsonpath_expr = parse(f"$.{path}") - else: - # Handle simple field names (e.g., "email") - jsonpath_expr = parse(f"$.{path}") - + jsonpath_expr = parse(f"$.{path}") matches = jsonpath_expr.find(result) if not matches: @@ -329,14 +320,8 @@ def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: try: value = match.value if value is not None: - if isinstance(value, dict): - # Handle dictionary values by masking each field - for k, v in value.items(): - if v is not None: - value[k] = self.provider.erase(str(v), **rule) - else: - masked_value = self.provider.erase(str(value), **rule) - match.full_path.update(result, masked_value) + masked_value = self.provider.erase(str(value), **rule) + match.full_path.update(result, masked_value) except Exception as e: warnings.warn(f"Error masking value for path {path}: {str(e)}", stacklevel=2) diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index 596fa2c3fa3..94e470aa965 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -123,7 +123,7 @@ The `erase` method also supports additional flags for more advanced and flexible === "dynamic_mask" - (bool) When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking. + (bool) Enables dynamic masking behavior when set to `True`, by maintaining the original length and structure of the text replacing with *. 
> Expression: `data_masker.erase(data, fields=["address.zip"], dynamic_mask=True)` @@ -152,6 +152,16 @@ The `erase` method also supports additional flags for more advanced and flexible ```python hl_lines="20" --8<-- "examples/data_masking/src/custom_data_masking.py" ``` +=== "Input example" + + ```json + --8<-- "examples/data_masking/src/payload_custom_masking.json" + ``` +=== "Masking rules output example" + + ```json hl_lines="4 5 10 21" + --8<-- "examples/data_masking/src/output_custom_masking.json" + ``` ### Encrypting data diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py index 24a5d51bc81..7b96f6f379f 100644 --- a/examples/data_masking/src/custom_data_masking.py +++ b/examples/data_masking/src/custom_data_masking.py @@ -13,8 +13,8 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: masking_rules = { "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, "age": {"dynamic_mask": True}, - "address.zip": {"dynamic_mask": True, "custom_mask": "xxx"}, - "address.street": {"dynamic_mask": False}, + "address.zip": {"custom_mask": "xxx"}, + "$.other_address[?(@.postcode > 12000)]": {"custom_mask": "Masked"}, } result = data_masker.erase(data, masking_rules=masking_rules) diff --git a/examples/data_masking/src/output_custom_masking.json b/examples/data_masking/src/output_custom_masking.json new file mode 100644 index 00000000000..0571da99808 --- /dev/null +++ b/examples/data_masking/src/output_custom_masking.json @@ -0,0 +1,29 @@ +{ + "id": 1, + "name": "John Doe", + "age": "**", + "email": "j****@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "zip": "xxx", + "postcode": 12345, + "product": { + "name": "Car" + } + }, + "other_address": [ + { + "postcode": 11345, + "street": "123 Any Drive" + }, + "Masked" + ], + "company_address": { + "street": "456 ACME Ave", + "city": "Anytown", + "state": "CA", + "zip": "12345" + } +} \ No 
newline at end of file diff --git a/examples/data_masking/src/payload_custom_masking.json b/examples/data_masking/src/payload_custom_masking.json new file mode 100644 index 00000000000..d50b715ffa4 --- /dev/null +++ b/examples/data_masking/src/payload_custom_masking.json @@ -0,0 +1,34 @@ +{ + "body": { + "id": 1, + "name": "Jane Doe", + "age": 30, + "email": "janedoe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "zip": "12345", + "postcode": 12345, + "product": { + "name": "Car" + } + }, + "other_address": [ + { + "postcode": 11345, + "street": "123 Any Drive" + }, + { + "postcode": 67890, + "street": "100 Main Street," + } + ], + "company_address": { + "street": "456 ACME Ave", + "city": "Anytown", + "state": "CA", + "zip": "12345" + } + } +} \ No newline at end of file diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index 8eb0f955958..93588445034 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -218,12 +218,16 @@ def test_parsing_nonexistent_fields_warning_on_missing_field(): def test_regex_mask(data_masker): - data = "Hello! My name is Fulano Ciclano" + # GIVEN a str data type + data = "Hello! My name is John Doe" + + # WHEN erase is called with regex pattern and mask format regex_pattern = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b" mask_format = "XXXX XXXX" result = data_masker.erase(data, regex_pattern=regex_pattern, mask_format=mask_format) + # THEN the result is the regex part masked by the masked format assert result == "Hello! 
My name is XXXX XXXX" From 84fc4fe867297be7b948627cbe743c92732dd016 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Mon, 3 Feb 2025 11:59:49 -0300 Subject: [PATCH 13/22] mypy and tests modification --- .../utilities/data_masking/base.py | 52 ++++++------------- .../utilities/data_masking/provider/base.py | 4 +- .../test_unit_data_masking.py | 25 +++------ 3 files changed, 26 insertions(+), 55 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 00650789696..adf0ba72f0e 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -4,7 +4,7 @@ import logging import warnings from copy import deepcopy -from typing import TYPE_CHECKING, Any, Callable, Mapping, Sequence, overload +from typing import TYPE_CHECKING, Any, Callable, Mapping, Sequence from jsonpath_ng.ext import parse @@ -66,6 +66,10 @@ def encrypt( fields=None, action=self.provider.encrypt, provider_options=provider_options or {}, + dynamic_mask=None, + custom_mask=None, + regex_pattern=None, + mask_format=None, **encryption_context, ) @@ -80,47 +84,25 @@ def decrypt( fields=None, action=self.provider.decrypt, provider_options=provider_options or {}, + dynamic_mask=None, + custom_mask=None, + regex_pattern=None, + mask_format=None, **encryption_context, ) - @overload - def erase(self, data, fields: None) -> str: ... - - @overload - def erase(self, data: list, fields: list[str]) -> list[str]: ... - - @overload - def erase(self, data: tuple, fields: list[str]) -> tuple[str]: ... - - @overload - def erase(self, data: dict, fields: list[str]) -> dict: ... - - @overload - def erase(self, data: dict[Any, Any], *, masking_rules: dict[str, object]) -> dict[Any, Any]: ... 
- - @overload - def erase( - self, - data: dict, - fields: list[str], - dynamic_mask: bool | None = None, - custom_mask: str | None = None, - regex_pattern: str | None = None, - mask_format: str | None = None, - ) -> dict: ... - def erase( self, - data: Sequence | Mapping, + data: Any, fields: list[str] | None = None, dynamic_mask: bool | None = None, custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, - ) -> str | list[str] | tuple[str] | dict: + ) -> Any: if masking_rules: - return self._apply_masking_rules(data, masking_rules) + return self._apply_masking_rules(data=data, masking_rules=masking_rules) else: return self._apply_action( data=data, @@ -142,8 +124,8 @@ def _apply_action( custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, - **encryption_context: str, - ): + **kwargs: Any, + ) -> Any: """ Helper method to determine whether to apply a given action to the entire input data or to specific fields if the 'fields' argument is specified. @@ -159,8 +141,6 @@ def _apply_action( and returns the modified value. provider_options : dict Provider specific keyword arguments to propagate; used as an escape hatch. - encryption_context: str - Encryption context to use in encrypt and decrypt operations. 
Returns ------- @@ -179,7 +159,7 @@ def _apply_action( custom_mask=custom_mask, regex_pattern=regex_pattern, mask_format=mask_format, - **encryption_context, + **kwargs, ) else: logger.debug(f"Running action {action.__name__} with the entire data") @@ -190,7 +170,7 @@ def _apply_action( custom_mask=custom_mask, regex_pattern=regex_pattern, mask_format=mask_format, - **encryption_context, + **kwargs, ) def _apply_action_to_fields( diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 02e6406b862..db309f9e51b 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -28,7 +28,7 @@ def encrypt(self, data) -> str: def decrypt(self, data) -> Any: # Implementation logic for data decryption - def erase(self, data) -> str | Iterable: + def erase(self, data) -> Any | Iterable: # Implementation logic for data masking pass @@ -78,7 +78,7 @@ def erase( **kwargs, ) -> Any: - result = DATA_MASKING_STRING + result: Any = DATA_MASKING_STRING if not any([dynamic_mask, custom_mask, regex_pattern, mask_format, masking_rules]): if isinstance(data, (str, int, float, dict, bytes)): diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index 93588445034..9ec1cc4e1f5 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -256,23 +256,17 @@ def test_erase_json_dict_with_fields_and_masks(data_masker): def test_erase_json_dict_with_complex_masking_rules(data_masker): # GIVEN the data type is a json representation of a dictionary with nested and filtered paths - data = json.dumps( - { - "email": "john.doe@example.com", - "age": 30, - "addres": [ - {"postcode": 13000, "street": "123 Main St", "details": {"name": 
"Home", "type": "Primary"}}, - {"postcode": 14000, "street": "456 Other Street", "details": {"name": "Office", "type": "Secondary"}}, - ], - }, - ) + data = { + "email": "johndoe@example.com", + "age": 30, + "address": {"zip": 13000, "street": "123 Main St", "details": {"name": "Home", "type": "Primary"}}, + } # WHEN erase is called with complex masking rules masking_rules = { "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, "age": {"dynamic_mask": True}, - "addres..name": {"custom_mask": "xxx"}, - "addres[?(@.postcode > 12000)]": {"dynamic_mask": True}, + "address.zip": {"custom_mask": "xxx"}, } masked_json_string = data_masker.erase(data, masking_rules=masking_rules) @@ -280,9 +274,6 @@ def test_erase_json_dict_with_complex_masking_rules(data_masker): # THEN the result should have all specified fields masked according to their rules assert masked_json_string == { "email": "j****@example.com", - "age": "*****", - "addres": [ - {"postcode": "*****", "street": "*** *** **", "details": {"name": "xxx", "type": "*******"}}, - {"postcode": "*****", "street": "*** ***** ******", "details": {"name": "xxx", "type": "********"}}, - ], + "age": "**", + "address": {"zip": "xxx", "street": "123 Main St", "details": {"name": "Home", "type": "Primary"}}, } From 9272ac07009b9ef788e4d14177dcd9e44daaf167 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Mon, 3 Feb 2025 15:12:46 -0300 Subject: [PATCH 14/22] create more tests --- .../utilities/data_masking/base.py | 2 +- .../test_unit_data_masking.py | 146 ++++++++++++++++++ 2 files changed, 147 insertions(+), 1 deletion(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index adf0ba72f0e..10835d2173e 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -321,7 +321,7 @@ def _mask_nested_field(self, data: dict, field_path: str, mask_function): if not 
isinstance(current, dict): return if keys[-1] in current: - current[keys[-1]] = mask_function(current[keys[-1]]) + current[keys[-1]] = self.provider.erase(current[keys[-1]], **mask_function) @staticmethod def _call_action( diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index 9ec1cc4e1f5..39457225aad 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -277,3 +277,149 @@ def test_erase_json_dict_with_complex_masking_rules(data_masker): "age": "**", "address": {"zip": "xxx", "street": "123 Main St", "details": {"name": "Home", "type": "Primary"}}, } + + +def test_no_matches_for_masking_rule(data_masker): + # GIVEN a dictionary without the expected field + data = {"name": "Ana"} + masking_rules = {"$.missing_field": {"dynamic_mask": True}} + + # WHEN applying the masking rule + with pytest.warns(UserWarning, match=r"No matches found for path: \$\.missing_field"): + result = data_masker._apply_masking_rules(data, masking_rules) + + # THEN the original data remains unchanged + assert result == data + + +def test_warning_during_masking_value(data_masker): + # GIVEN data and a masking rule + data = {"value": "test"} + + # Mock provider that raises an error + class MockProvider: + def erase(self, value, **kwargs): + raise ValueError("Mock error") + + data_masker.provider = MockProvider() + + # WHEN erase is called + with pytest.warns(UserWarning, match="Error masking value for path value: Mock error"): + masked_data = data_masker.erase(data, masking_rules={"value": {"rule": "value"}}) + + # THEN the original data should remain unchanged + assert masked_data["value"] == "test" + + +def test_mask_nested_field_with_non_dict_value(data_masker): + # GIVEN nested data where a middle path component is not a dictionary + data = {"user": {"contact": "not_a_dict", 
"details": {"ssn": "123-45-6789"}}} # This will stop the traversal + + # WHEN attempting to mask a field through a path containing a non-dict value + data_masker._mask_nested_field(data, "user.contact.details.ssn", lambda x: "MASKED") + + # THEN the data should remain unchanged since traversal stopped at non-dict value + assert data == {"user": {"contact": "not_a_dict", "details": {"ssn": "123-45-6789"}}} + + +def test_mask_nested_field_success(data_masker): + # GIVEN nested data with a field to mask + data = {"user": {"contact": {"details": {"address": {"street": "123 Main St", "zip": "12345"}}}}} + + # WHEN masking a nested field with a masking rule + data_masker._mask_nested_field(data, "user.contact.details.address.zip", {"custom_mask": "xxx"}) + + # THEN the nested field should be masked while other data remains unchanged + assert data == {"user": {"contact": {"details": {"address": {"street": "123 Main St", "zip": "xxx"}}}}} + + +## teste aqui +def test_erase_dictionary_with_masking_rules(data_masker): + # GIVEN a dictionary with nested sensitive data + data = {"user": {"name": "John Doe", "ssn": "123-45-6789", "address": {"street": "123 Main St", "zip": "12345"}}} + + # AND masking rules for specific fields + masking_rules = {"user.ssn": {"custom_mask": "XXX-XX-XXXX"}, "user.address.zip": {"custom_mask": "00000"}} + + # WHEN erase is called with masking rules + result = data_masker.erase(data, masking_rules=masking_rules) + + # THEN only the specified fields should be masked + assert result == { + "user": { + "name": "John Doe", # unchanged + "ssn": "XXX-XX-XXXX", # masked + "address": {"street": "123 Main St", "zip": "00000"}, # unchanged # masked + }, + } + + +def test_erase_dictionary_with_global_mask(data_masker): + # GIVEN a dictionary with sensitive data + data = {"user": {"name": "John Doe", "ssn": "123-45-6789"}} + + # WHEN erase is called with a custom mask for all fields + result = data_masker.erase(data, custom_mask="REDACTED") + + # THEN all 
fields should use the custom mask + assert result == {"user": {"name": "REDACTED", "ssn": "REDACTED"}} + + +def test_erase_empty_dictionary(data_masker): + # GIVEN an empty dictionary + data = {} + + # WHEN erase is called + result = data_masker.erase(data, custom_mask="MASKED") + + # THEN an empty dictionary should be returned + assert result == {} + + +def test_erase_different_iterables_with_masking(data_masker): + # GIVEN different types of iterables + list_data = ["name", "phone", "email"] + tuple_data = ("name", "phone", "email") + set_data = {"name", "phone", "email"} + + # WHEN erase is called with a custom mask + masked_list = data_masker.erase(list_data, custom_mask="XXX") + masked_tuple = data_masker.erase(tuple_data, custom_mask="XXX") + masked_set = data_masker.erase(set_data, custom_mask="XXX") + + # THEN the masked data should maintain its original type + assert isinstance(masked_list, list) + assert isinstance(masked_tuple, tuple) + assert isinstance(masked_set, set) + + # AND all values should be masked + expected_values = {"XXX"} + assert set(masked_list) == expected_values + assert set(masked_tuple) == expected_values + assert masked_set == expected_values + + +def test_erase_handles_invalid_regex_pattern(data_masker): + # GIVEN a string and an invalid regex pattern + data = "test123" + + # WHEN masking with invalid regex + result = data_masker.erase( + data, + regex_pattern="[", + mask_format="X", # Invalid regex pattern that will raise re.error + ) + + # THEN original data should be returned + assert result == "test123" + + +def test_erase_handles_empty_string_with_dynamic_mask(data_masker): + # GIVEN an empty string + data = "" + + # WHEN erase is called with dynamic_mask + result = data_masker.erase(data, dynamic_mask=True) + + # THEN empty string should be returned + assert result == "" From ce4fbaca61341660db80d83827085f5db1460ec0 Mon Sep 17 00:00:00 2001 From: Leandro Damascena Date: Tue, 4 Feb 2025 18:12:39 +0000 Subject: [PATCH 15/22] 
Refactoring tests --- noxfile.py | 2 + .../test_erase_data_masking.py} | 35 +++++++++--------- .../test_base_functions.py | 37 +++++++++++++++++++ 3 files changed, 56 insertions(+), 18 deletions(-) rename tests/{unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py => functional/data_masking/required_dependencies/test_erase_data_masking.py} (92%) create mode 100644 tests/unit/data_masking/required_dependencies/test_base_functions.py diff --git a/noxfile.py b/noxfile.py index 7b73fd0dc59..4710bcbca2c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -140,6 +140,8 @@ def test_with_aws_encryption_sdk_as_required_package(session: nox.Session): folders=[ f"{PREFIX_TESTS_FUNCTIONAL}/data_masking/_aws_encryption_sdk/", f"{PREFIX_TESTS_UNIT}/data_masking/_aws_encryption_sdk/", + f"{PREFIX_TESTS_FUNCTIONAL}/data_masking/required_dependencies/", + f"{PREFIX_TESTS_UNIT}/data_masking/required_dependencies/", ], extras="datamasking", ) diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py similarity index 92% rename from tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py rename to tests/functional/data_masking/required_dependencies/test_erase_data_masking.py index 39457225aad..49ba5020d80 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py @@ -269,7 +269,7 @@ def test_erase_json_dict_with_complex_masking_rules(data_masker): "address.zip": {"custom_mask": "xxx"}, } - masked_json_string = data_masker.erase(data, masking_rules=masking_rules) + masked_json_string = data_masker.erase(data=data, masking_rules=masking_rules) # THEN the result should have all specified fields masked according to their rules assert masked_json_string == { @@ -285,8 +285,8 @@ def test_no_matches_for_masking_rule(data_masker): masking_rules = 
{"$.missing_field": {"dynamic_mask": True}} # WHEN applying the masking rule - with pytest.warns(UserWarning, match=r"No matches found for path: \$\.missing_field"): - result = data_masker._apply_masking_rules(data, masking_rules) + with pytest.warns(UserWarning, match=r"No matches found *"): + result = data_masker.erase(data=data, masking_rules=masking_rules) # THEN the original data remains unchanged assert result == data @@ -311,29 +311,16 @@ def erase(self, value, **kwargs): assert masked_data["value"] == "test" -def test_mask_nested_field_with_non_dict_value(data_masker): - # GIVEN nested data where a middle path component is not a dictionary - data = {"user": {"contact": "not_a_dict", "details": {"ssn": "123-45-6789"}}} # This will stop the traversal - - # WHEN attempting to mask a field through a path containing a non-dict value - data_masker._mask_nested_field(data, "user.contact.details.ssn", lambda x: "MASKED") - - # THEN the data should remain unchanged since traversal stopped at non-dict value - assert data == {"user": {"contact": "not_a_dict", "details": {"ssn": "123-45-6789"}}} - - def test_mask_nested_field_success(data_masker): # GIVEN nested data with a field to mask data = {"user": {"contact": {"details": {"address": {"street": "123 Main St", "zip": "12345"}}}}} # WHEN masking a nested field with a masking rule - data_masker._mask_nested_field(data, "user.contact.details.address.zip", {"custom_mask": "xxx"}) + data_masked = data_masker.erase(data=data, fields=["user.contact.details.address.zip"], custom_mask="xxx") # THEN the nested field should be masked while other data remains unchanged - assert data == {"user": {"contact": {"details": {"address": {"street": "123 Main St", "zip": "xxx"}}}}} - + assert data_masked == {"user": {"contact": {"details": {"address": {"street": "123 Main St", "zip": "xxx"}}}}} -## teste aqui def test_erase_dictionary_with_masking_rules(data_masker): # GIVEN a dictionary with nested sensitive data data = {"user": 
{"name": "John Doe", "ssn": "123-45-6789", "address": {"street": "123 Main St", "zip": "12345"}}} @@ -423,3 +410,15 @@ def test_erase_handles_empty_string_with_dynamic_mask(data_masker): # THEN empty string should be returned assert result == "" + +def test_erase_dictionary_with_masking_rules_wrong_field(data_masker): + # GIVEN a dictionary with nested sensitive data + data = {"user": {"name": "John Doe", "ssn": "123-45-6789", "address": {"street": "123 Main St", "zip": "12345"}}} + + # AND masking rules for specific fields + masking_rules = {"user.ssn...": {"custom_mask": "XXX-XX-XXXX"}, "user.address.zip": {"custom_mask": "00000"}} + + # WHEN erase is called with wrong masking rules + # We must have a warning + with pytest.warns(UserWarning, match="Error processing path*"): + data_masker.erase(data, masking_rules=masking_rules) diff --git a/tests/unit/data_masking/required_dependencies/test_base_functions.py b/tests/unit/data_masking/required_dependencies/test_base_functions.py new file mode 100644 index 00000000000..e73e656e987 --- /dev/null +++ b/tests/unit/data_masking/required_dependencies/test_base_functions.py @@ -0,0 +1,37 @@ + +import json + +import pytest + +from aws_lambda_powertools.utilities.data_masking.base import DataMasking +from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING +from aws_lambda_powertools.utilities.data_masking.exceptions import ( + DataMaskingFieldNotFoundError, + DataMaskingUnsupportedTypeError, +) + + +@pytest.fixture +def data_masker() -> DataMasking: + return DataMasking() + +def test_mask_nested_field_with_non_dict_value(data_masker): + # GIVEN nested data where a middle path component is not a dictionary + data = {"user": {"contact": "not_a_dict", "details": {"ssn": "123-45-6789"}}} # This will stop the traversal + + # WHEN attempting to mask a field through a path containing a non-dict value + data_masker._mask_nested_field(data, "user.contact.details.ssn", lambda x: "MASKED") + + # THEN 
the data should remain unchanged since traversal stopped at non-dict value + assert data == {"user": {"contact": "not_a_dict", "details": {"ssn": "123-45-6789"}}} + + +def test_mask_nested_field_success(data_masker): + # GIVEN nested data with a field to mask + data = {"user": {"contact": {"details": {"address": {"street": "123 Main St", "zip": "12345"}}}}} + + # WHEN masking a nested field with a masking rule + data_masker._mask_nested_field(data, "user.contact.details.address.zip", {"custom_mask": "xxx"}) + + # THEN the nested field should be masked while other data remains unchanged + assert data == {"user": {"contact": {"details": {"address": {"street": "123 Main St", "zip": "xxx"}}}}} From fdf1b095675af1edb1fada4055033f93d252e09c Mon Sep 17 00:00:00 2001 From: Leandro Damascena Date: Tue, 4 Feb 2025 18:14:13 +0000 Subject: [PATCH 16/22] Refactoring tests --- .../required_dependencies/test_erase_data_masking.py | 2 ++ .../required_dependencies/test_base_functions.py | 9 +-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py index 49ba5020d80..629fe4fde85 100644 --- a/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py +++ b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py @@ -321,6 +321,7 @@ def test_mask_nested_field_success(data_masker): # THEN the nested field should be masked while other data remains unchanged assert data_masked == {"user": {"contact": {"details": {"address": {"street": "123 Main St", "zip": "xxx"}}}}} + def test_erase_dictionary_with_masking_rules(data_masker): # GIVEN a dictionary with nested sensitive data data = {"user": {"name": "John Doe", "ssn": "123-45-6789", "address": {"street": "123 Main St", "zip": "12345"}}} @@ -411,6 +412,7 @@ def test_erase_handles_empty_string_with_dynamic_mask(data_masker): # 
THEN empty string should be returned assert result == "" + def test_erase_dictionary_with_masking_rules_wrong_field(data_masker): # GIVEN a dictionary with nested sensitive data data = {"user": {"name": "John Doe", "ssn": "123-45-6789", "address": {"street": "123 Main St", "zip": "12345"}}} diff --git a/tests/unit/data_masking/required_dependencies/test_base_functions.py b/tests/unit/data_masking/required_dependencies/test_base_functions.py index e73e656e987..1af532967c7 100644 --- a/tests/unit/data_masking/required_dependencies/test_base_functions.py +++ b/tests/unit/data_masking/required_dependencies/test_base_functions.py @@ -1,20 +1,13 @@ - -import json - import pytest from aws_lambda_powertools.utilities.data_masking.base import DataMasking -from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING -from aws_lambda_powertools.utilities.data_masking.exceptions import ( - DataMaskingFieldNotFoundError, - DataMaskingUnsupportedTypeError, -) @pytest.fixture def data_masker() -> DataMasking: return DataMasking() + def test_mask_nested_field_with_non_dict_value(data_masker): # GIVEN nested data where a middle path component is not a dictionary data = {"user": {"contact": "not_a_dict", "details": {"ssn": "123-45-6789"}}} # This will stop the traversal From 3fa18ccecef8d10189c181cc7a540c9d75995bbc Mon Sep 17 00:00:00 2001 From: Leandro Damascena Date: Tue, 4 Feb 2025 18:38:07 +0000 Subject: [PATCH 17/22] Refactoring tests --- aws_lambda_powertools/utilities/data_masking/base.py | 6 ++++-- .../required_dependencies/test_erase_data_masking.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 65d8018f61d..56f2627c378 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -19,6 +19,7 @@ DataMaskingUnsupportedTypeError, ) from 
aws_lambda_powertools.utilities.data_masking.provider import BaseProvider +from aws_lambda_powertools.warnings import PowertoolsUserWarning if TYPE_CHECKING: from numbers import Number @@ -102,6 +103,7 @@ def erase( self, data: Any, fields: list[str] | None = None, + *, dynamic_mask: bool | None = None, custom_mask: str | None = None, regex_pattern: str | None = None, @@ -311,11 +313,11 @@ def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: match.full_path.update(result, masked_value) except Exception as e: - warnings.warn(f"Error masking value for path {path}: {str(e)}", stacklevel=2) + warnings.warn(f"Error masking value for path {path}: {str(e)}", category=PowertoolsUserWarning, stacklevel=2) continue except Exception as e: - warnings.warn(f"Error processing path {path}: {str(e)}", stacklevel=2) + warnings.warn(f"Error processing path {path}: {str(e)}", category=PowertoolsUserWarning, stacklevel=2) continue return result diff --git a/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py index 629fe4fde85..a7436476ad7 100644 --- a/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py +++ b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py @@ -8,6 +8,7 @@ DataMaskingFieldNotFoundError, DataMaskingUnsupportedTypeError, ) +from aws_lambda_powertools.warnings import PowertoolsUserWarning @pytest.fixture @@ -304,7 +305,7 @@ def erase(self, value, **kwargs): data_masker.provider = MockProvider() # WHEN erase is called - with pytest.warns(UserWarning, match="Error masking value for path value: Mock error"): + with pytest.warns(expected_warning=PowertoolsUserWarning, match="Error masking value for path value: Mock error"): masked_data = data_masker.erase(data, masking_rules={"value": {"rule": "value"}}) # THEN the original data should remain unchanged @@ -422,5 +423,5 @@ def 
test_erase_dictionary_with_masking_rules_wrong_field(data_masker): # WHEN erase is called with wrong masking rules # We must have a warning - with pytest.warns(UserWarning, match="Error processing path*"): + with pytest.warns(expected_warning=PowertoolsUserWarning, match="Error processing path*"): data_masker.erase(data, masking_rules=masking_rules) From b6d2a9425d33f2c971ca4999eaaed8079e6332f9 Mon Sep 17 00:00:00 2001 From: Leandro Damascena Date: Tue, 4 Feb 2025 18:53:52 +0000 Subject: [PATCH 18/22] Adding docstring + arg parameter --- .../utilities/data_masking/base.py | 80 ++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 56f2627c378..1d1a865922c 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -69,6 +69,30 @@ def encrypt( provider_options: dict | None = None, **encryption_context: str, ) -> str: + """ + Encrypt data using the configured encryption provider. + + Parameters + ---------- + data : dict, Mapping, Sequence, or Number + The data to encrypt. + provider_options : dict, optional + Provider-specific options for encryption. + **encryption_context : str + Additional key-value pairs for encryption context. + + Returns + ------- + str + The encrypted data as a base64-encoded string. + + Example + -------- + + encryption_provider = AWSEncryptionSDKProvider(keys=[KMS_KEY_ARN]) + data_masker = DataMasking(provider=encryption_provider) + encrypted = data_masker.encrypt({"secret": "value"}) + """ return self._apply_action( data=data, fields=None, @@ -87,6 +111,31 @@ def decrypt( provider_options: dict | None = None, **encryption_context: str, ) -> Any: + """ + Decrypt data using the configured encryption provider. + + Parameters + ---------- + data : dict, Mapping, Sequence, or Number + The encrypted data to decrypt.
+ provider_options : dict, optional + Provider-specific options for decryption. + **encryption_context : str + Additional key-value pairs for encryption context. + + Returns + ------- + Any + The decrypted data. + + Example + -------- + + encryption_provider = AWSEncryptionSDKProvider(keys=[KMS_KEY_ARN]) + data_masker = DataMasking(provider=encryption_provider) + decrypted = data_masker.decrypt(encrypted_data) + """ + return self._apply_action( data=data, fields=None, @@ -110,6 +159,31 @@ def erase( mask_format: str | None = None, masking_rules: dict | None = None, ) -> Any: + """ + Erase or mask sensitive data in the input. + + Parameters + ---------- + data : Any + The data to be erased or masked. + fields : list of str, optional + List of field names to be erased or masked. + dynamic_mask : bool, optional + Whether to use dynamic masking. + custom_mask : str, optional + Custom mask to apply instead of the default. + regex_pattern : str, optional + Regular expression pattern for identifying data to mask. + mask_format : str, optional + Format string for the mask. + masking_rules : dict, optional + Dictionary of custom masking rules. + + Returns + ------- + Any + The data with sensitive information erased or masked.
+ """ if masking_rules: return self._apply_masking_rules(data=data, masking_rules=masking_rules) else: @@ -313,7 +387,11 @@ def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: match.full_path.update(result, masked_value) except Exception as e: - warnings.warn(f"Error masking value for path {path}: {str(e)}", category=PowertoolsUserWarning, stacklevel=2) + warnings.warn( + f"Error masking value for path {path}: {str(e)}", + category=PowertoolsUserWarning, + stacklevel=2, + ) continue except Exception as e: From 3a17ab32496c6839519e1616c818b575af000b49 Mon Sep 17 00:00:00 2001 From: Leandro Damascena Date: Tue, 4 Feb 2025 18:58:11 +0000 Subject: [PATCH 19/22] Adding docstring + arg parameter --- aws_lambda_powertools/utilities/data_masking/provider/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 8f1c5d9b0ac..69ca4cb4e9f 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -127,7 +127,7 @@ def _mask_primitive( elif custom_mask: return self._pattern_mask(data, custom_mask) elif dynamic_mask: - return self._custom_erase(data, **kwargs) + return self._custom_erase(data) else: return DATA_MASKING_STRING @@ -201,7 +201,7 @@ def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: except re.error: return data - def _custom_erase(self, data: str, **kwargs) -> str: + def _custom_erase(self, data: str) -> str: if not data: return "" From cba411e21637226160311478ab59a87b89ec8ac6 Mon Sep 17 00:00:00 2001 From: Leandro Damascena Date: Tue, 11 Feb 2025 09:42:07 +0000 Subject: [PATCH 20/22] Removing unnecessary code --- .../utilities/data_masking/provider/base.py | 2 -- .../required_dependencies/test_erase_data_masking.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) 
diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 69ca4cb4e9f..5baedbadc11 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -128,8 +128,6 @@ def _mask_primitive( return self._pattern_mask(data, custom_mask) elif dynamic_mask: return self._custom_erase(data) - else: - return DATA_MASKING_STRING def _mask_dict( self, diff --git a/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py index a7436476ad7..392993e6461 100644 --- a/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py +++ b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py @@ -280,6 +280,16 @@ def test_erase_json_dict_with_complex_masking_rules(data_masker): } +def test_dynamic_mask_with_string(data_masker): + # GIVEN a plain string value + data = "XYZEKDEDE" + + masked_json_string = data_masker.erase(data=data, dynamic_mask=True) + + # THEN every character of the string should be replaced by the mask character + assert masked_json_string == "*********" + + def test_no_matches_for_masking_rule(data_masker): # GIVEN a dictionary without the expected field data = {"name": "Ana"} From f23f47675e7354d72226621e69fd3038f56995d8 Mon Sep 17 00:00:00 2001 From: Leandro Damascena Date: Tue, 11 Feb 2025 10:06:47 +0000 Subject: [PATCH 21/22] Removing unnecessary code --- .../utilities/data_masking/base.py | 3 -- .../utilities/data_masking/provider/base.py | 45 +++++----------- .../test_erase_data_masking.py | 51 +++++++++++++++++++ 3 files changed, 65 insertions(+), 34 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py
index 1d1a865922c..4e4486f89a9 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -450,9 +450,6 @@ def _call_action( return fields[field_name] def _normalize_data_to_parse(self, fields: list, data: str | dict) -> dict: - if not fields: - raise ValueError("No fields specified.") - if isinstance(data, str): # Parse JSON string as dictionary data_parsed = self.json_deserializer(data) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 5baedbadc11..16fa22d16b8 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -89,7 +89,7 @@ def erase( return DATA_MASKING_STRING if isinstance(data, (str, int, float)): - result = self._mask_primitive(str(data), dynamic_mask, custom_mask, regex_pattern, mask_format, **kwargs) + result = self._mask_primitive(str(data), dynamic_mask, custom_mask, regex_pattern, mask_format) elif isinstance(data, dict): result = self._mask_dict( data, @@ -98,7 +98,6 @@ def erase( regex_pattern, mask_format, masking_rules, - **kwargs, ) elif isinstance(data, (list, tuple, set)): result = self._mask_iterable( @@ -108,7 +107,6 @@ def erase( regex_pattern, mask_format, masking_rules, - **kwargs, ) return result @@ -120,14 +118,13 @@ def _mask_primitive( custom_mask: str | None, regex_pattern: str | None, mask_format: str | None, - **kwargs, ) -> str: if regex_pattern and mask_format: return self._regex_mask(data, regex_pattern, mask_format) elif custom_mask: return self._pattern_mask(data, custom_mask) - elif dynamic_mask: - return self._custom_erase(data) + + return self._custom_erase(data) def _mask_dict( self, @@ -137,23 +134,18 @@ def _mask_dict( regex_pattern: str | None, mask_format: str | None, masking_rules: dict | None, - **kwargs, ) -> dict: - if masking_rules: - return 
self._apply_masking_rules(data, masking_rules) - else: - return { - k: self.erase( - v, - dynamic_mask=dynamic_mask, - custom_mask=custom_mask, - regex_pattern=regex_pattern, - mask_format=mask_format, - masking_rules=masking_rules, - **kwargs, - ) - for k, v in data.items() - } + return { + k: self.erase( + v, + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + ) + for k, v in data.items() + } def _mask_iterable( self, @@ -163,7 +155,6 @@ def _mask_iterable( regex_pattern: str | None, mask_format: str | None, masking_rules: dict | None, - **kwargs, ) -> list | tuple | set: masked_data = [ self.erase( @@ -173,19 +164,11 @@ def _mask_iterable( regex_pattern=regex_pattern, mask_format=mask_format, masking_rules=masking_rules, - **kwargs, ) for item in data ] return type(data)(masked_data) - def _apply_masking_rules(self, data: dict, masking_rules: dict) -> Any: - """Apply masking rules to dictionary data.""" - return { - key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) - for key, value in data.items() - } - def _pattern_mask(self, data: str, pattern: str) -> str: """Apply pattern masking to string data.""" return pattern[: len(data)] if len(pattern) >= len(data) else pattern diff --git a/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py index 392993e6461..12ffd054376 100644 --- a/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py +++ b/tests/functional/data_masking/required_dependencies/test_erase_data_masking.py @@ -232,6 +232,24 @@ def test_regex_mask(data_masker): assert result == "Hello! My name is XXXX XXXX" +def test_regex_mask_with_cache(data_masker): + # GIVEN a str data type + data = "Hello! My name is John Doe" + data1 = "Hello! 
My name is John Xix" + + # WHEN erase is called with regex pattern and mask format + regex_pattern = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b" + mask_format = "XXXX XXXX" + + # WHEN erasing twice to check the regex compiled and stored in the cache + result = data_masker.erase(data, regex_pattern=regex_pattern, mask_format=mask_format) + result1 = data_masker.erase(data1, regex_pattern=regex_pattern, mask_format=mask_format) + + # THEN the result is the regex part masked by the masked format + assert result == "Hello! My name is XXXX XXXX" + assert result1 == "Hello! My name is XXXX XXXX" + + def test_erase_json_dict_with_fields_and_masks(data_masker): # GIVEN the data type is a json representation of a dictionary data = json.dumps( @@ -353,6 +371,39 @@ def test_erase_dictionary_with_masking_rules(data_masker): } +def test_erase_dictionary_with_masking_rules_with_list(data_masker): + # GIVEN a dictionary with nested sensitive data + data = {"user": {"name": ["leandro", "powertools"]}} + + # AND masking rules for specific fields + masking_rules = {"user.name": {"custom_mask": "NO-NAME"}} + + # WHEN erase is called with masking rules + result = data_masker.erase(data, masking_rules=masking_rules) + + # THEN only the specified fields should be masked + assert result == { + "user": { + "name": "NO-NAME", + }, + } + + +def test_erase_list_with_custom_mask(data_masker): + # GIVEN a dictionary with nested sensitive data + data = {"user": {"name": ["leandro", "powertools"]}} + + # WHEN erase is called with masking rules + result = data_masker.erase(data, fields=["user.name"], dynamic_mask=True) + + # THEN only the specified fields should be masked + assert result == { + "user": { + "name": ["*******", "**********"], + }, + } + + def test_erase_dictionary_with_global_mask(data_masker): # GIVEN a dictionary with sensitive data data = {"user": {"name": "John Doe", "ssn": "123-45-6789"}} From 9c3b9b2ba7211dc171e2c209d0a801cfb0a032ba Mon Sep 17 00:00:00 2001 From: Leandro Damascena Date: 
Tue, 11 Feb 2025 10:17:11 +0000 Subject: [PATCH 22/22] Removing unnecessary code --- aws_lambda_powertools/utilities/data_masking/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 4e4486f89a9..c5fd6f274e7 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -315,7 +315,7 @@ def _apply_action_to_fields( if not fields: raise ValueError("Fields parameter cannot be empty") - data_parsed: dict = self._normalize_data_to_parse(fields, data) + data_parsed: dict = self._normalize_data_to_parse(data) # For in-place updates, json_parse accepts a callback function # this function must receive 3 args: field_value, fields, field_name @@ -449,7 +449,7 @@ def _call_action( ) return fields[field_name] - def _normalize_data_to_parse(self, fields: list, data: str | dict) -> dict: + def _normalize_data_to_parse(self, data: str | dict) -> dict: if isinstance(data, str): # Parse JSON string as dictionary data_parsed = self.json_deserializer(data)