Skip to content

Commit add3a08

Browse files
sethmoo and busunkim96 authored
Add text redaction sample using DLP (#3964)
* Add text redaction sample using DLP * Update dlp/deid.py Co-authored-by: Bu Sun Kim <[email protected]> * Rename string parameter to item Co-authored-by: Bu Sun Kim <[email protected]>
1 parent 2d6afd5 commit add3a08

File tree

3 files changed

+104
-4
lines changed

3 files changed

+104
-4
lines changed

dlp/README.rst

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -339,13 +339,12 @@ To run this sample:
339339
.. code-block:: bash
340340
341341
$ python deid.py
342-
343-
usage: deid.py [-h] {deid_mask,deid_fpe,reid_fpe,deid_date_shift} ...
342+
usage: deid.py [-h] {deid_mask,deid_fpe,reid_fpe,deid_date_shift,redact} ...
344343
345344
Uses of the Data Loss Prevention API for deidentifying sensitive data.
346345
347346
positional arguments:
348-
{deid_mask,deid_fpe,reid_fpe,deid_date_shift}
347+
{deid_mask,deid_fpe,reid_fpe,deid_date_shift,redact}
349348
Select how to submit content to the API.
350349
deid_mask Deidentify sensitive data in a string by masking it
351350
with a character.
@@ -355,6 +354,8 @@ To run this sample:
355354
Preserving Encryption (FPE).
356355
deid_date_shift Deidentify dates in a CSV file by pseudorandomly
357356
shifting them.
357+
redact Redact sensitive data in a string by replacing it with
358+
the info type of the data.
358359
359360
optional arguments:
360361
-h, --help show this help message and exit
@@ -378,4 +379,4 @@ to `browse the source`_ and `report issues`_.
378379
https://github.com/GoogleCloudPlatform/google-cloud-python/issues
379380
380381
381-
.. _Google Cloud SDK: https://cloud.google.com/sdk/
382+
.. _Google Cloud SDK: https://cloud.google.com/sdk/

dlp/deid.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,61 @@ def write_data(data):
435435
# [END dlp_deidentify_date_shift]
436436

437437

438+
# [START dlp_redact_sensitive_data]
439+
def redact_sensitive_data(project, item, info_types):
    """Uses the Data Loss Prevention API to redact sensitive data in a
    string by replacing it with the info type.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        item: The string to redact (will be treated as text).
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.

    Returns:
        None; the redacted text from the API response is printed to the
        terminal.
    """

    # Import the client library. Import the versioned module that is
    # actually referenced below instead of relying on ``google.cloud.dlp``
    # pulling it in as an import side effect.
    import google.cloud.dlp_v2

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary: one {"name": ...} entry
    # per requested info type.
    inspect_config = {
        "info_types": [{"name": info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary. The empty
    # ``replace_with_info_type_config`` selects the "replace with info type"
    # primitive transformation; it takes no options.
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "primitive_transformation": {
                        "replace_with_info_type_config": {}
                    }
                }
            ]
        }
    }

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item={"value": item},
    )

    # Print out the results.
    print(response.item.value)
488+
489+
490+
# [END dlp_redact_sensitive_data]
491+
492+
438493
if __name__ == "__main__":
439494
parser = argparse.ArgumentParser(description=__doc__)
440495
subparsers = parser.add_subparsers(
@@ -626,6 +681,30 @@ def write_data(data):
626681
"key_name.",
627682
)
628683

684+
# Sub-command: redact. Replaces each finding with the name of its info type.
redact_parser = subparsers.add_parser(
    "redact",
    help="Redact sensitive data in a string by replacing it with the "
    "info type of the data.",
)


class _AppendOverridingDefault(argparse.Action):
    """Append action whose default list is replaced by user-supplied values.

    argparse's built-in ``append`` action keeps a non-empty default and adds
    command-line values after it, so the defaults could never be overridden.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        current = getattr(namespace, self.dest, None)
        # While the namespace still holds the untouched default, start a
        # fresh list; afterwards keep accumulating the user's values.
        if current is None or current is self.default:
            collected = []
        else:
            collected = list(current)
        collected.append(values)
        setattr(namespace, self.dest, collected)


redact_parser.add_argument(
    "--info_types",
    action=_AppendOverridingDefault,
    help="Strings representing info types to look for. A full list of "
    "info categories and types is available from the API. Examples "
    'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
    "If unspecified, the three above examples will be used.",
    default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
)
redact_parser.add_argument(
    "project",
    help="The Google Cloud project id to use as a parent resource.",
)
redact_parser.add_argument(
    "item",
    # Trailing space keeps the two concatenated sentences from running
    # together in the rendered help output.
    help="The string to redact. "
    "Example: 'My credit card is 4242 4242 4242 4242'",
)
707+
629708
args = parser.parse_args()
630709

631710
if args.content == "deid_mask":
@@ -667,3 +746,9 @@ def write_data(data):
667746
wrapped_key=args.wrapped_key,
668747
key_name=args.key_name,
669748
)
749+
elif args.content == "redact":
750+
redact_sensitive_data(
751+
args.project,
752+
item=args.item,
753+
info_types=args.info_types,
754+
)

dlp/deid_test.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,3 +185,17 @@ def test_reidentify_with_fpe(capsys):
185185
out, _ = capsys.readouterr()
186186

187187
assert "731997681" not in out
188+
189+
190+
def test_redact_sensitive_data(capsys):
    """Check that a URL in the input is replaced by its info type marker."""
    url_to_redact = "https://cloud.google.com"
    sentence = "My favorite site is " + url_to_redact

    # Only the URL info type is requested for this call.
    deid.redact_sensitive_data(GCLOUD_PROJECT, sentence, ["URL"])

    # The sample prints its result, so inspect captured stdout.
    printed = capsys.readouterr().out

    assert url_to_redact not in printed
    assert "My favorite site is [URL]" in printed

0 commit comments

Comments
 (0)