Skip to content

Commit 629c0a8

Browse files
authored
Add DLP code sample and test for de-id free text with surrogate (#4085)
## Description Add DLP code sample and test for de-id free text with surrogate, meant for https://cloud.google.com/dlp/docs/pseudonymization#de-identification_in_free_text_code_example ## Checklist - [x] I have followed [Sample Guidelines from AUTHORING_GUIDE.MD](https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md) - [ ] README is updated to include [all relevant information](https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md#readme-file) - [x] **Tests** pass: `nox -s py-3.6` (see [Test Environment Setup](https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md#test-environment-setup)) - [x] **Lint** pass: `nox -s lint` (see [Test Environment Setup](https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md#test-environment-setup)) - [ ] These samples need a new **API enabled** in testing projects to pass (let us know which ones) - [ ] These samples need a new/updated **env vars** in testing projects set to pass (let us know which ones) - [x] Please **merge** this PR for me once it is approved.
1 parent 8acf0d6 commit 629c0a8

File tree

2 files changed

+106
-0
lines changed

2 files changed

+106
-0
lines changed

dlp/deid.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,93 @@ def reidentify_with_fpe(
394394

395395
# [END dlp_reidentify_fpe]
396396

397+
398+
# [START dlp_deidentify_free_text_with_fpe_using_surrogate]
399+
def deidentify_free_text_with_fpe_using_surrogate(
    project,
    input_str,
    alphabet="NUMERIC",
    info_type="PHONE_NUMBER",
    surrogate_type="PHONE_TOKEN",
    unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==",
):
    """De-identifies one info type in free text with FPE and a surrogate.

    Calls the Data Loss Prevention API's deidentify_content method with a
    Format Preserving Encryption (FPE) transformation that is keyed by an
    unwrapped (raw) crypto key and tagged with a surrogate info type, then
    prints the transformed text.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The free text to de-identify.
        alphabet: The name of the common alphabet passed to the FPE config
            (for example "NUMERIC").
        info_type: The name of the info type to look for and transform.
        surrogate_type: The name of the surrogate custom info type attached
            to the transformation. Can be essentially any arbitrary string,
            as long as it doesn't otherwise appear in your dataset.
        unwrapped_key: The base64-encoded encryption key. It is decoded to
            raw bytes before being sent; the default value decodes to a
            16-byte (128-bit) key.

    Returns:
        None; the de-identified text from the response is printed to the
        terminal.
    """
    # Needed to turn the base64 text key into the raw bytes sent to the API.
    import base64

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    client = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = client.project_path(project)

    # The key is supplied base64-encoded, but the request carries it as a
    # binary string, so decode it before building the config.
    key_bytes = base64.b64decode(unwrapped_key)

    # FPE settings: raw key, output alphabet, and the surrogate info type.
    fpe_config = {
        "crypto_key": {"unwrapped": {"key": key_bytes}},
        "common_alphabet": alphabet,
        "surrogate_info_type": {"name": surrogate_type},
    }

    # Apply the FPE transformation only to findings of the requested type.
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "info_types": [{"name": info_type}],
                    "primitive_transformation": {
                        "crypto_replace_ffx_fpe_config": fpe_config
                    },
                }
            ]
        }
    }

    # Inspect for the requested info type only, using UNLIKELY as the
    # minimum likelihood.
    inspect_config = {
        "info_types": [{"name": info_type}],
        "min_likelihood": "UNLIKELY",
    }

    # Call the API, wrapping the input string as a content item.
    response = client.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item={"value": input_str},
    )

    # Print results.
    print(response.item.value)
479+
480+
481+
# [END dlp_deidentify_free_text_with_fpe_using_surrogate]
482+
483+
397484
# [START dlp_reidentify_free_text_with_fpe_using_surrogate]
398485
def reidentify_free_text_with_fpe_using_surrogate(
399486
project,

dlp/deid_test.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,25 @@ def test_reidentify_with_fpe(capsys):
206206
assert "731997681" not in out
207207

208208

209+
def test_deidentify_free_text_with_fpe_using_surrogate(capsys):
    """Checks that the phone number in free text is replaced by a surrogate.

    The surrounding words must be left intact, the surrogate name must show
    up in the printed output, and the original digits must be gone.
    """
    phone_number = "4359916732"
    plain_text = "My phone number is " + phone_number

    deid.deidentify_free_text_with_fpe_using_surrogate(
        GCLOUD_PROJECT,
        plain_text,
        alphabet="NUMERIC",
        info_type="PHONE_NUMBER",
        surrogate_type="PHONE_TOKEN",
        unwrapped_key=UNWRAPPED_KEY,
    )

    # The sample prints its result, so inspect what was captured on stdout.
    printed = capsys.readouterr().out

    assert "PHONE_TOKEN" in printed
    assert "My phone number is" in printed
    assert phone_number not in printed
226+
227+
209228
def test_reidentify_free_text_with_fpe_using_surrogate(capsys):
210229
labeled_fpe_string = "My phone number is PHONE_TOKEN(10):9617256398"
211230

0 commit comments

Comments
 (0)