Skip to content

Commit 94ced73

Browse files
ackulengelke
authored andcommitted
Add inspect table code sample for DLP and some nit fixes (#1921)
* Remove claim that redact.py operates on strings Reflect in the comments that this particular code sample does not support text redaction. * Add code sample for inspecting table, fix requirements for running tests, quickstart example refactor * Remove newline, if -> elif * formatting * More formatting
1 parent c1ec40c commit 94ced73

File tree

7 files changed

+237
-25
lines changed

7 files changed

+237
-25
lines changed

dlp/README.rst

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,15 @@ Install Dependencies
5858
.. _pip: https://pip.pypa.io/
5959
.. _virtualenv: https://virtualenv.pypa.io/
6060

61+
#. To run the ``*_test.py`` files, install the test dependencies:
62+
63+
.. code-block:: bash
64+
65+
$ pip install -r requirements-test.txt
66+
$ pytest inspect_content_test.py
67+
68+
**Note:** ``*_test.py`` files are demo wrappers and make real API calls. You may get rate limited if you make a high number of requests.
69+
6170
Samples
6271
-------------------------------------------------------------------------------
6372

@@ -74,7 +83,7 @@ To run this sample:
7483

7584
.. code-block:: bash
7685
77-
$ python quickstart.py
86+
$ python quickstart.py <project-id>
7887
7988
8089
Inspect Content

dlp/deid.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ def write_data(data):
402402
write_file.writerow(map(write_data, row.values))
403403
# Print status
404404
print('Successfully saved date-shift output to {}'.format(
405-
output_csv_file))
405+
output_csv_file))
406406
# [END dlp_deidentify_date_shift]
407407

408408

@@ -450,8 +450,8 @@ def write_data(data):
450450
'If unspecified, the three above examples will be used.',
451451
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
452452
fpe_parser.add_argument(
453-
'project',
454-
help='The Google Cloud project id to use as a parent resource.')
453+
'project',
454+
help='The Google Cloud project id to use as a parent resource.')
455455
fpe_parser.add_argument(
456456
'item',
457457
help='The string to deidentify. '

dlp/inspect_content.py

Lines changed: 177 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import argparse
2121
import os
22+
import json
2223

2324

2425
# [START dlp_inspect_string]
@@ -77,7 +78,7 @@ def inspect_string(project, content_string, info_types,
7778
'min_likelihood': min_likelihood,
7879
'include_quote': include_quote,
7980
'limits': {'max_findings_per_request': max_findings},
80-
}
81+
}
8182

8283
# Construct the `item`.
8384
item = {'value': content_string}
@@ -102,8 +103,130 @@ def inspect_string(project, content_string, info_types,
102103
print('No findings.')
103104
# [END dlp_inspect_string]
104105

106+
# [START dlp_inspect_table]
107+
108+
109+
def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    """Uses the Data Loss Prevention API to analyze a table for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A dictionary describing the table (for example the result of
            parsing a JSON string). It must contain a "header" list of column
            names and a "rows" list in which every row is a list of cell
            values.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: An optional list of strings; each string is a
            comma-delimited list of words that forms one custom dictionary
            info type (named CUSTOM_DICTIONARY_<index>).
        custom_regexes: An optional list of regex pattern strings; each one
            forms a custom info type (named CUSTOM_REGEX_<index>).
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    Example:
        data = {
            "header": [
                "email",
                "phone number"
            ],
            "rows": [
                [
                    "user1@example.com",
                    "4232342345"
                ],
                [
                    "user2@example.com",
                    "4253458383"
                ]
            ]
        }

        >> $ python inspect_content.py table
               '{"header": ["email", "phone number"],
                 "rows": [["user1@example.com", "4232342345"],
                          ["user2@example.com", "4253458383"]]}'

        Quote: user1@example.com
        Info type: EMAIL_ADDRESS
        Likelihood: 4
        Quote: user2@example.com
        Info type: EMAIL_ADDRESS
        Likelihood: 4
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    table = {
        "headers": [{"name": val} for val in data["header"]],
        "rows": [
            {"values": [{"string_value": cell_val} for cell_val in row]}
            for row in data["rows"]
        ],
    }
    item = {"table": table}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return

    for finding in findings:
        # The quote is only shown when present; a missing attribute is
        # treated the same as an empty quote instead of being swallowed by a
        # blanket try/except.
        quote = getattr(finding, 'quote', None)
        if quote:
            print('Quote: {}'.format(quote))
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
225+
# [END dlp_inspect_table]
105226

106227
# [START dlp_inspect_file]
228+
229+
107230
def inspect_file(project, filename, info_types, min_likelihood=None,
108231
custom_dictionaries=None, custom_regexes=None,
109232
max_findings=None, include_quote=True, mime_type=None):
@@ -284,8 +407,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
284407
storage_config = {
285408
'cloud_storage_options': {
286409
'file_set': {'url': url}
287-
}
288410
}
411+
}
289412

290413
# Convert the project id into a full resource id.
291414
parent = dlp.project_path(project)
@@ -309,7 +432,6 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
309432
subscriber = google.cloud.pubsub.SubscriberClient()
310433
subscription_path = subscriber.subscription_path(
311434
project, subscription_id)
312-
subscription = subscriber.subscribe(subscription_path)
313435

314436
# Set up a callback to acknowledge a message. This closes around an event
315437
# so that it can signal that it is done and the main thread can continue.
@@ -341,8 +463,7 @@ def callback(message):
341463
print(e)
342464
raise
343465

344-
# Register the callback and wait on the event.
345-
subscription.open(callback)
466+
subscriber.subscribe(subscription_path, callback=callback)
346467
finished = job_done.wait(timeout=timeout)
347468
if not finished:
348469
print('No event received before the timeout. Please verify that the '
@@ -460,7 +581,6 @@ def inspect_datastore(project, datastore_project, kind,
460581
subscriber = google.cloud.pubsub.SubscriberClient()
461582
subscription_path = subscriber.subscription_path(
462583
project, subscription_id)
463-
subscription = subscriber.subscribe(subscription_path)
464584

465585
# Set up a callback to acknowledge a message. This closes around an event
466586
# so that it can signal that it is done and the main thread can continue.
@@ -493,7 +613,8 @@ def callback(message):
493613
raise
494614

495615
# Register the callback and wait on the event.
496-
subscription.open(callback)
616+
subscriber.subscribe(subscription_path, callback=callback)
617+
497618
finished = job_done.wait(timeout=timeout)
498619
if not finished:
499620
print('No event received before the timeout. Please verify that the '
@@ -609,7 +730,6 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
609730
subscriber = google.cloud.pubsub.SubscriberClient()
610731
subscription_path = subscriber.subscription_path(
611732
project, subscription_id)
612-
subscription = subscriber.subscribe(subscription_path)
613733

614734
# Set up a callback to acknowledge a message. This closes around an event
615735
# so that it can signal that it is done and the main thread can continue.
@@ -642,7 +762,7 @@ def callback(message):
642762
raise
643763

644764
# Register the callback and wait on the event.
645-
subscription.open(callback)
765+
subscriber.subscribe(subscription_path, callback=callback)
646766
finished = job_done.wait(timeout=timeout)
647767
if not finished:
648768
print('No event received before the timeout. Please verify that the '
@@ -698,6 +818,46 @@ def callback(message):
698818
'information in the results.',
699819
default=True)
700820

821+
parser_table = subparsers.add_parser('table', help='Inspect a table.')
822+
parser_table.add_argument(
823+
'data', help='Json string representing a table.', type=json.loads)
824+
parser_table.add_argument(
825+
'--project',
826+
help='The Google Cloud project id to use as a parent resource.',
827+
default=default_project)
828+
parser_table.add_argument(
829+
'--info_types', action='append',
830+
help='Strings representing info types to look for. A full list of '
831+
'info categories and types is available from the API. Examples '
832+
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
833+
'If unspecified, the three above examples will be used.',
834+
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
835+
parser_table.add_argument(
836+
'--custom_dictionaries', action='append',
837+
help='Strings representing comma-delimited lists of dictionary words'
838+
' to search for as custom info types. Each string is a comma '
839+
'delimited list of words representing a distinct dictionary.',
840+
default=None)
841+
parser_table.add_argument(
842+
'--custom_regexes', action='append',
843+
help='Strings representing regex patterns to search for as custom '
844+
' info types.',
845+
default=None)
846+
parser_table.add_argument(
847+
'--min_likelihood',
848+
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
849+
'POSSIBLE', 'LIKELY', 'VERY_LIKELY'],
850+
help='A string representing the minimum likelihood threshold that '
851+
'constitutes a match.')
852+
parser_table.add_argument(
853+
'--max_findings', type=int,
854+
help='The maximum number of findings to report; 0 = no maximum.')
855+
parser_table.add_argument(
856+
'--include_quote', type=bool,
857+
help='A boolean for whether to display a quote of the detected '
858+
'information in the results.',
859+
default=True)
860+
701861
parser_file = subparsers.add_parser('file', help='Inspect a local file.')
702862
parser_file.add_argument(
703863
'filename', help='The path to the file to inspect.')
@@ -923,6 +1083,14 @@ def callback(message):
9231083
min_likelihood=args.min_likelihood,
9241084
max_findings=args.max_findings,
9251085
include_quote=args.include_quote)
1086+
elif args.content == 'table':
1087+
inspect_table(
1088+
args.project, args.data, args.info_types,
1089+
custom_dictionaries=args.custom_dictionaries,
1090+
custom_regexes=args.custom_regexes,
1091+
min_likelihood=args.min_likelihood,
1092+
max_findings=args.max_findings,
1093+
include_quote=args.include_quote)
9261094
elif args.content == 'file':
9271095
inspect_file(
9281096
args.project, args.filename, args.info_types,

dlp/inspect_content_test.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import google.cloud.storage
2525

2626
import pytest
27-
2827
import inspect_content
2928

3029

@@ -170,6 +169,35 @@ def test_inspect_string(capsys):
170169
assert 'Info type: EMAIL_ADDRESS' in out
171170

172171

172+
def test_inspect_table(capsys):
    """Checks that inspect_table reports phone and email findings.

    Every row carries one value per header column ("email" and
    "phone number"), so both asserted info types have data to match.
    """
    # NOTE(review): the email cells use placeholder addresses on the reserved
    # example.com domain; confirm they match the values intended upstream.
    test_tabular_data = {
        "header": [
            "email",
            "phone number"
        ],
        "rows": [
            [
                "user1@example.com",
                "4232342345"
            ],
            [
                "user2@example.com",
                "4253458383"
            ]
        ]
    }

    inspect_content.inspect_table(
        GCLOUD_PROJECT,
        test_tabular_data,
        ['PHONE_NUMBER', 'EMAIL_ADDRESS'],
        include_quote=True)

    out, _ = capsys.readouterr()
    assert 'Info type: PHONE_NUMBER' in out
    assert 'Info type: EMAIL_ADDRESS' in out
199+
200+
173201
def test_inspect_string_with_custom_info_types(capsys):
174202
test_string = 'My name is Gary Smith and my email is [email protected]'
175203
dictionaries = ['Gary Smith']

dlp/quickstart.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,17 @@
1717

1818
from __future__ import print_function
1919

20+
import sys
21+
import argparse
2022

21-
def quickstart():
23+
24+
def quickstart(project_id):
2225
"""Demonstrates use of the Data Loss Prevention API client library."""
2326

2427
# [START dlp_quickstart]
2528
# Import the client library
2629
import google.cloud.dlp
2730

28-
# Edit this with your Google Cloud Project ID.
29-
project = 'your-project'
30-
3131
# Instantiate a client.
3232
dlp_client = google.cloud.dlp.DlpServiceClient()
3333

@@ -84,4 +84,11 @@ def quickstart():
8484

8585

8686
if __name__ == '__main__':
    # Command-line entry point: takes the GCP project id as the single
    # required positional argument and runs the quickstart against it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "project_id", help="Enter your GCP project id.", type=str)
    # parse_args() prints the usage message and exits by itself when the
    # required positional argument is missing, so a manual check of
    # len(sys.argv) after this call could never run and is not needed.
    args = parser.parse_args()
    quickstart(args.project_id)

0 commit comments

Comments
 (0)