19
19
20
20
import argparse
21
21
import os
22
+ import json
22
23
23
24
24
25
# [START dlp_inspect_string]
@@ -77,7 +78,7 @@ def inspect_string(project, content_string, info_types,
77
78
'min_likelihood' : min_likelihood ,
78
79
'include_quote' : include_quote ,
79
80
'limits' : {'max_findings_per_request' : max_findings },
80
- }
81
+ }
81
82
82
83
# Construct the `item`.
83
84
item = {'value' : content_string }
@@ -102,8 +103,130 @@ def inspect_string(project, content_string, info_types,
102
103
print ('No findings.' )
103
104
# [END dlp_inspect_string]
104
105
106
+ # [START dlp_inspect_table]
107
+
108
+
109
def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    """Uses the Data Loss Prevention API to analyze a table for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A dict describing the table (for example the result of
            `json.loads` on a JSON string). It must have a "header" key
            holding a list of column names and a "rows" key holding a list
            of rows, each row being a list of cell values. Every cell is
            sent to the API as a `string_value`.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: An optional list of strings, each a
            comma-delimited list of words. Entry number `i` becomes the
            custom info type `CUSTOM_DICTIONARY_<i>`.
        custom_regexes: An optional list of regex pattern strings. Entry
            number `i` becomes the custom info type `CUSTOM_REGEX_<i>`.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None; the response from the API is printed to the terminal.

    Raises:
        KeyError: If `data` is a dict without a "header" or "rows" key.

    Example:
        data = {
            "header": [
                "email",
                "phone number"
            ],
            "rows": [
                [
                    "robertfrost@example.com",
                    "4232342345"
                ],
                [
                    "johndoe@example.com",
                    "4253458383"
                ]
            ]
        }

        $ python inspect_content.py table '{
            "header": ["email", "phone number"],
            "rows": [["robertfrost@example.com", "4232342345"],
                     ["johndoe@example.com", "4253458383"]]}'

        Info type: EMAIL_ADDRESS
        Likelihood: 4

        Info type: EMAIL_ADDRESS
        Likelihood: 4
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns. A missing (None) list is treated as empty.
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries or [])]
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes or [])]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    table = {
        'headers': [{'name': val} for val in data['header']],
        'rows': [
            {'values': [{'string_value': cell_val} for cell_val in row]}
            for row in data['rows']
        ],
    }
    item = {'table': table}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print('No findings.')
        return

    for finding in findings:
        # A finding may not expose a quote at all; treat that the same as
        # an empty quote instead of swallowing an AttributeError.
        quote = getattr(finding, 'quote', None)
        if quote:
            print('Quote: {}'.format(quote))
        print('Info type: {}'.format(finding.info_type.name))
        print('Likelihood: {}'.format(finding.likelihood))
225
+ # [END dlp_inspect_table]
105
226
106
227
# [START dlp_inspect_file]
228
+
229
+
107
230
def inspect_file (project , filename , info_types , min_likelihood = None ,
108
231
custom_dictionaries = None , custom_regexes = None ,
109
232
max_findings = None , include_quote = True , mime_type = None ):
@@ -284,8 +407,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
284
407
storage_config = {
285
408
'cloud_storage_options' : {
286
409
'file_set' : {'url' : url }
287
- }
288
410
}
411
+ }
289
412
290
413
# Convert the project id into a full resource id.
291
414
parent = dlp .project_path (project )
@@ -309,7 +432,6 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
309
432
subscriber = google .cloud .pubsub .SubscriberClient ()
310
433
subscription_path = subscriber .subscription_path (
311
434
project , subscription_id )
312
- subscription = subscriber .subscribe (subscription_path )
313
435
314
436
# Set up a callback to acknowledge a message. This closes around an event
315
437
# so that it can signal that it is done and the main thread can continue.
@@ -341,8 +463,7 @@ def callback(message):
341
463
print (e )
342
464
raise
343
465
344
- # Register the callback and wait on the event.
345
- subscription .open (callback )
466
+ subscriber .subscribe (subscription_path , callback = callback )
346
467
finished = job_done .wait (timeout = timeout )
347
468
if not finished :
348
469
print ('No event received before the timeout. Please verify that the '
@@ -460,7 +581,6 @@ def inspect_datastore(project, datastore_project, kind,
460
581
subscriber = google .cloud .pubsub .SubscriberClient ()
461
582
subscription_path = subscriber .subscription_path (
462
583
project , subscription_id )
463
- subscription = subscriber .subscribe (subscription_path )
464
584
465
585
# Set up a callback to acknowledge a message. This closes around an event
466
586
# so that it can signal that it is done and the main thread can continue.
@@ -493,7 +613,8 @@ def callback(message):
493
613
raise
494
614
495
615
# Register the callback and wait on the event.
496
- subscription .open (callback )
616
+ subscriber .subscribe (subscription_path , callback = callback )
617
+
497
618
finished = job_done .wait (timeout = timeout )
498
619
if not finished :
499
620
print ('No event received before the timeout. Please verify that the '
@@ -609,7 +730,6 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
609
730
subscriber = google .cloud .pubsub .SubscriberClient ()
610
731
subscription_path = subscriber .subscription_path (
611
732
project , subscription_id )
612
- subscription = subscriber .subscribe (subscription_path )
613
733
614
734
# Set up a callback to acknowledge a message. This closes around an event
615
735
# so that it can signal that it is done and the main thread can continue.
@@ -642,7 +762,7 @@ def callback(message):
642
762
raise
643
763
644
764
# Register the callback and wait on the event.
645
- subscription . open ( callback )
765
+ subscriber . subscribe ( subscription_path , callback = callback )
646
766
finished = job_done .wait (timeout = timeout )
647
767
if not finished :
648
768
print ('No event received before the timeout. Please verify that the '
@@ -698,6 +818,46 @@ def callback(message):
698
818
'information in the results.' ,
699
819
default = True )
700
820
821
def _parse_bool_flag(value):
    """Convert a command-line string into a bool.

    `type=bool` is unsuitable for argparse because `bool('False')` is True:
    every non-empty string would enable the flag. The empty string still
    maps to False, as it did under `type=bool`.
    """
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'Expected a boolean value, got {!r}.'.format(value))


# Sub-command for inspecting tabular data passed as a JSON string; the
# string is decoded by `json.loads` before it reaches the handler.
parser_table = subparsers.add_parser('table', help='Inspect a table.')
parser_table.add_argument(
    'data', help='Json string representing a table.', type=json.loads)
parser_table.add_argument(
    '--project',
    help='The Google Cloud project id to use as a parent resource.',
    default=default_project)
parser_table.add_argument(
    '--info_types', action='append',
    help='Strings representing info types to look for. A full list of '
         'info categories and types is available from the API. Examples '
         'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
         'If unspecified, the three above examples will be used.',
    default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_table.add_argument(
    '--custom_dictionaries', action='append',
    help='Strings representing comma-delimited lists of dictionary words'
         ' to search for as custom info types. Each string is a comma '
         'delimited list of words representing a distinct dictionary.',
    default=None)
parser_table.add_argument(
    '--custom_regexes', action='append',
    help='Strings representing regex patterns to search for as custom '
         ' info types.',
    default=None)
parser_table.add_argument(
    '--min_likelihood',
    choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
             'POSSIBLE', 'LIKELY', 'VERY_LIKELY'],
    help='A string representing the minimum likelihood threshold that '
         'constitutes a match.')
parser_table.add_argument(
    '--max_findings', type=int,
    help='The maximum number of findings to report; 0 = no maximum.')
parser_table.add_argument(
    # Parsed explicitly so that "--include_quote False" really disables it.
    '--include_quote', type=_parse_bool_flag,
    help='A boolean for whether to display a quote of the detected '
         'information in the results.',
    default=True)
860
+
701
861
parser_file = subparsers .add_parser ('file' , help = 'Inspect a local file.' )
702
862
parser_file .add_argument (
703
863
'filename' , help = 'The path to the file to inspect.' )
@@ -923,6 +1083,14 @@ def callback(message):
923
1083
min_likelihood = args .min_likelihood ,
924
1084
max_findings = args .max_findings ,
925
1085
include_quote = args .include_quote )
1086
+ elif args .content == 'table' :
1087
+ inspect_table (
1088
+ args .project , args .data , args .info_types ,
1089
+ custom_dictionaries = args .custom_dictionaries ,
1090
+ custom_regexes = args .custom_regexes ,
1091
+ min_likelihood = args .min_likelihood ,
1092
+ max_findings = args .max_findings ,
1093
+ include_quote = args .include_quote )
926
1094
elif args .content == 'file' :
927
1095
inspect_file (
928
1096
args .project , args .filename , args .info_types ,
0 commit comments