From 231fc45108cddca424c84ba347e5ff72839f123e Mon Sep 17 00:00:00 2001 From: Ace Date: Mon, 10 Dec 2018 12:57:46 -0800 Subject: [PATCH 1/5] Remove claim that redact.py operates on strings Reflect in the comments that this particular code sample does not support text redaction. --- dlp/redact.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlp/redact.py b/dlp/redact.py index 85fb9ef6458..004d06f5baf 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -13,7 +13,7 @@ # limitations under the License. """Sample app that uses the Data Loss Prevent API to redact the contents of -a string or an image file.""" +an image file.""" from __future__ import print_function From 9e2ed479250996281314a17b525a2ceb14908d43 Mon Sep 17 00:00:00 2001 From: achinkulshrestha Date: Fri, 14 Dec 2018 08:55:52 -0800 Subject: [PATCH 2/5] Add code sample for inspecting table, fix requirements for running tests, quickstart example refactor --- dlp/README.rst | 11 ++- dlp/inspect_content.py | 177 ++++++++++++++++++++++++++++++++++-- dlp/inspect_content_test.py | 15 ++- dlp/quickstart.py | 18 ++-- 4 files changed, 206 insertions(+), 15 deletions(-) diff --git a/dlp/README.rst b/dlp/README.rst index df85a0636e8..ce8b8550024 100644 --- a/dlp/README.rst +++ b/dlp/README.rst @@ -58,6 +58,15 @@ Install Dependencies .. _pip: https://pip.pypa.io/ .. _virtualenv: https://virtualenv.pypa.io/ +#. For running *_test.py files, install test dependencies + + .. code-block:: bash + + $ pip install -r requirements-test.txt + $ pytest inspect_content_test.py + +** *_test.py files are demo wrappers and make API calls. You may get rate limited for making a high number of requests. ** + Samples ------------------------------------------------------------------------------- @@ -74,7 +83,7 @@ To run this sample: .. 
code-block:: bash - $ python quickstart.py + $ python quickstart.py Inspect Content diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index aedc002d465..428dd39ec8c 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -19,6 +19,7 @@ import argparse import os +import json # [START dlp_inspect_string] @@ -102,6 +103,124 @@ def inspect_string(project, content_string, info_types, print('No findings.') # [END dlp_inspect_string] +# [START dlp_inspect_table] +def inspect_table(project, data, info_types, + custom_dictionaries=None, custom_regexes=None, + min_likelihood=None, max_findings=None, include_quote=True): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + data: Json string representing table data. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + Example: + data = { + "header":[ + "email", + "phone number" + ], + "rows":[ + [ + "robertfrost@xyz.com", + "4232342345" + ], + [ + "johndoe@pqr.com", + "4253458383" + ] + ] + } + + >> $ python inspect_content.py table \ + '{"header": ["email", "phone number"], "rows": [["robertfrost@xyz.com", "4232342345"], + ["johndoe@pqr.com", "4253458383"]]}' + >> Quote: robertfrost@xyz.com + Info type: EMAIL_ADDRESS + Likelihood: 4 + Quote: johndoe@pqr.com + Info type: EMAIL_ADDRESS + Likelihood: 4 + """ + + # Import the client library. 
+ import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{'name': info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dict.split(',')} + } + } for i, custom_dict in enumerate(custom_dictionaries)] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regex} + } for i, custom_regex in enumerate(custom_regexes)] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + 'info_types': info_types, + 'custom_info_types': custom_info_types, + 'min_likelihood': min_likelihood, + 'include_quote': include_quote, + 'limits': {'max_findings_per_request': max_findings}, + } + + # Construct the `table`. For more details on the table schema, please see + # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table + headers = [{"name": val} for val in data["header"]] + rows = [] + for row in data["rows"]: + rows.append({ + "values": [{"string_value": cell_val} for cell_val in row] + }) + + table = {} + table["headers"] = headers + table["rows"] = rows + item = {"table": table} + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. 
+ if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print('Quote: {}'.format(finding.quote)) + except AttributeError: + pass + print('Info type: {}'.format(finding.info_type.name)) + print('Likelihood: {}'.format(finding.likelihood)) + else: + print('No findings.') +# [END dlp_inspect_table] + # [START dlp_inspect_file] def inspect_file(project, filename, info_types, min_likelihood=None, @@ -309,7 +428,6 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, subscription_id) - subscription = subscriber.subscribe(subscription_path) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. @@ -341,8 +459,7 @@ def callback(message): print(e) raise - # Register the callback and wait on the event. - subscription.open(callback) + subscriber.subscribe(subscription_path, callback=callback) finished = job_done.wait(timeout=timeout) if not finished: print('No event received before the timeout. Please verify that the ' @@ -460,7 +577,6 @@ def inspect_datastore(project, datastore_project, kind, subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, subscription_id) - subscription = subscriber.subscribe(subscription_path) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. @@ -493,7 +609,8 @@ def callback(message): raise # Register the callback and wait on the event. - subscription.open(callback) + subscriber.subscribe(subscription_path, callback=callback) + finished = job_done.wait(timeout=timeout) if not finished: print('No event received before the timeout. 
Please verify that the ' @@ -609,7 +726,6 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, subscriber = google.cloud.pubsub.SubscriberClient() subscription_path = subscriber.subscription_path( project, subscription_id) - subscription = subscriber.subscribe(subscription_path) # Set up a callback to acknowledge a message. This closes around an event # so that it can signal that it is done and the main thread can continue. @@ -642,7 +758,7 @@ def callback(message): raise # Register the callback and wait on the event. - subscription.open(callback) + subscriber.subscribe(subscription_path, callback=callback) finished = job_done.wait(timeout=timeout) if not finished: print('No event received before the timeout. Please verify that the ' @@ -698,6 +814,45 @@ def callback(message): 'information in the results.', default=True) + parser_table = subparsers.add_parser('table', help='Inspect a table.') + parser_table.add_argument('data', help='Json string representing a table.', type=json.loads) + parser_table.add_argument( + '--project', + help='The Google Cloud project id to use as a parent resource.', + default=default_project) + parser_table.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_table.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types. 
Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', + default=None) + parser_table.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.', + default=None) + parser_table.add_argument( + '--min_likelihood', + choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', + 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], + help='A string representing the minimum likelihood threshold that ' + 'constitutes a match.') + parser_table.add_argument( + '--max_findings', type=int, + help='The maximum number of findings to report; 0 = no maximum.') + parser_table.add_argument( + '--include_quote', type=bool, + help='A boolean for whether to display a quote of the detected ' + 'information in the results.', + default=True) + parser_file = subparsers.add_parser('file', help='Inspect a local file.') parser_file.add_argument( 'filename', help='The path to the file to inspect.') @@ -923,6 +1078,14 @@ def callback(message): min_likelihood=args.min_likelihood, max_findings=args.max_findings, include_quote=args.include_quote) + if args.content == 'table': + inspect_table( + args.project, args.data, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote) elif args.content == 'file': inspect_file( args.project, args.filename, args.info_types, diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index db1a0074142..d1b2a6fe2c9 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -24,7 +24,7 @@ import google.cloud.storage import pytest - +import json import inspect_content @@ -169,6 +169,19 @@ def test_inspect_string(capsys): assert 'Info type: FIRST_NAME' in out assert 'Info type: EMAIL_ADDRESS' in out +def test_inspect_table(capsys): + test_tabular_data = '{"header": ["email", 
"phone number"], "rows": [["robertfrost@xyz.com", "4232342345"], ["johndoe@pqr.com", "4253458383"]]}' + + inspect_content.inspect_table( + GCLOUD_PROJECT, + json.loads(test_tabular_data), + ['PHONE_NUMBER', 'EMAIL_ADDRESS'], + include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: PHONE_NUMBER' in out + assert 'Info type: EMAIL_ADDRESS' in out + def test_inspect_string_with_custom_info_types(capsys): test_string = 'My name is Gary Smith and my email is gary@example.com' diff --git a/dlp/quickstart.py b/dlp/quickstart.py index 82bbc1f3078..d2e0bff1dbd 100644 --- a/dlp/quickstart.py +++ b/dlp/quickstart.py @@ -17,17 +17,17 @@ from __future__ import print_function +import sys +import argparse -def quickstart(): + +def quickstart(project_id): """Demonstrates use of the Data Loss Prevention API client library.""" # [START dlp_quickstart] # Import the client library import google.cloud.dlp - # Edit this with your Google Cloud Project ID. - project = 'your-project' - # Instantiate a client. dlp = google.cloud.dlp.DlpServiceClient() @@ -59,7 +59,7 @@ def quickstart(): } # Convert the project id into a full resource id. - parent = dlp.project_path(project) + parent = dlp.project_path(project_id) # Call the API. 
response = dlp.inspect_content(parent, inspect_config, item) @@ -84,4 +84,10 @@ def quickstart(): if __name__ == '__main__': - quickstart() + parser = argparse.ArgumentParser() + parser.add_argument("project_id", help="Enter your GCP project id.", type=str) + args = parser.parse_args() + if len(sys.argv) == 1: + parser.print_usage() + sys.exit(1) + quickstart(args.project_id) From 1436c228a000ba973bc307543c8fb1ad3e35d0d5 Mon Sep 17 00:00:00 2001 From: achinkulshrestha Date: Fri, 14 Dec 2018 09:17:26 -0800 Subject: [PATCH 3/5] Remove newline, if -> elif --- dlp/inspect_content.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 428dd39ec8c..762498ae285 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -221,7 +221,6 @@ def inspect_table(project, data, info_types, print('No findings.') # [END dlp_inspect_table] - # [START dlp_inspect_file] def inspect_file(project, filename, info_types, min_likelihood=None, custom_dictionaries=None, custom_regexes=None, @@ -1078,7 +1077,7 @@ def callback(message): min_likelihood=args.min_likelihood, max_findings=args.max_findings, include_quote=args.include_quote) - if args.content == 'table': + elif args.content == 'table': inspect_table( args.project, args.data, args.info_types, custom_dictionaries=args.custom_dictionaries, From 3f171f8980a6bff10f829e88d80e94206a0f091f Mon Sep 17 00:00:00 2001 From: achinkulshrestha Date: Fri, 14 Dec 2018 10:03:45 -0800 Subject: [PATCH 4/5] formatting --- dlp/deid.py | 6 +++--- dlp/inspect_content.py | 20 +++++++++++++------- dlp/inspect_content_test.py | 5 ++++- dlp/quickstart.py | 3 ++- dlp/risk.py | 10 +++++----- 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index 98b41488267..9c97f8e620e 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -402,7 +402,7 @@ def write_data(data): write_file.writerow(map(write_data, row.values)) # Print status print('Successfully saved 
date-shift output to {}'.format( - output_csv_file)) + output_csv_file)) # [END dlp_deidentify_date_shift] @@ -450,8 +450,8 @@ def write_data(data): 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) fpe_parser.add_argument( - 'project', - help='The Google Cloud project id to use as a parent resource.') + 'project', + help='The Google Cloud project id to use as a parent resource.') fpe_parser.add_argument( 'item', help='The string to deidentify. ' diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 762498ae285..562ad4d2ab7 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -78,7 +78,7 @@ def inspect_string(project, content_string, info_types, 'min_likelihood': min_likelihood, 'include_quote': include_quote, 'limits': {'max_findings_per_request': max_findings}, - } + } # Construct the `item`. item = {'value': content_string} @@ -104,9 +104,11 @@ def inspect_string(project, content_string, info_types, # [END dlp_inspect_string] # [START dlp_inspect_table] + + def inspect_table(project, data, info_types, - custom_dictionaries=None, custom_regexes=None, - min_likelihood=None, max_findings=None, include_quote=True): + custom_dictionaries=None, custom_regexes=None, + min_likelihood=None, max_findings=None, include_quote=True): """Uses the Data Loss Prevention API to analyze strings for protected data. Args: project: The Google Cloud project id to use as a parent resource. 
@@ -140,7 +142,8 @@ def inspect_table(project, data, info_types, } >> $ python inspect_content.py table \ - '{"header": ["email", "phone number"], "rows": [["robertfrost@xyz.com", "4232342345"], + '{"header": ["email", "phone number"], + "rows": [["robertfrost@xyz.com", "4232342345"], ["johndoe@pqr.com", "4253458383"]]}' >> Quote: robertfrost@xyz.com Info type: EMAIL_ADDRESS @@ -186,7 +189,7 @@ def inspect_table(project, data, info_types, 'min_likelihood': min_likelihood, 'include_quote': include_quote, 'limits': {'max_findings_per_request': max_findings}, - } + } # Construct the `table`. For more details on the table schema, please see # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table @@ -222,6 +225,8 @@ def inspect_table(project, data, info_types, # [END dlp_inspect_table] # [START dlp_inspect_file] + + def inspect_file(project, filename, info_types, min_likelihood=None, custom_dictionaries=None, custom_regexes=None, max_findings=None, include_quote=True, mime_type=None): @@ -402,8 +407,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, storage_config = { 'cloud_storage_options': { 'file_set': {'url': url} - } } + } # Convert the project id into a full resource id. 
parent = dlp.project_path(project) @@ -814,7 +819,8 @@ def callback(message): default=True) parser_table = subparsers.add_parser('table', help='Inspect a table.') - parser_table.add_argument('data', help='Json string representing a table.', type=json.loads) + parser_table.add_argument( + 'data', help='Json string representing a table.', type=json.loads) parser_table.add_argument( '--project', help='The Google Cloud project id to use as a parent resource.', diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index d1b2a6fe2c9..8d84b92a693 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -169,8 +169,11 @@ def test_inspect_string(capsys): assert 'Info type: FIRST_NAME' in out assert 'Info type: EMAIL_ADDRESS' in out + def test_inspect_table(capsys): - test_tabular_data = '{"header": ["email", "phone number"], "rows": [["robertfrost@xyz.com", "4232342345"], ["johndoe@pqr.com", "4253458383"]]}' + test_tabular_data = '{"header": ["email", "phone number"], + "rows": [["robertfrost@xyz.com", "4232342345"], + ["johndoe@pqr.com", "4253458383"]]}' inspect_content.inspect_table( GCLOUD_PROJECT, diff --git a/dlp/quickstart.py b/dlp/quickstart.py index d2e0bff1dbd..e05d6798cec 100644 --- a/dlp/quickstart.py +++ b/dlp/quickstart.py @@ -85,7 +85,8 @@ def quickstart(project_id): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("project_id", help="Enter your GCP project id.", type=str) + parser.add_argument( + "project_id", help="Enter your GCP project id.", type=str) args = parser.parse_args() if len(sys.argv) == 1: parser.print_usage() diff --git a/dlp/risk.py b/dlp/risk.py index f06b08494ca..8512f054bce 100644 --- a/dlp/risk.py +++ b/dlp/risk.py @@ -360,12 +360,12 @@ def callback(message): print('Bucket {}:'.format(i)) if bucket.equivalence_class_size_lower_bound: print(' Bucket size range: [{}, {}]'.format( - bucket.equivalence_class_size_lower_bound, - bucket.equivalence_class_size_upper_bound)) 
+ bucket.equivalence_class_size_lower_bound, + bucket.equivalence_class_size_upper_bound)) for value_bucket in bucket.bucket_values: print(' Quasi-ID values: {}'.format( map(get_values, value_bucket.quasi_ids_values) - )) + )) print(' Class size: {}'.format( value_bucket.equivalence_class_size)) # Signal to the main thread that we can exit. @@ -495,8 +495,8 @@ def callback(message): for i, bucket in enumerate(histogram_buckets): print('Bucket {}:'.format(i)) print(' Bucket size range: [{}, {}]'.format( - bucket.sensitive_value_frequency_lower_bound, - bucket.sensitive_value_frequency_upper_bound)) + bucket.sensitive_value_frequency_lower_bound, + bucket.sensitive_value_frequency_upper_bound)) for value_bucket in bucket.bucket_values: print(' Quasi-ID values: {}'.format( map(get_values, value_bucket.quasi_ids_values))) From 51122dcf827b8642a76bd10a5f3959042df7db53 Mon Sep 17 00:00:00 2001 From: achinkulshrestha Date: Fri, 14 Dec 2018 10:20:46 -0800 Subject: [PATCH 5/5] More formatting --- dlp/inspect_content.py | 2 +- dlp/inspect_content_test.py | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 562ad4d2ab7..a741e0ee734 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -142,7 +142,7 @@ def inspect_table(project, data, info_types, } >> $ python inspect_content.py table \ - '{"header": ["email", "phone number"], + '{"header": ["email", "phone number"], "rows": [["robertfrost@xyz.com", "4232342345"], ["johndoe@pqr.com", "4253458383"]]}' >> Quote: robertfrost@xyz.com diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index 8d84b92a693..d5dd84a6f2f 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -24,7 +24,6 @@ import google.cloud.storage import pytest -import json import inspect_content @@ -171,13 +170,26 @@ def test_inspect_string(capsys): def test_inspect_table(capsys): - test_tabular_data = '{"header": 
["email", "phone number"], - "rows": [["robertfrost@xyz.com", "4232342345"], - ["johndoe@pqr.com", "4253458383"]]}' + test_tabular_data = { + "header": [ + "email", + "phone number" + ], + "rows": [ + [ + "robertfrost@xyz.com", + "4232342345" + ], + [ + "johndoe@pqr.com", + "4253458383" + ] + ] + } inspect_content.inspect_table( GCLOUD_PROJECT, - json.loads(test_tabular_data), + test_tabular_data, ['PHONE_NUMBER', 'EMAIL_ADDRESS'], include_quote=True)