Skip to content

Commit 6d4b796

Browse files
DLP: Added sample for k anonymity with entity ID and deidentify cloud storage (#3364)
* DLP: Added sample for k anonymity with entity ID and deidentify cloud storage. Added unit test cases for the same.
* Improved comments as per PR suggestions.
* 🦉 Updates from OwlBot post-processor. See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 165f350 commit 6d4b796

File tree

5 files changed

+677
-0
lines changed

5 files changed

+677
-0
lines changed

dlp/deIdentifyCloudStorage.js

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
// Copyright 2023 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
'use strict';
16+
17+
// sample-metadata:
18+
// title: De-identify sensitive data in a Cloud Storage directory.
19+
// description: Uses the Data Loss Prevention API to de-identify sensitive data in a Cloud Storage directory.
20+
// usage: node deIdentifyCloudStorage.js projectId, inputDirectory, tableId, datasetId, outputDirectory, deidentifyTemplateId, structuredDeidentifyTemplateId, imageRedactTemplateId
21+
async function main(
22+
projectId,
23+
inputDirectory,
24+
tableId,
25+
datasetId,
26+
outputDirectory,
27+
deidentifyTemplateId,
28+
structuredDeidentifyTemplateId,
29+
imageRedactTemplateId
30+
) {
31+
// [START dlp_deidentify_cloud_storage]
32+
// Imports the Google Cloud client library
33+
const DLP = require('@google-cloud/dlp');
34+
// Instantiates a client
35+
const dlp = new DLP.DlpServiceClient();
36+
37+
// The project ID to run the API call under
38+
// const projectId = 'my-project';
39+
40+
// The Cloud Storage directory that needs to be inspected
41+
// const inputDirectory = 'your-google-cloud-storage-path';
42+
43+
// The ID of the dataset to inspect, e.g. 'my_dataset'
44+
// const datasetId = 'my_dataset';
45+
46+
// The ID of the table to inspect, e.g. 'my_table'
47+
// const tableId = 'my_table';
48+
49+
// The Cloud Storage directory that will be used to store the de-identified files
50+
// const outputDirectory = 'your-output-directory';
51+
52+
// The full resource name of the default de-identify template
53+
// const deidentifyTemplateId = 'your-deidentify-template-id';
54+
55+
// The full resource name of the de-identify template for structured files
56+
// const structuredDeidentifyTemplateId = 'your-structured-deidentify-template-id';
57+
58+
// The full resource name of the image redaction template for images
59+
// const imageRedactTemplateId = 'your-image-redact-template-id';
60+
61+
async function deidentifyCloudStorage() {
62+
// Specify storage configuration that uses file set.
63+
const storageConfig = {
64+
cloudStorageOptions: {
65+
fileSet: {
66+
url: inputDirectory,
67+
},
68+
},
69+
};
70+
71+
// Specify the type of info the inspection will look for.
72+
const infoTypes = [{name: 'PERSON_NAME'}, {name: 'EMAIL_ADDRESS'}];
73+
74+
// Construct inspect configuration
75+
const inspectConfig = {
76+
infoTypes: infoTypes,
77+
includeQuote: true,
78+
};
79+
80+
// Types of files to include for de-identification.
81+
const fileTypesToTransform = [
82+
{fileType: 'IMAGE'},
83+
{fileType: 'CSV'},
84+
{fileType: 'TEXT_FILE'},
85+
];
86+
87+
// Specify the big query table to store the transformation details.
88+
const transformationDetailsStorageConfig = {
89+
table: {
90+
projectId: projectId,
91+
tableId: tableId,
92+
datasetId: datasetId,
93+
},
94+
};
95+
96+
// Specify the de-identify template used for the transformation.
97+
const transformationConfig = {
98+
deidentifyTemplate: deidentifyTemplateId,
99+
structuredDeidentifyTemplate: structuredDeidentifyTemplateId,
100+
imageRedactTemplate: imageRedactTemplateId,
101+
};
102+
103+
// Construct action to de-identify sensitive data.
104+
const action = {
105+
deidentify: {
106+
cloudStorageOutput: outputDirectory,
107+
transformationConfig: transformationConfig,
108+
transformationDetailsStorageConfig: transformationDetailsStorageConfig,
109+
fileTypes: fileTypesToTransform,
110+
},
111+
};
112+
113+
// Construct the inspect job configuration.
114+
const inspectJobConfig = {
115+
inspectConfig: inspectConfig,
116+
storageConfig: storageConfig,
117+
actions: [action],
118+
};
119+
120+
// Construct the job creation request to be sent by the client.
121+
const createDlpJobRequest = {
122+
parent: `projects/${projectId}/locations/global`,
123+
inspectJob: inspectJobConfig,
124+
};
125+
// Send the job creation request and process the response.
126+
const [response] = await dlp.createDlpJob(createDlpJobRequest);
127+
const jobName = response.name;
128+
129+
// Waiting for a maximum of 15 minutes for the job to get complete.
130+
let job;
131+
let numOfAttempts = 30;
132+
while (numOfAttempts > 0) {
133+
// Fetch DLP Job status
134+
[job] = await dlp.getDlpJob({name: jobName});
135+
136+
// Check if the job has completed.
137+
if (job.state === 'DONE') {
138+
break;
139+
}
140+
if (job.state === 'FAILED') {
141+
console.log('Job Failed, Please check the configuration.');
142+
return;
143+
}
144+
// Sleep for a short duration before checking the job status again.
145+
await new Promise(resolve => {
146+
setTimeout(() => resolve(), 30000);
147+
});
148+
numOfAttempts -= 1;
149+
}
150+
151+
// Print out the results.
152+
const infoTypeStats = job.inspectDetails.result.infoTypeStats;
153+
if (infoTypeStats.length > 0) {
154+
infoTypeStats.forEach(infoTypeStat => {
155+
console.log(
156+
` Found ${infoTypeStat.count} instance(s) of infoType ${infoTypeStat.infoType.name}.`
157+
);
158+
});
159+
} else {
160+
console.log('No findings.');
161+
}
162+
}
163+
await deidentifyCloudStorage();
164+
// [END dlp_deidentify_cloud_storage]
165+
}
166+
167+
// Report the message of any promise rejection that reaches the process level
// and set the exit code to 1; the handler does not call process.exit itself.
process.on('unhandledRejection', function onUnhandledRejection(err) {
  console.error(err.message);
  process.exitCode = 1;
});

// TODO(developer): Please uncomment below line before running sample
// main(...process.argv.slice(2));

// Expose the sample's entry point to code that requires this file.
module.exports = main;

dlp/kAnonymityWithEntityIds.js

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
// Copyright 2023 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
'use strict';
16+
17+
// sample-metadata:
18+
// title: Create a Dlp Job to visualize the k-anonymity re-identification risk analysis metric
19+
// description: Uses the Data Loss Prevention API to visualize the k-anonymity re-identification risk analysis metric.
20+
// usage: node kAnonymityWithEntityIds.js projectId, datasetId, sourceTableId, outputTableId
21+
async function main(projectId, datasetId, sourceTableId, outputTableId) {
22+
// [START dlp_k_anonymity_with_entity_id]
23+
// Imports the Google Cloud Data Loss Prevention library
24+
const DLP = require('@google-cloud/dlp');
25+
26+
// Instantiates a client
27+
const dlp = new DLP.DlpServiceClient();
28+
29+
// The project ID to run the API call under.
30+
// const projectId = "your-project-id";
31+
32+
// The ID of the dataset to inspect, e.g. 'my_dataset'
33+
// const datasetId = 'my_dataset';
34+
35+
// The ID of the table to inspect, e.g. 'my_table'
36+
// const sourceTableId = 'my_source_table';
37+
38+
// The ID of the table where outputs are stored
39+
// const outputTableId = 'my_output_table';
40+
41+
async function kAnonymityWithEntityIds() {
42+
// Specify the BigQuery table to analyze.
43+
const sourceTable = {
44+
projectId: projectId,
45+
datasetId: datasetId,
46+
tableId: sourceTableId,
47+
};
48+
49+
// Specify the unique identifier in the source table for the k-anonymity analysis.
50+
const uniqueIdField = {name: 'Name'};
51+
52+
// These values represent the column names of quasi-identifiers to analyze
53+
const quasiIds = [{name: 'Age'}, {name: 'Mystery'}];
54+
55+
// Configure the privacy metric to compute for re-identification risk analysis.
56+
const privacyMetric = {
57+
kAnonymityConfig: {
58+
entityId: {
59+
field: uniqueIdField,
60+
},
61+
quasiIds: quasiIds,
62+
},
63+
};
64+
// Create action to publish job status notifications to BigQuery table.
65+
const action = [
66+
{
67+
saveFindings: {
68+
outputConfig: {
69+
table: {
70+
projectId: projectId,
71+
datasetId: datasetId,
72+
tableId: outputTableId,
73+
},
74+
},
75+
},
76+
},
77+
];
78+
79+
// Configure the risk analysis job to perform.
80+
const riskAnalysisJob = {
81+
sourceTable: sourceTable,
82+
privacyMetric: privacyMetric,
83+
actions: action,
84+
};
85+
// Combine configurations into a request for the service.
86+
const createDlpJobRequest = {
87+
parent: `projects/${projectId}/locations/global`,
88+
riskJob: riskAnalysisJob,
89+
};
90+
91+
// Send the request and receive response from the service
92+
const [createdDlpJob] = await dlp.createDlpJob(createDlpJobRequest);
93+
const jobName = createdDlpJob.name;
94+
95+
// Waiting for a maximum of 15 minutes for the job to get complete.
96+
let job;
97+
let numOfAttempts = 30;
98+
while (numOfAttempts > 0) {
99+
// Fetch DLP Job status
100+
[job] = await dlp.getDlpJob({name: jobName});
101+
102+
// Check if the job has completed.
103+
if (job.state === 'DONE') {
104+
break;
105+
}
106+
if (job.state === 'FAILED') {
107+
console.log('Job Failed, Please check the configuration.');
108+
return;
109+
}
110+
// Sleep for a short duration before checking the job status again.
111+
await new Promise(resolve => {
112+
setTimeout(() => resolve(), 30000);
113+
});
114+
numOfAttempts -= 1;
115+
}
116+
117+
// Create helper function for unpacking values
118+
const getValue = obj => obj[Object.keys(obj)[0]];
119+
120+
// Print out the results.
121+
const histogramBuckets =
122+
job.riskDetails.kAnonymityResult.equivalenceClassHistogramBuckets;
123+
124+
histogramBuckets.forEach((histogramBucket, histogramBucketIdx) => {
125+
console.log(`Bucket ${histogramBucketIdx}:`);
126+
console.log(
127+
` Bucket size range: [${histogramBucket.equivalenceClassSizeLowerBound}, ${histogramBucket.equivalenceClassSizeUpperBound}]`
128+
);
129+
130+
histogramBucket.bucketValues.forEach(valueBucket => {
131+
const quasiIdValues = valueBucket.quasiIdsValues
132+
.map(getValue)
133+
.join(', ');
134+
console.log(` Quasi-ID values: {${quasiIdValues}}`);
135+
console.log(` Class size: ${valueBucket.equivalenceClassSize}`);
136+
});
137+
});
138+
}
139+
await kAnonymityWithEntityIds();
140+
// [END dlp_k_anonymity_with_entity_id]
141+
}
142+
// Report the message of any promise rejection that reaches the process level
// and set the exit code to 1; the handler does not call process.exit itself.
process.on('unhandledRejection', function onUnhandledRejection(err) {
  console.error(err.message);
  process.exitCode = 1;
});

// TODO(developer): Please uncomment below line before running sample
// main(...process.argv.slice(2));

// Expose the sample's entry point to code that requires this file.
module.exports = main;

0 commit comments

Comments
 (0)