Skip to content

Commit 4b9bd20

Browse files
committed
DLP: Added sample for k anonymity with entity ID and deidentify cloud storage
Added unit test cases for same
1 parent 020c773 commit 4b9bd20

File tree

5 files changed

+680
-0
lines changed

5 files changed

+680
-0
lines changed

dlp/deIdentifyCloudStorage.js

+175
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
// Copyright 2023 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
'use strict';
16+
17+
// sample-metadata:
18+
// title: De-identify sensitive data in a Cloud Storage directory.
19+
// description: Uses the Data Loss Prevention API to de-identify sensitive data in a Cloud Storage directory.
20+
// usage: node deIdentifyCloudStorage.js projectId, inputDirectory, tableId, datasetId, outputDirectory, deidentifyTemplateId, structuredDeidentifyTemplateId, imageRedactTemplateId
21+
async function main(
22+
projectId,
23+
inputDirectory,
24+
tableId,
25+
datasetId,
26+
outputDirectory,
27+
deidentifyTemplateId,
28+
structuredDeidentifyTemplateId,
29+
imageRedactTemplateId
30+
) {
31+
// [START dlp_deidentify_cloud_storage]
32+
// Imports the Google Cloud client library
33+
const DLP = require('@google-cloud/dlp');
34+
// Instantiates a client
35+
const dlp = new DLP.DlpServiceClient();
36+
37+
// The project ID to run the API call under
38+
// const projectId = 'my-project';
39+
40+
// The Cloud Storage directory that needs to be inspected
41+
// const inputDirectory = 'your-google-cloud-storage-path';
42+
43+
// The ID of the dataset to inspect, e.g. 'my_dataset'
44+
// const datasetId = 'my_dataset';
45+
46+
// The ID of the table to inspect, e.g. 'my_table'
47+
// const tableId = 'my_table';
48+
49+
// The Cloud Storage directory that will be used to store the de-identified files
50+
// const outputDirectory = 'your-output-directory';
51+
52+
// The full resource name of the default de-identify template
53+
// const deidentifyTemplateId = 'your-deidentify-template-id';
54+
55+
// The full resource name of the de-identify template for structured files
56+
// const structuredDeidentifyTemplateId = 'your-structured-deidentify-template-id';
57+
58+
// The full resource name of the image redaction template for images
59+
// const imageRedactTemplateId = 'your-image-redact-template-id';
60+
61+
async function deidentifyCloudStorage() {
62+
// Specify storage configuration that uses file set.
63+
const storageConfig = {
64+
cloudStorageOptions: {
65+
fileSet: {
66+
url: inputDirectory,
67+
},
68+
},
69+
};
70+
71+
// Specify the type of info the inspection will look for.
72+
const infoTypes = [{name: 'PERSON_NAME'}, {name: 'EMAIL_ADDRESS'}];
73+
74+
// Construct inspect configuration
75+
const inspectConfig = {
76+
infoTypes: infoTypes,
77+
includeQuote: true,
78+
};
79+
80+
// Types of files to include for de-identification.
81+
const fileTypesToTransform = [
82+
{fileType: 'IMAGE'},
83+
{fileType: 'CSV'},
84+
{fileType: 'TEXT_FILE'},
85+
];
86+
87+
// Specify the big query table to store the transformation details.
88+
const transformationDetailsStorageConfig = {
89+
table: {
90+
projectId: projectId,
91+
tableId: tableId,
92+
datasetId: datasetId,
93+
},
94+
};
95+
96+
// Specify the de-identify template used for the transformation.
97+
const transformationConfig = {
98+
deidentifyTemplate: deidentifyTemplateId,
99+
structuredDeidentifyTemplate: structuredDeidentifyTemplateId,
100+
imageRedactTemplate: imageRedactTemplateId,
101+
};
102+
103+
// Construct action to de-identify sensitive data.
104+
const action = {
105+
deidentify: {
106+
cloudStorageOutput: outputDirectory,
107+
transformationConfig: transformationConfig,
108+
transformationDetailsStorageConfig: transformationDetailsStorageConfig,
109+
fileTypes: fileTypesToTransform,
110+
},
111+
};
112+
113+
// Construct the inspect job configuration.
114+
const inspectJobConfig = {
115+
inspectConfig: inspectConfig,
116+
storageConfig: storageConfig,
117+
actions: [action],
118+
};
119+
120+
// Construct the job creation request to be sent by the client.
121+
const request = {
122+
parent: `projects/${projectId}/locations/global`,
123+
inspectJob: inspectJobConfig,
124+
};
125+
// Send the job creation request and process the response.
126+
const [response] = await dlp.createDlpJob(request);
127+
const jobName = response.name;
128+
129+
// Waiting for a maximum of 15 minutes for the job to get complete.
130+
let job;
131+
let numOfAttempts = 30;
132+
while (numOfAttempts > 0) {
133+
// Fetch DLP Job status
134+
[job] = await dlp.getDlpJob({name: jobName});
135+
136+
// Check if the job has completed.
137+
if (job.state === 'DONE') {
138+
break;
139+
}
140+
if (job.state === 'FAILED') {
141+
console.log('Job Failed, Please check the configuration.');
142+
return;
143+
}
144+
// Sleep for a short duration before checking the job status again.
145+
await new Promise(resolve => {
146+
setTimeout(() => resolve(), 30000);
147+
});
148+
numOfAttempts -= 1;
149+
}
150+
151+
// Print out the results.
152+
const infoTypeStats = job.inspectDetails.result.infoTypeStats;
153+
if (infoTypeStats.length > 0) {
154+
infoTypeStats.forEach(infoTypeStat => {
155+
console.log(
156+
` Found ${infoTypeStat.count} instance(s) of infoType ${infoTypeStat.infoType.name}.`
157+
);
158+
});
159+
} else {
160+
console.log('No findings.');
161+
}
162+
}
163+
await deidentifyCloudStorage();
164+
// [END dlp_deidentify_cloud_storage]
165+
}
166+
167+
// Log unhandled promise rejections and set a failing exit code.
process.on('unhandledRejection', err => {
  // A rejection reason can be any value (including undefined), not only an
  // Error; print the raw reason when it carries no message.
  const hasMessage =
    err !== null && err !== undefined && err.message !== undefined;
  console.error(hasMessage ? err.message : err);
  process.exitCode = 1;
});
171+
172+
// TODO(developer): Please uncomment below line before running sample
173+
// main(...process.argv.slice(2));
174+
175+
module.exports = main;

dlp/kAnonymityWithEntityIds.js

+153
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
// Copyright 2023 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
'use strict';
16+
17+
// sample-metadata:
18+
// title: Create a DLP job to visualize the k-anonymity re-identification risk analysis metric
19+
// description: Uses the Data Loss Prevention API to visualize the k-anonymity re-identification risk analysis metric.
20+
// usage: node kAnonymityWithEntityIds.js projectId, datasetId, sourceTableId, outputTableId
21+
async function main(projectId, datasetId, sourceTableId, outputTableId) {
22+
// [START dlp_k_anonymity_with_entity_id]
23+
// Imports the Google Cloud Data Loss Prevention library
24+
const DLP = require('@google-cloud/dlp');
25+
26+
// Instantiates a client
27+
const dlp = new DLP.DlpServiceClient();
28+
29+
// The project ID to run the API call under.
30+
// const projectId = "your-project-id";
31+
32+
// The ID of the dataset to inspect, e.g. 'my_dataset'
33+
// const datasetId = 'my_dataset';
34+
35+
// The ID of the table to inspect, e.g. 'my_table'
36+
// const sourceTableId = 'my_source_table';
37+
38+
// The ID of the table where outputs are stored
39+
// const outputTableId = 'my_output_table';
40+
41+
async function kAnonymityWithEntityIds() {
42+
// Specify the BigQuery table to analyze.
43+
const sourceTable = {
44+
projectId: projectId,
45+
datasetId: datasetId,
46+
tableId: sourceTableId,
47+
};
48+
49+
// Specify quasi-identifiers to analyze
50+
const privacyMetric = {
51+
kAnonymityConfig: {
52+
entityId: {
53+
field: {
54+
name: 'Name',
55+
},
56+
},
57+
quasiIds: [
58+
{
59+
name: 'Age',
60+
},
61+
{
62+
name: 'Mystery',
63+
},
64+
],
65+
},
66+
};
67+
// Create action to publish job status notifications to BigQuery table.
68+
const action = [
69+
{
70+
saveFindings: {
71+
outputConfig: {
72+
table: {
73+
projectId: projectId,
74+
datasetId: datasetId,
75+
tableId: outputTableId,
76+
},
77+
},
78+
},
79+
},
80+
];
81+
82+
// Configure the risk analysis job to perform.
83+
const riskJob = {
84+
sourceTable: sourceTable,
85+
privacyMetric: privacyMetric,
86+
actions: action,
87+
};
88+
// Combine configurations into a request for the service.
89+
const request = {
90+
parent: `projects/${projectId}/locations/global`,
91+
riskJob: riskJob,
92+
};
93+
94+
// Send the request and receive response from the service
95+
const [createdDlpJob] = await dlp.createDlpJob(request);
96+
const jobName = createdDlpJob.name;
97+
98+
// Waiting for a maximum of 15 minutes for the job to get complete.
99+
let job;
100+
let numOfAttempts = 30;
101+
while (numOfAttempts > 0) {
102+
// Fetch DLP Job status
103+
[job] = await dlp.getDlpJob({name: jobName});
104+
105+
// Check if the job has completed.
106+
if (job.state === 'DONE') {
107+
break;
108+
}
109+
if (job.state === 'FAILED') {
110+
console.log('Job Failed, Please check the configuration.');
111+
return;
112+
}
113+
// Sleep for a short duration before checking the job status again.
114+
await new Promise(resolve => {
115+
setTimeout(() => resolve(), 30000);
116+
});
117+
numOfAttempts -= 1;
118+
}
119+
120+
// Create helper function for unpacking values
121+
const getValue = obj => obj[Object.keys(obj)[0]];
122+
123+
// Print out the results.
124+
const histogramBuckets =
125+
job.riskDetails.kAnonymityResult.equivalenceClassHistogramBuckets;
126+
127+
histogramBuckets.forEach((histogramBucket, histogramBucketIdx) => {
128+
console.log(`Bucket ${histogramBucketIdx}:`);
129+
console.log(
130+
` Bucket size range: [${histogramBucket.equivalenceClassSizeLowerBound}, ${histogramBucket.equivalenceClassSizeUpperBound}]`
131+
);
132+
133+
histogramBucket.bucketValues.forEach(valueBucket => {
134+
const quasiIdValues = valueBucket.quasiIdsValues
135+
.map(getValue)
136+
.join(', ');
137+
console.log(` Quasi-ID values: {${quasiIdValues}}`);
138+
console.log(` Class size: ${valueBucket.equivalenceClassSize}`);
139+
});
140+
});
141+
}
142+
await kAnonymityWithEntityIds();
143+
// [END dlp_k_anonymity_with_entity_id]
144+
}
145+
// Log unhandled promise rejections and set a failing exit code.
process.on('unhandledRejection', err => {
  // A rejection reason can be any value (including undefined), not only an
  // Error; print the raw reason when it carries no message.
  const hasMessage =
    err !== null && err !== undefined && err.message !== undefined;
  console.error(hasMessage ? err.message : err);
  process.exitCode = 1;
});
149+
150+
// TODO(developer): Please uncomment below line before running sample
151+
// main(...process.argv.slice(2));
152+
153+
module.exports = main;

0 commit comments

Comments
 (0)