Skip to content

Commit b6b4136

Browse files
authored
Add DLP sample for redacting all image text (#4018)
The sample shows how to remove all text found in an image with DLP. The sample is integrated into the existing redact.py CLI application.
1 parent 21a25b9 commit b6b4136

File tree

3 files changed

+118
-42
lines changed

3 files changed

+118
-42
lines changed

dlp/README.rst

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -136,37 +136,21 @@ To run this sample:
136136
137137
$ python redact.py
138138
139-
usage: redact.py [-h] [--project PROJECT]
140-
[--info_types INFO_TYPES [INFO_TYPES ...]]
141-
[--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}]
142-
[--mime_type MIME_TYPE]
143-
filename output_filename
139+
usage: redact.py [-h] {info_types,all_text} ...
144140
145141
Sample app that uses the Data Loss Prevention API to redact the contents of an
146142
image file.
147143
148144
positional arguments:
149-
filename The path to the file to inspect.
150-
output_filename The path to which the redacted image will be written.
145+
{info_types,all_text}
146+
Select which content should be redacted.
147+
info_types Redact specific infoTypes from an image.
148+
all_text Redact all text from an image. The MIME type of the
149+
file is inferred via the Python standard library's
150+
mimetypes module.
151151
152152
optional arguments:
153153
-h, --help show this help message and exit
154-
--project PROJECT The Google Cloud project id to use as a parent
155-
resource.
156-
--info_types INFO_TYPES [INFO_TYPES ...]
157-
Strings representing info types to look for. A full
158-
list of info categories and types is available from
159-
the API. Examples include "FIRST_NAME", "LAST_NAME",
160-
"EMAIL_ADDRESS". If unspecified, the three above
161-
examples will be used.
162-
--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}
163-
A string representing the minimum likelihood threshold
164-
that constitutes a match.
165-
--mime_type MIME_TYPE
166-
The MIME type of the file. If not specified, the type
167-
is inferred via the Python standard library's
168-
mimetypes module.
169-
170154
171155
172156
Metadata

dlp/redact.py

Lines changed: 97 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -121,23 +121,87 @@ def redact_image(
121121

122122
# [END dlp_redact_image]
123123

124+
# [START dlp_redact_image_all_text]
124125

125-
if __name__ == "__main__":
126-
default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
127126

128-
parser = argparse.ArgumentParser(description=__doc__)
127+
def redact_image_all_text(
    project,
    filename,
    output_filename,
):
    """Uses the Data Loss Prevention API to redact all text in an image.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.

    Returns:
        None; the redacted image is written to output_filename and a
        one-line summary is printed to the terminal.
    """
    # Import the client library. The versioned module is imported by name
    # because that is the module the client class is looked up on below.
    import google.cloud.dlp_v2

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct the image_redaction_configs, indicating to DLP that all text in
    # the input image should be redacted.
    image_redaction_configs = [{
        "redact_all_text": True,
    }]

    # Construct the byte_item, containing the file's byte data. The item type
    # is always the generic "IMAGE"; no per-file MIME type is derived here.
    with open(filename, mode="rb") as f:
        byte_item = {"type": "IMAGE", "data": f.read()}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item,
    )

    # Write out the results.
    with open(output_filename, mode="wb") as f:
        f.write(response.redacted_image)

    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image), filename=output_filename))
174+
175+
176+
# [END dlp_redact_image_all_text]
177+
178+
if __name__ == "__main__":
179+
default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
180+
181+
common_args_parser = argparse.ArgumentParser(add_help=False)
182+
common_args_parser.add_argument(
136183
"--project",
137184
help="The Google Cloud project id to use as a parent resource.",
138185
default=default_project,
139186
)
140-
parser.add_argument(
187+
common_args_parser.add_argument(
188+
"filename", help="The path to the file to inspect.")
189+
common_args_parser.add_argument(
190+
"output_filename",
191+
help="The path to which the redacted image will be written.",
192+
)
193+
194+
parser = argparse.ArgumentParser(description=__doc__)
195+
subparsers = parser.add_subparsers(
196+
dest="content", help="Select which content should be redacted.")
197+
subparsers.required = True
198+
199+
info_types_parser = subparsers.add_parser(
200+
"info_types",
201+
help="Redact specific infoTypes from an image.",
202+
parents=[common_args_parser],
203+
)
204+
info_types_parser.add_argument(
141205
"--info_types",
142206
nargs="+",
143207
help="Strings representing info types to look for. A full list of "
@@ -146,7 +210,7 @@ def redact_image(
146210
"If unspecified, the three above examples will be used.",
147211
default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
148212
)
149-
parser.add_argument(
213+
info_types_parser.add_argument(
150214
"--min_likelihood",
151215
choices=[
152216
"LIKELIHOOD_UNSPECIFIED",
@@ -159,19 +223,33 @@ def redact_image(
159223
help="A string representing the minimum likelihood threshold that "
160224
"constitutes a match.",
161225
)
162-
parser.add_argument(
226+
info_types_parser.add_argument(
163227
"--mime_type",
164228
help="The MIME type of the file. If not specified, the type is "
165229
"inferred via the Python standard library's mimetypes module.",
166230
)
167231

232+
all_text_parser = subparsers.add_parser(
233+
"all_text",
234+
help="Redact all text from an image. The MIME type of the file is "
235+
"inferred via the Python standard library's mimetypes module.",
236+
parents=[common_args_parser],
237+
)
238+
168239
args = parser.parse_args()
169240

170-
redact_image(
171-
args.project,
172-
args.filename,
173-
args.output_filename,
174-
args.info_types,
175-
min_likelihood=args.min_likelihood,
176-
mime_type=args.mime_type,
177-
)
241+
if args.content == "info_types":
242+
redact_image(
243+
args.project,
244+
args.filename,
245+
args.output_filename,
246+
args.info_types,
247+
min_likelihood=args.min_likelihood,
248+
mime_type=args.mime_type,
249+
)
250+
elif args.content == "all_text":
251+
redact_image_all_text(
252+
args.project,
253+
args.filename,
254+
args.output_filename,
255+
)

dlp/redact_test.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,17 @@ def test_redact_image_file(tempdir, capsys):
4444

4545
out, _ = capsys.readouterr()
4646
assert output_filepath in out
47+
48+
49+
def test_redact_image_all_text(tempdir, capsys):
    """Redact all text in the sample PNG and check the printed summary.

    The redacted image is written into the temporary directory, and the
    captured standard output must mention the path it was written to.
    """
    source_image = os.path.join(RESOURCE_DIRECTORY, "test.png")
    redacted_image = os.path.join(tempdir, "redacted.png")

    redact.redact_image_all_text(
        project=GCLOUD_PROJECT,
        filename=source_image,
        output_filename=redacted_image,
    )

    # Only standard output is of interest; standard error is ignored.
    stdout = capsys.readouterr()[0]
    assert redacted_image in stdout

0 commit comments

Comments
 (0)