Skip to content

Commit f17c23a

Browse files
committed
Add DLP sample for redacting all image text
The sample shows how to remove all text found in an image with DLP. The sample is integrated into the existing redact.py CLI application.
1 parent ee5be6d commit f17c23a

File tree

3 files changed

+118
-42
lines changed

3 files changed

+118
-42
lines changed

dlp/README.rst

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -136,37 +136,21 @@ To run this sample:
136136
137137
$ python redact.py
138138
139-
usage: redact.py [-h] [--project PROJECT]
140-
[--info_types INFO_TYPES [INFO_TYPES ...]]
141-
[--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}]
142-
[--mime_type MIME_TYPE]
143-
filename output_filename
139+
usage: redact.py [-h] {info_types,all_text} ...
144140
145141
Sample app that uses the Data Loss Prevention API to redact the contents of an
146142
image file.
147143
148144
positional arguments:
149-
filename The path to the file to inspect.
150-
output_filename The path to which the redacted image will be written.
145+
{info_types,all_text}
146+
Select which content should be redacted.
147+
info_types Redact specific infoTypes from an image.
148+
all_text Redact all text from an image. The MIME type of the
149+
file is inferred via the Python standard library's
150+
mimetypes module.
151151
152152
optional arguments:
153153
-h, --help show this help message and exit
154-
--project PROJECT The Google Cloud project id to use as a parent
155-
resource.
156-
--info_types INFO_TYPES [INFO_TYPES ...]
157-
Strings representing info types to look for. A full
158-
list of info categories and types is available from
159-
the API. Examples include "FIRST_NAME", "LAST_NAME",
160-
"EMAIL_ADDRESS". If unspecified, the three above
161-
examples will be used.
162-
--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}
163-
A string representing the minimum likelihood threshold
164-
that constitutes a match.
165-
--mime_type MIME_TYPE
166-
The MIME type of the file. If not specified, the type
167-
is inferred via the Python standard library's
168-
mimetypes module.
169-
170154
171155
172156
Metadata

dlp/redact.py

Lines changed: 97 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -121,23 +121,87 @@ def redact_image(
121121

122122
# [END dlp_redact_image]
123123

124+
# [START dlp_redact_image_all_text]
124125

125-
if __name__ == "__main__":
126-
default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
127126

128-
parser = argparse.ArgumentParser(description=__doc__)
127+
def redact_image_all_text(
128+
project,
129+
filename,
130+
output_filename,
131+
):
132+
"""Uses the Data Loss Prevention API to redact all text in an image.
129133
130-
parser.add_argument("filename", help="The path to the file to inspect.")
131-
parser.add_argument(
132-
"output_filename",
133-
help="The path to which the redacted image will be written.",
134+
Args:
135+
project: The Google Cloud project id to use as a parent resource.
136+
filename: The path to the file to inspect.
137+
output_filename: The path to which the redacted image will be written.
138+
139+
Returns:
140+
None; the redacted image is written to output_filename and the byte count is printed to the terminal.
141+
"""
142+
# Import the client library
143+
import google.cloud.dlp
144+
145+
# Instantiate a client.
146+
dlp = google.cloud.dlp_v2.DlpServiceClient()
147+
148+
# Construct the image_redaction_configs, indicating to DLP that all text in
149+
# the input image should be redacted.
150+
image_redaction_configs = [{
151+
"redact_all_text": True,
152+
}]
153+
154+
# Construct the byte_item, containing the file's byte data.
155+
with open(filename, mode="rb") as f:
156+
byte_item = {"type": "IMAGE", "data": f.read()}
157+
158+
# Convert the project id into a full resource id.
159+
parent = dlp.project_path(project)
160+
161+
# Call the API.
162+
response = dlp.redact_image(
163+
parent,
164+
image_redaction_configs=image_redaction_configs,
165+
byte_item=byte_item,
134166
)
135-
parser.add_argument(
167+
168+
# Write out the results.
169+
with open(output_filename, mode="wb") as f:
170+
f.write(response.redacted_image)
171+
172+
print("Wrote {byte_count} to {filename}".format(
173+
byte_count=len(response.redacted_image), filename=output_filename))
174+
175+
176+
# [END dlp_redact_image_all_text]
177+
178+
if __name__ == "__main__":
179+
default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
180+
181+
common_args_parser = argparse.ArgumentParser(add_help=False)
182+
common_args_parser.add_argument(
136183
"--project",
137184
help="The Google Cloud project id to use as a parent resource.",
138185
default=default_project,
139186
)
140-
parser.add_argument(
187+
common_args_parser.add_argument(
188+
"filename", help="The path to the file to inspect.")
189+
common_args_parser.add_argument(
190+
"output_filename",
191+
help="The path to which the redacted image will be written.",
192+
)
193+
194+
parser = argparse.ArgumentParser(description=__doc__)
195+
subparsers = parser.add_subparsers(
196+
dest="content", help="Select which content should be redacted.")
197+
subparsers.required = True
198+
199+
info_types_parser = subparsers.add_parser(
200+
"info_types",
201+
help="Redact specific infoTypes from an image.",
202+
parents=[common_args_parser],
203+
)
204+
info_types_parser.add_argument(
141205
"--info_types",
142206
nargs="+",
143207
help="Strings representing info types to look for. A full list of "
@@ -146,7 +210,7 @@ def redact_image(
146210
"If unspecified, the three above examples will be used.",
147211
default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
148212
)
149-
parser.add_argument(
213+
info_types_parser.add_argument(
150214
"--min_likelihood",
151215
choices=[
152216
"LIKELIHOOD_UNSPECIFIED",
@@ -159,19 +223,33 @@ def redact_image(
159223
help="A string representing the minimum likelihood threshold that "
160224
"constitutes a match.",
161225
)
162-
parser.add_argument(
226+
info_types_parser.add_argument(
163227
"--mime_type",
164228
help="The MIME type of the file. If not specified, the type is "
165229
"inferred via the Python standard library's mimetypes module.",
166230
)
167231

232+
all_text_parser = subparsers.add_parser(
233+
"all_text",
234+
help="Redact all text from an image. The MIME type of the file is "
235+
"inferred via the Python standard library's mimetypes module.",
236+
parents=[common_args_parser],
237+
)
238+
168239
args = parser.parse_args()
169240

170-
redact_image(
171-
args.project,
172-
args.filename,
173-
args.output_filename,
174-
args.info_types,
175-
min_likelihood=args.min_likelihood,
176-
mime_type=args.mime_type,
177-
)
241+
if args.content == "info_types":
242+
redact_image(
243+
args.project,
244+
args.filename,
245+
args.output_filename,
246+
args.info_types,
247+
min_likelihood=args.min_likelihood,
248+
mime_type=args.mime_type,
249+
)
250+
elif args.content == "all_text":
251+
redact_image_all_text(
252+
args.project,
253+
args.filename,
254+
args.output_filename,
255+
)

dlp/redact_test.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,17 @@ def test_redact_image_file(tempdir, capsys):
4444

4545
out, _ = capsys.readouterr()
4646
assert output_filepath in out
47+
48+
49+
def test_redact_image_all_text(tempdir, capsys):
50+
test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png")
51+
output_filepath = os.path.join(tempdir, "redacted.png")
52+
53+
redact.redact_image_all_text(
54+
GCLOUD_PROJECT,
55+
test_filepath,
56+
output_filepath,
57+
)
58+
59+
out, _ = capsys.readouterr()
60+
assert output_filepath in out

0 commit comments

Comments
 (0)