Skip to content

Commit 48e3886

Browse files
[formrecognizer] renames to support non-text elements (#12251)
* more renames
* corrections
* recordings
* add FieldData to init
* plural field_elements
* whoops
* one more fix
* fix
* FormContent -> FormElement
* review feedback
* update docstring
* wording
1 parent 2f85327 commit 48e3886

21 files changed

+155
-150
lines changed

sdk/formrecognizer/azure-ai-formrecognizer/CHANGELOG.md

+5
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,11 @@
88
- `begin_recognize_receipts` and `begin_recognize_receipts_from_url` now return `RecognizedForm`.
99
- `requested_on` renamed to `training_started_on` and `completed_on` renamed to `training_completed_on` on `CustomFormModel`
1010
and `CustomFormModelInfo`
11+
- `FieldText` has been renamed to `FieldData`
12+
- `FormContent` has been renamed to `FormElement`
13+
- Parameter `include_text_content` has been renamed to `include_field_elements` for
14+
`begin_recognize_receipts`, `begin_recognize_receipts_from_url`, `begin_recognize_custom_forms`, and `begin_recognize_custom_forms_from_url`
15+
- `text_content` has been renamed to `field_elements` on `FieldData` and `FormTableCell`
1116

1217
**Fixes and improvements**
1318

sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/__init__.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from ._form_training_client import FormTrainingClient
1010

1111
from ._models import (
12-
FormContent,
12+
FormElement,
1313
LengthUnit,
1414
TrainingStatus,
1515
CustomFormModelStatus,
@@ -24,7 +24,7 @@
2424
FormPageRange,
2525
RecognizedForm,
2626
FormField,
27-
FieldText,
27+
FieldData,
2828
FormPage,
2929
FormLine,
3030
FormWord,
@@ -41,7 +41,7 @@
4141
'TrainingStatus',
4242
'CustomFormModelStatus',
4343
'FormContentType',
44-
'FormContent',
44+
'FormElement',
4545
'FormTable',
4646
'FormTableCell',
4747
'TrainingDocumentInfo',
@@ -52,7 +52,7 @@
5252
'FormPageRange',
5353
'RecognizedForm',
5454
'FormField',
55-
'FieldText',
55+
'FieldData',
5656
'FormPage',
5757
'FormLine',
5858
'FormWord',

sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_form_recognizer_client.py

+16-16
Original file line number | Diff line number | Diff line change
@@ -93,8 +93,8 @@ def begin_recognize_receipts(self, receipt, **kwargs):
9393
:param receipt: JPEG, PNG, PDF and TIFF type file stream or bytes.
9494
Currently only supports US sales receipts.
9595
:type receipt: bytes or IO[bytes]
96-
:keyword bool include_text_content:
97-
Whether or not to include text elements such as lines and words in addition to form fields.
96+
:keyword bool include_field_elements:
97+
Whether or not to include field elements such as lines and words in addition to form fields.
9898
:keyword content_type: Media type of the body sent to the API. Content-type is
9999
auto-detected, but can be overridden by passing this keyword argument. For options,
100100
see :class:`~azure.ai.formrecognizer.FormContentType`.
@@ -123,15 +123,15 @@ def begin_recognize_receipts(self, receipt, **kwargs):
123123
if content_type == "application/json":
124124
raise TypeError("Call begin_recognize_receipts_from_url() to analyze a receipt from a url.")
125125

126-
include_text_content = kwargs.pop("include_text_content", False)
126+
include_field_elements = kwargs.pop("include_field_elements", False)
127127

128128
if content_type is None:
129129
content_type = get_content_type(receipt)
130130

131131
return self._client.begin_analyze_receipt_async(
132132
file_stream=receipt,
133133
content_type=content_type,
134-
include_text_details=include_text_content,
134+
include_text_details=include_field_elements,
135135
cls=kwargs.pop("cls", self._receipt_callback),
136136
polling=LROBasePolling(timeout=polling_interval, **kwargs),
137137
error_map=error_map,
@@ -151,8 +151,8 @@ def begin_recognize_receipts_from_url(self, receipt_url, **kwargs):
151151
:param str receipt_url: The url of the receipt to analyze. The input must be a valid, encoded url
152152
of one of the supported formats: JPEG, PNG, PDF and TIFF. Currently only supports
153153
US sales receipts.
154-
:keyword bool include_text_content:
155-
Whether or not to include text elements such as lines and words in addition to form fields.
154+
:keyword bool include_field_elements:
155+
Whether or not to include field elements such as lines and words in addition to form fields.
156156
:keyword int polling_interval: Waiting time between two polls for LRO operations
157157
if no Retry-After header is present. Defaults to 5 seconds.
158158
:keyword str continuation_token: A continuation token to restart a poller from a saved state.
@@ -173,11 +173,11 @@ def begin_recognize_receipts_from_url(self, receipt_url, **kwargs):
173173

174174
polling_interval = kwargs.pop("polling_interval", self._client._config.polling_interval)
175175
continuation_token = kwargs.pop("continuation_token", None)
176-
include_text_content = kwargs.pop("include_text_content", False)
176+
include_field_elements = kwargs.pop("include_field_elements", False)
177177

178178
return self._client.begin_analyze_receipt_async(
179179
file_stream={"source": receipt_url},
180-
include_text_details=include_text_content,
180+
include_text_details=include_field_elements,
181181
cls=kwargs.pop("cls", self._receipt_callback),
182182
polling=LROBasePolling(timeout=polling_interval, **kwargs),
183183
error_map=error_map,
@@ -279,8 +279,8 @@ def begin_recognize_custom_forms(self, model_id, form, **kwargs):
279279
:param str model_id: Custom model identifier.
280280
:param form: JPEG, PNG, PDF and TIFF type file stream or bytes.
281281
:type form: bytes or IO[bytes]
282-
:keyword bool include_text_content:
283-
Whether or not to include text elements such as lines and words in addition to form fields.
282+
:keyword bool include_field_elements:
283+
Whether or not to include field elements such as lines and words in addition to form fields.
284284
:keyword content_type: Media type of the body sent to the API. Content-type is
285285
auto-detected, but can be overridden by passing this keyword argument. For options,
286286
see :class:`~azure.ai.formrecognizer.FormContentType`.
@@ -313,7 +313,7 @@ def begin_recognize_custom_forms(self, model_id, form, **kwargs):
313313
if content_type == "application/json":
314314
raise TypeError("Call begin_recognize_custom_forms_from_url() to analyze a document from a url.")
315315

316-
include_text_content = kwargs.pop("include_text_content", False)
316+
include_field_elements = kwargs.pop("include_field_elements", False)
317317
if content_type is None:
318318
content_type = get_content_type(form)
319319

@@ -325,7 +325,7 @@ def analyze_callback(raw_response, _, headers): # pylint: disable=unused-argume
325325
return self._client.begin_analyze_with_custom_model(
326326
file_stream=form,
327327
model_id=model_id,
328-
include_text_details=include_text_content,
328+
include_text_details=include_field_elements,
329329
content_type=content_type,
330330
cls=deserialization_callback,
331331
polling=LROBasePolling(timeout=polling_interval, lro_algorithms=[AnalyzePolling()], **kwargs),
@@ -344,8 +344,8 @@ def begin_recognize_custom_forms_from_url(self, model_id, form_url, **kwargs):
344344
:param str model_id: Custom model identifier.
345345
:param str form_url: The url of the form to analyze. The input must be a valid, encoded url
346346
of one of the supported formats: JPEG, PNG, PDF and TIFF.
347-
:keyword bool include_text_content:
348-
Whether or not to include text elements such as lines and words in addition to form fields.
347+
:keyword bool include_field_elements:
348+
Whether or not to include field elements such as lines and words in addition to form fields.
349349
:keyword int polling_interval: Waiting time between two polls for LRO operations
350350
if no Retry-After header is present. Defaults to 5 seconds.
351351
:keyword str continuation_token: A continuation token to restart a poller from a saved state.
@@ -361,7 +361,7 @@ def begin_recognize_custom_forms_from_url(self, model_id, form_url, **kwargs):
361361
cls = kwargs.pop("cls", None)
362362
polling_interval = kwargs.pop("polling_interval", self._client._config.polling_interval)
363363
continuation_token = kwargs.pop("continuation_token", None)
364-
include_text_content = kwargs.pop("include_text_content", False)
364+
include_field_elements = kwargs.pop("include_field_elements", False)
365365

366366
def analyze_callback(raw_response, _, headers): # pylint: disable=unused-argument
367367
analyze_result = self._client._deserialize(AnalyzeOperationResult, raw_response)
@@ -371,7 +371,7 @@ def analyze_callback(raw_response, _, headers): # pylint: disable=unused-argume
371371
return self._client.begin_analyze_with_custom_model(
372372
file_stream={"source": form_url},
373373
model_id=model_id,
374-
include_text_details=include_text_content,
374+
include_text_details=include_field_elements,
375375
cls=deserialization_callback,
376376
polling=LROBasePolling(timeout=polling_interval, lro_algorithms=[AnalyzePolling()], **kwargs),
377377
error_map=error_map,

sdk/formrecognizer/azure-ai-formrecognizer/azure/ai/formrecognizer/_models.py

+34-34
Original file line number | Diff line number | Diff line change
@@ -138,8 +138,8 @@ def __new__(cls, first_page_number, last_page_number):
138138
return super(FormPageRange, cls).__new__(cls, first_page_number, last_page_number)
139139

140140

141-
class FormContent(object):
142-
"""Base type which includes properties for text.
141+
class FormElement(object):
142+
"""Base type which includes properties for a form element.
143143
144144
:ivar str text: The text content of the line.
145145
:ivar list[~azure.ai.formrecognizer.Point] bounding_box:
@@ -188,10 +188,10 @@ def __repr__(self):
188188
class FormField(object):
189189
"""Represents a field recognized in an input form.
190190
191-
:ivar ~azure.ai.formrecognizer.FieldText label_data:
192-
Contains the text, bounding box, and text content of the field label.
193-
:ivar ~azure.ai.formrecognizer.FieldText value_data:
194-
Contains the text, bounding box, and text content of the field value.
191+
:ivar ~azure.ai.formrecognizer.FieldData label_data:
192+
Contains the text, bounding box, and field elements for the field label.
193+
:ivar ~azure.ai.formrecognizer.FieldData value_data:
194+
Contains the text, bounding box, and field elements for the field value.
195195
:ivar str name: The unique name of the field or label.
196196
:ivar value:
197197
The value for the recognized field. Possible types include: 'string',
@@ -212,8 +212,8 @@ def __init__(self, **kwargs):
212212
@classmethod
213213
def _from_generated(cls, field, value, read_result):
214214
return cls(
215-
label_data=FieldText._from_generated(field, read_result),
216-
value_data=FieldText._from_generated(value, read_result),
215+
label_data=FieldData._from_generated(field, read_result),
216+
value_data=FieldData._from_generated(value, read_result),
217217
value=get_field_value(field, value, read_result),
218218
name=field,
219219
confidence=adjust_confidence(value.confidence) if value else None,
@@ -222,8 +222,8 @@ def _from_generated(cls, field, value, read_result):
222222
@classmethod
223223
def _from_generated_unlabeled(cls, field, idx, page, read_result):
224224
return cls(
225-
label_data=FieldText._from_generated_unlabeled(field.key, page, read_result),
226-
value_data=FieldText._from_generated_unlabeled(field.value, page, read_result),
225+
label_data=FieldData._from_generated_unlabeled(field.key, page, read_result),
226+
value_data=FieldData._from_generated_unlabeled(field.value, page, read_result),
227227
value=field.value.text,
228228
name="field-" + str(idx),
229229
confidence=adjust_confidence(field.confidence),
@@ -235,7 +235,7 @@ def __repr__(self):
235235
)[:1024]
236236

237237

238-
class FieldText(FormContent):
238+
class FieldData(FormElement):
239239
"""Represents the text that is part of a form field. This includes
240240
the location of the text in the form and a collection of the
241241
elements that make up the text.
@@ -248,16 +248,16 @@ class FieldText(FormContent):
248248
that outlines the text. The points are listed in clockwise
249249
order: top-left, top-right, bottom-right, bottom-left.
250250
Units are in pixels for images and inches for PDF.
251-
:ivar text_content:
252-
When `include_text_content` is set to true, a list of text
251+
:ivar field_elements:
252+
When `include_field_elements` is set to true, a list of
253253
elements constituting this field or value is returned. The list
254-
constitutes of text elements such as lines and words.
255-
:vartype text_content: list[~azure.ai.formrecognizer.FormWord, ~azure.ai.formrecognizer.FormLine]
254+
consists of elements such as lines and words.
255+
:vartype field_elements: list[~azure.ai.formrecognizer.FormWord, ~azure.ai.formrecognizer.FormLine]
256256
"""
257257

258258
def __init__(self, **kwargs):
259-
super(FieldText, self).__init__(**kwargs)
260-
self.text_content = kwargs.get("text_content", None)
259+
super(FieldData, self).__init__(**kwargs)
260+
self.field_elements = kwargs.get("field_elements", None)
261261

262262
@classmethod
263263
def _from_generated(cls, field, read_result):
@@ -272,7 +272,7 @@ def _from_generated(cls, field, read_result):
272272
Point(x=field.bounding_box[4], y=field.bounding_box[5]),
273273
Point(x=field.bounding_box[6], y=field.bounding_box[7])
274274
] if field.bounding_box else None,
275-
text_content=get_elements(field, read_result) if field.elements else None
275+
field_elements=get_elements(field, read_result) if field.elements else None
276276
)
277277

278278
@classmethod
@@ -286,12 +286,12 @@ def _from_generated_unlabeled(cls, field, page, read_result):
286286
Point(x=field.bounding_box[4], y=field.bounding_box[5]),
287287
Point(x=field.bounding_box[6], y=field.bounding_box[7])
288288
] if field.bounding_box else None,
289-
text_content=get_elements(field, read_result) if field.elements else None
289+
field_elements=get_elements(field, read_result) if field.elements else None
290290
)
291291

292292
def __repr__(self):
293-
return "FieldText(page_number={}, text={}, bounding_box={}, text_content={})".format(
294-
self.page_number, self.text, self.bounding_box, repr(self.text_content)
293+
return "FieldData(page_number={}, text={}, bounding_box={}, field_elements={})".format(
294+
self.page_number, self.text, self.bounding_box, repr(self.field_elements)
295295
)[:1024]
296296

297297

@@ -315,7 +315,7 @@ class FormPage(object):
315315
:ivar list[~azure.ai.formrecognizer.FormTable] tables:
316316
A list of extracted tables contained in a page.
317317
:ivar list[~azure.ai.formrecognizer.FormLine] lines:
318-
When `include_text_content` is set to true, a list of recognized text lines is returned.
318+
When `include_field_elements` is set to true, a list of recognized text lines is returned.
319319
For calls to recognize content, this list is always populated. The maximum number of lines
320320
returned is 300 per page. The lines are sorted top to bottom, left to right, although in
321321
certain cases proximity is treated with higher priority. As the sorting order depends on
@@ -349,7 +349,7 @@ def __repr__(self):
349349
)[:1024]
350350

351351

352-
class FormLine(FormContent):
352+
class FormLine(FormElement):
353353
"""An object representing an extracted line of text.
354354
355355
:ivar str text: The text content of the line.
@@ -388,7 +388,7 @@ def __repr__(self):
388388
)[:1024]
389389

390390

391-
class FormWord(FormContent):
391+
class FormWord(FormElement):
392392
"""Represents a word recognized from the input document.
393393
394394
:ivar str text: The text content of the word.
@@ -452,7 +452,7 @@ def __repr__(self):
452452
)[:1024]
453453

454454

455-
class FormTableCell(FormContent):
455+
class FormTableCell(FormElement):
456456
"""Represents a cell contained in a table recognized from the input document.
457457
458458
:ivar str text: Text content of the cell.
@@ -471,12 +471,12 @@ class FormTableCell(FormContent):
471471
:ivar bool is_footer: Whether the current cell is a footer cell.
472472
:ivar int page_number:
473473
The 1-based number of the page in which this content is present.
474-
:ivar text_content:
475-
When `include_text_content` is set to true, a list of text
474+
:ivar field_elements:
475+
When `include_field_elements` is set to true, a list of
476476
elements constituting this cell is returned. The list
477-
constitutes of text elements such as lines and words.
478-
For calls to recognize content, this list is always populated.
479-
:vartype text_content: list[~azure.ai.formrecognizer.FormWord, ~azure.ai.formrecognizer.FormLine]
477+
consists of elements such as lines and words.
478+
For calls to begin_recognize_content(), this list is always populated.
479+
:vartype field_elements: list[~azure.ai.formrecognizer.FormWord, ~azure.ai.formrecognizer.FormLine]
480480
"""
481481

482482
def __init__(self, **kwargs):
@@ -488,7 +488,7 @@ def __init__(self, **kwargs):
488488
self.confidence = kwargs.get("confidence", None)
489489
self.is_header = kwargs.get("is_header", False)
490490
self.is_footer = kwargs.get("is_footer", False)
491-
self.text_content = kwargs.get("text_content", None)
491+
self.field_elements = kwargs.get("field_elements", None)
492492

493493
@classmethod
494494
def _from_generated(cls, cell, page, read_result):
@@ -508,14 +508,14 @@ def _from_generated(cls, cell, page, read_result):
508508
is_header=cell.is_header or False,
509509
is_footer=cell.is_footer or False,
510510
page_number=page,
511-
text_content=get_elements(cell, read_result) if cell.elements else None
511+
field_elements=get_elements(cell, read_result) if cell.elements else None
512512
)
513513

514514
def __repr__(self):
515515
return "FormTableCell(text={}, row_index={}, column_index={}, row_span={}, column_span={}, " \
516-
"bounding_box={}, confidence={}, is_header={}, is_footer={}, page_number={}, text_content={})".format(
516+
"bounding_box={}, confidence={}, is_header={}, is_footer={}, page_number={}, field_elements={})".format(
517517
self.text, self.row_index, self.column_index, self.row_span, self.column_span, self.bounding_box,
518-
self.confidence, self.is_header, self.is_footer, self.page_number, repr(self.text_content)
518+
self.confidence, self.is_header, self.is_footer, self.page_number, repr(self.field_elements)
519519
)[:1024]
520520

521521

0 commit comments

Comments
 (0)