Skip to content

Commit 4ee7f35

Browse files
authored
[text analytics] add normalized_text (#17074)
1 parent e16a929 commit 4ee7f35

10 files changed

+280
-4
lines changed

sdk/textanalytics/azure-ai-textanalytics/CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88
- Renamed properties `aspect` and `opinions` to `target` and `assessments` respectively in class `MinedOpinion`.
99
- Renamed classes `AspectSentiment` and `OpinionSentiment` to `TargetSentiment` and `AssessmentSentiment` respectively.
1010

11+
**New Features**
12+
13+
- Added property `normalized_text` to `HealthcareEntity`. This property is a normalized version of the `text` property that already
14+
exists on the `HealthcareEntity`.
15+
1116
## 5.1.0b5 (2021-02-10)
1217

1318
**Breaking Changes**

sdk/textanalytics/azure-ai-textanalytics/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,7 @@ print("Results of Healthcare Entities Analysis:")
467467
for idx, doc in enumerate(docs):
468468
for entity in doc.entities:
469469
print("Entity: {}".format(entity.text))
470+
print("...Normalized Text: {}".format(entity.normalized_text))
470471
print("...Category: {}".format(entity.category))
471472
print("...Subcategory: {}".format(entity.subcategory))
472473
print("...Offset: {}".format(entity.offset))

sdk/textanalytics/azure-ai-textanalytics/azure/ai/textanalytics/_models.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,9 @@ def __repr__(self):
492492
class HealthcareEntity(DictMixin):
493493
"""HealthcareEntity contains information about a Healthcare entity found in text.
494494
495-
:ivar str text: Entity text as appears in the request.
495+
:ivar str text: Entity text as appears in the document.
496+
:ivar str normalized_text: Optional. Normalized version of the raw `text` we extract
497+
from the document. Not all `text`s have a normalized version.
496498
:ivar str category: Entity category, see the following link for health's named
497499
entity types: https://aka.ms/text-analytics-health-entities
498500
:ivar str subcategory: Entity subcategory.
@@ -510,6 +512,7 @@ class HealthcareEntity(DictMixin):
510512

511513
def __init__(self, **kwargs):
512514
self.text = kwargs.get("text", None)
515+
self.normalized_text = kwargs.get("normalized_text", None)
513516
self.category = kwargs.get("category", None)
514517
self.subcategory = kwargs.get("subcategory", None)
515518
self.length = kwargs.get("length", None)
@@ -521,6 +524,7 @@ def __init__(self, **kwargs):
521524
def _from_generated(cls, healthcare_entity):
522525
return cls(
523526
text=healthcare_entity.text,
527+
normalized_text=healthcare_entity.name,
524528
category=healthcare_entity.category,
525529
subcategory=healthcare_entity.subcategory,
526530
length=healthcare_entity.length,
@@ -535,9 +539,10 @@ def __hash__(self):
535539
return hash(repr(self))
536540

537541
def __repr__(self):
538-
return "HealthcareEntity(text={}, category={}, subcategory={}, length={}, offset={}, confidence_score={}, "\
539-
"data_sources={})".format(
542+
return "HealthcareEntity(text={}, normalized_text={}, category={}, subcategory={}, length={}, offset={}, "\
543+
"confidence_score={}, data_sources={})".format(
540544
self.text,
545+
self.normalized_text,
541546
self.category,
542547
self.subcategory,
543548
self.length,

sdk/textanalytics/azure-ai-textanalytics/samples/async_samples/sample_analyze_healthcare_entities_async.py

+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ async def analyze_healthcare_entities_async(self):
7878
for idx, doc in enumerate(docs):
7979
for entity in doc.entities:
8080
print("Entity: {}".format(entity.text))
81+
print("...Normalized Text: {}".format(entity.normalized_text))
8182
print("...Category: {}".format(entity.category))
8283
print("...Subcategory: {}".format(entity.subcategory))
8384
print("...Offset: {}".format(entity.offset))

sdk/textanalytics/azure-ai-textanalytics/samples/sample_analyze_healthcare_entities.py

+1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ def analyze_healthcare_entities(self):
7474
for idx, doc in enumerate(docs):
7575
for entity in doc.entities:
7676
print("Entity: {}".format(entity.text))
77+
print("...Normalized Text: {}".format(entity.normalized_text))
7778
print("...Category: {}".format(entity.category))
7879
print("...Subcategory: {}".format(entity.subcategory))
7980
print("...Offset: {}".format(entity.offset))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
interactions:
2+
- request:
3+
body: '{"documents": [{"id": "0", "text": "patients must have histologically confirmed
4+
NHL", "language": "en"}]}'
5+
headers:
6+
Accept:
7+
- application/json, text/json
8+
Accept-Encoding:
9+
- gzip, deflate
10+
Connection:
11+
- keep-alive
12+
Content-Length:
13+
- '105'
14+
Content-Type:
15+
- application/json
16+
User-Agent:
17+
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
18+
method: POST
19+
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
20+
response:
21+
body:
22+
string: ''
23+
headers:
24+
apim-request-id:
25+
- 1c4b7bf9-4eaf-41c1-8c28-585fd380d751
26+
date:
27+
- Wed, 03 Mar 2021 21:46:23 GMT
28+
operation-location:
29+
- https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
30+
strict-transport-security:
31+
- max-age=31536000; includeSubDomains; preload
32+
transfer-encoding:
33+
- chunked
34+
x-content-type-options:
35+
- nosniff
36+
x-envoy-upstream-service-time:
37+
- '338'
38+
status:
39+
code: 202
40+
message: Accepted
41+
- request:
42+
body: null
43+
headers:
44+
Accept:
45+
- '*/*'
46+
Accept-Encoding:
47+
- gzip, deflate
48+
Connection:
49+
- keep-alive
50+
User-Agent:
51+
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
52+
method: GET
53+
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
54+
response:
55+
body:
56+
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:24Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"notStarted","errors":[]}'
57+
headers:
58+
apim-request-id:
59+
- 57339114-5845-4f08-ab4d-0aa36c843d25
60+
content-type:
61+
- application/json; charset=utf-8
62+
date:
63+
- Wed, 03 Mar 2021 21:46:28 GMT
64+
strict-transport-security:
65+
- max-age=31536000; includeSubDomains; preload
66+
transfer-encoding:
67+
- chunked
68+
x-content-type-options:
69+
- nosniff
70+
x-envoy-upstream-service-time:
71+
- '146'
72+
status:
73+
code: 200
74+
message: OK
75+
- request:
76+
body: null
77+
headers:
78+
Accept:
79+
- '*/*'
80+
Accept-Encoding:
81+
- gzip, deflate
82+
Connection:
83+
- keep-alive
84+
User-Agent:
85+
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
86+
method: GET
87+
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
88+
response:
89+
body:
90+
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:32Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"running","errors":[]}'
91+
headers:
92+
apim-request-id:
93+
- 417f0558-5abd-49fd-8cd7-32f2d03549bd
94+
content-type:
95+
- application/json; charset=utf-8
96+
date:
97+
- Wed, 03 Mar 2021 21:46:33 GMT
98+
strict-transport-security:
99+
- max-age=31536000; includeSubDomains; preload
100+
transfer-encoding:
101+
- chunked
102+
x-content-type-options:
103+
- nosniff
104+
x-envoy-upstream-service-time:
105+
- '122'
106+
status:
107+
code: 200
108+
message: OK
109+
- request:
110+
body: null
111+
headers:
112+
Accept:
113+
- '*/*'
114+
Accept-Encoding:
115+
- gzip, deflate
116+
Connection:
117+
- keep-alive
118+
User-Agent:
119+
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
120+
method: GET
121+
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
122+
response:
123+
body:
124+
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:32Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"running","errors":[]}'
125+
headers:
126+
apim-request-id:
127+
- 54ddb168-5bcc-4610-86b4-1b02d2241bd5
128+
content-type:
129+
- application/json; charset=utf-8
130+
date:
131+
- Wed, 03 Mar 2021 21:46:39 GMT
132+
strict-transport-security:
133+
- max-age=31536000; includeSubDomains; preload
134+
transfer-encoding:
135+
- chunked
136+
x-content-type-options:
137+
- nosniff
138+
x-envoy-upstream-service-time:
139+
- '87'
140+
status:
141+
code: 200
142+
message: OK
143+
- request:
144+
body: null
145+
headers:
146+
Accept:
147+
- '*/*'
148+
Accept-Encoding:
149+
- gzip, deflate
150+
Connection:
151+
- keep-alive
152+
User-Agent:
153+
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
154+
method: GET
155+
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/7249ddad-d020-43dd-9a69-a2b0216cd9bd
156+
response:
157+
body:
158+
string: '{"jobId":"7249ddad-d020-43dd-9a69-a2b0216cd9bd","lastUpdateDateTime":"2021-03-03T21:46:43Z","createdDateTime":"2021-03-03T21:46:23Z","expirationDateTime":"2021-03-04T21:46:23Z","status":"succeeded","errors":[],"results":{"documents":[{"id":"0","entities":[{"offset":19,"length":14,"text":"histologically","category":"ExaminationName","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0344441"},{"dataSource":"CHV","id":"0000030964"},{"dataSource":"LNC","id":"MTHU010496"},{"dataSource":"MDR","id":"10062005"},{"dataSource":"MTH","id":"U002823"},{"dataSource":"MTHMST","id":"MT140012"},{"dataSource":"NCI","id":"C49131"},{"dataSource":"SNOMEDCT_US","id":"714797009"}]},{"offset":44,"length":3,"text":"NHL","category":"Diagnosis","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0024305"},{"dataSource":"BI","id":"BI00323"},{"dataSource":"CCPSS","id":"0001640"},{"dataSource":"CCS","id":"2.10.2"},{"dataSource":"CCSR_10","id":"NEO058"},{"dataSource":"CHV","id":"0000007621"},{"dataSource":"COSTAR","id":"U000045"},{"dataSource":"CSP","id":"4001-0094"},{"dataSource":"DXP","id":"U002830"},{"dataSource":"HPO","id":"HP:0012539"},{"dataSource":"ICD10","id":"C85.9"},{"dataSource":"ICD10AM","id":"M9672/3"},{"dataSource":"ICD10CM","id":"C85.9"},{"dataSource":"ICPC2ICD10ENG","id":"MTHU053464"},{"dataSource":"ICPC2P","id":"B74002"},{"dataSource":"MDR","id":"10029547"},{"dataSource":"MEDCIN","id":"35839"},{"dataSource":"MEDLINEPLUS","id":"117"},{"dataSource":"MSH","id":"D008228"},{"dataSource":"NCI","id":"C3211"},{"dataSource":"NCI_CELLOSAURUS","id":"C3211"},{"dataSource":"NCI_CPTAC","id":"C3211"},{"dataSource":"NCI_CTEP-SDC","id":"10029593"},{"dataSource":"NCI_CTRP","id":"C3211"},{"dataSource":"NCI_GDC","id":"C3211"},{"dataSource":"NCI_NCI-GLOSS","id":"CDR0000045148"},{"dataSource":"NCI_NICHD","id":"C3211"},{"dataSource":"OMIM","id":"MTHU014311"},{"dataSource":"PDQ","id":"CDR0000038957"},{"dataSource":"QMR","id":"R0121804"},{"dataSource":"RCD","id":"B627."},{"dataSo
urce":"SNM","id":"M-YYX54"},{"dataSource":"SNMI","id":"M-96723"},{"dataSource":"SNOMEDCT_US","id":"1929004"},{"dataSource":"WHO","id":"1544"}]}],"relations":[],"warnings":[]}],"errors":[],"modelVersion":"2021-01-11"}}'
159+
headers:
160+
apim-request-id:
161+
- 356495ad-d24a-4870-ae9a-3bc03cdc951b
162+
content-type:
163+
- application/json; charset=utf-8
164+
date:
165+
- Wed, 03 Mar 2021 21:46:45 GMT
166+
strict-transport-security:
167+
- max-age=31536000; includeSubDomains; preload
168+
transfer-encoding:
169+
- chunked
170+
x-content-type-options:
171+
- nosniff
172+
x-envoy-upstream-service-time:
173+
- '302'
174+
status:
175+
code: 200
176+
message: OK
177+
version: 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
interactions:
2+
- request:
3+
body: '{"documents": [{"id": "0", "text": "patients must have histologically confirmed
4+
NHL", "language": "en"}]}'
5+
headers:
6+
Accept:
7+
- application/json, text/json
8+
Content-Length:
9+
- '105'
10+
Content-Type:
11+
- application/json
12+
User-Agent:
13+
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
14+
method: POST
15+
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
16+
response:
17+
body:
18+
string: ''
19+
headers:
20+
apim-request-id: 5f62849b-975a-4da4-8d9f-359e2b7af6d4
21+
date: Wed, 03 Mar 2021 21:46:45 GMT
22+
operation-location: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
23+
strict-transport-security: max-age=31536000; includeSubDomains; preload
24+
transfer-encoding: chunked
25+
x-content-type-options: nosniff
26+
x-envoy-upstream-service-time: '75'
27+
status:
28+
code: 202
29+
message: Accepted
30+
url: https://westus2.api.cognitive.microsoft.com//text/analytics/v3.1-preview.4/entities/health/jobs?stringIndexType=UnicodeCodePoint
31+
- request:
32+
body: null
33+
headers:
34+
User-Agent:
35+
- azsdk-python-ai-textanalytics/5.1.0b6 Python/3.9.1 (macOS-10.13.6-x86_64-i386-64bit)
36+
method: GET
37+
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
38+
response:
39+
body:
40+
string: '{"jobId":"4be5a0c6-2663-46d8-ba56-ffeefe175b9b","lastUpdateDateTime":"2021-03-03T21:46:48Z","createdDateTime":"2021-03-03T21:46:45Z","expirationDateTime":"2021-03-04T21:46:45Z","status":"succeeded","errors":[],"results":{"documents":[{"id":"0","entities":[{"offset":19,"length":14,"text":"histologically","category":"ExaminationName","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0344441"},{"dataSource":"CHV","id":"0000030964"},{"dataSource":"LNC","id":"MTHU010496"},{"dataSource":"MDR","id":"10062005"},{"dataSource":"MTH","id":"U002823"},{"dataSource":"MTHMST","id":"MT140012"},{"dataSource":"NCI","id":"C49131"},{"dataSource":"SNOMEDCT_US","id":"714797009"}]},{"offset":44,"length":3,"text":"NHL","category":"Diagnosis","confidenceScore":1.0,"links":[{"dataSource":"UMLS","id":"C0024305"},{"dataSource":"BI","id":"BI00323"},{"dataSource":"CCPSS","id":"0001640"},{"dataSource":"CCS","id":"2.10.2"},{"dataSource":"CCSR_10","id":"NEO058"},{"dataSource":"CHV","id":"0000007621"},{"dataSource":"COSTAR","id":"U000045"},{"dataSource":"CSP","id":"4001-0094"},{"dataSource":"DXP","id":"U002830"},{"dataSource":"HPO","id":"HP:0012539"},{"dataSource":"ICD10","id":"C85.9"},{"dataSource":"ICD10AM","id":"M9672/3"},{"dataSource":"ICD10CM","id":"C85.9"},{"dataSource":"ICPC2ICD10ENG","id":"MTHU053464"},{"dataSource":"ICPC2P","id":"B74002"},{"dataSource":"MDR","id":"10029547"},{"dataSource":"MEDCIN","id":"35839"},{"dataSource":"MEDLINEPLUS","id":"117"},{"dataSource":"MSH","id":"D008228"},{"dataSource":"NCI","id":"C3211"},{"dataSource":"NCI_CELLOSAURUS","id":"C3211"},{"dataSource":"NCI_CPTAC","id":"C3211"},{"dataSource":"NCI_CTEP-SDC","id":"10029593"},{"dataSource":"NCI_CTRP","id":"C3211"},{"dataSource":"NCI_GDC","id":"C3211"},{"dataSource":"NCI_NCI-GLOSS","id":"CDR0000045148"},{"dataSource":"NCI_NICHD","id":"C3211"},{"dataSource":"OMIM","id":"MTHU014311"},{"dataSource":"PDQ","id":"CDR0000038957"},{"dataSource":"QMR","id":"R0121804"},{"dataSource":"RCD","id":"B627."},{"dataSo
urce":"SNM","id":"M-YYX54"},{"dataSource":"SNMI","id":"M-96723"},{"dataSource":"SNOMEDCT_US","id":"1929004"},{"dataSource":"WHO","id":"1544"}]}],"relations":[],"warnings":[]}],"errors":[],"modelVersion":"2021-01-11"}}'
41+
headers:
42+
apim-request-id: f9b79e8f-3fa1-4623-99b1-bf925c6b3b60
43+
content-type: application/json; charset=utf-8
44+
date: Wed, 03 Mar 2021 21:46:50 GMT
45+
strict-transport-security: max-age=31536000; includeSubDomains; preload
46+
transfer-encoding: chunked
47+
x-content-type-options: nosniff
48+
x-envoy-upstream-service-time: '30'
49+
status:
50+
code: 200
51+
message: OK
52+
url: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.4/entities/health/jobs/4be5a0c6-2663-46d8-ba56-ffeefe175b9b
53+
version: 1

sdk/textanalytics/azure-ai-textanalytics/tests/test_analyze_healthcare.py

+14
Original file line numberDiff line numberDiff line change
@@ -373,3 +373,17 @@ def test_relations(self, client):
373373
else:
374374
assert role.name == HealthcareEntityRelationRoleType.ABBREVIATED_TERM
375375
self.assert_healthcare_entities_equal(role.entity, parkinsons_abbreviation_entity)
376+
377+
@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_normalized_text(self, client):
    """Check that every healthcare entity exposes a ``normalized_text`` attribute.

    Submits a single document through ``begin_analyze_healthcare_entities``,
    waits for the long-running operation to finish and inspects the entities
    of the first (and only) document result.
    """
    result = list(client.begin_analyze_healthcare_entities(
        documents=["patients must have histologically confirmed NHL"]
    ).result())

    # currently just testing it has that attribute.
    # have an issue to update https://github.com/Azure/azure-sdk-for-python/issues/17072
    entities = result[0].entities

    # Guard against a vacuous pass: all() over an empty iterable is True.
    assert entities

    # Evaluate the predicate for each entity. The previous form filtered the
    # entities by hasattr() and then only checked the truthiness of whatever
    # was left, so it could not fail when the attribute was missing.
    assert all(hasattr(e, "normalized_text") for e in entities)

sdk/textanalytics/azure-ai-textanalytics/tests/test_analyze_healthcare_async.py

+18
Original file line numberDiff line numberDiff line change
@@ -420,4 +420,22 @@ async def test_relations(self, client):
420420
assert role.name == "AbbreviatedTerm"
421421
self.assert_healthcare_entities_equal(role.entity, parkinsons_abbreviation_entity)
422422

423+
@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_normalized_text(self, client):
    """Check that every healthcare entity exposes a ``normalized_text`` attribute.

    Async variant: awaits the poller returned by
    ``begin_analyze_healthcare_entities``, awaits its result, drains the
    async iterable of document results into a list and inspects the entities
    of the first (and only) document result.
    """
    poller = await client.begin_analyze_healthcare_entities(
        documents=["patients must have histologically confirmed NHL"]
    )
    response = await poller.result()

    result = []
    async for r in response:
        result.append(r)

    # currently just testing it has that attribute.
    # have an issue to update https://github.com/Azure/azure-sdk-for-python/issues/17072
    entities = result[0].entities

    # Guard against a vacuous pass: all() over an empty iterable is True.
    assert entities

    # Evaluate the predicate for each entity. The previous form filtered the
    # entities by hasattr() and then only checked the truthiness of whatever
    # was left, so it could not fail when the attribute was missing.
    assert all(hasattr(e, "normalized_text") for e in entities)
440+
423441

sdk/textanalytics/azure-ai-textanalytics/tests/test_repr.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ def data_source():
278278
def healthcare_entity(data_source):
279279
model = _models.HealthcareEntity(
280280
text="Bonjour",
281+
normalized_text="Bonjour",
281282
category="MyCategory",
282283
subcategory="MySubcategory",
283284
length=7,
@@ -286,7 +287,7 @@ def healthcare_entity(data_source):
286287
data_sources=[data_source[0]],
287288
)
288289
model_repr = (
289-
"HealthcareEntity(text=Bonjour, category=MyCategory, subcategory=MySubcategory, length=7, offset=12, " +
290+
"HealthcareEntity(text=Bonjour, normalized_text=Bonjour, category=MyCategory, subcategory=MySubcategory, length=7, offset=12, " +
290291
"confidence_score=0.95, data_sources=[{}])".format(data_source[1])
291292
)
292293

0 commit comments

Comments
 (0)