Skip to content

Commit 691f4c8

Browse files
authored
enhancements on scicrunch classifiers (backend) (#2065)
* @GitHK review: constant attributes upon construction * @GitHK: get_instance always raises * @GitHK review: cleanup * adding tests * Normalization of RRID tags * Tests scicrunch API invariants * Improved scicrunch client interface and error handling * renamed test_scicrunch_service -> test_scircrunch_service_api * testing scicrunch client * adds new tests for groups handlers including classifiers
1 parent 16c4a86 commit 691f4c8

File tree

13 files changed

+770
-310
lines changed

13 files changed

+770
-310
lines changed

api/specs/webserver/openapi-groups.yaml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -250,10 +250,10 @@ paths:
250250
responses:
251251
"200":
252252
description: Got information of a valid RRID
253-
"404":
254-
description: Cannot find a valid research resource for the provided RRID
253+
"400":
254+
description: Invalid RRID
255255
"503":
256-
description: Underline validation service is not reachable
256+
description: scicrunch.org service is not reachable
257257
default:
258258
$ref: "#/components/responses/DefaultErrorResponse"
259259
post:
@@ -264,8 +264,10 @@ paths:
264264
responses:
265265
"200":
266266
description: Got information of a valid RRID
267-
"422":
267+
"400":
268268
description: Invalid RRID
269+
"503":
270+
description: scicrunch.org service is not reachable
269271
default:
270272
$ref: "#/components/responses/DefaultErrorResponse"
271273

@@ -284,6 +286,8 @@ paths:
284286
responses:
285287
"200":
286288
description: Got information of a valid RRID
289+
"503":
290+
description: scicrunch.org service is not reachable
287291
default:
288292
$ref: "#/components/responses/DefaultErrorResponse"
289293
components:

services/web/server/src/simcore_service_webserver/api/v0/openapi.yaml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4661,10 +4661,10 @@ paths:
46614661
responses:
46624662
'200':
46634663
description: Got information of a valid RRID
4664-
'404':
4665-
description: Cannot find a valid research resource for the provided RRID
4664+
'400':
4665+
description: Invalid RRID
46664666
'503':
4667-
description: Underline validation service is not reachable
4667+
description: scicrunch.org service is not reachable
46684668
default:
46694669
description: Default http error response body
46704670
content:
@@ -4756,8 +4756,10 @@ paths:
47564756
responses:
47574757
'200':
47584758
description: Got information of a valid RRID
4759-
'422':
4759+
'400':
47604760
description: Invalid RRID
4761+
'503':
4762+
description: scicrunch.org service is not reachable
47614763
default:
47624764
description: Default http error response body
47634765
content:
@@ -4856,6 +4858,8 @@ paths:
48564858
responses:
48574859
'200':
48584860
description: Got information of a valid RRID
4861+
'503':
4862+
description: scicrunch.org service is not reachable
48594863
default:
48604864
description: Default http error response body
48614865
content:

services/web/server/src/simcore_service_webserver/groups_classifiers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
from .constants import APP_DB_ENGINE_KEY
2121
from .scicrunch.scicrunch_db import ResearchResourceRepository
22-
from .scicrunch.service_client import SciCrunchAPI
22+
from .scicrunch.service_client import SciCrunch
2323

2424
logger = logging.getLogger(__name__)
2525

@@ -114,7 +114,7 @@ async def build_rrids_tree_view(app, tree_view_mode="std") -> Dict[str, Any]:
114114
reason="Currently only 'std' option for the classifiers tree view is implemented"
115115
)
116116

117-
scicrunch = SciCrunchAPI.get_instance(app)
117+
scicrunch = SciCrunch.get_instance(app)
118118
repo = ResearchResourceRepository(app)
119119

120120
flat_tree_view = {}

services/web/server/src/simcore_service_webserver/groups_handlers.py

Lines changed: 66 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import json
44
import logging
5-
from typing import Optional
5+
from typing import List, Optional
66

77
from aiohttp import web
88

@@ -15,8 +15,8 @@
1515
)
1616
from .login.decorators import RQT_USERID_KEY, login_required
1717
from .scicrunch.scicrunch_db import ResearchResourceRepository
18-
from .scicrunch.scicrunch_models import ListOfResourceHits, ResearchResource
19-
from .scicrunch.service_client import SciCrunchAPI
18+
from .scicrunch.scicrunch_models import ResearchResource, ResourceHit
19+
from .scicrunch.service_client import InvalidRRID, SciCrunch, ScicrunchError
2020
from .security_decorators import permission_required
2121
from .users_exceptions import UserNotFoundError
2222

@@ -199,66 +199,92 @@ async def delete_group_user(request: web.Request):
199199
@login_required
200200
@permission_required("groups.*")
201201
async def get_group_classifiers(request: web.Request):
202-
gid = int(request.match_info["gid"]) # FIXME: raise http enetity error if not int
203-
classifiers_tree_view = {}
204-
205-
repo = GroupClassifierRepository(request.app)
206-
if not await repo.group_uses_scicrunch(gid):
207-
classifiers_tree_view = await repo.get_classifiers_from_bundle(gid)
208-
else:
209-
classifiers_tree_view = await build_rrids_tree_view(
202+
try:
203+
gid = int(request.match_info["gid"])
204+
# FIXME: Raise ValidationError and handle as bad request.
205+
# Now middleware will convert as server error but it is a client error
206+
207+
repo = GroupClassifierRepository(request.app)
208+
if not await repo.group_uses_scicrunch(gid):
209+
return await repo.get_classifiers_from_bundle(gid)
210+
211+
# otherwise, build dynamic tree with RRIDs
212+
return await build_rrids_tree_view(
210213
request.app, tree_view_mode=request.query.get("tree_view", "std")
211214
)
212-
213-
return classifiers_tree_view
215+
except ScicrunchError:
216+
return {}
214217

215218

216219
# GET /groups/sparc/classifiers/scicrunch-resources/{rrid}
217220
@login_required
218221
@permission_required("groups.*")
219222
async def get_scicrunch_resource(request: web.Request):
220-
rrid = request.match_info["rrid"]
221-
rrid = SciCrunchAPI.validate_identifier(rrid)
223+
try:
224+
rrid = request.match_info["rrid"]
225+
rrid = SciCrunch.validate_identifier(rrid)
226+
227+
# check if in database first
228+
repo = ResearchResourceRepository(request.app)
229+
resource: Optional[ResearchResource] = await repo.get_resource(rrid)
230+
if not resource:
231+
# otherwise, request to scicrunch service
232+
scicrunch = SciCrunch.get_instance(request.app)
233+
resource = await scicrunch.get_resource_fields(rrid)
234+
235+
return resource.dict()
222236

223-
# check if in database first
224-
repo = ResearchResourceRepository(request.app)
225-
resource: Optional[ResearchResource] = await repo.get_resource(rrid)
226-
if not resource:
227-
# otherwise, request to scicrunch service
228-
scicrunch = SciCrunchAPI.get_instance(request.app, raises=True)
229-
scicrunch_resource = await scicrunch.get_resource_fields(rrid)
230-
resource = scicrunch_resource.convert_to_api_model()
231-
return resource.dict()
237+
except InvalidRRID as err:
238+
raise web.HTTPBadRequest(reason=err.reason) from err
239+
240+
except ScicrunchError as err:
241+
user_msg = "Cannot get RRID since scicrunch.org service is not reachable."
242+
logger.error("%s -> %s", err, user_msg)
243+
raise web.HTTPServiceUnavailable(reason=user_msg) from err
232244

233245

234246
# POST /groups/sparc/classifiers/scicrunch-resources/{rrid}
235247
@login_required
236248
@permission_required("groups.*")
237249
async def add_scicrunch_resource(request: web.Request):
238-
rrid = request.match_info["rrid"]
250+
try:
251+
rrid = request.match_info["rrid"]
252+
253+
# check if exists
254+
repo = ResearchResourceRepository(request.app)
255+
resource: Optional[ResearchResource] = await repo.get_resource(rrid)
256+
if not resource:
257+
# then request scicrunch service
258+
scicrunch = SciCrunch.get_instance(request.app)
259+
resource = await scicrunch.get_resource_fields(rrid)
239260

240-
# check if exists
241-
repo = ResearchResourceRepository(request.app)
242-
resource: Optional[ResearchResource] = await repo.get_resource(rrid)
243-
if not resource:
244-
# then request scicrunch service
245-
scicrunch = SciCrunchAPI.get_instance(request.app, raises=True)
246-
scicrunch_resource = await scicrunch.get_resource_fields(rrid)
247-
resource = scicrunch_resource.convert_to_api_model()
261+
# insert new or if exists, then update
262+
await repo.upsert(resource)
248263

249-
# insert new or if exists, then update
250-
await repo.upsert(resource)
264+
return resource.dict()
251265

252-
return resource.dict()
266+
except InvalidRRID as err:
267+
raise web.HTTPBadRequest(reason=err.reason) from err
268+
269+
except ScicrunchError as err:
270+
user_msg = "Cannot add RRID since scicrunch.org service is not reachable."
271+
logger.error("%s -> %s", err, user_msg)
272+
raise web.HTTPServiceUnavailable(reason=user_msg) from err
253273

254274

255275
# GET /groups/sparc/classifiers/scicrunch-resources:search
256276
@login_required
257277
@permission_required("groups.*")
258278
async def search_scicrunch_resources(request: web.Request):
259-
guess_name: str = request.query["guess_name"]
279+
try:
280+
guess_name = str(request.query["guess_name"]).strip()
281+
282+
scicrunch = SciCrunch.get_instance(request.app)
283+
hits: List[ResourceHit] = await scicrunch.search_resource(guess_name)
260284

261-
scicrunch = SciCrunchAPI.get_instance(request.app, raises=True)
262-
hits: ListOfResourceHits = await scicrunch.search_resource(guess_name)
285+
return [hit.dict() for hit in hits]
263286

264-
return hits.dict()["__root__"]
287+
except ScicrunchError as err:
288+
user_msg = "Cannot search since scicrunch.org service is not reachable."
289+
logger.error("%s -> %s", err, user_msg)
290+
raise web.HTTPServiceUnavailable(reason=user_msg) from err

services/web/server/src/simcore_service_webserver/scicrunch/_config.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,6 @@
33
# TODO: read https://www.force11.org/group/resource-identification-initiative
44
SCICRUNCH_DEFAULT_URL = "https://scicrunch.org"
55

6-
# To ensure they are recognizable, unique, and traceable, identifiers are prefixed with " RRID: ",
7-
# followed by a second tag that indicates the source authority that provided it
8-
# (e.g. "AB" for the Antibody Registry, "CVCL" for the Cellosaurus, "MMRRC" for Mutant Mouse Regional Resource Centers,
9-
# "SCR" for the SciCrunch registry of tools).
10-
# SEE https://scicrunch.org/resources
11-
12-
STRICT_RRID_PATTERN = r"(RRID:)\s*(SCR_\d+)"
13-
RRID_PATTERN = r"(RRID:)?\s*(SCR_\d+)"
14-
156

167
class SciCrunchSettings(BaseSettings):
178

services/web/server/src/simcore_service_webserver/scicrunch/scicrunch_models.py

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,61 @@
33
"""
44

55
import logging
6+
import re
67
from datetime import datetime
78
from typing import Any, Dict, List, Union
89

910
from pydantic import BaseModel, Field, constr, validator
1011
from yarl import URL
1112

12-
from ._config import STRICT_RRID_PATTERN
13-
1413
logger = logging.getLogger(__name__)
1514

1615

16+
# Research Resource Identifiers --------------------------------
17+
#
18+
# To ensure they are recognizable, unique, and traceable,
19+
# identifiers are prefixed with "RRID:",
20+
# followed by a second tag that indicates the source authority that provided it:
21+
#
22+
# "AB" for the Antibody Registry,
23+
# "CVCL" for the Cellosaurus,
24+
# "MMRRC" for Mutant Mouse Regional Resource Centers,
25+
# "SCR" for the SciCrunch registry of tools
26+
#
27+
# SEE https://scicrunch.org/resources
28+
29+
STRICT_RRID_PATTERN = r"^(RRID:)([^_\s]+)_(\S+)$"  # Expected in db labels and models

# Lenient pattern used to parse user input: the "RRID:" prefix is optional and
# may be followed by whitespace before the "<source-authority>_<identifier>" tag.
RRID_TAG_PATTERN = r"(RRID:)?\s*([^:_\s]+)_(\S+)"
rrid_capture_re = re.compile(RRID_TAG_PATTERN)


def normalize_rrid_tags(rrid_tag: str, *, with_prefix: bool = True) -> str:
    """Parse a loosely formatted RRID tag and return it in normalized form.

    The first occurrence of ``[RRID:][spaces]<source-authority>_<identifier>``
    found in ``rrid_tag`` is extracted and re-assembled without any whitespace.

    Args:
        rrid_tag: Text holding the tag, e.g. ``"SCR_018997"`` or
            ``"RRID: SCR_018997"``.
        with_prefix: When true (default) the result starts with ``"RRID:"``;
            otherwise only ``<source-authority>_<identifier>`` is returned.

    Returns:
        The normalized tag, e.g. ``"RRID:SCR_018997"``.

    Raises:
        ValueError: If ``rrid_tag`` contains nothing matching the RRID pattern.
    """
    # validate & parse: check the match explicitly instead of relying on the
    # AttributeError of `None.groups()`, so the ValueError is raised cleanly
    # (no implicit exception chaining) and unrelated errors are never masked.
    match = rrid_capture_re.search(rrid_tag)
    if match is None:
        raise ValueError(f"'{rrid_tag}' does not match a RRID pattern")

    # format according to norm
    _, source_authority, identifier = match.groups()
    rrid = f"{source_authority}_{identifier}"
    return f"RRID:{rrid}" if with_prefix else rrid
46+
47+
1748
# webserver API models -----------------------------------------
1849
class ResearchResource(BaseModel):
19-
rrid: constr(
20-
regex=STRICT_RRID_PATTERN
21-
) # unique identifier used as classifier, i.e. to tag studies and services
50+
rrid: constr(regex=STRICT_RRID_PATTERN) = Field(
51+
...,
52+
description="Unique identifier used as classifier, i.e. to tag studies and services",
53+
)
2254
name: str
2355
description: str
2456

2557
@validator("rrid", pre=True)
2658
@classmethod
2759
def format_rrid(cls, v):
28-
if not v.startswith("RRID:"):
29-
return f"RRID: {v}"
30-
return v
60+
return normalize_rrid_tags(v, with_prefix=True)
3161

3262
class Config:
3363
orm_mode = True
@@ -47,22 +77,14 @@ class ResearchResourceAtdB(ResearchResource):
4777
class FieldItem(BaseModel):
4878
field_name: str = Field(..., alias="field")
4979
required: bool
50-
# field_type: str = Field(..., alias="type") # text, textarea, resource-types, ...
51-
# max_number: str # convertable to int
5280
value: Union[str, None, List[Any]] = None
53-
# position: int
54-
# display: str # title, descripiotn, url, text, owner-text
55-
alt: str # alternative text
5681

5782

5883
class ResourceView(BaseModel):
5984
resource_fields: List[FieldItem] = Field([], alias="fields")
6085
version: int
6186
curation_status: str
6287
last_curated_version: int
63-
# uuid: UUID
64-
# NOTE: image_src is a path from https://scicrunch.org/ e.g. https://scicrunch.org/upload/resource-images/18997.png
65-
# image_src: Optional[str]
6688
scicrunch_id: str
6789

6890
@classmethod
@@ -74,8 +96,6 @@ def from_response_payload(cls, payload: Dict):
7496
def is_curated(self) -> bool:
7597
return self.curation_status.lower() == "curated"
7698

77-
# TODO: add validator to capture only selected fields
78-
7999
def _get_field(self, fieldname: str):
80100
for field in self.resource_fields:
81101
if field.field_name == fieldname:
@@ -91,17 +111,9 @@ def get_description(self):
91111
def get_resource_url(self):
92112
return URL(str(self._get_field("Resource URL")))
93113

94-
def convert_to_api_model(self) -> ResearchResource:
95-
return ResearchResource(
96-
rrid=self.scicrunch_id,
97-
name=self.get_name(),
98-
description=self.get_description(),
99-
)
100-
101114

102115
class ResourceHit(BaseModel):
103116
rrid: str = Field(..., alias="rid")
104-
# original_id: str
105117
name: str
106118

107119

0 commit comments

Comments
 (0)