Skip to content

Commit 2594bfe

Browse files
authored
fix testing if node has gpu support (#1604)
* fix testing if node has gpu support
1 parent 47b24c6 commit 2594bfe

File tree

2 files changed

+38
-111
lines changed

2 files changed

+38
-111
lines changed

services/sidecar/src/simcore_service_sidecar/utils.py

+24-56
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,21 @@
11
import asyncio
22
import logging
3-
import aiodocker
4-
import re
5-
from typing import List
3+
import os
4+
from typing import Awaitable, List
65

6+
import aiodocker
77
import aiopg
88
import networkx as nx
9-
from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
109
from sqlalchemy import and_
11-
from simcore_sdk.config.rabbit import Config as RabbitConfig
10+
1211
from celery import Celery
13-
from .exceptions import MoreThenOneItemDetected
12+
from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
13+
from simcore_sdk.config.rabbit import Config as RabbitConfig
1414

1515
logger = logging.getLogger(__name__)
1616

1717

18-
def wrap_async_call(fct: asyncio.coroutine):
18+
def wrap_async_call(fct: Awaitable):
1919
return asyncio.get_event_loop().run_until_complete(fct)
2020

2121

@@ -76,58 +76,26 @@ def is_gpu_node() -> bool:
7676
"""Returns True if this node has support to GPU,
7777
meaning that the `VRAM` label was added to it."""
7878

79-
def get_container_id_from_cgroup(cat_cgroup_content) -> str:
80-
"""Parses the result of cat cat /proc/self/cgroup and returns a container_id or
81-
raises an error in case only one unique id was not found."""
82-
possible_candidates = {x for x in cat_cgroup_content.split() if len(x) >= 64}
83-
result_set = {x.split("/")[-1] for x in possible_candidates}
84-
if len(result_set) != 1:
85-
# pylint: disable=raising-format-tuple
86-
raise MoreThenOneItemDetected(
87-
"There should only be one entry in this set of possible container_ids"
88-
", have a look at %s" % possible_candidates
89-
)
90-
return_value = result_set.pop()
91-
# check if length is 64 and all char match this regex [A-Fa-f0-9]
92-
if len(return_value) != 64 and re.findall("[A-Fa-f0-9]{64}", return_value):
93-
# pylint: disable=raising-format-tuple
94-
raise ValueError(
95-
"Found container ID is not a valid sha256 string %s", return_value
96-
)
97-
return return_value
98-
9979
async def async_is_gpu_node() -> bool:
100-
cmd = "cat /proc/self/cgroup"
101-
proc = await asyncio.create_subprocess_shell(
102-
cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
103-
)
104-
105-
stdout, _ = await proc.communicate()
106-
container_id = get_container_id_from_cgroup(stdout.decode("utf-8").strip())
107-
10880
docker = aiodocker.Docker()
10981

110-
container = await docker.containers.get(container_id)
111-
container_info = await container.show()
112-
node_id = container_info["Config"]["Labels"]["com.docker.swarm.node.id"]
113-
node_info = await docker.nodes.inspect(node_id=node_id)
114-
115-
generic_resources = (
116-
node_info.get("Description", {})
117-
.get("Resources", {})
118-
.get("GenericResources", [])
119-
)
120-
121-
has_gpu_support = False
122-
for entry in generic_resources:
123-
if entry.get("DiscreteResourceSpec", {}).get("Kind") == "VRAM":
124-
has_gpu_support = True
125-
break
126-
127-
await docker.close()
128-
129-
logger.info("Node GPU support: %s", has_gpu_support)
130-
return has_gpu_support
82+
config = {
83+
"Cmd": "nvidia-smi",
84+
"Image": "nvidia/cuda:10.0-base",
85+
"AttachStdin": False,
86+
"AttachStdout": False,
87+
"AttachStderr": False,
88+
"Tty": False,
89+
"OpenStdin": False,
90+
}
91+
try:
92+
await docker.containers.run(
93+
config=config, name=f"sidecar_{os.getpid()}_test_gpu"
94+
)
95+
return True
96+
except aiodocker.exceptions.DockerError:
97+
pass
98+
return False
13199

132100
return wrap_async_call(async_is_gpu_node())
133101

services/sidecar/tests/unit/test_celery_configurator.py

+14-55
Original file line number | Diff line number | Diff line change
@@ -1,68 +1,27 @@
11
# pylint: disable=unused-argument,redefined-outer-name,no-member
2-
import pytest
32
import asyncio
43

4+
import aiodocker
5+
import pytest
6+
from celery import Celery
7+
8+
from simcore_sdk.config.rabbit import Config as RabbitConfig
9+
from simcore_service_sidecar import config
510
from simcore_service_sidecar.celery_configurator import (
611
get_rabbitmq_config_and_celery_app,
712
)
813
from simcore_service_sidecar.utils import is_gpu_node
9-
from simcore_service_sidecar import config
10-
11-
from celery import Celery
12-
from simcore_sdk.config.rabbit import Config as RabbitConfig
1314

1415

1516
def _toggle_gpu_mock(mocker, has_gpu: bool) -> None:
16-
# mock output of cat /proc/self/cgroup
17-
CAT_DATA = b"""
18-
12:hugetlb:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
19-
11:freezer:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
20-
10:blkio:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
21-
9:devices:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
22-
8:net_cls,net_prio:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
23-
7:cpuset:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
24-
6:perf_event:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
25-
5:memory:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
26-
4:rdma:/
27-
3:cpu,cpuacct:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
28-
2:pids:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
29-
1:name=systemd:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
30-
0::/system.slice/containerd.service
31-
"""
32-
33-
future = asyncio.Future()
34-
future.set_result((CAT_DATA, None))
35-
comunicate = mocker.patch("asyncio.subprocess.Process.communicate")
36-
comunicate.return_value = future
37-
38-
class MockContainer:
39-
async def show(self):
40-
data = {"Config": {"Labels": {"com.docker.swarm.node.id": "node_id"}}}
41-
return data
42-
43-
future = asyncio.Future()
44-
future.set_result(MockContainer())
45-
containers_get = mocker.patch("aiodocker.containers.DockerContainers.get")
46-
containers_get.return_value = future
47-
48-
def gpu_support_key():
49-
"""if GPU support is enabled this Kind key must be present"""
50-
return "Kind" if has_gpu else "_"
51-
52-
payload = {
53-
"Description": {
54-
"Resources": {
55-
"GenericResources": [
56-
{"DiscreteResourceSpec": {gpu_support_key(): "VRAM"}}
57-
]
58-
}
59-
}
60-
}
61-
62-
future = asyncio.Future()
63-
future.set_result(payload)
64-
containers_get = mocker.patch("aiodocker.nodes.DockerSwarmNodes.inspect")
65-
containers_get.return_value = future
17+
containers_get = mocker.patch(
18+
"aiodocker.containers.DockerContainers.run", return_value=asyncio.Future()
19+
)
20+
containers_get.return_value.set_result("")
21+
if not has_gpu:
22+
containers_get.side_effect = aiodocker.exceptions.DockerError(
23+
"MOCK Error", {"message": "this is a mocked exception"}
24+
)
6625

6726

6827
@pytest.fixture()

0 commit comments

Comments (0)