Skip to content

fix testing if node has gpu support #1604

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 24 additions & 56 deletions services/sidecar/src/simcore_service_sidecar/utils.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
import asyncio
import logging
import aiodocker
import re
from typing import List
import os
from typing import Awaitable, List

import aiodocker
import aiopg
import networkx as nx
from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
from sqlalchemy import and_
from simcore_sdk.config.rabbit import Config as RabbitConfig

from celery import Celery
from .exceptions import MoreThenOneItemDetected
from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
from simcore_sdk.config.rabbit import Config as RabbitConfig

logger = logging.getLogger(__name__)


def wrap_async_call(fct: asyncio.coroutine):
def wrap_async_call(fct: Awaitable):
return asyncio.get_event_loop().run_until_complete(fct)


Expand Down Expand Up @@ -76,58 +76,26 @@ def is_gpu_node() -> bool:
"""Returns True if this node has support to GPU,
meaning that the `VRAM` label was added to it."""

def get_container_id_from_cgroup(cat_cgroup_content) -> str:
"""Parses the result of `cat /proc/self/cgroup` and returns a container_id or
raises an error in case a single unique id was not found."""
possible_candidates = {x for x in cat_cgroup_content.split() if len(x) >= 64}
result_set = {x.split("/")[-1] for x in possible_candidates}
if len(result_set) != 1:
# pylint: disable=raising-format-tuple
raise MoreThenOneItemDetected(
"There should only be one entry in this set of possible container_ids"
", have a look at %s" % possible_candidates
)
return_value = result_set.pop()
# check if length is 64 and all char match this regex [A-Fa-f0-9]
if len(return_value) != 64 and re.findall("[A-Fa-f0-9]{64}", return_value):
# pylint: disable=raising-format-tuple
raise ValueError(
"Found container ID is not a valid sha256 string %s", return_value
)
return return_value

async def async_is_gpu_node() -> bool:
cmd = "cat /proc/self/cgroup"
proc = await asyncio.create_subprocess_shell(
cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
)

stdout, _ = await proc.communicate()
container_id = get_container_id_from_cgroup(stdout.decode("utf-8").strip())

docker = aiodocker.Docker()

container = await docker.containers.get(container_id)
container_info = await container.show()
node_id = container_info["Config"]["Labels"]["com.docker.swarm.node.id"]
node_info = await docker.nodes.inspect(node_id=node_id)

generic_resources = (
node_info.get("Description", {})
.get("Resources", {})
.get("GenericResources", [])
)

has_gpu_support = False
for entry in generic_resources:
if entry.get("DiscreteResourceSpec", {}).get("Kind") == "VRAM":
has_gpu_support = True
break

await docker.close()

logger.info("Node GPU support: %s", has_gpu_support)
return has_gpu_support
config = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I guess this image will never block when boots

"Cmd": "nvidia-smi",
"Image": "nvidia/cuda:10.0-base",
"AttachStdin": False,
"AttachStdout": False,
"AttachStderr": False,
"Tty": False,
"OpenStdin": False,
}
try:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TIP: suppressing exceptions is sometimes handy and more readable

from contextlib import suppress
with suppress(aiodocker.exceptions.DockerError):
    await ...
    return True
return False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did not realize this existed. cool thing! but for this pre-new-sidecar era I will keep it for a next time.

await docker.containers.run(
config=config, name=f"sidecar_{os.getpid()}_test_gpu"
)
return True
except aiodocker.exceptions.DockerError:
pass
return False

return wrap_async_call(async_is_gpu_node())

Expand Down
69 changes: 14 additions & 55 deletions services/sidecar/tests/unit/test_celery_configurator.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,27 @@
# pylint: disable=unused-argument,redefined-outer-name,no-member
import pytest
import asyncio

import aiodocker
import pytest
from celery import Celery

from simcore_sdk.config.rabbit import Config as RabbitConfig
from simcore_service_sidecar import config
from simcore_service_sidecar.celery_configurator import (
get_rabbitmq_config_and_celery_app,
)
from simcore_service_sidecar.utils import is_gpu_node
from simcore_service_sidecar import config

from celery import Celery
from simcore_sdk.config.rabbit import Config as RabbitConfig


def _toggle_gpu_mock(mocker, has_gpu: bool) -> None:
# mock output of cat /proc/self/cgroup
CAT_DATA = b"""
12:hugetlb:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
11:freezer:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
10:blkio:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
9:devices:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
8:net_cls,net_prio:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
7:cpuset:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
6:perf_event:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
5:memory:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
4:rdma:/
3:cpu,cpuacct:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
2:pids:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
1:name=systemd:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
0::/system.slice/containerd.service
"""

future = asyncio.Future()
future.set_result((CAT_DATA, None))
comunicate = mocker.patch("asyncio.subprocess.Process.communicate")
comunicate.return_value = future

class MockContainer:
async def show(self):
data = {"Config": {"Labels": {"com.docker.swarm.node.id": "node_id"}}}
return data

future = asyncio.Future()
future.set_result(MockContainer())
containers_get = mocker.patch("aiodocker.containers.DockerContainers.get")
containers_get.return_value = future

def gpu_support_key():
"""if GPU support is enabled this Kind key must be present"""
return "Kind" if has_gpu else "_"

payload = {
"Description": {
"Resources": {
"GenericResources": [
{"DiscreteResourceSpec": {gpu_support_key(): "VRAM"}}
]
}
}
}

future = asyncio.Future()
future.set_result(payload)
containers_get = mocker.patch("aiodocker.nodes.DockerSwarmNodes.inspect")
containers_get.return_value = future
containers_get = mocker.patch(
"aiodocker.containers.DockerContainers.run", return_value=asyncio.Future()
)
containers_get.return_value.set_result("")
if not has_gpu:
containers_get.side_effect = aiodocker.exceptions.DockerError(
"MOCK Error", {"message": "this is a mocked exception"}
)


@pytest.fixture()
Expand Down