Skip to content

Commit 2594bfe

Browse files
authored
fix testing if node has gpu support (#1604)
* fix testing if node has gpu support
1 parent 47b24c6 commit 2594bfe

File tree

2 files changed

+38
-111
lines changed

2 files changed

+38
-111
lines changed

services/sidecar/src/simcore_service_sidecar/utils.py

+24-56
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,21 @@
11
import asyncio
22
import logging
3-
import aiodocker
4-
import re
5-
from typing import List
3+
import os
4+
from typing import Awaitable, List
65

6+
import aiodocker
77
import aiopg
88
import networkx as nx
9-
from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
109
from sqlalchemy import and_
11-
from simcore_sdk.config.rabbit import Config as RabbitConfig
10+
1211
from celery import Celery
13-
from .exceptions import MoreThenOneItemDetected
12+
from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
13+
from simcore_sdk.config.rabbit import Config as RabbitConfig
1414

1515
logger = logging.getLogger(__name__)
1616

1717

18-
def wrap_async_call(fct: asyncio.coroutine):
18+
def wrap_async_call(fct: Awaitable):
1919
return asyncio.get_event_loop().run_until_complete(fct)
2020

2121

@@ -76,58 +76,26 @@ def is_gpu_node() -> bool:
7676
"""Returns True if this node has support to GPU,
7777
meaning that the `VRAM` label was added to it."""
7878

79-
def get_container_id_from_cgroup(cat_cgroup_content) -> str:
80-
"""Parses the result of cat cat /proc/self/cgroup and returns a container_id or
81-
raises an error in case only one unique id was not found."""
82-
possible_candidates = {x for x in cat_cgroup_content.split() if len(x) >= 64}
83-
result_set = {x.split("/")[-1] for x in possible_candidates}
84-
if len(result_set) != 1:
85-
# pylint: disable=raising-format-tuple
86-
raise MoreThenOneItemDetected(
87-
"There should only be one entry in this set of possible container_ids"
88-
", have a look at %s" % possible_candidates
89-
)
90-
return_value = result_set.pop()
91-
# check if length is 64 and all char match this regex [A-Fa-f0-9]
92-
if len(return_value) != 64 and re.findall("[A-Fa-f0-9]{64}", return_value):
93-
# pylint: disable=raising-format-tuple
94-
raise ValueError(
95-
"Found container ID is not a valid sha256 string %s", return_value
96-
)
97-
return return_value
98-
9979
async def async_is_gpu_node() -> bool:
100-
cmd = "cat /proc/self/cgroup"
101-
proc = await asyncio.create_subprocess_shell(
102-
cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
103-
)
104-
105-
stdout, _ = await proc.communicate()
106-
container_id = get_container_id_from_cgroup(stdout.decode("utf-8").strip())
107-
10880
docker = aiodocker.Docker()
10981

110-
container = await docker.containers.get(container_id)
111-
container_info = await container.show()
112-
node_id = container_info["Config"]["Labels"]["com.docker.swarm.node.id"]
113-
node_info = await docker.nodes.inspect(node_id=node_id)
114-
115-
generic_resources = (
116-
node_info.get("Description", {})
117-
.get("Resources", {})
118-
.get("GenericResources", [])
119-
)
120-
121-
has_gpu_support = False
122-
for entry in generic_resources:
123-
if entry.get("DiscreteResourceSpec", {}).get("Kind") == "VRAM":
124-
has_gpu_support = True
125-
break
126-
127-
await docker.close()
128-
129-
logger.info("Node GPU support: %s", has_gpu_support)
130-
return has_gpu_support
82+
config = {
83+
"Cmd": "nvidia-smi",
84+
"Image": "nvidia/cuda:10.0-base",
85+
"AttachStdin": False,
86+
"AttachStdout": False,
87+
"AttachStderr": False,
88+
"Tty": False,
89+
"OpenStdin": False,
90+
}
91+
try:
92+
await docker.containers.run(
93+
config=config, name=f"sidecar_{os.getpid()}_test_gpu"
94+
)
95+
return True
96+
except aiodocker.exceptions.DockerError:
97+
pass
98+
return False
13199

132100
return wrap_async_call(async_is_gpu_node())
133101

services/sidecar/tests/unit/test_celery_configurator.py

+14-55
Original file line number | Diff line number | Diff line change
@@ -1,68 +1,27 @@
11
# pylint: disable=unused-argument,redefined-outer-name,no-member
2-
import pytest
32
import asyncio
43

4+
import aiodocker
5+
import pytest
6+
from celery import Celery
7+
8+
from simcore_sdk.config.rabbit import Config as RabbitConfig
9+
from simcore_service_sidecar import config
510
from simcore_service_sidecar.celery_configurator import (
611
get_rabbitmq_config_and_celery_app,
712
)
813
from simcore_service_sidecar.utils import is_gpu_node
9-
from simcore_service_sidecar import config
10-
11-
from celery import Celery
12-
from simcore_sdk.config.rabbit import Config as RabbitConfig
1314

1415

1516
def _toggle_gpu_mock(mocker, has_gpu: bool) -> None:
16-
# mock output of cat /proc/self/cgroup
17-
CAT_DATA = b"""
18-
12:hugetlb:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
19-
11:freezer:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
20-
10:blkio:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
21-
9:devices:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
22-
8:net_cls,net_prio:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
23-
7:cpuset:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
24-
6:perf_event:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
25-
5:memory:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
26-
4:rdma:/
27-
3:cpu,cpuacct:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
28-
2:pids:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
29-
1:name=systemd:/docker/2c52ab5a825dea0b074741fb1521c972866af7997a761eb312405b50ad289263
30-
0::/system.slice/containerd.service
31-
"""
32-
33-
future = asyncio.Future()
34-
future.set_result((CAT_DATA, None))
35-
comunicate = mocker.patch("asyncio.subprocess.Process.communicate")
36-
comunicate.return_value = future
37-
38-
class MockContainer:
39-
async def show(self):
40-
data = {"Config": {"Labels": {"com.docker.swarm.node.id": "node_id"}}}
41-
return data
42-
43-
future = asyncio.Future()
44-
future.set_result(MockContainer())
45-
containers_get = mocker.patch("aiodocker.containers.DockerContainers.get")
46-
containers_get.return_value = future
47-
48-
def gpu_support_key():
49-
"""if GPU support is enabled this Kind key must be present"""
50-
return "Kind" if has_gpu else "_"
51-
52-
payload = {
53-
"Description": {
54-
"Resources": {
55-
"GenericResources": [
56-
{"DiscreteResourceSpec": {gpu_support_key(): "VRAM"}}
57-
]
58-
}
59-
}
60-
}
61-
62-
future = asyncio.Future()
63-
future.set_result(payload)
64-
containers_get = mocker.patch("aiodocker.nodes.DockerSwarmNodes.inspect")
65-
containers_get.return_value = future
17+
containers_get = mocker.patch(
18+
"aiodocker.containers.DockerContainers.run", return_value=asyncio.Future()
19+
)
20+
containers_get.return_value.set_result("")
21+
if not has_gpu:
22+
containers_get.side_effect = aiodocker.exceptions.DockerError(
23+
"MOCK Error", {"message": "this is a mocked exception"}
24+
)
6625

6726

6827
@pytest.fixture()

0 commit comments

Comments (0)