@@ -1,21 +1,21 @@
 import asyncio
 import logging
-import aiodocker
-import re
-from typing import List
+import os
+from typing import Awaitable, List

+import aiodocker
 import aiopg
 import networkx as nx
-from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
 from sqlalchemy import and_
-from simcore_sdk.config.rabbit import Config as RabbitConfig
+
 from celery import Celery
-from .exceptions import MoreThenOneItemDetected
+from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
+from simcore_sdk.config.rabbit import Config as RabbitConfig

 logger = logging.getLogger(__name__)


-def wrap_async_call(fct: asyncio.coroutine):
+def wrap_async_call(fct: Awaitable):
     return asyncio.get_event_loop().run_until_complete(fct)


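The first hunk regroups the imports, drops the now-unused re and MoreThenOneItemDetected imports (their only users are removed below), and fixes the annotation of wrap_async_call: asyncio.coroutine is a decorator, not a type, so Awaitable is the accurate hint. The helper itself is what lets the synchronous Celery worker drive the async aiodocker/aiopg code, by running an awaitable to completion on the event loop. A minimal usage sketch; the fetch_answer coroutine is hypothetical, not part of the sidecar:

import asyncio
from typing import Awaitable


def wrap_async_call(fct: Awaitable):
    # Run the awaitable to completion on the current event loop and return its result.
    return asyncio.get_event_loop().run_until_complete(fct)


async def fetch_answer() -> int:
    # Hypothetical coroutine standing in for any async sidecar helper.
    await asyncio.sleep(0.1)
    return 42


if __name__ == "__main__":
    # A synchronous caller (e.g. a Celery task body) can reach into async code this way.
    print(wrap_async_call(fetch_answer()))  # prints 42
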
@@ -76,58 +76,26 @@ def is_gpu_node() -> bool:
     """Returns True if this node has support to GPU,
     meaning that the `VRAM` label was added to it."""

-    def get_container_id_from_cgroup(cat_cgroup_content) -> str:
-        """Parses the result of cat cat /proc/self/cgroup and returns a container_id or
-        raises an error in case only one unique id was not found."""
-        possible_candidates = {x for x in cat_cgroup_content.split() if len(x) >= 64}
-        result_set = {x.split("/")[-1] for x in possible_candidates}
-        if len(result_set) != 1:
-            # pylint: disable=raising-format-tuple
-            raise MoreThenOneItemDetected(
-                "There should only be one entry in this set of possible container_ids"
-                ", have a look at %s" % possible_candidates
-            )
-        return_value = result_set.pop()
-        # check if length is 64 and all char match this regex [A-Fa-f0-9]
-        if len(return_value) != 64 and re.findall("[A-Fa-f0-9]{64}", return_value):
-            # pylint: disable=raising-format-tuple
-            raise ValueError(
-                "Found container ID is not a valid sha256 string %s", return_value
-            )
-        return return_value
-
     async def async_is_gpu_node() -> bool:
-        cmd = "cat /proc/self/cgroup"
-        proc = await asyncio.create_subprocess_shell(
-            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
-        )
-
-        stdout, _ = await proc.communicate()
-        container_id = get_container_id_from_cgroup(stdout.decode("utf-8").strip())
-
         docker = aiodocker.Docker()

-        container = await docker.containers.get(container_id)
-        container_info = await container.show()
-        node_id = container_info["Config"]["Labels"]["com.docker.swarm.node.id"]
-        node_info = await docker.nodes.inspect(node_id=node_id)
-
-        generic_resources = (
-            node_info.get("Description", {})
-            .get("Resources", {})
-            .get("GenericResources", [])
-        )
-
-        has_gpu_support = False
-        for entry in generic_resources:
-            if entry.get("DiscreteResourceSpec", {}).get("Kind") == "VRAM":
-                has_gpu_support = True
-                break
-
-        await docker.close()
-
-        logger.info("Node GPU support: %s", has_gpu_support)
-        return has_gpu_support
+        config = {
+            "Cmd": "nvidia-smi",
+            "Image": "nvidia/cuda:10.0-base",
+            "AttachStdin": False,
+            "AttachStdout": False,
+            "AttachStderr": False,
+            "Tty": False,
+            "OpenStdin": False,
+        }
+        try:
+            await docker.containers.run(
+                config=config, name=f"sidecar_{os.getpid()}_test_gpu"
+            )
+            return True
+        except aiodocker.exceptions.DockerError:
+            pass
+        return False

     return wrap_async_call(async_is_gpu_node())

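The second hunk replaces the detection strategy. The old code parsed /proc/self/cgroup to recover the sidecar's own container ID, resolved the swarm node via the com.docker.swarm.node.id label, and scanned Description.Resources.GenericResources for a VRAM entry. The new code probes the node directly: it starts a throwaway nvidia/cuda:10.0-base container that runs nvidia-smi and treats any DockerError as "no GPU". A self-contained sketch of that probe, assuming the Docker socket is reachable and that GPU nodes expose the NVIDIA runtime by default; has_gpu, the AutoRemove flag, and the gpu_probe_* container name are illustrative additions, not the sidecar's exact code:

import asyncio
import os

import aiodocker


async def has_gpu(image: str = "nvidia/cuda:10.0-base") -> bool:
    # Start a throwaway CUDA container that runs `nvidia-smi`; if Docker cannot
    # create or start it (image unavailable, no NVIDIA runtime, ...), assume no GPU.
    docker = aiodocker.Docker()
    try:
        await docker.containers.run(
            config={
                "Cmd": "nvidia-smi",
                "Image": image,
                # AutoRemove asks the daemon to delete the probe container once it exits.
                "HostConfig": {"AutoRemove": True},
            },
            name=f"gpu_probe_{os.getpid()}",
        )
        return True
    except aiodocker.exceptions.DockerError:
        return False
    finally:
        await docker.close()


if __name__ == "__main__":
    print("GPU node:", asyncio.run(has_gpu()))

One detail worth keeping in mind: the sidecar names its probe container sidecar_{os.getpid()}_test_gpu and never removes it, so a later probe from a process reusing that PID would hit a name conflict, which the except branch would report as "no GPU"; auto-removal (as in the sketch) or a unique name suffix avoids that.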