fix testing if node has gpu support #1604
@@ -1,21 +1,21 @@
import asyncio
import logging
import aiodocker
import re
from typing import List
import os
from typing import Awaitable, List

import aiodocker
import aiopg
import networkx as nx
from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
from sqlalchemy import and_
from simcore_sdk.config.rabbit import Config as RabbitConfig

from celery import Celery
from .exceptions import MoreThenOneItemDetected
from simcore_postgres_database.sidecar_models import SUCCESS, comp_pipeline, comp_tasks
from simcore_sdk.config.rabbit import Config as RabbitConfig

logger = logging.getLogger(__name__)


def wrap_async_call(fct: asyncio.coroutine):
def wrap_async_call(fct: Awaitable):
    return asyncio.get_event_loop().run_until_complete(fct)
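A side note on the `wrap_async_call` signature change (`asyncio.coroutine` to `Awaitable`): the helper simply drives an awaitable to completion from synchronous code, which is how `is_gpu_node()` below calls its async inner function. A minimal usage sketch, where `async_probe()` and `probe()` are hypothetical stand-ins and not part of this PR:

```python
import asyncio
from typing import Awaitable


def wrap_async_call(fct: Awaitable):
    # Run an awaitable to completion from synchronous (e.g. Celery-facing) code.
    return asyncio.get_event_loop().run_until_complete(fct)


async def async_probe() -> bool:
    # Hypothetical stand-in for async_is_gpu_node() in the diff below.
    await asyncio.sleep(0)
    return True


def probe() -> bool:
    # Mirrors how is_gpu_node() wraps its async helper.
    return wrap_async_call(async_probe())


if __name__ == "__main__":
    print(probe())  # -> True
```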
@@ -76,58 +76,26 @@ def is_gpu_node() -> bool:
    """Returns True if this node has support to GPU,
    meaning that the `VRAM` label was added to it."""

    def get_container_id_from_cgroup(cat_cgroup_content) -> str:
        """Parses the result of cat /proc/self/cgroup and returns a container_id or
        raises an error in case only one unique id was not found."""
        possible_candidates = {x for x in cat_cgroup_content.split() if len(x) >= 64}
        result_set = {x.split("/")[-1] for x in possible_candidates}
        if len(result_set) != 1:
            # pylint: disable=raising-format-tuple
            raise MoreThenOneItemDetected(
                "There should only be one entry in this set of possible container_ids"
                ", have a look at %s" % possible_candidates
            )
        return_value = result_set.pop()
        # check if length is 64 and all chars match this regex [A-Fa-f0-9]
        if len(return_value) != 64 and re.findall("[A-Fa-f0-9]{64}", return_value):
            # pylint: disable=raising-format-tuple
            raise ValueError(
                "Found container ID is not a valid sha256 string %s", return_value
            )
        return return_value

    async def async_is_gpu_node() -> bool:
        cmd = "cat /proc/self/cgroup"
        proc = await asyncio.create_subprocess_shell(
            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
        )

        stdout, _ = await proc.communicate()
        container_id = get_container_id_from_cgroup(stdout.decode("utf-8").strip())

        docker = aiodocker.Docker()

        container = await docker.containers.get(container_id)
        container_info = await container.show()
        node_id = container_info["Config"]["Labels"]["com.docker.swarm.node.id"]
        node_info = await docker.nodes.inspect(node_id=node_id)

        generic_resources = (
            node_info.get("Description", {})
            .get("Resources", {})
            .get("GenericResources", [])
        )

        has_gpu_support = False
        for entry in generic_resources:
            if entry.get("DiscreteResourceSpec", {}).get("Kind") == "VRAM":
                has_gpu_support = True
                break

        await docker.close()

        logger.info("Node GPU support: %s", has_gpu_support)
        return has_gpu_support
        config = {
            "Cmd": "nvidia-smi",
            "Image": "nvidia/cuda:10.0-base",
            "AttachStdin": False,
            "AttachStdout": False,
            "AttachStderr": False,
            "Tty": False,
            "OpenStdin": False,
        }
        try:
Review comment: TIP: suppressing exceptions this way is sometimes handy and more readable:

    from contextlib import suppress

    with suppress(aiodocker.exceptions.DockerError):
        await ...
        return True
    return False

Reply: Did not realize this existed, cool thing! But for this pre-new-sidecar era I will keep it for a next time.
            await docker.containers.run(
                config=config, name=f"sidecar_{os.getpid()}_test_gpu"
            )
            return True
        except aiodocker.exceptions.DockerError:
            pass
        return False

    return wrap_async_call(async_is_gpu_node())
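Applied to the new probe, the reviewer's `contextlib.suppress` tip above would look roughly like the sketch below. This is a minimal sketch, not the PR's code: the container configuration is copied from the diff, while the explicit `docker` handle creation and the `finally`-based cleanup are assumptions added for self-containment.

```python
import asyncio
import os
from contextlib import suppress

import aiodocker


async def async_is_gpu_node() -> bool:
    # Sketch only: same GPU probe as in the diff, with try/except/pass
    # replaced by contextlib.suppress as suggested in the review comment.
    docker = aiodocker.Docker()
    config = {
        "Cmd": "nvidia-smi",
        "Image": "nvidia/cuda:10.0-base",
        "AttachStdin": False,
        "AttachStdout": False,
        "AttachStderr": False,
        "Tty": False,
        "OpenStdin": False,
    }
    try:
        with suppress(aiodocker.exceptions.DockerError):
            # If the container cannot be created or started (e.g. no NVIDIA
            # runtime on this node), the DockerError is swallowed and we fall
            # through to `return False` below.
            await docker.containers.run(
                config=config, name=f"sidecar_{os.getpid()}_test_gpu"
            )
            return True
        return False
    finally:
        await docker.close()
```

The synchronous wrapper, `return wrap_async_call(async_is_gpu_node())`, would stay exactly as in the diff.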
Review comment (on the nvidia/cuda:10.0-base probe image): So I guess this image will never block when it boots.
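That observation holds because `nvidia-smi` runs once and exits, so the probe container terminates immediately after start. The sketch below illustrates this, assuming the same image and aiodocker calls as the diff; `probe_gpu_once`, the container name, and the `container.wait()`/`container.delete()` cleanup are illustrative additions that the PR itself does not include.

```python
import asyncio

import aiodocker


async def probe_gpu_once() -> bool:
    """Run nvidia-smi in a throwaway CUDA container.

    The container exits as soon as nvidia-smi finishes, so it never keeps
    running on the node; waiting for it and deleting it here is an extra
    cleanup step beyond what the PR does.
    """
    docker = aiodocker.Docker()
    try:
        container = await docker.containers.run(
            config={"Cmd": ["nvidia-smi"], "Image": "nvidia/cuda:10.0-base"},
            name="gpu_probe_example",  # hypothetical name, not the PR's
        )
        await container.wait()               # returns once nvidia-smi has exited
        await container.delete(force=True)   # remove the short-lived probe container
        return True
    except aiodocker.exceptions.DockerError:
        return False
    finally:
        await docker.close()


if __name__ == "__main__":
    print(asyncio.get_event_loop().run_until_complete(probe_gpu_once()))
```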