
Commit 9e0d6df

Authored by GitHK (Andrei Neagu), sanderegg, and pcrespov
bugfix: addressed issues when acquiring mpi_lock in sidecar (#2093)
* mpi locking got refacted to use multiprocessing * it should no longer be possible to start multiple MPI nodes in case of errors * minor refactor * added more comments and written in a more clear logic * Update mpi_lock.py updating comment * added tests for mpi_lock module * refactored fucntion names * migrated function to private * semplified mpi_lock acquisition * updated docstring and comment * final refactor to the mpi_locking using a proper * Update services/sidecar/src/simcore_service_sidecar/mpi_lock.py Co-authored-by: Sylvain <[email protected]> * inverted test oder * before trying to acquire lock, make sure redis is working * adding more debug * Git hk fix mpi sidecar locking (#2) * Minor cleanup in pytest_simcore/docker_swarm and redis_service * Minor * tests improvmentsd - adde dmore cases - patched redis config properly * improved mpi_lock for high concurrency * Update test_mpi_lock.py we do not have such high concurrency Co-authored-by: Andrei Neagu <[email protected]> Co-authored-by: Sylvain <[email protected]> Co-authored-by: Pedro Crespo-Valero <[email protected]>
1 parent d3882e3 commit 9e0d6df
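For orientation, the pattern this commit introduces is: the sidecar spawns a daemon process that tries to take a distributed lock and keeps extending it, while the parent process blocks on a multiprocessing.Queue for the acquisition result. The sketch below is illustrative only; it fakes the lock acquisition so it runs standalone, whereas the real mpi_lock.py (diffed further down) uses aioredlock against Redis.

import multiprocessing
import time


def _worker(reply_queue: multiprocessing.Queue, cpu_count: int) -> None:
    # in the real code, cpu_count selects the lock resource name
    acquired = False
    try:
        # placeholder for the real aioredlock acquisition + is_locked verification
        acquired = True
        reply_queue.put(acquired)
        while acquired:
            # placeholder for the periodic lock-extension loop
            time.sleep(1.0)
    finally:
        # guarantee the parent always receives an answer, even on failure
        reply_queue.put(False)


def acquire_mpi_lock_sketch(cpu_count: int) -> bool:
    reply_queue = multiprocessing.Queue()
    multiprocessing.Process(
        target=_worker, args=(reply_queue, cpu_count), daemon=True
    ).start()
    # block until the daemon process reports whether the lock was acquired
    return reply_queue.get()


if __name__ == "__main__":
    print("mpi lock acquired:", acquire_mpi_lock_sketch(cpu_count=2))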

File tree: 5 files changed, +166 −108 lines

packages/pytest-simcore/src/pytest_simcore/docker_swarm.py

Lines changed: 5 additions & 5 deletions
@@ -8,7 +8,7 @@
 from datetime import datetime
 from pathlib import Path
 from pprint import pprint
-from typing import Dict
+from typing import Dict, Iterator
 
 import docker
 import pytest
@@ -20,7 +20,7 @@
 
 
 @pytest.fixture(scope="session")
-def docker_client() -> docker.client.DockerClient:
+def docker_client() -> Iterator[docker.client.DockerClient]:
     client = docker.from_env()
     yield client
 
@@ -32,7 +32,7 @@ def keep_docker_up(request) -> bool:
 
 @pytest.fixture(scope="module")
 def docker_swarm(
-    docker_client: docker.client.DockerClient, keep_docker_up: bool
+    docker_client: docker.client.DockerClient, keep_docker_up: Iterator[bool]
 ) -> None:
     try:
         docker_client.swarm.reload()
@@ -56,7 +56,7 @@ def to_datetime(datetime_str: str) -> datetime:
     return datetime.strptime(datetime_str, "%Y-%m-%dT%H:%M:%S.%f")
 
 
-def by_task_update(task: Dict) -> bool:
+def by_task_update(task: Dict) -> datetime:
     datetime_str = task["Status"]["Timestamp"]
     return to_datetime(datetime_str)
 
@@ -94,7 +94,7 @@ def docker_stack(
     core_docker_compose_file: Path,
     ops_docker_compose_file: Path,
     keep_docker_up: bool,
-) -> Dict:
+) -> Iterator[Dict]:
     stacks = {"simcore": core_docker_compose_file, "ops": ops_docker_compose_file}
 
     # make up-version
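The changes above only adjust type annotations: pytest fixtures that yield are generator functions, so they are typed as returning an Iterator of the yielded type rather than the type itself. A minimal illustration with a hypothetical fixture (not from the repo):

from typing import Iterator

import pytest


@pytest.fixture()
def resource() -> Iterator[str]:
    # a yield-fixture is a generator, so type checkers see Iterator[str],
    # even though the test that requests `resource` receives the plain str
    handle = "opened-resource"
    yield handle
    # teardown code after the yield runs once the dependent test finishes
    print(f"closing {handle}")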

packages/pytest-simcore/src/pytest_simcore/redis_service.py

Lines changed: 10 additions & 9 deletions
@@ -1,10 +1,10 @@
-import asyncio
-import logging
-
 # pylint:disable=unused-variable
 # pylint:disable=unused-argument
 # pylint:disable=redefined-outer-name
-from typing import Dict
+
+import asyncio
+import logging
+from typing import Dict, Iterator, Union
 
 import aioredis
 import pytest
@@ -18,7 +18,7 @@
 
 
 @pytest.fixture(scope="module")
-def loop(request) -> asyncio.AbstractEventLoop:
+def loop(request) -> Iterator[asyncio.AbstractEventLoop]:
     loop = asyncio.get_event_loop_policy().new_event_loop()
     yield loop
     loop.close()
@@ -27,13 +27,12 @@ def loop(request) -> asyncio.AbstractEventLoop:
 @pytest.fixture(scope="module")
 async def redis_config(loop, docker_stack: Dict, devel_environ: Dict) -> RedisConfig:
     assert "simcore_redis" in docker_stack["services"]
-
     # test runner is running on the host computer
     config = RedisConfig(
        host="127.0.0.1",
        port=get_service_published_port("simcore_redis", devel_environ["REDIS_PORT"]),
     )
-    await wait_till_redis_responsive(config.dsn)
+    await wait_till_redis_responsive(str(config.dsn))
     return config
 
 
@@ -45,7 +44,7 @@ async def redis_service(redis_config: RedisConfig, monkeypatch) -> RedisConfig:
 
 
 @pytest.fixture(scope="module")
-async def redis_client(loop, redis_config: RedisConfig) -> aioredis.Redis:
+async def redis_client(loop, redis_config: RedisConfig) -> Iterator[aioredis.Redis]:
     client = await aioredis.create_redis_pool(redis_config.dsn, encoding="utf-8")
 
     yield client
@@ -56,13 +55,15 @@ async def redis_client(loop, redis_config: RedisConfig) -> aioredis.Redis:
 
 
 # HELPERS --
+
+
 @tenacity.retry(
     wait=tenacity.wait_fixed(5),
     stop=tenacity.stop_after_attempt(60),
     before_sleep=tenacity.before_sleep_log(log, logging.INFO),
     reraise=True,
 )
-async def wait_till_redis_responsive(redis_url: URL) -> None:
+async def wait_till_redis_responsive(redis_url: Union[URL, str]) -> None:
     client = await aioredis.create_redis_pool(str(redis_url), encoding="utf-8")
     client.close()
     await client.wait_closed()
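With the widened Union[URL, str] signature, wait_till_redis_responsive accepts either a yarl URL or a plain DSN string, which is why redis_config now passes str(config.dsn). A hedged usage sketch, assuming the pytest_simcore package is importable and a Redis instance is reachable at the placeholder DSN:

import asyncio

from yarl import URL

# assumed import path, matching the file shown above
from pytest_simcore.redis_service import wait_till_redis_responsive


async def check_redis() -> None:
    dsn = "redis://127.0.0.1:6379"  # placeholder DSN
    # a plain string is now accepted ...
    await wait_till_redis_responsive(dsn)
    # ... and so is a yarl.URL, which the helper converts with str() internally
    await wait_till_redis_responsive(URL(dsn))


if __name__ == "__main__":
    asyncio.run(check_redis())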
services/sidecar/src/simcore_service_sidecar/mpi_lock.py

Lines changed: 99 additions & 94 deletions

@@ -1,131 +1,136 @@
 """
 Try to acquire a lock on the MPI resource.
 
-Due to pour non async implementation aioredlock will be used
+Due to pour non async implementation aioredlock will be used.
+All configuration is specified upfront
 
-How it works:
 
-- Try to acquire a lock the lock in a tight loop for about X seconds.
-- If it works start a task which updates the expiration every X second is spawned.
-- Ensures sleeper can be started as MPI sleeper again.
 """
 import asyncio
-import datetime
 import logging
-from threading import Thread
-from typing import Any, Callable, Optional, Tuple
+import multiprocessing
+import os
 
-from aioredlock import Aioredlock, Lock, LockError
+import aioredis
+import tenacity
+from aioredlock import Aioredlock, LockError
+from pydantic.networks import RedisDsn
 
 from . import config
 
-logger = logging.getLogger(__name__)
-
+# ptsvd cause issues with multiprocessing
+# SEE: https://github.com/microsoft/ptvsd/issues/1443
+if os.environ.get("SC_BOOT_MODE") == "debug-ptvsd":  # pragma: no cover
+    multiprocessing.set_start_method("spawn", True)
 
-async def retry_for_result(
-    result_validator: Callable[[Any], Any], coroutine_factory: Callable
-) -> Tuple[bool, Any]:
-    """
-    Will execute the given callback until the expected result is reached.
-    Between each retry it will wait 1/5 of REDLOCK_REFRESH_INTERVAL_SECONDS
-    """
-    sleep_interval = config.REDLOCK_REFRESH_INTERVAL_SECONDS / 5.0
-    elapsed = 0.0
-    start = datetime.datetime.utcnow()
-
-    while elapsed < config.REDLOCK_REFRESH_INTERVAL_SECONDS:
-        result = await coroutine_factory()
-        if result_validator(result):
-            return True, result
-        await asyncio.sleep(sleep_interval)
-        elapsed = (datetime.datetime.utcnow() - start).total_seconds()
-
-    return False, None
+logger = logging.getLogger(__name__)
 
 
-def start_background_lock_extender(
-    lock_manager: Aioredlock, lock: Lock, loop: asyncio.BaseEventLoop
+async def _wrapped_acquire_and_extend_lock_worker(
+    reply_queue: multiprocessing.Queue, cpu_count: int
 ) -> None:
-    """Will periodically extend the duration of the lock"""
-
-    async def extender_worker(lock_manager: Aioredlock):
-        sleep_interval = 0.9 * config.REDLOCK_REFRESH_INTERVAL_SECONDS
-        while True:
-            await lock_manager.extend(lock, config.REDLOCK_REFRESH_INTERVAL_SECONDS)
-
-            await asyncio.sleep(sleep_interval)
-
-    loop.run_until_complete(extender_worker(lock_manager))
-
-
-def thread_worker(
-    lock_manager: Aioredlock, lock: Lock, loop: asyncio.BaseEventLoop
+    try:
+        # if the lock is acquired the above function will block here
+        await _acquire_and_extend_lock_forever(reply_queue, cpu_count)
+    finally:
+        # if the _acquire_and_extend_lock_forever function returns
+        # the lock was not acquired, need to make sure the acquire_mpi_lock
+        # always has a result to avoid issues
+        reply_queue.put(False)
+
+
+@tenacity.retry(
+    wait=tenacity.wait_fixed(5),
+    stop=tenacity.stop_after_attempt(60),
+    before_sleep=tenacity.before_sleep_log(logger, logging.INFO),
+    reraise=True,
+)
+async def wait_till_redis_responsive(dsn: RedisDsn) -> None:
+    logger.info("Trying to connect to %s", dsn)
+    client = await aioredis.create_redis_pool(dsn, encoding="utf-8")
+    client.close()
+    await client.wait_closed()
+
+
+# trap lock_error
+async def _acquire_and_extend_lock_forever(
+    reply_queue: multiprocessing.Queue, cpu_count: int
 ) -> None:
-    start_background_lock_extender(lock_manager, lock, loop)
+    await wait_till_redis_responsive(config.CELERY_CONFIG.redis.dsn)
 
+    resource_name = f"aioredlock:mpi_lock:{cpu_count}"
+    endpoint = [
+        {
+            "host": config.CELERY_CONFIG.redis.host,
+            "port": config.CELERY_CONFIG.redis.port,
+            "db": int(config.CELERY_CONFIG.redis.db),
+        }
+    ]
+
+    logger.info("Will try to acquire an mpi_lock on %s", resource_name)
+    logger.info("Connecting to %s", endpoint)
+    lock_manager = Aioredlock(
+        redis_connections=endpoint,
+        retry_count=10,
+        internal_lock_timeout=config.REDLOCK_REFRESH_INTERVAL_SECONDS,
+    )
 
-async def try_to_acquire_lock(
-    lock_manager: Aioredlock, resource_name: str
-) -> Optional[Tuple[bool, Lock]]:
-    # Try to acquire the lock:
+    # Try to acquire the lock, it will retry it 5 times with
+    # a wait between 0.1 and 0.3 seconds between each try
+    # if the lock is not acquire a LockError is raised
     try:
-        return await lock_manager.lock(
-            resource_name, lock_timeout=config.REDLOCK_REFRESH_INTERVAL_SECONDS
-        )
+        lock = await lock_manager.lock(resource_name)
     except LockError:
-        pass
-
-    return None
+        logger.warning("Could not acquire lock on resource %s", resource_name)
+        await lock_manager.destroy()
+        return
 
+    # NOTE: in high concurrency situation you can have
+    # multiple instances acquire the same lock
+    # wait a tiny amount and read back the result of the lock acquisition
+    await asyncio.sleep(0.1)
+    # reed back result to make sure it was locked
+    is_locked = await lock_manager.is_locked(resource_name)
 
-async def acquire_lock(cpu_count: int) -> bool:
-    resource_name = f"aioredlock:mpi_lock:{cpu_count}"
-    lock_manager = Aioredlock([config.CELERY_CONFIG.redis.dsn])
-    logger.info("Will try to acquire an mpi_lock")
+    # the lock was successfully acquired, put the result in the queue
+    reply_queue.put(is_locked)
 
-    def is_locked_factory():
-        return lock_manager.is_locked(resource_name)
+    # continue renewing the lock at regular intervals
+    sleep_interval = 0.5 * config.REDLOCK_REFRESH_INTERVAL_SECONDS
+    logger.info("Starting lock extention at %s seconds interval", sleep_interval)
 
-    is_lock_free, _ = await retry_for_result(
-        result_validator=lambda x: x is False,
-        coroutine_factory=is_locked_factory,
-    )
+    try:
+        while True:
+            try:
+                await lock_manager.extend(lock)
+            except LockError:
+                logger.warning(
+                    "There was an error trying to extend the lock %s", resource_name
+                )
 
-    if not is_lock_free:
-        # it was not possible to acquire the lock
-        return False
+            await asyncio.sleep(sleep_interval)
+    finally:
+        # in case some other error occurs recycle all connections to redis
+        await lock_manager.destroy()
 
-    def try_to_acquire_lock_factory():
-        return try_to_acquire_lock(lock_manager, resource_name)
 
-    # lock is free try to acquire and start background extention
-    managed_to_acquire_lock, lock = await retry_for_result(
-        result_validator=lambda x: type(x) == Lock,
-        coroutine_factory=try_to_acquire_lock_factory,
+def _process_worker(queue: multiprocessing.Queue, cpu_count: int) -> None:
+    logger.info("Starting background process for mpi lock result")
+    asyncio.get_event_loop().run_until_complete(
+        _wrapped_acquire_and_extend_lock_worker(queue, cpu_count)
     )
-
-    if managed_to_acquire_lock:
-        Thread(
-            target=thread_worker,
-            args=(
-                lock_manager,
-                lock,
-                asyncio.get_event_loop(),
-            ),
-            daemon=True,
-        ).start()
-
-    logger.info("mpi_lock acquisition result %s", managed_to_acquire_lock)
-    return managed_to_acquire_lock
+    logger.info("Background asyncio task finished. Background process will despawn.")
 
 
 def acquire_mpi_lock(cpu_count: int) -> bool:
     """
     returns True if successfull
     Will try to acquire a distributed shared lock.
-    This operation will last up to 2 x config.REDLOCK_REFRESH_INTERVAL_SECONDS
     """
-    from .utils import wrap_async_call
+    reply_queue = multiprocessing.Queue()
+    multiprocessing.Process(
+        target=_process_worker, args=(reply_queue, cpu_count), daemon=True
+    ).start()
 
-    was_acquired = wrap_async_call(acquire_lock(cpu_count))
-    return was_acquired
+    lock_acquired = reply_queue.get()
+    return lock_acquired
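For context, a hedged sketch of how acquire_mpi_lock is meant to be consumed by the sidecar; the actual call site is not part of this diff, and running it requires the service's Redis configuration to be in place:

import multiprocessing

from simcore_service_sidecar.mpi_lock import acquire_mpi_lock

# acquire_mpi_lock blocks until the background daemon process reports
# the acquisition result through its multiprocessing.Queue
cpu_count = multiprocessing.cpu_count()
if acquire_mpi_lock(cpu_count):
    # this node now holds the distributed MPI lock; the daemon process keeps
    # extending it for as long as the service is alive
    print("starting as an MPI node")
else:
    print("another node already holds the MPI lock")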

services/sidecar/tests/integration/conftest.py

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,8 @@
     "pytest_simcore.rabbit_service",
     "pytest_simcore.postgres_service",
     "pytest_simcore.minio_service",
+    "pytest_simcore.simcore_services",
+    "pytest_simcore.redis_service",
     "pytest_simcore.simcore_storage_service",
 ]
 log = logging.getLogger(__name__)
