Skip to content

Commit ef47aec

Browse files
authored
[core][dashboard] Return the correct error message when trying to kill non-existent actors (#51355)
```sh # Launch a Ray cluster with Ray dashboard ray start --head --include-dashboard=True # Use `/api/actors/kill` to kill an non-existent actor curl -X GET -w "%{http_code}" "http://localhost:8265/api/actors/kill?actor_id=abc&force_kill=false" # [response of the curl command] # The error code is 200, and the msg is wrong. {"result": true, "msg": "Requested actor with id abc to terminate. It will exit once running tasks complete", "data": {}}200% ``` This PR corrects the error message, and the fix of status code is a follow up. With this PR, the status code changes from 200 to 500. It should be 404. This will be fixed in a follow up PR. <img width="1317" alt="image" src="https://github.com/user-attachments/assets/eaef961b-4229-4c92-9168-f2eb69434d85" /> Signed-off-by: Kai-Hsun Chen <[email protected]>
1 parent 936539c commit ef47aec

File tree

5 files changed

+80
-36
lines changed

5 files changed

+80
-36
lines changed

Diff for: python/ray/dashboard/modules/snapshot/snapshot_head.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -89,21 +89,31 @@ async def kill_actor_gcs(self, req) -> aiohttp.web.Response:
8989
success=False, message="actor_id is required."
9090
)
9191

92-
await self.gcs_aio_client.kill_actor(
92+
status_code = await self.gcs_aio_client.kill_actor(
9393
ActorID.from_hex(actor_id),
9494
force_kill,
9595
no_restart,
9696
timeout=SNAPSHOT_API_TIMEOUT_SECONDS,
9797
)
9898

99-
message = (
100-
f"Force killed actor with id {actor_id}"
101-
if force_kill
102-
else f"Requested actor with id {actor_id} to terminate. "
103-
+ "It will exit once running tasks complete"
104-
)
99+
success = status_code == 200
100+
if status_code == 404:
101+
message = f"Actor with id {actor_id} not found."
102+
elif status_code == 500:
103+
message = f"Failed to kill actor with id {actor_id}."
104+
elif status_code == 200:
105+
message = (
106+
f"Force killed actor with id {actor_id}"
107+
if force_kill
108+
else f"Requested actor with id {actor_id} to terminate. "
109+
+ "It will exit once running tasks complete"
110+
)
111+
else:
112+
message = f"Unknown status code: {status_code}. Please open a bug report in the Ray repository."
105113

106-
return dashboard_optional_utils.rest_response(success=True, message=message)
114+
# TODO(kevin85421): The utility function needs to be refactored to handle
115+
# different status codes. Currently, it only returns 200 and 500.
116+
return dashboard_optional_utils.rest_response(success=success, message=message)
107117

108118
@routes.get("/api/component_activities")
109119
async def get_component_activities(self, req) -> aiohttp.web.Response:

Diff for: python/ray/dashboard/modules/snapshot/tests/test_actors.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,18 @@ def _actor_killed_loop(worker_pid: str, timeout_secs=3) -> bool:
3535
return dead
3636

3737

38-
def _kill_actor_using_dashboard_gcs(webui_url: str, actor_id: str, force_kill=False):
38+
def _kill_actor_using_dashboard_gcs(
39+
webui_url: str, actor_id: str, expected_status_code: int, force_kill=False
40+
):
3941
resp = requests.get(
4042
webui_url + KILL_ACTOR_ENDPOINT,
4143
params={
4244
"actor_id": actor_id,
4345
"force_kill": force_kill,
4446
},
4547
)
46-
resp.raise_for_status()
48+
assert resp.status_code == expected_status_code
4749
resp_json = resp.json()
48-
assert resp_json["result"] is True, "msg" in resp_json
4950
return resp_json
5051

5152

@@ -71,8 +72,19 @@ def loop(self):
7172
worker_pid = ray.get(a.f.remote()) # noqa
7273
actor_id = a._ray_actor_id.hex()
7374

75+
OK = 200
76+
INTERNAL_ERROR = 500
77+
78+
# Kill an non-existent actor
79+
# TODO(kevin85421): It should return 404 instead of 500.
80+
resp = _kill_actor_using_dashboard_gcs(
81+
webui_url, "non-existent-actor-id", INTERNAL_ERROR
82+
)
83+
assert "not found" in resp["msg"]
84+
7485
# Kill the actor
75-
_kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=False)
86+
resp = _kill_actor_using_dashboard_gcs(webui_url, actor_id, OK, force_kill=False)
87+
assert "It will exit once running tasks complete" in resp["msg"]
7688
assert _actor_killed_loop(worker_pid)
7789

7890
# Create an actor and have it loop
@@ -82,11 +94,13 @@ def loop(self):
8294
a.loop.remote()
8395

8496
# Try to kill the actor, it should not die since a task is running
85-
_kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=False)
97+
resp = _kill_actor_using_dashboard_gcs(webui_url, actor_id, OK, force_kill=False)
98+
assert "It will exit once running tasks complete" in resp["msg"]
8699
assert not _actor_killed_loop(worker_pid, timeout_secs=1)
87100

88101
# Force kill the actor
89-
_kill_actor_using_dashboard_gcs(webui_url, actor_id, force_kill=True)
102+
resp = _kill_actor_using_dashboard_gcs(webui_url, actor_id, OK, force_kill=True)
103+
assert "Force killed actor with id" in resp["msg"]
90104
assert _actor_killed_loop(worker_pid)
91105

92106

Diff for: python/ray/includes/gcs_client.pxi

+21-11
Original file line numberDiff line numberDiff line change
@@ -421,14 +421,12 @@ cdef class InnerGcsClient:
421421
CActorID c_actor_id = actor_id.native()
422422

423423
with nogil:
424-
check_status_timeout_as_rpc_error(
425-
self.inner.get().Actors().AsyncKillActor(
426-
c_actor_id,
427-
force_kill,
428-
no_restart,
429-
StatusPyCallback(convert_status, assign_and_decrement_fut, fut),
430-
timeout_ms
431-
)
424+
self.inner.get().Actors().AsyncKillActor(
425+
c_actor_id,
426+
force_kill,
427+
no_restart,
428+
StatusPyCallback(convert_status, assign_and_decrement_fut, fut),
429+
timeout_ms
432430
)
433431
return asyncio.wrap_future(fut)
434432
#############################################################
@@ -739,12 +737,24 @@ cdef convert_get_cluster_status_reply(
739737
return None, e
740738

741739
cdef convert_status(CRayStatus status) with gil:
742-
# -> None
740+
# This function is currently only used by `async_kill_actor` to
741+
# convert RayStatus to an HTTP status code.
742+
#
743+
# Returns:
744+
# Tuple[int, Optional[Exception]]:
745+
# - int: HTTP status code.
746+
# (1) 200: Success
747+
# (2) 404: Actor not found
748+
# (3) 500: Other errors
749+
# - Optional[Exception]: Exception raised by RayStatus
743750
try:
751+
if status.IsNotFound():
752+
return 404, None
744753
check_status_timeout_as_rpc_error(status)
745-
return None, None
754+
return 200, None
746755
except Exception as e:
747-
return None, e
756+
return 500, e
757+
748758
cdef convert_optional_str_none_for_not_found(
749759
CRayStatus status, optional[c_string] c_str) with gil:
750760
# If status is NotFound, return None.

Diff for: src/ray/gcs/gcs_server/gcs_actor_manager.cc

+20-10
Original file line numberDiff line numberDiff line change
@@ -689,18 +689,28 @@ void GcsActorManager::HandleKillActorViaGcs(rpc::KillActorViaGcsRequest request,
689689
rpc::KillActorViaGcsReply *reply,
690690
rpc::SendReplyCallback send_reply_callback) {
691691
const auto &actor_id = ActorID::FromBinary(request.actor_id());
692-
bool force_kill = request.force_kill();
693-
bool no_restart = request.no_restart();
694-
if (no_restart) {
695-
DestroyActor(actor_id, GenKilledByApplicationCause(GetActor(actor_id)));
692+
auto it = registered_actors_.find(actor_id);
693+
if (it != registered_actors_.end()) {
694+
bool force_kill = request.force_kill();
695+
bool no_restart = request.no_restart();
696+
if (no_restart) {
697+
DestroyActor(actor_id, GenKilledByApplicationCause(GetActor(actor_id)));
698+
} else {
699+
KillActor(actor_id, force_kill);
700+
}
701+
702+
GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK());
703+
RAY_LOG(DEBUG).WithField(actor_id.JobId()).WithField(actor_id)
704+
<< "Finished killing actor, force_kill = " << force_kill
705+
<< ", no_restart = " << no_restart;
696706
} else {
697-
KillActor(actor_id, force_kill);
707+
GCS_RPC_SEND_REPLY(send_reply_callback,
708+
reply,
709+
Status::NotFound(absl::StrFormat(
710+
"Could not find actor with ID %s.", actor_id.Hex())));
711+
RAY_LOG(DEBUG).WithField(actor_id.JobId()).WithField(actor_id)
712+
<< "Could not find actor with ID " << actor_id.Hex();
698713
}
699-
700-
GCS_RPC_SEND_REPLY(send_reply_callback, reply, Status::OK());
701-
RAY_LOG(DEBUG).WithField(actor_id.JobId()).WithField(actor_id)
702-
<< "Finished killing actor, force_kill = " << force_kill
703-
<< ", no_restart = " << no_restart;
704714
++counts_[CountType::KILL_ACTOR_REQUEST];
705715
}
706716

Diff for: src/ray/gcs/gcs_server/gcs_actor_manager.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ class GcsActorManager : public rpc::ActorInfoHandler {
673673
/// messages come from a Driver/Worker caused by some network problems.
674674
absl::flat_hash_map<ActorID, std::vector<CreateActorCallback>>
675675
actor_to_create_callbacks_;
676-
/// All registered actors (unresoved and pending actors are also included).
676+
/// All registered actors (unresolved and pending actors are also included).
677677
/// TODO(swang): Use unique_ptr instead of shared_ptr.
678678
absl::flat_hash_map<ActorID, std::shared_ptr<GcsActor>> registered_actors_;
679679
/// All destroyed actors.

0 commit comments

Comments
 (0)