Skip to content

Commit 54fb8f6

Browse files
author
Chris Elion
authored
[MLA-1767] Refactor communicator connection exceptions (#4935)
1 parent c56c617 commit 54fb8f6

File tree

6 files changed

+143
-115
lines changed

6 files changed

+143
-115
lines changed

com.unity.ml-agents/CHANGELOG.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ and this project adheres to
5858
reduced the amount of memory allocated by approximately 25%. (#4887)
5959
- Removed several memory allocations that happened during inference with discrete actions. (#4922)
6060
- Properly catch permission errors when writing timer files. (#4921)
61-
- Unexpected gRPC exceptions during training are now logged before stopping training. If you see
62-
"noisy" log, please let us know! (#4930)
61+
- Unexpected exceptions during training initialization and shutdown are now logged. If you see
62+
"noisy" logs, please let us know! (#4930, #4935)
6363

6464
#### ml-agents / ml-agents-envs / gym-unity (Python)
6565
- Fixed a bug that would cause an exception when `RunOptions` was deserialized via `pickle`. (#4842)

com.unity.ml-agents/Runtime/Academy.cs

+30-19
Original file line numberDiff line numberDiff line change
@@ -430,32 +430,43 @@ void InitializeEnvironment()
430430
{
431431
// We try to exchange the first message with Python. If this fails, it means
432432
// no Python Process is ready to train the environment. In this case, the
433-
//environment must use Inference.
433+
// environment must use Inference.
434+
bool initSuccessful = false;
435+
var communicatorInitParams = new CommunicatorInitParameters
436+
{
437+
unityCommunicationVersion = k_ApiVersion,
438+
unityPackageVersion = k_PackageVersion,
439+
name = "AcademySingleton",
440+
CSharpCapabilities = new UnityRLCapabilities()
441+
};
442+
434443
try
435444
{
436-
var unityRlInitParameters = Communicator.Initialize(
437-
new CommunicatorInitParameters
438-
{
439-
unityCommunicationVersion = k_ApiVersion,
440-
unityPackageVersion = k_PackageVersion,
441-
name = "AcademySingleton",
442-
CSharpCapabilities = new UnityRLCapabilities()
443-
});
444-
UnityEngine.Random.InitState(unityRlInitParameters.seed);
445-
// We might have inference-only Agents, so set the seed for them too.
446-
m_InferenceSeed = unityRlInitParameters.seed;
447-
TrainerCapabilities = unityRlInitParameters.TrainerCapabilities;
448-
TrainerCapabilities.WarnOnPythonMissingBaseRLCapabilities();
445+
initSuccessful = Communicator.Initialize(
446+
communicatorInitParams,
447+
out var unityRlInitParameters
448+
);
449+
if (initSuccessful)
450+
{
451+
UnityEngine.Random.InitState(unityRlInitParameters.seed);
452+
// We might have inference-only Agents, so set the seed for them too.
453+
m_InferenceSeed = unityRlInitParameters.seed;
454+
TrainerCapabilities = unityRlInitParameters.TrainerCapabilities;
455+
TrainerCapabilities.WarnOnPythonMissingBaseRLCapabilities();
456+
}
457+
else
458+
{
459+
Debug.Log($"Couldn't connect to trainer on port {port} using API version {k_ApiVersion}. Will perform inference instead.");
460+
Communicator = null;
461+
}
449462
}
450-
catch
463+
catch (Exception ex)
451464
{
452-
Debug.Log($"" +
453-
$"Couldn't connect to trainer on port {port} using API version {k_ApiVersion}. " +
454-
"Will perform inference instead."
455-
);
465+
Debug.Log($"Unexpected exception when trying to initialize communication: {ex}\nWill perform inference instead.");
456466
Communicator = null;
457467
}
458468
}
469+
459470
if (Communicator != null)
460471
{
461472
Communicator.QuitCommandReceived += OnQuitCommandReceived;

com.unity.ml-agents/Runtime/Communicator/ICommunicator.cs

+3-2
Original file line numberDiff line numberDiff line change
@@ -130,9 +130,10 @@ internal interface ICommunicator : IDisposable
130130
/// Sends the academy parameters through the Communicator.
131131
/// Is used by the academy to send the AcademyParameters to the communicator.
132132
/// </summary>
133-
/// <returns>The External Initialization Parameters received.</returns>
133+
/// <returns>Whether the connection was successful.</returns>
134134
/// <param name="initParameters">The Unity Initialization Parameters to be sent.</param>
135-
UnityRLInitParameters Initialize(CommunicatorInitParameters initParameters);
135+
/// <param name="initParametersOut">The External Initialization Parameters received.</param>
136+
bool Initialize(CommunicatorInitParameters initParameters, out UnityRLInitParameters initParametersOut);
136137

137138
/// <summary>
138139
/// Registers a new Brain to the Communicator.

com.unity.ml-agents/Runtime/Communicator/RpcCommunicator.cs

+90-77
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ public RpcCommunicator(CommunicatorInitParameters communicatorInitParameters)
6464

6565
internal static bool CheckCommunicationVersionsAreCompatible(
6666
string unityCommunicationVersion,
67-
string pythonApiVersion,
68-
string pythonLibraryVersion)
67+
string pythonApiVersion
68+
)
6969
{
7070
var unityVersion = new Version(unityCommunicationVersion);
7171
var pythonVersion = new Version(pythonApiVersion);
@@ -92,9 +92,10 @@ internal static bool CheckCommunicationVersionsAreCompatible(
9292
/// Sends the initialization parameters through the Communicator.
9393
/// Is used by the academy to send initialization parameters to the communicator.
9494
/// </summary>
95-
/// <returns>The External Initialization Parameters received.</returns>
95+
/// <returns>Whether the connection was successful.</returns>
9696
/// <param name="initParameters">The Unity Initialization Parameters to be sent.</param>
97-
public UnityRLInitParameters Initialize(CommunicatorInitParameters initParameters)
97+
/// <param name="initParametersOut">The External Initialization Parameters received.</param>
98+
public bool Initialize(CommunicatorInitParameters initParameters, out UnityRLInitParameters initParametersOut)
9899
{
99100
var academyParameters = new UnityRLInitializationOutputProto
100101
{
@@ -113,62 +114,75 @@ public UnityRLInitParameters Initialize(CommunicatorInitParameters initParameter
113114
{
114115
RlInitializationOutput = academyParameters
115116
},
116-
out input);
117-
118-
var pythonPackageVersion = initializationInput.RlInitializationInput.PackageVersion;
119-
var pythonCommunicationVersion = initializationInput.RlInitializationInput.CommunicationVersion;
120-
var unityCommunicationVersion = initParameters.unityCommunicationVersion;
121-
122-
TrainingAnalytics.SetTrainerInformation(pythonPackageVersion, pythonCommunicationVersion);
123-
124-
var communicationIsCompatible = CheckCommunicationVersionsAreCompatible(unityCommunicationVersion,
125-
pythonCommunicationVersion,
126-
pythonPackageVersion);
127-
128-
// Initialization succeeded part-way. The most likely cause is a mismatch between the communicator
129-
// API strings, so log an explicit warning if that's the case.
130-
if (initializationInput != null && input == null)
117+
out input
118+
);
119+
}
120+
catch (Exception ex)
121+
{
122+
if (ex is RpcException rpcException)
131123
{
132-
if (!communicationIsCompatible)
133-
{
134-
Debug.LogWarningFormat(
135-
"Communication protocol between python ({0}) and Unity ({1}) have different " +
136-
"versions which make them incompatible. Python library version: {2}.",
137-
pythonCommunicationVersion, initParameters.unityCommunicationVersion,
138-
pythonPackageVersion
139-
);
140-
}
141-
else
124+
125+
switch (rpcException.Status.StatusCode)
142126
{
143-
Debug.LogWarningFormat(
144-
"Unknown communication error between Python. Python communication protocol: {0}, " +
145-
"Python library version: {1}.",
146-
pythonCommunicationVersion,
147-
pythonPackageVersion
148-
);
127+
case StatusCode.Unavailable:
128+
// This is the common case where there's no trainer to connect to.
129+
break;
130+
case StatusCode.DeadlineExceeded:
131+
// We don't currently set a deadline for connection, but likely will in the future.
132+
break;
133+
default:
134+
Debug.Log($"Unexpected gRPC exception when trying to initialize communication: {rpcException}");
135+
break;
149136
}
150-
151-
throw new UnityAgentsException("ICommunicator.Initialize() failed.");
152137
}
138+
else
139+
{
140+
Debug.Log($"Unexpected exception when trying to initialize communication: {ex}");
141+
}
142+
initParametersOut = new UnityRLInitParameters();
143+
return false;
153144
}
154-
catch
155-
{
156-
var exceptionMessage = "The Communicator was unable to connect. Please make sure the External " +
157-
"process is ready to accept communication with Unity.";
158145

159-
// Check for common error condition and add details to the exception message.
160-
var httpProxy = Environment.GetEnvironmentVariable("HTTP_PROXY");
161-
var httpsProxy = Environment.GetEnvironmentVariable("HTTPS_PROXY");
162-
if (httpProxy != null || httpsProxy != null)
146+
var pythonPackageVersion = initializationInput.RlInitializationInput.PackageVersion;
147+
var pythonCommunicationVersion = initializationInput.RlInitializationInput.CommunicationVersion;
148+
149+
TrainingAnalytics.SetTrainerInformation(pythonPackageVersion, pythonCommunicationVersion);
150+
151+
var communicationIsCompatible = CheckCommunicationVersionsAreCompatible(
152+
initParameters.unityCommunicationVersion,
153+
pythonCommunicationVersion
154+
);
155+
156+
// Initialization succeeded part-way. The most likely cause is a mismatch between the communicator
157+
// API strings, so log an explicit warning if that's the case.
158+
if (initializationInput != null && input == null)
159+
{
160+
if (!communicationIsCompatible)
161+
{
162+
Debug.LogWarningFormat(
163+
"Communication protocol between python ({0}) and Unity ({1}) have different " +
164+
"versions which make them incompatible. Python library version: {2}.",
165+
pythonCommunicationVersion, initParameters.unityCommunicationVersion,
166+
pythonPackageVersion
167+
);
168+
}
169+
else
163170
{
164-
exceptionMessage += " Try removing HTTP_PROXY and HTTPS_PROXY from the" +
165-
"environment variables and try again.";
171+
Debug.LogWarningFormat(
172+
"Unknown communication error between Python. Python communication protocol: {0}, " +
173+
"Python library version: {1}.",
174+
pythonCommunicationVersion,
175+
pythonPackageVersion
176+
);
166177
}
167-
throw new UnityAgentsException(exceptionMessage);
178+
179+
initParametersOut = new UnityRLInitParameters();
180+
return false;
168181
}
169182

170183
UpdateEnvironmentWithInput(input.RlInput);
171-
return initializationInput.RlInitializationInput.ToUnityRLInitParameters();
184+
initParametersOut = initializationInput.RlInitializationInput.ToUnityRLInitParameters();
185+
return true;
172186
}
173187

174188
/// <summary>
@@ -197,8 +211,7 @@ void UpdateEnvironmentWithInput(UnityRLInputProto rlInput)
197211
SendCommandEvent(rlInput.Command);
198212
}
199213

200-
UnityInputProto Initialize(UnityOutputProto unityOutput,
201-
out UnityInputProto unityInput)
214+
UnityInputProto Initialize(UnityOutputProto unityOutput, out UnityInputProto unityInput)
202215
{
203216
#if UNITY_EDITOR || UNITY_STANDALONE_WIN || UNITY_STANDALONE_OSX || UNITY_STANDALONE_LINUX
204217
m_IsOpen = true;
@@ -220,8 +233,7 @@ UnityInputProto Initialize(UnityOutputProto unityOutput,
220233
}
221234
return result.UnityInput;
222235
#else
223-
throw new UnityAgentsException(
224-
"You cannot perform training on this platform.");
236+
throw new UnityAgentsException("You cannot perform training on this platform.");
225237
#endif
226238
}
227239

@@ -456,33 +468,34 @@ UnityInputProto Exchange(UnityOutputProto unityOutput)
456468
QuitCommandReceived?.Invoke();
457469
return message.UnityInput;
458470
}
459-
catch (RpcException rpcException)
471+
catch (Exception ex)
460472
{
461-
// Log more verbose errors if they're something the user can possibly do something about.
462-
switch (rpcException.Status.StatusCode)
473+
if (ex is RpcException rpcException)
474+
{
475+
// Log more verbose errors if they're something the user can possibly do something about.
476+
switch (rpcException.Status.StatusCode)
477+
{
478+
case StatusCode.Unavailable:
479+
// This can happen when python disconnects. Ignore it to avoid noisy logs.
480+
break;
481+
case StatusCode.ResourceExhausted:
482+
// This happens if the message body is too large. There's no way to
483+
// gracefully handle this, but at least we can show the message and the
484+
// user can try to reduce the number of agents or observation sizes.
485+
Debug.LogError($"GRPC Exception: {rpcException.Message}. Disconnecting from trainer.");
486+
break;
487+
default:
488+
// Other unknown errors. Log at INFO level.
489+
Debug.Log($"GRPC Exception: {rpcException.Message}. Disconnecting from trainer.");
490+
break;
491+
}
492+
}
493+
else
463494
{
464-
case StatusCode.Unavailable:
465-
// This can happen when python disconnects. Ignore it to avoid noisy logs.
466-
break;
467-
case StatusCode.ResourceExhausted:
468-
// This happens is the message body is too large. There's no way to
469-
// gracefully handle this, but at least we can show the message and the
470-
// user can try to reduce the number of agents or observation sizes.
471-
Debug.LogError($"GRPC Exception: {rpcException.Message}. Disconnecting from trainer.");
472-
break;
473-
default:
474-
// Other unknown errors. Log at INFO level.
475-
Debug.Log($"GRPC Exception: {rpcException.Message}. Disconnecting from trainer.");
476-
break;
495+
// Fall-through for other error types
496+
Debug.LogError($"Communication Exception: {ex.Message}. Disconnecting from trainer.");
477497
}
478-
m_IsOpen = false;
479-
QuitCommandReceived?.Invoke();
480-
return null;
481-
}
482-
catch (Exception ex)
483-
{
484-
// Fall-through for other error types
485-
Debug.LogError($"GRPC Exception: {ex.Message}. Disconnecting from trainer.");
498+
486499
m_IsOpen = false;
487500
QuitCommandReceived?.Invoke();
488501
return null;

com.unity.ml-agents/Runtime/SideChannels/SideChannel.cs

+12-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using System.Collections.Generic;
22
using System;
3+
using UnityEngine;
34

45
namespace Unity.MLAgents.SideChannels
56
{
@@ -34,9 +35,18 @@ public Guid ChannelId
3435

3536
internal void ProcessMessage(byte[] msg)
3637
{
37-
using (var incomingMsg = new IncomingMessage(msg))
38+
try
3839
{
39-
OnMessageReceived(incomingMsg);
40+
using (var incomingMsg = new IncomingMessage(msg))
41+
{
42+
OnMessageReceived(incomingMsg);
43+
}
44+
}
45+
catch (Exception ex)
46+
{
47+
// Catch all errors in the sidechannel processing, so that a single
48+
// bad SideChannel implementation doesn't take everything down with it.
49+
Debug.LogError($"Error processing SideChannel message: {ex}.\nThe message will be skipped.");
4050
}
4151
}
4252

com.unity.ml-agents/Tests/Editor/Communicator/RpcCommunicatorTests.cs

+6-13
Original file line numberDiff line numberDiff line change
@@ -12,37 +12,30 @@ public void TestCheckCommunicationVersionsAreCompatible()
1212
{
1313
var unityVerStr = "1.0.0";
1414
var pythonVerStr = "1.0.0";
15-
var pythonPackageVerStr = "0.16.0";
1615

1716
Assert.IsTrue(RpcCommunicator.CheckCommunicationVersionsAreCompatible(unityVerStr,
18-
pythonVerStr,
19-
pythonPackageVerStr));
17+
pythonVerStr));
2018
LogAssert.NoUnexpectedReceived();
2119

2220
pythonVerStr = "1.1.0";
2321
Assert.IsTrue(RpcCommunicator.CheckCommunicationVersionsAreCompatible(unityVerStr,
24-
pythonVerStr,
25-
pythonPackageVerStr));
22+
pythonVerStr));
2623
LogAssert.NoUnexpectedReceived();
2724

2825
unityVerStr = "2.0.0";
2926
Assert.IsFalse(RpcCommunicator.CheckCommunicationVersionsAreCompatible(unityVerStr,
30-
pythonVerStr,
31-
pythonPackageVerStr));
27+
pythonVerStr));
3228

3329
unityVerStr = "0.15.0";
3430
pythonVerStr = "0.15.0";
3531
Assert.IsTrue(RpcCommunicator.CheckCommunicationVersionsAreCompatible(unityVerStr,
36-
pythonVerStr,
37-
pythonPackageVerStr));
32+
pythonVerStr));
3833
unityVerStr = "0.16.0";
3934
Assert.IsFalse(RpcCommunicator.CheckCommunicationVersionsAreCompatible(unityVerStr,
40-
pythonVerStr,
41-
pythonPackageVerStr));
35+
pythonVerStr));
4236
unityVerStr = "1.15.0";
4337
Assert.IsFalse(RpcCommunicator.CheckCommunicationVersionsAreCompatible(unityVerStr,
44-
pythonVerStr,
45-
pythonPackageVerStr));
38+
pythonVerStr));
4639

4740
}
4841
}

0 commit comments

Comments
 (0)