Skip to content

Commit f94f359

Browse files
fix tensorflow test hanging issue (#4997)
* fix tensorflow test hanging issue * set a smaller timeout for resource downloads and ignore exceptions within retries * address review comments * only override the timeout variable for tensorflow tests
1 parent 26918a4 commit f94f359

File tree

3 files changed

+78
-80
lines changed

3 files changed

+78
-80
lines changed

src/Microsoft.ML.Core/Utilities/ResourceManagerUtils.cs

+29-10
Original file line numberDiff line numberDiff line change
@@ -127,14 +127,22 @@ private async Task<string> DownloadFromUrlWithRetryAsync(IHostEnvironment env, I
127127

128128
for (int i = 0; i < retryTimes; ++i)
129129
{
130-
var thisDownloadResult = await DownloadFromUrlAsync(env, ch, url, fileName, timeout, filePath);
130+
try
131+
{
132+
var thisDownloadResult = await DownloadFromUrlAsync(env, ch, url, fileName, timeout, filePath);
131133

132-
if (string.IsNullOrEmpty(thisDownloadResult))
133-
return thisDownloadResult;
134-
else
135-
downloadResult += thisDownloadResult + @"\n";
134+
if (string.IsNullOrEmpty(thisDownloadResult))
135+
return thisDownloadResult;
136+
else
137+
downloadResult += thisDownloadResult + @"\n";
136138

137-
await Task.Delay(10 * 1000);
139+
await Task.Delay(10 * 1000);
140+
}
141+
catch (Exception ex)
142+
{
143+
// Ignore any exception and retry the download.
144+
ch.Warning($"{i+1} - th try: Dowload {fileName} from {url} fail with exception {ex.Message}");
145+
}
138146
}
139147

140148
return downloadResult;
@@ -257,6 +265,8 @@ private Exception DownloadResource(IHostEnvironment env, IChannel ch, WebClient
257265
string tempPath = Path.GetFullPath(Path.Combine(Path.GetDirectoryName(path), "temp-resource-" + guid.ToString()));
258266
try
259267
{
268+
int blockSize = 4096;
269+
260270
using (var s = webClient.OpenRead(uri))
261271
using (var fh = env.CreateOutputFile(tempPath))
262272
using (var ws = fh.CreateWriteStream())
@@ -268,15 +278,24 @@ private Exception DownloadResource(IHostEnvironment env, IChannel ch, WebClient
268278
size = 10000000;
269279

270280
long printFreq = (long)(size / 10.0);
271-
var buffer = new byte[4096];
281+
var buffer = new byte[blockSize];
272282
long total = 0;
273-
int count;
283+
274284
// REVIEW: use a progress channel instead.
275-
while ((count = s.Read(buffer, 0, 4096)) > 0)
285+
while (true)
276286
{
287+
var task = s.ReadAsync(buffer, 0, blockSize, ct);
288+
task.Wait();
289+
int count = task.Result;
290+
291+
if(count <= 0)
292+
{
293+
break;
294+
}
295+
277296
ws.Write(buffer, 0, count);
278297
total += count;
279-
if ((total - (total / printFreq) * printFreq) <= 4096)
298+
if ((total - (total / printFreq) * printFreq) <= blockSize)
280299
ch.Info($"{fileName}: Downloaded {total} bytes out of {size}");
281300
if (ct.IsCancellationRequested)
282301
{

test/Microsoft.ML.Core.Tests/UnitTests/TestResourceDownload.cs

-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ public TestResourceDownload(ITestOutputHelper helper)
2626
public async Task TestDownloadError()
2727
{
2828
var envVarOld = Environment.GetEnvironmentVariable(ResourceManagerUtils.CustomResourcesUrlEnvVariable);
29-
var timeoutVarOld = Environment.GetEnvironmentVariable(ResourceManagerUtils.TimeoutEnvVariable);
3029
var resourcePathVarOld = Environment.GetEnvironmentVariable(Utils.CustomSearchDirEnvVariable);
3130
Environment.SetEnvironmentVariable(Utils.CustomSearchDirEnvVariable, null);
3231

@@ -134,7 +133,6 @@ public async Task TestDownloadError()
134133
{
135134
// Set environment variable back to its old value.
136135
Environment.SetEnvironmentVariable(ResourceManagerUtils.CustomResourcesUrlEnvVariable, envVarOld);
137-
Environment.SetEnvironmentVariable(ResourceManagerUtils.TimeoutEnvVariable, timeoutVarOld);
138136
Environment.SetEnvironmentVariable(Utils.CustomSearchDirEnvVariable, resourcePathVarOld);
139137
}
140138
}

test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs

+49-68
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
using static Microsoft.ML.DataOperationsCatalog;
2424
using Microsoft.ML.Trainers;
2525
using Microsoft.ML.TestFrameworkCommon.Attributes;
26+
using Microsoft.ML.Internal.Utilities;
2627

2728
namespace Microsoft.ML.Scenarios
2829
{
@@ -61,8 +62,34 @@ public void Dispose()
6162
[Collection("NoParallelization")]
6263
public sealed class TensorFlowScenariosTests : BaseTestClass, IClassFixture<TensorFlowScenariosTestsFixture>
6364
{
65+
private readonly string _fullImagesetFolderPath;
66+
private readonly string _finalImagesFolderName;
67+
private string _timeOutOldValue;
68+
6469
public TensorFlowScenariosTests(ITestOutputHelper output) : base(output)
6570
{
71+
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
72+
"images");
73+
74+
//Download the image set and unzip
75+
_finalImagesFolderName = DownloadImageSet(
76+
imagesDownloadFolderPath);
77+
78+
_fullImagesetFolderPath = Path.Combine(
79+
imagesDownloadFolderPath, _finalImagesFolderName);
80+
}
81+
82+
protected override void Initialize()
83+
{
84+
// Set the timeout to 3 minutes: downloads sometimes get stuck, so use a smaller timeout to fail fast and retry the download.
85+
_timeOutOldValue = Environment.GetEnvironmentVariable(ResourceManagerUtils.TimeoutEnvVariable);
86+
Environment.SetEnvironmentVariable(ResourceManagerUtils.TimeoutEnvVariable, (3 * 60 * 1000).ToString());
87+
}
88+
89+
protected override void Cleanup()
90+
{
91+
// set back timeout value
92+
Environment.SetEnvironmentVariable(ResourceManagerUtils.TimeoutEnvVariable, _timeOutOldValue);
6693
}
6794

6895
private class TestData
@@ -1250,25 +1277,13 @@ public void TensorFlowStringTest()
12501277
}
12511278

12521279
[TensorFlowFact]
1253-
// This test hangs occasionally
1254-
[Trait("Category", "SkipInCI")]
12551280
public void TensorFlowImageClassificationDefault()
12561281
{
1257-
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
1258-
"images");
1259-
1260-
//Download the image set and unzip
1261-
string finalImagesFolderName = DownloadImageSet(
1262-
imagesDownloadFolderPath);
1263-
1264-
string fullImagesetFolderPath = Path.Combine(
1265-
imagesDownloadFolderPath, finalImagesFolderName);
1266-
12671282
MLContext mlContext = new MLContext(seed: 1);
12681283

12691284
//Load all the original images info
12701285
IEnumerable<ImageData> images = LoadImagesFromDirectory(
1271-
folder: fullImagesetFolderPath, useFolderNameAsLabel: true);
1286+
folder: _fullImagesetFolderPath, useFolderNameAsLabel: true);
12721287

12731288
IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows(
12741289
mlContext.Data.LoadFromEnumerable(images), seed: 1);
@@ -1285,7 +1300,7 @@ public void TensorFlowImageClassificationDefault()
12851300
IDataView trainDataset = trainTestData.TrainSet;
12861301
IDataView testDataset = trainTestData.TestSet;
12871302

1288-
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
1303+
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
12891304
.Append(mlContext.MulticlassClassification.Trainers.ImageClassification("Label", "Image")
12901305
.Append(mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel", inputColumnName: "PredictedLabel"))); ;
12911306

@@ -1338,25 +1353,13 @@ internal bool ShouldReuse(string workspacePath, string trainSetBottleneckCachedV
13381353
[InlineData(ImageClassificationTrainer.Architecture.MobilenetV2)]
13391354
[InlineData(ImageClassificationTrainer.Architecture.ResnetV250)]
13401355
[InlineData(ImageClassificationTrainer.Architecture.InceptionV3)]
1341-
//Skipping test temporarily. This test will be re-enabled once the cause of failures has been determined
1342-
[Trait("Category", "SkipInCI")]
13431356
public void TensorFlowImageClassification(ImageClassificationTrainer.Architecture arch)
13441357
{
1345-
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
1346-
"images");
1347-
1348-
//Download the image set and unzip
1349-
string finalImagesFolderName = DownloadImageSet(
1350-
imagesDownloadFolderPath);
1351-
1352-
string fullImagesetFolderPath = Path.Combine(
1353-
imagesDownloadFolderPath, finalImagesFolderName);
1354-
13551358
MLContext mlContext = new MLContext(seed: 1);
13561359

13571360
//Load all the original images info
13581361
IEnumerable<ImageData> images = LoadImagesFromDirectory(
1359-
folder: fullImagesetFolderPath, useFolderNameAsLabel: true);
1362+
folder: _fullImagesetFolderPath, useFolderNameAsLabel: true);
13601363

13611364
IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows(
13621365
mlContext.Data.LoadFromEnumerable(images), seed: 1);
@@ -1372,13 +1375,13 @@ public void TensorFlowImageClassification(ImageClassificationTrainer.Architectur
13721375

13731376
IDataView trainDataset = trainTestData.TrainSet;
13741377
IDataView testDataset = trainTestData.TestSet;
1375-
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
1378+
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
13761379
.Fit(testDataset)
13771380
.Transform(testDataset);
13781381

13791382
// Check if the bottleneck cached values already exist
13801383
var (trainSetBottleneckCachedValuesFileName, validationSetBottleneckCachedValuesFileName,
1381-
workspacePath, isReuse) = getInitialParameters(arch, finalImagesFolderName);
1384+
workspacePath, isReuse) = getInitialParameters(arch, _finalImagesFolderName);
13821385

13831386
var options = new ImageClassificationTrainer.Options()
13841387
{
@@ -1401,7 +1404,7 @@ public void TensorFlowImageClassification(ImageClassificationTrainer.Architectur
14011404
ValidationSet = validationSet
14021405
};
14031406

1404-
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
1407+
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
14051408
.Append(mlContext.MulticlassClassification.Trainers.ImageClassification(options)
14061409
.Append(mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel", inputColumnName: "PredictedLabel")));
14071410

@@ -1429,9 +1432,9 @@ public void TensorFlowImageClassification(ImageClassificationTrainer.Architectur
14291432
.CreatePredictionEngine<ImageData, ImagePrediction>(loadedModel);
14301433

14311434
IEnumerable<ImageData> testImages = LoadImagesFromDirectory(
1432-
fullImagesetFolderPath, true);
1435+
_fullImagesetFolderPath, true);
14331436

1434-
string[] directories = Directory.GetDirectories(fullImagesetFolderPath);
1437+
string[] directories = Directory.GetDirectories(_fullImagesetFolderPath);
14351438
string[] labels = new string[directories.Length];
14361439
for (int j = 0; j < labels.Length; j++)
14371440
{
@@ -1442,13 +1445,13 @@ public void TensorFlowImageClassification(ImageClassificationTrainer.Architectur
14421445
// Test daisy image
14431446
ImageData firstImageToPredict = new ImageData
14441447
{
1445-
ImagePath = Path.Combine(fullImagesetFolderPath, "daisy", "5794835_d15905c7c8_n.jpg")
1448+
ImagePath = Path.Combine(_fullImagesetFolderPath, "daisy", "5794835_d15905c7c8_n.jpg")
14461449
};
14471450

14481451
// Test rose image
14491452
ImageData secondImageToPredict = new ImageData
14501453
{
1451-
ImagePath = Path.Combine(fullImagesetFolderPath, "roses", "12240303_80d87f77a3_n.jpg")
1454+
ImagePath = Path.Combine(_fullImagesetFolderPath, "roses", "12240303_80d87f77a3_n.jpg")
14521455
};
14531456

14541457
var predictionFirst = predictionEngine.Predict(firstImageToPredict);
@@ -1486,21 +1489,11 @@ public void TensorFlowImageClassificationWithPolynomialLRScheduling()
14861489

14871490
internal void TensorFlowImageClassificationWithLRScheduling(LearningRateScheduler learningRateScheduler, int epoch)
14881491
{
1489-
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
1490-
"images");
1491-
1492-
//Download the image set and unzip
1493-
string finalImagesFolderName = DownloadImageSet(
1494-
imagesDownloadFolderPath);
1495-
1496-
string fullImagesetFolderPath = Path.Combine(
1497-
imagesDownloadFolderPath, finalImagesFolderName);
1498-
14991492
MLContext mlContext = new MLContext(seed: 1);
15001493

15011494
//Load all the original images info
15021495
IEnumerable<ImageData> images = LoadImagesFromDirectory(
1503-
folder: fullImagesetFolderPath, useFolderNameAsLabel: true);
1496+
folder: _fullImagesetFolderPath, useFolderNameAsLabel: true);
15041497

15051498
IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows(
15061499
mlContext.Data.LoadFromEnumerable(images), seed: 1);
@@ -1516,13 +1509,13 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
15161509

15171510
IDataView trainDataset = trainTestData.TrainSet;
15181511
IDataView testDataset = trainTestData.TestSet;
1519-
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
1512+
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
15201513
.Fit(testDataset)
15211514
.Transform(testDataset);
15221515

15231516
// Check if the bottleneck cached values already exist
15241517
var (trainSetBottleneckCachedValuesFileName, validationSetBottleneckCachedValuesFileName,
1525-
workspacePath, isReuse) = getInitialParameters(ImageClassificationTrainer.Architecture.ResnetV2101, finalImagesFolderName);
1518+
workspacePath, isReuse) = getInitialParameters(ImageClassificationTrainer.Architecture.ResnetV2101, _finalImagesFolderName);
15261519

15271520
var options = new ImageClassificationTrainer.Options()
15281521
{
@@ -1546,7 +1539,7 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
15461539
LearningRateScheduler = learningRateScheduler
15471540
};
15481541

1549-
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
1542+
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
15501543
.Append(mlContext.MulticlassClassification.Trainers.ImageClassification(options))
15511544
.Append(mlContext.Transforms.Conversion.MapKeyToValue(
15521545
outputColumnName: "PredictedLabel",
@@ -1575,9 +1568,9 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
15751568
.CreatePredictionEngine<ImageData, ImagePrediction>(loadedModel);
15761569

15771570
IEnumerable<ImageData> testImages = LoadImagesFromDirectory(
1578-
fullImagesetFolderPath, true);
1571+
_fullImagesetFolderPath, true);
15791572

1580-
string[] directories = Directory.GetDirectories(fullImagesetFolderPath);
1573+
string[] directories = Directory.GetDirectories(_fullImagesetFolderPath);
15811574
string[] labels = new string[directories.Length];
15821575
for (int j = 0; j < labels.Length; j++)
15831576
{
@@ -1588,13 +1581,13 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
15881581
// Test daisy image
15891582
ImageData firstImageToPredict = new ImageData
15901583
{
1591-
ImagePath = Path.Combine(fullImagesetFolderPath, "daisy", "5794835_d15905c7c8_n.jpg")
1584+
ImagePath = Path.Combine(_fullImagesetFolderPath, "daisy", "5794835_d15905c7c8_n.jpg")
15921585
};
15931586

15941587
// Test rose image
15951588
ImageData secondImageToPredict = new ImageData
15961589
{
1597-
ImagePath = Path.Combine(fullImagesetFolderPath, "roses", "12240303_80d87f77a3_n.jpg")
1590+
ImagePath = Path.Combine(_fullImagesetFolderPath, "roses", "12240303_80d87f77a3_n.jpg")
15981591
};
15991592

16001593
var predictionFirst = predictionEngine.Predict(firstImageToPredict);
@@ -1624,25 +1617,13 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
16241617
[TensorFlowTheory]
16251618
[InlineData(ImageClassificationTrainer.EarlyStoppingMetric.Accuracy)]
16261619
[InlineData(ImageClassificationTrainer.EarlyStoppingMetric.Loss)]
1627-
// This test hangs occasionally
1628-
[Trait("Category", "SkipInCI")]
16291620
public void TensorFlowImageClassificationEarlyStopping(ImageClassificationTrainer.EarlyStoppingMetric earlyStoppingMetric)
16301621
{
1631-
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
1632-
"images");
1633-
1634-
//Download the image set and unzip
1635-
string finalImagesFolderName = DownloadImageSet(
1636-
imagesDownloadFolderPath);
1637-
1638-
string fullImagesetFolderPath = Path.Combine(
1639-
imagesDownloadFolderPath, finalImagesFolderName);
1640-
16411622
MLContext mlContext = new MLContext(seed: 1);
16421623

16431624
//Load all the original images info
16441625
IEnumerable<ImageData> images = LoadImagesFromDirectory(
1645-
folder: fullImagesetFolderPath, useFolderNameAsLabel: true);
1626+
folder: _fullImagesetFolderPath, useFolderNameAsLabel: true);
16461627

16471628
IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows(
16481629
mlContext.Data.LoadFromEnumerable(images), seed: 1);
@@ -1660,13 +1641,13 @@ public void TensorFlowImageClassificationEarlyStopping(ImageClassificationTraine
16601641
IDataView testDataset = trainTestData.TestSet;
16611642

16621643
int lastEpoch = 0;
1663-
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
1644+
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
16641645
.Fit(testDataset)
16651646
.Transform(testDataset);
16661647

16671648
// Check if the bottleneck cached values already exist
16681649
var (trainSetBottleneckCachedValuesFileName, validationSetBottleneckCachedValuesFileName,
1669-
workspacePath, isReuse) = getInitialParameters(ImageClassificationTrainer.Architecture.ResnetV2101, finalImagesFolderName);
1650+
workspacePath, isReuse) = getInitialParameters(ImageClassificationTrainer.Architecture.ResnetV2101, _finalImagesFolderName);
16701651

16711652

16721653

@@ -1692,7 +1673,7 @@ public void TensorFlowImageClassificationEarlyStopping(ImageClassificationTraine
16921673
ValidationSet = validationSet
16931674
};
16941675

1695-
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
1676+
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
16961677
.Append(mlContext.MulticlassClassification.Trainers.ImageClassification(options));
16971678

16981679
using var trainedModel = pipeline.Fit(trainDataset);

0 commit comments

Comments
 (0)