Skip to content

Commit 5c3ac8b

Browse files
Prashanth Govindarajan and gfoidl authored
Improve LoadCsv to handle null values when deducing the column types (dotnet#2916)
* Unit test to repro
* Fix dotnet/corefxlab#2915
  Append a null value to a column when encountering it instead of changing the column type to a StringDataFrameColumn
* Update src/Microsoft.Data.Analysis/DataFrame.IO.cs
  Co-authored-by: Günther Foidl <[email protected]>
* Update src/Microsoft.Data.Analysis/DataFrame.cs
  Co-authored-by: Günther Foidl <[email protected]>
* Feedback
  Co-authored-by: Günther Foidl <[email protected]>
1 parent 28140bd commit 5c3ac8b

File tree

3 files changed

+191
-20
lines changed

3 files changed

+191
-20
lines changed

src/Microsoft.Data.Analysis/DataFrame.IO.cs

+6
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ private static Type GuessKind(int col, List<string[]> read)
2323
throw new FormatException(string.Format(Strings.LessColumnsThatExpected, nbline + 1));
2424

2525
string val = line[col];
26+
27+
if (string.Equals(val, "null", StringComparison.OrdinalIgnoreCase))
28+
{
29+
continue;
30+
}
31+
2632
bool boolParse = bool.TryParse(val, out bool boolResult);
2733
if (boolParse)
2834
{

src/Microsoft.Data.Analysis/DataFrame.cs

+27-20
Original file line numberDiff line numberDiff line change
@@ -463,35 +463,42 @@ public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
463463
bool columnMoveNext = columnEnumerator.MoveNext();
464464
if (row != null)
465465
{
466-
// Go through row first to make sure there are no data type incompatibilities
467-
IEnumerator<object> rowEnumerator = row.GetEnumerator();
468-
bool rowMoveNext = rowEnumerator.MoveNext();
469-
List<object> cachedObjectConversions = new List<object>();
470-
while (columnMoveNext && rowMoveNext)
466+
// Go through row first to make sure there are no data type incompatibilities
467+
IEnumerator<object> rowEnumerator = row.GetEnumerator();
468+
bool rowMoveNext = rowEnumerator.MoveNext();
469+
List<object> cachedObjectConversions = new List<object>();
470+
while (columnMoveNext && rowMoveNext)
471+
{
472+
DataFrameColumn column = columnEnumerator.Current;
473+
object value = rowEnumerator.Current;
474+
// StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
475+
if (value is string stringValue)
471476
{
472-
DataFrameColumn column = columnEnumerator.Current;
473-
object value = rowEnumerator.Current;
474-
// StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
475-
if (value is string stringValue && string.IsNullOrEmpty(stringValue) && column.DataType != typeof(string))
477+
if (stringValue.Length == 0 && column.DataType != typeof(string))
476478
{
477479
value = null;
478480
}
479-
if (value != null)
481+
else if (stringValue.Equals("null", StringComparison.OrdinalIgnoreCase))
480482
{
481-
value = Convert.ChangeType(value, column.DataType);
482-
if (value is null)
483-
{
484-
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
485-
}
483+
value = null;
486484
}
487-
cachedObjectConversions.Add(value);
488-
columnMoveNext = columnEnumerator.MoveNext();
489-
rowMoveNext = rowEnumerator.MoveNext();
490485
}
491-
if (rowMoveNext)
486+
if (value != null)
492487
{
493-
throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
488+
value = Convert.ChangeType(value, column.DataType);
489+
if (value is null)
490+
{
491+
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
492+
}
494493
}
494+
cachedObjectConversions.Add(value);
495+
columnMoveNext = columnEnumerator.MoveNext();
496+
rowMoveNext = rowEnumerator.MoveNext();
497+
}
498+
if (rowMoveNext)
499+
{
500+
throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
501+
}
495502
// Reset the enumerators
496503
columnEnumerator = ret.Columns.GetEnumerator();
497504
columnMoveNext = columnEnumerator.MoveNext();

tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs

+158
Original file line numberDiff line numberDiff line change
@@ -445,5 +445,163 @@ Stream GetStream(string streamData)
445445
VerifyColumnTypes(df);
446446

447447
}
448+
449+
/// <summary>
/// Loads a CSV in which every data cell is the literal text "null" (in mixed casing)
/// and checks that the frame has 6 rows and 4 string-typed columns whose values are all null.
/// </summary>
[Fact]
public void TestReadCsvWithAllNulls()
{
    // The continuation lines of the verbatim string must stay at column 0:
    // any leading whitespace would become part of the CSV data.
    string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs
null,null,null,null
Null,Null,Null,Null
null,null,null,null
Null,Null,Null,Null
null,null,null,null
null,null,null,null";

    DataFrame df;
    // Dispose the backing stream deterministically once the frame has been loaded.
    using (Stream csvStream = new MemoryStream(Encoding.Default.GetBytes(data)))
    {
        df = DataFrame.LoadCsv(csvStream);
    }

    Assert.Equal(6, df.Rows.Count);
    Assert.Equal(4, df.Columns.Count);

    // Assert.Equal (rather than Assert.True(a == b)) reports expected vs. actual on failure.
    Assert.Equal(typeof(string), df.Columns[0].DataType);
    Assert.Equal(typeof(string), df.Columns[1].DataType);
    Assert.Equal(typeof(string), df.Columns[2].DataType);
    Assert.Equal(typeof(string), df.Columns[3].DataType);

    Assert.Equal("vendor_id", df.Columns[0].Name);
    Assert.Equal("rate_code", df.Columns[1].Name);
    Assert.Equal("passenger_count", df.Columns[2].Name);
    Assert.Equal("trip_time_in_secs", df.Columns[3].Name);
    VerifyColumnTypes(df);

    // Every one of the 6 rows must have been stored as a null in every column.
    foreach (var column in df.Columns)
    {
        Assert.Equal(6, column.NullCount);
        foreach (var value in column)
        {
            Assert.Null(value);
        }
    }
}
488+
489+
/// <summary>
/// Loads a CSV that mixes real values, the literal text "null"/"Null" and empty cells while
/// passing explicit column types (string, short, int, long), then checks the resulting
/// column types, names, null counts and individual null cells.
/// </summary>
[Fact]
public void TestReadCsvWithNullsAndDataTypes()
{
    // The continuation lines of the verbatim string must stay at column 0:
    // any leading whitespace would become part of the CSV data.
    string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs
null,1,1,1271
CMT,Null,1,474
CMT,1,null,637
Null,,,
,,,
CMT,1,1,null";

    DataFrame df;
    // Dispose the backing stream deterministically once the frame has been loaded.
    using (Stream csvStream = new MemoryStream(Encoding.Default.GetBytes(data)))
    {
        df = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(short), typeof(int), typeof(long) });
    }

    Assert.Equal(6, df.Rows.Count);
    Assert.Equal(4, df.Columns.Count);

    // Assert.Equal (rather than Assert.True(a == b)) reports expected vs. actual on failure.
    Assert.Equal(typeof(string), df.Columns[0].DataType);
    Assert.Equal(typeof(short), df.Columns[1].DataType);
    Assert.Equal(typeof(int), df.Columns[2].DataType);
    Assert.Equal(typeof(long), df.Columns[3].DataType);

    Assert.Equal("vendor_id", df.Columns[0].Name);
    Assert.Equal("rate_code", df.Columns[1].Name);
    Assert.Equal("passenger_count", df.Columns[2].Name);
    Assert.Equal("trip_time_in_secs", df.Columns[3].Name);
    VerifyColumnTypes(df);

    // The string column keeps the empty cell of row 4 as "", so it has only the two "null"
    // cells as nulls; each other column counts its "null" cell plus two empty cells.
    foreach (var column in df.Columns)
    {
        long expectedNullCount = column.DataType != typeof(string) ? 3 : 2;
        Assert.Equal(expectedNullCount, column.NullCount);
    }

    // Row 3 is "Null,,,": null in every column.
    var nullRow = df.Rows[3];
    Assert.Null(nullRow[0]);
    Assert.Null(nullRow[1]);
    Assert.Null(nullRow[2]);
    Assert.Null(nullRow[3]);

    // Row 4 is ",,,": an empty string in the string column, null in the others.
    nullRow = df.Rows[4];
    Assert.Equal("", nullRow[0]);
    Assert.Null(nullRow[1]);
    Assert.Null(nullRow[2]);
    Assert.Null(nullRow[3]);

    // The individual "null"/"Null" cells in the remaining rows.
    Assert.Null(df[0, 0]);
    Assert.Null(df[1, 1]);
    Assert.Null(df[2, 2]);
    Assert.Null(df[5, 3]);
}
547+
548+
/// <summary>
/// Loads a CSV that mixes real values, the literal text "null"/"Null" and empty cells without
/// passing column types, then checks that the first column is typed as string and the other
/// three as float, along with column names, null counts and individual null cells.
/// </summary>
[Fact]
public void TestReadCsvWithNulls()
{
    // The continuation lines of the verbatim string must stay at column 0:
    // any leading whitespace would become part of the CSV data.
    string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs
null,1,1,1271
CMT,Null,1,474
CMT,1,null,637
Null,,,
,,,
CMT,1,1,null";

    DataFrame df;
    // Dispose the backing stream deterministically once the frame has been loaded.
    using (Stream csvStream = new MemoryStream(Encoding.Default.GetBytes(data)))
    {
        df = DataFrame.LoadCsv(csvStream);
    }

    Assert.Equal(6, df.Rows.Count);
    Assert.Equal(4, df.Columns.Count);

    // Assert.Equal (rather than Assert.True(a == b)) reports expected vs. actual on failure.
    Assert.Equal(typeof(string), df.Columns[0].DataType);
    Assert.Equal(typeof(float), df.Columns[1].DataType);
    Assert.Equal(typeof(float), df.Columns[2].DataType);
    Assert.Equal(typeof(float), df.Columns[3].DataType);

    Assert.Equal("vendor_id", df.Columns[0].Name);
    Assert.Equal("rate_code", df.Columns[1].Name);
    Assert.Equal("passenger_count", df.Columns[2].Name);
    Assert.Equal("trip_time_in_secs", df.Columns[3].Name);
    VerifyColumnTypes(df);

    // The string column keeps the empty cell of row 4 as "", so it has only the two "null"
    // cells as nulls; each other column counts its "null" cell plus two empty cells.
    foreach (var column in df.Columns)
    {
        long expectedNullCount = column.DataType != typeof(string) ? 3 : 2;
        Assert.Equal(expectedNullCount, column.NullCount);
    }

    // Row 3 is "Null,,,": null in every column.
    var nullRow = df.Rows[3];
    Assert.Null(nullRow[0]);
    Assert.Null(nullRow[1]);
    Assert.Null(nullRow[2]);
    Assert.Null(nullRow[3]);

    // Row 4 is ",,,": an empty string in the string column, null in the others.
    nullRow = df.Rows[4];
    Assert.Equal("", nullRow[0]);
    Assert.Null(nullRow[1]);
    Assert.Null(nullRow[2]);
    Assert.Null(nullRow[3]);

    // The individual "null"/"Null" cells in the remaining rows.
    Assert.Null(df[0, 0]);
    Assert.Null(df[1, 1]);
    Assert.Null(df[2, 2]);
    Assert.Null(df[5, 3]);
}
448606
}
449607
}

0 commit comments

Comments
 (0)