From 2cb141b79f99966eea79a2bc58de4ff2648bd141 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 14 Jan 2019 13:16:30 -0800 Subject: [PATCH 1/4] Remove ISchema in BinaryLoader --- .../DataLoadSave/Binary/BinaryLoader.cs | 123 +++++++----------- 1 file changed, 47 insertions(+), 76 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs index 94e59e3433..2c7e78f642 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs @@ -472,6 +472,11 @@ private abstract class MetadataTableOfContentsEntry protected readonly BinaryLoader Parent; + /// + /// Retuen the getter to the stored entry value as . + /// + public abstract Delegate GetGetter(); + protected MetadataTableOfContentsEntry(BinaryLoader parent, string kind, CompressionKind compression, long blockOffset, long blockSize) { @@ -541,6 +546,8 @@ public ImplDead(BinaryLoader parent, string kind, IValueCodec codec, { _codec = codec; } + + public override Delegate GetGetter() => null; } private sealed class ImplOne : MetadataTableOfContentsEntry @@ -551,10 +558,10 @@ public ImplOne(BinaryLoader parent, string kind, IValueCodec codec, { } - public override void Get(ref T value) + public override Delegate GetGetter() { - EnsureValue(); - value = Value; + ValueGetter getter = (ref T value) => value = Value; + return getter; } } @@ -566,10 +573,10 @@ public ImplVec(BinaryLoader parent, string kind, IValueCodec> codec, { } - public override void Get(ref VBuffer value) + public override Delegate GetGetter() { - EnsureValue(); - Value.CopyTo(ref value); + ValueGetter> getter = (ref VBuffer value) => Value.CopyTo(ref value); + return getter; } } } @@ -614,87 +621,51 @@ protected void EnsureValue() } } } - - public abstract void Get(ref T value); } - private sealed class SchemaImpl : ISchema + /// + /// This function returns output schema, , of by translating 
into + /// s. If a loads a text column from the input file, its + /// should contain a with as its . + /// + /// of loaded file. + private Schema ComputeOutputSchema() { - private readonly TableOfContentsEntry[] _toc; - private readonly Dictionary _name2col; - private readonly IExceptionContext _ectx; + var schemaBuilder = new SchemaBuilder(); - public SchemaImpl(BinaryLoader parent) + for(int i = 0; i < _aliveColumns.Length; ++i) { - Contracts.AssertValue(parent, "parent"); - Contracts.AssertValue(parent._host, "parent"); - _ectx = parent._host; + // Information of a column loaded from a binary file. + var loadedColumn = _aliveColumns[i]; + // Metadata fields of the loaded column. + var metadataArray = loadedColumn.GetMetadataTocArray(); - _name2col = new Dictionary(); - _toc = parent._aliveColumns; - for (int c = 0; c < _toc.Length; ++c) - _name2col[_toc[c].Name] = c; - } - - public int ColumnCount { get { return _toc.Length; } } - - public bool TryGetColumnIndex(string name, out int col) - { - _ectx.CheckValueOrNull(name); - if (name == null) + if (Utils.Size(metadataArray) > 0) { - col = default(int); - return false; + // We got some metadata fields here. 
+ var metadataBuilder = new MetadataBuilder(); + foreach(var loadedMetadataColumn in metadataArray) + { + var metadataGetter = loadedMetadataColumn.GetGetter(); + if (metadataGetter == null) + throw MetadataUtils.ExceptGetMetadata(); + metadataBuilder.Add(loadedMetadataColumn.Kind, loadedMetadataColumn.Codec.Type, metadataGetter); + } + schemaBuilder.AddColumn(loadedColumn.Name, loadedColumn.Type, metadataBuilder.GetMetadata()); } - return _name2col.TryGetValue(name, out col); - } - - public string GetColumnName(int col) - { - _ectx.CheckParam(0 <= col && col < ColumnCount, nameof(col)); - return _toc[col].Name; - } - - public ColumnType GetColumnType(int col) - { - _ectx.CheckParam(0 <= col && col < ColumnCount, nameof(col)); - return _toc[col].Type; - } - - public IEnumerable> GetMetadataTypes(int col) - { - _ectx.CheckParam(0 <= col && col < ColumnCount, nameof(col)); - var metadatas = _toc[col].GetMetadataTocArray(); - if (Utils.Size(metadatas) > 0) - return metadatas.Select(e => new KeyValuePair(e.Kind, e.Codec.Type)); - return Enumerable.Empty>(); - } - - public ColumnType GetMetadataTypeOrNull(string kind, int col) - { - _ectx.CheckNonEmpty(kind, nameof(kind)); - _ectx.CheckParam(0 <= col && col < ColumnCount, nameof(col)); - var entry = _toc[col].GetMetadataTocEntryOrNull(kind); - return entry == null ? null : entry.Codec.Type; + else + // This case has no metadata. 
+ schemaBuilder.AddColumn(loadedColumn.Name, loadedColumn.Type); } - public void GetMetadata(string kind, int col, ref TValue value) - { - _ectx.CheckNonEmpty(kind, nameof(kind)); - _ectx.CheckParam(0 <= col && col < ColumnCount, nameof(col)); - - var entry = _toc[col].GetMetadataTocEntryOrNull(kind) as MetadataTableOfContentsEntry; - if (entry == null) - throw MetadataUtils.ExceptGetMetadata(); - entry.Get(ref value); - } + return schemaBuilder.GetSchema(); } private readonly Stream _stream; private readonly BinaryReader _reader; private readonly CodecFactory _factory; private readonly Header _header; - private readonly Schema _schema; + private readonly Schema _outputSchema; private readonly bool _autodeterminedThreads; private readonly int _threads; private readonly string _generatedRowIndexName; @@ -757,7 +728,7 @@ public void GetMetadata(string kind, int col, ref TValue value) /// private const ulong ReaderFirstVersion = 0x0001000100010002; - public Schema Schema { get { return _schema; } } + public Schema Schema { get { return _outputSchema; } } private long RowCount { get { return _header.RowCount; } } @@ -805,8 +776,8 @@ private BinaryLoader(Arguments args, IHost host, Stream stream, bool leaveOpen) _threads = Math.Max(1, args.Threads ?? (Environment.ProcessorCount / 2)); _generatedRowIndexName = string.IsNullOrWhiteSpace(args.RowIndexName) ? 
null : args.RowIndexName; InitToc(ch, out _aliveColumns, out _deadColumns, out _rowsPerBlock, out _tocEndLim); - _schema = Schema.Create(new SchemaImpl(this)); - _host.Assert(_schema.Count == Utils.Size(_aliveColumns)); + _outputSchema = ComputeOutputSchema(); + _host.Assert(_outputSchema.Count == Utils.Size(_aliveColumns)); _bufferCollection = new MemoryStreamCollection(); if (Utils.Size(_deadColumns) > 0) ch.Warning("BinaryLoader does not know how to interpret {0} columns", Utils.Size(_deadColumns)); @@ -894,8 +865,8 @@ private BinaryLoader(IHost host, ModelLoadContext ctx, Stream stream) _header = InitHeader(); InitToc(ch, out _aliveColumns, out _deadColumns, out _rowsPerBlock, out _tocEndLim); - _schema = Schema.Create(new SchemaImpl(this)); - ch.Assert(_schema.Count == Utils.Size(_aliveColumns)); + _outputSchema = ComputeOutputSchema(); + ch.Assert(_outputSchema.Count == Utils.Size(_aliveColumns)); _bufferCollection = new MemoryStreamCollection(); if (Utils.Size(_deadColumns) > 0) ch.Warning("BinaryLoader does not know how to interpret {0} columns", Utils.Size(_deadColumns)); From 49fa1f61824517754fe83dbded68feb6cc0a05c4 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 14 Jan 2019 13:27:28 -0800 Subject: [PATCH 2/4] Fix a doc string --- src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs index 2c7e78f642..6fda32f263 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs @@ -473,7 +473,9 @@ private abstract class MetadataTableOfContentsEntry protected readonly BinaryLoader Parent; /// - /// Retuen the getter to the stored entry value as . + /// Return to the stored entry value as . An example of stored value is + /// . For implementations of , see , + /// , and . 
/// public abstract Delegate GetGetter(); From c1e1ae3c52adca31fc4daa7b1f12395a01f1ce0a Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 14 Jan 2019 15:26:51 -0800 Subject: [PATCH 3/4] Clean some fields --- src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs index 6fda32f263..6f49c5250f 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs @@ -730,13 +730,13 @@ private Schema ComputeOutputSchema() /// private const ulong ReaderFirstVersion = 0x0001000100010002; - public Schema Schema { get { return _outputSchema; } } + public Schema Schema => _outputSchema; - private long RowCount { get { return _header.RowCount; } } + private long RowCount => _header.RowCount; - public long? GetRowCount() { return RowCount; } + public long? 
GetRowCount() => RowCount; - public bool CanShuffle { get { return true; } } + public bool CanShuffle => true; internal const string Summary = "Loads native Binary IDV data file."; internal const string LoadName = "BinaryLoader"; From b3caf2597894627ad16c511f6c0296a48e1c2074 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 14 Jan 2019 16:31:51 -0800 Subject: [PATCH 4/4] Fix build --- src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs index 6f49c5250f..54d349ab33 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs @@ -562,6 +562,7 @@ public ImplOne(BinaryLoader parent, string kind, IValueCodec codec, public override Delegate GetGetter() { + EnsureValue(); ValueGetter getter = (ref T value) => value = Value; return getter; } @@ -577,6 +578,7 @@ public ImplVec(BinaryLoader parent, string kind, IValueCodec> codec, public override Delegate GetGetter() { + EnsureValue(); ValueGetter> getter = (ref VBuffer value) => Value.CopyTo(ref value); return getter; } @@ -601,6 +603,10 @@ protected MetadataTableOfContentsEntry(BinaryLoader parent, string kind, IValueC _codec = codec; } + /// + /// By calling , we make sure 's content is definitely loaded. + /// Without calling , could be the default value of its type. + /// protected void EnsureValue() { if (!_fetched)