Skip to content

Remove NoMetadataSchema and make its relatives not ISchema #2080

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 10, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 61 additions & 127 deletions src/Microsoft.ML.Data/DataView/Transposer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -784,13 +784,13 @@ private sealed class DataViewSlicer : IDataView
// For each output column, indicates what output column it's surfacing
// from the splitter.
private readonly int[] _colToSplitCol;
private readonly SchemaImpl _schema;

private readonly IHost _host;
public Schema Schema => _schema.AsSchema;

public bool CanShuffle { get { return _input.CanShuffle; } }

public Schema Schema { get; }

public DataViewSlicer(IHost host, IDataView input, int[] toSlice)
{
Contracts.AssertValue(host, "host");
Expand All @@ -810,27 +810,48 @@ public DataViewSlicer(IHost host, IDataView input, int[] toSlice)
{
var splitter = _splitters[c] = Splitter.Create(_input, toSlice[c]);
_host.Assert(splitter.ColumnCount >= 1);
// One splitter can produce multiple columns because it splits an input column into multiple output columns.
// _incolToLim[c] stores (the last output column index of the c-th splitter) + 1.
_incolToLim[c] = outputColumnCount += splitter.ColumnCount;
// toSlice[c] stores the input column index processed by the c-th splitter. In the output schema, we map an
// output column name to the last column produced by the associated splitter. For example, if input column
// "Features" (column index 5) is split into three output columns "Features" (column index 0), "Features"
// (column index 1), "Features" (column index 2), nameToCol["Features"] should return 2. Note that output column
// names are identical to their source column name.
nameToCol[_input.Schema[toSlice[c]].Name] = outputColumnCount - 1;
}
// Here outputColumnCount denotes the total number of columns produced by all splitters.
_colToSplitIndex = new int[outputColumnCount];
_colToSplitCol = new int[outputColumnCount];
// Below outputColumnCount becomes index of output columns. When outputColumnCount = 0, we process the first column
// in the output data.
outputColumnCount = 0;
// Iterate through all splitters. For each splitter, multiple output columns can be produced.
for (int c = 0; c < _splitters.Length; ++c)
{
int outCount = _splitters[c].ColumnCount;
// Iterate through all columns produced by the c-th splitter.
for (int i = 0; i < outCount; ++i)
{
// Output column indexed by outputColumnCount is produced by _splitters[c].
_colToSplitIndex[outputColumnCount] = c;
// Output column indexed by outputColumnCount is the i-th column in _splitters[c]'s output columns.
_colToSplitCol[outputColumnCount++] = i;
}
}
_host.Assert(outputColumnCount == _colToSplitIndex.Length);
_schema = new SchemaImpl(this, nameToCol);

// Sequentially concatenate output columns from all splitters to form output schema.
var schemaBuilder = new SchemaBuilder();
for (int c = 0; c < _splitters.Length; ++c)
schemaBuilder.AddColumns(_splitters[c].OutputSchema);
Schema = schemaBuilder.GetSchema();
}

/// <summary>
/// Reports the row count of the wrapped input view. Slicing only rearranges columns into
/// smaller pieces, so the number of rows is passed through unchanged.
/// </summary>
/// <returns>Whatever the input view's own <c>GetRowCount</c> returns.</returns>
public long? GetRowCount() => _input.GetRowCount();

Expand All @@ -849,6 +870,12 @@ public void InColToOutRange(int incol, out int outMin, out int outLim)
outLim = _incolToLim[incol];
}

/// <summary>
/// Given an output column index, find which splitter produces it and which splitter column is its source.
/// </summary>
/// <param name="col">An output column index</param>
/// <param name="splitInd"><see cref="_splitters"/>[splitInd] produces the specified output column.</param>
/// <param name="splitCol">The specified output column is the splitCol-th column among columns produced by <see cref="_splitters"/>[splitInd].</param>
private void OutputColumnToSplitterIndices(int col, out int splitInd, out int splitCol)
{
_host.Assert(0 <= col && col < _colToSplitIndex.Length);
Expand Down Expand Up @@ -895,7 +922,7 @@ private Func<int, bool> CreateInputPredicate(Func<int, bool> pred, out bool[] ac
{
var splitter = _splitters[i];
// Don't activate input source columns if none of the resulting columns were selected.
bool isActive = pred == null || Enumerable.Range(offset, splitter.AsSchema.Count).Any(c => pred(c));
bool isActive = pred == null || Enumerable.Range(offset, splitter.OutputSchema.Count).Any(c => pred(c));
if (isActive)
{
activeSplitters[i] = isActive;
Expand All @@ -906,100 +933,25 @@ private Func<int, bool> CreateInputPredicate(Func<int, bool> pred, out bool[] ac
return activeSrcSet.Contains;
}

/// <summary>
/// Presents the columns of all <see cref="Splitter"/> instances held by a
/// <see cref="DataViewSlicer"/> as a single flat schema. Name lookups are answered from a
/// dictionary supplied at construction; names and types are fetched from the splitter
/// that produces the requested output column.
/// </summary>
private sealed class SchemaImpl : NoMetadataSchema
{
    private readonly DataViewSlicer _slicer;
    private readonly Dictionary<string, int> _nameToCol;

    public Schema AsSchema { get; }

    // One entry exists in _colToSplitIndex per output column.
    public override int ColumnCount => _slicer._colToSplitIndex.Length;

    public SchemaImpl(DataViewSlicer slicer, Dictionary<string, int> nameToCol)
    {
        Contracts.AssertValue(slicer);
        Contracts.AssertValue(nameToCol);
        _slicer = slicer;
        _nameToCol = nameToCol;
        // Wrap this instance only after both fields above have been assigned.
        AsSchema = Schema.Create(this);
    }

    public override bool TryGetColumnIndex(string name, out int col)
    {
        Contracts.CheckValueOrNull(name);
        return Utils.TryGetValue(_nameToCol, name, out col);
    }

    public override string GetColumnName(int col)
    {
        Contracts.CheckParam(0 <= col && col < ColumnCount, nameof(col));
        int columnInSplitter;
        Splitter owner = FindSplitter(col, out columnInSplitter);
        return owner.GetColumnName(columnInSplitter);
    }

    public override ColumnType GetColumnType(int col)
    {
        Contracts.CheckParam(0 <= col && col < ColumnCount, nameof(col));
        int columnInSplitter;
        Splitter owner = FindSplitter(col, out columnInSplitter);
        return owner.GetColumnType(columnInSplitter);
    }

    /// <summary>
    /// Resolves an output column index to the splitter that produces it, and reports the
    /// column's position within that splitter.
    /// </summary>
    private Splitter FindSplitter(int col, out int columnInSplitter)
    {
        int splitterIndex;
        _slicer.OutputColumnToSplitterIndices(col, out splitterIndex, out columnInSplitter);
        return _slicer._splitters[splitterIndex];
    }
}

/// <summary>
/// Minimal <see cref="ISchema"/> base for schemas that surface no metadata. Derived classes
/// supply the column count, names and types; each metadata query defined here validates the
/// column index and then reports that nothing is available.
/// </summary>
private abstract class NoMetadataSchema : ISchema
{
    public abstract int ColumnCount { get; }

    public abstract bool TryGetColumnIndex(string name, out int col);

    public abstract string GetColumnName(int col);

    public abstract ColumnType GetColumnType(int col);

    /// <summary>Always yields an empty sequence for a valid column index.</summary>
    public IEnumerable<KeyValuePair<string, ColumnType>> GetMetadataTypes(int col)
    {
        CheckColumnInRange(col);
        return Enumerable.Empty<KeyValuePair<string, ColumnType>>();
    }

    /// <summary>Always returns null for a valid column index.</summary>
    public ColumnType GetMetadataTypeOrNull(string kind, int col)
    {
        CheckColumnInRange(col);
        return null;
    }

    /// <summary>Always throws for a valid column index, since there is no metadata to fetch.</summary>
    public void GetMetadata<TValue>(string kind, int col, ref TValue value)
    {
        CheckColumnInRange(col);
        throw MetadataUtils.ExceptGetMetadata();
    }

    // Range check shared by the metadata members above.
    private void CheckColumnInRange(int col)
    {
        Contracts.CheckParam(0 <= col && col < ColumnCount, nameof(col));
    }
}

/// <summary>
/// There is one instance of these per column, implementing the possible splitting
/// of one column from a <see cref="IDataView"/> into multiple columns. The instance
/// describes the resulting split columns through its implementation of
/// <see cref="ISchema"/>, and then can be bound to an <see cref="Row"/> to provide
/// that splitting functionality.
/// describes the resulting split columns through <see cref="Splitter.OutputSchema"/>,
/// and then can be bound to an <see cref="Row"/> to provide that splitting functionality.
/// </summary>
private abstract class Splitter : NoMetadataSchema
private abstract class Splitter
{
private readonly IDataView _view;
private readonly int _col;
public abstract int ColumnCount { get; }

public int SrcCol { get { return _col; } }

public abstract Schema AsSchema { get; }
/// <summary>
/// Output schema of a splitter. A splitter takes a column from input data and then divides it into multiple columns
/// to form its output data.
/// </summary>
public abstract Schema OutputSchema { get; }

protected Splitter(IDataView view, int col)
{
Expand Down Expand Up @@ -1063,35 +1015,12 @@ private static Splitter CreateCore<T>(IDataView view, int col, int[] ends)
return new ColumnSplitter<T>(view, col, ends);
}

#region ISchema implementation
// Subclasses should implement ColumnCount and GetColumnType.
/// <summary>
/// Looks up a column by name. Every column produced by this splitter shares the source
/// column's name, so a matching name resolves to the index of the last produced column.
/// </summary>
/// <param name="name">Column name to look up; must be non-empty.</param>
/// <param name="col">Index of the last produced column on a match, otherwise the default value.</param>
/// <returns>True if <paramref name="name"/> equals the source column's name.</returns>
public override bool TryGetColumnIndex(string name, out int col)
{
    Contracts.CheckNonEmpty(name, nameof(name));
    bool isSourceName = name == _view.Schema[SrcCol].Name;
    // All produced columns pretend to share one name; report the last one when it matches.
    col = isSourceName ? ColumnCount - 1 : default(int);
    return isSourceName;
}

/// <summary>
/// Returns the name of the requested produced column, which is always the name of the
/// source column in the input view.
/// </summary>
/// <param name="col">Index of a column produced by this splitter.</param>
public override string GetColumnName(int col)
{
    Contracts.CheckParam(col >= 0 && col < ColumnCount, nameof(col));
    var sourceColumn = _view.Schema[SrcCol];
    return sourceColumn.Name;
}
#endregion

private abstract class RowBase<TSplitter> : WrappingRow
where TSplitter : Splitter
{
protected readonly TSplitter Parent;

public sealed override Schema Schema => Parent.AsSchema;
public sealed override Schema Schema => Parent.OutputSchema;

public RowBase(TSplitter parent, Row input)
: base(input)
Expand All @@ -1112,19 +1041,26 @@ private sealed class NoSplitter<T> : Splitter
{
public override int ColumnCount => 1;

public override Schema AsSchema { get; }
public override Schema OutputSchema { get; }

/// <summary>
/// This is NoSplitter. Thus, the column indexed by col, which is supposed to be split, will just be copied to an output
/// column without splitting.
/// </summary>
/// <param name="view">Input data whose columns can be split.</param>
/// <param name="col">The selected column's index.</param>
public NoSplitter(IDataView view, int col)
: base(view, col)
{
Contracts.Assert(_view.Schema[col].Type.RawType == typeof(T));
AsSchema = Schema.Create(this);
}

public override ColumnType GetColumnType(int col)
{
Contracts.CheckParam(0 <= col && col < ColumnCount, nameof(col));
return _view.Schema[SrcCol].Type;
// The column selected for splitting.
var selectedColumn = _view.Schema[col];

var schemaBuilder = new SchemaBuilder();
// Just copy the selected column to output since no splitting happens.
schemaBuilder.AddColumn(selectedColumn.Name, selectedColumn.Type, selectedColumn.Metadata);
OutputSchema = schemaBuilder.GetSchema();
}

public override Row Bind(Row row, Func<int, bool> pred)
Expand Down Expand Up @@ -1171,7 +1107,7 @@ private sealed class ColumnSplitter<T> : Splitter
// Cache of the types of each slice.
private readonly VectorType[] _types;

public override Schema AsSchema { get; }
public override Schema OutputSchema { get; }

public override int ColumnCount { get { return _lims.Length; } }

Expand Down Expand Up @@ -1204,13 +1140,11 @@ public ColumnSplitter(IDataView view, int col, int[] lims)
for (int c = 1; c < _lims.Length; ++c)
_types[c] = new VectorType(type.ItemType, _lims[c] - _lims[c - 1]);

AsSchema = Schema.Create(this);
}

public override ColumnType GetColumnType(int col)
{
Contracts.CheckParam(0 <= col && col < ColumnCount, nameof(col));
return _types[col];
var selectedColumn = _view.Schema[col];
var schemaBuilder = new SchemaBuilder();
for (int c = 0; c < _lims.Length; ++c)
schemaBuilder.AddColumn(selectedColumn.Name, _types[c]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

schemaBuilder.AddColumn(selectedColumn.Name, _types[c]); [](start = 27, length = 57)

double-checking that omitting metadata is deliberate.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is reasonable because splitting a vector-valued column breaks most metadata. It's why previously Splitter was a NoMetadataSchema.


In reply to: 246463230 [](ancestors = 246463230)

OutputSchema = schemaBuilder.GetSchema();
}

public override Row Bind(Row row, Func<int, bool> pred)
Expand Down