Skip to content

Commit c54086b

Browse files
authored
Remove IRowCursorConsolidator. (#1938)
1 parent 3188f1a commit c54086b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+270
-381
lines changed

src/Microsoft.ML.Core/Data/IDataView.cs

Lines changed: 53 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ internal interface ISchema
6363

6464
/// <summary>
6565
/// The input and output of Query Operators (Transforms). This is the fundamental data pipeline
66-
/// type, comparable to IEnumerable for LINQ.
66+
/// type, comparable to <see cref="IEnumerable{T}"/> for LINQ.
6767
/// </summary>
6868
public interface IDataView
6969
{
@@ -92,28 +92,28 @@ public interface IDataView
9292
RowCursor GetRowCursor(Func<int, bool> needCol, Random rand = null);
9393

9494
/// <summary>
95-
/// This constructs a set of parallel batch cursors. The value n is a recommended limit
96-
/// on cardinality. If <paramref name="n"/> is non-positive, this indicates that the caller
97-
/// has no recommendation, and the implementation should have some default behavior to cover
98-
/// this case. Note that this is strictly a recommendation: it is entirely possible that
99-
/// an implementation can return a different number of cursors.
95+
/// This constructs a set of parallel batch cursors. The value <paramref name="n"/> is a recommended limit on
96+
/// cardinality. If <paramref name="n"/> is non-positive, this indicates that the caller has no recommendation,
97+
/// and the implementation should have some default behavior to cover this case. Note that this is strictly a
98+
/// recommendation: it is entirely possible that an implementation can return a different number of cursors.
10099
///
101100
/// The cursors should return the same data as returned through
102-
/// <see cref="GetRowCursor(Func{int, bool}, Random)"/>, except partitioned: no two cursors
103-
/// should return the "same" row as would have been returned through the regular serial cursor,
104-
/// but all rows should be returned by exactly one of the cursors returned from this cursor.
105-
/// The cursors can have their values reconciled downstream through the use of the
106-
/// <see cref="Row.Batch"/> property.
101+
/// <see cref="GetRowCursor(Func{int, bool}, Random)"/>, except partitioned: no two cursors should return the
102+
/// "same" row as would have been returned through the regular serial cursor, but all rows should be returned by
103+
/// exactly one of the cursors returned from this method. The cursors can have their values reconciled
104+
/// downstream through the use of the <see cref="Row.Batch"/> property.
105+
///
106+
/// The typical usage pattern is that a set of cursors is requested, each of them is then given to a set of
107+
/// working threads that consume from them independently; the results are then collated at
108+
/// the end by exploiting the ordering of the <see cref="Row.Batch"/> property described above. More typical
109+
/// scenarios will be content with pulling from the single serial cursor of
110+
/// <see cref="GetRowCursor(Func{int, bool}, Random)"/>.
107111
/// </summary>
108-
/// <param name="consolidator">This is an object that can be used to reconcile the
109-
/// returned array of cursors. When the array of cursors is of length 1, it is legal,
110-
/// indeed expected, that this parameter should be null.</param>
111112
/// <param name="needCol">The predicate, where a column is active if this returns true.</param>
112113
/// <param name="n">The suggested degree of parallelism.</param>
113114
/// <param name="rand">An optional <see cref="Random"/> instance; may be null.</param>
114115
/// <returns></returns>
115-
RowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolidator,
116-
Func<int, bool> needCol, int n, Random rand = null);
116+
RowCursor[] GetRowCursorSet(Func<int, bool> needCol, int n, Random rand = null);
117117

118118
/// <summary>
119119
/// Gets an instance of Schema.
@@ -122,20 +122,8 @@ RowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolidator,
122122
}
123123

124124
/// <summary>
125-
/// This is used to consolidate parallel cursors into a single cursor. The object that determines
126-
/// the number of cursors and splits the row "stream" provides the consolidator object.
127-
/// </summary>
128-
public interface IRowCursorConsolidator
129-
{
130-
/// <summary>
131-
/// Create a consolidated cursor from the given parallel cursor set.
132-
/// </summary>
133-
RowCursor CreateCursor(IChannelProvider provider, RowCursor[] inputs);
134-
}
135-
136-
/// <summary>
137-
/// Delegate type to get a value. This can used for efficient access to data in an IRow
138-
/// or IRowCursor.
125+
/// Delegate type to get a value. This can be used for efficient access to data in a <see cref="Row"/>
126+
/// or <see cref="RowCursor"/>.
139127
/// </summary>
140128
public delegate void ValueGetter<TValue>(ref TValue value);
141129

@@ -146,43 +134,54 @@ public interface IRowCursorConsolidator
146134
public abstract class Row : IDisposable
147135
{
148136
/// <summary>
149-
/// This is incremented when the underlying contents changes, giving clients a way to detect change.
150-
/// Generally it's -1 when the object is in an invalid state. In particular, for an <see cref="RowCursor"/>, this is -1
151-
/// when the <see cref="RowCursor.State"/> is <see cref="CursorState.NotStarted"/> or <see cref="CursorState.Done"/>.
137+
/// This is incremented when the underlying contents change, giving clients a way to detect change. Generally
138+
/// it's -1 when the object is in an invalid state. In particular, for a <see cref="RowCursor"/>, this is -1
139+
/// when the <see cref="RowCursor.State"/> is <see cref="CursorState.NotStarted"/> or <see
140+
/// cref="CursorState.Done"/>.
152141
///
153-
/// Note that this position is not position within the underlying data, but position of this cursor only.
154-
/// If one, for example, opened a set of parallel streaming cursors, or a shuffled cursor, each such cursor's
155-
/// first valid entry would always have position 0.
142+
/// Note that this position is not position within the underlying data, but position of this cursor only. If
143+
/// one, for example, opened a set of parallel streaming cursors, or a shuffled cursor, each such cursor's first
144+
/// valid entry would always have position 0.
156145
/// </summary>
157146
public abstract long Position { get; }
158147

159148
/// <summary>
160-
/// This provides a means for reconciling multiple streams of counted things. Generally, in each stream,
161-
/// batch numbers should be non-decreasing. Furthermore, any given batch number should only appear in one
162-
/// of the streams. Order is determined by batch number. The reconciler ensures that each stream (that is
163-
/// still active) has at least one item available, then takes the item with the smallest batch number.
149+
/// This provides a means for reconciling multiple rows that have been produced generally from
150+
/// <see cref="IDataView.GetRowCursorSet(Func{int, bool}, int, Random)"/>. When getting a set, there is a need
151+
/// to, while allowing parallel processing to proceed, always have an aim that the original order should be
152+
/// recoverable. Note, whether or not a user cares about that original order in one's specific application is
153+
/// another story altogether (most callers of this as a practical matter do not, otherwise they would not call
154+
/// it), but at least in principle it should be possible to reconstruct the original order one would get from an
155+
/// identically configured <see cref="IDataView.GetRowCursor(Func{int, bool}, Random)"/>. So: for any cursor
156+
/// implementation, batch numbers should be non-decreasing. Furthermore, any given batch number should only
157+
/// appear in one of the cursors as returned by
158+
/// <see cref="IDataView.GetRowCursorSet(Func{int, bool}, int, Random)"/>. In this way, order is determined by
159+
/// batch number. An operation that reconciles these cursors to produce a consistent single cursoring could do
160+
/// so by drawing from the single cursor, among all cursors in the set, that has the smallest batch number
161+
/// available.
164162
///
165-
/// Note that there is no suggestion that the batches for a particular entry will be consistent from
166-
/// cursoring to cursoring, except for the consistency in resulting in the same overall ordering. The same
167-
/// entry could have different batch numbers from one cursoring to another. There is also no requirement
168-
/// that any given batch number must appear, at all.
163+
/// Note that there is no suggestion that the batches for a particular entry will be consistent from cursoring
164+
/// to cursoring, except for the consistency in resulting in the same overall ordering. The same entry could
165+
/// have different batch numbers from one cursoring to another. There is also no requirement that any given
166+
/// batch number must appear, at all. It is merely a mechanism for recovering ordering from a possibly arbitrary
167+
/// partitioning of the data. It also follows from this, of course, that considering the batch to be a property
168+
/// of the data is completely invalid.
169169
/// </summary>
170170
public abstract long Batch { get; }
171171

172172
/// <summary>
173173
/// A getter for a 128-bit ID value. It is common for objects to serve multiple <see cref="Row"/>
174174
/// instances to iterate over what is supposed to be the same data, for example, in an <see cref="IDataView"/>
175-
/// a cursor set will produce the same data as a serial cursor, just partitioned, and a shuffled cursor
176-
/// will produce the same data as a serial cursor or any other shuffled cursor, only shuffled. The ID
177-
/// exists for applications that need to reconcile which entry is actually which. Ideally this ID should
178-
/// be unique, but for practical reasons, it suffices if collisions are simply extremely improbable.
175+
/// a cursor set will produce the same data as a serial cursor, just partitioned, and a shuffled cursor will
176+
/// produce the same data as a serial cursor or any other shuffled cursor, only shuffled. The ID exists for
177+
/// applications that need to reconcile which entry is actually which. Ideally this ID should be unique, but for
178+
/// practical reasons, it suffices if collisions are simply extremely improbable.
179179
///
180-
/// Note that this ID, while it must be consistent for multiple streams according to the semantics
181-
/// above, is not considered part of the data per se. So, to take the example of a data view specifically,
182-
/// a single data view must render consistent IDs across all cursorings, but there is no suggestion at
183-
/// all that if the "same" data were presented in a different data view (as by, say, being transformed,
184-
/// cached, saved, or whatever), that the IDs between the two different data views would have any
185-
/// discernable relationship.</summary>
180+
/// Note that this ID, while it must be consistent for multiple streams according to the semantics above, is not
181+
/// considered part of the data per se. So, to take the example of a data view specifically, a single data view
182+
/// must render consistent IDs across all cursorings, but there is no suggestion at all that if the "same" data
183+
/// were presented in a different data view (as by, say, being transformed, cached, saved, or whatever), that
184+
/// the IDs between the two different data views would have any discernable relationship.</summary>
186185
public abstract ValueGetter<RowId> GetIdGetter();
187186

188187
/// <summary>

0 commit comments

Comments
 (0)