Commit 911e38a

tf-transform-team authored and zoyahav committed
Project import generated by Copybara.
PiperOrigin-RevId: 190121533
1 parent 4b45bfe commit 911e38a

10 files changed: +143 −82

README.md (+2 −1)

```diff
@@ -44,7 +44,8 @@ our testing framework. Other combinations may also work, but are untested.
 
 |tensorflow-transform |tensorflow |apache-beam[gcp]|
 |--------------------------------------------------------------------------------|--------------|----------------|
-|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.3.0 |
+|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.4.0 |
+|[0.6.0](https://github.com/tensorflow/transform/blob/v0.6.0/RELEASE.md) |1.6 |2.4.0 |
 |[0.5.0](https://github.com/tensorflow/transform/blob/v0.5.0/RELEASE.md) |1.5 |2.3.0 |
 |[0.4.0](https://github.com/tensorflow/transform/blob/v0.4.0/RELEASE.md) |1.4 |2.2.0 |
 |[0.3.1](https://github.com/tensorflow/transform/blob/v0.3.1/RELEASE.md) |1.3 |2.1.1 |
```

RELEASE.md (+4 −1)

```diff
@@ -1,17 +1,20 @@
-# Current version (not yet released; still in development)
+# Release 0.6.0
 
 ## Major Features and Improvements
 
 ## Bug Fixes and Other Changes
+* Depends on `apache-beam[gcp]>=2.4,<3`.
 * Trim min/max value in `tft.bucketize` where the computed number of bucket
   boundaries is more than requested. Updated documentation to clearly indicate
   that the number of buckets is computed using approximate algorithms, and that
   computed number can be more or less than requested.
 * Change the namespace used for Beam metrics from `tensorflow_transform` to
   `tfx.Transform`.
 * Update Beam metrics to also log vocabulary sizes.
+* `CsvCoder` updated to support unicode.
 
 ## Breaking changes
+* Requires pre-installed TensorFlow >=1.6,<2.
 
 ## Deprecations
```
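For context on the `tft.bucketize` note above, a minimal usage sketch (the feature name `price` is hypothetical, not part of this commit): boundaries come from an approximate quantiles algorithm, so the realized bucket count may differ slightly from the request.

```python
import tensorflow_transform as tft

def preprocessing_fn(inputs):
  # The number of computed boundaries is approximate; it can be slightly
  # more or less than num_buckets, as the release note documents.
  return {'price_bucketized': tft.bucketize(inputs['price'], num_buckets=10)}
```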

setup.py (+7 −7)

```diff
@@ -17,19 +17,19 @@
 from setuptools import setup
 
 # Tensorflow transform version.
-__version__ = '0.6.0dev'
+__version__ = '0.6.0'
 
 
 def _make_required_install_packages():
   return [
-      'apache-beam[gcp]>=2.3,<3',
+      'apache-beam[gcp]>=2.4,<3',
+      'numpy>=1.10,<2',
 
-      # Protobuf libraries < 3.3 contain some map-related data corruption bugs
-      # (b/35874111).
-      'protobuf>=3.3,<4',
+      # Protobuf libraries < 3.5.2 do not have 'cpp' implementation of protobufs
+      # for Windows and Mac.
+      'protobuf>=3.5.2,<4',
 
-      # Six 1.11.0 incompatible with apitools.
-      'six>=1.9,<1.11',
+      'six>=1.9,<2',
   ]
```

tensorflow_transform/analyzers.py (+36 −25)
```diff
@@ -41,7 +41,7 @@
 
 # Named tuple with details for each output of an Analyzer.
 _AnalyzerOutputInfo = collections.namedtuple(
-    'AnalyzerOutputInfo', ['name', 'dtype', 'is_asset'])
+    'AnalyzerOutputInfo', ['name', 'is_asset'])
 
 
 # NOTE: this code is designed so that Analyzer is pickleable, and in particular
```
```diff
@@ -52,6 +52,10 @@
 # of a PTransform in our implementation of tf.Transform on Beam currently, so
 # we must avoid directly putting `Tensor`s inside `Analyzer`, and instead use
 # tensor names.
+#
+# Due to these pickling issues and also logical separation of TensorFlow and
+# numpy code, the spec should also not contain TensorFlow dtypes but rather
+# their numpy equivalent.
 class Analyzer(object):
   """An operation-like class for full-pass analyses of data.
```
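The comment added above is the theme of most of this file's changes: specs store numpy dtypes, and the TensorFlow-to-numpy conversion happens at the API boundary. A quick illustration of the correspondence (standard TensorFlow behavior, not code from this commit):

```python
import numpy as np
import tensorflow as tf

# Every tf.DType exposes its numpy equivalent; storing only the numpy type
# keeps the spec free of TensorFlow objects and trivially pickleable.
assert tf.float64.as_numpy_dtype == np.float64
assert tf.int64.as_numpy_dtype == np.int64
```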
```diff
@@ -91,7 +95,7 @@ def __init__(self, inputs, output_dtype_shape_and_is_asset, spec, name):
         raise ValueError(('Tensor {} cannot represent an asset, because it '
                           'is not a string.').format(output_tensor.name))
       self._output_infos.append(_AnalyzerOutputInfo(
-          output_tensor.name, output_tensor.dtype, is_asset))
+          output_tensor.name, is_asset))
     self._spec = spec
     tf.add_to_collection(ANALYZER_COLLECTION, self)
```
```diff
@@ -201,11 +205,18 @@ def combine_analyzer(inputs, output_dtypes, output_shapes, combiner_spec, name):
 
 
 class _NumPyCombinerSpec(CombinerSpec):
-  """Combines the PCollection only on the 0th dimension using nparray."""
+  """Combines the PCollection only on the 0th dimension using nparray.
+
+  Args:
+    fn: The numpy function representing the reduction to be done.
+    reduce_instance_dims: Whether to reduce across non-batch dimensions.
+    output_dtypes: The numpy dtype to cast each output to.
+  """
 
-  def __init__(self, fn, reduce_instance_dims):
+  def __init__(self, fn, reduce_instance_dims, output_dtypes):
     self._fn = fn
     self._reduce_instance_dims = reduce_instance_dims
+    self._output_dtypes = output_dtypes
 
   def create_accumulator(self):
     return None
```
```diff
@@ -232,7 +243,13 @@ def merge_accumulators(self, accumulators):
             for sub_accumulators in zip(*accumulators)]
 
   def extract_output(self, accumulator):
-    return accumulator
+    if accumulator is None:
+      return None
+    # For each output, cast that output to the specified type. Note there will
+    # be one output for each input tensor to the analyzer.
+    return [sub_accumulator.astype(output_dtype)
+            for sub_accumulator, output_dtype
+            in zip(accumulator, self._output_dtypes)]
 
 
 def _numeric_combine(inputs, fn, reduce_instance_dims=True, name=None):
```
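A self-contained sketch (illustrative values, not from the commit) of what the new `extract_output` does: each per-input accumulator is cast to the numpy dtype recorded for the corresponding input tensor.

```python
import numpy as np

accumulator = [np.array([1.5, 2.5]), np.array([3, 4])]  # one entry per input
output_dtypes = [np.float32, np.int64]                  # numpy dtypes, not tf
outputs = [sub.astype(dtype)
           for sub, dtype in zip(accumulator, output_dtypes)]
print([o.dtype for o in outputs])  # [dtype('float32'), dtype('int64')]
```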
```diff
@@ -266,11 +283,10 @@ def _numeric_combine(inputs, fn, reduce_instance_dims=True, name=None):
   # shape.
   shapes = [x.shape.as_list()[1:] if x.shape.dims is not None else None
             for x in inputs]
+  spec = _NumPyCombinerSpec(fn, reduce_instance_dims,
+                            [x.dtype.as_numpy_dtype for x in inputs])
   return combine_analyzer(
-      inputs,
-      [x.dtype for x in inputs],
-      shapes,
-      _NumPyCombinerSpec(fn, reduce_instance_dims),
+      inputs, [x.dtype for x in inputs], shapes, spec,
       name if name is not None else fn.__name__)
```
```diff
@@ -615,22 +631,17 @@ def quantiles(x, num_buckets, epsilon, name=None):
 
   with tf.name_scope(name, 'quantiles'):
     spec = _QuantilesSpec(epsilon, num_buckets)
-    quantile_boundaries = Analyzer(
+    return Analyzer(
         [x], [(spec.bucket_dtype, [1, None], False)], spec,
         'quantiles').outputs[0]
 
-  # The Analyzer returns a 2d matrix of 1*num_buckets. Below, we remove
-  # the first dimension and return the boundaries as a simple 1d list.
-  return quantile_boundaries[0:1]
-
 
 class _CovarianceCombinerSpec(CombinerSpec):
   """Combines the PCollection to compute the biased covariance matrix."""
 
-  def __init__(self, dtype=tf.float64):
+  def __init__(self, numpy_dtype=np.float64):
     """Store the dtype for np arrays/matrices for precision."""
-    self._output_dtype = dtype
-    self._np_dtype = dtype.as_numpy_dtype
+    self._numpy_dtype = numpy_dtype
 
   def create_accumulator(self):
     """Create an accumulator with all zero entries."""
```
```diff
@@ -663,9 +674,9 @@ def add_input(self, accumulator, batch_values):
     batch_cross_terms = np.matmul(
         np.transpose(batch_value),
         batch_value
-    ).astype(self._np_dtype)
+    ).astype(self._numpy_dtype)
 
-    batch_sum = np.array(np.sum(batch_value, axis=0), self._np_dtype)
+    batch_sum = np.array(np.sum(batch_value, axis=0), self._numpy_dtype)
     batch_count = np.shape(batch_value)[0]
 
     if accumulator is None:
```
```diff
@@ -725,7 +736,7 @@ def covariance(x, dtype, name=None):
   Args:
     x: A rank-2 `Tensor`, 0th dim are rows, 1st dim are indices in each input
       vector.
-    dtype: numpy dtype of entries in the returned matrix.
+    dtype: Tensorflow dtype of entries in the returned matrix.
     name: (Optional) A name for this operation.
 
   Raises:
```
```diff
@@ -743,17 +754,17 @@ def covariance(x, dtype, name=None):
   input_dim = x.shape.as_list()[1]
   shape = (input_dim, input_dim)
 
-  spec = _CovarianceCombinerSpec(dtype)
+  spec = _CovarianceCombinerSpec(dtype.as_numpy_dtype)
   return combine_analyzer(
       [x], [dtype], [shape], spec,
       name if name is not None else 'covariance')[0]
 
 
 class _PCACombinerSpec(_CovarianceCombinerSpec):
 
-  def __init__(self, output_dim=None, dtype=tf.float64):
+  def __init__(self, output_dim=None, numpy_dtype=np.float64):
     """Store pca output dimension, and dtype for precision."""
-    super(_PCACombinerSpec, self).__init__(dtype=dtype)
+    super(_PCACombinerSpec, self).__init__(numpy_dtype=numpy_dtype)
     self._output_dim = output_dim
 
   def extract_output(self, accumulator):
```
```diff
@@ -844,7 +855,7 @@ def pca(x, output_dim, dtype, name=None):
   Args:
     x: A rank-2 `Tensor`, 0th dim are rows, 1st dim are indices in row vectors.
     output_dim: The PCA output dimension (number of eigenvectors to return).
-    dtype: numpy dtype of entries in the returned matrix.
+    dtype: Tensorflow dtype of entries in the returned matrix.
     name: (Optional) A name for this operation.
 
   Raises:
```
```diff
@@ -862,7 +873,7 @@ def pca(x, output_dim, dtype, name=None):
   input_dim = x.shape.as_list()[1]
   shape = (input_dim, output_dim)
 
-  spec = _PCACombinerSpec(output_dim, dtype)
+  spec = _PCACombinerSpec(output_dim, dtype.as_numpy_dtype)
   return combine_analyzer(
       [x], [dtype], [shape], spec,
       name if name is not None else 'pca')[0]
```
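Note the calling convention these hunks preserve: the public `tft.covariance` and `tft.pca` still take a TensorFlow dtype, and the conversion to a numpy dtype now happens internally via `as_numpy_dtype` when the combiner spec is built. A hedged usage sketch (the feature name `embedding` is hypothetical):

```python
import tensorflow as tf
import tensorflow_transform as tft

def preprocessing_fn(inputs):
  x = inputs['embedding']  # a rank-2 Tensor; rows are instances
  return {
      'cov': tft.covariance(x, tf.float64),  # TF dtype at the API surface
      'pc': tft.pca(x, output_dim=2, dtype=tf.float64),
  }
```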

tensorflow_transform/beam/analyzer_impls.py (+17 −13)
```diff
@@ -23,7 +23,6 @@
 
 import apache_beam as beam
 
-from apache_beam.typehints import Any
 from apache_beam.typehints import KV
 from apache_beam.typehints import List
 from apache_beam.typehints import with_input_types
```
```diff
@@ -64,16 +63,14 @@ def _maybe_deserialize_tf_config(serialized_tf_config):
 
 
 @with_input_types(List[np.ndarray])
-@with_output_types(List[Any])
+@with_output_types(List[np.ndarray])
 class _AnalyzerImpl(beam.PTransform):
   """PTransform that implements a given analyzer.
 
   _AnalyzerImpl accepts a PCollection where each element is a list of ndarrays.
   Each element in this list contains a batch of values for the corresponding
   input tensor of the analyzer. _AnalyzerImpl returns a PCollection containing a
-  single element which is a list of values. Each element should be convertible
-  to an ndarray via np.asarray, and the converted value will be the
-  corresponding output tensor of the analyzer.
+  single element which is a list of `ndarray`s.
 
   _AnalyzerImpl dispatches to an implementation transform, with the same
   signature as _AnalyzerImpl.
```
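The `List[Any]` to `List[np.ndarray]` switch repeats throughout this file. A minimal standalone sketch of the decorator pattern in use (the `_DoubleEach` transform is hypothetical, not part of the commit):

```python
import apache_beam as beam
import numpy as np
from apache_beam.typehints import List
from apache_beam.typehints import with_input_types
from apache_beam.typehints import with_output_types

@with_input_types(List[np.ndarray])
@with_output_types(List[np.ndarray])
class _DoubleEach(beam.PTransform):
  """Declares exact element types so Beam can type-check the pipeline."""

  def expand(self, pcoll):
    # Each element is a list of ndarrays; double every array in place.
    return pcoll | beam.Map(lambda arrays: [a * 2 for a in arrays])
```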
```diff
@@ -106,7 +103,7 @@ def _flatten_value_to_list(batch_values):
 
 
 @with_input_types(List[np.ndarray])
-@with_output_types(List[Any])
+@with_output_types(List[np.ndarray])
 class _UniquesAnalyzerImpl(beam.PTransform):
   """Saves the unique elements in a PCollection of batches."""
```
```diff
@@ -196,15 +193,15 @@ def order_by_decreasing_counts(ignored, counts_iter, store_frequency):
     # Return the vocabulary path.
     wait_for_vocabulary_transform = (
         pcoll.pipeline
-        | 'CreatePath' >> beam.Create([[vocabulary_file]])
+        | 'CreatePath' >> beam.Create([[np.array(vocabulary_file)]])
         # Ensure that the analysis returns only after the file is written.
        | 'WaitForVocabularyFile' >> beam.Map(
             lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
     return wait_for_vocabulary_transform
 
 
 @with_input_types(List[np.ndarray])
-@with_output_types(List[Any])
+@with_output_types(List[np.ndarray])
 class _ComputeQuantiles(beam.CombineFn):
   """Computes quantiles on the PCollection.
```
```diff
@@ -213,9 +210,11 @@ class _ComputeQuantiles(beam.CombineFn):
   see also http://web.cs.ucla.edu/~weiwang/paper/SSDBM07_2.pdf
   """
 
-  def __init__(self, num_quantiles, epsilon, serialized_tf_config=None):
+  def __init__(self, num_quantiles, epsilon, bucket_dtype,
+               serialized_tf_config=None):
     self._num_quantiles = num_quantiles
     self._epsilon = epsilon
+    self._bucket_dtype = bucket_dtype
     self._serialized_tf_config = serialized_tf_config
 
     # _stamp_token is used to commit the state of the qaccumulator. In
```
```diff
@@ -297,7 +296,8 @@ def merge_accumulators(self, summaries):
 
   def extract_output(self, summary):
     if summary is self._empty_summary:
-      return [[[]]]
+      # Return an empty (1, 0) ndarray using np.zeros.
+      return [np.zeros(shape=(1, 0), dtype=self._bucket_dtype)]
 
     # All relevant state about the input is captured by 'summary'
     # (see comment in add_input() and merge_accumulators()).
```
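This hunk and the next one enforce the same output convention: the quantiles combiner always emits a single `(1, num_boundaries)` ndarray, with `(1, 0)` in the empty case, instead of nested Python lists. A quick illustration with assumed values:

```python
import numpy as np

buckets = np.array([0.25, 0.5, 0.75])    # 1-D boundaries from the summary
print(np.expand_dims(buckets, 0).shape)  # (1, 3): the non-empty output shape
print(np.zeros(shape=(1, 0)).shape)      # (1, 0): the empty-summary case
```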
```diff
@@ -330,11 +330,14 @@ def extract_output(self, summary):
       # Do not trim min/max, these are part of requested boundaries.
       pass
 
-    return [[buckets]]
+    # Convert to a (1, ?) shape array.
+    buckets = np.expand_dims(buckets, 0)
+
+    return [buckets]
 
 
 @with_input_types(List[np.ndarray])
-@with_output_types(List[Any])
+@with_output_types(List[np.ndarray])
 class _QuantilesAnalyzerImpl(beam.PTransform):
   """Computes the quantile buckets in a PCollection of batches."""
```
350353
_ComputeQuantiles(
351354
num_quantiles=self._spec.num_buckets,
352355
epsilon=self._spec.epsilon,
356+
bucket_dtype=self._spec.bucket_dtype.as_numpy_dtype,
353357
serialized_tf_config=serialized_tf_config)))
354358

355359

356360
@with_input_types(List[np.ndarray])
357-
@with_output_types(List[Any])
361+
@with_output_types(List[np.ndarray])
358362
class _CombineFnWrapper(beam.CombineFn):
359363
"""Class to wrap a analyzers._CombinerSpec as a beam.CombineFn."""
360364
