Commit 51a0c5f

zoyahav authored and tf-transform-team committed
Project import generated by Copybara.
PiperOrigin-RevId: 172899890
1 parent 1173a9f commit 51a0c5f

15 files changed with 879 additions and 257 deletions.

README.md

+11 −0

```diff
@@ -53,7 +53,18 @@ Note: If you clone tf.Transform's implementation and samples from GitHub's
 from PyPI) they will likely only work with TensorFlow's nightly
 [build](https://github.com/tensorflow/tensorflow).
 
+### Compatible Versions
 
+This is a table of versions known to be compatible with each other. This is not
+a comprehensive list, meaning other combinations may also work, but these are
+the combinations tested by our testing framework and by the team before
+releasing a new version.
+
+|tensorflow-transform                                                             |tensorflow|apache-beam[gcp]|
+|---------------------------------------------------------------------------------|----------|----------------|
+|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly   |latest (2.x)    |
+|[0.3.0](https://github.com/tensorflow/transform/blob/v0.3.0/RELEASE.md)          |1.3       |2.1.1           |
+|[0.1.10](https://github.com/tensorflow/transform/blob/v0.1.10/RELEASE.md)        |1.0       |2.0.0           |
 ## Getting Started
 
 For instructions on using tf.Transform see the [getting started
```

RELEASE.md

+19 −0

```diff
@@ -1,3 +1,22 @@
+# Release 0.3.1
+
+## Major Features and Improvements
+* We now provide helper methods for creating `serving_input_receiver_fn` for use
+  with tf.estimator. These mirror the existing functions targeting the legacy
+  tf.contrib.learn estimators -- i.e. for each `*_serving_input_fn()` in
+  input_fn_maker there is now also a `*_serving_input_receiver_fn()`.
+
+## Bug Fixes and Other Changes
+* Introduced `tft.apply_vocab`; this allows users to separately apply a single
+  vocabulary (as generated by `tft.uniques`) to several different columns.
+* Provide a source distribution tar `tensorflow-transform-X.Y.Z.tar.gz`.
+
+## Breaking changes
+* The default prefix for `tft.string_to_int` `vocab_filename` changed from
+  `vocab_string_to_int` to `vocab_string_to_int_uniques`. To make your pipelines
+  resilient to implementation details, please set `vocab_filename` explicitly if
+  a downstream component consumes the generated vocabulary file.
+
 # Release 0.3.0
 
 ## Major Features and Improvements
```
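To illustrate the new `tft.apply_vocab` from these notes, here is a minimal sketch of a `preprocessing_fn` that computes one vocabulary and applies it to two columns. This is a sketch, not the library's documented usage: the column names are made up, and the exact argument names of `tft.uniques`/`tft.apply_vocab` are assumptions based on the description above.

```python
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  """Shares one vocabulary across two string columns (sketch)."""
  # `tft.uniques` is an analyzer: it returns a deferred vocabulary file
  # computed over the whole dataset during the Beam analysis pass.
  vocab = tft.uniques(inputs['query'], vocab_filename='shared_vocab')
  # Apply the same vocabulary to several columns, as the release notes
  # describe; previously each column needed its own `tft.string_to_int`.
  return {
      'query_ids': tft.apply_vocab(inputs['query'], vocab),
      'title_ids': tft.apply_vocab(inputs['title'], vocab),
  }
```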

examples/census_example.py

+8 −3

```diff
@@ -23,6 +23,7 @@
 import pprint
 import tempfile
 
+
 import tensorflow as tf
 import tensorflow_transform as tft
 from apache_beam.io import textio
@@ -195,14 +196,18 @@ def convert_label(label):
 
 
 def train_and_evaluate(transformed_train_filepattern,
-                       transformed_test_filepattern, transformed_metadata_dir):
+                       transformed_test_filepattern, transformed_metadata_dir,
+                       num_train_instances=NUM_TRAIN_INSTANCES,
+                       num_test_instances=NUM_TEST_INSTANCES):
   """Train the model on training data and evaluate on test data.
 
   Args:
     transformed_train_filepattern: File pattern for transformed training data
       shards
     transformed_test_filepattern: File pattern for transformed test data shards
     transformed_metadata_dir: Directory containing transformed data metadata
+    num_train_instances: Number of instances in train set
+    num_test_instances: Number of instances in test set
 
   Returns:
     The results from the estimator's 'evaluate' method
@@ -231,7 +236,7 @@ def train_and_evaluate(transformed_train_filepattern,
   # Estimate the model using the default optimizer.
   estimator.fit(
       input_fn=train_input_fn,
-      max_steps=TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES / TRAIN_BATCH_SIZE)
+      max_steps=TRAIN_NUM_EPOCHS * num_train_instances / TRAIN_BATCH_SIZE)
 
   # Evaluate model on test dataset.
   eval_input_fn = input_fn_maker.build_training_input_fn(
@@ -240,7 +245,7 @@ def train_and_evaluate(transformed_train_filepattern,
       training_batch_size=1,
       label_keys=[LABEL_COLUMN])
 
-  return estimator.evaluate(input_fn=eval_input_fn, steps=NUM_TEST_INSTANCES)
+  return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
 
 
 def main():
```
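With the new keyword arguments, callers can override the dataset sizes, e.g. to smoke-test on a subsample; the defaults preserve the old behavior. A hypothetical call (the paths and counts are illustrative, not from the repository):

```python
# Quick smoke test on a small subsample; with these numbers, fit() runs
# TRAIN_NUM_EPOCHS * 1000 / TRAIN_BATCH_SIZE steps, and evaluate() reads
# 100 instances one at a time (training_batch_size=1).
results = train_and_evaluate(
    '/tmp/census/train_transformed*',  # hypothetical file pattern
    '/tmp/census/test_transformed*',   # hypothetical file pattern
    '/tmp/census/metadata',
    num_train_instances=1000,
    num_test_instances=100)
```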

examples/sentiment_example.py

+79 −26

```diff
@@ -23,6 +23,7 @@
 import pprint
 import tempfile
 
+
 import tensorflow as tf
 import tensorflow_transform as tft
 from apache_beam.io import textio
@@ -49,6 +50,13 @@
 REVIEW_WEIGHT = 'review_weight'
 LABEL_COLUMN = 'label'
 
+RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
+    REVIEW_COLUMN: dataset_schema.ColumnSchema(
+        tf.string, [], dataset_schema.FixedColumnRepresentation()),
+    LABEL_COLUMN: dataset_schema.ColumnSchema(
+        tf.int64, [], dataset_schema.FixedColumnRepresentation()),
+}))
+
 DELIMITERS = '.,!?() '
 
 
@@ -99,13 +107,13 @@ def ReadAndShuffleData(pcoll, filepatterns):
       lambda p: {REVIEW_COLUMN: p[0], LABEL_COLUMN: p[1]})
 
 
-def transform_data(train_neg_filepattern, train_pos_filepattern,
-                   test_neg_filepattern, test_pos_filepattern,
-                   transformed_train_filebase, transformed_test_filebase,
-                   transformed_metadata_dir):
-  """Transform the data and write out as a TFRecord of Example protos.
+def read_and_shuffle_data(
+    train_neg_filepattern, train_pos_filepattern, test_neg_filepattern,
+    test_pos_filepattern, shuffled_train_filebase, shuffled_test_filebase):
+  """Read and shuffle the data and write out as a TFRecord of Example protos.
 
-  Read in the data from the positive and negative examples on disk, and
+  Read in the data from the positive and negative examples on disk, shuffle it
+  and write it out in TFRecord format.
   transform it using a preprocessing pipeline that removes punctuation,
   tokenizes and maps tokens to int64 values indices.
 
@@ -114,6 +122,42 @@ def transform_data(train_neg_filepattern, train_pos_filepattern,
     train_pos_filepattern: Filepattern for training data positive examples
     test_neg_filepattern: Filepattern for test data negative examples
     test_pos_filepattern: Filepattern for test data positive examples
+    shuffled_train_filebase: Base filename for shuffled training data shards
+    shuffled_test_filebase: Base filename for shuffled test data shards
+  """
+  with beam.Pipeline() as pipeline:
+    # pylint: disable=no-value-for-parameter
+    _ = (
+        pipeline
+        | 'ReadAndShuffleTrain' >> ReadAndShuffleData(
+            (train_neg_filepattern, train_pos_filepattern))
+        | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
+            shuffled_train_filebase,
+            coder=example_proto_coder.ExampleProtoCoder(
+                RAW_DATA_METADATA.schema)))
+    _ = (
+        pipeline
+        | 'ReadAndShuffleTest' >> ReadAndShuffleData(
+            (test_neg_filepattern, test_pos_filepattern))
+        | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
+            shuffled_test_filebase,
+            coder=example_proto_coder.ExampleProtoCoder(
+                RAW_DATA_METADATA.schema)))
+    # pylint: enable=no-value-for-parameter
+
+
+def transform_data(shuffled_train_filepattern, shuffled_test_filepattern,
+                   transformed_train_filebase, transformed_test_filebase,
+                   transformed_metadata_dir):
+  """Transform the data and write out as a TFRecord of Example protos.
+
+  Read in the data from the positive and negative examples on disk, and
+  transform it using a preprocessing pipeline that removes punctuation,
+  tokenizes and maps tokens to int64 values indices.
+
+  Args:
+    shuffled_train_filepattern: Base filename for shuffled training data shards
+    shuffled_test_filepattern: Base filename for shuffled test data shards
     transformed_train_filebase: Base filename for transformed training data
       shards
     transformed_test_filebase: Base filename for transformed test data shards
@@ -123,19 +167,19 @@ def transform_data(train_neg_filepattern, train_pos_filepattern,
 
   with beam.Pipeline() as pipeline:
     with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
-      # pylint: disable=no-value-for-parameter
-      train_data = pipeline | 'ReadTrain' >> ReadAndShuffleData(
-          (train_neg_filepattern, train_pos_filepattern))
-      # pylint: disable=no-value-for-parameter
-      test_data = pipeline | 'ReadTest' >> ReadAndShuffleData(
-          (test_neg_filepattern, test_pos_filepattern))
-
-      metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
-          REVIEW_COLUMN: dataset_schema.ColumnSchema(
-              tf.string, [], dataset_schema.FixedColumnRepresentation()),
-          LABEL_COLUMN: dataset_schema.ColumnSchema(
-              tf.int64, [], dataset_schema.FixedColumnRepresentation()),
-      }))
+      train_data = (
+          pipeline |
+          'ReadTrain' >> tfrecordio.ReadFromTFRecord(
+              shuffled_train_filepattern,
+              coder=example_proto_coder.ExampleProtoCoder(
+                  RAW_DATA_METADATA.schema)))
+
+      test_data = (
+          pipeline |
+          'ReadTest' >> tfrecordio.ReadFromTFRecord(
+              shuffled_test_filepattern,
+              coder=example_proto_coder.ExampleProtoCoder(
+                  RAW_DATA_METADATA.schema)))
 
       def preprocessing_fn(inputs):
         """Preprocess input columns into transformed columns."""
@@ -153,12 +197,12 @@ def preprocessing_fn(inputs):
       }
 
       (transformed_train_data, transformed_metadata), transform_fn = (
-          (train_data, metadata)
+          (train_data, RAW_DATA_METADATA)
           | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(
               preprocessing_fn))
 
       transformed_test_data, _ = (
-          ((test_data, metadata), transform_fn)
+          ((test_data, RAW_DATA_METADATA), transform_fn)
           | 'Transform' >> beam_impl.TransformDataset())
 
       _ = (
@@ -183,7 +227,9 @@ def preprocessing_fn(inputs):
 
 
 def train_and_evaluate(transformed_train_filepattern,
-                       transformed_test_filepattern, transformed_metadata_dir):
+                       transformed_test_filepattern, transformed_metadata_dir,
+                       num_train_instances=NUM_TRAIN_INSTANCES,
+                       num_test_instances=NUM_TEST_INSTANCES):
   """Train the model on training data and evaluate on evaluation data.
 
   Args:
@@ -192,6 +238,8 @@ def train_and_evaluate(transformed_train_filepattern,
     transformed_test_filepattern: Base filename for transformed evaluation data
       shards
     transformed_metadata_dir: Directory containing transformed data metadata
+    num_train_instances: Number of instances in train set
+    num_test_instances: Number of instances in test set
 
   Returns:
     The results from the estimator's 'evaluate' method
@@ -219,7 +267,7 @@ def train_and_evaluate(transformed_train_filepattern,
   # Estimate the model using the default optimizer.
   estimator.fit(
       input_fn=train_input_fn,
-      max_steps=TRAIN_NUM_EPOCHS * NUM_TRAIN_INSTANCES / TRAIN_BATCH_SIZE)
+      max_steps=TRAIN_NUM_EPOCHS * num_train_instances / TRAIN_BATCH_SIZE)
 
   # Evaluate model on eval dataset.
   eval_input_fn = input_fn_maker.build_training_input_fn(
@@ -228,7 +276,7 @@ def train_and_evaluate(transformed_train_filepattern,
       training_batch_size=1,
       label_keys=[LABEL_COLUMN])
 
-  return estimator.evaluate(input_fn=eval_input_fn, steps=NUM_TEST_INSTANCES)
+  return estimator.evaluate(input_fn=eval_input_fn, steps=num_test_instances)
 
 
 def main():
@@ -248,14 +296,19 @@ def main():
   train_pos_filepattern = os.path.join(args.input_data_dir, 'train/pos/*')
   test_neg_filepattern = os.path.join(args.input_data_dir, 'test/neg/*')
   test_pos_filepattern = os.path.join(args.input_data_dir, 'test/pos/*')
+  shuffled_train_filebase = os.path.join(transformed_data_dir, 'train_shuffled')
+  shuffled_test_filebase = os.path.join(transformed_data_dir, 'test_shuffled')
   transformed_train_filebase = os.path.join(transformed_data_dir,
                                             'train_transformed')
   transformed_test_filebase = os.path.join(transformed_data_dir,
                                            'test_transformed')
   transformed_metadata_dir = os.path.join(transformed_data_dir, 'metadata')
 
-  transform_data(train_neg_filepattern, train_pos_filepattern,
-                 test_neg_filepattern, test_pos_filepattern,
+  read_and_shuffle_data(train_neg_filepattern, train_pos_filepattern,
+                        test_neg_filepattern, test_pos_filepattern,
+                        shuffled_train_filebase, shuffled_test_filebase)
+
+  transform_data(shuffled_train_filebase + '*', shuffled_test_filebase + '*',
                  transformed_train_filebase, transformed_test_filebase,
                  transformed_metadata_dir)
 
```
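This refactor works because the shuffled data is materialized between the two pipelines as TFRecords of `Example` protos, encoded with `ExampleProtoCoder` against the shared `RAW_DATA_METADATA` schema. A minimal round-trip sketch of that serialization (the import path and the exact decoded value types are assumptions, and the review text is made up):

```python
import tensorflow as tf
from tensorflow_transform.coders import example_proto_coder
from tensorflow_transform.tf_metadata import dataset_schema

# The same two-column schema the example stores in RAW_DATA_METADATA.
schema = dataset_schema.Schema({
    'review': dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation()),
    'label': dataset_schema.ColumnSchema(
        tf.int64, [], dataset_schema.FixedColumnRepresentation()),
})

coder = example_proto_coder.ExampleProtoCoder(schema)

# encode() turns one instance dict into a serialized tf.train.Example --
# the bytes WriteToTFRecord persists; decode() is the inverse applied by
# ReadFromTFRecord in transform_data.
serialized = coder.encode({'review': 'made-up sample review', 'label': 1})
print(coder.decode(serialized))
```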

examples/simple_example.py

+43 −37

```diff
@@ -20,52 +20,58 @@
 import pprint
 import tempfile
 
+
 import tensorflow as tf
 import tensorflow_transform as tft
 import tensorflow_transform.beam.impl as beam_impl
 from tensorflow_transform.tf_metadata import dataset_metadata
 from tensorflow_transform.tf_metadata import dataset_schema
 
 
-def preprocessing_fn(inputs):
-  """Preprocess input columns into transformed columns."""
-  x = inputs['x']
-  y = inputs['y']
-  s = inputs['s']
-  x_centered = x - tft.mean(x)
-  y_normalized = tft.scale_to_0_1(y)
-  s_integerized = tft.string_to_int(s)
-  x_centered_times_y_normalized = (x_centered * y_normalized)
-  return {
-      'x_centered': x_centered,
-      'y_normalized': y_normalized,
-      'x_centered_times_y_normalized': x_centered_times_y_normalized,
-      's_integerized': s_integerized
-  }
+def main():
+  def preprocessing_fn(inputs):
+    """Preprocess input columns into transformed columns."""
+    x = inputs['x']
+    y = inputs['y']
+    s = inputs['s']
+    x_centered = x - tft.mean(x)
+    y_normalized = tft.scale_to_0_1(y)
+    s_integerized = tft.string_to_int(s)
+    x_centered_times_y_normalized = (x_centered * y_normalized)
+    return {
+        'x_centered': x_centered,
+        'y_normalized': y_normalized,
+        'x_centered_times_y_normalized': x_centered_times_y_normalized,
+        's_integerized': s_integerized
+    }
+
+  raw_data = [
+      {'x': 1, 'y': 1, 's': 'hello'},
+      {'x': 2, 'y': 2, 's': 'world'},
+      {'x': 3, 'y': 3, 's': 'hello'}
+  ]
 
-raw_data = [
-    {'x': 1, 'y': 1, 's': 'hello'},
-    {'x': 2, 'y': 2, 's': 'world'},
-    {'x': 3, 'y': 3, 's': 'hello'}
-]
+  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
+      's': dataset_schema.ColumnSchema(
+          tf.string, [], dataset_schema.FixedColumnRepresentation()),
+      'y': dataset_schema.ColumnSchema(
+          tf.float32, [], dataset_schema.FixedColumnRepresentation()),
+      'x': dataset_schema.ColumnSchema(
+          tf.float32, [], dataset_schema.FixedColumnRepresentation())
+  }))
 
-raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
-    's': dataset_schema.ColumnSchema(
-        tf.string, [], dataset_schema.FixedColumnRepresentation()),
-    'y': dataset_schema.ColumnSchema(
-        tf.float32, [], dataset_schema.FixedColumnRepresentation()),
-    'x': dataset_schema.ColumnSchema(
-        tf.float32, [], dataset_schema.FixedColumnRepresentation())
-}))
+  with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
+    transform_fn = (
+        (raw_data, raw_data_metadata)
+        | beam_impl.AnalyzeDataset(preprocessing_fn))
+    transformed_dataset = (
+        ((raw_data, raw_data_metadata), transform_fn)
+        | beam_impl.TransformDataset())
 
-with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
-  transform_fn = (
-      (raw_data, raw_data_metadata)
-      | beam_impl.AnalyzeDataset(preprocessing_fn))
-  transformed_dataset = (
-      ((raw_data, raw_data_metadata), transform_fn)
-      | beam_impl.TransformDataset())
+  # pylint: disable=unused-variable
+  transformed_data, transformed_metadata = transformed_dataset
 
-transformed_data, transformed_metadata = transformed_dataset
+  pprint.pprint(transformed_data)
 
-pprint.pprint(transformed_data)
+if __name__ == '__main__':
+  main()
```
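The three instances make the output easy to verify by hand: `tft.mean(x)` is 2, so `x_centered` is -1, 0, 1; `y` rescales to 0, 0.5, 1; and `tft.string_to_int` assigns indices by frequency, so 'hello' (seen twice) maps to 0 and 'world' to 1, assuming the frequency-ordered default. `pprint` should print something like the sketch below; the exact float formatting may differ:

```python
# Expected transformed_data, worked by hand from raw_data above
# (assumes a frequency-ordered vocabulary; formatting is illustrative):
[{'s_integerized': 0, 'x_centered': -1.0,
  'x_centered_times_y_normalized': -0.0, 'y_normalized': 0.0},
 {'s_integerized': 1, 'x_centered': 0.0,
  'x_centered_times_y_normalized': 0.0, 'y_normalized': 0.5},
 {'s_integerized': 0, 'x_centered': 1.0,
  'x_centered_times_y_normalized': 1.0, 'y_normalized': 1.0}]
```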

setup.py

+1 −1

```diff
@@ -17,7 +17,7 @@
 from setuptools import setup
 
 # Tensorflow transform version.
-__version__ = '0.3.0'
+__version__ = '0.3.1'
 
 
 def _make_required_install_packages():
```
