
Commit fe3f1e5

anguillanneuf authored and busunkim96 committed
Pub/Sub: update how to test with mock (GoogleCloudPlatform/python-docs-samples#2555)
* Update test with mock
* Clean up resources after tests
* Use unique resource names to avoid test failures
* Delete subscriptions in cleanup phase
* Ensure unique topic name
* Update assert to remove bytestring notation
* Rewrite PubSubToGCS test using dataflow testing module
1 parent 6507cfc commit fe3f1e5

3 files changed, +92 -114 lines


samples/snippets/PubSubToGCS.py

Lines changed: 50 additions & 38 deletions
@@ -34,24 +34,25 @@ def __init__(self, window_size):
         self.window_size = int(window_size * 60)
 
     def expand(self, pcoll):
-        return (pcoll
-                # Assigns window info to each Pub/Sub message based on its
-                # publish timestamp.
-                | 'Window into Fixed Intervals' >> beam.WindowInto(
-                    window.FixedWindows(self.window_size))
-                | 'Add timestamps to messages' >> (beam.ParDo(AddTimestamps()))
-                # Use a dummy key to group the elements in the same window.
-                # Note that all the elements in one window must fit into memory
-                # for this. If the windowed elements do not fit into memory,
-                # please consider using `beam.util.BatchElements`.
-                # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
-                | 'Add Dummy Key' >> beam.Map(lambda elem: (None, elem))
-                | 'Groupby' >> beam.GroupByKey()
-                | 'Abandon Dummy Key' >> beam.MapTuple(lambda _, val: val))
+        return (
+            pcoll
+            # Assigns window info to each Pub/Sub message based on its
+            # publish timestamp.
+            | "Window into Fixed Intervals"
+            >> beam.WindowInto(window.FixedWindows(self.window_size))
+            | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
+            # Use a dummy key to group the elements in the same window.
+            # Note that all the elements in one window must fit into memory
+            # for this. If the windowed elements do not fit into memory,
+            # please consider using `beam.util.BatchElements`.
+            # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
+            | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
+            | "Groupby" >> beam.GroupByKey()
+            | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
+        )
 
 
 class AddTimestamps(beam.DoFn):
-
     def process(self, element, publish_time=beam.DoFn.TimestampParam):
         """Processes each incoming windowed element by extracting the Pub/Sub
         message and its publish timestamp into a dictionary. `publish_time`
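
The comment retained in this hunk points to `beam.util.BatchElements` as the alternative when a window's elements cannot all fit in memory. A minimal sketch of how `expand` might use it in place of the dummy-key `GroupByKey` (the batch-size bounds are illustrative assumptions, not part of this commit; `AddTimestamps` is the DoFn defined later in this file):

    import apache_beam as beam
    from apache_beam.transforms import window

    class GroupWindowsIntoBatches(beam.PTransform):
        def __init__(self, window_size):
            self.window_size = int(window_size * 60)

        def expand(self, pcoll):
            return (
                pcoll
                | "Window into Fixed Intervals"
                >> beam.WindowInto(window.FixedWindows(self.window_size))
                | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
                # BatchElements caps batch sizes instead of collecting an
                # entire window behind a dummy key; these bounds are assumed.
                | "Batch elements"
                >> beam.BatchElements(min_batch_size=1, max_batch_size=100)
            )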
@@ -60,61 +61,72 @@ def process(self, element, publish_time=beam.DoFn.TimestampParam):
         """
 
         yield {
-            'message_body': element.decode('utf-8'),
-            'publish_time': datetime.datetime.utcfromtimestamp(
-                float(publish_time)).strftime("%Y-%m-%d %H:%M:%S.%f"),
+            "message_body": element.decode("utf-8"),
+            "publish_time": datetime.datetime.utcfromtimestamp(
+                float(publish_time)
+            ).strftime("%Y-%m-%d %H:%M:%S.%f"),
         }
 
 
 class WriteBatchesToGCS(beam.DoFn):
-
     def __init__(self, output_path):
         self.output_path = output_path
 
     def process(self, batch, window=beam.DoFn.WindowParam):
         """Write one batch per file to a Google Cloud Storage bucket. """
 
-        ts_format = '%H:%M'
+        ts_format = "%H:%M"
         window_start = window.start.to_utc_datetime().strftime(ts_format)
         window_end = window.end.to_utc_datetime().strftime(ts_format)
-        filename = '-'.join([self.output_path, window_start, window_end])
+        filename = "-".join([self.output_path, window_start, window_end])
 
-        with beam.io.gcp.gcsio.GcsIO().open(filename=filename, mode='w') as f:
+        with beam.io.gcp.gcsio.GcsIO().open(filename=filename, mode="w") as f:
             for element in batch:
-                f.write('{}\n'.format(json.dumps(element)).encode('utf-8'))
+                f.write("{}\n".format(json.dumps(element)).encode("utf-8"))
 
 
 def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
     # `save_main_session` is set to true because some DoFn's rely on
     # globally imported modules.
     pipeline_options = PipelineOptions(
-        pipeline_args, streaming=True, save_main_session=True)
+        pipeline_args, streaming=True, save_main_session=True
+    )
 
     with beam.Pipeline(options=pipeline_options) as pipeline:
-        (pipeline
-         | 'Read PubSub Messages' >> beam.io.ReadFromPubSub(topic=input_topic)
-         | 'Window into' >> GroupWindowsIntoBatches(window_size)
-         | 'Write to GCS' >> beam.ParDo(WriteBatchesToGCS(output_path)))
+        (
+            pipeline
+            | "Read PubSub Messages"
+            >> beam.io.ReadFromPubSub(topic=input_topic)
+            | "Window into" >> GroupWindowsIntoBatches(window_size)
+            | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(output_path))
+        )
 
 
-if __name__ == '__main__':  # noqa
+if __name__ == "__main__":  # noqa
     logging.getLogger().setLevel(logging.INFO)
 
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '--input_topic',
-        help='The Cloud Pub/Sub topic to read from.\n'
-             '"projects/<PROJECT_NAME>/topics/<TOPIC_NAME>".')
+        "--input_topic",
+        help="The Cloud Pub/Sub topic to read from.\n"
+        '"projects/<PROJECT_NAME>/topics/<TOPIC_NAME>".',
+    )
     parser.add_argument(
-        '--window_size',
+        "--window_size",
         type=float,
         default=1.0,
-        help='Output file\'s window size in number of minutes.')
+        help="Output file's window size in number of minutes.",
+    )
     parser.add_argument(
-        '--output_path',
-        help='GCS Path of the output file including filename prefix.')
+        "--output_path",
+        help="GCS Path of the output file including filename prefix.",
+    )
     known_args, pipeline_args = parser.parse_known_args()
 
-    run(known_args.input_topic, known_args.output_path, known_args.window_size,
-        pipeline_args)
+    run(
+        known_args.input_topic,
+        known_args.output_path,
+        known_args.window_size,
+        pipeline_args,
+    )
 # [END pubsub_to_gcs]
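
Note that `run()` takes all of its inputs as parameters, so it can be driven directly from Python as well as from the command line. A rough usage sketch with placeholder resource names (the project, topic, and bucket below are not real):

    import PubSubToGCS

    # Placeholder names; substitute a real project, Pub/Sub topic, and bucket.
    PubSubToGCS.run(
        input_topic="projects/my-project/topics/my-topic",
        output_path="gs://my-bucket/pubsub/output",
        window_size=1.0,  # in minutes
        pipeline_args=["--project", "my-project", "--runner", "DirectRunner"],
    )

This is exactly how the rewritten test below invokes the pipeline, with the Pub/Sub source mocked out.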

samples/snippets/PubSubToGCS_test.py

Lines changed: 41 additions & 75 deletions
@@ -12,89 +12,55 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import multiprocessing as mp
+import mock
 import os
-import pytest
-import subprocess as sp
-import tempfile
-import time
 import uuid
 
 import apache_beam as beam
-from google.cloud import pubsub_v1
-
-
-PROJECT = os.environ['GCLOUD_PROJECT']
-BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
-TOPIC = 'test-topic'
-UUID = uuid.uuid4().hex
-
-
-@pytest.fixture
-def publisher_client():
-    yield pubsub_v1.PublisherClient()
-
-
-@pytest.fixture
-def topic_path(publisher_client):
-    topic_path = publisher_client.topic_path(PROJECT, TOPIC)
-
-    try:
-        publisher_client.delete_topic(topic_path)
-    except Exception:
-        pass
-
-    response = publisher_client.create_topic(topic_path)
-    yield response.name
-
-
-def _infinite_publish_job(publisher_client, topic_path):
-    while True:
-        future = publisher_client.publish(
-            topic_path, data='Hello World!'.encode('utf-8'))
-        future.result()
-        time.sleep(10)
-
-
-def test_run(publisher_client, topic_path):
-    """This is an integration test that runs `PubSubToGCS.py` in its entirety.
-    It checks for output files on GCS.
-    """
-
-    # Use one process to publish messages to a topic.
-    publish_process = mp.Process(
-        target=lambda: _infinite_publish_job(publisher_client, topic_path))
-
-    # Use another process to run the streaming pipeline that should write one
-    # file to GCS every minute (according to the default window size).
-    pipeline_process = mp.Process(
-        target=lambda: sp.call([
-            'python', 'PubSubToGCS.py',
-            '--project', PROJECT,
-            '--runner', 'DirectRunner',
-            '--temp_location', tempfile.mkdtemp(),
-            '--input_topic', topic_path,
-            '--output_path', 'gs://{}/pubsub/{}/output'.format(BUCKET, UUID),
-        ])
+from apache_beam.testing.test_pipeline import TestPipeline
+from apache_beam.testing.test_stream import TestStream
+from apache_beam.testing.test_utils import TempDir
+from apache_beam.transforms.window import TimestampedValue
+
+import PubSubToGCS
+
+PROJECT = os.environ["GCLOUD_PROJECT"]
+BUCKET = os.environ["CLOUD_STORAGE_BUCKET"]
+UUID = uuid.uuid1().hex
+
+
+@mock.patch("apache_beam.Pipeline", TestPipeline)
+@mock.patch(
+    "apache_beam.io.ReadFromPubSub",
+    lambda topic: (
+        TestStream()
+        .advance_watermark_to(0)
+        .advance_processing_time(30)
+        .add_elements([TimestampedValue(b"a", 1575937195)])
+        .advance_processing_time(30)
+        .add_elements([TimestampedValue(b"b", 1575937225)])
+        .advance_processing_time(30)
+        .add_elements([TimestampedValue(b"c", 1575937255)])
+        .advance_watermark_to_infinity()
+    ),
+)
+def test_pubsub_to_gcs():
+    PubSubToGCS.run(
+        input_topic="unused",  # mocked by TestStream
+        output_path="gs://{}/pubsub/{}/output".format(BUCKET, UUID),
+        window_size=1,  # 1 minute
+        pipeline_args=[
+            "--project",
+            PROJECT,
+            "--temp_location",
+            TempDir().get_path(),
+        ],
     )
 
-    publish_process.start()
-    pipeline_process.start()
-
-    # Times out the streaming pipeline after 90 seconds.
-    pipeline_process.join(timeout=90)
-    # Immediately kills the publish process after the pipeline shuts down.
-    publish_process.join(timeout=0)
-
-    pipeline_process.terminate()
-    publish_process.terminate()
-
     # Check for output files on GCS.
     gcs_client = beam.io.gcp.gcsio.GcsIO()
-    # This returns a dictionary.
-    files = gcs_client.list_prefix('gs://{}/pubsub/{}'.format(BUCKET, UUID))
+    files = gcs_client.list_prefix("gs://{}/pubsub/{}".format(BUCKET, UUID))
     assert len(files) > 0
 
-    # Clean up. Delete topic. Delete files.
-    publisher_client.delete_topic(topic_path)
+    # Clean up.
     gcs_client.delete_batch(list(files))
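
The rewrite rests on the two `mock.patch` decorators: `apache_beam.Pipeline` is replaced with `TestPipeline`, and `apache_beam.io.ReadFromPubSub` is replaced with a `TestStream` that replays timestamped elements deterministically, so the test needs no live topic or subscription. A stripped-down sketch of the same `TestStream` pattern in isolation (element values and timestamps here are illustrative):

    import apache_beam as beam
    from apache_beam.testing.test_pipeline import TestPipeline
    from apache_beam.testing.test_stream import TestStream
    from apache_beam.testing.util import assert_that, equal_to
    from apache_beam.transforms.window import TimestampedValue

    # Emit two timestamped elements, then advance the watermark to infinity
    # so the otherwise-unbounded pipeline can finish.
    stream = (
        TestStream()
        .advance_watermark_to(0)
        .add_elements([TimestampedValue(b"a", 0), TimestampedValue(b"b", 30)])
        .advance_watermark_to_infinity()
    )

    with TestPipeline() as pipeline:
        decoded = pipeline | stream | beam.Map(lambda msg: msg.decode("utf-8"))
        assert_that(decoded, equal_to(["a", "b"]))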

samples/snippets/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-apache-beam[gcp]==2.15.0
+apache-beam[gcp,test]==2.16.0
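
The added `test` extra pulls in Beam's testing dependencies, which is presumably what the new `mock`- and `TestStream`-based test relies on. A trivial sanity check that the pinned release is the one actually installed:

    # Confirm the pinned Beam release is active before running the tests.
    import apache_beam

    assert apache_beam.__version__ == "2.16.0"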
