Commit 8efa5cd
Avoid more time-consuming calls (elastic#70)
With this commit we reduce the use of two time-consuming methods on the hot code path: generating random numbers and determining the current timestamp. This also changes behavior slightly:

* The `hostname` field now cycles through host numbers 1 - 3 in a regular pattern instead of choosing them randomly (as the host name is e.g. `web-EU-1.elastic.co` and only the number changes non-randomly, we deem this change ok).
* The `offset` field is more realistic now: before, it changed randomly; now it increases by the average event size up to a certain maximum.
* The current `@timestamp` is retrieved only once per bulk. For documents within a bulk we advance the microsecond portion by `1 / bulk size` microseconds per document.

We measured the performance impact of this change by stubbing out Elasticsearch with nginx and running the `index-logs-fixed-daily-volume` challenge with the following track parameters:

* `bulk_size`: 20000
* `bulk_indexing_clients`: 16
* `number_of_days`: 1
* `daily_logging_volume`: "20GB"

We measured the following median indexing throughput in our test environment:

* baseline (master): 153476 docs/s
* using a deterministic `hostname` and `offset`: 174371 docs/s
* all three measures together: 222611 docs/s

This means we improve the maximum achievable indexing throughput in this configuration by roughly 45% (222611 / 153476 ≈ 1.45).
1 parent 146a372 commit 8efa5cd
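For readers who want to see the pattern in isolation, the following is a minimal, self-contained sketch of the three techniques described in the commit message: cycling host numbers deterministically, growing the `offset` by a fixed average event size, and reading the wall clock once per bulk while only simulating microsecond ticks per document. This is not the track code itself; the constants `AVERAGE_EVENT_SIZE` and `MAX_LOG_FILE_SIZE` and the function `simulated_bulk` are illustrative assumptions.

```python
import datetime
import itertools

# Illustrative constants (assumptions, not taken from the track itself).
AVERAGE_EVENT_SIZE = 263                     # bytes per simulated nginx log line
MAX_LOG_FILE_SIZE = 4 * 1024 * 1024 * 1024   # cap the simulated log file offset at 4GB


def simulated_bulk(bulk_size, start_offset=0, continent_code="EU"):
    """Generate one bulk of events, fetching the wall clock only once."""
    web_host = itertools.cycle([1, 2, 3])    # deterministic host numbers instead of random.randrange
    base = datetime.datetime.utcnow()        # current timestamp, retrieved once per bulk
    offset = start_offset
    events = []
    for i in range(bulk_size):
        # within the bulk, only the sub-second portion of the timestamp advances
        ts = base + datetime.timedelta(microseconds=i / bulk_size)
        # the offset grows by the average event size and wraps at the maximum file size
        offset = (offset + AVERAGE_EVENT_SIZE) % MAX_LOG_FILE_SIZE
        events.append({
            "@timestamp": ts.isoformat() + "Z",
            "offset": offset,
            "hostname": "web-%s-%s.elastic.co" % (continent_code, next(web_host)),
        })
    return events


if __name__ == "__main__":
    for event in simulated_bulk(3):
        print(event)
```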

6 files changed (+119, -17 lines)


eventdata/parameter_sources/elasticlogs_bulk_source.py

Lines changed: 1 addition & 0 deletions
@@ -144,6 +144,7 @@ def percent_completed(self):
     def params(self):
         # Build bulk array
         bulk_array = []
+        self._randomevent.start_bulk(self._bulk_size)
         for x in range(0, self._bulk_size):
             try:
                 evt, idx, typ = self._randomevent.generate_event()
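For context, the calling convention after this change is: `start_bulk()` once per bulk so that the timestamp and index name are computed a single time, then `generate_event()` once per document. The sketch below illustrates this; the helper name `build_bulk_body` and the exact bulk action-metadata format are assumptions, not taken from the track.

```python
def build_bulk_body(random_event, bulk_size):
    """Illustrative sketch of how a parameter source could drive the event generator."""
    # compute the timestamp and index name once for the whole bulk
    random_event.start_bulk(bulk_size)
    lines = []
    for _ in range(bulk_size):
        try:
            evt, idx, typ = random_event.generate_event()
        except StopIteration:
            break
        # one action-and-metadata line plus one document line per event
        lines.append('{"index": {"_index": "%s", "_type": "%s"}}' % (idx, typ))
        lines.append(evt)
    return "\n".join(lines) + "\n"
```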

eventdata/parameter_sources/randomevent.py

Lines changed: 24 additions & 8 deletions
@@ -18,6 +18,7 @@
 
 import datetime
 import gzip
+import itertools
 import json
 import os
 import random
@@ -295,6 +296,11 @@ def __init__(self, params, agent=Agent, client_ip=ClientIp, referrer=Referrer, r
         self.total_days = params.get("number_of_days")
         self.remaining_days = self.total_days
         self.record_raw_event_size = params.get("record_raw_event_size", False)
+        self._offset = 0
+        self._web_host = itertools.cycle([1, 2, 3])
+        self._timestruct = None
+        self._index_name = None
+        self._time_interval_current_bulk = 0
 
     @property
     def percent_completed(self):
@@ -306,25 +312,31 @@ def percent_completed(self):
         total = self.total_days * self.daily_logging_volume
         return already_generated / total
 
+    def start_bulk(self, bulk_size):
+        self._time_interval_current_bulk = 1 / bulk_size
+        self._timestruct = self._timestamp_generator.next_timestamp()
+        self._index_name = self.__generate_index_pattern(self._timestruct)
+
     def generate_event(self):
         if self.remaining_days == 0:
             raise StopIteration()
-        timestruct = self._timestamp_generator.next_timestamp()
-        index_name = self.__generate_index_pattern(timestruct)
 
+        # advance time by a few micros
+        self._timestruct = self._timestamp_generator.simulate_tick(self._time_interval_current_bulk)
+        # index for the current line - we may cross a date boundary later if we're above the daily logging volume
+        index = self._index_name
         event = self._event
-        event["@timestamp"] = timestruct["iso"]
+        event["@timestamp"] = self._timestruct["iso"]
 
-        # set random offset
-        event["offset"] = random.randrange(0, 10000000)
+        # assume a typical event size of 263 bytes but limit the file size to 4GB
+        event["offset"] = (self._offset + 263) % (4 * 1024 * 1024 * 1024)
 
         self._agent.add_fields(event)
         self._clientip.add_fields(event)
         self._referrer.add_fields(event)
         self._request.add_fields(event)
 
-        # set host name
-        event["hostname"] = "web-{}-{}.elastic.co".format(event["geoip_continent_code"], random.randrange(1, 3))
+        event["hostname"] = "web-%s-%s.elastic.co" % (event["geoip_continent_code"], next(self._web_host))
 
         if self.record_raw_event_size or self.daily_logging_volume:
             # determine the raw event size (as if this were contained in nginx log file). We do not bother to
@@ -340,6 +352,10 @@ def generate_event(self):
             if self.remaining_days is not None:
                 self.remaining_days -= 1
             self._timestamp_generator.skip(datetime.timedelta(days=1))
+            # advance time now for real (we usually use #simulate_tick() which will keep everything except for
+            # microseconds constant).
+            self._timestruct = self._timestamp_generator.next_timestamp()
+            self._index_name = self.__generate_index_pattern(self._timestruct)
             self.current_logging_volume = 0
 
         if self.record_raw_event_size:
@@ -387,7 +403,7 @@ def generate_event(self):
                 event["referrer"],
                 event["request"], event["bytes"], event["verb"], event["response"], event["httpversion"])
 
-        return line, index_name, self._type
+        return line, index, self._type
 
     def __generate_index_pattern(self, timestruct):
         if self._index_pattern:
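The motivation for replacing `random.randrange` and per-document timestamp generation can be sanity-checked with a rough micro-benchmark like the one below. This is a sketch, not the benchmark from the commit message; absolute numbers depend on the machine, but the deterministic replacements are typically several times cheaper per call.

```python
import datetime
import itertools
import random
import timeit

web_host = itertools.cycle([1, 2, 3])
N = 1_000_000

# calls removed from the hot path
print("random.randrange(1, 3): %.3fs" % timeit.timeit(lambda: random.randrange(1, 3), number=N))
print("datetime.utcnow():      %.3fs" % timeit.timeit(datetime.datetime.utcnow, number=N))

# their deterministic / cheaper replacements
print("next(itertools.cycle):  %.3fs" % timeit.timeit(lambda: next(web_host), number=N))
print("string-only tick:       %.3fs" % timeit.timeit(lambda: "%s.%03dZ" % ("2019-01-05T15:00:05", 42), number=N))
```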

eventdata/parameter_sources/timeutils.py

Lines changed: 24 additions & 9 deletions
@@ -42,10 +42,26 @@ def __init__(self, starting_point, offset=None, acceleration_factor=1.0, utcnow=
         self._acceleration_factor = acceleration_factor
         # reuse to reduce object churn
         self._ts = {}
+        self._simulated_micros = 0.0
 
     def next_timestamp(self):
+        self._simulated_micros = 0.0
         delta = (self._utcnow() - self._start) * self._acceleration_factor
-        return self.__to_struct(self._starting_point + delta)
+        self.__to_struct(self._starting_point + delta)
+        return self.simulate_tick(0)
+
+    def simulate_tick(self, micros):
+        """
+
+        Advances the current timestamp by a given number of microseconds but keeps all other time components. This can
+        be used to avoid retrieving the current timestamp too often but still simulate changes in time.
+
+        :param micros: A positive number of microseconds to add.
+        :return: The current (formatted) timestamp structure as a dict.
+        """
+        self._simulated_micros += micros
+        self._ts["iso"] = "%s.%03dZ" % (self._ts["iso_prefix"], self._simulated_micros)
+        return self._ts
 
     def skip(self, delta):
         # advance the generated timestamp by delta
@@ -55,14 +71,13 @@ def skip(self, delta):
 
     def __to_struct(self, dt):
         # string formatting is about 4 times faster than strftime.
-        iso = "%04d-%02d-%02dT%02d:%02d:%02d.%03dZ" % (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond)
-        self._ts["iso"] = iso
-        self._ts["yyyy"] = iso[:4]
-        self._ts["yy"] = iso[2:4]
-        self._ts["mm"] = iso[5:7]
-        self._ts["dd"] = iso[8:10]
-        self._ts["hh"] = iso[11:13]
-        return self._ts
+        iso_prefix = "%04d-%02d-%02dT%02d:%02d:%02d" % (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
+        self._ts["iso_prefix"] = iso_prefix
+        self._ts["yyyy"] = iso_prefix[:4]
+        self._ts["yy"] = iso_prefix[2:4]
+        self._ts["mm"] = iso_prefix[5:7]
+        self._ts["dd"] = iso_prefix[8:10]
+        self._ts["hh"] = iso_prefix[11:13]
 
     def __parse_starting_point(self, point):
         if point == "now":
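A small usage sketch of the new generator API (values are illustrative; the constructor arguments and starting-point format follow the tests below): `next_timestamp()` resets the simulated microseconds and formats the full timestamp once, while subsequent `simulate_tick()` calls only rewrite the fractional part of the `iso` field and leave `iso_prefix` and the date components untouched.

```python
from eventdata.parameter_sources.timeutils import TimestampStructGenerator

g = TimestampStructGenerator(starting_point="2018-05-01:00:59:56")

ts = g.next_timestamp()      # full formatting happens once here
print(ts["iso"], ts["iso_prefix"])

ts = g.simulate_tick(1.0)    # only the fractional part of "iso" changes
print(ts["iso"])

ts = g.simulate_tick(10.0)   # ticks accumulate until the next next_timestamp() call
print(ts["iso"])
```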

tests/parameter_sources/elasticlogs_bulk_source_test.py

Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,9 @@ def __init__(self, index, type, doc, at_most=-1):
         self.doc = doc
         self.at_most = at_most
 
+    def start_bulk(self, bulk_size):
+        pass
+
     def generate_event(self):
         if self.at_most == 0:
             raise StopIteration()

tests/parameter_sources/randomevent_test.py

Lines changed: 5 additions & 0 deletions
@@ -72,6 +72,7 @@ def test_random_event_no_event_size_by_default():
                     referrer=StaticReferrer,
                     request=StaticRequest)
 
+    e.start_bulk(1)
     raw_doc, index, doc_type = e.generate_event()
 
     doc = json.loads(raw_doc)
@@ -93,6 +94,7 @@ def test_random_event_with_event_size():
                     referrer=StaticReferrer,
                     request=StaticRequest)
 
+    e.start_bulk(1)
     raw_doc, index, doc_type = e.generate_event()
 
     doc = json.loads(raw_doc)
@@ -118,6 +120,7 @@ def test_random_events_with_daily_logging_volume():
 
     assert e.percent_completed is None
 
+    e.start_bulk(15)
     # 5 events fit into one kilobyte
     for i in range(5):
         doc, index, _ = e.generate_event()
@@ -150,13 +153,15 @@ def test_random_events_with_daily_logging_volume_and_maximum_days():
 
     assert e.percent_completed == 0.0
 
+    e.start_bulk(5)
     # 5 events fit into one kilobyte
     for i in range(5):
         doc, index, _ = e.generate_event()
         assert index == "logs-20190105"
 
     assert e.percent_completed == 0.5
 
+    e.start_bulk(6)
     for i in range(5):
         doc, index, _ = e.generate_event()
         assert index == "logs-20190106"

tests/parameter_sources/timeutils_test.py

Lines changed: 62 additions & 0 deletions
@@ -41,6 +41,7 @@ def test_generate_interval_from_now():
     # first generated timestamp will be one (clock) invocation after the original start
     assert g.next_timestamp() == {
         "iso": "2019-01-05T15:00:05.000Z",
+        "iso_prefix": "2019-01-05T15:00:05",
         "yyyy": "2019",
         "yy": "19",
         "mm": "01",
@@ -50,6 +51,7 @@ def test_generate_interval_from_now():
 
     assert g.next_timestamp() == {
         "iso": "2019-01-05T15:00:10.000Z",
+        "iso_prefix": "2019-01-05T15:00:10",
         "yyyy": "2019",
         "yy": "19",
         "mm": "01",
@@ -59,6 +61,7 @@ def test_generate_interval_from_now():
 
     assert g.next_timestamp() == {
         "iso": "2019-01-05T15:00:15.000Z",
+        "iso_prefix": "2019-01-05T15:00:15",
         "yyyy": "2019",
         "yy": "19",
         "mm": "01",
@@ -77,6 +80,7 @@ def test_generate_interval_from_fixed_starting_point():
 
     assert g.next_timestamp() == {
         "iso": "2018-05-01T00:59:59.000Z",
+        "iso_prefix": "2018-05-01T00:59:59",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -86,6 +90,7 @@ def test_generate_interval_from_fixed_starting_point():
 
     assert g.next_timestamp() == {
         "iso": "2018-05-01T01:00:02.000Z",
+        "iso_prefix": "2018-05-01T01:00:02",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -94,6 +99,7 @@ def test_generate_interval_from_fixed_starting_point():
     }
     assert g.next_timestamp() == {
         "iso": "2018-05-01T01:00:05.000Z",
+        "iso_prefix": "2018-05-01T01:00:05",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -113,6 +119,7 @@ def test_generate_interval_from_fixed_starting_point_with_offset():
 
     assert g.next_timestamp() == {
         "iso": "2018-05-11T00:59:59.000Z",
+        "iso_prefix": "2018-05-11T00:59:59",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -122,6 +129,7 @@ def test_generate_interval_from_fixed_starting_point_with_offset():
 
     assert g.next_timestamp() == {
         "iso": "2018-05-11T01:00:02.000Z",
+        "iso_prefix": "2018-05-11T01:00:02",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -130,6 +138,7 @@ def test_generate_interval_from_fixed_starting_point_with_offset():
     }
     assert g.next_timestamp() == {
         "iso": "2018-05-11T01:00:05.000Z",
+        "iso_prefix": "2018-05-11T01:00:05",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -148,6 +157,7 @@ def test_generate_interval_and_skip():
 
     assert g.next_timestamp() == {
         "iso": "2018-05-01T00:59:59.000Z",
+        "iso_prefix": "2018-05-01T00:59:59",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -157,6 +167,7 @@ def test_generate_interval_and_skip():
 
     assert g.next_timestamp() == {
         "iso": "2018-05-01T01:00:02.000Z",
+        "iso_prefix": "2018-05-01T01:00:02",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -168,6 +179,7 @@ def test_generate_interval_and_skip():
 
     assert g.next_timestamp() == {
         "iso": "2018-05-02T00:59:59.000Z",
+        "iso_prefix": "2018-05-02T00:59:59",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -177,6 +189,7 @@ def test_generate_interval_and_skip():
 
     assert g.next_timestamp() == {
         "iso": "2018-05-02T01:00:02.000Z",
+        "iso_prefix": "2018-05-02T01:00:02",
         "yyyy": "2018",
         "yy": "18",
         "mm": "05",
@@ -185,6 +198,55 @@ def test_generate_interval_and_skip():
     }
 
 
+def test_simulate_ticks():
+    clock = ReproducibleClock(start=datetime.datetime(year=2019, month=1, day=5, hour=15),
+                              delta=datetime.timedelta(seconds=1))
+
+    g = TimestampStructGenerator(starting_point="2018-05-01:00:59:56",
+                                 acceleration_factor=3.0,
+                                 utcnow=clock)
+
+    assert g.next_timestamp() == {
+        "iso": "2018-05-01T00:59:59.000Z",
+        "iso_prefix": "2018-05-01T00:59:59",
+        "yyyy": "2018",
+        "yy": "18",
+        "mm": "05",
+        "dd": "01",
+        "hh": "00"
+    }
+
+    assert g.simulate_tick(micros=1.0) == {
+        "iso": "2018-05-01T00:59:59.001Z",
+        "iso_prefix": "2018-05-01T00:59:59",
+        "yyyy": "2018",
+        "yy": "18",
+        "mm": "05",
+        "dd": "01",
+        "hh": "00"
+    }
+
+    assert g.simulate_tick(micros=0.1) == {
+        "iso": "2018-05-01T00:59:59.001Z",
+        "iso_prefix": "2018-05-01T00:59:59",
+        "yyyy": "2018",
+        "yy": "18",
+        "mm": "05",
+        "dd": "01",
+        "hh": "00"
+    }
+
+    assert g.simulate_tick(micros=10.0) == {
+        "iso": "2018-05-01T00:59:59.011Z",
+        "iso_prefix": "2018-05-01T00:59:59",
+        "yyyy": "2018",
+        "yy": "18",
+        "mm": "05",
+        "dd": "01",
+        "hh": "00"
+    }
+
+
 def test_generate_invalid_time_interval():
     # "w" is unsupported
     with pytest.raises(TimeParsingError) as ex:
