require "logstash/outputs/elasticsearch/template_manager"
module LogStash; module Outputs; class ElasticSearch;
module Common
attr_reader :client, :hosts
# These codes apply to documents, not at the request level
DOC_DLQ_CODES = [400, 404]
DOC_SUCCESS_CODES = [200, 201]
DOC_CONFLICT_CODE = 409
# When you use external versioning, you are communicating that you want
# to ignore conflicts. More obviously, since an external version is a
# constant part of the incoming document, we should not retry, as retrying
# will never succeed.
VERSION_TYPES_PERMITTING_CONFLICT = ["external", "external_gt", "external_gte"]
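
    # Illustrative sketch (hypothetical pipeline config, not from this file):
    #   version => "%{[doc][version]}"
    #   version_type => "external"
    # A document rejected with a 409 here would be rejected again on every
    # retry, because the external version travels with the event itself, so
    # conflicts on these version types are dropped rather than retried.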

    def register
      @template_installed = Concurrent::AtomicBoolean.new(false)
      @stopping = Concurrent::AtomicBoolean.new(false)
      # To support BWC, we check if DLQ exists in core (< 5.4). If it doesn't, we use nil to resort to previous behavior.
      @dlq_writer = dlq_enabled? ? execution_context.dlq_writer : nil

      fill_hosts_from_cloud_id
      fill_user_password_from_cloud_auth

      setup_hosts # properly sets @hosts
      build_client
      setup_after_successful_connection
      check_action_validity

      @bulk_request_metrics = metric.namespace(:bulk_requests)
      @document_level_metrics = metric.namespace(:documents)

      @logger.info("New Elasticsearch output", :class => self.class.name, :hosts => @hosts.map(&:sanitized).map(&:to_s))
    end

    # Receive an array of events and immediately attempt to index them (no buffering)
    def multi_receive(events)
      until @template_installed.true?
        sleep 1
      end
      retrying_submit(events.map { |e| event_action_tuple(e) })
    end

    def setup_after_successful_connection
      @template_installer ||= Thread.new do
        sleep_interval = @retry_initial_interval
        until successful_connection? || @stopping.true?
          @logger.debug("Waiting for connectivity to Elasticsearch cluster. Retrying in #{sleep_interval}s")
          Stud.stoppable_sleep(sleep_interval) { @stopping.true? }
          sleep_interval = next_sleep_interval(sleep_interval)
        end
        if successful_connection?
          discover_cluster_uuid
          install_template
          setup_ilm if ilm_in_use?
        end
      end
    end

    def stop_template_installer
      @template_installer.join unless @template_installer.nil?
    end

    def successful_connection?
      !!maximum_seen_major_version
    end

    def use_event_type?(client)
      client.maximum_seen_major_version < 8
    end

    # Convert the event into a 3-tuple of action, params, and event
    def event_action_tuple(event)
      action = event.sprintf(@action)

      params = {
        :_id => @document_id ? event.sprintf(@document_id) : nil,
        :_index => event.sprintf(@index),
        routing_field_name => @routing ? event.sprintf(@routing) : nil
      }
      params[:_type] = get_event_type(event) if use_event_type?(client)

      if @pipeline
        params[:pipeline] = event.sprintf(@pipeline)
      end

      if @parent
        if @join_field
          join_value = event.get(@join_field)
          parent_value = event.sprintf(@parent)
          event.set(@join_field, { "name" => join_value, "parent" => parent_value })
          params[routing_field_name] = event.sprintf(@parent)
        else
          params[:parent] = event.sprintf(@parent)
        end
      end

      if action == 'update'
        params[:_upsert] = LogStash::Json.load(event.sprintf(@upsert)) if @upsert != ""
        params[:_script] = event.sprintf(@script) if @script != ""
        params[retry_on_conflict_action_name] = @retry_on_conflict
      end

      if @version
        params[:version] = event.sprintf(@version)
      end

      if @version_type
        params[:version_type] = event.sprintf(@version_type)
      end

      [action, params, event]
    end
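
    # A sketch of a resulting tuple for a plain index action against ES 7
    # (values are illustrative, not taken from this file):
    #   ["index", { :_id => nil, :_index => "logstash-2021.01.01", :routing => nil }, event]
    # The params hash becomes the metadata line of the corresponding bulk
    # request entry; the event itself is serialized separately in safe_bulk.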

    def setup_hosts
      @hosts = Array(@hosts)
      if @hosts.empty?
        @logger.info("No 'host' set in elasticsearch output. Defaulting to localhost")
        @hosts.replace(["localhost"])
      end
    end

    def hosts_default?(hosts)
      # NOTE: would be nice if pipeline allowed us a clean way to detect a config default :
      hosts.is_a?(Array) && hosts.size == 1 && hosts.first.equal?(CommonConfigs::DEFAULT_HOST)
    end
    private :hosts_default?

    def fill_hosts_from_cloud_id
      return unless @cloud_id

      if @hosts && !hosts_default?(@hosts)
        raise LogStash::ConfigurationError, 'Both cloud_id and hosts specified, please only use one of those.'
      end
      @hosts = parse_host_uri_from_cloud_id(@cloud_id)
    end

    def fill_user_password_from_cloud_auth
      return unless @cloud_auth

      if @user || @password
        raise LogStash::ConfigurationError, 'Both cloud_auth and user/password specified, please only use one.'
      end
      @user, @password = parse_user_password_from_cloud_auth(@cloud_auth)
      params['user'], params['password'] = @user, @password
    end
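
    # For orientation (a summary of the Elastic Cloud conventions, hedged here
    # rather than asserted as a contract): a cloud_id has the shape
    # "<label>:<base64 blob>", where the decoded blob encodes the cloud host
    # together with the Elasticsearch and Kibana cluster identifiers, and a
    # cloud_auth has the shape "<username>:<password>". The actual parsing
    # below is delegated to core's CloudSettingId and CloudSettingAuth
    # helpers when they are available.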
    def parse_host_uri_from_cloud_id(cloud_id)
      begin # might not be available on older LS
        require 'logstash/util/cloud_setting_id'
      rescue LoadError
        raise LogStash::ConfigurationError, 'The cloud_id setting is not supported by your version of Logstash, ' +
            'please upgrade your installation (or set hosts instead).'
      end

      begin
        cloud_id = LogStash::Util::CloudSettingId.new(cloud_id) # already appends ':{port}' to the host
      rescue ArgumentError => e
        raise LogStash::ConfigurationError, e.message.to_s.sub(/Cloud Id/i, 'cloud_id')
      end
      cloud_uri = "#{cloud_id.elasticsearch_scheme}://#{cloud_id.elasticsearch_host}"
      LogStash::Util::SafeURI.new(cloud_uri)
    end
    private :parse_host_uri_from_cloud_id

    def parse_user_password_from_cloud_auth(cloud_auth)
      begin # might not be available on older LS
        require 'logstash/util/cloud_setting_auth'
      rescue LoadError
        raise LogStash::ConfigurationError, 'The cloud_auth setting is not supported by your version of Logstash, ' +
            'please upgrade your installation (or set user/password instead).'
      end

      cloud_auth = cloud_auth.value if cloud_auth.is_a?(LogStash::Util::Password)
      begin
        cloud_auth = LogStash::Util::CloudSettingAuth.new(cloud_auth)
      rescue ArgumentError => e
        raise LogStash::ConfigurationError, e.message.to_s.sub(/Cloud Auth/i, 'cloud_auth')
      end
      [cloud_auth.username, cloud_auth.password]
    end
    private :parse_user_password_from_cloud_auth

    def maximum_seen_major_version
      client.maximum_seen_major_version
    end

    def routing_field_name
      maximum_seen_major_version >= 6 ? :routing : :_routing
    end

    def retry_on_conflict_action_name
      maximum_seen_major_version >= 7 ? :retry_on_conflict : :_retry_on_conflict
    end
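
    # Summary of the version-dependent parameter names chosen above:
    #   ES 5.x and earlier: :_routing and :_retry_on_conflict
    #   ES 6.x:             :routing  and :_retry_on_conflict
    #   ES 7.x and later:   :routing  and :retry_on_conflict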

    def install_template
      TemplateManager.install_template(self)
      @template_installed.make_true
    end

    def discover_cluster_uuid
      return unless defined?(plugin_metadata)
      cluster_info = client.get('/')
      plugin_metadata.set(:cluster_uuid, cluster_info['cluster_uuid'])
    rescue => e
      # TODO introducing this logging message breaks many tests that need refactoring
      # @logger.error("Unable to retrieve elasticsearch cluster uuid", error => e.message)
    end

    def check_action_validity
      raise LogStash::ConfigurationError, "No action specified!" unless @action

      # If we're using string interpolation, we're good!
      return if @action =~ /%{.+}/
      return if valid_actions.include?(@action)

      raise LogStash::ConfigurationError, "Action '#{@action}' is invalid! Pick one of #{valid_actions} or use a sprintf style statement"
    end

    # To be overridden by the -java version
    VALID_HTTP_ACTIONS = ["index", "delete", "create", "update"]
    def valid_actions
      VALID_HTTP_ACTIONS
    end
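
    # Example (hypothetical pipeline config, shown only to illustrate the
    # validity check above): a literal action such as
    #   action => "index"
    # must be one of VALID_HTTP_ACTIONS, while a sprintf-style action such as
    #   action => "%{[@metadata][es_action]}"
    # passes validation here and is resolved per event in event_action_tuple.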

    def retrying_submit(actions)
      # Initially we submit the full list of actions
      submit_actions = actions

      sleep_interval = @retry_initial_interval

      failed_retries = 0
      while submit_actions && submit_actions.length > 0

        # We retry with whatever didn't succeed
        begin
          submit_actions = submit(submit_actions)
          if submit_actions && submit_actions.size > 0
            @logger.info("Retrying individual bulk actions that failed or were rejected by the previous bulk request.", :count => submit_actions.size)
          end
        rescue => e
          @logger.error("Encountered an unexpected error submitting a bulk request! Will retry.",
                        :error_message => e.message,
                        :class => e.class.name,
                        :backtrace => e.backtrace)
        end

        # Everything was a success!
        break if !submit_actions || submit_actions.empty?

        failed_retries += 1
        if @retry_max_failures > 0 && failed_retries == @retry_max_failures
          @logger.warn("Giving up on individual bulk actions that failed or were rejected by the previous bulk request.",
                       :actions => submit_actions.map { |action_type, params, event| [action_type, params, event.to_s] },
                       :retries => failed_retries,
                       :count => submit_actions.size)
          break
        end

        # If we're retrying the action, sleep for the recommended interval.
        # Double the interval for the next time through to achieve exponential backoff.
        Stud.stoppable_sleep(sleep_interval) { @stopping.true? }
        sleep_interval = next_sleep_interval(sleep_interval)
      end
    end

    def sleep_for_interval(sleep_interval)
      Stud.stoppable_sleep(sleep_interval) { @stopping.true? }
      next_sleep_interval(sleep_interval)
    end

    def next_sleep_interval(current_interval)
      doubled = current_interval * 2
      doubled > @retry_max_interval ? @retry_max_interval : doubled
    end
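
    # For example, with @retry_initial_interval = 2 and @retry_max_interval = 64
    # (illustrative values, not defaults asserted here), successive waits are
    # 2, 4, 8, 16, 32, 64, 64, ... seconds: plain doubling, capped at the max.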

    def submit(actions)
      bulk_response = safe_bulk(actions)

      # If the response is nil that means we were in a retry loop
      # and aborted since we're shutting down
      return if bulk_response.nil?

      # If it did return and there are no errors we're good as well
      if bulk_response["errors"]
        @bulk_request_metrics.increment(:with_errors)
      else
        @bulk_request_metrics.increment(:successes)
        @document_level_metrics.increment(:successes, actions.size)
        return
      end

      actions_to_retry = []
      bulk_response["items"].each_with_index do |response, idx|
        action_type, action_props = response.first

        status = action_props["status"]
        failure = action_props["error"]

        action = actions[idx]
        action_params = action[1]

        # Retry logic: if it is a success, we move on. If it is a failure, there are 3 paths:
        # - For a 409, we log and drop; there is nothing we can do
        # - For a mapping error, we send it to the dead letter queue for a human to intervene at a later point
        # - For everything else we retry indefinitely; this should fix #572 and other transient network issues
        if DOC_SUCCESS_CODES.include?(status)
          @document_level_metrics.increment(:successes)
          next
        elsif DOC_CONFLICT_CODE == status
          @document_level_metrics.increment(:non_retryable_failures)
          @logger.warn "Failed action.", status: status, action: action, response: response if !failure_type_logging_whitelist.include?(failure["type"])
          next
        elsif DOC_DLQ_CODES.include?(status)
          handle_dlq_status("Could not index event to Elasticsearch.", action, status, response)
          @document_level_metrics.increment(:non_retryable_failures)
          next
        else
          # only log what the user whitelisted
          @document_level_metrics.increment(:retryable_failures)
          @logger.info "retrying failed action with response code: #{status} (#{failure})" if !failure_type_logging_whitelist.include?(failure["type"])
          actions_to_retry << action
        end
      end

      actions_to_retry
    end
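
    # The per-item shape walked above is the standard _bulk response format;
    # a failing item might look roughly like this (illustrative, abridged):
    #   { "index" => { "_index" => "logstash-2021.01.01", "status" => 429,
    #                  "error" => { "type" => "es_rejected_execution_exception" } } }
    # The outer key is the action type, and "status" drives the branches above.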

    def handle_dlq_status(message, action, status, response)
      # To support BWC, we check if the DLQ exists. Otherwise we log and drop the event (previous behavior).
      if @dlq_writer
        # TODO: Change this to send a map with { :status => status, :action => action } in the future
        @dlq_writer.write(action[2], "#{message} status: #{status}, action: #{action}, response: #{response}")
      else
        error_type = response.fetch('index', {}).fetch('error', {})['type']
        if 'invalid_index_name_exception' == error_type
          level = :error
        else
          level = :warn
        end
        @logger.send level, message, status: status, action: action, response: response
      end
    end

    # Determine the correct value for the 'type' field for the given event
    DEFAULT_EVENT_TYPE_ES6 = "doc".freeze
    DEFAULT_EVENT_TYPE_ES7 = "_doc".freeze
    def get_event_type(event)
      # Set the 'type' value for the index.
      type = if @document_type
               event.sprintf(@document_type)
             else
               if client.maximum_seen_major_version < 6
                 event.get("type") || DEFAULT_EVENT_TYPE_ES6
               elsif client.maximum_seen_major_version == 6
                 DEFAULT_EVENT_TYPE_ES6
               elsif client.maximum_seen_major_version == 7
                 DEFAULT_EVENT_TYPE_ES7
               else
                 nil
               end
             end

      if !(type.is_a?(String) || type.is_a?(Numeric))
        @logger.warn("Bad event type! Non-string/integer type value set!", :type_class => type.class, :type_value => type.to_s, :event => event)
      end

      type.to_s
    end
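
    # Default document type by ES major version, when document_type is unset:
    #   < 6:  the event's own "type" field, else "doc"
    #     6:  "doc"
    #     7:  "_doc"
    #   >= 8: none (use_event_type? keeps :_type out of the params entirely)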

    # Rescue retryable errors during bulk submission
    def safe_bulk(actions)
      sleep_interval = @retry_initial_interval
      begin
        es_actions = actions.map { |action_type, params, event| [action_type, params, event.to_hash] }
        response = @client.bulk(es_actions)
        response
      rescue ::LogStash::Outputs::ElasticSearch::HttpClient::Pool::HostUnreachableError => e
        # If we can't even connect to the server let's just print out the URL (:hosts is actually a URL)
        # and let the user sort it out from there
        @logger.error(
          "Attempted to send a bulk request to Elasticsearch," +
            " but Elasticsearch appears to be unreachable or down!",
          :error_message => e.message,
          :class => e.class.name,
          :will_retry_in_seconds => sleep_interval
        )
        @logger.debug("Failed actions for last bad bulk request!", :actions => actions)

        # We retry until there are no errors! Errors should all go to the retry queue
        sleep_interval = sleep_for_interval(sleep_interval)
        @bulk_request_metrics.increment(:failures)
        retry unless @stopping.true?
      rescue ::LogStash::Outputs::ElasticSearch::HttpClient::Pool::NoConnectionAvailableError => e
        @logger.error(
          "Attempted to send a bulk request to Elasticsearch, but there are no living connections in the connection pool. Perhaps Elasticsearch is unreachable or down?",
          :error_message => e.message,
          :class => e.class.name,
          :will_retry_in_seconds => sleep_interval
        )
        Stud.stoppable_sleep(sleep_interval) { @stopping.true? }
        sleep_interval = next_sleep_interval(sleep_interval)
        @bulk_request_metrics.increment(:failures)
        retry unless @stopping.true?
      rescue ::LogStash::Outputs::ElasticSearch::HttpClient::Pool::BadResponseCodeError => e
        @bulk_request_metrics.increment(:failures)
        log_hash = { :code => e.response_code, :url => e.url.sanitized.to_s }
        log_hash[:body] = e.response_body if @logger.debug? # Generally this is too verbose
        message = "Encountered a retryable error. Will retry with exponential backoff"

        # We treat 429s as a special case because these really aren't errors, but
        # rather just ES telling us to back off a bit, which we do.
        # The other retryable code is 503, which is a true error;
        # even though we retry, the user should be made aware of those.
        if e.response_code == 429
          logger.debug(message, log_hash)
        else
          logger.error(message, log_hash)
        end

        sleep_interval = sleep_for_interval(sleep_interval)
        retry
      rescue => e
        # Stuff that should never happen.
        # For all other errors print out full connection issues.
        @logger.error(
          "An unknown error occurred sending a bulk request to Elasticsearch. We will retry indefinitely",
          :error_message => e.message,
          :error_class => e.class.name,
          :backtrace => e.backtrace
        )
        @logger.debug("Failed actions for last bad bulk request!", :actions => actions)

        sleep_interval = sleep_for_interval(sleep_interval)
        @bulk_request_metrics.increment(:failures)
        retry unless @stopping.true?
      end
    end

    def default_index?(index)
      @index == LogStash::Outputs::ElasticSearch::CommonConfigs::DEFAULT_INDEX_NAME
    end

    def dlq_enabled?
      # TODO there should be a better way to query if DLQ is enabled
      # See more in: https://github.com/elastic/logstash/issues/8064
      respond_to?(:execution_context) && execution_context.respond_to?(:dlq_writer) &&
        !execution_context.dlq_writer.inner_writer.is_a?(::LogStash::Util::DummyDeadLetterQueueWriter)
    end
  end
end end end