from __future__ import unicode_literals

import logging
-from itertools import islice
from operator import methodcaller

from ..exceptions import ElasticsearchException, TransportError
@@ -40,22 +39,35 @@ def expand_action(data):

    return action, data.get('_source', data)

-def _chunk_actions(actions, chunk_size, serializer):
-    while True:
-        bulk_actions = []
-        for action, data in islice(actions, chunk_size):
-            bulk_actions.append(serializer.dumps(action))
-            if data is not None:
-                bulk_actions.append(serializer.dumps(data))
-
-        if not bulk_actions:
-            return
-
+def _chunk_actions(actions, chunk_size, max_chunk_bytes, serializer):
+    bulk_actions = []
+    size, action_count = 0, 0
+    for action, data in actions:
+        action = serializer.dumps(action)
+        cur_size = len(action) + 1
+
+        if data is not None:
+            data = serializer.dumps(data)
+            cur_size += len(data) + 1
+
+        # full chunk, send it and start a new one
+        if bulk_actions and (size + cur_size > max_chunk_bytes or action_count == chunk_size):
+            yield bulk_actions
+            bulk_actions = []
+            size, action_count = 0, 0
+
+        bulk_actions.append(action)
+        if data is not None:
+            bulk_actions.append(data)
+        size += cur_size
+        action_count += 1
+
+    if bulk_actions:
        yield bulk_actions
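For reference, a minimal standalone sketch of the new chunking behaviour (not part of the diff; the generator body mirrors the code above, with the json module standing in for the client's serializer and made-up demo data):

import json

def chunk_actions_demo(actions, chunk_size, max_chunk_bytes, serializer=json):
    # accumulate serialized lines until either the action count or the
    # byte budget (serialized length plus one newline per line) is hit
    bulk_actions = []
    size, action_count = 0, 0
    for action, data in actions:
        action = serializer.dumps(action)
        cur_size = len(action) + 1

        if data is not None:
            data = serializer.dumps(data)
            cur_size += len(data) + 1

        # full chunk, send it and start a new one
        if bulk_actions and (size + cur_size > max_chunk_bytes or action_count == chunk_size):
            yield bulk_actions
            bulk_actions = []
            size, action_count = 0, 0

        bulk_actions.append(action)
        if data is not None:
            bulk_actions.append(data)
        size += cur_size
        action_count += 1

    if bulk_actions:
        yield bulk_actions

# chunk_size=3 would allow three actions per chunk, but the tight byte
# budget forces a split after every single action here
demo = [({"index": {"_id": i}}, {"field": "x" * 30}) for i in range(4)]
for chunk in chunk_actions_demo(iter(demo), chunk_size=3, max_chunk_bytes=120):
    print(len(chunk) // 2, "action(s),", sum(len(line) + 1 for line in chunk), "bytes")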

-def streaming_bulk(client, actions, chunk_size=500, raise_on_error=True,
-        expand_action_callback=expand_action, raise_on_exception=True,
-        **kwargs):
+def streaming_bulk(client, actions, chunk_size=500, max_chunk_bytes=100 * 1024 * 1024,
+        raise_on_error=True, expand_action_callback=expand_action,
+        raise_on_exception=True, **kwargs):
    """
    Streaming bulk consumes actions from the iterable passed in and yields
    results per action. For non-streaming use cases use
@@ -101,6 +113,7 @@ def streaming_bulk(client, actions, chunk_size=500, raise_on_error=True,
    :arg client: instance of :class:`~elasticsearch.Elasticsearch` to use
    :arg actions: iterable containing the actions to be executed
    :arg chunk_size: number of docs in one chunk sent to es (default: 500)
+    :arg max_chunk_bytes: the maximum size of the request in bytes (default: 100MB)
    :arg raise_on_error: raise ``BulkIndexError`` containing errors (as `.errors`)
        from the execution of the last chunk when some occur. By default we raise.
    :arg raise_on_exception: if ``False`` then don't propagate exceptions from
@@ -115,7 +128,7 @@ def streaming_bulk(client, actions, chunk_size=500, raise_on_error=True,
    # if raise on error is set, we need to collect errors per chunk before raising them
    errors = []

-    for bulk_actions in _chunk_actions(actions, chunk_size, serializer):
+    for bulk_actions in _chunk_actions(actions, chunk_size, max_chunk_bytes, serializer):
        try:
            # send the actual request
            resp = client.bulk('\n'.join(bulk_actions) + '\n', **kwargs)
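A hedged usage sketch of the new parameter (assumes a node on localhost and the helpers module this diff belongs to; the 'demo' index name and document shape are made up):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch()  # assumes a local Elasticsearch node

# generator of actions; streaming_bulk expands these via expand_action
docs = ({'_index': 'demo', '_type': 'doc', '_id': i, 'value': 'x' * 1024}
        for i in range(100000))

# cap every bulk request at ~10MB, even when fewer than 500 docs fit
for ok, result in streaming_bulk(es, docs, chunk_size=500,
                                 max_chunk_bytes=10 * 1024 * 1024):
    if not ok:
        print('failed:', result)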