|
| 1 | +# encoding: utf-8 |
| 2 | + |
1 | 3 | module Elasticsearch
|
2 | 4 | module Extensions
|
3 |
| - # Reindex using the scroll api. This moves data (not mappings) from one index |
4 |
| - # to another. The target index can be on a different cluster. |
5 |
| - # |
6 |
| - # This is useful when updating mappings on existing fields in an index (eg with |
7 |
| - # new analyzers). |
8 |
| - # |
9 |
| - # @example Reindex all documents under a new index name |
| 5 | + |
| 6 | + # This module allows copying documents from one index/cluster to another one |
10 | 7 | #
|
11 |
| - # Elasticsearch::Extensions::Reindex.new client: client, src_index: 'foo', target_index: 'bar' |
| 8 | + # When required together with the client, it will add the `reindex` method |
12 | 9 | #
|
13 |
| - # @see https://www.elastic.co/guide/en/elasticsearch/guide/current/reindex.html |
| 10 | + # @see Reindex::Reindex.initialize |
| 11 | + # @see Reindex::Reindex#perform |
14 | 12 | #
|
15 |
| - # @option arguments [Client] :client (*Required*) |
16 |
| - # @option arguments [String] :src_index (*Required*) |
17 |
| - # @option arguments [String] :target_index (*Required*) |
18 |
| - # @option arguments [Client] :target_client |
19 |
| - # @option arguments [Int] :chunk_size |
20 |
| - # @option arguments [String] :period period to ask es to keep scroll buffer open '5m' |
| 13 | + # @see http://www.rubydoc.info/gems/elasticsearch-api/Elasticsearch/API/Actions#reindex-instance_method |
21 | 14 | #
|
22 |
| - class Reindex |
23 |
| - def initialize(opts = {}) |
24 |
| - raise ArgumentError, "Required argument 'client' missing" unless opts[:client] |
25 |
| - raise ArgumentError, "Required argument 'src_index' missing" unless opts[:src_index] |
26 |
| - raise ArgumentError, "Required argument 'target_index' missing" unless opts[:target_index] |
27 |
| - |
28 |
| - valid_params = [ |
29 |
| - :client, |
30 |
| - :src_index, |
31 |
| - :target_index, |
32 |
| - :target_client, |
33 |
| - :chunk_size, |
34 |
| - :period |
35 |
| - ] |
36 |
| - |
37 |
| - default_params = { |
38 |
| - chunk_size: 500, |
39 |
| - period: '5m' |
40 |
| - } |
41 |
| - |
42 |
| - opts.each { |k, v| raise ArgumentError unless valid_params.include?(k) } |
43 |
| - params = default_params.merge(opts) |
44 |
| - client = params[:client] |
45 |
| - target_client = params[:target_client] || client |
46 |
| - |
47 |
| - r = client.search(index: params[:src_index], |
48 |
| - search_type: 'scan', |
49 |
| - scroll: params[:period], |
50 |
| - size: params[:chunk_size]) |
51 |
| - |
52 |
| - while r = client.scroll(scroll_id: r['_scroll_id'], scroll: params[:period]) do |
53 |
| - docs = r['hits']['hits'] |
54 |
| - break if docs.empty? |
55 |
| - body = docs.map do |doc| |
56 |
| - doc['_index'] = params[:target_index] |
57 |
| - doc['data'] = doc['_source'] |
58 |
| - doc.delete('_score') |
59 |
| - doc.delete('_source') |
60 |
| - { index: doc } |
| 15 | + module Reindex |
| 16 | + |
| 17 | + # Initialize a new instance of the Reindex class (shortcut) |
| 18 | + # |
| 19 | + # @see Reindex::Reindex.initialize |
| 20 | + # |
| 21 | + def new(arguments={}) |
| 22 | + Reindex.new(arguments) |
| 23 | + end; extend self |
| 24 | + |
| 25 | + module API |
| 26 | + # Copy documents from one index into another and refresh the target index |
| 27 | + # |
| 28 | + # @example |
| 29 | + # client.reindex source: { index: 'test1' }, target: { index: 'test2' }, refresh: true |
| 30 | + # |
| 31 | + # The method allows all the options as {Reindex::Reindex.new}. |
| 32 | + # |
| 33 | + # This method will be mixed into the Elasticsearch client's API, if available. |
| 34 | + # |
| 35 | + def reindex(arguments={}) |
| 36 | + arguments[:source] ||= {} |
| 37 | + arguments[:source][:client] = self |
| 38 | + Reindex.new(arguments).perform |
| 39 | + end |
| 40 | + end |
| 41 | + |
| 42 | + # Include the `reindex` method in the API and client, if available |
| 43 | + Elasticsearch::API::Actions.__send__ :include, API if defined?(Elasticsearch::API::Actions) |
| 44 | + Elasticsearch::Transport::Client.__send__ :include, API if defined?(Elasticsearch::Transport::Client) && defined?(Elasticsearch::API) |
| 45 | + |
| 46 | + # Copy documents from one index into another |
| 47 | + # |
| 48 | + # @example Copy documents to another index |
| 49 | + # |
| 50 | + # client = Elasticsearch::Client.new |
| 51 | + # reindex = Elasticsearch::Extensions::Reindex.new \ |
| 52 | + # source: { index: 'test1', client: client }, |
| 53 | + # target: { index: 'test2' } |
| 54 | + # |
| 55 | + # reindex.perform |
| 56 | + # |
| 57 | + # @example Copy documents to a different cluster |
| 58 | + # |
| 59 | + # source_client = Elasticsearch::Client.new url: 'http://localhost:9200' |
| 60 | + # target_client = Elasticsearch::Client.new url: 'http://localhost:9250' |
| 61 | + # |
| 62 | + # reindex = Elasticsearch::Extensions::Reindex.new \ |
| 63 | + # source: { index: 'test', client: source_client }, |
| 64 | + # target: { index: 'test', client: target_client } |
| 65 | + # reindex.perform |
| 66 | + # |
| 67 | + # @example Transform the documents during re-indexing |
| 68 | + # |
| 69 | + # reindex = Elasticsearch::Extensions::Reindex.new \ |
| 70 | + # source: { index: 'test1', client: client }, |
| 71 | + # target: { index: 'test2' }, |
| 72 | + # transform: lambda { |doc| doc['_source']['category'].upcase! } |
| 73 | + # |
| 74 | + # The reindexing process works by "scrolling" an index and sending |
| 75 | + # batches via the "Bulk" API to the target index/cluster |
| 76 | + # |
| 77 | + # @option arguments [String] :source The source index/cluster definition (*Required*) |
| 78 | + # @option arguments [String] :target The target index/cluster definition (*Required*) |
| 79 | + # @option arguments [Integer] :batch_size The size of the batch for scroll operation (Default: 1000) |
| 80 | + # @option arguments [String] :scroll The timeout for the scroll operation (Default: 5min) |
| 81 | + # @option arguments [Boolean] :refresh Whether to refresh the target index after |
| 82 | + # the operation is completed (Default: false) |
| 83 | + # @option arguments [Proc] :transform A block which will be executed for each document |
| 84 | + # |
| 85 | + # Be aware, that if you want to change the target index settings and/or mappings, |
| 86 | + # you have to do so in advance by using the "Indices Create" API. |
| 87 | + # |
| 88 | + # Note, that there is a native "Reindex" API in Elasticsearch 2.3.x and higer versions, |
| 89 | + # which will be more performant than the Ruby version. |
| 90 | + # |
| 91 | + # @see http://www.rubydoc.info/gems/elasticsearch-api/Elasticsearch/API/Actions#reindex-instance_method |
| 92 | + # |
| 93 | + class Reindex |
| 94 | + attr_reader :arguments |
| 95 | + |
| 96 | + def initialize(arguments={}) |
| 97 | + [ |
| 98 | + [:source, :index], |
| 99 | + [:source, :client], |
| 100 | + [:target, :index] |
| 101 | + ].each do |required_option| |
| 102 | + value = required_option.reduce(arguments) { |sum, o| sum = sum[o] ? sum[o] : {} } |
| 103 | + |
| 104 | + raise ArgumentError, |
| 105 | + "Required argument '#{Hash[*required_option]}' missing" if \ |
| 106 | + value.respond_to?(:empty?) ? value.empty? : value.nil? |
| 107 | + end |
| 108 | + |
| 109 | + @arguments = { |
| 110 | + batch_size: 1000, |
| 111 | + scroll: '5m', |
| 112 | + refresh: false |
| 113 | + }.merge(arguments) |
| 114 | + |
| 115 | + arguments[:target][:client] ||= arguments[:source][:client] |
| 116 | + end |
| 117 | + |
| 118 | + # Performs the operation |
| 119 | + # |
| 120 | + # @return [Hash] A Hash with the information about the operation outcome |
| 121 | + # |
| 122 | + def perform |
| 123 | + output = { errors: 0 } |
| 124 | + |
| 125 | + response = arguments[:source][:client].search( |
| 126 | + index: arguments[:source][:index], |
| 127 | + scroll: arguments[:scroll], |
| 128 | + size: arguments[:batch_size], |
| 129 | + search_type: 'scan', |
| 130 | + fields: ['_source', '_parent', '_routing', '_timestamp'] |
| 131 | + ) |
| 132 | + |
| 133 | + while response = arguments[:source][:client].scroll(scroll_id: response['_scroll_id'], scroll: arguments[:scroll]) do |
| 134 | + documents = response['hits']['hits'] |
| 135 | + break if documents.empty? |
| 136 | + |
| 137 | + bulk = documents.map do |doc| |
| 138 | + doc['_index'] = arguments[:target][:index] |
| 139 | + |
| 140 | + arguments[:target][:transform].call(doc) if arguments[:target][:transform] |
| 141 | + |
| 142 | + doc['data'] = doc['_source'] |
| 143 | + doc.delete('_score') |
| 144 | + doc.delete('_source') |
| 145 | + |
| 146 | + { index: doc } |
| 147 | + end |
| 148 | + |
| 149 | + bulk_response = arguments[:target][:client].bulk body: bulk |
| 150 | + output[:errors] += bulk_response['items'].select { |k, v| k.values.first['error'] }.size |
61 | 151 | end
|
62 |
| - target_client.bulk body: body |
| 152 | + |
| 153 | + arguments[:target][:client].indices.refresh index: arguments[:target][:index] if arguments[:refresh] |
| 154 | + |
| 155 | + output |
63 | 156 | end
|
64 | 157 | end
|
65 | 158 | end
|
|
0 commit comments