Skip to content

Commit 6f2b2d9

Browse files
committed
[Rollup] Add more diagnostic stats to job (#35471)
This adds some new statistics to the job to help with debugging performance issues: - Total search and index time (in milliseconds) encountered by the indexer during runtime. This time is the total service time including transfer between nodes, not just the `took` time. - Total count of search and index requests. Together with the total times, this can be used to determine average request time. - Count of search/bulk failures encountered during runtime. This information is also in the log, but a runtime counter will help expose problems faster.
1 parent f2deb9a commit 6f2b2d9

File tree

13 files changed

+332
-39
lines changed

13 files changed

+332
-39
lines changed

client/rest-high-level/src/main/java/org/elasticsearch/client/rollup/GetRollupJobResponse.java

Lines changed: 93 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,15 @@
2626
import org.elasticsearch.common.xcontent.XContentParser;
2727

2828
import java.io.IOException;
29-
import java.util.Objects;
3029
import java.util.List;
3130
import java.util.Locale;
3231
import java.util.Map;
32+
import java.util.Objects;
3333

34-
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
35-
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg;
3634
import static java.util.Collections.unmodifiableList;
3735
import static java.util.stream.Collectors.joining;
36+
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
37+
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg;
3838

3939
/**
4040
* Response from rollup's get jobs api.
@@ -51,6 +51,12 @@ public class GetRollupJobResponse {
5151
static final ParseField STATE = new ParseField("job_state");
5252
static final ParseField CURRENT_POSITION = new ParseField("current_position");
5353
static final ParseField UPGRADED_DOC_ID = new ParseField("upgraded_doc_id");
54+
static final ParseField INDEX_TIME_IN_MS = new ParseField("index_time_in_ms");
55+
static final ParseField SEARCH_TIME_IN_MS = new ParseField("search_time_in_ms");
56+
static final ParseField INDEX_TOTAL = new ParseField("index_total");
57+
static final ParseField SEARCH_TOTAL = new ParseField("search_total");
58+
static final ParseField SEARCH_FAILURES = new ParseField("search_failures");
59+
static final ParseField INDEX_FAILURES = new ParseField("index_failures");
5460

5561
private List<JobWrapper> jobs;
5662

@@ -181,12 +187,25 @@ public static class RollupIndexerJobStats {
181187
private final long numInputDocuments;
182188
private final long numOuputDocuments;
183189
private final long numInvocations;
184-
185-
RollupIndexerJobStats(long numPages, long numInputDocuments, long numOuputDocuments, long numInvocations) {
190+
private long indexTime;
191+
private long indexTotal;
192+
private long searchTime;
193+
private long searchTotal;
194+
private long indexFailures;
195+
private long searchFailures;
196+
197+
RollupIndexerJobStats(long numPages, long numInputDocuments, long numOuputDocuments, long numInvocations,
198+
long indexTime, long indexTotal, long searchTime, long searchTotal, long indexFailures, long searchFailures) {
186199
this.numPages = numPages;
187200
this.numInputDocuments = numInputDocuments;
188201
this.numOuputDocuments = numOuputDocuments;
189202
this.numInvocations = numInvocations;
203+
this.indexTime = indexTime;
204+
this.indexTotal = indexTotal;
205+
this.searchTime = searchTime;
206+
this.searchTotal = searchTotal;
207+
this.indexFailures = indexFailures;
208+
this.searchFailures = searchFailures;
190209
}
191210

192211
/**
@@ -217,15 +236,65 @@ public long getOutputDocuments() {
217236
return numOuputDocuments;
218237
}
219238

239+
/**
240+
* Number of failures that have occurred during the bulk indexing phase of Rollup
241+
*/
242+
public long getIndexFailures() {
243+
return indexFailures;
244+
}
245+
246+
/**
247+
* Number of failures that have occurred during the search phase of Rollup
248+
*/
249+
public long getSearchFailures() {
250+
return searchFailures;
251+
}
252+
253+
/**
254+
* Returns the time spent indexing (cumulative) in milliseconds
255+
*/
256+
public long getIndexTime() {
257+
return indexTime;
258+
}
259+
260+
/**
261+
* Returns the time spent searching (cumulative) in milliseconds
262+
*/
263+
public long getSearchTime() {
264+
return searchTime;
265+
}
266+
267+
/**
268+
* Returns the total number of indexing requests that have been sent by the rollup job
269+
* (Note: this is not the number of _documents_ that have been indexed)
270+
*/
271+
public long getIndexTotal() {
272+
return indexTotal;
273+
}
274+
275+
/**
276+
* Returns the total number of search requests that have been sent by the rollup job
277+
*/
278+
public long getSearchTotal() {
279+
return searchTotal;
280+
}
281+
220282
private static final ConstructingObjectParser<RollupIndexerJobStats, Void> PARSER = new ConstructingObjectParser<>(
221283
STATS.getPreferredName(),
222284
true,
223-
args -> new RollupIndexerJobStats((long) args[0], (long) args[1], (long) args[2], (long) args[3]));
285+
args -> new RollupIndexerJobStats((long) args[0], (long) args[1], (long) args[2], (long) args[3],
286+
(long) args[4], (long) args[5], (long) args[6], (long) args[7], (long) args[8], (long) args[9]));
224287
static {
225288
PARSER.declareLong(constructorArg(), NUM_PAGES);
226289
PARSER.declareLong(constructorArg(), NUM_INPUT_DOCUMENTS);
227290
PARSER.declareLong(constructorArg(), NUM_OUTPUT_DOCUMENTS);
228291
PARSER.declareLong(constructorArg(), NUM_INVOCATIONS);
292+
PARSER.declareLong(constructorArg(), INDEX_TIME_IN_MS);
293+
PARSER.declareLong(constructorArg(), INDEX_TOTAL);
294+
PARSER.declareLong(constructorArg(), SEARCH_TIME_IN_MS);
295+
PARSER.declareLong(constructorArg(), SEARCH_TOTAL);
296+
PARSER.declareLong(constructorArg(), INDEX_FAILURES);
297+
PARSER.declareLong(constructorArg(), SEARCH_FAILURES);
229298
}
230299

231300
@Override
@@ -234,22 +303,35 @@ public boolean equals(Object other) {
234303
if (other == null || getClass() != other.getClass()) return false;
235304
RollupIndexerJobStats that = (RollupIndexerJobStats) other;
236305
return Objects.equals(this.numPages, that.numPages)
237-
&& Objects.equals(this.numInputDocuments, that.numInputDocuments)
238-
&& Objects.equals(this.numOuputDocuments, that.numOuputDocuments)
239-
&& Objects.equals(this.numInvocations, that.numInvocations);
306+
&& Objects.equals(this.numInputDocuments, that.numInputDocuments)
307+
&& Objects.equals(this.numOuputDocuments, that.numOuputDocuments)
308+
&& Objects.equals(this.numInvocations, that.numInvocations)
309+
&& Objects.equals(this.indexTime, that.indexTime)
310+
&& Objects.equals(this.searchTime, that.searchTime)
311+
&& Objects.equals(this.indexFailures, that.indexFailures)
312+
&& Objects.equals(this.searchFailures, that.searchFailures)
313+
&& Objects.equals(this.searchTotal, that.searchTotal)
314+
&& Objects.equals(this.indexTotal, that.indexTotal);
240315
}
241316

242317
@Override
243318
public int hashCode() {
244-
return Objects.hash(numPages, numInputDocuments, numOuputDocuments, numInvocations);
319+
return Objects.hash(numPages, numInputDocuments, numOuputDocuments, numInvocations,
320+
indexTime, searchTime, indexFailures, searchFailures, searchTotal, indexTotal);
245321
}
246322

247323
@Override
248324
public final String toString() {
249325
return "{pages=" + numPages
250326
+ ", input_docs=" + numInputDocuments
251327
+ ", output_docs=" + numOuputDocuments
252-
+ ", invocations=" + numInvocations + "}";
328+
+ ", invocations=" + numInvocations
329+
+ ", index_failures=" + indexFailures
330+
+ ", search_failures=" + searchFailures
331+
+ ", index_time_in_ms=" + indexTime
332+
+ ", index_total=" + indexTotal
333+
+ ", search_time_in_ms=" + searchTime
334+
+ ", search_total=" + searchTotal+ "}";
253335
}
254336
}
255337

client/rest-high-level/src/test/java/org/elasticsearch/client/rollup/GetRollupJobResponseTests.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@ private GetRollupJobResponse createTestInstance() {
6262
}
6363

6464
private RollupIndexerJobStats randomStats() {
65-
return new RollupIndexerJobStats(randomLong(), randomLong(), randomLong(), randomLong());
65+
return new RollupIndexerJobStats(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong(),
66+
randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong(),
67+
randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong());
6668
}
6769

6870
private RollupJobStatus randomStatus() {
@@ -115,6 +117,13 @@ public void toXContent(RollupIndexerJobStats stats, XContentBuilder builder, ToX
115117
builder.field(GetRollupJobResponse.NUM_INPUT_DOCUMENTS.getPreferredName(), stats.getNumDocuments());
116118
builder.field(GetRollupJobResponse.NUM_OUTPUT_DOCUMENTS.getPreferredName(), stats.getOutputDocuments());
117119
builder.field(GetRollupJobResponse.NUM_INVOCATIONS.getPreferredName(), stats.getNumInvocations());
120+
builder.field(GetRollupJobResponse.INDEX_TIME_IN_MS.getPreferredName(), stats.getIndexTime());
121+
builder.field(GetRollupJobResponse.INDEX_TOTAL.getPreferredName(), stats.getIndexTotal());
122+
builder.field(GetRollupJobResponse.INDEX_FAILURES.getPreferredName(), stats.getIndexFailures());
123+
builder.field(GetRollupJobResponse.SEARCH_TIME_IN_MS.getPreferredName(), stats.getSearchTime());
124+
builder.field(GetRollupJobResponse.SEARCH_TOTAL.getPreferredName(), stats.getSearchTotal());
125+
builder.field(GetRollupJobResponse.SEARCH_FAILURES.getPreferredName(), stats.getSearchFailures());
118126
builder.endObject();
119127
}
128+
120129
}

docs/reference/rollup/apis/get-job.asciidoc

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,13 @@ Which will yield the following response:
101101
"pages_processed" : 0,
102102
"documents_processed" : 0,
103103
"rollups_indexed" : 0,
104-
"trigger_count" : 0
104+
"trigger_count" : 0,
105+
"index_failures": 0,
106+
"index_time_in_ms": 0,
107+
"index_total": 0,
108+
"search_failures": 0,
109+
"search_time_in_ms": 0,
110+
"search_total": 0
105111
}
106112
}
107113
]
@@ -221,7 +227,13 @@ Which will yield the following response:
221227
"pages_processed" : 0,
222228
"documents_processed" : 0,
223229
"rollups_indexed" : 0,
224-
"trigger_count" : 0
230+
"trigger_count" : 0,
231+
"index_failures": 0,
232+
"index_time_in_ms": 0,
233+
"index_total": 0,
234+
"search_failures": 0,
235+
"search_time_in_ms": 0,
236+
"search_total": 0
225237
}
226238
},
227239
{
@@ -270,7 +282,13 @@ Which will yield the following response:
270282
"pages_processed" : 0,
271283
"documents_processed" : 0,
272284
"rollups_indexed" : 0,
273-
"trigger_count" : 0
285+
"trigger_count" : 0,
286+
"index_failures": 0,
287+
"index_time_in_ms": 0,
288+
"index_total": 0,
289+
"search_failures": 0,
290+
"search_time_in_ms": 0,
291+
"search_total": 0
274292
}
275293
}
276294
]

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/indexing/AsyncTwoPhaseIndexer.java

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,10 @@ public synchronized boolean maybeTriggerAsyncJob(long now) {
153153
// fire off the search. Note this is async, the method will return from here
154154
executor.execute(() -> {
155155
try {
156-
doNextSearch(buildSearchRequest(), ActionListener.wrap(this::onSearchResponse, exc -> finishWithFailure(exc)));
156+
stats.markStartSearch();
157+
doNextSearch(buildSearchRequest(), ActionListener.wrap(this::onSearchResponse, this::finishWithSearchFailure));
157158
} catch (Exception e) {
158-
finishWithFailure(e);
159+
finishWithSearchFailure(e);
159160
}
160161
});
161162
logger.debug("Beginning to index [" + getJobId() + "], state: [" + currentState + "]");
@@ -256,7 +257,13 @@ public synchronized boolean maybeTriggerAsyncJob(long now) {
256257
*/
257258
protected abstract void onAbort();
258259

259-
private void finishWithFailure(Exception exc) {
260+
private void finishWithSearchFailure(Exception exc) {
261+
stats.incrementSearchFailures();
262+
doSaveState(finishAndSetState(), position.get(), () -> onFailure(exc));
263+
}
264+
265+
private void finishWithIndexingFailure(Exception exc) {
266+
stats.incrementIndexingFailures();
260267
doSaveState(finishAndSetState(), position.get(), () -> onFailure(exc));
261268
}
262269

@@ -291,6 +298,7 @@ private IndexerState finishAndSetState() {
291298
}
292299

293300
private void onSearchResponse(SearchResponse searchResponse) {
301+
stats.markEndSearch();
294302
try {
295303
if (checkState(getState()) == false) {
296304
return;
@@ -320,6 +328,7 @@ private void onSearchResponse(SearchResponse searchResponse) {
320328
// TODO this might be a valid case, e.g. if implementation filters
321329
assert bulkRequest.requests().size() > 0;
322330

331+
stats.markStartIndexing();
323332
doNextBulk(bulkRequest, ActionListener.wrap(bulkResponse -> {
324333
// TODO we should check items in the response and move after accordingly to
325334
// resume the failing buckets ?
@@ -335,24 +344,24 @@ private void onSearchResponse(SearchResponse searchResponse) {
335344
position.set(newPosition);
336345

337346
onBulkResponse(bulkResponse, newPosition);
338-
}, exc -> finishWithFailure(exc)));
347+
}, this::finishWithIndexingFailure));
339348
} catch (Exception e) {
340-
finishWithFailure(e);
349+
finishWithSearchFailure(e);
341350
}
342351
}
343352

344353
private void onBulkResponse(BulkResponse response, JobPosition position) {
354+
stats.markEndIndexing();
345355
try {
346-
347-
ActionListener<SearchResponse> listener = ActionListener.wrap(this::onSearchResponse, this::finishWithFailure);
356+
ActionListener<SearchResponse> listener = ActionListener.wrap(this::onSearchResponse, this::finishWithSearchFailure);
348357
// TODO probably something more intelligent than every-50 is needed
349358
if (stats.getNumPages() > 0 && stats.getNumPages() % 50 == 0) {
350359
doSaveState(IndexerState.INDEXING, position, () -> doNextSearch(buildSearchRequest(), listener));
351360
} else {
352361
doNextSearch(buildSearchRequest(), listener);
353362
}
354363
} catch (Exception e) {
355-
finishWithFailure(e);
364+
finishWithIndexingFailure(e);
356365
}
357366
}
358367

0 commit comments

Comments
 (0)