7
7
8
8
import com .ibm .icu .text .CharsetDetector ;
9
9
import com .ibm .icu .text .CharsetMatch ;
10
+ import org .elasticsearch .ElasticsearchTimeoutException ;
10
11
import org .elasticsearch .common .collect .Tuple ;
12
+ import org .elasticsearch .common .unit .TimeValue ;
11
13
12
14
import java .io .BufferedInputStream ;
13
15
import java .io .BufferedReader ;
23
25
import java .util .HashSet ;
24
26
import java .util .List ;
25
27
import java .util .Locale ;
28
+ import java .util .Objects ;
26
29
import java .util .Optional ;
27
30
import java .util .Set ;
31
+ import java .util .concurrent .ScheduledExecutorService ;
28
32
import java .util .stream .Collectors ;
29
33
30
34
/**
31
35
* Runs the high-level steps needed to create ingest configs for the specified file. In order:
32
36
* 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
33
37
* 2. Load a sample of the file, consisting of the first 1000 lines of the file
34
- * 3. Determine the most likely file structure - one of ND-JSON, XML, CSV, TSV or semi-structured text
38
+ * 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
35
39
* 4. Create an appropriate structure object and delegate writing configs to it
36
40
*/
37
41
public final class FileStructureFinderManager {
@@ -81,8 +85,18 @@ public final class FileStructureFinderManager {
81
85
82
86
private static final int BUFFER_SIZE = 8192 ;
83
87
88
// Passed to the TimeoutChecker that findFileStructure creates for each analysis.
+ private final ScheduledExecutorService scheduler ;
89
+
90
+ /**
91
+ * Create the file structure manager.
92
+ * @param scheduler Used for checking timeouts.
93
+ */
94
+ public FileStructureFinderManager (ScheduledExecutorService scheduler ) {
95
// Objects.requireNonNull rejects a null scheduler here, at construction time.
+ this .scheduler = Objects .requireNonNull (scheduler );
96
+ }
97
+
84
98
/**
 * Convenience overload: analyses the input with no overrides and without a timeout value.
 * Delegates to the four-argument overload, passing {@code FileStructureOverrides.EMPTY_OVERRIDES}
 * and {@code null} for the timeout.
 * @param idealSampleLineCount Desired number of sample lines. May be <code>null</code>, in which case the
 *                             delegate substitutes <code>DEFAULT_IDEAL_SAMPLE_LINE_COUNT</code>.
 * @param fromFile Stream from which the data to analyse is read.
 * @return A {@link FileStructureFinder} for the input.
 * @throws Exception Propagated unchanged from the delegate.
 */
public FileStructureFinder findFileStructure (Integer idealSampleLineCount , InputStream fromFile ) throws Exception {
85
- return findFileStructure (idealSampleLineCount , fromFile , FileStructureOverrides .EMPTY_OVERRIDES );
99
+ return findFileStructure (idealSampleLineCount , fromFile , FileStructureOverrides .EMPTY_OVERRIDES , null );
86
100
}
87
101
88
102
/**
@@ -95,42 +109,49 @@ public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Input
95
109
* @param overrides Aspects of the file structure that are known in advance. These take precedence over
96
110
* values determined by structure analysis. An exception will be thrown if the file structure
97
111
* is incompatible with an overridden value.
112
+ * @param timeout The maximum time the analysis is permitted to take. If it takes longer than this an
113
+ * {@link ElasticsearchTimeoutException} may be thrown (although not necessarily immediately
114
+ * the timeout is exceeded).
98
115
* @return A {@link FileStructureFinder} object from which the structure and messages can be queried.
99
116
* @throws Exception A variety of problems could occur at various stages of the structure finding process.
100
117
*/
101
- public FileStructureFinder findFileStructure (Integer idealSampleLineCount , InputStream fromFile , FileStructureOverrides overrides )
118
+ public FileStructureFinder findFileStructure (Integer idealSampleLineCount , InputStream fromFile , FileStructureOverrides overrides ,
119
+ TimeValue timeout )
102
120
throws Exception {
103
121
// The explanation is collected in a fresh list created here, and a null
// idealSampleLineCount falls back to DEFAULT_IDEAL_SAMPLE_LINE_COUNT.
return findFileStructure (new ArrayList <>(), (idealSampleLineCount == null ) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount ,
104
- fromFile , overrides );
122
+ fromFile , overrides , timeout );
105
123
}
106
124
107
125
/**
 * Convenience overload: analyses the input with no overrides and without a timeout value.
 * Delegates to the five-argument overload, passing {@code FileStructureOverrides.EMPTY_OVERRIDES}
 * and {@code null} for the timeout.
 * @param explanation List forwarded to the delegate, which appends messages to it.
 * @param idealSampleLineCount Desired number of sample lines.
 * @param fromFile Stream from which the data to analyse is read.
 * @return A {@link FileStructureFinder} for the input.
 * @throws Exception Propagated unchanged from the delegate.
 */
public FileStructureFinder findFileStructure (List <String > explanation , int idealSampleLineCount , InputStream fromFile )
108
126
throws Exception {
109
- return findFileStructure (new ArrayList <>() , idealSampleLineCount , fromFile , FileStructureOverrides .EMPTY_OVERRIDES );
127
+ return findFileStructure (explanation , idealSampleLineCount , fromFile , FileStructureOverrides .EMPTY_OVERRIDES , null );
110
128
}
111
129
112
130
/**
 * Runs the structure analysis: chooses a character set, samples the input and picks a structure finder.
 * The character set is taken from <code>overrides</code> when one is specified there; otherwise it is
 * detected with <code>findCharset</code>. The sample is then read with <code>sampleFile</code> and handed to
 * <code>makeBestStructureFinder</code>. All three steps share one <code>TimeoutChecker</code>, which is
 * closed when this method exits because it is declared in a try-with-resources statement.
 * @param explanation List to which messages are appended, e.g. which character encoding was used.
 * @param idealSampleLineCount Desired number of sample lines; values below
 *                             <code>MIN_SAMPLE_LINE_COUNT</code> are raised to that minimum.
 * @param fromFile Stream from which the data to analyse is read.
 * @param overrides Known aspects of the structure; its character set, if non-null, is used as is.
 * @param timeout Passed to the <code>TimeoutChecker</code>. The overloads without a timeout parameter pass
 *                <code>null</code> - presumably meaning no limit; confirm against <code>TimeoutChecker</code>.
 * @return The {@link FileStructureFinder} returned by <code>makeBestStructureFinder</code>.
 * @throws Exception If any step fails, e.g. the specified character set does not exist.
 */
public FileStructureFinder findFileStructure (List <String > explanation , int idealSampleLineCount , InputStream fromFile ,
113
- FileStructureOverrides overrides ) throws Exception {
114
-
115
- String charsetName = overrides .getCharset ();
116
- Reader sampleReader ;
117
- if (charsetName != null ) {
118
- // Creating the reader will throw if the specified character set does not exist
119
- sampleReader = new InputStreamReader (fromFile , charsetName );
120
- explanation .add ("Using specified character encoding [" + charsetName + "]" );
121
- } else {
122
- CharsetMatch charsetMatch = findCharset (explanation , fromFile );
123
- charsetName = charsetMatch .getName ();
124
- sampleReader = charsetMatch .getReader ();
125
- }
131
+ FileStructureOverrides overrides , TimeValue timeout ) throws Exception {
132
+
133
// One checker covers the whole analysis; try-with-resources closes it on every exit path.
+ try (TimeoutChecker timeoutChecker = new TimeoutChecker ("structure analysis" , timeout , scheduler )) {
134
+
135
+ String charsetName = overrides .getCharset ();
136
+ Reader sampleReader ;
137
+ if (charsetName != null ) {
138
+ // Creating the reader will throw if the specified character set does not exist
139
+ sampleReader = new InputStreamReader (fromFile , charsetName );
140
+ explanation .add ("Using specified character encoding [" + charsetName + "]" );
141
+ } else {
142
+ CharsetMatch charsetMatch = findCharset (explanation , fromFile , timeoutChecker );
143
+ charsetName = charsetMatch .getName ();
144
+ sampleReader = charsetMatch .getReader ();
145
+ }
126
146
127
- Tuple <String , Boolean > sampleInfo = sampleFile (sampleReader , charsetName , MIN_SAMPLE_LINE_COUNT ,
128
- Math .max (MIN_SAMPLE_LINE_COUNT , idealSampleLineCount ));
147
// Math.max keeps the upper line limit from dropping below MIN_SAMPLE_LINE_COUNT.
+ Tuple <String , Boolean > sampleInfo = sampleFile (sampleReader , charsetName , MIN_SAMPLE_LINE_COUNT ,
148
+ Math .max (MIN_SAMPLE_LINE_COUNT , idealSampleLineCount ), timeoutChecker );
129
149
130
- return makeBestStructureFinder (explanation , sampleInfo .v1 (), charsetName , sampleInfo .v2 (), overrides );
150
+ return makeBestStructureFinder (explanation , sampleInfo .v1 (), charsetName , sampleInfo .v2 (), overrides , timeoutChecker );
151
+ }
131
152
}
132
153
133
- CharsetMatch findCharset (List <String > explanation , InputStream inputStream ) throws Exception {
154
+ CharsetMatch findCharset (List <String > explanation , InputStream inputStream , TimeoutChecker timeoutChecker ) throws Exception {
134
155
135
156
// We need an input stream that supports mark and reset, so wrap the argument
136
157
// in a BufferedInputStream if it doesn't already support this feature
@@ -141,6 +162,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
141
162
// This is from ICU4J
142
163
CharsetDetector charsetDetector = new CharsetDetector ().setText (inputStream );
143
164
CharsetMatch [] charsetMatches = charsetDetector .detectAll ();
165
+ timeoutChecker .check ("character set detection" );
144
166
145
167
// Determine some extra characteristics of the input to compensate for some deficiencies of ICU4J
146
168
boolean pureAscii = true ;
@@ -164,6 +186,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
164
186
remainingLength -= bytesRead ;
165
187
} while (containsZeroBytes == false && remainingLength > 0 );
166
188
inputStream .reset ();
189
+ timeoutChecker .check ("character set detection" );
167
190
168
191
if (pureAscii ) {
169
192
// If the input is pure ASCII then many single byte character sets will match. We want to favour
@@ -220,7 +243,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
220
243
}
221
244
222
245
FileStructureFinder makeBestStructureFinder (List <String > explanation , String sample , String charsetName , Boolean hasByteOrderMarker ,
223
- FileStructureOverrides overrides ) throws Exception {
246
+ FileStructureOverrides overrides , TimeoutChecker timeoutChecker ) throws Exception {
224
247
225
248
Character delimiter = overrides .getDelimiter ();
226
249
Character quote = overrides .getQuote ();
@@ -250,16 +273,18 @@ FileStructureFinder makeBestStructureFinder(List<String> explanation, String sam
250
273
}
251
274
252
275
for (FileStructureFinderFactory factory : factories ) {
276
+ timeoutChecker .check ("high level format detection" );
253
277
if (factory .canCreateFromSample (explanation , sample )) {
254
- return factory .createFromSample (explanation , sample , charsetName , hasByteOrderMarker , overrides );
278
+ return factory .createFromSample (explanation , sample , charsetName , hasByteOrderMarker , overrides , timeoutChecker );
255
279
}
256
280
}
257
281
258
282
throw new IllegalArgumentException ("Input did not match " +
259
283
((overrides .getFormat () == null ) ? "any known formats" : "the specified format [" + overrides .getFormat () + "]" ));
260
284
}
261
285
262
- private Tuple <String , Boolean > sampleFile (Reader reader , String charsetName , int minLines , int maxLines ) throws IOException {
286
+ private Tuple <String , Boolean > sampleFile (Reader reader , String charsetName , int minLines , int maxLines , TimeoutChecker timeoutChecker )
287
+ throws IOException {
263
288
264
289
int lineCount = 0 ;
265
290
BufferedReader bufferedReader = new BufferedReader (reader );
@@ -283,6 +308,7 @@ private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int
283
308
String line ;
284
309
while ((line = bufferedReader .readLine ()) != null && ++lineCount <= maxLines ) {
285
310
sample .append (line ).append ('\n' );
311
+ timeoutChecker .check ("sample line splitting" );
286
312
}
287
313
288
314
if (lineCount < minLines ) {
0 commit comments