|
| 1 | +/* |
| 2 | + * Licensed to Elasticsearch under one or more contributor |
| 3 | + * license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright |
| 5 | + * ownership. Elasticsearch licenses this file to you under |
| 6 | + * the Apache License, Version 2.0 (the "License"); you may |
| 7 | + * not use this file except in compliance with the License. |
| 8 | + * You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +package org.elasticsearch.index.analysis; |
| 21 | + |
| 22 | +import com.carrotsearch.randomizedtesting.generators.RandomStrings; |
| 23 | + |
| 24 | +import org.apache.lucene.analysis.core.WhitespaceTokenizer; |
| 25 | +import org.elasticsearch.Version; |
| 26 | +import org.elasticsearch.cluster.metadata.IndexMetaData; |
| 27 | +import org.elasticsearch.common.settings.Settings; |
| 28 | +import org.elasticsearch.index.Index; |
| 29 | +import org.elasticsearch.index.IndexSettings; |
| 30 | +import org.elasticsearch.test.ESTestCase; |
| 31 | +import org.elasticsearch.test.IndexSettingsModule; |
| 32 | + |
| 33 | +import java.io.IOException; |
| 34 | +import java.io.Reader; |
| 35 | +import java.io.StringReader; |
| 36 | + |
| 37 | +import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents; |
| 38 | + |
| 39 | +public class WhitespaceTokenizerFactoryTests extends ESTestCase { |
| 40 | + |
| 41 | + public void testSimpleWhiteSpaceTokenizer() throws IOException { |
| 42 | + final Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); |
| 43 | + IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(new Index("test", "_na_"), indexSettings); |
| 44 | + WhitespaceTokenizer tokenizer = (WhitespaceTokenizer) new WhitespaceTokenizerFactory(indexProperties, null, "whitespace_maxlen", |
| 45 | + Settings.EMPTY).create(); |
| 46 | + |
| 47 | + try (Reader reader = new StringReader("one, two, three")) { |
| 48 | + tokenizer.setReader(reader); |
| 49 | + assertTokenStreamContents(tokenizer, new String[] { "one,", "two,", "three" }); |
| 50 | + } |
| 51 | + } |
| 52 | + |
| 53 | + public void testMaxTokenLength() throws IOException { |
| 54 | + final Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); |
| 55 | + IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(new Index("test", "_na_"), indexSettings); |
| 56 | + final Settings settings = Settings.builder().put(WhitespaceTokenizerFactory.MAX_TOKEN_LENGTH, 2).build(); |
| 57 | + WhitespaceTokenizer tokenizer = (WhitespaceTokenizer) new WhitespaceTokenizerFactory(indexProperties, null, "whitespace_maxlen", |
| 58 | + settings).create(); |
| 59 | + try (Reader reader = new StringReader("one, two, three")) { |
| 60 | + tokenizer.setReader(reader); |
| 61 | + assertTokenStreamContents(tokenizer, new String[] { "on", "e,", "tw", "o,", "th", "re", "e" }); |
| 62 | + } |
| 63 | + |
| 64 | + final Settings defaultSettings = Settings.EMPTY; |
| 65 | + tokenizer = (WhitespaceTokenizer) new WhitespaceTokenizerFactory(indexProperties, null, "whitespace_maxlen", defaultSettings) |
| 66 | + .create(); |
| 67 | + String veryLongToken = RandomStrings.randomAsciiAlphanumOfLength(random(), 256); |
| 68 | + try (Reader reader = new StringReader(veryLongToken)) { |
| 69 | + tokenizer.setReader(reader); |
| 70 | + assertTokenStreamContents(tokenizer, new String[] { veryLongToken.substring(0, 255), veryLongToken.substring(255) }); |
| 71 | + } |
| 72 | + |
| 73 | + final Settings tooLongSettings = Settings.builder().put(WhitespaceTokenizerFactory.MAX_TOKEN_LENGTH, 1024 * 1024 + 1).build(); |
| 74 | + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, |
| 75 | + () -> new WhitespaceTokenizerFactory(indexProperties, null, "whitespace_maxlen", tooLongSettings).create()); |
| 76 | + assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 1048577", e.getMessage()); |
| 77 | + |
| 78 | + final Settings negativeSettings = Settings.builder().put(WhitespaceTokenizerFactory.MAX_TOKEN_LENGTH, -1).build(); |
| 79 | + e = expectThrows(IllegalArgumentException.class, |
| 80 | + () -> new WhitespaceTokenizerFactory(indexProperties, null, "whitespace_maxlen", negativeSettings).create()); |
| 81 | + assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage()); |
| 82 | + } |
| 83 | +} |
0 commit comments