Skip to content

Commit 3d15cb0

Browse files
richardwilly98dadoonet
authored andcommitted
Add language detection option
Based on PR elastic#45, we add a new language detection option using the language detection feature available in Tika: https://tika.apache.org/1.4/detection.html#Language_Detection By default, language detection is disabled (`false`) as it could come with a cost. This default value can be changed by setting the `index.mapping.attachment.detect_language` setting. It can also be provided per document at index time using the `_detect_language` parameter. Closes elastic#45. Closes elastic#44.
1 parent 621995d commit 3d15cb0

File tree

7 files changed

+225
-7
lines changed

7 files changed

+225
-7
lines changed

README.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ The metadata supported are:
6363
* `keywords`
6464
* `content_type`
6565
* `content_length` is the original content_length before text extraction (aka file size)
66+
* `language`
6667

6768
They can be queried using the "dot notation", for example: `my_attachment.author`.
6869

@@ -81,7 +82,8 @@ Both the meta data and the actual content are simple core type mappers (string,
8182
"author" : {"analyzer" : "myAnalyzer"},
8283
"keywords" : {store : "yes"},
8384
"content_type" : {store : "yes"},
84-
"content_length" : {store : "yes"}
85+
"content_length" : {store : "yes"},
86+
"language" : {store : "yes"}
8587
}
8688
}
8789
}
@@ -96,7 +98,7 @@ Indexed Characters
9698

9799
By default, `100000` characters are extracted when indexing the content. This default value can be changed by setting the `index.mapping.attachment.indexed_chars` setting. It can also be provided per document at index time using the `_indexed_chars` parameter. `-1` can be set to extract all text, but note that the whole extracted text must then fit in memory.
98100

99-
Note, this feature is support since `1.3.0` version.
101+
Note, this feature is supported since `1.3.0` version.
100102

101103
Metadata parsing error handling
102104
-------------------------------
@@ -106,6 +108,16 @@ Since version `1.9.0`, parsing errors are ignored so your document is indexed.
106108

107109
You can disable this feature by setting the `index.mapping.attachment.ignore_errors` setting to `false`.
108110

111+
Language Detection
112+
------------------
113+
114+
By default, language detection is disabled (`false`) as it could come with a cost.
115+
This default value can be changed by setting the `index.mapping.attachment.detect_language` setting.
116+
It can also be provided per document at index time using the `_detect_language` parameter.
117+
118+
Note, this feature is supported since `2.0.0` version.
119+
120+
109121
License
110122
-------
111123

src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.elasticsearch.index.mapper.attachment;
2121

22+
import org.apache.tika.language.LanguageIdentifier;
2223
import org.apache.tika.metadata.Metadata;
2324
import org.elasticsearch.common.io.stream.BytesStreamInput;
2425
import org.elasticsearch.common.logging.ESLogger;
@@ -70,9 +71,11 @@ public static class Builder extends Mapper.Builder<Builder, AttachmentMapper> {
7071

7172
private ContentPath.Type pathType = Defaults.PATH_TYPE;
7273

74+
private Boolean ignoreErrors = null;
75+
7376
private Integer defaultIndexedChars = null;
7477

75-
private Boolean ignoreErrors = null;
78+
private Boolean langDetect = null;
7679

7780
private Mapper.Builder contentBuilder;
7881

@@ -90,6 +93,8 @@ public static class Builder extends Mapper.Builder<Builder, AttachmentMapper> {
9093

9194
private Mapper.Builder contentLengthBuilder = integerField("content_length");
9295

96+
private Mapper.Builder languageBuilder = stringField("language");
97+
9398
public Builder(String name) {
9499
super(name);
95100
this.builder = this;
@@ -141,6 +146,11 @@ public Builder contentLength(Mapper.Builder contentType) {
141146
return this;
142147
}
143148

149+
public Builder language(Mapper.Builder language) {
150+
this.languageBuilder = language;
151+
return this;
152+
}
153+
144154
@Override
145155
public AttachmentMapper build(BuilderContext context) {
146156
ContentPath.Type origPathType = context.path().pathType();
@@ -158,6 +168,7 @@ public AttachmentMapper build(BuilderContext context) {
158168
Mapper keywordsMapper = keywordsBuilder.build(context);
159169
Mapper contentTypeMapper = contentTypeBuilder.build(context);
160170
Mapper contentLength = contentLengthBuilder.build(context);
171+
Mapper language = languageBuilder.build(context);
161172
context.path().remove();
162173

163174
context.path().pathType(origPathType);
@@ -176,7 +187,14 @@ public AttachmentMapper build(BuilderContext context) {
176187
ignoreErrors = Boolean.TRUE;
177188
}
178189

179-
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength);
190+
if (langDetect == null && context.indexSettings() != null) {
191+
langDetect = context.indexSettings().getAsBoolean("index.mapping.attachment.detect_language", Boolean.FALSE);
192+
}
193+
if (langDetect == null) {
194+
langDetect = Boolean.FALSE;
195+
}
196+
197+
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, langDetect, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength, language);
180198
}
181199
}
182200

@@ -245,6 +263,8 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext
245263
builder.contentType(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map<String, Object>) propNode, parserContext));
246264
} else if ("content_length".equals(propName)) {
247265
builder.contentLength(parserContext.typeParser(IntegerFieldMapper.CONTENT_TYPE).parse("content_length", (Map<String, Object>) propNode, parserContext));
266+
} else if ("language".equals(propName)) {
267+
builder.language(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("language", (Map<String, Object>) propNode, parserContext));
248268
}
249269
}
250270
}
@@ -262,6 +282,8 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext
262282

263283
private final boolean ignoreErrors;
264284

285+
private final boolean defaultLangDetect;
286+
265287
private final Mapper contentMapper;
266288

267289
private final Mapper dateMapper;
@@ -278,13 +300,16 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext
278300

279301
private final Mapper contentLengthMapper;
280302

281-
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
303+
private final Mapper languageMapper;
304+
305+
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Boolean defaultLangDetect, Mapper contentMapper,
282306
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
283-
Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper) {
307+
Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper, Mapper languageMapper) {
284308
this.name = name;
285309
this.pathType = pathType;
286310
this.defaultIndexedChars = defaultIndexedChars;
287311
this.ignoreErrors = ignoreErrors;
312+
this.defaultLangDetect = defaultLangDetect;
288313
this.contentMapper = contentMapper;
289314
this.dateMapper = dateMapper;
290315
this.titleMapper = titleMapper;
@@ -293,6 +318,7 @@ public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndex
293318
this.keywordsMapper = keywordsMapper;
294319
this.contentTypeMapper = contentTypeMapper;
295320
this.contentLengthMapper = contentLengthMapper;
321+
this.languageMapper = languageMapper;
296322
}
297323

298324
@Override
@@ -305,7 +331,9 @@ public void parse(ParseContext context) throws IOException {
305331
byte[] content = null;
306332
String contentType = null;
307333
int indexedChars = defaultIndexedChars;
334+
boolean langDetect = defaultLangDetect;
308335
String name = null;
336+
String language = null;
309337

310338
XContentParser parser = context.parser();
311339
XContentParser.Token token = parser.currentToken();
@@ -323,11 +351,17 @@ public void parse(ParseContext context) throws IOException {
323351
contentType = parser.text();
324352
} else if ("_name".equals(currentFieldName)) {
325353
name = parser.text();
354+
} else if ("language".equals(currentFieldName)) {
355+
language = parser.text();
326356
}
327357
} else if (token == XContentParser.Token.VALUE_NUMBER) {
328358
if ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName)) {
329359
indexedChars = parser.intValue();
330360
}
361+
} else if (token == XContentParser.Token.VALUE_BOOLEAN) {
362+
if ("_detect_language".equals(currentFieldName) || "_detectLanguage".equals(currentFieldName)) {
363+
langDetect = parser.booleanValue();
364+
}
331365
}
332366
}
333367
}
@@ -347,7 +381,7 @@ public void parse(ParseContext context) throws IOException {
347381

348382
String parsedContent;
349383
try {
350-
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
384+
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
351385
parsedContent = tika().parseToString(new BytesStreamInput(content, false), metadata, indexedChars);
352386
} catch (Throwable e) {
353387
// #18: we could ignore errors when Tika does not parse data
@@ -358,6 +392,20 @@ public void parse(ParseContext context) throws IOException {
358392
context.externalValue(parsedContent);
359393
contentMapper.parse(context);
360394

395+
if (langDetect) {
396+
try {
397+
if (language != null) {
398+
metadata.add(Metadata.CONTENT_LANGUAGE, language);
399+
} else {
400+
LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
401+
language = identifier.getLanguage();
402+
}
403+
context.externalValue(language);
404+
languageMapper.parse(context);
405+
} catch(Throwable t) {
406+
logger.warn("Cannot detect language: {}", t.getMessage());
407+
}
408+
}
361409

362410
try {
363411
context.externalValue(name);
@@ -437,6 +485,7 @@ public void traverse(FieldMapperListener fieldMapperListener) {
437485
keywordsMapper.traverse(fieldMapperListener);
438486
contentTypeMapper.traverse(fieldMapperListener);
439487
contentLengthMapper.traverse(fieldMapperListener);
488+
languageMapper.traverse(fieldMapperListener);
440489
}
441490

442491
@Override
@@ -453,6 +502,7 @@ public void close() {
453502
keywordsMapper.close();
454503
contentTypeMapper.close();
455504
contentLengthMapper.close();
505+
languageMapper.close();
456506
}
457507

458508
@Override
@@ -470,6 +520,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
470520
keywordsMapper.toXContent(builder, params);
471521
contentTypeMapper.toXContent(builder, params);
472522
contentLengthMapper.toXContent(builder, params);
523+
languageMapper.toXContent(builder, params);
473524
builder.endObject();
474525

475526
builder.endObject();
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* Licensed to ElasticSearch and Shay Banon under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. ElasticSearch licenses this
6+
* file to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.mapper.xcontent;
21+
22+
import org.elasticsearch.common.settings.ImmutableSettings;
23+
import org.elasticsearch.common.xcontent.XContentBuilder;
24+
import org.elasticsearch.index.Index;
25+
import org.elasticsearch.index.analysis.AnalysisService;
26+
import org.elasticsearch.index.mapper.DocumentMapper;
27+
import org.elasticsearch.index.mapper.DocumentMapperParser;
28+
import org.elasticsearch.index.mapper.ParseContext;
29+
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
30+
import org.elasticsearch.index.mapper.core.StringFieldMapper;
31+
import org.elasticsearch.test.ElasticsearchTestCase;
32+
import org.junit.Before;
33+
import org.junit.Test;
34+
35+
import java.io.IOException;
36+
37+
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
38+
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
39+
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
40+
import static org.hamcrest.Matchers.equalTo;
41+
import static org.hamcrest.Matchers.instanceOf;
42+
43+
/**
44+
*
45+
*/
46+
public class LanguageDetectionAttachmentMapperTests extends ElasticsearchTestCase {
47+
48+
private DocumentMapper docMapper;
49+
50+
@Before
51+
public void setupMapperParser() throws IOException {
52+
setupMapperParser(true);
53+
}
54+
55+
public void setupMapperParser(boolean langDetect) throws IOException {
56+
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"),
57+
ImmutableSettings.settingsBuilder().put("index.mapping.attachment.detect_language", langDetect).build(),
58+
new AnalysisService(new Index("test")), null, null, null);
59+
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
60+
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/language/language-mapping.json");
61+
docMapper = mapperParser.parse(mapping);
62+
63+
assertThat(docMapper.mappers().fullName("file.language").mapper(), instanceOf(StringFieldMapper.class));
64+
}
65+
66+
private void testLanguage(String filename, String expected, String... forcedLanguage) throws IOException {
67+
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/" + filename);
68+
69+
XContentBuilder xcb = jsonBuilder()
70+
.startObject()
71+
.field("_id", 1)
72+
.startObject("file")
73+
.field("_name", filename)
74+
.field("content", html);
75+
76+
if (forcedLanguage.length > 0) {
77+
xcb.field("language", forcedLanguage[0]);
78+
}
79+
80+
xcb.endObject().endObject();
81+
82+
ParseContext.Document doc = docMapper.parse(xcb.bytes()).rootDoc();
83+
84+
// Our mapping should be kept as a String
85+
assertThat(doc.get(docMapper.mappers().smartName("file.language").mapper().names().indexName()), equalTo(expected));
86+
}
87+
88+
@Test
89+
public void testFrDetection() throws Exception {
90+
testLanguage("text-in-french.txt", "fr");
91+
}
92+
93+
@Test
94+
public void testEnDetection() throws Exception {
95+
testLanguage("text-in-english.txt", "en");
96+
}
97+
98+
@Test
99+
public void testFrForced() throws Exception {
100+
testLanguage("text-in-english.txt", "fr", "fr");
101+
}
102+
103+
/**
104+
* This test gives strange results! detection of ":-)" gives "lt" as a result
105+
* @throws Exception
106+
*/
107+
@Test
108+
public void testNoLanguage() throws Exception {
109+
testLanguage("text-in-nolang.txt", "lt");
110+
}
111+
112+
@Test
113+
public void testLangDetectDisabled() throws Exception {
114+
// We replace the mapper with another one which has index.mapping.attachment.detect_language = false
115+
setupMapperParser(false);
116+
testLanguage("text-in-english.txt", null);
117+
}
118+
119+
@Test
120+
public void testLangDetectDocumentEnabled() throws Exception {
121+
// We replace the mapper with another one which has index.mapping.attachment.detect_language = false; detection is then enabled for this document only through the `_detect_language` field
122+
setupMapperParser(false);
123+
124+
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/text-in-english.txt");
125+
126+
XContentBuilder xcb = jsonBuilder()
127+
.startObject()
128+
.field("_id", 1)
129+
.startObject("file")
130+
.field("_name", "text-in-english.txt")
131+
.field("content", html)
132+
.field("_detect_language", true)
133+
.endObject().endObject();
134+
135+
ParseContext.Document doc = docMapper.parse(xcb.bytes()).rootDoc();
136+
137+
// Our mapping should be kept as a String
138+
assertThat(doc.get(docMapper.mappers().smartName("file.language").mapper().names().indexName()), equalTo("en"));
139+
}
140+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"person": {
3+
"properties": {
4+
"file": {
5+
"type": "attachment",
6+
"path": "full",
7+
"fields": {
8+
"language": { "type": "string" }
9+
}
10+
}
11+
}
12+
}
13+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"God Save the Queen" (alternatively "God Save the King"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Allons enfants de la Patrie Le jour de gloire est arrivé. Contre nous de la tyrannie

src/test/resources/org/elasticsearch/index/mapper/xcontent/text-in-nolang.txt

Whitespace-only changes.

0 commit comments

Comments
 (0)