Skip to content

Commit b35ad80

Browse files
committed
Ignore encrypted documents
Original request: I am sending multiple PDF, Word, etc. attachments in one document to be indexed. Some of them (PDF) are encrypted, and I am getting a MapperParsingException caused by org.apache.tika.exception.TikaException: Unable to extract PDF content, caused by org.apache.pdfbox.exceptions.WrappedIOException: Error decrypting document. I was wondering if the attachment mapper could expose some switch to ignore the documents it cannot extract? As we now have the `ignore_errors` option, we can support it. See elastic#38 regarding this option. Closes elastic#18.
1 parent d6aa2f0 commit b35ad80

File tree

5 files changed

+270
-1
lines changed

5 files changed

+270
-1
lines changed

src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,9 @@ public void parse(ParseContext context) throws IOException {
356356
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
357357
parsedContent = tika().parseToString(new BytesStreamInput(content, false), metadata, indexedChars);
358358
} catch (Throwable e) {
359-
throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e);
359+
// #18: we could ignore errors when Tika does not parse data
360+
if (!ignoreErrors) throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e);
361+
return;
360362
}
361363

362364
context.externalValue(parsedContent);
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
package org.elasticsearch.index.mapper.xcontent;
2+
3+
import org.apache.lucene.document.Document;
4+
import org.elasticsearch.common.bytes.BytesReference;
5+
import org.elasticsearch.common.settings.ImmutableSettings;
6+
import org.elasticsearch.index.Index;
7+
import org.elasticsearch.index.analysis.AnalysisService;
8+
import org.elasticsearch.index.mapper.DocumentMapper;
9+
import org.elasticsearch.index.mapper.DocumentMapperParser;
10+
import org.elasticsearch.index.mapper.MapperParsingException;
11+
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
12+
import org.testng.annotations.Test;
13+
14+
import java.io.IOException;
15+
16+
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
17+
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
18+
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
19+
import static org.hamcrest.MatcherAssert.assertThat;
20+
import static org.hamcrest.Matchers.*;
21+
22+
/**
23+
* Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/18
24+
* Note that we have converted /org/elasticsearch/index/mapper/xcontent/testContentLength.txt
25+
* to a /org/elasticsearch/index/mapper/xcontent/encrypted.pdf with password `12345678`.
26+
*/
27+
public class EncryptedDocMapperTest {
28+
29+
@Test
30+
public void testMultipleDocsEncryptedLast() throws IOException {
31+
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), new AnalysisService(new Index("test")), null, null);
32+
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
33+
34+
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
35+
DocumentMapper docMapper = mapperParser.parse(mapping);
36+
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
37+
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
38+
39+
BytesReference json = jsonBuilder()
40+
.startObject()
41+
.field("_id", 1)
42+
.field("file1", html)
43+
.field("file2", pdf)
44+
.endObject().bytes();
45+
46+
Document doc = docMapper.parse(json).rootDoc();
47+
assertThat(doc.get(docMapper.mappers().smartName("file1").mapper().names().indexName()), containsString("World"));
48+
assertThat(doc.get(docMapper.mappers().smartName("file1.title").mapper().names().indexName()), equalTo("Hello"));
49+
assertThat(doc.get(docMapper.mappers().smartName("file1.author").mapper().names().indexName()), equalTo("kimchy"));
50+
assertThat(doc.get(docMapper.mappers().smartName("file1.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
51+
assertThat(doc.get(docMapper.mappers().smartName("file1.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
52+
assertThat(doc.getField(docMapper.mappers().smartName("file1.content_length").mapper().names().indexName()).numericValue().longValue(), is(344L));
53+
54+
assertThat(doc.get(docMapper.mappers().smartName("file2").mapper().names().indexName()), nullValue());
55+
assertThat(doc.get(docMapper.mappers().smartName("file2.title").mapper().names().indexName()), nullValue());
56+
assertThat(doc.get(docMapper.mappers().smartName("file2.author").mapper().names().indexName()), nullValue());
57+
assertThat(doc.get(docMapper.mappers().smartName("file2.keywords").mapper().names().indexName()), nullValue());
58+
assertThat(doc.get(docMapper.mappers().smartName("file2.content_type").mapper().names().indexName()), nullValue());
59+
assertThat(doc.getField(docMapper.mappers().smartName("file2.content_length").mapper().names().indexName()), nullValue());
60+
}
61+
62+
@Test
63+
public void testMultipleDocsEncryptedFirst() throws IOException {
64+
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), new AnalysisService(new Index("test")), null, null);
65+
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
66+
67+
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
68+
DocumentMapper docMapper = mapperParser.parse(mapping);
69+
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
70+
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
71+
72+
BytesReference json = jsonBuilder()
73+
.startObject()
74+
.field("_id", 1)
75+
.field("file1", pdf)
76+
.field("file2", html)
77+
.endObject().bytes();
78+
79+
Document doc = docMapper.parse(json).rootDoc();
80+
assertThat(doc.get(docMapper.mappers().smartName("file1").mapper().names().indexName()), nullValue());
81+
assertThat(doc.get(docMapper.mappers().smartName("file1.title").mapper().names().indexName()), nullValue());
82+
assertThat(doc.get(docMapper.mappers().smartName("file1.author").mapper().names().indexName()), nullValue());
83+
assertThat(doc.get(docMapper.mappers().smartName("file1.keywords").mapper().names().indexName()), nullValue());
84+
assertThat(doc.get(docMapper.mappers().smartName("file1.content_type").mapper().names().indexName()), nullValue());
85+
assertThat(doc.getField(docMapper.mappers().smartName("file1.content_length").mapper().names().indexName()), nullValue());
86+
87+
assertThat(doc.get(docMapper.mappers().smartName("file2").mapper().names().indexName()), containsString("World"));
88+
assertThat(doc.get(docMapper.mappers().smartName("file2.title").mapper().names().indexName()), equalTo("Hello"));
89+
assertThat(doc.get(docMapper.mappers().smartName("file2.author").mapper().names().indexName()), equalTo("kimchy"));
90+
assertThat(doc.get(docMapper.mappers().smartName("file2.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
91+
assertThat(doc.get(docMapper.mappers().smartName("file2.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
92+
assertThat(doc.getField(docMapper.mappers().smartName("file2.content_length").mapper().names().indexName()).numericValue().longValue(), is(344L));
93+
}
94+
95+
@Test(expectedExceptions = MapperParsingException.class)
96+
public void testMultipleDocsEncryptedNotIgnoringErrors() throws IOException {
97+
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"),
98+
ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(),
99+
new AnalysisService(new Index("test")), null, null);
100+
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
101+
102+
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
103+
DocumentMapper docMapper = mapperParser.parse(mapping);
104+
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
105+
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
106+
107+
BytesReference json = jsonBuilder()
108+
.startObject()
109+
.field("_id", 1)
110+
.field("file1", pdf)
111+
.field("file2", html)
112+
.endObject().bytes();
113+
114+
Document doc = docMapper.parse(json).rootDoc();
115+
assertThat(doc.get(docMapper.mappers().smartName("file1").mapper().names().indexName()), nullValue());
116+
assertThat(doc.get(docMapper.mappers().smartName("file1.title").mapper().names().indexName()), nullValue());
117+
assertThat(doc.get(docMapper.mappers().smartName("file1.author").mapper().names().indexName()), nullValue());
118+
assertThat(doc.get(docMapper.mappers().smartName("file1.keywords").mapper().names().indexName()), nullValue());
119+
assertThat(doc.get(docMapper.mappers().smartName("file1.content_type").mapper().names().indexName()), nullValue());
120+
assertThat(doc.getField(docMapper.mappers().smartName("file1.content_length").mapper().names().indexName()), nullValue());
121+
122+
assertThat(doc.get(docMapper.mappers().smartName("file2").mapper().names().indexName()), containsString("World"));
123+
assertThat(doc.get(docMapper.mappers().smartName("file2.title").mapper().names().indexName()), equalTo("Hello"));
124+
assertThat(doc.get(docMapper.mappers().smartName("file2.author").mapper().names().indexName()), equalTo("kimchy"));
125+
assertThat(doc.get(docMapper.mappers().smartName("file2.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
126+
assertThat(doc.get(docMapper.mappers().smartName("file2.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
127+
assertThat(doc.getField(docMapper.mappers().smartName("file2.content_length").mapper().names().indexName()).numericValue().longValue(), is(344L));
128+
}
129+
130+
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/*
2+
* Licensed to ElasticSearch and Shay Banon under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. ElasticSearch licenses this
6+
* file to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.plugin.mapper.attachments.test;
21+
22+
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
23+
import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
24+
import org.elasticsearch.action.count.CountResponse;
25+
import org.elasticsearch.common.logging.ESLogger;
26+
import org.elasticsearch.common.logging.Loggers;
27+
import org.elasticsearch.common.network.NetworkUtils;
28+
import org.elasticsearch.common.settings.ImmutableSettings;
29+
import org.elasticsearch.common.settings.Settings;
30+
import org.elasticsearch.index.mapper.MapperParsingException;
31+
import org.elasticsearch.node.Node;
32+
import org.testng.annotations.AfterClass;
33+
import org.testng.annotations.AfterMethod;
34+
import org.testng.annotations.BeforeClass;
35+
import org.testng.annotations.Test;
36+
37+
import static org.elasticsearch.client.Requests.*;
38+
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
39+
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
40+
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
41+
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
42+
import static org.elasticsearch.index.query.QueryBuilders.fieldQuery;
43+
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
44+
import static org.hamcrest.MatcherAssert.assertThat;
45+
import static org.hamcrest.Matchers.equalTo;
46+
47+
/**
48+
* Test case for issue https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/18
49+
*/
50+
@Test
51+
public class MultipleAttachmentIntegrationTests {
52+
53+
private final ESLogger logger = Loggers.getLogger(getClass());
54+
55+
private Node node;
56+
57+
@BeforeClass
58+
public void setupServer() {
59+
node = nodeBuilder().local(true).settings(settingsBuilder()
60+
.put("path.data", "target/data")
61+
.put("cluster.name", "test-cluster-" + NetworkUtils.getLocalAddress())
62+
.put("gateway.type", "none")).node();
63+
}
64+
65+
@AfterClass
66+
public void closeServer() {
67+
node.close();
68+
}
69+
70+
private void createIndex(Settings settings) {
71+
logger.info("creating index [test]");
72+
node.client().admin().indices().create(createIndexRequest("test").settings(settingsBuilder().put("index.numberOfReplicas", 0).put(settings))).actionGet();
73+
logger.info("Running Cluster Health");
74+
ClusterHealthResponse clusterHealth = node.client().admin().cluster().health(clusterHealthRequest().waitForGreenStatus()).actionGet();
75+
logger.info("Done Cluster Health, status " + clusterHealth.getStatus());
76+
assertThat(clusterHealth.isTimedOut(), equalTo(false));
77+
assertThat(clusterHealth.getStatus(), equalTo(ClusterHealthStatus.GREEN));
78+
}
79+
80+
@AfterMethod
81+
public void deleteIndex() {
82+
logger.info("deleting index [test]");
83+
node.client().admin().indices().delete(deleteIndexRequest("test")).actionGet();
84+
}
85+
86+
/**
87+
* When we want to ignore errors (default)
88+
*/
89+
@Test
90+
public void testMultipleAttachmentsWithEncryptedDoc() throws Exception {
91+
createIndex(ImmutableSettings.builder().build());
92+
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
93+
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
94+
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
95+
96+
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
97+
98+
node.client().index(indexRequest("test").type("person")
99+
.source(jsonBuilder().startObject().field("file1", html).field("file2", pdf).field("hello","world").endObject())).actionGet();
100+
node.client().admin().indices().refresh(refreshRequest()).actionGet();
101+
102+
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file1", "World"))).actionGet();
103+
assertThat(countResponse.getCount(), equalTo(1l));
104+
105+
countResponse = node.client().count(countRequest("test").query(fieldQuery("hello", "World"))).actionGet();
106+
assertThat(countResponse.getCount(), equalTo(1l));
107+
}
108+
109+
/**
110+
* When we don't want to ignore errors
111+
*/
112+
@Test(expectedExceptions = MapperParsingException.class)
113+
public void testMultipleAttachmentsWithEncryptedDocNotIgnoringErrors() throws Exception {
114+
createIndex(ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build());
115+
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
116+
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
117+
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
118+
119+
node.client().admin().indices()
120+
.putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
121+
122+
node.client().index(indexRequest("test").type("person")
123+
.source(jsonBuilder().startObject().field("file1", html).field("file2", pdf).field("hello","world").endObject())).actionGet();
124+
}
125+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"person":{
3+
"properties":{
4+
"file1":{
5+
"type":"attachment"
6+
},
7+
"file2":{
8+
"type":"attachment"
9+
}
10+
}
11+
}
12+
}
Binary file not shown.

0 commit comments

Comments
 (0)