Skip to content

update ingest-attachment to use Tika 1.17 and newer deps #27824

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 15, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 68 additions & 45 deletions plugins/ingest-attachment/build.gradle

Large diffs are not rendered by default.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
c62dfe18a3b827a2c626ade0ffba44562ddf3f61

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
f2d653c617004193f3350330d907f77b60c88c56
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/commons-io-2.4.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/commons-io-2.5.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2852e6e05fbb95076fc091f6d1780f1f8fe35e0f
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/fontbox-2.0.3.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/fontbox-2.0.8.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
52f852fcfc7481d45efdffd224eb78b85981b17b
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/jempbox-1.8.12.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/jempbox-1.8.13.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
a874cef0ed0e2a8c4cc5ed52c23ba3e6d78eca4e
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/pdfbox-2.0.3.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/pdfbox-2.0.8.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
17bdf273d66f3afe41eedb9d3ab6a7b819c44a0c
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/poi-3.16.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/poi-3.17.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0ae92292a2043888b40d418da97dc0b669fde326
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/poi-ooxml-3.16.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/poi-ooxml-3.17.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
07d8c44407178b73246462842bf1e206e99c8e0a

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
890114bfa82f5b6380ea0e9b0bf49b0af797b414

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
85d86a0e26c7f5c0db4ee63e8c7728e51c5d64ce
1 change: 0 additions & 1 deletion plugins/ingest-attachment/licenses/tika-core-1.15.jar.sha1

This file was deleted.

1 change: 1 addition & 0 deletions plugins/ingest-attachment/licenses/tika-core-1.17.jar.sha1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
b450102c2aee98107474d2f92661d947b9cef183

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
4277c54fcaed542fbc8a0001fdb4c23baccc0132
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.elasticsearch.ingest.attachment;

import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
Expand Down Expand Up @@ -81,68 +82,72 @@ public void execute(IngestDocument ingestDocument) {
throw new IllegalArgumentException("field [" + field + "] is null, cannot parse.");
}

Metadata metadata = new Metadata();
String parsedContent = "";
try {
Metadata metadata = new Metadata();
String parsedContent = TikaImpl.parse(input, metadata, indexedChars);
parsedContent = TikaImpl.parse(input, metadata, indexedChars);
} catch (ZeroByteFileException e) {
// tika 1.17 throws an exception when the InputStream has 0 bytes.
// previously, it did not mind. This is here to preserve that behavior.
} catch (Exception e) {
throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field);
}

if (properties.contains(Property.CONTENT) && Strings.hasLength(parsedContent)) {
// somehow tika seems to append a newline at the end automatically, lets remove that again
additionalFields.put(Property.CONTENT.toLowerCase(), parsedContent.trim());
}
if (properties.contains(Property.CONTENT) && Strings.hasLength(parsedContent)) {
// somehow tika seems to append a newline at the end automatically, lets remove that again
additionalFields.put(Property.CONTENT.toLowerCase(), parsedContent.trim());
}

if (properties.contains(Property.LANGUAGE) && Strings.hasLength(parsedContent)) {
LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
String language = identifier.getLanguage();
additionalFields.put(Property.LANGUAGE.toLowerCase(), language);
}
if (properties.contains(Property.LANGUAGE) && Strings.hasLength(parsedContent)) {
LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
String language = identifier.getLanguage();
additionalFields.put(Property.LANGUAGE.toLowerCase(), language);
}

if (properties.contains(Property.DATE)) {
String createdDate = metadata.get(TikaCoreProperties.CREATED);
if (createdDate != null) {
additionalFields.put(Property.DATE.toLowerCase(), createdDate);
}
if (properties.contains(Property.DATE)) {
String createdDate = metadata.get(TikaCoreProperties.CREATED);
if (createdDate != null) {
additionalFields.put(Property.DATE.toLowerCase(), createdDate);
}
}

if (properties.contains(Property.TITLE)) {
String title = metadata.get(TikaCoreProperties.TITLE);
if (Strings.hasLength(title)) {
additionalFields.put(Property.TITLE.toLowerCase(), title);
}
if (properties.contains(Property.TITLE)) {
String title = metadata.get(TikaCoreProperties.TITLE);
if (Strings.hasLength(title)) {
additionalFields.put(Property.TITLE.toLowerCase(), title);
}
}

if (properties.contains(Property.AUTHOR)) {
String author = metadata.get("Author");
if (Strings.hasLength(author)) {
additionalFields.put(Property.AUTHOR.toLowerCase(), author);
}
if (properties.contains(Property.AUTHOR)) {
String author = metadata.get("Author");
if (Strings.hasLength(author)) {
additionalFields.put(Property.AUTHOR.toLowerCase(), author);
}
}

if (properties.contains(Property.KEYWORDS)) {
String keywords = metadata.get("Keywords");
if (Strings.hasLength(keywords)) {
additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords);
}
if (properties.contains(Property.KEYWORDS)) {
String keywords = metadata.get("Keywords");
if (Strings.hasLength(keywords)) {
additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords);
}
}

if (properties.contains(Property.CONTENT_TYPE)) {
String contentType = metadata.get(Metadata.CONTENT_TYPE);
if (Strings.hasLength(contentType)) {
additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType);
}
if (properties.contains(Property.CONTENT_TYPE)) {
String contentType = metadata.get(Metadata.CONTENT_TYPE);
if (Strings.hasLength(contentType)) {
additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType);
}
}

if (properties.contains(Property.CONTENT_LENGTH)) {
String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
long length;
if (Strings.hasLength(contentLength)) {
length = Long.parseLong(contentLength);
} else {
length = parsedContent.length();
}
additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length);
if (properties.contains(Property.CONTENT_LENGTH)) {
String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
long length;
if (Strings.hasLength(contentLength)) {
length = Long.parseLong(contentLength);
} else {
length = parsedContent.length();
}
} catch (Exception e) {
throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field);
additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length);
}

ingestDocument.setFieldValue(targetField, additionalFields);
Expand Down