Skip to content

Commit 2a9da80

Browse files
authored
Add HTML strip processor (#41888)
This processor uses the Lucene HTMLStripCharFilter class to remove HTML tags from a field. It complements the existing char filter by making it possible to store the stripped version as well. Note that the character filter replaces tags with a newline, so the produced text will look slightly different from the incoming HTML with regard to newlines.
1 parent 2592b49 commit 2a9da80

File tree

8 files changed

+179
-2
lines changed

8 files changed

+179
-2
lines changed

docs/reference/ingest/ingest-node.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,7 @@ include::processors/foreach.asciidoc[]
864864
include::processors/geoip.asciidoc[]
865865
include::processors/grok.asciidoc[]
866866
include::processors/gsub.asciidoc[]
867+
include::processors/html_strip.asciidoc[]
867868
include::processors/join.asciidoc[]
868869
include::processors/json.asciidoc[]
869870
include::processors/kv.asciidoc[]
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
[[htmlstrip-processor]]
2+
=== HTML Strip Processor
3+
Removes HTML tags from the field.
4+
5+
NOTE: Some HTML tags (for example `<p>`) are replaced with a `\n` character, while others (for example `<b>`) are removed without a replacement.
6+
7+
[[htmlstrip-options]]
8+
.HTML Strip Options
9+
[options="header"]
10+
|======
11+
| Name | Required | Default | Description
12+
| `field` | yes | - | The string-valued field to remove HTML tags from
13+
| `target_field` | no | `field` | The field to assign the value to, by default `field` is updated in-place
14+
| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document
15+
include::common-options.asciidoc[]
16+
|======
17+
18+
[source,js]
19+
--------------------------------------------------
20+
{
21+
"html_strip": {
22+
"field": "foo"
23+
}
24+
}
25+
--------------------------------------------------
26+
// NOTCONSOLE
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.ingest.common;
21+
22+
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
23+
import org.elasticsearch.ElasticsearchException;
24+
25+
import java.io.IOException;
26+
import java.io.StringReader;
27+
import java.util.Map;
28+
29+
public final class HtmlStripProcessor extends AbstractStringProcessor<String> {
30+
31+
public static final String TYPE = "html_strip";
32+
33+
HtmlStripProcessor(String tag, String field, boolean ignoreMissing, String targetField) {
34+
super(tag, field, ignoreMissing, targetField);
35+
}
36+
37+
@Override
38+
protected String process(String value) {
39+
// shortcut, no need to create a string builder and go through each char
40+
if (value.contains("<") == false || value.contains(">") == false) {
41+
return value;
42+
}
43+
44+
HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(value));
45+
46+
StringBuilder builder = new StringBuilder();
47+
int ch;
48+
try {
49+
while ((ch = filter.read()) != -1) {
50+
builder.append((char)ch);
51+
}
52+
} catch (IOException e) {
53+
throw new ElasticsearchException(e);
54+
}
55+
56+
return builder.toString();
57+
}
58+
59+
@Override
60+
public String getType() {
61+
return TYPE;
62+
}
63+
64+
public static final class Factory extends AbstractStringProcessor.Factory {
65+
66+
public Factory() {
67+
super(TYPE);
68+
}
69+
70+
@Override
71+
protected HtmlStripProcessor newProcessor(String tag, Map<String, Object> config, String field,
72+
boolean ignoreMissing, String targetField) {
73+
return new HtmlStripProcessor(tag, field, ignoreMissing, targetField);
74+
}
75+
}
76+
}

modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/IngestCommonPlugin.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
8787
entry(BytesProcessor.TYPE, new BytesProcessor.Factory()),
8888
entry(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService)),
8989
entry(DissectProcessor.TYPE, new DissectProcessor.Factory()),
90-
entry(DropProcessor.TYPE, new DropProcessor.Factory()));
90+
entry(DropProcessor.TYPE, new DropProcessor.Factory()),
91+
entry(HtmlStripProcessor.TYPE, new HtmlStripProcessor.Factory()));
9192
}
9293

9394
@Override
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.ingest.common;
21+
22+
public class HtmlStripProcessorFactoryTests extends AbstractStringProcessorFactoryTestCase {
23+
@Override
24+
protected AbstractStringProcessor.Factory newFactory() {
25+
return new HtmlStripProcessor.Factory();
26+
}
27+
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.ingest.common;
21+
22+
public class HtmlStripProcessorTests extends AbstractStringProcessorTestCase<String> {
23+
24+
@Override
25+
protected AbstractStringProcessor<String> newProcessor(String field, boolean ignoreMissing, String targetField) {
26+
return new HtmlStripProcessor(randomAlphaOfLength(10), field, ignoreMissing, targetField);
27+
}
28+
29+
@Override
30+
protected String modifyInput(String input) {
31+
return "<p><b>test</b>" + input + "<p><b>test</b>";
32+
}
33+
34+
@Override
35+
protected String expectedResult(String input) {
36+
return "\ntest" + input + "\ntest";
37+
}
38+
}

modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/10_basic.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
- contains: { nodes.$master.ingest.processors: { type: foreach } }
2424
- contains: { nodes.$master.ingest.processors: { type: grok } }
2525
- contains: { nodes.$master.ingest.processors: { type: gsub } }
26+
- contains: { nodes.$master.ingest.processors: { type: html_strip } }
2627
- contains: { nodes.$master.ingest.processors: { type: join } }
2728
- contains: { nodes.$master.ingest.processors: { type: json } }
2829
- contains: { nodes.$master.ingest.processors: { type: kv } }

modules/ingest-common/src/test/resources/rest-api-spec/test/ingest/40_mutate.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ teardown:
7676
"pattern" : "-",
7777
"replacement" : "."
7878
}
79+
},
80+
{
81+
"html_strip" : {
82+
"field" : "field_to_html_strip"
83+
}
7984
}
8085
]
8186
}
@@ -96,7 +101,8 @@ teardown:
96101
"field_to_split": "127-0-0-1",
97102
"field_to_join": ["127","0","0","1"],
98103
"field_to_convert": ["127","0","0","1"],
99-
"field_to_gsub": "127-0-0-1"
104+
"field_to_gsub": "127-0-0-1",
105+
"field_to_html_strip": "<p>this <title>is</title> a <b>test</b>"
100106
}
101107
102108
- do:
@@ -114,6 +120,7 @@ teardown:
114120
- match: { _source.field_to_join: "127-0-0-1" }
115121
- match: { _source.field_to_convert: [127,0,0,1] }
116122
- match: { _source.field_to_gsub: "127.0.0.1" }
123+
- match: { _source.field_to_html_strip: "\nthis \nis\n a test" }
117124

118125
---
119126
"Test metadata":

0 commit comments

Comments
 (0)