Skip to content

Commit c58a09a

Browse files
ioanatia, ChrisHegarty, elasticmachine
authored and committed
ES|QL Add initial support for semantic_text field type (elastic#113920)
* Add initial support for semantic_text field type
* Update docs/changelog/113920.yaml
* More tests and fixes
* Use mock inference service
* Fix tests
* Spotless
* Fix mixed-cluster and multi-clusters tests
* sort
* Attempt another fix for bwc tests
* Spotless
* Fix merge
* Attempt another fix
* Don't load the inference-service-test plugin for mixed versions/clusters
* Add more tests, address review comments
* trivial
* revert
* post-merge fix block loader
* post-merge fix compile
* add mixed version testing
* whitespace
* fix MultiClusterSpecIT
* add more fields to mapping
* Revert mixed version testing
* whitespace

---------

Co-authored-by: ChrisHegarty <[email protected]>
Co-authored-by: Elastic Machine <[email protected]>
1 parent f6390b9 commit c58a09a

File tree

26 files changed

+490
-35
lines changed

26 files changed

+490
-35
lines changed

docs/changelog/113920.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 113920
2+
summary: Add initial support for `semantic_text` field type
3+
area: Search
4+
type: enhancement
5+
issues: []

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/plugin/EsqlCorePlugin.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@
1414
public class EsqlCorePlugin extends Plugin implements ExtensiblePlugin {
1515
public static final FeatureFlag DATE_NANOS_FEATURE_FLAG = new FeatureFlag("esql_date_nanos");
1616

17+
public static final FeatureFlag SEMANTIC_TEXT_FEATURE_FLAG = new FeatureFlag("esql_semantic_text");
1718
}

x-pack/plugin/esql-core/src/main/java/org/elasticsearch/xpack/esql/core/type/DataType.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,14 @@ public enum DataType {
194194
* inside alongside time-series aggregations. These fields are not parsable from the
195195
* mapping and should be hidden from users.
196196
*/
197-
PARTIAL_AGG(builder().esType("partial_agg").unknownSize());
197+
PARTIAL_AGG(builder().esType("partial_agg").unknownSize()),
198+
/**
199+
* String fields that are split into chunks, where each chunk has attached embeddings
200+
* used for semantic search. Generally ESQL only sees {@code semantic_text} fields when
201+
* loaded from the index and ESQL will load these fields as strings without their attached
202+
* chunks or embeddings.
203+
*/
204+
SEMANTIC_TEXT(builder().esType("semantic_text").unknownSize());
198205

199206
/**
200207
* Types that are actively being built. These types are not returned
@@ -203,7 +210,8 @@ public enum DataType {
203210
* check that sending them to a function produces a sane error message.
204211
*/
205212
public static final Map<DataType, FeatureFlag> UNDER_CONSTRUCTION = Map.ofEntries(
206-
Map.entry(DATE_NANOS, EsqlCorePlugin.DATE_NANOS_FEATURE_FLAG)
213+
Map.entry(DATE_NANOS, EsqlCorePlugin.DATE_NANOS_FEATURE_FLAG),
214+
Map.entry(SEMANTIC_TEXT, EsqlCorePlugin.SEMANTIC_TEXT_FEATURE_FLAG)
207215
);
208216

209217
private final String typeName;

x-pack/plugin/esql/qa/server/mixed-cluster/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/mixed/MixedClusterEsqlSpecIT.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,4 +86,9 @@ protected boolean supportsAsync() {
8686
protected boolean enableRoundingDoubleValuesOnAsserting() {
8787
return true;
8888
}
89+
90+
@Override
91+
protected boolean supportsInferenceTestService() {
92+
return false;
93+
}
8994
}

x-pack/plugin/esql/qa/server/multi-clusters/src/javaRestTest/java/org/elasticsearch/xpack/esql/ccq/MultiClusterSpecIT.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,4 +261,9 @@ static boolean hasIndexMetadata(String query) {
261261
protected boolean enableRoundingDoubleValuesOnAsserting() {
262262
return true;
263263
}
264+
265+
@Override
266+
protected boolean supportsInferenceTestService() {
267+
return false;
268+
}
264269
}

x-pack/plugin/esql/qa/server/multi-node/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dependencies {
1111

1212
clusterPlugins project(':plugins:mapper-size')
1313
clusterPlugins project(':plugins:mapper-murmur3')
14+
clusterPlugins project(':x-pack:plugin:inference:qa:test-service-plugin')
1415
}
1516

1617
GradleUtils.extendSourceSet(project, "javaRestTest", "yamlRestTest")

x-pack/plugin/esql/qa/server/multi-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/multi_node/EsqlSpecIT.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
public class EsqlSpecIT extends EsqlSpecTestCase {
1616
@ClassRule
17-
public static ElasticsearchCluster cluster = Clusters.testCluster(spec -> {});
17+
public static ElasticsearchCluster cluster = Clusters.testCluster(spec -> spec.plugin("inference-service-test"));
1818

1919
@Override
2020
protected String getTestRestCluster() {

x-pack/plugin/esql/qa/server/single-node/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ dependencies {
2222

2323
clusterPlugins project(':plugins:mapper-size')
2424
clusterPlugins project(':plugins:mapper-murmur3')
25+
clusterPlugins project(':x-pack:plugin:inference:qa:test-service-plugin')
2526
}
2627

2728
restResources {

x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/EsqlSpecIT.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
@ThreadLeakFilters(filters = TestClustersThreadFilter.class)
1919
public class EsqlSpecIT extends EsqlSpecTestCase {
2020
@ClassRule
21-
public static ElasticsearchCluster cluster = Clusters.testCluster();
21+
public static ElasticsearchCluster cluster = Clusters.testCluster(spec -> spec.plugin("inference-service-test"));
2222

2323
@Override
2424
protected String getTestRestCluster() {

x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/EsqlSpecTestCase.java

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@
6565
import static org.elasticsearch.xpack.esql.CsvTestUtils.ExpectedResults;
6666
import static org.elasticsearch.xpack.esql.CsvTestUtils.isEnabled;
6767
import static org.elasticsearch.xpack.esql.CsvTestUtils.loadCsvSpecValues;
68-
import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.CSV_DATASET_MAP;
68+
import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.availableDatasetsForEs;
69+
import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.clusterHasInferenceEndpoint;
70+
import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.createInferenceEndpoint;
71+
import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.deleteInferenceEndpoint;
6972
import static org.elasticsearch.xpack.esql.CsvTestsDataLoader.loadDataSetIntoEs;
7073
import static org.elasticsearch.xpack.esql.EsqlTestUtils.classpathResources;
7174

@@ -129,7 +132,11 @@ protected EsqlSpecTestCase(
129132

130133
@Before
131134
public void setup() throws IOException {
132-
if (indexExists(CSV_DATASET_MAP.keySet().iterator().next()) == false) {
135+
if (supportsInferenceTestService() && clusterHasInferenceEndpoint(client()) == false) {
136+
createInferenceEndpoint(client());
137+
}
138+
139+
if (indexExists(availableDatasetsForEs(client()).iterator().next().indexName()) == false) {
133140
loadDataSetIntoEs(client());
134141
}
135142
}
@@ -148,6 +155,8 @@ public static void wipeTestData() throws IOException {
148155
throw e;
149156
}
150157
}
158+
159+
deleteInferenceEndpoint(client());
151160
}
152161

153162
public boolean logResults() {
@@ -164,6 +173,9 @@ public final void test() throws Throwable {
164173
}
165174

166175
protected void shouldSkipTest(String testName) throws IOException {
176+
if (testCase.requiredCapabilities.contains("semantic_text_type")) {
177+
assumeTrue("Inference test service needs to be supported for semantic_text", supportsInferenceTestService());
178+
}
167179
checkCapabilities(adminClient(), testFeatureService, testName, testCase);
168180
assumeTrue("Test " + testName + " is not enabled", isEnabled(testName, instructions, Version.CURRENT));
169181
}
@@ -207,6 +219,10 @@ protected static void checkCapabilities(RestClient client, TestFeatureService te
207219
}
208220
}
209221

222+
protected boolean supportsInferenceTestService() {
223+
return true;
224+
}
225+
210226
protected final void doTest() throws Throwable {
211227
RequestObjectBuilder builder = new RequestObjectBuilder(randomFrom(XContentType.values()));
212228

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvAssert.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,11 @@ private static void assertMetadata(
134134
|| expectedType == UNSIGNED_LONG)) {
135135
continue;
136136
}
137-
if (blockType == Type.KEYWORD && (expectedType == Type.IP || expectedType == Type.VERSION || expectedType == Type.TEXT)) {
137+
if (blockType == Type.KEYWORD
138+
&& (expectedType == Type.IP
139+
|| expectedType == Type.VERSION
140+
|| expectedType == Type.TEXT
141+
|| expectedType == Type.SEMANTIC_TEXT)) {
138142
// Type.asType translates all bytes references into keywords
139143
continue;
140144
}

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestUtils.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,7 @@ public enum Type {
447447
SCALED_FLOAT(s -> s == null ? null : scaledFloat(s, "100"), Double.class),
448448
KEYWORD(Object::toString, BytesRef.class),
449449
TEXT(Object::toString, BytesRef.class),
450+
SEMANTIC_TEXT(Object::toString, BytesRef.class),
450451
IP(
451452
StringUtils::parseIP,
452453
(l, r) -> l instanceof String maybeIP

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/CsvTestsDataLoader.java

Lines changed: 113 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.apache.logging.log4j.core.config.plugins.util.PluginManager;
2020
import org.elasticsearch.client.Request;
2121
import org.elasticsearch.client.Response;
22+
import org.elasticsearch.client.ResponseException;
2223
import org.elasticsearch.client.RestClient;
2324
import org.elasticsearch.client.RestClientBuilder;
2425
import org.elasticsearch.common.Strings;
@@ -36,9 +37,11 @@
3637
import java.net.URI;
3738
import java.net.URL;
3839
import java.util.ArrayList;
40+
import java.util.HashSet;
3941
import java.util.List;
4042
import java.util.Map;
4143
import java.util.Set;
44+
import java.util.stream.Collectors;
4245

4346
import static org.elasticsearch.common.logging.LoggerMessageFormat.format;
4447
import static org.elasticsearch.xpack.esql.CsvTestUtils.COMMA_ESCAPING_REGEX;
@@ -81,6 +84,7 @@ public class CsvTestsDataLoader {
8184
private static final TestsDataset K8S = new TestsDataset("k8s", "k8s-mappings.json", "k8s.csv").withSetting("k8s-settings.json");
8285
private static final TestsDataset ADDRESSES = new TestsDataset("addresses");
8386
private static final TestsDataset BOOKS = new TestsDataset("books");
87+
private static final TestsDataset SEMANTIC_TEXT = new TestsDataset("semantic_text").withInferenceEndpoint(true);
8488

8589
public static final Map<String, TestsDataset> CSV_DATASET_MAP = Map.ofEntries(
8690
Map.entry(EMPLOYEES.indexName, EMPLOYEES),
@@ -112,7 +116,8 @@ public class CsvTestsDataLoader {
112116
Map.entry(K8S.indexName, K8S),
113117
Map.entry(DISTANCES.indexName, DISTANCES),
114118
Map.entry(ADDRESSES.indexName, ADDRESSES),
115-
Map.entry(BOOKS.indexName, BOOKS)
119+
Map.entry(BOOKS.indexName, BOOKS),
120+
Map.entry(SEMANTIC_TEXT.indexName, SEMANTIC_TEXT)
116121
);
117122

118123
private static final EnrichConfig LANGUAGES_ENRICH = new EnrichConfig("languages_policy", "enrich-policy-languages.json");
@@ -219,8 +224,13 @@ public static void main(String[] args) throws IOException {
219224
}
220225
}
221226

222-
private static void loadDataSetIntoEs(RestClient client, IndexCreator indexCreator) throws IOException {
223-
loadDataSetIntoEs(client, LogManager.getLogger(CsvTestsDataLoader.class), indexCreator);
227+
public static Set<TestsDataset> availableDatasetsForEs(RestClient client) throws IOException {
228+
boolean inferenceEnabled = clusterHasInferenceEndpoint(client);
229+
230+
return CSV_DATASET_MAP.values()
231+
.stream()
232+
.filter(d -> d.requiresInferenceEndpoint == false || inferenceEnabled)
233+
.collect(Collectors.toCollection(HashSet::new));
224234
}
225235

226236
public static void loadDataSetIntoEs(RestClient client) throws IOException {
@@ -229,22 +239,61 @@ public static void loadDataSetIntoEs(RestClient client) throws IOException {
229239
});
230240
}
231241

232-
public static void loadDataSetIntoEs(RestClient client, Logger logger) throws IOException {
233-
loadDataSetIntoEs(client, logger, (restClient, indexName, indexMapping, indexSettings) -> {
234-
ESRestTestCase.createIndex(restClient, indexName, indexSettings, indexMapping, null);
235-
});
236-
}
242+
private static void loadDataSetIntoEs(RestClient client, IndexCreator indexCreator) throws IOException {
243+
Logger logger = LogManager.getLogger(CsvTestsDataLoader.class);
237244

238-
private static void loadDataSetIntoEs(RestClient client, Logger logger, IndexCreator indexCreator) throws IOException {
239-
for (var dataset : CSV_DATASET_MAP.values()) {
245+
Set<String> loadedDatasets = new HashSet<>();
246+
for (var dataset : availableDatasetsForEs(client)) {
240247
load(client, dataset, logger, indexCreator);
248+
loadedDatasets.add(dataset.indexName);
241249
}
242-
forceMerge(client, CSV_DATASET_MAP.keySet(), logger);
250+
forceMerge(client, loadedDatasets, logger);
243251
for (var policy : ENRICH_POLICIES) {
244252
loadEnrichPolicy(client, policy.policyName, policy.policyFileName, logger);
245253
}
246254
}
247255

256+
/** The semantic_text mapping type requires an inference endpoint that needs to be set up before creating the index. */
257+
public static void createInferenceEndpoint(RestClient client) throws IOException {
258+
Request request = new Request("PUT", "_inference/sparse_embedding/test_sparse_inference");
259+
request.setJsonEntity("""
260+
{
261+
"service": "test_service",
262+
"service_settings": {
263+
"model": "my_model",
264+
"api_key": "abc64"
265+
},
266+
"task_settings": {
267+
}
268+
}
269+
""");
270+
client.performRequest(request);
271+
}
272+
273+
public static void deleteInferenceEndpoint(RestClient client) throws IOException {
274+
try {
275+
client.performRequest(new Request("DELETE", "_inference/test_sparse_inference"));
276+
} catch (ResponseException e) {
277+
// 404 here means the endpoint was not created
278+
if (e.getResponse().getStatusLine().getStatusCode() != 404) {
279+
throw e;
280+
}
281+
}
282+
}
283+
284+
public static boolean clusterHasInferenceEndpoint(RestClient client) throws IOException {
285+
Request request = new Request("GET", "_inference/sparse_embedding/test_sparse_inference");
286+
try {
287+
client.performRequest(request);
288+
} catch (ResponseException e) {
289+
if (e.getResponse().getStatusLine().getStatusCode() == 404) {
290+
return false;
291+
}
292+
throw e;
293+
}
294+
return true;
295+
}
296+
248297
private static void loadEnrichPolicy(RestClient client, String policyName, String policyFileName, Logger logger) throws IOException {
249298
URL policyMapping = CsvTestsDataLoader.class.getResource("/" + policyFileName);
250299
if (policyMapping == null) {
@@ -511,34 +560,79 @@ public record TestsDataset(
511560
String dataFileName,
512561
String settingFileName,
513562
boolean allowSubFields,
514-
Map<String, String> typeMapping
563+
Map<String, String> typeMapping,
564+
boolean requiresInferenceEndpoint
515565
) {
516566
public TestsDataset(String indexName, String mappingFileName, String dataFileName) {
517-
this(indexName, mappingFileName, dataFileName, null, true, null);
567+
this(indexName, mappingFileName, dataFileName, null, true, null, false);
518568
}
519569

520570
public TestsDataset(String indexName) {
521-
this(indexName, "mapping-" + indexName + ".json", indexName + ".csv", null, true, null);
571+
this(indexName, "mapping-" + indexName + ".json", indexName + ".csv", null, true, null, false);
522572
}
523573

524574
public TestsDataset withIndex(String indexName) {
525-
return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping);
575+
return new TestsDataset(
576+
indexName,
577+
mappingFileName,
578+
dataFileName,
579+
settingFileName,
580+
allowSubFields,
581+
typeMapping,
582+
requiresInferenceEndpoint
583+
);
526584
}
527585

528586
public TestsDataset withData(String dataFileName) {
529-
return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping);
587+
return new TestsDataset(
588+
indexName,
589+
mappingFileName,
590+
dataFileName,
591+
settingFileName,
592+
allowSubFields,
593+
typeMapping,
594+
requiresInferenceEndpoint
595+
);
530596
}
531597

532598
public TestsDataset withSetting(String settingFileName) {
533-
return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping);
599+
return new TestsDataset(
600+
indexName,
601+
mappingFileName,
602+
dataFileName,
603+
settingFileName,
604+
allowSubFields,
605+
typeMapping,
606+
requiresInferenceEndpoint
607+
);
534608
}
535609

536610
public TestsDataset noSubfields() {
537-
return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, false, typeMapping);
611+
return new TestsDataset(
612+
indexName,
613+
mappingFileName,
614+
dataFileName,
615+
settingFileName,
616+
false,
617+
typeMapping,
618+
requiresInferenceEndpoint
619+
);
538620
}
539621

540622
public TestsDataset withTypeMapping(Map<String, String> typeMapping) {
541-
return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping);
623+
return new TestsDataset(
624+
indexName,
625+
mappingFileName,
626+
dataFileName,
627+
settingFileName,
628+
allowSubFields,
629+
typeMapping,
630+
requiresInferenceEndpoint
631+
);
632+
}
633+
634+
public TestsDataset withInferenceEndpoint(boolean needsInference) {
635+
return new TestsDataset(indexName, mappingFileName, dataFileName, settingFileName, allowSubFields, typeMapping, needsInference);
542636
}
543637
}
544638

x-pack/plugin/esql/qa/testFixtures/src/main/java/org/elasticsearch/xpack/esql/EsqlTestUtils.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -648,7 +648,7 @@ public static Literal randomLiteral(DataType type) {
648648
case KEYWORD -> new BytesRef(randomAlphaOfLength(5));
649649
case IP -> new BytesRef(InetAddressPoint.encode(randomIp(randomBoolean())));
650650
case TIME_DURATION -> Duration.ofMillis(randomLongBetween(-604800000L, 604800000L)); // plus/minus 7 days
651-
case TEXT -> new BytesRef(randomAlphaOfLength(50));
651+
case TEXT, SEMANTIC_TEXT -> new BytesRef(randomAlphaOfLength(50));
652652
case VERSION -> randomVersion().toBytesRef();
653653
case GEO_POINT -> GEO.asWkb(GeometryTestUtils.randomPoint());
654654
case CARTESIAN_POINT -> CARTESIAN.asWkb(ShapeTestUtils.randomPoint());

0 commit comments

Comments
 (0)