Skip to content

Commit ca1930e

Browse files
committed
Allow Uid#decodeId to decode from a byte array slice (#26987)
Today we only allow decoding byte arrays where the data starts at offset 0 and has the same length as the array. Allowing IDs to be decoded from a slice makes decoding cheaper when the ID comes, for instance, from a term dictionary or a BytesRef. Relates to #26931
1 parent 1c3e02c commit ca1930e

File tree

2 files changed

+72
-47
lines changed
  • core/src

2 files changed

+72
-47
lines changed

core/src/main/java/org/elasticsearch/index/mapper/Uid.java

+52-44
Original file line numberDiff line numberDiff line change
@@ -135,36 +135,36 @@ static boolean isURLBase64WithoutPadding(String id) {
135135
// 'xxx=' and 'xxx' could be considered the same id
136136
final int length = id.length();
137137
switch (length & 0x03) {
138-
case 0:
139-
break;
140-
case 1:
141-
return false;
142-
case 2:
143-
// the last 2 symbols (12 bits) are encoding 1 byte (8 bits)
144-
// so the last symbol only actually uses 8-6=2 bits and can only take 4 values
145-
char last = id.charAt(length - 1);
146-
if (last != 'A' && last != 'Q' && last != 'g' && last != 'w') {
138+
case 0:
139+
break;
140+
case 1:
147141
return false;
148-
}
149-
break;
150-
case 3:
151-
// The last 3 symbols (18 bits) are encoding 2 bytes (16 bits)
152-
// so the last symbol only actually uses 16-12=4 bits and can only take 16 values
153-
last = id.charAt(length - 1);
154-
if (last != 'A' && last != 'E' && last != 'I' && last != 'M' && last != 'Q'&& last != 'U'&& last != 'Y'
142+
case 2:
143+
// the last 2 symbols (12 bits) are encoding 1 byte (8 bits)
144+
// so the last symbol only actually uses 8-6=2 bits and can only take 4 values
145+
char last = id.charAt(length - 1);
146+
if (last != 'A' && last != 'Q' && last != 'g' && last != 'w') {
147+
return false;
148+
}
149+
break;
150+
case 3:
151+
// The last 3 symbols (18 bits) are encoding 2 bytes (16 bits)
152+
// so the last symbol only actually uses 16-12=4 bits and can only take 16 values
153+
last = id.charAt(length - 1);
154+
if (last != 'A' && last != 'E' && last != 'I' && last != 'M' && last != 'Q'&& last != 'U'&& last != 'Y'
155155
&& last != 'c'&& last != 'g'&& last != 'k' && last != 'o' && last != 's' && last != 'w'
156156
&& last != '0' && last != '4' && last != '8') {
157-
return false;
158-
}
159-
break;
160-
default:
161-
// number & 0x03 is always in [0,3]
162-
throw new AssertionError("Impossible case");
157+
return false;
158+
}
159+
break;
160+
default:
161+
// number & 0x03 is always in [0,3]
162+
throw new AssertionError("Impossible case");
163163
}
164164
for (int i = 0; i < length; ++i) {
165165
final char c = id.charAt(i);
166166
final boolean allowed =
167-
(c >= '0' && c <= '9') ||
167+
(c >= '0' && c <= '9') ||
168168
(c >= 'A' && c <= 'Z') ||
169169
(c >= 'a' && c <= 'z') ||
170170
c == '-' || c == '_';
@@ -244,16 +244,16 @@ public static BytesRef encodeId(String id) {
244244
}
245245
}
246246

247-
private static String decodeNumericId(byte[] idBytes) {
248-
assert Byte.toUnsignedInt(idBytes[0]) == NUMERIC;
249-
int length = (idBytes.length - 1) * 2;
247+
private static String decodeNumericId(byte[] idBytes, int offset, int len) {
248+
assert Byte.toUnsignedInt(idBytes[offset]) == NUMERIC;
249+
int length = (len - 1) * 2;
250250
char[] chars = new char[length];
251-
for (int i = 1; i < idBytes.length; ++i) {
252-
final int b = Byte.toUnsignedInt(idBytes[i]);
251+
for (int i = 1; i < len; ++i) {
252+
final int b = Byte.toUnsignedInt(idBytes[offset + i]);
253253
final int b1 = (b >>> 4);
254254
final int b2 = b & 0x0f;
255255
chars[(i - 1) * 2] = (char) (b1 + '0');
256-
if (i == idBytes.length - 1 && b2 == 0x0f) {
256+
if (i == len - 1 && b2 == 0x0f) {
257257
length--;
258258
break;
259259
}
@@ -262,33 +262,41 @@ private static String decodeNumericId(byte[] idBytes) {
262262
return new String(chars, 0, length);
263263
}
264264

265-
private static String decodeUtf8Id(byte[] idBytes) {
266-
assert Byte.toUnsignedInt(idBytes[0]) == UTF8;
267-
return new BytesRef(idBytes, 1, idBytes.length - 1).utf8ToString();
265+
private static String decodeUtf8Id(byte[] idBytes, int offset, int length) {
266+
assert Byte.toUnsignedInt(idBytes[offset]) == UTF8;
267+
return new BytesRef(idBytes, offset + 1, length - 1).utf8ToString();
268268
}
269269

270-
private static String decodeBase64Id(byte[] idBytes) {
271-
assert Byte.toUnsignedInt(idBytes[0]) <= BASE64_ESCAPE;
272-
if (Byte.toUnsignedInt(idBytes[0]) == BASE64_ESCAPE) {
273-
idBytes = Arrays.copyOfRange(idBytes, 1, idBytes.length);
270+
private static String decodeBase64Id(byte[] idBytes, int offset, int length) {
271+
assert Byte.toUnsignedInt(idBytes[offset]) <= BASE64_ESCAPE;
272+
if (Byte.toUnsignedInt(idBytes[offset]) == BASE64_ESCAPE) {
273+
idBytes = Arrays.copyOfRange(idBytes, offset + 1, offset + length);
274+
} else if ((idBytes.length == length && offset == 0) == false) { // no need to copy if it's not a slice
275+
idBytes = Arrays.copyOfRange(idBytes, offset, offset + length);
274276
}
275277
return Base64.getUrlEncoder().withoutPadding().encodeToString(idBytes);
276278
}
277279

278280
/** Decode an indexed id back to its original form.
279281
* @see #encodeId */
280282
public static String decodeId(byte[] idBytes) {
281-
if (idBytes.length == 0) {
283+
return decodeId(idBytes, 0, idBytes.length);
284+
}
285+
286+
/** Decode an indexed id back to its original form.
287+
* @see #encodeId */
288+
public static String decodeId(byte[] idBytes, int offset, int length) {
289+
if (length == 0) {
282290
throw new IllegalArgumentException("Ids can't be empty");
283291
}
284-
final int magicChar = Byte.toUnsignedInt(idBytes[0]);
292+
final int magicChar = Byte.toUnsignedInt(idBytes[offset]);
285293
switch (magicChar) {
286-
case NUMERIC:
287-
return decodeNumericId(idBytes);
288-
case UTF8:
289-
return decodeUtf8Id(idBytes);
290-
default:
291-
return decodeBase64Id(idBytes);
294+
case NUMERIC:
295+
return decodeNumericId(idBytes, offset, length);
296+
case UTF8:
297+
return decodeUtf8Id(idBytes, offset, length);
298+
default:
299+
return decodeBase64Id(idBytes, offset, length);
292300
}
293301
}
294302
}

core/src/test/java/org/elasticsearch/index/mapper/UidTests.java

+20-3
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public void testEncodeUTF8Ids() {
7979
for (int iter = 0; iter < iters; ++iter) {
8080
final String id = TestUtil.randomRealisticUnicodeString(random(), 1, 10);
8181
BytesRef encoded = Uid.encodeId(id);
82-
assertEquals(id, Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length)));
82+
assertEquals(id, doDecodeId(encoded));
8383
assertTrue(encoded.length <= 1 + new BytesRef(id).length);
8484
}
8585
}
@@ -93,7 +93,7 @@ public void testEncodeNumericIds() {
9393
id = "0" + id;
9494
}
9595
BytesRef encoded = Uid.encodeId(id);
96-
assertEquals(id, Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length)));
96+
assertEquals(id, doDecodeId(encoded));
9797
assertEquals(1 + (id.length() + 1) / 2, encoded.length);
9898
}
9999
}
@@ -105,9 +105,26 @@ public void testEncodeBase64Ids() {
105105
random().nextBytes(binaryId);
106106
final String id = Base64.getUrlEncoder().withoutPadding().encodeToString(binaryId);
107107
BytesRef encoded = Uid.encodeId(id);
108-
assertEquals(id, Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length)));
108+
assertEquals(id, doDecodeId(encoded));
109109
assertTrue(encoded.length <= 1 + binaryId.length);
110110
}
111111
}
112112

113+
private static String doDecodeId(BytesRef encoded) {
114+
115+
if (randomBoolean()) {
116+
return Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length));
117+
} else {
118+
if (randomBoolean()) {
119+
BytesRef slicedCopy = new BytesRef(randomIntBetween(encoded.length + 1, encoded.length + 100));
120+
slicedCopy.offset = randomIntBetween(1, slicedCopy.bytes.length - encoded.length);
121+
slicedCopy.length = encoded.length;
122+
System.arraycopy(encoded.bytes, encoded.offset, slicedCopy.bytes, slicedCopy.offset, encoded.length);
123+
assertArrayEquals(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length),
124+
Arrays.copyOfRange(slicedCopy.bytes, slicedCopy.offset, slicedCopy.offset + slicedCopy.length));
125+
encoded = slicedCopy;
126+
}
127+
return Uid.decodeId(encoded.bytes, encoded.offset, encoded.length);
128+
}
129+
}
113130
}

0 commit comments

Comments
 (0)