Skip to content

Commit df6d430

Browse files
authored
Refactor ecma_builtin_global_object_unescape (#4115)
Based on ECMA-262 v11, B.2.1.2.
Fixed tests, removed from the exclude list:
* annexB/built-ins/unescape/four-ignore-bad-u.js
* annexB/built-ins/unescape/four.js
* annexB/built-ins/unescape/two.js
JerryScript-DCO-1.0-Signed-off-by: Adam Szilagyi [email protected]
1 parent 4541524 commit df6d430

File tree

3 files changed

+180
-61
lines changed

3 files changed

+180
-61
lines changed

jerry-core/ecma/builtin-objects/ecma-builtin-global.c

+60-58
Original file line numberDiff line numberDiff line change
@@ -494,11 +494,49 @@ ecma_builtin_global_object_escape (lit_utf8_byte_t *input_start_p, /**< routine'
494494
return ecma_make_string_value (ecma_stringbuilder_finalize (&builder));
495495
} /* ecma_builtin_global_object_escape */
496496

497+
/**
498+
* Utility method to resolve character sequences for the 'unescape' method.
499+
*
500+
* Expected formats: %uxxxx or %yy
501+
*
502+
* @return number of characters processed during the escape resolve
503+
*/
504+
static uint8_t
505+
ecma_builtin_global_object_unescape_resolve_escape (const lit_utf8_byte_t *buffer_p, /**< character buffer */
506+
bool unicode_sequence, /**< true if unescaping unicode sequence */
507+
ecma_char_t *out_result_p) /**< [out] resolved character */
508+
{
509+
JERRY_ASSERT (buffer_p != NULL);
510+
JERRY_ASSERT (out_result_p != NULL);
511+
512+
ecma_char_t unescaped_chr = 0;
513+
uint8_t sequence_length = unicode_sequence ? 5 : 2;
514+
uint8_t start = unicode_sequence ? 1 : 0;
515+
516+
for (uint8_t i = start; i < sequence_length; i++)
517+
{
518+
const lit_utf8_byte_t current_char = buffer_p[i];
519+
520+
if (!lit_char_is_hex_digit (current_char))
521+
{
522+
/* This was not an escape sequence, skip processing */
523+
return 0;
524+
}
525+
526+
unescaped_chr = (ecma_char_t) ((unescaped_chr << 4) + (ecma_char_t) lit_char_hex_to_int (current_char));
527+
}
528+
529+
*out_result_p = unescaped_chr;
530+
531+
return sequence_length;
532+
} /* ecma_builtin_global_object_unescape_resolve_escape */
533+
497534
/**
498535
* The Global object's 'unescape' routine
499536
*
500537
* See also:
501538
* ECMA-262 v5, B.2.2
539+
* ECMA-262 v11, B.2.1.2
502540
*
503541
* @return ecma value
504542
* Returned value must be freed with ecma_free_value.
@@ -509,76 +547,40 @@ ecma_builtin_global_object_unescape (lit_utf8_byte_t *input_start_p, /**< routin
509547
lit_utf8_size_t input_size) /**< routine's first argument's
510548
* string buffer's size */
511549
{
550+
if (input_size == 0)
551+
{
552+
return ecma_make_magic_string_value (LIT_MAGIC_STRING__EMPTY);
553+
}
554+
512555
const lit_utf8_byte_t *input_curr_p = input_start_p;
513556
const lit_utf8_byte_t *input_end_p = input_start_p + input_size;
514-
/* 4. */
515-
/* The length of input string is always greater than output string
516-
* so we re-use the input string buffer.
517-
* The %xx is three byte long, and the maximum encoded value is 0xff,
518-
* which maximum encoded length is two byte. Similar to this, the maximum
519-
* encoded length of %uxxxx is four byte. */
520-
lit_utf8_byte_t *output_char_p = input_start_p;
521-
522-
/* The state of parsing that tells us where we are in an escape pattern.
523-
* 0 we are outside of pattern,
524-
* 1 found '%', start of pattern,
525-
* 2 found first hex digit of '%xy' pattern
526-
* 3 found valid '%xy' pattern
527-
* 4 found 'u', start of '%uwxyz' pattern
528-
* 5-7 found hex digits of '%uwxyz' pattern
529-
* 8 found valid '%uwxyz' pattern
530-
*/
531-
uint8_t status = 0;
532-
ecma_char_t hex_digits = 0;
533-
/* 5. */
557+
ecma_stringbuilder_t builder = ecma_stringbuilder_create ();
558+
534559
while (input_curr_p < input_end_p)
535560
{
536-
/* 6. */
537561
ecma_char_t chr = lit_cesu8_read_next (&input_curr_p);
538562

539-
/* 7-8. */
540-
if (status == 0 && chr == LIT_CHAR_PERCENT)
541-
{
542-
/* Found '%' char, start of escape sequence. */
543-
status = 1;
544-
}
545-
/* 9-10. */
546-
else if (status == 1 && chr == LIT_CHAR_LOWERCASE_U)
563+
// potential pattern
564+
if (chr == LIT_CHAR_PERCENT)
547565
{
548-
/* Found 'u' char after '%'. */
549-
status = 4;
550-
}
551-
else if (status > 0 && lit_char_is_hex_digit (chr))
552-
{
553-
/* Found hexadecimal digit in escape sequence. */
554-
hex_digits = (ecma_char_t) (hex_digits * 16 + (ecma_char_t) lit_char_hex_to_int (chr));
555-
status++;
556-
}
557-
else
558-
{
559-
/* Previously found hexadecimal digit in escape sequence but it's not valid '%xy' pattern
560-
* so essentially it was only a simple character. */
561-
status = 0;
562-
}
566+
const lit_utf8_size_t chars_leftover = (lit_utf8_size_t) (input_end_p - input_curr_p);
563567

564-
/* 11-17. Found valid '%uwxyz' or '%xy' escape. */
565-
if (status == 8 || status == 3)
566-
{
567-
output_char_p -= (status == 3) ? 2 : 5;
568-
status = 0;
569-
chr = hex_digits;
570-
hex_digits = 0;
568+
// potential unicode sequence
569+
if (chars_leftover >= 5 && input_curr_p[0] == LIT_CHAR_LOWERCASE_U)
570+
{
571+
input_curr_p += ecma_builtin_global_object_unescape_resolve_escape (input_curr_p, true, &chr);
572+
}
573+
// potential two hexa sequence
574+
else if (chars_leftover >= 2)
575+
{
576+
input_curr_p += ecma_builtin_global_object_unescape_resolve_escape (input_curr_p, false, &chr);
577+
}
571578
}
572579

573-
/* Copying character. */
574-
lit_utf8_size_t lit_size = lit_code_unit_to_utf8 (chr, output_char_p);
575-
output_char_p += lit_size;
576-
JERRY_ASSERT (output_char_p <= input_curr_p);
580+
ecma_stringbuilder_append_char (&builder, chr);
577581
}
578582

579-
lit_utf8_size_t output_length = (lit_utf8_size_t) (output_char_p - input_start_p);
580-
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (input_start_p, output_length);
581-
return ecma_make_string_value (output_string_p);
583+
return ecma_make_string_value (ecma_stringbuilder_finalize (&builder));
582584
} /* ecma_builtin_global_object_unescape */
583585

584586
#endif /* ENABLED (JERRY_BUILTIN_ANNEXB) */
+120
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Copyright JS Foundation and other contributors, http://js.foundation
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Copyright (C) 2016 the V8 project authors. All rights reserved.
16+
// This code is governed by the BSD license found in the LICENSE file.
17+
18+
// Each entry is [input, expected result of unescape(input)].
var unescape_cases = [
  // '%' followed by something that is not an escape sequence stays untouched
  ['%U0000', '%U0000'],
  ['%t0000', '%t0000'],
  ['%v0000', '%v0000'],
  ['%%0000', '%\x0000'],

  // tests for two hexa unescape
  ['%0%0000', '%0\x0000'],
  ['%0%0100', '%0\x0100'],

  ['%0%2900', '%0)00'],
  ['%0%2a00', '%0*00'],
  ['%0%2A00', '%0*00'],
  ['%0%2b00', '%0+00'],
  ['%0%2B00', '%0+00'],
  ['%0%2c00', '%0,00'],
  ['%0%2C00', '%0,00'],
  ['%0%2d00', '%0-00'],
  ['%0%2D00', '%0-00'],

  ['%0%3900', '%0900'],
  ['%0%3a00', '%0:00'],
  ['%0%3A00', '%0:00'],

  ['%0%3f00', '%0?00'],
  ['%0%3F00', '%0?00'],
  ['%0%4000', '%0@00'],

  ['%0%5a00', '%0Z00'],
  ['%0%5A00', '%0Z00'],
  ['%0%5b00', '%0[00'],
  ['%0%5B00', '%0[00'],

  ['%0%5e00', '%0^00'],
  ['%0%5E00', '%0^00'],
  ['%0%5f00', '%0_00'],
  ['%0%5F00', '%0_00'],
  ['%0%6000', '%0`00'],
  ['%0%6100', '%0a00'],

  ['%0%7a00', '%0z00'],
  ['%0%7A00', '%0z00'],
  ['%0%7b00', '%0{00'],
  ['%0%7B00', '%0{00'],

  ['%0%fe00', '%0\xfe00'],
  ['%0%Fe00', '%0\xfe00'],
  ['%0%fE00', '%0\xfe00'],
  ['%0%FE00', '%0\xfe00'],

  ['%0%ff00', '%0\xff00'],
  ['%0%Ff00', '%0\xff00'],
  ['%0%fF00', '%0\xff00'],
  ['%0%FF00', '%0\xff00'],

  // tests for unicode unescape
  ['%0%u00290', '%0)0'],
  ['%0%u002a0', '%0*0'],
  ['%0%u002A0', '%0*0'],
  ['%0%u002b0', '%0+0'],
  ['%0%u002B0', '%0+0'],
  ['%0%u002c0', '%0,0'],
  ['%0%u002C0', '%0,0'],
  ['%0%u002d0', '%0-0'],
  ['%0%u002D0', '%0-0'],

  ['%0%u00390', '%090'],
  ['%0%u003a0', '%0:0'],
  ['%0%u003A0', '%0:0'],

  ['%0%u003f0', '%0?0'],
  ['%0%u003F0', '%0?0'],
  ['%0%u00400', '%0@0'],

  ['%0%u005a0', '%0Z0'],
  ['%0%u005A0', '%0Z0'],
  ['%0%u005b0', '%0[0'],
  ['%0%u005B0', '%0[0'],

  ['%0%u005e0', '%0^0'],
  ['%0%u005E0', '%0^0'],
  ['%0%u005f0', '%0_0'],
  ['%0%u005F0', '%0_0'],
  ['%0%u00600', '%0`0'],
  ['%0%u00610', '%0a0'],

  ['%0%u007a0', '%0z0'],
  ['%0%u007A0', '%0z0'],
  ['%0%u007b0', '%0{0'],
  ['%0%u007B0', '%0{0'],

  ['%0%ufffe0', '%0\ufffe0'],
  ['%0%uFffe0', '%0\ufffe0'],
  ['%0%ufFfe0', '%0\ufffe0'],
  ['%0%uffFe0', '%0\ufffe0'],
  ['%0%ufffE0', '%0\ufffe0'],
  ['%0%uFFFE0', '%0\ufffe0'],

  ['%0%uffff0', '%0\uffff0'],
  ['%0%uFfff0', '%0\uffff0'],
  ['%0%ufFff0', '%0\uffff0'],
  ['%0%uffFf0', '%0\uffff0'],
  ['%0%ufffF0', '%0\uffff0'],
  ['%0%uFFFF0', '%0\uffff0']
];

// Run the cases in order; the first mismatch trips the assertion.
for (var idx = 0; idx < unescape_cases.length; idx++) {
  var input = unescape_cases[idx][0];
  var expected = unescape_cases[idx][1];

  assert(unescape(input) === expected);
}

tests/test262-esnext-excludelist.xml

-3
Original file line numberDiff line numberDiff line change
@@ -1168,9 +1168,6 @@
11681168
<test id="annexB/built-ins/String/prototype/sup/name.js"><reason></reason></test>
11691169
<test id="annexB/built-ins/String/prototype/sup/prop-desc.js"><reason></reason></test>
11701170
<test id="annexB/built-ins/String/prototype/sup/this-val-tostring-err.js"><reason></reason></test>
1171-
<test id="annexB/built-ins/unescape/four-ignore-bad-u.js"><reason></reason></test>
1172-
<test id="annexB/built-ins/unescape/four.js"><reason></reason></test>
1173-
<test id="annexB/built-ins/unescape/two.js"><reason></reason></test>
11741171
<test id="annexB/language/comments/multi-line-html-close.js"><reason></reason></test>
11751172
<test id="annexB/language/comments/single-line-html-close-asi.js"><reason></reason></test>
11761173
<test id="annexB/language/comments/single-line-html-close-unicode-separators.js"><reason></reason></test>

0 commit comments

Comments
 (0)