Skip to content

Commit bf8424a

Browse files
committed
Add the utf16idx() and strutf16len() functions and add UTF-16 flag to the byteidx(), byteidxcomp() and charidx() functions
1 parent 8281a16 commit bf8424a

File tree

7 files changed

+655
-37
lines changed

7 files changed

+655
-37
lines changed

runtime/doc/builtin.txt

+78-8
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ bufnr([{buf} [, {create}]]) Number Number of the buffer {buf}
8181
bufwinid({buf}) Number window ID of buffer {buf}
8282
bufwinnr({buf}) Number window number of buffer {buf}
8383
byte2line({byte}) Number line number at byte count {byte}
84-
byteidx({expr}, {nr}) Number byte index of {nr}'th char in {expr}
85-
byteidxcomp({expr}, {nr}) Number byte index of {nr}'th char in {expr}
84+
byteidx({expr}, {nr} [, {utf16}])
85+
Number byte index of {nr}'th char in {expr}
86+
byteidxcomp({expr}, {nr} [, {utf16}])
87+
Number byte index of {nr}'th char in {expr}
8688
call({func}, {arglist} [, {dict}])
8789
any call {func} with arguments {arglist}
8890
ceil({expr}) Float round {expr} up
@@ -117,7 +119,7 @@ changenr() Number current change number
117119
char2nr({expr} [, {utf8}]) Number ASCII/UTF-8 value of first char in {expr}
118120
charclass({string}) Number character class of {string}
119121
charcol({expr} [, {winid}]) Number column number of cursor or mark
120-
charidx({string}, {idx} [, {countcc}])
122+
charidx({string}, {idx} [, {countcc} [, {utf16}]])
121123
Number char index of byte {idx} in {string}
122124
chdir({dir}) String change current working directory
123125
cindent({lnum}) Number C indent for line {lnum}
@@ -604,6 +606,8 @@ strptime({format}, {timestring})
604606
strridx({haystack}, {needle} [, {start}])
605607
Number last index of {needle} in {haystack}
606608
strtrans({expr}) String translate string to make it printable
609+
strutf16len({string} [, {countcc}])
610+
Number number of UTF-16 code units in {string}
607611
strwidth({expr}) Number display cell length of the String {expr}
608612
submatch({nr} [, {list}]) String or List
609613
specific match in ":s" or substitute()
@@ -704,6 +708,8 @@ undofile({name}) String undo file name for {name}
704708
undotree() List undo file tree
705709
uniq({list} [, {func} [, {dict}]])
706710
List remove adjacent duplicates from a list
711+
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
712+
Number UTF-16 index of byte {idx} in {string}
707713
values({dict}) List values in {dict}
708714
virtcol({expr} [, {list}]) Number or List
709715
screen column of cursor or mark
@@ -1363,7 +1369,7 @@ byte2line({byte}) *byte2line()*
13631369
< {not available when compiled without the |+byte_offset|
13641370
feature}
13651371

1366-
byteidx({expr}, {nr}) *byteidx()*
1372+
byteidx({expr}, {nr} [, {utf16}]) *byteidx()*
13671373
Return byte index of the {nr}'th character in the String
13681374
{expr}. Use zero for the first character, it then returns
13691375
zero.
@@ -1373,6 +1379,13 @@ byteidx({expr}, {nr}) *byteidx()*
13731379
length is added to the preceding base character. See
13741380
|byteidxcomp()| below for counting composing characters
13751381
separately.
1382+
When {utf16} is TRUE, {nr} is used as the UTF-16 index in the
1383+
String {expr} instead of as the character index. The UTF-16
1384+
index is the index in the string when it is encoded with
1385+
16-bit words. If the specified UTF-16 index is in the middle
1386+
of a character (e.g. in a 4-byte character), then the byte
1387+
index of the first byte in the character is returned.
1388+
Refer to |string-offset-encoding| for more information.
13761389
Example: >
13771390
echo matchstr(str, ".", byteidx(str, 3))
13781391
< will display the fourth character. Another way to do the
@@ -1384,11 +1397,17 @@ byteidx({expr}, {nr}) *byteidx()*
13841397
If there are fewer than {nr} characters -1 is returned.
13851398
If there are exactly {nr} characters the length of the string
13861399
in bytes is returned.
1387-
1400+
See |charidx()| and |utf16idx()| for getting the character and
1401+
UTF-16 index respectively from the byte index.
1402+
Examples: >
1403+
echo byteidx('a😊😊', 2) returns 5
1404+
echo byteidx('a😊😊', 2, 1) returns 1
1405+
echo byteidx('a😊😊', 3, 1) returns 5
1406+
<
13881407
Can also be used as a |method|: >
13891408
GetName()->byteidx(idx)
13901409
1391-
byteidxcomp({expr}, {nr}) *byteidxcomp()*
1410+
byteidxcomp({expr}, {nr} [, {utf16}]) *byteidxcomp()*
13921411
Like byteidx(), except that a composing character is counted
13931412
as a separate character. Example: >
13941413
let s = 'e' .. nr2char(0x301)
@@ -1493,7 +1512,7 @@ charcol({expr} [, {winid}]) *charcol()*
14931512
GetPos()->col()
14941513
<
14951514
*charidx()*
1496-
charidx({string}, {idx} [, {countcc}])
1515+
charidx({string}, {idx} [, {countcc} [, {utf16}]])
14971516
Return the character index of the byte at {idx} in {string}.
14981517
The index of the first character is zero.
14991518
If there are no multibyte characters the returned value is
@@ -1503,17 +1522,22 @@ charidx({string}, {idx} [, {countcc}])
15031522
added to the preceding base character.
15041523
When {countcc} is |TRUE|, then composing characters are
15051524
counted as separate characters.
1525+
When {utf16} is TRUE, {idx} is used as the UTF-16 index in the
1526+
String {string} instead of as the byte index.
15061527
Returns -1 if the arguments are invalid or if {idx} is greater
15071528
than the index of the last byte in {string}. An error is
15081529
given if the first argument is not a string, the second
15091530
argument is not a number or when the third argument is present
15101531
and is not zero or one.
15111532
See |byteidx()| and |byteidxcomp()| for getting the byte index
1512-
from the character index.
1533+
from the character index and |utf16idx()| for getting the
1534+
UTF-16 index from the character index.
1535+
Refer to |string-offset-encoding| for more information.
15131536
Examples: >
15141537
echo charidx('áb́ć', 3) returns 1
15151538
echo charidx('áb́ć', 6, 1) returns 4
15161539
echo charidx('áb́ć', 16) returns -1
1540+
echo charidx('a😊😊', 4, 0, 1) returns 2
15171541
<
15181542
Can also be used as a |method|: >
15191543
GetName()->charidx(idx)
@@ -9243,6 +9267,27 @@ strtrans({string}) *strtrans()*
92439267
Can also be used as a |method|: >
92449268
GetString()->strtrans()
92459269
9270+
strutf16len({string} [, {countcc}]) *strutf16len()*
9271+
The result is a Number, which is the number of UTF-16 code
9272+
units in String {string}.
9273+
When {countcc} is TRUE, composing characters are counted
9274+
separately.
9275+
When {countcc} is omitted or FALSE, composing characters are
9276+
ignored.
9277+
9278+
Returns zero on error.
9279+
9280+
Also see |strlen()| and |strcharlen()|.
9281+
Examples: >
9282+
echo strutf16len('a') returns 1
9283+
echo strutf16len('©') returns 1
9284+
echo strutf16len('😊') returns 2
9285+
echo strutf16len('ą́') returns 1
9286+
echo strutf16len('ą́', v:true) returns 3
9287+
9288+
Can also be used as a |method|: >
9289+
GetText()->strutf16len()
9290+
<
92469291
strwidth({string}) *strwidth()*
92479292
The result is a Number, which is the number of display cells
92489293
String {string} occupies. A Tab character is counted as one
@@ -10058,6 +10103,31 @@ uniq({list} [, {func} [, {dict}]]) *uniq()* *E882*
1005810103

1005910104
Can also be used as a |method|: >
1006010105
mylist->uniq()
10106+
<
10107+
*utf16idx()*
10108+
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
10109+
Same as |charidx()| but returns the UTF-16 index of the byte
10110+
at {idx} in {string}.
10111+
When {charidx} is TRUE, {idx} is used as the character index
10112+
in the String {string} instead of as the byte index.
10113+
An {idx} in the middle of a UTF-8 sequence is rounded upwards
10114+
to the end of that sequence.
10115+
See |byteidx()| and |byteidxcomp()| for getting the byte index
10116+
from the UTF-16 index and |charidx()| for getting the
10117+
character index from the UTF-16 index.
10118+
Refer to |string-offset-encoding| for more information.
10119+
Examples: >
10120+
echo utf16idx('a😊😊', 3) returns 2
10121+
echo utf16idx('a😊😊', 7) returns 4
10122+
echo utf16idx('a😊😊', 1, 0, 1) returns 2
10123+
echo utf16idx('a😊😊', 2, 0, 1) returns 4
10124+
echo utf16idx('aą́c', 6) returns 2
10125+
echo utf16idx('aą́c', 6, 1) returns 4
10126+
echo utf16idx('a😊😊', 9) returns -1
10127+
<
10128+
Can also be used as a |method|: >
10129+
GetName()->utf16idx(idx)
10130+
1006110131
1006210132
values({dict}) *values()*
1006310133
Return a |List| with all the values of {dict}. The |List| is

runtime/doc/eval.txt

+20
Original file line numberDiff line numberDiff line change
@@ -1580,6 +1580,26 @@ Examples: >
15801580
echo $"The square root of {{9}} is {sqrt(9)}"
15811581
< The square root of {9} is 3.0 ~
15821582

1583+
*string-offset-encoding*
1584+
A string can consist of one-, two-, three- or four-byte characters. A
1585+
string offset can use UTF-8 encoding (byte) or UTF-16 encoding (16-bit words)
1586+
or UTF-32 encoding (character). In legacy Vim scripts UTF-8 encoding (byte)
1587+
is used for string offsets and in Vim9 scripts UTF-32 encoding (character) is
1588+
used for string offsets.
1589+
1590+
The different offsets for the string "a©😊" are below:
1591+
1592+
UTF-8 offsets:
1593+
[0]: 61, [1]: C2, [2]: A9, [3]: F0, [4]: 9F, [5]: 98, [6]: 8A
1594+
UTF-16 offsets:
1595+
[0]: 0061, [1]: 00A9, [2]: D83D, [3]: DE0A
1596+
UTF-32 offsets:
1597+
[0]: 00000061, [1]: 000000A9, [2]: 0001F60A
1598+
1599+
The functions |byteidx()|, |utf16idx()| and |charidx()| can be used to convert
1600+
between these indices. The functions |strlen()|, |strutf16len()| and
1601+
|strcharlen()| return the number of bytes, UTF-16 code units and characters in
1602+
a string respectively.
15831603

15841604
option *expr-option* *E112* *E113*
15851605
------

runtime/doc/usr_41.txt

+2
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,7 @@ String manipulation: *string-functions*
754754
strlen() length of a string in bytes
755755
strcharlen() length of a string in characters
756756
strchars() number of characters in a string
757+
strutf16len() number of UTF-16 code units in a string
757758
strwidth() size of string when displayed
758759
strdisplaywidth() size of string when displayed, deals with tabs
759760
setcellwidths() set character cell width overrides
@@ -771,6 +772,7 @@ String manipulation: *string-functions*
771772
byteidx() byte index of a character in a string
772773
byteidxcomp() like byteidx() but count composing characters
773774
charidx() character index of a byte in a string
775+
utf16idx() UTF-16 index of a byte in a string
774776
repeat() repeat a string multiple times
775777
eval() evaluate a string expression
776778
execute() execute an Ex command and get the output

src/evalfunc.c

+7-3
Original file line numberDiff line numberDiff line change
@@ -1751,9 +1751,9 @@ static funcentry_T global_functions[] =
17511751
ret_number, f_bufwinnr},
17521752
{"byte2line", 1, 1, FEARG_1, arg1_number,
17531753
ret_number, f_byte2line},
1754-
{"byteidx", 2, 2, FEARG_1, arg2_string_number,
1754+
{"byteidx", 2, 3, FEARG_1, arg3_string_number_bool,
17551755
ret_number, f_byteidx},
1756-
{"byteidxcomp", 2, 2, FEARG_1, arg2_string_number,
1756+
{"byteidxcomp", 2, 3, FEARG_1, arg3_string_number_bool,
17571757
ret_number, f_byteidxcomp},
17581758
{"call", 2, 3, FEARG_1, arg3_any_list_dict,
17591759
ret_any, f_call},
@@ -1803,7 +1803,7 @@ static funcentry_T global_functions[] =
18031803
ret_number, f_charclass},
18041804
{"charcol", 1, 2, FEARG_1, arg2_string_or_list_number,
18051805
ret_number, f_charcol},
1806-
{"charidx", 2, 3, FEARG_1, arg3_string_number_bool,
1806+
{"charidx", 2, 4, FEARG_1, arg3_string_number_bool,
18071807
ret_number, f_charidx},
18081808
{"chdir", 1, 1, FEARG_1, arg1_string,
18091809
ret_string, f_chdir},
@@ -2601,6 +2601,8 @@ static funcentry_T global_functions[] =
26012601
ret_number, f_strridx},
26022602
{"strtrans", 1, 1, FEARG_1, arg1_string,
26032603
ret_string, f_strtrans},
2604+
{"strutf16len", 1, 2, FEARG_1, arg2_string_bool,
2605+
ret_number, f_strutf16len},
26042606
{"strwidth", 1, 1, FEARG_1, arg1_string,
26052607
ret_number, f_strwidth},
26062608
{"submatch", 1, 2, FEARG_1, arg2_number_bool,
@@ -2785,6 +2787,8 @@ static funcentry_T global_functions[] =
27852787
ret_dict_any, f_undotree},
27862788
{"uniq", 1, 3, FEARG_1, arg13_sortuniq,
27872789
ret_first_arg, f_uniq},
2790+
{"utf16idx", 2, 4, FEARG_1, arg3_string_number_bool,
2791+
ret_number, f_utf16idx},
27882792
{"values", 1, 1, FEARG_1, arg1_dict_any,
27892793
ret_list_member, f_values},
27902794
{"virtcol", 1, 2, FEARG_1, arg2_string_or_list_bool,

src/proto/strings.pro

+2
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ void f_string(typval_T *argvars, typval_T *rettv);
3636
void f_strlen(typval_T *argvars, typval_T *rettv);
3737
void f_strcharlen(typval_T *argvars, typval_T *rettv);
3838
void f_strchars(typval_T *argvars, typval_T *rettv);
39+
void f_strutf16len(typval_T *argvars, typval_T *rettv);
3940
void f_strdisplaywidth(typval_T *argvars, typval_T *rettv);
4041
void f_strwidth(typval_T *argvars, typval_T *rettv);
4142
void f_strcharpart(typval_T *argvars, typval_T *rettv);
4243
void f_strpart(typval_T *argvars, typval_T *rettv);
4344
void f_strridx(typval_T *argvars, typval_T *rettv);
4445
void f_strtrans(typval_T *argvars, typval_T *rettv);
46+
void f_utf16idx(typval_T *argvars, typval_T *rettv);
4547
void f_tolower(typval_T *argvars, typval_T *rettv);
4648
void f_toupper(typval_T *argvars, typval_T *rettv);
4749
void f_tr(typval_T *argvars, typval_T *rettv);

0 commit comments

Comments
 (0)