Skip to content

Commit 67672ef

Browse files
chrisbrabrammool
authored and committed
patch 9.0.1485: no functions for converting from/to UTF-16 index
Problem: no functions for converting from/to UTF-16 index. Solution: Add UTF-16 flag to existing functions and add strutf16len() and utf16idx(). (Yegappan Lakshmanan, closes #12216)
1 parent e1b4822 commit 67672ef

File tree

8 files changed

+676
-55
lines changed

8 files changed

+676
-55
lines changed

Diff for: runtime/doc/builtin.txt

+88-10
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@ bufnr([{buf} [, {create}]]) Number Number of the buffer {buf}
8181
bufwinid({buf}) Number window ID of buffer {buf}
8282
bufwinnr({buf}) Number window number of buffer {buf}
8383
byte2line({byte}) Number line number at byte count {byte}
84-
byteidx({expr}, {nr}) Number byte index of {nr}'th char in {expr}
85-
byteidxcomp({expr}, {nr}) Number byte index of {nr}'th char in {expr}
84+
byteidx({expr}, {nr} [, {utf16}])
85+
Number byte index of {nr}'th char in {expr}
86+
byteidxcomp({expr}, {nr} [, {utf16}])
87+
Number byte index of {nr}'th char in {expr}
8688
call({func}, {arglist} [, {dict}])
8789
any call {func} with arguments {arglist}
8890
ceil({expr}) Float round {expr} up
@@ -117,7 +119,7 @@ changenr() Number current change number
117119
char2nr({expr} [, {utf8}]) Number ASCII/UTF-8 value of first char in {expr}
118120
charclass({string}) Number character class of {string}
119121
charcol({expr} [, {winid}]) Number column number of cursor or mark
120-
charidx({string}, {idx} [, {countcc}])
122+
charidx({string}, {idx} [, {countcc} [, {utf16}]])
121123
Number char index of byte {idx} in {string}
122124
chdir({dir}) String change current working directory
123125
cindent({lnum}) Number C indent for line {lnum}
@@ -604,6 +606,8 @@ strptime({format}, {timestring})
604606
strridx({haystack}, {needle} [, {start}])
605607
Number last index of {needle} in {haystack}
606608
strtrans({expr}) String translate string to make it printable
609+
strutf16len({string} [, {countcc}])
610+
Number number of UTF-16 code units in {string}
607611
strwidth({expr}) Number display cell length of the String {expr}
608612
submatch({nr} [, {list}]) String or List
609613
specific match in ":s" or substitute()
@@ -704,6 +708,8 @@ undofile({name}) String undo file name for {name}
704708
undotree() List undo file tree
705709
uniq({list} [, {func} [, {dict}]])
706710
List remove adjacent duplicates from a list
711+
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
712+
Number UTF-16 index of byte {idx} in {string}
707713
values({dict}) List values in {dict}
708714
virtcol({expr} [, {list}]) Number or List
709715
screen column of cursor or mark
@@ -1363,7 +1369,7 @@ byte2line({byte}) *byte2line()*
13631369
< {not available when compiled without the |+byte_offset|
13641370
feature}
13651371

1366-
byteidx({expr}, {nr}) *byteidx()*
1372+
byteidx({expr}, {nr} [, {utf16}]) *byteidx()*
13671373
Return byte index of the {nr}'th character in the String
13681374
{expr}. Use zero for the first character, it then returns
13691375
zero.
@@ -1373,6 +1379,13 @@ byteidx({expr}, {nr}) *byteidx()*
13731379
length is added to the preceding base character. See
13741380
|byteidxcomp()| below for counting composing characters
13751381
separately.
1382+
When {utf16} is present and TRUE, {nr} is used as the UTF-16
1383+
index in the String {expr} instead of as the character index.
1384+
The UTF-16 index is the index in the string when it is encoded
1385+
with 16-bit words. If the specified UTF-16 index is in the
1386+
middle of a character (e.g. in a 4-byte character), then the
1387+
byte index of the first byte in the character is returned.
1388+
Refer to |string-offset-encoding| for more information.
13761389
Example: >
13771390
echo matchstr(str, ".", byteidx(str, 3))
13781391
< will display the fourth character. Another way to do the
@@ -1384,11 +1397,17 @@ byteidx({expr}, {nr}) *byteidx()*
13841397
If there are fewer than {nr} characters, -1 is returned.
13851398
If there are exactly {nr} characters the length of the string
13861399
in bytes is returned.
1387-
1400+
See |charidx()| and |utf16idx()| for getting the character and
1401+
UTF-16 index respectively from the byte index.
1402+
Examples: >
1403+
echo byteidx('a😊😊', 2) returns 5
1404+
echo byteidx('a😊😊', 2, 1) returns 1
1405+
echo byteidx('a😊😊', 3, 1) returns 5
1406+
<
13881407
Can also be used as a |method|: >
13891408
GetName()->byteidx(idx)
13901409
1391-
byteidxcomp({expr}, {nr}) *byteidxcomp()*
1410+
byteidxcomp({expr}, {nr} [, {utf16}]) *byteidxcomp()*
13921411
Like byteidx(), except that a composing character is counted
13931412
as a separate character. Example: >
13941413
let s = 'e' .. nr2char(0x301)
@@ -1493,27 +1512,36 @@ charcol({expr} [, {winid}]) *charcol()*
14931512
GetPos()->col()
14941513
<
14951514
*charidx()*
1496-
charidx({string}, {idx} [, {countcc}])
1515+
charidx({string}, {idx} [, {countcc} [, {utf16}]])
14971516
Return the character index of the byte at {idx} in {string}.
14981517
The index of the first character is zero.
14991518
If there are no multibyte characters the returned value is
15001519
equal to {idx}.
1520+
15011521
When {countcc} is omitted or |FALSE|, then composing characters
1502-
are not counted separately, their byte length is
1503-
added to the preceding base character.
1522+
are not counted separately, their byte length is added to the
1523+
preceding base character.
15041524
When {countcc} is |TRUE|, then composing characters are
15051525
counted as separate characters.
1526+
1527+
When {utf16} is present and TRUE, {idx} is used as the UTF-16
1528+
index in the String {string} instead of as the byte index.
1529+
15061530
Returns -1 if the arguments are invalid or if {idx} is greater
15071531
than the index of the last byte in {string}. An error is
15081532
given if the first argument is not a string, the second
15091533
argument is not a number or when the third argument is present
15101534
and is not zero or one.
1535+
15111536
See |byteidx()| and |byteidxcomp()| for getting the byte index
1512-
from the character index.
1537+
from the character index and |utf16idx()| for getting the
1538+
UTF-16 index from the character index.
1539+
Refer to |string-offset-encoding| for more information.
15131540
Examples: >
15141541
echo charidx('áb́ć', 3) returns 1
15151542
echo charidx('áb́ć', 6, 1) returns 4
15161543
echo charidx('áb́ć', 16) returns -1
1544+
echo charidx('a😊😊', 4, 0, 1) returns 2
15171545
<
15181546
Can also be used as a |method|: >
15191547
GetName()->charidx(idx)
@@ -9244,6 +9272,28 @@ strtrans({string}) *strtrans()*
92449272
Can also be used as a |method|: >
92459273
GetString()->strtrans()
92469274
9275+
strutf16len({string} [, {countcc}]) *strutf16len()*
9276+
The result is a Number, which is the number of UTF-16 code
9277+
units in String {string} (after converting it to UTF-16).
9278+
9279+
When {countcc} is TRUE, composing characters are counted
9280+
separately.
9281+
When {countcc} is omitted or FALSE, composing characters are
9282+
ignored.
9283+
9284+
Returns zero on error.
9285+
9286+
Also see |strlen()| and |strcharlen()|.
9287+
Examples: >
9288+
echo strutf16len('a') returns 1
9289+
echo strutf16len('©') returns 1
9290+
echo strutf16len('😊') returns 2
9291+
echo strutf16len('ą́') returns 1
9292+
echo strutf16len('ą́', v:true) returns 3
9293+
<
Can also be used as a |method|: >
9295+
GetText()->strutf16len()
9296+
<
92479297
strwidth({string}) *strwidth()*
92489298
The result is a Number, which is the number of display cells
92499299
String {string} occupies. A Tab character is counted as one
@@ -10059,6 +10109,34 @@ uniq({list} [, {func} [, {dict}]]) *uniq()* *E882*
1005910109

1006010110
Can also be used as a |method|: >
1006110111
mylist->uniq()
10112+
<
10113+
*utf16idx()*
10114+
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
10115+
Same as |charidx()| but returns the UTF-16 index of the byte
10116+
at {idx} in {string} (after converting it to UTF-16).
10117+
10118+
When {charidx} is present and TRUE, {idx} is used as the
10119+
character index in the String {string} instead of as the byte
10120+
index.
10121+
An {idx} in the middle of a UTF-8 sequence is rounded upwards
10122+
to the end of that sequence.
10123+
10124+
See |byteidx()| and |byteidxcomp()| for getting the byte index
10125+
from the UTF-16 index and |charidx()| for getting the
10126+
character index from the UTF-16 index.
10127+
Refer to |string-offset-encoding| for more information.
10128+
Examples: >
10129+
echo utf16idx('a😊😊', 3) returns 2
10130+
echo utf16idx('a😊😊', 7) returns 4
10131+
echo utf16idx('a😊😊', 1, 0, 1) returns 2
10132+
echo utf16idx('a😊😊', 2, 0, 1) returns 4
10133+
echo utf16idx('aą́c', 6) returns 2
10134+
echo utf16idx('aą́c', 6, 1) returns 4
10135+
echo utf16idx('a😊😊', 9) returns -1
10136+
<
10137+
Can also be used as a |method|: >
10138+
GetName()->utf16idx(idx)
10139+
1006210140
1006310141
values({dict}) *values()*
1006410142
Return a |List| with all the values of {dict}. The |List| is

Diff for: runtime/doc/eval.txt

+27
Original file line numberDiff line numberDiff line change
@@ -1580,6 +1580,33 @@ Examples: >
15801580
echo $"The square root of {{9}} is {sqrt(9)}"
15811581
< The square root of {9} is 3.0 ~
15821582

1583+
*string-offset-encoding*
1584+
A string consists of multiple characters. How the characters are stored
1585+
depends on 'encoding'. Most common is UTF-8, which uses one byte for ASCII
1586+
characters, two bytes for other Latin characters and more bytes for other
1587+
characters.
1588+
1589+
A string offset can count characters or bytes. Other programs may use
1590+
UTF-16 encoding (16-bit words) and an offset of UTF-16 words. Some functions
1591+
use byte offsets, usually for UTF-8 encoding. Other functions use character
1592+
offsets, in which case the encoding doesn't matter.
1593+
1594+
The different offsets for the string "a©😊" are below:
1595+
1596+
UTF-8 offsets:
1597+
[0]: 61, [1]: C2, [2]: A9, [3]: F0, [4]: 9F, [5]: 98, [6]: 8A
1598+
UTF-16 offsets:
1599+
[0]: 0061, [1]: 00A9, [2]: D83D, [3]: DE0A
1600+
UTF-32 (character) offsets:
1601+
[0]: 00000061, [1]: 000000A9, [2]: 0001F60A
1602+
1603+
You can use the "g8" and "ga" commands on a character to see the
1604+
decimal/hex/octal values.
1605+
1606+
The functions |byteidx()|, |utf16idx()| and |charidx()| can be used to convert
1607+
between these indices. The functions |strlen()|, |strutf16len()| and
1608+
|strcharlen()| return the number of bytes, UTF-16 code units and characters in
1609+
a string respectively.
15831610

15841611
option *expr-option* *E112* *E113*
15851612
------

Diff for: runtime/doc/usr_41.txt

+2
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,7 @@ String manipulation: *string-functions*
754754
strlen() length of a string in bytes
755755
strcharlen() length of a string in characters
756756
strchars() number of characters in a string
757+
strutf16len() number of UTF-16 code units in a string
757758
strwidth() size of string when displayed
758759
strdisplaywidth() size of string when displayed, deals with tabs
759760
setcellwidths() set character cell width overrides
@@ -771,6 +772,7 @@ String manipulation: *string-functions*
771772
byteidx() byte index of a character in a string
772773
byteidxcomp() like byteidx() but count composing characters
773774
charidx() character index of a byte in a string
775+
utf16idx() UTF-16 index of a byte in a string
774776
repeat() repeat a string multiple times
775777
eval() evaluate a string expression
776778
execute() execute an Ex command and get the output

Diff for: src/evalfunc.c

+7-3
Original file line numberDiff line numberDiff line change
@@ -1751,9 +1751,9 @@ static funcentry_T global_functions[] =
17511751
ret_number, f_bufwinnr},
17521752
{"byte2line", 1, 1, FEARG_1, arg1_number,
17531753
ret_number, f_byte2line},
1754-
{"byteidx", 2, 2, FEARG_1, arg2_string_number,
1754+
{"byteidx", 2, 3, FEARG_1, arg3_string_number_bool,
17551755
ret_number, f_byteidx},
1756-
{"byteidxcomp", 2, 2, FEARG_1, arg2_string_number,
1756+
{"byteidxcomp", 2, 3, FEARG_1, arg3_string_number_bool,
17571757
ret_number, f_byteidxcomp},
17581758
{"call", 2, 3, FEARG_1, arg3_any_list_dict,
17591759
ret_any, f_call},
@@ -1803,7 +1803,7 @@ static funcentry_T global_functions[] =
18031803
ret_number, f_charclass},
18041804
{"charcol", 1, 2, FEARG_1, arg2_string_or_list_number,
18051805
ret_number, f_charcol},
1806-
{"charidx", 2, 3, FEARG_1, arg3_string_number_bool,
1806+
{"charidx", 2, 4, FEARG_1, arg3_string_number_bool,
18071807
ret_number, f_charidx},
18081808
{"chdir", 1, 1, FEARG_1, arg1_string,
18091809
ret_string, f_chdir},
@@ -2601,6 +2601,8 @@ static funcentry_T global_functions[] =
26012601
ret_number, f_strridx},
26022602
{"strtrans", 1, 1, FEARG_1, arg1_string,
26032603
ret_string, f_strtrans},
2604+
{"strutf16len", 1, 2, FEARG_1, arg2_string_bool,
2605+
ret_number, f_strutf16len},
26042606
{"strwidth", 1, 1, FEARG_1, arg1_string,
26052607
ret_number, f_strwidth},
26062608
{"submatch", 1, 2, FEARG_1, arg2_number_bool,
@@ -2785,6 +2787,8 @@ static funcentry_T global_functions[] =
27852787
ret_dict_any, f_undotree},
27862788
{"uniq", 1, 3, FEARG_1, arg13_sortuniq,
27872789
ret_first_arg, f_uniq},
2790+
{"utf16idx", 2, 4, FEARG_1, arg3_string_number_bool,
2791+
ret_number, f_utf16idx},
27882792
{"values", 1, 1, FEARG_1, arg1_dict_any,
27892793
ret_list_member, f_values},
27902794
{"virtcol", 1, 2, FEARG_1, arg2_string_or_list_bool,

Diff for: src/proto/strings.pro

+2
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@ void f_string(typval_T *argvars, typval_T *rettv);
3636
void f_strlen(typval_T *argvars, typval_T *rettv);
3737
void f_strcharlen(typval_T *argvars, typval_T *rettv);
3838
void f_strchars(typval_T *argvars, typval_T *rettv);
39+
void f_strutf16len(typval_T *argvars, typval_T *rettv);
3940
void f_strdisplaywidth(typval_T *argvars, typval_T *rettv);
4041
void f_strwidth(typval_T *argvars, typval_T *rettv);
4142
void f_strcharpart(typval_T *argvars, typval_T *rettv);
4243
void f_strpart(typval_T *argvars, typval_T *rettv);
4344
void f_strridx(typval_T *argvars, typval_T *rettv);
4445
void f_strtrans(typval_T *argvars, typval_T *rettv);
46+
void f_utf16idx(typval_T *argvars, typval_T *rettv);
4547
void f_tolower(typval_T *argvars, typval_T *rettv);
4648
void f_toupper(typval_T *argvars, typval_T *rettv);
4749
void f_tr(typval_T *argvars, typval_T *rettv);

0 commit comments

Comments
 (0)