Fix langinfo(ALT_DIGITS)

khwilliamson · khwilliamson · commit bfaaa04b13e3 · 2024-01-15T14:58:39.000-07:00
This has never worked properly before in Perl. The code is returning the result of the libc function nl_langinfo(). The documentation for it that I have found (and presumably my predecessors) is very unclear. But what actually happens (from using gdb) is that the return is very C unfriendly. Instead of returning a NUL-terminated string, it returns 100 (perhaps fewer) NUL-terminated strings in a row. When it is fewer (given the few examples I've seen), the final one ends with two NULs in a row. (I can't think of a way for it to work and be otherwise). The 100th one doesn't necessarily have two terminating NULs. Prior to this commit, only the string for the zeroth digit was returned; now the entire ALT_DIGIT string sequence is returned, forcing a double NUL at the end of the final one. This information is accessible in several ways. Via XS, one can use any of several functions, including the newly introduced sv_langinfo(), returning an SV, which allows for easier handling of embedded NULs. (Otherwise in XS, using the functions that return a char*, one has to look for the double-NUL.) From Perl-space, the access is via I18N::Langinfo, which behind the scenes also uses an SV. The documentation added in this commit gives advice for how to turn the return into an @array for more convenient access.
diff --git a/ext/I18N-Langinfo/Langinfo.pm b/ext/I18N-Langinfo/Langinfo.pm
@@ -70,13 +70,15 @@ our @EXPORT_OK = qw(
 	YESSTR
 );
 
-our $VERSION = '0.22';
+our $VERSION = '0.23';
 
 XSLoader::load();
 
 1;
 __END__
 
+=encoding utf8
+
 =head1 NAME
 
 I18N::Langinfo - query locale information
@@ -155,16 +157,52 @@ For the character code set being used (such as "ISO8859-1", "cp850",
 
 =item *
 
-For an alternate representation of digits, for the
-radix character used between the integer and the fractional part
-of decimal numbers, the group separator string for large-ish floating point
-numbers (yes, the final two are redundant with
+For the radix character used between the integer and the fractional part of
+decimal numbers and the group separator string for large-ish floating point
+numbers (yes, these are redundant with
 L<POSIX::localeconv()|POSIX/localeconv>):
 
-    ALT_DIGITS RADIXCHAR THOUSEP
+    RADIXCHAR THOUSEP
 
 =item *
 
+For any alternate digits in the locale:
+
+    ALT_DIGITS
+
+This returns up to 100 semicolon-separated strings, starting with the
+alternate representation of zero; then the same for one, two, ... ninety-nine.
+
+To access this data conveniently, you could do something like
+
+ use I18N::Langinfo qw(langinfo ALT_DIGITS);
+ my @alt_digits = split ';', langinfo(ALT_DIGITS);
+
+The array C<@alt_digits> will contain 0 elements if the current locale doesn't
+have alternate digits specified for it.  Otherwise, it will have as many
+elements as the locale defines, with C<[0]> containing the alternate digit for
+zero; C<[1]> for one; and so forth, up to potentially C<[99]> for the
+alternate representation of ninety-nine.
+
+Most locales don't have alternate digits, so the array will be empty.
+
+Be aware that the alternate representation in some locales for the numbers
+0..9 will have a leading alternate-zero, so would look like the equivalent of
+00..09.
+
+Running this program
+
+ use I18N::Langinfo qw(langinfo ALT_DIGITS);
+ my @alt_digits = split ';', langinfo(ALT_DIGITS);
+ splice @alt_digits, 15;
+ print join " ", @alt_digits, "\n";
+
+on a Japanese locale yields
+
+S<C<〇 一 二 三 四 五 六 七 八 九 十 十一 十二 十三 十四>>
+
+=item *
+
 For the affirmative and negative responses and expressions:
 
     YESSTR YESEXPR NOSTR NOEXPR
@@ -235,9 +273,11 @@ differently, please file a report at L<https://github.com/Perl/perl5/issues>.
 
 =item C<ALT_DIGITS>
 
-Currently this gives the same results as Linux does.  If you have examples of
-it needing to work differently, please file a report at
-L<https://github.com/Perl/perl5/issues>.
+This tries hard to return the same values as C<nl_langinfo()>.  It uses the
+C<%O> formats that the C<libc> L<strftime(3)> function on some platforms (not
+Windows) understands.  It returns as many consecutive alternate digits as it
+can find, starting with the one for zero; or the empty string if none are
+found.
 
 =item C<ERA_D_FMT>
 
diff --git a/lib/locale.t b/lib/locale.t
@@ -58,7 +58,7 @@ BEGIN {
 }
 
 use feature 'fc';
-use I18N::Langinfo qw(langinfo CODESET CRNCYSTR RADIXCHAR THOUSEP);
+use I18N::Langinfo qw(langinfo CODESET CRNCYSTR RADIXCHAR THOUSEP ALT_DIGITS);
 
 # =1 adds debugging output; =2 increases the verbosity somewhat
 our $debug = $ENV{PERL_DEBUG_FULL_TEST} // 0;
@@ -1060,6 +1060,7 @@ foreach my $Locale (@Locale) {
         debug "is utf8 locale? = $is_utf8_locale\n";
         debug "radix = " . disp_str(langinfo(RADIXCHAR)) . "\n";
         debug "numeric group separator = '" .  disp_str(langinfo(THOUSEP)) . "'\n";
+        debug "alt_digits = " . disp_str(langinfo(ALT_DIGITS)) . "\n";
         debug "currency = " . disp_str(langinfo(CRNCYSTR));
     }
 
@@ -2465,6 +2466,51 @@ foreach my $Locale (@Locale) {
             print "# failed $locales_test_number locale '$Locale' numbers @f\n"
 	}
     }
+
+    {
+        my @f = ();
+        ++$locales_test_number;
+        $test_names{$locales_test_number} =
+                 'Verify ALT_DIGITS returns nothing, or else non-ASCII and'
+               . ' the single char digits evaluate to consecutive integers'
+               . ' starting at 0';
+
+        my $alts = langinfo(ALT_DIGITS);
+        if ($alts) {
+            my @alts = split ';', $alts;
+            my $prev = -1;
+            foreach my $num (@alts) {
+                if ($num =~ /[[:ascii:]]/) {
+                    push @f, disp_str($num);
+                    last;
+                }
+
+                # We only look at single character strings; likely locales
+                # that have alternate digits have a different mechanism for
+                # representing larger numbers.  Japanese for example, has a
+                # single character for the number 10, which is prefixed to the
+                # '1' symbol for '11', etc.  And 21 is represented by 3
+                # characters, the '2' symbol, followed by the '10' symbol,
+                # then the '1' symbol.  (There is nothing to say that a locale
+                # even has to use base 10.)
+                last if length $num > 1;
+
+                use Unicode::UCD 'num';
+                my $value = num($num);
+                if ($value != $prev + 1) {
+                    push @f, disp_str($num);
+                    last;
+                }
+
+                $prev = $value;
+            }
+        }
+
+        report_result($Locale, $locales_test_number, @f == 0);
+        if (@f) {
+            print "# failed $locales_test_number locale '$Locale' numbers @f\n"
+	}
+    }
 }
 
 my $final_locales_test_number = $locales_test_number;
diff --git a/locale.c b/locale.c
@@ -6118,13 +6118,66 @@ S_langinfo_sv_i(pTHX_
 
         const char * retval = nl_langinfo(item);
         Size_t total_len = strlen(retval);
+        char separator;
+
+        if (UNLIKELY(item == ALT_DIGITS) && total_len > 0) {
+
+            char * sep_pos =
+                    (char *) strpbrk(retval, "!\"#$%&'()*+,-./.@[\\]^_`{|}~");
+            if (sep_pos) {
+                separator = retval[sep_pos - retval];
+            }
+            else {
+                separator = '\0';
+ 
+                /* Must be using NUL to separate the digits.  There are up to
+                 * 100 of them, ending in two NULs if fewer.  Find the end */
+                const char * s = retval + total_len + 1;
+
+                for (unsigned int i = 1; i <= 99; i++) {
+                    Size_t len = strlen(s) + 1;
+                    total_len += len;
+
+                    if (len == 1) {     /* Only a NUL */
+                        break;
+                    }
+
+                    s += len;
+                }
+            }
+        }
+
         sv_setpvn(sv, retval, total_len);
 
         gwLOCALE_UNLOCK;
 
+        /* Convert the ALT_DIGITS separator to a semi-colon if not already */
+        if (UNLIKELY(item == ALT_DIGITS) && total_len > 0 && separator != ';') {
+            char * digit_string = SvPVX(sv);
+            char * s = digit_string;
+            char * e = s + total_len;
+
+            while (s < e) {     /* Search only the bytes that remain */
+                char * this_end = (char *) memchr(s, separator, e - s);
+                if (! this_end) {
+                    break;
+                }
+
+                *this_end = ';';
+                s = this_end + 1;   /* Resume just past the replaced byte */
+            }
+        }
+
         SvUTF8_off(sv);
         retval = SvPVX_const(sv);
 
+        /* Note that get_locale_string_utf8ness_i() is passed a char*, so stops
+         * looking at the first NUL, meaning it only looks at string [0] in the
+         * ALT_DIGITS case: alternate zero.  One might think that you'd need to
+         * look at all the strings to determine utf8ness.  But that is not true
+         * for this case; string [0] is sufficient.  This is because there are
+         * no ASCII alternate digits, so [0] is enough to decide the utf8ness
+         * */
         if (utf8ness) {
             *utf8ness = get_locale_string_utf8ness_i(retval,
                                                      LOCALE_UTF8NESS_UNKNOWN,
@@ -6865,34 +6918,7 @@ S_emulate_langinfo(pTHX_ const int item,
 
         restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
 
-        /* If the item is 'ALT_DIGITS', '*retbuf' contains the alternate
-        * format for wday 0.  If the value is the same as the normal 0,
-        * there isn't an alternate, so clear the buffer.
-        *
-        * (wday was chosen because its range is all a single digit.
-        * Things like tm_sec have two digits as the minimum: '00'.) */
-        if (item == ALT_DIGITS && strEQ(temp, "0")) {
-            retval = "";
-            Safefree(temp);
-            break;
-        }
-
-        /* ALT_DIGITS is problematic.  Experiments on it showed that
-        * strftime() did not always work properly when going from alt-9 to
-        * alt-10.  Only a few locales have this item defined, and in all
-        * of them on Linux that khw was able to find, nl_langinfo() merely
-        * returned the alt-0 character, possibly doubled.  Most Unicode
-        * digits are in blocks of 10 consecutive code points, so that is
-        * sufficient information for such scripts, as we can infer alt-1,
-        * alt-2, ....  But for a Japanese locale, a CJK ideographic 0 is
-        * returned, and the CJK digits are not in code point order, so you
-        * can't really infer anything.  The localedef for this locale did
-        * specify the succeeding digits, so that strftime() works properly
-        * on them, without needing to infer anything.  But the
-        * nl_langinfo() return did not give sufficient information for the
-        * caller to understand what's going on.  So until there is
-        * evidence that it should work differently, this returns the alt-0
-        * string for ALT_DIGITS. */
+        if (LIKELY(item != ALT_DIGITS)) {
 
             /* If to return what strftime() returns, are done */
             if (! return_format) {
@@ -6926,6 +6952,130 @@ S_emulate_langinfo(pTHX_ const int item,
 
             Safefree(temp);
             break;
+        }
+
+        /* Here, the item is 'ALT_DIGITS' and temp contains the zeroth
+         * alternate digit.  If empty or doesn't differ from regular digits,
+         * return that there aren't alternate digits */
+        if (temp[0] == '\0' || strchr(temp, '0')) {
+            Safefree(temp);
+            retval = "";
+            break;
+        }
+
+        /* ALT_DIGITS requires special handling because it requires up to 100
+         * values.  Below we generate those by using the %O modifier to
+         * strftime() formats.
+         *
+         * We already have the alternate digit for zero in 'temp', generated
+         * using the %Ow format.  That was used because it seems least likely
+         * to have a leading zero.  But some locales return that anyway.  If
+         * the first half of temp is identical to the second half, assume that
+         * is the case, and use just the first half */
+        const char * alt0 = temp;    /* Clearer synonym */
+        Size_t alt0_len = strlen(alt0);
+        if ((alt0_len & 1) == 0) {
+            Size_t half_alt0_len = alt0_len / 2;
+            if (strnEQ(temp, temp + half_alt0_len, half_alt0_len)) {
+                alt0_len = half_alt0_len;
+            }
+        }
+
+        /* Save the 0 digit string */
+        sv_setpvn(sv, alt0, alt0_len);
+        sv_catpvn_nomg (sv, ";", 1);
+
+        /* Various %O formats can be used to derive the alternate digits.  Only
+         * %Oy can go up to the full 100 values.  If it doesn't work, we try
+         * various fallbacks in decreasing order of how many values they can
+         * deliver.  maxes[] tells the highest value that the format applies
+         * to; offsets[] compensates for 0-based vs 1-based indices; and vars[]
+         * holds which field in the 'struct tm' applies to the corresponding
+         * format */
+        int year, min, sec;
+      const char  * fmts[] = {"%Oy", "%OM", "%OS", "%Od", "%OH", "%Om", "%Ow" };
+      const Size_t maxes[] = {  99,    59,    59,    31,    23,    11,    6   };
+      const int  offsets[] = {   0,     0,     0,     1,     0,     1,    0   };
+      int         * vars[] = {&year,  &min,  &sec,  &mday, &hour, &mon, &mday };
+        Size_t j = 0;   /* Current index into the above tables */
+
+        orig_TIME_locale = toggle_locale_c(LC_TIME, locale);
+
+        for (unsigned int i = 1; i <= 99; i++) {
+            struct tm  mytm;
+
+          redo:
+            if (j >= C_ARRAY_LENGTH(fmts)) {
+                break;  /* Exhausted formats early; can't continue */
+            }
+
+            if (i > maxes[j]) {
+                j++;    /* Exhausted this format; try next one */
+                goto redo;
+            }
+
+            year = (strchr(fmts[j], 'y')) ? 1900 : 2011;
+            hour = 0;
+            min = 0;
+            sec = 0;
+            mday = 1;
+            mon = 0;
+
+            /* Change the variable corresponding to this format to the
+            * current time being run in 'i' */
+            *(vars[j]) += i - offsets[j];
+
+            /* Do the strftime.  Once we have determined the UTF8ness (if
+            * we want it), assume the rest will be the same, and use
+            * strftime_tm(), which doesn't recalculate UTF8ness */
+            ints_to_tm(&mytm, sec, min, hour, mday, mon, year, 0, 0, 0);
+            char * temp;
+            if (utf8ness && is_utf8 != UTF8NESS_NO && is_utf8 != UTF8NESS_YES) {
+                temp = strftime8(fmts[j],
+                                 &mytm,
+                                 UTF8NESS_IMMATERIAL,
+                                 &is_utf8,
+                                 false    /* not calling from sv_strftime */
+                                );
+            }
+            else {
+                temp = strftime_tm(fmts[j], &mytm);
+            }
+
+            DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+                                "i=%d, format=%s, alt='%s'\n",
+                                i, fmts[j], temp));
+
+            /* If no result (meaning this platform didn't recognize this
+            * format), or it returned regular digits, give up on this
+            * format, to try the next candidate one */
+            if (temp == NULL || strpbrk(temp, "0123456789")) {
+                Safefree(temp);
+                j++;
+                goto redo;
+            }
+
+            /* If there is a leading zero, skip past it, to get the second
+            * one in the string */
+            const char * current = temp;
+            if (strnEQ(temp, alt0, alt0_len)) {
+                current += alt0_len;
+            }
+
+            /* Append this number to the ongoing list, including the separator.
+             * */
+            sv_catpv_nomg (sv, current);
+            sv_catpvn_nomg (sv, ";", 1);
+            Safefree(temp);
+        } /* End of loop generating ALT_DIGIT strings */
+
+        Safefree(alt0);
+
+        restore_toggled_locale_c(LC_TIME, orig_TIME_locale);
+
+        retval_type = RETVAL_IN_sv;
+        break;
+
 #  endif
 
        }    /* End of braced group for outer switch 'default:' case */
diff --git a/pod/perldelta.pod b/pod/perldelta.pod