Skip to content

Commit a1a69c3

Browse files
committed
Support Microsoft's "Best Fit" mappings for Windows-1252 text encoding
In b5ff87c, I made a number of adjustments to our conversion code for CP1252. One of the adjustments was to make the mappings match those published by the Unicode Consortium in the file CP1252.TXT. These do not include mappings for the CP1252 bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D. Rostyslav Gulka reported that this caused a problem. His application stores binary JPEG data in an MS-SQL database. When they SELECT the binary data out of the database, it is treated as CP1252 text and automatically converted to UTF-8. To recover the original binary data, they then do a conversion from UTF-8 to CP1252. Obviously, that does not work if certain CP1252 bytes do not map to any Unicode codepoint at all. While this is a very unusual application of text encoding conversion, and we might choose not to support it if there were no other basis for including those mappings, it seems that Microsoft does actually include them in the Win32 API as "best fit" mappings. These are extra mappings from Unicode to other text encodings, which the Win32 API function WideCharToMultiByte uses by default unless the WC_NO_BEST_FIT_CHARS flag is passed. A list of these "best fit" mappings for CP1252 can be found here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt
1 parent c21a592 commit a1a69c3

File tree

3 files changed

+10
-15
lines changed

3 files changed

+10
-15
lines changed

ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -379,10 +379,10 @@ DEF_SB_TBL(cp1251, "Windows-1251", "Windows-1251", cp1251_aliases, 0x80, cp1251_
379379

380380
static const char *cp1252_aliases[] = {"cp1252", NULL};
381381
static const unsigned short cp1252_ucs_table[] = {
382-
0X20AC,0X0000,0X201A,0X0192,0X201E,0X2026,0X2020,0X2021,
383-
0X02C6,0X2030,0X0160,0X2039,0X0152,0X0000,0X017D,0X0000,
384-
0X0000,0X2018,0X2019,0X201C,0X201D,0X2022,0X2013,0X2014,
385-
0X02DC,0X2122,0X0161,0X203A,0X0153,0X0000,0X017E,0X0178
382+
0X20AC,0X0081,0X201A,0X0192,0X201E,0X2026,0X2020,0X2021,
383+
0X02C6,0X2030,0X0160,0X2039,0X0152,0X008D,0X017D,0X008F,
384+
0X0090,0X2018,0X2019,0X201C,0X201D,0X2022,0X2013,0X2014,
385+
0X02DC,0X2122,0X0161,0X203A,0X0153,0X009D,0X017E,0X0178
386386
};
387387
DEF_SB(cp1252, "Windows-1252", "Windows-1252", cp1252_aliases);
388388

@@ -396,7 +396,7 @@ static int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter)
396396
}
397397
}
398398
CK(mbfl_filt_conv_illegal_output(c, filter));
399-
} else if ((c <= 0x7F || c >= 0xA0) && c != MBFL_BAD_INPUT) {
399+
} else if ((c <= 0x7F || c >= 0xA0 || c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D) && c != MBFL_BAD_INPUT) {
400400
CK((*filter->output_function)(c, filter->data));
401401
} else {
402402
CK(mbfl_filt_conv_illegal_output(c, filter));

ext/mbstring/tests/cp1252_encoding.phpt

-5
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
1111
include('encoding_tests.inc');
1212
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1252.txt', 'CP1252');
1313

14-
// Test "long" illegal character markers
15-
mb_substitute_character("long");
16-
convertInvalidString("\x81", "%", "CP1252", "UTF-8");
17-
convertInvalidString("\x9D", "%", "CP1252", "UTF-8");
18-
1914
echo "Done!\n";
2015
?>
2116
--EXPECT--

ext/mbstring/tests/data/CP1252.txt

+5-5
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@
145145
0x7E 0x007E #TILDE
146146
0x7F 0x007F #DELETE
147147
0x80 0x20AC #EURO SIGN
148-
0x81 #UNDEFINED
148+
0x81 0x0081 #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
149149
0x82 0x201A #SINGLE LOW-9 QUOTATION MARK
150150
0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK
151151
0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK
@@ -157,10 +157,10 @@
157157
0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON
158158
0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
159159
0x8C 0x0152 #LATIN CAPITAL LIGATURE OE
160-
0x8D #UNDEFINED
160+
0x8D 0x008D #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
161161
0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON
162-
0x8F #UNDEFINED
163-
0x90 #UNDEFINED
162+
0x8F 0x008F #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
163+
0x90 0x0090 #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
164164
0x91 0x2018 #LEFT SINGLE QUOTATION MARK
165165
0x92 0x2019 #RIGHT SINGLE QUOTATION MARK
166166
0x93 0x201C #LEFT DOUBLE QUOTATION MARK
@@ -173,7 +173,7 @@
173173
0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON
174174
0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
175175
0x9C 0x0153 #LATIN SMALL LIGATURE OE
176-
0x9D #UNDEFINED
176+
0x9D 0x009D #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
177177
0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON
178178
0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS
179179
0xA0 0x00A0 #NO-BREAK SPACE

0 commit comments

Comments
 (0)