Fix problem with CP949 conversion when 0xC9 precedes a byte lower than 0xA1

alexdowad · alexdowad · commit 8e6be1437259 · 2023-05-20T21:27:48.000-07:00
This bug was introduced in e837a88. In that commit, I increased the performance of CP949 text conversion, but accidentally broke the case where 0xC9 (which is illegal as the first byte of a character) is followed by a valid character whose first byte is less than 0xA1. With the bug, both the 0xC9 byte and the following valid character were converted to error markers. The correct behavior, restored here, is that only the 0xC9 byte is converted to an error marker and the following character is converted normally.
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cjk.c b/ext/mbstring/libmbfl/filters/mbfilter_cjk.c
@@ -10224,17 +10224,13 @@ static size_t mb_uhc_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf,
 				w = (c - 0xC7)*94 + c2 - 0xA1;
 				ZEND_ASSERT(w < uhc3_ucs_table_size);
 				w = uhc3_ucs_table[w];
-				if (!w) {
-					/* If c == 0xC9, we shouldn't have tried to read a 2-byte char at all... but it is faster
-					 * to fix up that rare case here rather than include an extra check in the hot path */
-					if (c == 0xC9) {
-						p--;
-					}
-					*out++ = MBFL_BAD_INPUT;
-					continue;
-				}
 			}
 			if (!w) {
+				/* If c == 0xC9, we shouldn't have tried to read a 2-byte char at all... but it is faster
+				 * to fix up that rare case here rather than include an extra check in the hot path */
+				if (c == 0xC9) {
+					p--;
+				}
 				w = MBFL_BAD_INPUT;
 			}
 			*out++ = w;
diff --git a/ext/mbstring/tests/uhc_encoding.phpt b/ext/mbstring/tests/uhc_encoding.phpt
@@ -14,6 +14,11 @@ testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP949.txt', 'UHC');
 // Regression test
 convertInvalidString("\xE4\xA4\xB4<", "\x75\x1A\x00%", "UHC", "UTF-16BE");
 
+// When optimizing performance of CP949 conversion, I accidentally broke the
+// case where 0xC9 appears before a valid character which starts with a
+// byte lower than 0xA1
+convertInvalidString("\xC9\x9E\x98", "%\xEC\x98\x92", "UHC", "UTF-8");
+
 // Test "long" illegal character markers
 mb_substitute_character("long");
 convertInvalidString("\x80", "%", "UHC", "UTF-8");