@@ -1297,6 +1297,118 @@ def test_comparewithasciistring(self):
1297
1297
# CRASHES comparewithasciistring([], b'abc')
1298
1298
# CRASHES comparewithasciistring(NULL, b'abc')
1299
1299
1300
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8(self):
    # Test PyUnicode_EqualToUTF8()
    from _testcapi import unicode_equaltoutf8 as equaltoutf8
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    # Samples whose UTF-8 form uses 1, 2, 3 and 4 bytes per code point.
    strings = [
        'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    ]
    for s in strings:
        # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
        # encoded string cached in the Unicode object.
        asutf8andsize(s, 0)
        b = s.encode()
        self.assertEqual(equaltoutf8(s, b), 1)  # Use the UTF-8 cache.
        s2 = b.decode()  # New Unicode object without the UTF-8 cache.
        self.assertEqual(equaltoutf8(s2, b), 1)
        self.assertEqual(equaltoutf8(s + 'x', b + b'x'), 1)
        self.assertEqual(equaltoutf8(s + 'x', b + b'y'), 0)
        # A trailing null byte is expected to act as the terminator of
        # the byte string, whereas a null character in the str counts
        # as real content.
        self.assertEqual(equaltoutf8(s, b + b'\0'), 1)
        self.assertEqual(equaltoutf8(s2, b + b'\0'), 1)
        self.assertEqual(equaltoutf8(s + '\0', b + b'\0'), 0)
        self.assertEqual(equaltoutf8(s + '\0', b), 0)
        # Longer, truncated and same-length-but-different byte strings.
        self.assertEqual(equaltoutf8(s2, b + b'x'), 0)
        self.assertEqual(equaltoutf8(s2, b[:-1]), 0)
        self.assertEqual(equaltoutf8(s2, b[:-1] + b'x'), 0)

    self.assertEqual(equaltoutf8('', b''), 1)
    self.assertEqual(equaltoutf8('', b'\0'), 1)

    # embedded null chars/bytes
    # Everything after the first null byte is expected to be ignored.
    self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1)
    self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0)
    self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0)

    # Surrogate characters are always treated as not equal
    self.assertEqual(equaltoutf8('\udcfe',
                        '\udcfe'.encode("utf8", "surrogateescape")), 0)
    self.assertEqual(equaltoutf8('\udcfe',
                        '\udcfe'.encode("utf8", "surrogatepass")), 0)
    self.assertEqual(equaltoutf8('\ud801',
                        '\ud801'.encode("utf8", "surrogatepass")), 0)
1345
+
1346
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8andsize(self):
    # Test PyUnicode_EqualToUTF8AndSize()
    from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    # Samples whose UTF-8 form uses 1, 2, 3 and 4 bytes per code point.
    strings = [
        'abc', '\xa1\xa2\xa3', '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    ]
    for s in strings:
        # Call PyUnicode_AsUTF8AndSize() which creates the UTF-8
        # encoded string cached in the Unicode object.
        asutf8andsize(s, 0)
        b = s.encode()
        self.assertEqual(equaltoutf8andsize(s, b), 1)  # Use the UTF-8 cache.
        s2 = b.decode()  # New Unicode object without the UTF-8 cache.
        self.assertEqual(equaltoutf8andsize(s2, b), 1)
        self.assertEqual(equaltoutf8andsize(s + 'x', b + b'x'), 1)
        self.assertEqual(equaltoutf8andsize(s + 'x', b + b'y'), 0)
        # Without an explicit size a trailing null byte is expected to be
        # compared as data (presumably the wrapper passes the full length
        # of the bytes object -- confirm against the _testcapi helper).
        self.assertEqual(equaltoutf8andsize(s, b + b'\0'), 0)
        self.assertEqual(equaltoutf8andsize(s2, b + b'\0'), 0)
        self.assertEqual(equaltoutf8andsize(s + '\0', b + b'\0'), 1)
        self.assertEqual(equaltoutf8andsize(s + '\0', b), 0)
        # Longer, truncated and same-length-but-different byte strings.
        self.assertEqual(equaltoutf8andsize(s2, b + b'x'), 0)
        self.assertEqual(equaltoutf8andsize(s2, b[:-1]), 0)
        self.assertEqual(equaltoutf8andsize(s2, b[:-1] + b'x'), 0)
        # Not null-terminated,
        # only the first `size` bytes (third argument) take part.
        self.assertEqual(equaltoutf8andsize(s, b + b'x', len(b)), 1)
        self.assertEqual(equaltoutf8andsize(s2, b + b'x', len(b)), 1)
        self.assertEqual(
            equaltoutf8andsize(s + '\0', b + b'\0x', len(b) + 1), 1)
        self.assertEqual(equaltoutf8andsize(s2, b, len(b) - 1), 0)

    self.assertEqual(equaltoutf8andsize('', b''), 1)
    self.assertEqual(equaltoutf8andsize('', b'\0'), 0)
    self.assertEqual(equaltoutf8andsize('', b'x', 0), 1)

    # embedded null chars/bytes
    self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1)
    self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1)

    # Surrogate characters are always treated as not equal
    self.assertEqual(equaltoutf8andsize('\udcfe',
                        '\udcfe'.encode("utf8", "surrogateescape")), 0)
    self.assertEqual(equaltoutf8andsize('\udcfe',
                        '\udcfe'.encode("utf8", "surrogatepass")), 0)
    self.assertEqual(equaltoutf8andsize('\ud801',
                        '\ud801'.encode("utf8", "surrogatepass")), 0)

    def check_not_equal_encoding(text, encoding):
        # The text encoded with another codec must compare unequal, and
        # the second check guards that the codec really yields bytes
        # different from the UTF-8 encoding.
        self.assertEqual(equaltoutf8andsize(text, text.encode(encoding)), 0)
        self.assertNotEqual(text.encode(encoding), text.encode("utf8"))

    # Strings encoded to other encodings are not equal to expected UTF8-encoding string
    check_not_equal_encoding('Stéphane', 'latin1')
    check_not_equal_encoding('Stéphane', 'utf-16-le')  # embedded null characters
    check_not_equal_encoding('北京市', 'gbk')

    # CRASHES equaltoutf8andsize('abc', b'abc', -1)
    # CRASHES equaltoutf8andsize(b'abc', b'abc')
    # CRASHES equaltoutf8andsize([], b'abc')
    # CRASHES equaltoutf8andsize(NULL, b'abc')
    # CRASHES equaltoutf8andsize('abc', NULL)
1411
+
1300
1412
@support .cpython_only
1301
1413
@unittest .skipIf (_testcapi is None , 'need _testcapi module' )
1302
1414
def test_richcompare (self ):
0 commit comments