@@ -730,6 +730,25 @@ codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
730
730
}
731
731
732
732
733
+ /*
734
+ * Determine the number of digits for a decimal representation of Unicode
735
+ * codepoint 'ch' (by design, Unicode codepoints are limited to 7 digits).
736
+ */
737
+ static inline int
738
+ n_decimal_digits_for_codepoint (Py_UCS4 ch )
739
+ {
740
+ if (ch < 10 ) return 1 ;
741
+ if (ch < 100 ) return 2 ;
742
+ if (ch < 1000 ) return 3 ;
743
+ if (ch < 10000 ) return 4 ;
744
+ if (ch < 100000 ) return 5 ;
745
+ if (ch < 1000000 ) return 6 ;
746
+ if (ch < 10000000 ) return 7 ;
747
+ // Unicode codepoints are limited to 1114111 (7 decimal digits)
748
+ Py_UNREACHABLE ();
749
+ }
750
+
751
+
733
752
/*
734
753
* Create a Unicode string containing 'count' copies of the official
735
754
* Unicode REPLACEMENT CHARACTER (0xFFFD).
@@ -867,9 +886,12 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
867
886
}
868
887
}
869
888
889
+
890
+ // --- handler: 'xmlcharrefreplace' -------------------------------------------
891
+
870
892
PyObject * PyCodec_XMLCharRefReplaceErrors (PyObject * exc )
871
893
{
872
- if (!PyObject_TypeCheck (exc , ( PyTypeObject * ) PyExc_UnicodeEncodeError )) {
894
+ if (!_PyIsUnicodeEncodeError (exc )) {
873
895
wrong_exception_type (exc );
874
896
return NULL ;
875
897
}
@@ -896,30 +918,11 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
896
918
897
919
Py_ssize_t ressize = 0 ;
898
920
for (Py_ssize_t i = start ; i < end ; ++ i ) {
899
- /* object is guaranteed to be "ready" */
900
921
Py_UCS4 ch = PyUnicode_READ_CHAR (obj , i );
901
- if (ch < 10 ) {
902
- ressize += 2 + 1 + 1 ;
903
- }
904
- else if (ch < 100 ) {
905
- ressize += 2 + 2 + 1 ;
906
- }
907
- else if (ch < 1000 ) {
908
- ressize += 2 + 3 + 1 ;
909
- }
910
- else if (ch < 10000 ) {
911
- ressize += 2 + 4 + 1 ;
912
- }
913
- else if (ch < 100000 ) {
914
- ressize += 2 + 5 + 1 ;
915
- }
916
- else if (ch < 1000000 ) {
917
- ressize += 2 + 6 + 1 ;
918
- }
919
- else {
920
- assert (ch < 10000000 );
921
- ressize += 2 + 7 + 1 ;
922
- }
922
+ int k = n_decimal_digits_for_codepoint (ch );
923
+ assert (k != 0 );
924
+ assert (k <= 7 );
925
+ ressize += 2 + k + 1 ;
923
926
}
924
927
925
928
/* allocate replacement */
@@ -931,45 +934,20 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
931
934
Py_UCS1 * outp = PyUnicode_1BYTE_DATA (res );
932
935
/* generate replacement */
933
936
for (Py_ssize_t i = start ; i < end ; ++ i ) {
934
- int digits , base ;
935
937
Py_UCS4 ch = PyUnicode_READ_CHAR (obj , i );
936
- if (ch < 10 ) {
937
- digits = 1 ;
938
- base = 1 ;
939
- }
940
- else if (ch < 100 ) {
941
- digits = 2 ;
942
- base = 10 ;
943
- }
944
- else if (ch < 1000 ) {
945
- digits = 3 ;
946
- base = 100 ;
947
- }
948
- else if (ch < 10000 ) {
949
- digits = 4 ;
950
- base = 1000 ;
951
- }
952
- else if (ch < 100000 ) {
953
- digits = 5 ;
954
- base = 10000 ;
955
- }
956
- else if (ch < 1000000 ) {
957
- digits = 6 ;
958
- base = 100000 ;
959
- }
960
- else {
961
- assert (ch < 10000000 );
962
- digits = 7 ;
963
- base = 1000000 ;
964
- }
938
+ /*
939
+ * Write the decimal representation of 'ch' to the buffer pointed to by 'outp'
940
+ * using at most 7 characters prefixed by '&#' and suffixed by ';'.
941
+ */
965
942
* outp ++ = '&' ;
966
943
* outp ++ = '#' ;
967
- while (digits -- > 0 ) {
968
- assert (base >= 1 );
969
- * outp ++ = '0' + ch / base ;
970
- ch %= base ;
971
- base /= 10 ;
944
+ Py_UCS1 * digit_end = outp + n_decimal_digits_for_codepoint (ch );
945
+ for (Py_UCS1 * p_digit = digit_end - 1 ; p_digit >= outp ; -- p_digit ) {
946
+ * p_digit = '0' + (ch % 10 );
947
+ ch /= 10 ;
972
948
}
949
+ assert (ch == 0 );
950
+ outp = digit_end ;
973
951
* outp ++ = ';' ;
974
952
}
975
953
assert (_PyUnicode_CheckConsistency (res , 1 ));
@@ -1517,7 +1495,8 @@ replace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
1517
1495
}
1518
1496
1519
1497
1520
- static PyObject * xmlcharrefreplace_errors (PyObject * self , PyObject * exc )
1498
+ static inline PyObject *
1499
+ xmlcharrefreplace_errors (PyObject * Py_UNUSED (self ), PyObject * exc )
1521
1500
{
1522
1501
return PyCodec_XMLCharRefReplaceErrors (exc );
1523
1502
}
0 commit comments