Skip to content

Commit 2cb94ab

Browse files
committed
Enhance the performance of two important Unicode character type lookups: whitespace and linebreak.
These lookup tables are taken from the Python 1.6 version, with the addition of the code point U+205F, which has since been added to Unicode as a whitespace code point.
1 parent 9984e70 commit 2cb94ab

File tree

1 file changed

+52
-10
lines changed

1 file changed

+52
-10
lines changed

Objects/unicodectype.c

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,24 @@ gettyperecord(Py_UNICODE code)
4949
return &_PyUnicode_TypeRecords[index];
5050
}
5151

52-
/* Returns 1 for Unicode characters having the category 'Zl' or type
53-
'B', 0 otherwise. */
52+
/* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
53+
type 'B', 0 otherwise. */
5454

55-
int _PyUnicode_IsLinebreak(Py_UNICODE ch)
55+
int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
5656
{
57-
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
58-
59-
return (ctype->flags & LINEBREAK_MASK) != 0;
57+
switch (ch) {
58+
case 0x000A: /* LINE FEED */
59+
case 0x000D: /* CARRIAGE RETURN */
60+
case 0x001C: /* FILE SEPARATOR */
61+
case 0x001D: /* GROUP SEPARATOR */
62+
case 0x001E: /* RECORD SEPARATOR */
63+
case 0x0085: /* NEXT LINE */
64+
case 0x2028: /* LINE SEPARATOR */
65+
case 0x2029: /* PARAGRAPH SEPARATOR */
66+
return 1;
67+
default:
68+
return 0;
69+
}
6070
}
6171

6272
/* Returns the titlecase Unicode characters corresponding to ch or just
@@ -327,11 +337,43 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
327337
/* Returns 1 for Unicode characters having the bidirectional type
328338
'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
329339

330-
int _PyUnicode_IsWhitespace(Py_UNICODE ch)
340+
int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
331341
{
332-
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
333-
334-
return (ctype->flags & SPACE_MASK) != 0;
342+
switch (ch) {
343+
case 0x0009: /* HORIZONTAL TABULATION */
344+
case 0x000A: /* LINE FEED */
345+
case 0x000B: /* VERTICAL TABULATION */
346+
case 0x000C: /* FORM FEED */
347+
case 0x000D: /* CARRIAGE RETURN */
348+
case 0x001C: /* FILE SEPARATOR */
349+
case 0x001D: /* GROUP SEPARATOR */
350+
case 0x001E: /* RECORD SEPARATOR */
351+
case 0x001F: /* UNIT SEPARATOR */
352+
case 0x0020: /* SPACE */
353+
case 0x0085: /* NEXT LINE */
354+
case 0x00A0: /* NO-BREAK SPACE */
355+
case 0x1680: /* OGHAM SPACE MARK */
356+
case 0x2000: /* EN QUAD */
357+
case 0x2001: /* EM QUAD */
358+
case 0x2002: /* EN SPACE */
359+
case 0x2003: /* EM SPACE */
360+
case 0x2004: /* THREE-PER-EM SPACE */
361+
case 0x2005: /* FOUR-PER-EM SPACE */
362+
case 0x2006: /* SIX-PER-EM SPACE */
363+
case 0x2007: /* FIGURE SPACE */
364+
case 0x2008: /* PUNCTUATION SPACE */
365+
case 0x2009: /* THIN SPACE */
366+
case 0x200A: /* HAIR SPACE */
367+
case 0x200B: /* ZERO WIDTH SPACE */
368+
case 0x2028: /* LINE SEPARATOR */
369+
case 0x2029: /* PARAGRAPH SEPARATOR */
370+
case 0x202F: /* NARROW NO-BREAK SPACE */
371+
case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
372+
case 0x3000: /* IDEOGRAPHIC SPACE */
373+
return 1;
374+
default:
375+
return 0;
376+
}
335377
}
336378

337379
/* Returns 1 for Unicode characters having the category 'Ll', 0

0 commit comments

Comments
 (0)