Enhance the performance of two important Unicode character

malemburg · malemburg · commit 2cb94aba122b · 2005-10-20T19:06:35.000Z
type lookups: whitespace and linebreak.

These lookup tables are taken from the Python 1.6 version, with the addition
of code point U+205F (MEDIUM MATHEMATICAL SPACE), which has been added to
Unicode as a whitespace code point since then.
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
@@ -49,14 +49,24 @@ gettyperecord(Py_UNICODE code)
     return &_PyUnicode_TypeRecords[index];
 }
 
-/* Returns 1 for Unicode characters having the category 'Zl' or type
-   'B', 0 otherwise. */
+/* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
+   type 'B', 0 otherwise. */
 
-int _PyUnicode_IsLinebreak(Py_UNICODE ch)
+int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
 {
-    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-
-    return (ctype->flags & LINEBREAK_MASK) != 0;
+    switch (ch) {
+    case 0x000A: /* LINE FEED */
+    case 0x000D: /* CARRIAGE RETURN */
+    case 0x001C: /* FILE SEPARATOR */
+    case 0x001D: /* GROUP SEPARATOR */
+    case 0x001E: /* RECORD SEPARATOR */
+    case 0x0085: /* NEXT LINE */
+    case 0x2028: /* LINE SEPARATOR */
+    case 0x2029: /* PARAGRAPH SEPARATOR */
+	return 1;
+    default:
+	return 0;
+    }
 }
 
 /* Returns the titlecase Unicode characters corresponding to ch or just
@@ -327,11 +337,43 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
 /* Returns 1 for Unicode characters having the bidirectional type
    'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
 
-int _PyUnicode_IsWhitespace(Py_UNICODE ch)
+int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
 {
-    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
-
-    return (ctype->flags & SPACE_MASK) != 0;
+    switch (ch) {
+    case 0x0009: /* HORIZONTAL TABULATION */
+    case 0x000A: /* LINE FEED */
+    case 0x000B: /* VERTICAL TABULATION */
+    case 0x000C: /* FORM FEED */
+    case 0x000D: /* CARRIAGE RETURN */
+    case 0x001C: /* FILE SEPARATOR */
+    case 0x001D: /* GROUP SEPARATOR */
+    case 0x001E: /* RECORD SEPARATOR */
+    case 0x001F: /* UNIT SEPARATOR */
+    case 0x0020: /* SPACE */
+    case 0x0085: /* NEXT LINE */
+    case 0x00A0: /* NO-BREAK SPACE */
+    case 0x1680: /* OGHAM SPACE MARK */
+    case 0x2000: /* EN QUAD */
+    case 0x2001: /* EM QUAD */
+    case 0x2002: /* EN SPACE */
+    case 0x2003: /* EM SPACE */
+    case 0x2004: /* THREE-PER-EM SPACE */
+    case 0x2005: /* FOUR-PER-EM SPACE */
+    case 0x2006: /* SIX-PER-EM SPACE */
+    case 0x2007: /* FIGURE SPACE */
+    case 0x2008: /* PUNCTUATION SPACE */
+    case 0x2009: /* THIN SPACE */
+    case 0x200A: /* HAIR SPACE */
+    case 0x200B: /* ZERO WIDTH SPACE */
+    case 0x2028: /* LINE SEPARATOR */
+    case 0x2029: /* PARAGRAPH SEPARATOR */
+    case 0x202F: /* NARROW NO-BREAK SPACE */
+    case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
+    case 0x3000: /* IDEOGRAPHIC SPACE */
+	return 1;
+    default:
+	return 0;
+    }
 }
 
 /* Returns 1 for Unicode characters having the category 'Ll', 0