From c9aa6fa3aee39a0cf5697090a560b01183d7b942 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Fri, 14 Feb 2020 21:22:00 +0900 Subject: [PATCH 1/2] Implement a special-case lookup for ascii grapheme categories. This speeds up processing even for many non-ascii texts, since they often still use ascii-range punctuation and whitespace. --- src/grapheme.rs | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/grapheme.rs b/src/grapheme.rs index b66536e..067de25 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -284,12 +284,30 @@ impl GraphemeCursor { fn grapheme_category(&mut self, ch: char) -> GraphemeCat { use tables::grapheme as gr; - // If this char isn't within the cached range, update the cache to the - // range that includes it. - if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 { - self.grapheme_cat_cache = gr::grapheme_category(ch); + use tables::grapheme::GraphemeCat::*; + + if ch <= '\u{7e}' { + // Special-case optimization for ascii, except U+007F. This + // improves performance even for many primarily non-ascii texts, + // due to use of punctuation and white space characters from the + // ascii range. + if ch >= '\u{20}' { + GC_Any + } else if ch == '\u{a}' { + GC_LF + } else if ch == '\u{d}' { + GC_CR + } else { + GC_Control + } + } else { + // If this char isn't within the cached range, update the cache to the + // range that includes it. + if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 { + self.grapheme_cat_cache = gr::grapheme_category(ch); + } + self.grapheme_cat_cache.2 } - self.grapheme_cat_cache.2 } // Not sure I'm gonna keep this, the advantage over new() seems thin. 
From 945dbb61c7db54c98c3282b8ec42ee0acd010ca5 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Fri, 14 Feb 2020 12:44:14 -0800 Subject: [PATCH 2/2] Apply suggestions from code review --- src/grapheme.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/grapheme.rs b/src/grapheme.rs index 067de25..176a7aa 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -293,9 +293,9 @@ impl GraphemeCursor { // ascii range. if ch >= '\u{20}' { GC_Any - } else if ch == '\u{a}' { + } else if ch == '\n' { GC_LF - } else if ch == '\u{d}' { + } else if ch == '\r' { GC_CR } else { GC_Control