From f1daf0a8721cbd3ea5014322eaf9b36dc0d7334a Mon Sep 17 00:00:00 2001 From: Gabriel Burt Date: Tue, 24 Dec 2024 23:18:01 +0000 Subject: [PATCH 1/4] collate: fix Compare[String] funcs to use key comparisons and add unit tests for the UCA Variable Weighting examples --- collate/collate.go | 94 +++++++------------------------------- collate/collate_test.go | 18 ++++++++ collate/sort_test.go | 99 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 78 deletions(-) diff --git a/collate/collate.go b/collate/collate.go index d8c23cb69..17863e18f 100644 --- a/collate/collate.go +++ b/collate/collate.go @@ -102,90 +102,28 @@ func (b *Buffer) Reset() { // Compare returns an integer comparing the two byte slices. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. +// Note that this is less performant than calling c.Sort() because +// a new buffer will be allocated for each call. func (c *Collator) Compare(a, b []byte) int { - // TODO: skip identical prefixes once we have a fast way to detect if a rune is - // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. - c.iter(0).SetInput(a) - c.iter(1).SetInput(b) - if res := c.compare(); res != 0 { - return res - } - if !c.ignore[colltab.Identity] { - return bytes.Compare(a, b) - } - return 0 + var ( + buf Buffer + kA = c.Key(&buf, a) + kB = c.Key(&buf, b) + ) + return bytes.Compare(kA, kB) } // CompareString returns an integer comparing the two strings. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. +// Note that this is less performant than calling c.Sort() because +// a new buffer will be allocated for each call. func (c *Collator) CompareString(a, b string) int { - // TODO: skip identical prefixes once we have a fast way to detect if a rune is - // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. - c.iter(0).SetInputString(a) - c.iter(1).SetInputString(b) - if res := c.compare(); res != 0 { - return res - } - if !c.ignore[colltab.Identity] { - if a < b { - return -1 - } else if a > b { - return 1 - } - } - return 0 -} - -func compareLevel(f func(i *iter) int, a, b *iter) int { - a.pce = 0 - b.pce = 0 - for { - va := f(a) - vb := f(b) - if va != vb { - if va < vb { - return -1 - } - return 1 - } else if va == 0 { - break - } - } - return 0 -} - -func (c *Collator) compare() int { - ia, ib := c.iter(0), c.iter(1) - // Process primary level - if c.alternate != altShifted { - // TODO: implement script reordering - if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 { - return res - } - } else { - // TODO: handle shifted - } - if !c.ignore[colltab.Secondary] { - f := (*iter).nextSecondary - if c.backwards { - f = (*iter).prevSecondary - } - if res := compareLevel(f, ia, ib); res != 0 { - return res - } - } - // TODO: special case handling (Danish?) - if !c.ignore[colltab.Tertiary] || c.caseLevel { - if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 { - return res - } - if !c.ignore[colltab.Quaternary] { - if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 { - return res - } - } - } - return 0 + var ( + buf Buffer + kA = c.KeyFromString(&buf, a) + kB = c.KeyFromString(&buf, b) + ) + return bytes.Compare(kA, kB) } // Key returns the collation key for str. diff --git a/collate/collate_test.go b/collate/collate_test.go index 0e78b96fa..841114325 100644 --- a/collate/collate_test.go +++ b/collate/collate_test.go @@ -450,6 +450,24 @@ func TestCompare(t *testing.T) { t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res) } } + + c = New(language.MustParse("en-us-u-ka-posix-ks-level4")) + if c.CompareString("deluge", "de luge") != -1 { + t.Errorf("CompareString for 'deluge' vs 'de luge' in Shift-Trimmed mode should return -1 but returned %v", c.CompareString("deluge", "de luge")) + } +} + +func TestKeyFromStringCompareForShiftTrimmed(t *testing.T) { + var ( + c = New(language.MustParse("en-us-u-ka-posix-ks-level4")) + buf Buffer + kA = c.KeyFromString(&buf, "deluge") + kB = c.KeyFromString(&buf, "de luge") + ) + + if bytes.Compare(kA, kB) != -1 { + t.Errorf("The Keys for 'deluge' should sort before the key for 'de luge' in Shift-Trimmed mode, but it compares as %v", bytes.Compare(kA, kB)) + } } func TestNumeric(t *testing.T) { diff --git a/collate/sort_test.go b/collate/sort_test.go index 4bbb227fc..5a05c9bb5 100644 --- a/collate/sort_test.go +++ b/collate/sort_test.go @@ -5,7 +5,9 @@ package collate_test import ( + "bytes" "fmt" + "strings" "testing" "golang.org/x/text/collate" @@ -53,3 +55,100 @@ func TestSort(t *testing.T) { t.Errorf("found %s; want %s", res, want) } } + +func TestSortStringsAndCompareString(t *testing.T) { + for _, tt := range []struct { + name string + c *collate.Collator + want []string + }{ + { + name: "English default options", + c: collate.New(language.English), + want: []string{ + "abc", + "bcd", + "ddd", + }, + }, + { + // From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples + name: "Blanked", + c: collate.New(language.MustParse("en-us-u-ka-blanked")), + want: []string{ + "death", + "de luge", + "de-luge", + "deluge", + "de-luge", + "de Luge", + "de-Luge", + "deLuge", + "de-Luge", + "demark", + }, + }, + { + // From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples + name: "Shifted", + c: collate.New(language.MustParse("en-us-u-ka-shifted")), + want: []string{ + "death", + "de luge", + "de-luge", + "de-luge", + "deluge", + "de Luge", + "de-Luge", + "de-Luge", + "deLuge", + "demark", + }, + }, + { + // From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples + name: "Shift-Trimmed", + c: collate.New(language.MustParse("en-us-u-ka-posix-ks-level4")), + want: []string{ + "death", + "deluge", + "de luge", + "de-luge", + "de-luge", + "deLuge", + "de Luge", + "de-Luge", + "de-Luge", + "demark", + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + actual := make([]string, len(tt.want)) + copy(actual, tt.want) + tt.c.SortStrings(actual) + + p := func(v []string) string { return strings.Join(v, ", ") } + if p(tt.want) != p(actual) { + t.Errorf("SortStrings want: '%v'\n Got: '%v'", p(tt.want), p(actual)) + } + + buf := collate.Buffer{} + for i := 0; i < len(tt.want)-1; i++ { + a, b := tt.want[i], tt.want[i+1] + kA, kB := tt.c.KeyFromString(&buf, a), tt.c.KeyFromString(&buf, b) + if bytes.Compare(kA, kB) > 0 { + t.Errorf("KeyFromString for %v is bigger than for %v", a, b) + } + } + + for i := 0; i < len(tt.want)-1; i++ { + a, b := tt.want[i], tt.want[i+1] + cmp := tt.c.CompareString(a, b) + if cmp > 0 { + t.Errorf("CompareString for '%v' vs '%v' is 1 when should be -1 or 0", a, b) + } + } + }) + } +} From 7e21e25a1e99f1ca4592cbdb0ea0a401c6b2b4d4 Mon Sep 17 00:00:00 2001 From: Gabriel Burt Date: Mon, 30 Dec 2024 17:36:43 +0000 Subject: [PATCH 2/4] use a shared buf --- collate/collate.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/collate/collate.go b/collate/collate.go index 17863e18f..127f392ab 100644 --- a/collate/collate.go +++ b/collate/collate.go @@ -19,12 +19,14 @@ import ( ) // Collator provides functionality for comparing strings for a given -// collation order. +// collation order. A given Collator is not safe to use concurrently. type Collator struct { options sorter sorter + buf Buffer + _iter [2]iter } @@ -106,9 +108,8 @@ func (b *Buffer) Reset() { // a new buffer will be allocated for each call. func (c *Collator) Compare(a, b []byte) int { var ( - buf Buffer - kA = c.Key(&buf, a) - kB = c.Key(&buf, b) + kA = c.Key(&c.buf, a) + kB = c.Key(&c.buf, b) ) return bytes.Compare(kA, kB) } @@ -119,9 +120,8 @@ func (c *Collator) Compare(a, b []byte) int { // a new buffer will be allocated for each call. func (c *Collator) CompareString(a, b string) int { var ( - buf Buffer - kA = c.KeyFromString(&buf, a) - kB = c.KeyFromString(&buf, b) + kA = c.KeyFromString(&c.buf, a) + kB = c.KeyFromString(&c.buf, b) ) return bytes.Compare(kA, kB) } From 14ca72f6771ba813ebba25fd20b3bdffbd026669 Mon Sep 17 00:00:00 2001 From: Gabriel Burt Date: Tue, 7 Jan 2025 22:10:06 +0000 Subject: [PATCH 3/4] avoid memory leak; add test for zero allocations for Compare calls --- collate/collate.go | 16 ++++++++++------ collate/collate_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/collate/collate.go b/collate/collate.go index 127f392ab..6d3918990 100644 --- a/collate/collate.go +++ b/collate/collate.go @@ -108,10 +108,12 @@ func (b *Buffer) Reset() { // a new buffer will be allocated for each call. func (c *Collator) Compare(a, b []byte) int { var ( - kA = c.Key(&c.buf, a) - kB = c.Key(&c.buf, b) + kA = c.Key(&c.buf, a) + kB = c.Key(&c.buf, b) + ret = bytes.Compare(kA, kB) ) - return bytes.Compare(kA, kB) + c.buf.Reset() + return ret } // CompareString returns an integer comparing the two strings. @@ -120,10 +122,12 @@ func (c *Collator) Compare(a, b []byte) int { // a new buffer will be allocated for each call. func (c *Collator) CompareString(a, b string) int { var ( - kA = c.KeyFromString(&c.buf, a) - kB = c.KeyFromString(&c.buf, b) + kA = c.KeyFromString(&c.buf, a) + kB = c.KeyFromString(&c.buf, b) + ret = bytes.Compare(kA, kB) ) - return bytes.Compare(kA, kB) + c.buf.Reset() + return ret } // Key returns the collation key for str. diff --git a/collate/collate_test.go b/collate/collate_test.go index 841114325..9a4ad8b39 100644 --- a/collate/collate_test.go +++ b/collate/collate_test.go @@ -6,6 +6,7 @@ package collate import ( "bytes" + "runtime" "testing" "golang.org/x/text/internal/colltab" @@ -470,6 +471,32 @@ func TestKeyFromStringCompareForShiftTrimmed(t *testing.T) { } } +func totalAllocs() uint64 { + var mem runtime.MemStats + runtime.ReadMemStats(&mem) + return mem.TotalAlloc +} + +func TestNoAllocationsForCompares(t *testing.T) { + var a = []byte("foo") + var b = []byte("bar") + var c = New(language.MustParse("en-us-u-ka-posix-ks-level4")) + var before = totalAllocs() + + for i := 0; i < 100; i++ { + c.CompareString("foo", "bar") + c.Compare(a, b) + } + + var bytesAllocated = totalAllocs() - before + + // We should allocate zero additional bytes b/c the Buffer has 4k of memory pre-allocated that + // we should re-use for each/every key we generate for the comparisons. + if bytesAllocated != 0 { + t.Errorf("Expected CompareStrings to not allocate memory but it allocated %v", bytesAllocated) + } +} + func TestNumeric(t *testing.T) { c := New(language.English, Loose, Numeric) From 086270d37581eb92fc91a57ef30a17c2d4a99c67 Mon Sep 17 00:00:00 2001 From: Gabriel Burt Date: Tue, 21 Jan 2025 21:34:52 +0000 Subject: [PATCH 4/4] var -> := --- collate/collate.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/collate/collate.go b/collate/collate.go index 6d3918990..6177bce9f 100644 --- a/collate/collate.go +++ b/collate/collate.go @@ -107,11 +107,9 @@ func (b *Buffer) Reset() { // Note that this is less performant than calling c.Sort() because // a new buffer will be allocated for each call. func (c *Collator) Compare(a, b []byte) int { - var ( - kA = c.Key(&c.buf, a) - kB = c.Key(&c.buf, b) - ret = bytes.Compare(kA, kB) - ) + kA := c.Key(&c.buf, a) + kB := c.Key(&c.buf, b) + ret := bytes.Compare(kA, kB) c.buf.Reset() return ret } @@ -121,11 +119,9 @@ func (c *Collator) Compare(a, b []byte) int { // Note that this is less performant than calling c.Sort() because // a new buffer will be allocated for each call. func (c *Collator) CompareString(a, b string) int { - var ( - kA = c.KeyFromString(&c.buf, a) - kB = c.KeyFromString(&c.buf, b) - ret = bytes.Compare(kA, kB) - ) + kA := c.KeyFromString(&c.buf, a) + kB := c.KeyFromString(&c.buf, b) + ret := bytes.Compare(kA, kB) c.buf.Reset() return ret }