diff --git a/collate/collate.go b/collate/collate.go index d8c23cb6..6177bce9 100644 --- a/collate/collate.go +++ b/collate/collate.go @@ -19,12 +19,14 @@ import ( ) // Collator provides functionality for comparing strings for a given -// collation order. +// collation order. A given Collator is not safe to use concurrently. type Collator struct { options sorter sorter + buf Buffer + _iter [2]iter } @@ -102,90 +104,26 @@ func (b *Buffer) Reset() { // Compare returns an integer comparing the two byte slices. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. +// Note that this is less performant than calling c.Sort() because +// a new buffer will be allocated for each call. func (c *Collator) Compare(a, b []byte) int { - // TODO: skip identical prefixes once we have a fast way to detect if a rune is - // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. - c.iter(0).SetInput(a) - c.iter(1).SetInput(b) - if res := c.compare(); res != 0 { - return res - } - if !c.ignore[colltab.Identity] { - return bytes.Compare(a, b) - } - return 0 + kA := c.Key(&c.buf, a) + kB := c.Key(&c.buf, b) + ret := bytes.Compare(kA, kB) + c.buf.Reset() + return ret } // CompareString returns an integer comparing the two strings. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. +// Note that this is less performant than calling c.Sort() because +// a new buffer will be allocated for each call. func (c *Collator) CompareString(a, b string) int { - // TODO: skip identical prefixes once we have a fast way to detect if a rune is - // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. - c.iter(0).SetInputString(a) - c.iter(1).SetInputString(b) - if res := c.compare(); res != 0 { - return res - } - if !c.ignore[colltab.Identity] { - if a < b { - return -1 - } else if a > b { - return 1 - } - } - return 0 -} - -func compareLevel(f func(i *iter) int, a, b *iter) int { - a.pce = 0 - b.pce = 0 - for { - va := f(a) - vb := f(b) - if va != vb { - if va < vb { - return -1 - } - return 1 - } else if va == 0 { - break - } - } - return 0 -} - -func (c *Collator) compare() int { - ia, ib := c.iter(0), c.iter(1) - // Process primary level - if c.alternate != altShifted { - // TODO: implement script reordering - if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 { - return res - } - } else { - // TODO: handle shifted - } - if !c.ignore[colltab.Secondary] { - f := (*iter).nextSecondary - if c.backwards { - f = (*iter).prevSecondary - } - if res := compareLevel(f, ia, ib); res != 0 { - return res - } - } - // TODO: special case handling (Danish?) - if !c.ignore[colltab.Tertiary] || c.caseLevel { - if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 { - return res - } - if !c.ignore[colltab.Quaternary] { - if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 { - return res - } - } - } - return 0 + kA := c.KeyFromString(&c.buf, a) + kB := c.KeyFromString(&c.buf, b) + ret := bytes.Compare(kA, kB) + c.buf.Reset() + return ret } // Key returns the collation key for str. diff --git a/collate/collate_test.go b/collate/collate_test.go index 0e78b96f..9a4ad8b3 100644 --- a/collate/collate_test.go +++ b/collate/collate_test.go @@ -6,6 +6,7 @@ package collate import ( "bytes" + "runtime" "testing" "golang.org/x/text/internal/colltab" @@ -450,6 +451,50 @@ func TestCompare(t *testing.T) { t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res) } } + + c = New(language.MustParse("en-us-u-ka-posix-ks-level4")) + if c.CompareString("deluge", "de luge") != -1 { + t.Errorf("CompareString for 'deluge' vs 'de luge' in Shift-Trimmed mode should return -1 but returned %v", c.CompareString("deluge", "de luge")) + } +} + +func TestKeyFromStringCompareForShiftTrimmed(t *testing.T) { + var ( + c = New(language.MustParse("en-us-u-ka-posix-ks-level4")) + buf Buffer + kA = c.KeyFromString(&buf, "deluge") + kB = c.KeyFromString(&buf, "de luge") + ) + + if bytes.Compare(kA, kB) != -1 { + t.Errorf("The Keys for 'deluge' should sort before the key for 'de luge' in Shift-Trimmed mode, but it compares as %v", bytes.Compare(kA, kB)) + } +} + +func totalAllocs() uint64 { + var mem runtime.MemStats + runtime.ReadMemStats(&mem) + return mem.TotalAlloc +} + +func TestNoAllocationsForCompares(t *testing.T) { + var a = []byte("foo") + var b = []byte("bar") + var c = New(language.MustParse("en-us-u-ka-posix-ks-level4")) + var before = totalAllocs() + + for i := 0; i < 100; i++ { + c.CompareString("foo", "bar") + c.Compare(a, b) + } + + var bytesAllocated = totalAllocs() - before + + // We should allocate zero additional bytes b/c the Buffer has 4k of memory pre-allocated that + // we should re-use for each/every key we generate for the comparisons. + if bytesAllocated != 0 { + t.Errorf("Expected CompareStrings to not allocate memory but it allocated %v", bytesAllocated) + } } func TestNumeric(t *testing.T) { diff --git a/collate/sort_test.go b/collate/sort_test.go index 4bbb227f..5a05c9bb 100644 --- a/collate/sort_test.go +++ b/collate/sort_test.go @@ -5,7 +5,9 @@ package collate_test import ( + "bytes" "fmt" + "strings" "testing" "golang.org/x/text/collate" @@ -53,3 +55,100 @@ func TestSort(t *testing.T) { t.Errorf("found %s; want %s", res, want) } } + +func TestSortStringsAndCompareString(t *testing.T) { + for _, tt := range []struct { + name string + c *collate.Collator + want []string + }{ + { + name: "English default options", + c: collate.New(language.English), + want: []string{ + "abc", + "bcd", + "ddd", + }, + }, + { + // From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples + name: "Blanked", + c: collate.New(language.MustParse("en-us-u-ka-blanked")), + want: []string{ + "death", + "de luge", + "de-luge", + "deluge", + "de-luge", + "de Luge", + "de-Luge", + "deLuge", + "de-Luge", + "demark", + }, + }, + { + // From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples + name: "Shifted", + c: collate.New(language.MustParse("en-us-u-ka-shifted")), + want: []string{ + "death", + "de luge", + "de-luge", + "de-luge", + "deluge", + "de Luge", + "de-Luge", + "de-Luge", + "deLuge", + "demark", + }, + }, + { + // From https://www.unicode.org/reports/tr10/#Variable_Weighting_Examples + name: "Shift-Trimmed", + c: collate.New(language.MustParse("en-us-u-ka-posix-ks-level4")), + want: []string{ + "death", + "deluge", + "de luge", + "de-luge", + "de-luge", + "deLuge", + "de Luge", + "de-Luge", + "de-Luge", + "demark", + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + actual := make([]string, len(tt.want)) + copy(actual, tt.want) + tt.c.SortStrings(actual) + + p := func(v []string) string { return strings.Join(v, ", ") } + if p(tt.want) != p(actual) { + t.Errorf("SortStrings want: '%v'\n Got: '%v'", p(tt.want), p(actual)) + } + + buf := collate.Buffer{} + for i := 0; i < len(tt.want)-1; i++ { + a, b := tt.want[i], tt.want[i+1] + kA, kB := tt.c.KeyFromString(&buf, a), tt.c.KeyFromString(&buf, b) + if bytes.Compare(kA, kB) > 0 { + t.Errorf("KeyFromString for %v is bigger than for %v", a, b) + } + } + + for i := 0; i < len(tt.want)-1; i++ { + a, b := tt.want[i], tt.want[i+1] + cmp := tt.c.CompareString(a, b) + if cmp > 0 { + t.Errorf("CompareString for '%v' vs '%v' is 1 when should be -1 or 0", a, b) + } + } + }) + } +}