Skip to content

Commit d103655

Browse files
authored
Print as text if mostly text (#258)
The previous heuristic of treating a string as binary data if it contains any invalid UTF-8 was too strict. Loosen the heuristic to check whether most of the characters are printable text. Fixes #257.
1 parent 9181d1e commit d103655

File tree

3 files changed

+42
-17
lines changed

3 files changed

+42
-17
lines changed

cmp/compare_test.go

+5
Original file line number | Diff line number | Diff line change
@@ -1307,6 +1307,11 @@ using the AllowUnexported option.`, "\n"),
13071307
x: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,#=_value _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,#=_value _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,#=_value _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,#=_value _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c,#=_value _value=4 41\t",
13081308
y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\t",
13091309
reason: "leading/trailing equal spans should not appear in diff lines",
1310+
}, {
1311+
label: label + "/MostlyTextString",
1312+
x: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,\xff=_value _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,\xff=_value _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,\xff=_value _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,\xff=_value _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c,\xff=_value _value=4 41\n",
1313+
y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\n",
1314+
reason: "the presence of a few invalid UTF-8 characters should not prevent printing this as text",
13101315
}, {
13111316
label: label + "/AllLinesDiffer",
13121317
x: "d5c14bdf6bac81c27afc5429500ed750\n25483503b557c606dad4f144d27ae10b\n90bdbcdbb6ea7156068e3dcfb7459244\n978f480a6e3cced51e297fbff9a506b7\n",

cmp/report_slices.go

+18-17
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ package cmp
77
import (
88
"bytes"
99
"fmt"
10+
"math"
1011
"reflect"
1112
"strconv"
1213
"strings"
@@ -96,30 +97,29 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
9697
}
9798

9899
// Auto-detect the type of the data.
99-
var isLinedText, isText, isBinary bool
100100
var sx, sy string
101101
var ssx, ssy []string
102+
var isString, isMostlyText, isPureLinedText, isBinary bool
102103
switch {
103104
case t.Kind() == reflect.String:
104105
sx, sy = vx.String(), vy.String()
105-
isText = true // Initial estimate, verify later
106+
isString = true
106107
case t.Kind() == reflect.Slice && t.Elem() == reflect.TypeOf(byte(0)):
107108
sx, sy = string(vx.Bytes()), string(vy.Bytes())
108-
isBinary = true // Initial estimate, verify later
109+
isString = true
109110
case t.Kind() == reflect.Array:
110111
// Arrays need to be addressable for slice operations to work.
111112
vx2, vy2 := reflect.New(t).Elem(), reflect.New(t).Elem()
112113
vx2.Set(vx)
113114
vy2.Set(vy)
114115
vx, vy = vx2, vy2
115116
}
116-
if isText || isBinary {
117-
var numLines, lastLineIdx, maxLineLen int
118-
isBinary = !utf8.ValidString(sx) || !utf8.ValidString(sy)
117+
if isString {
118+
var numTotalRunes, numValidRunes, numLines, lastLineIdx, maxLineLen int
119119
for i, r := range sx + sy {
120-
if !(unicode.IsPrint(r) || unicode.IsSpace(r)) || r == utf8.RuneError {
121-
isBinary = true
122-
break
120+
numTotalRunes++
121+
if (unicode.IsPrint(r) || unicode.IsSpace(r)) && r != utf8.RuneError {
122+
numValidRunes++
123123
}
124124
if r == '\n' {
125125
if maxLineLen < i-lastLineIdx {
@@ -129,12 +129,14 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
129129
numLines++
130130
}
131131
}
132-
isText = !isBinary
133-
isLinedText = isText && numLines >= 4 && maxLineLen <= 1024
132+
isPureText := numValidRunes == numTotalRunes
133+
isMostlyText = float64(numValidRunes) > math.Floor(0.90*float64(numTotalRunes))
134+
isPureLinedText = isPureText && numLines >= 4 && maxLineLen <= 1024
135+
isBinary = !isMostlyText
134136

135137
// Avoid diffing by lines if it produces a significantly more complex
136138
// edit script than diffing by bytes.
137-
if isLinedText {
139+
if isPureLinedText {
138140
ssx = strings.Split(sx, "\n")
139141
ssy = strings.Split(sy, "\n")
140142
esLines := diff.Difference(len(ssx), len(ssy), func(ix, iy int) diff.Result {
@@ -145,7 +147,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
145147
})
146148
efficiencyLines := float64(esLines.Dist()) / float64(len(esLines))
147149
efficiencyBytes := float64(esBytes.Dist()) / float64(len(esBytes))
148-
isLinedText = efficiencyLines < 4*efficiencyBytes
150+
isPureLinedText = efficiencyLines < 4*efficiencyBytes
149151
}
150152
}
151153

@@ -155,7 +157,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
155157
switch {
156158
// If the text appears to be multi-lined text,
157159
// then perform differencing across individual lines.
158-
case isLinedText:
160+
case isPureLinedText:
159161
list = opts.formatDiffSlice(
160162
reflect.ValueOf(ssx), reflect.ValueOf(ssy), 1, "line",
161163
func(v reflect.Value, d diffMode) textRecord {
@@ -244,15 +246,14 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
244246
// If the text appears to be single-lined text,
245247
// then perform differencing in approximately fixed-sized chunks.
246248
// The output is printed as quoted strings.
247-
case isText:
249+
case isMostlyText:
248250
list = opts.formatDiffSlice(
249251
reflect.ValueOf(sx), reflect.ValueOf(sy), 64, "byte",
250252
func(v reflect.Value, d diffMode) textRecord {
251253
s := formatString(v.String())
252254
return textRecord{Diff: d, Value: textLine(s)}
253255
},
254256
)
255-
delim = ""
256257

257258
// If the text appears to be binary data,
258259
// then perform differencing in approximately fixed-sized chunks.
@@ -314,7 +315,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
314315

315316
// Wrap the output with appropriate type information.
316317
var out textNode = &textWrap{Prefix: "{", Value: list, Suffix: "}"}
317-
if !isText {
318+
if !isMostlyText {
318319
// The "{...}" byte-sequence literal is not valid Go syntax for strings.
319320
// Emit the type for extra clarity (e.g. "string{...}").
320321
if t.Kind() == reflect.String {

cmp/testdata/diffs

+19
Original file line number | Diff line number | Diff line change
@@ -1065,6 +1065,25 @@
10651065
` _value=4 41 `,
10661066
}, "")
10671067
>>> TestDiff/Reporter/SurroundingEqualElements
1068+
<<< TestDiff/Reporter/MostlyTextString
1069+
strings.Join({
1070+
"org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa",
1071+
- ",\xff=_value",
1072+
" _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1",
1073+
"=a,tag2=bb",
1074+
- ",\xff=_value",
1075+
" _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
1076+
"=b,tag2=cc",
1077+
- ",\xff=_value",
1078+
" _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
1079+
"=a,tag2=dd",
1080+
- ",\xff=_value",
1081+
" _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1",
1082+
"=c",
1083+
- ",\xff=_value",
1084+
" _value=4 41\n",
1085+
}, "")
1086+
>>> TestDiff/Reporter/MostlyTextString
10681087
<<< TestDiff/Reporter/AllLinesDiffer
10691088
strings.Join({
10701089
+ "X",

0 commit comments

Comments
 (0)