Skip to content

Commit 0537a49

Browse files
committed
fixes #24 changes $ behavior in singleline when using RE2 and ECMAScript modes
PCRE and .NET have a different definition of $ than RE2 and ECMAScript engines in singleline mode. PCRE defines it as "$ asserts position at the end of the string, or before the line terminator right at the end of the string (if any)." This means that a pattern of "^ac$\n" is valid and can match "ac\n" OR "ac". This behavior is different in RE2 and ECMAScript engines. For these engines the pattern "^ac$\n" won't match any inputs in singleline mode because the $ demands the string ends but the pattern requires an extra \n so they both cannot be true. The PCRE/.NET behavior feels wrong, but for this project I maintain compatibility with them in "default" mode. The other, less surprising behavior is enabled by using either the RE2 option or the ECMAScript option.
1 parent f48b8c1 commit 0537a49

File tree

3 files changed

+56
-1
lines changed

3 files changed

+56
-1
lines changed

regexp_re2_test.go

+23
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,26 @@ func TestRE2NamedAscii_Concat(t *testing.T) {
9696
t.Fatal("Expected match")
9797
}
9898
}
99+
100+
func TestRE2Dollar_Singleline(t *testing.T) {
101+
// PCRE allows for \n after the $ and RE2 doesn't
102+
r := MustCompile(`^ac$\n`, RE2)
103+
if m, _ := r.MatchString("ac"); m {
104+
t.Fatal("Expected no match")
105+
}
106+
if m, _ := r.MatchString("ac\n"); m {
107+
t.Fatal("Expected no match")
108+
}
109+
}
110+
111+
func TestRE2Dollar_Multiline(t *testing.T) {
112+
r := MustCompile(`^ac$\n`, RE2|Multiline)
113+
if m, _ := r.MatchString("ac"); m {
114+
t.Fatal("Expected no match")
115+
}
116+
if m, err := r.MatchString("ac\n"); err != nil {
117+
t.Fatal(err)
118+
} else if !m {
119+
t.Fatal("Expected match")
120+
}
121+
}

regexp_test.go

+19
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,25 @@ func TestECMANegateRange(t *testing.T) {
758758
}
759759
}
760760

761+
func TestDollar(t *testing.T) {
762+
// PCRE/C# allow \n to match to $ at end-of-string in singleline mode...
763+
// a weird edge-case kept for compatibility, ECMAScript/RE2 mode don't allow it
764+
re := MustCompile(`ac$`, 0)
765+
if m, err := re.MatchString("ac\n"); err != nil {
766+
t.Fatal(err)
767+
} else if !m {
768+
t.Fatal("Expected match")
769+
}
770+
}
771+
func TestECMADollar(t *testing.T) {
772+
re := MustCompile(`ac$`, ECMAScript)
773+
if m, err := re.MatchString("ac\n"); err != nil {
774+
t.Fatal(err)
775+
} else if m {
776+
t.Fatal("Expected no match")
777+
}
778+
}
779+
761780
func TestThreeByteUnicode_InputOnly(t *testing.T) {
762781
// confirm the bmprefix properly ignores 3-byte unicode in the input value
763782
// this used to panic

runner.go

+14-1
Original file line numberDiff line numberDiff line change
@@ -566,9 +566,22 @@ func (r *runner) execute() error {
566566
continue
567567

568568
case syntax.EndZ:
569-
if r.rightchars() > 1 || r.rightchars() == 1 && r.charAt(r.textPos()) != '\n' {
569+
rchars := r.rightchars()
570+
if rchars > 1 {
570571
break
571572
}
573+
// RE2 and EcmaScript define $ as "asserts position at the end of the string"
574+
// PCRE/.NET adds "or before the line terminator right at the end of the string (if any)"
575+
if (r.re.options & (RE2 | ECMAScript)) != 0 {
576+
// RE2/Ecmascript mode
577+
if rchars > 0 {
578+
break
579+
}
580+
} else if rchars == 1 && r.charAt(r.textPos()) != '\n' {
581+
// "regular" mode
582+
break
583+
}
584+
572585
r.advance(0)
573586
continue
574587

0 commit comments

Comments
 (0)