Skip to content

Commit 0fc24d2

Browse files
committed
syntax: add is_line_anchored_{start,end}
This commit adds two new predicates to `Hir` values that permit querying whether an expression is *line* anchored at the start or end. This was motivated by a desire to tweak the offsets of a match when enabling --crlf mode in ripgrep.
1 parent 036cad6 commit 0fc24d2

File tree

2 files changed

+150
-8
lines changed

2 files changed

+150
-8
lines changed

Diff for: regex-syntax/src/hir/mod.rs

+87-5
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,8 @@ impl Hir {
222222
info.set_all_assertions(true);
223223
info.set_anchored_start(false);
224224
info.set_anchored_end(false);
225+
info.set_line_anchored_start(false);
226+
info.set_line_anchored_end(false);
225227
info.set_any_anchored_start(false);
226228
info.set_any_anchored_end(false);
227229
info.set_match_empty(true);
@@ -246,6 +248,8 @@ impl Hir {
246248
info.set_all_assertions(false);
247249
info.set_anchored_start(false);
248250
info.set_anchored_end(false);
251+
info.set_line_anchored_start(false);
252+
info.set_line_anchored_end(false);
249253
info.set_any_anchored_start(false);
250254
info.set_any_anchored_end(false);
251255
info.set_match_empty(false);
@@ -262,6 +266,8 @@ impl Hir {
262266
info.set_all_assertions(false);
263267
info.set_anchored_start(false);
264268
info.set_anchored_end(false);
269+
info.set_line_anchored_start(false);
270+
info.set_line_anchored_end(false);
265271
info.set_any_anchored_start(false);
266272
info.set_any_anchored_end(false);
267273
info.set_match_empty(false);
@@ -278,17 +284,27 @@ impl Hir {
278284
info.set_all_assertions(true);
279285
info.set_anchored_start(false);
280286
info.set_anchored_end(false);
287+
info.set_line_anchored_start(false);
288+
info.set_line_anchored_end(false);
281289
info.set_any_anchored_start(false);
282290
info.set_any_anchored_end(false);
283291
info.set_match_empty(true);
284292
if let Anchor::StartText = anchor {
285293
info.set_anchored_start(true);
294+
info.set_line_anchored_start(true);
286295
info.set_any_anchored_start(true);
287296
}
288297
if let Anchor::EndText = anchor {
289298
info.set_anchored_end(true);
299+
info.set_line_anchored_end(true);
290300
info.set_any_anchored_end(true);
291301
}
302+
if let Anchor::StartLine = anchor {
303+
info.set_line_anchored_start(true);
304+
}
305+
if let Anchor::EndLine = anchor {
306+
info.set_line_anchored_end(true);
307+
}
292308
Hir {
293309
kind: HirKind::Anchor(anchor),
294310
info: info,
@@ -302,6 +318,8 @@ impl Hir {
302318
info.set_all_assertions(true);
303319
info.set_anchored_start(false);
304320
info.set_anchored_end(false);
321+
info.set_line_anchored_start(false);
322+
info.set_line_anchored_end(false);
305323
info.set_any_anchored_start(false);
306324
info.set_any_anchored_end(false);
307325
// A negated word boundary matches the empty string, but a normal
@@ -330,6 +348,12 @@ impl Hir {
330348
info.set_anchored_end(
331349
!rep.is_match_empty() && rep.hir.is_anchored_end()
332350
);
351+
// A repetition that cannot match the empty string inherits the *line*
// anchoring of its sub-expression, so the line-anchored predicates must be
// consulted here. Using the text-anchored predicates would wrongly report
// `(?m:^)+` and `(?m:$)+` as not line anchored.
info.set_line_anchored_start(
352+
!rep.is_match_empty() && rep.hir.is_line_anchored_start()
353+
);
354+
info.set_line_anchored_end(
355+
!rep.is_match_empty() && rep.hir.is_line_anchored_end()
356+
);
333357
info.set_any_anchored_start(rep.hir.is_any_anchored_start());
334358
info.set_any_anchored_end(rep.hir.is_any_anchored_end());
335359
info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
@@ -346,6 +370,8 @@ impl Hir {
346370
info.set_all_assertions(group.hir.is_all_assertions());
347371
info.set_anchored_start(group.hir.is_anchored_start());
348372
info.set_anchored_end(group.hir.is_anchored_end());
373+
info.set_line_anchored_start(group.hir.is_line_anchored_start());
374+
info.set_line_anchored_end(group.hir.is_line_anchored_end());
349375
info.set_any_anchored_start(group.hir.is_any_anchored_start());
350376
info.set_any_anchored_end(group.hir.is_any_anchored_end());
351377
info.set_match_empty(group.hir.is_match_empty());
@@ -361,7 +387,7 @@ impl Hir {
361387
pub fn concat(mut exprs: Vec<Hir>) -> Hir {
362388
match exprs.len() {
363389
0 => Hir::empty(),
364-
1 => exprs.pop().unwrap(),
390+
1 => { exprs.pop().unwrap() }
365391
_ => {
366392
let mut info = HirInfo::new();
367393
info.set_always_utf8(true);
@@ -418,6 +444,24 @@ impl Hir {
418444
.any(|e| {
419445
e.is_anchored_end()
420446
}));
447+
// Repeat the process for line anchors.
448+
info.set_line_anchored_start(
449+
exprs.iter()
450+
.take_while(|e| {
451+
e.is_line_anchored_start() || e.is_all_assertions()
452+
})
453+
.any(|e| {
454+
e.is_line_anchored_start()
455+
}));
456+
info.set_line_anchored_end(
457+
exprs.iter()
458+
.rev()
459+
.take_while(|e| {
460+
e.is_line_anchored_end() || e.is_all_assertions()
461+
})
462+
.any(|e| {
463+
e.is_line_anchored_end()
464+
}));
421465
Hir {
422466
kind: HirKind::Concat(exprs),
423467
info: info,
@@ -439,6 +483,8 @@ impl Hir {
439483
info.set_all_assertions(true);
440484
info.set_anchored_start(true);
441485
info.set_anchored_end(true);
486+
info.set_line_anchored_start(true);
487+
info.set_line_anchored_end(true);
442488
info.set_any_anchored_start(false);
443489
info.set_any_anchored_end(false);
444490
info.set_match_empty(false);
@@ -457,6 +503,14 @@ impl Hir {
457503
let x = info.is_anchored_end() && e.is_anchored_end();
458504
info.set_anchored_end(x);
459505

506+
let x = info.is_line_anchored_start()
507+
&& e.is_line_anchored_start();
508+
info.set_line_anchored_start(x);
509+
510+
let x = info.is_line_anchored_end()
511+
&& e.is_line_anchored_end();
512+
info.set_line_anchored_end(x);
513+
460514
let x =
461515
info.is_any_anchored_start()
462516
|| e.is_any_anchored_start();
@@ -551,6 +605,32 @@ impl Hir {
551605
self.info.is_anchored_end()
552606
}
553607

608+
/// Return true if and only if this HIR is required to match from the
609+
/// beginning of text or the beginning of a line. This includes expressions
610+
/// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `(?m)^(foo|bar)`, `(?m)^foo|^bar`,
611+
/// but not `^foo|bar` or `(?m)^foo|bar`.
612+
///
613+
/// Note that if `is_anchored_start` is `true`, then
614+
/// `is_line_anchored_start` will also be `true`. The reverse implication
615+
/// is not true. For example, `(?m)^foo` is line anchored, but not
616+
/// `is_anchored_start`.
617+
pub fn is_line_anchored_start(&self) -> bool {
618+
self.info.is_line_anchored_start()
619+
}
620+
621+
/// Return true if and only if this HIR is required to match at the
622+
/// end of text or the end of a line. This includes expressions like
623+
/// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
624+
/// `(?m)foo$|bar$`, but not `foo$|bar` or `(?m)foo$|bar`.
625+
///
626+
/// Note that if `is_anchored_end` is `true`, then
627+
/// `is_line_anchored_end` will also be `true`. The reverse implication
628+
/// is not true. For example, `(?m)foo$` is line anchored, but not
629+
/// `is_anchored_end`.
630+
pub fn is_line_anchored_end(&self) -> bool {
631+
self.info.is_line_anchored_end()
632+
}
633+
554634
/// Return true if and only if this HIR contains any sub-expression that
555635
/// is required to match at the beginning of text. Specifically, this
556636
/// returns true if the `^` symbol (when multiline mode is disabled) or the
@@ -1299,7 +1379,7 @@ struct HirInfo {
12991379
///
13001380
/// If more attributes need to be added, it is OK to increase the size of
13011381
/// this as appropriate.
1302-
bools: u8,
1382+
bools: u16,
13031383
}
13041384

13051385
// A simple macro for defining bitfield accessors/mutators.
@@ -1330,9 +1410,11 @@ impl HirInfo {
13301410
define_bool!(1, is_all_assertions, set_all_assertions);
13311411
define_bool!(2, is_anchored_start, set_anchored_start);
13321412
define_bool!(3, is_anchored_end, set_anchored_end);
1333-
define_bool!(4, is_any_anchored_start, set_any_anchored_start);
1334-
define_bool!(5, is_any_anchored_end, set_any_anchored_end);
1335-
define_bool!(6, is_match_empty, set_match_empty);
1413+
define_bool!(4, is_line_anchored_start, set_line_anchored_start);
1414+
define_bool!(5, is_line_anchored_end, set_line_anchored_end);
1415+
define_bool!(6, is_any_anchored_start, set_any_anchored_start);
1416+
define_bool!(7, is_any_anchored_end, set_any_anchored_end);
1417+
define_bool!(8, is_match_empty, set_match_empty);
13361418
}
13371419

13381420
#[cfg(test)]

Diff for: regex-syntax/src/hir/translate.rs

+63-3
Original file line numberDiff line numberDiff line change
@@ -2414,40 +2414,68 @@ mod tests {
24142414
// Positive examples.
24152415
assert!(t(r"^").is_anchored_start());
24162416
assert!(t(r"$").is_anchored_end());
2417+
assert!(t(r"^").is_line_anchored_start());
2418+
assert!(t(r"$").is_line_anchored_end());
24172419

24182420
assert!(t(r"^^").is_anchored_start());
24192421
assert!(t(r"$$").is_anchored_end());
2422+
assert!(t(r"^^").is_line_anchored_start());
2423+
assert!(t(r"$$").is_line_anchored_end());
24202424

24212425
assert!(t(r"^$").is_anchored_start());
24222426
assert!(t(r"^$").is_anchored_end());
2427+
assert!(t(r"^$").is_line_anchored_start());
2428+
assert!(t(r"^$").is_line_anchored_end());
24232429

24242430
assert!(t(r"^foo").is_anchored_start());
24252431
assert!(t(r"foo$").is_anchored_end());
2432+
assert!(t(r"^foo").is_line_anchored_start());
2433+
assert!(t(r"foo$").is_line_anchored_end());
24262434

24272435
assert!(t(r"^foo|^bar").is_anchored_start());
24282436
assert!(t(r"foo$|bar$").is_anchored_end());
2437+
assert!(t(r"^foo|^bar").is_line_anchored_start());
2438+
assert!(t(r"foo$|bar$").is_line_anchored_end());
24292439

24302440
assert!(t(r"^(foo|bar)").is_anchored_start());
24312441
assert!(t(r"(foo|bar)$").is_anchored_end());
2442+
assert!(t(r"^(foo|bar)").is_line_anchored_start());
2443+
assert!(t(r"(foo|bar)$").is_line_anchored_end());
24322444

24332445
assert!(t(r"^+").is_anchored_start());
24342446
assert!(t(r"$+").is_anchored_end());
2447+
assert!(t(r"^+").is_line_anchored_start());
2448+
assert!(t(r"$+").is_line_anchored_end());
24352449
assert!(t(r"^++").is_anchored_start());
24362450
assert!(t(r"$++").is_anchored_end());
2451+
assert!(t(r"^++").is_line_anchored_start());
2452+
assert!(t(r"$++").is_line_anchored_end());
24372453
assert!(t(r"(^)+").is_anchored_start());
24382454
assert!(t(r"($)+").is_anchored_end());
2455+
assert!(t(r"(^)+").is_line_anchored_start());
2456+
assert!(t(r"($)+").is_line_anchored_end());
24392457

24402458
assert!(t(r"$^").is_anchored_start());
2441-
assert!(t(r"$^").is_anchored_end());
2459+
// `$^` must be anchored at both ends for text and line anchors alike, so
// check each of the remaining three predicates once instead of repeating one.
assert!(t(r"$^").is_anchored_end());
2460+
assert!(t(r"$^").is_line_anchored_start());
2461+
assert!(t(r"$^").is_line_anchored_end());
24422462
assert!(t(r"$^|^$").is_anchored_start());
24432463
assert!(t(r"$^|^$").is_anchored_end());
2464+
assert!(t(r"$^|^$").is_line_anchored_start());
2465+
assert!(t(r"$^|^$").is_line_anchored_end());
24442466

24452467
assert!(t(r"\b^").is_anchored_start());
24462468
assert!(t(r"$\b").is_anchored_end());
2469+
assert!(t(r"\b^").is_line_anchored_start());
2470+
assert!(t(r"$\b").is_line_anchored_end());
24472471
assert!(t(r"^(?m:^)").is_anchored_start());
24482472
assert!(t(r"(?m:$)$").is_anchored_end());
2473+
assert!(t(r"^(?m:^)").is_line_anchored_start());
2474+
assert!(t(r"(?m:$)$").is_line_anchored_end());
24492475
assert!(t(r"(?m:^)^").is_anchored_start());
24502476
assert!(t(r"$(?m:$)").is_anchored_end());
2477+
assert!(t(r"(?m:^)^").is_line_anchored_start());
2478+
assert!(t(r"$(?m:$)").is_line_anchored_end());
24512479

24522480
// Negative examples.
24532481
assert!(!t(r"(?m)^").is_anchored_start());
@@ -2459,21 +2487,53 @@ mod tests {
24592487

24602488
assert!(!t(r"a^").is_anchored_start());
24612489
assert!(!t(r"$a").is_anchored_start());
2490+
assert!(!t(r"a^").is_line_anchored_start());
2491+
assert!(!t(r"$a").is_line_anchored_start());
24622492

2463-
assert!(!t(r"a^").is_anchored_start());
2464-
assert!(!t(r"$a").is_anchored_start());
2493+
assert!(!t(r"a^").is_anchored_end());
2494+
assert!(!t(r"$a").is_anchored_end());
2495+
assert!(!t(r"a^").is_line_anchored_end());
2496+
assert!(!t(r"$a").is_line_anchored_end());
24652497

24662498
assert!(!t(r"^foo|bar").is_anchored_start());
24672499
assert!(!t(r"foo|bar$").is_anchored_end());
2500+
assert!(!t(r"^foo|bar").is_line_anchored_start());
2501+
assert!(!t(r"foo|bar$").is_line_anchored_end());
24682502

24692503
assert!(!t(r"^*").is_anchored_start());
24702504
assert!(!t(r"$*").is_anchored_end());
2505+
assert!(!t(r"^*").is_line_anchored_start());
2506+
assert!(!t(r"$*").is_line_anchored_end());
24712507
assert!(!t(r"^*+").is_anchored_start());
24722508
assert!(!t(r"$*+").is_anchored_end());
2509+
assert!(!t(r"^*+").is_line_anchored_start());
2510+
assert!(!t(r"$*+").is_line_anchored_end());
24732511
assert!(!t(r"^+*").is_anchored_start());
24742512
assert!(!t(r"$+*").is_anchored_end());
2513+
assert!(!t(r"^+*").is_line_anchored_start());
2514+
assert!(!t(r"$+*").is_line_anchored_end());
24752515
assert!(!t(r"(^)*").is_anchored_start());
24762516
assert!(!t(r"($)*").is_anchored_end());
2517+
assert!(!t(r"(^)*").is_line_anchored_start());
2518+
assert!(!t(r"($)*").is_line_anchored_end());
2519+
}
2520+
2521+
// Checks the line-anchored predicates on patterns that use the multi-line
// forms of `^` and `$`.
#[test]
2522+
fn analysis_is_line_anchored() {
2523+
assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start());
2524+
assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end());
2525+
2526+
assert!(t(r"(?m)^foo|^bar").is_line_anchored_start());
2527+
assert!(t(r"(?m)foo$|bar$").is_line_anchored_end());
2528+
2529+
assert!(t(r"(?m)^").is_line_anchored_start());
2530+
assert!(t(r"(?m)$").is_line_anchored_end());
2531+
2532+
// Alternations that mix multi-line and text anchors across branches, in
// either branch order.
assert!(t(r"(?m:^$)|$^").is_line_anchored_start());
2533+
assert!(t(r"(?m:^$)|$^").is_line_anchored_end());
2534+
2535+
assert!(t(r"$^|(?m:^$)").is_line_anchored_start());
2536+
assert!(t(r"$^|(?m:^$)").is_line_anchored_end());
24772537
}
24782538

24792539
#[test]

0 commit comments

Comments
 (0)