Skip to content

Commit f4201ef

Browse files
Lukasz Anforowiczcalebcartwright
Lukasz Anforowicz
authored andcommitted
Handling of numbered markdown lists.
Fixes issue #5416
1 parent 66b9951 commit f4201ef

File tree

5 files changed

+319
-25
lines changed

5 files changed

+319
-25
lines changed

Diff for: src/comment.rs

+153-21
Original file line numberDiff line numberDiff line change
@@ -432,12 +432,18 @@ impl CodeBlockAttribute {
432432

433433
/// Block that is formatted as an item.
434434
///
435-
/// An item starts with either a star `*` a dash `-` a greater-than `>` or a plus '+'.
435+
/// An item starts with either a star `*`, a dash `-`, a greater-than `>`, a plus '+', or a number
436+
/// `12.` or `34)` (with at most 2 digits). An item represents CommonMark's ["list
437+
/// items"](https://spec.commonmark.org/0.30/#list-items) and/or ["block
438+
/// quotes"](https://spec.commonmark.org/0.30/#block-quotes), but note that only a subset of
439+
/// CommonMark is recognized - see the doc comment of [`ItemizedBlock::get_marker_length`] for more
440+
/// details.
441+
///
436442
/// Different level of indentation are handled by shrinking the shape accordingly.
437443
struct ItemizedBlock {
438444
/// the lines that are identified as part of an itemized block
439445
lines: Vec<String>,
440-
/// the number of characters (typically whitespaces) up to the item sigil
446+
/// the number of characters (typically whitespaces) up to the item marker
441447
indent: usize,
442448
/// the string that marks the start of an item
443449
opener: String,
@@ -446,37 +452,70 @@ struct ItemizedBlock {
446452
}
447453

448454
impl ItemizedBlock {
449-
/// Returns `true` if the line is formatted as an item
450-
fn is_itemized_line(line: &str) -> bool {
451-
let trimmed = line.trim_start();
455+
/// Checks whether the `trimmed` line includes an item marker. Returns `None` if there is no
456+
/// marker. Returns the length of the marker (in bytes) if one is present. Note that the length
457+
/// includes the whitespace that follows the marker, for example the marker in `"* list item"`
458+
/// has the length of 2.
459+
///
460+
/// This function recognizes item markers that correspond to CommonMark's
461+
/// ["bullet list marker"](https://spec.commonmark.org/0.30/#bullet-list-marker),
462+
/// ["block quote marker"](https://spec.commonmark.org/0.30/#block-quote-marker), and/or
463+
/// ["ordered list marker"](https://spec.commonmark.org/0.30/#ordered-list-marker).
464+
///
465+
/// Compared to CommonMark specification, the number of digits that are allowed in an ["ordered
466+
/// list marker"](https://spec.commonmark.org/0.30/#ordered-list-marker) is more limited (to at
467+
/// most 2 digits). Limiting the length of the marker helps reduce the risk of recognizing
468+
/// arbitrary numbers as markers. See also
469+
/// <https://talk.commonmark.org/t/blank-lines-before-lists-revisited/1990> which gives the
470+
/// following example where a number (i.e. "1868") doesn't signify an ordered list:
471+
/// ```md
472+
/// The Captain died in
473+
/// 1868. He wes buried in...
474+
/// ```
475+
fn get_marker_length(trimmed: &str) -> Option<usize> {
476+
// https://spec.commonmark.org/0.30/#bullet-list-marker or
477+
// https://spec.commonmark.org/0.30/#block-quote-marker
452478
let itemized_start = ["* ", "- ", "> ", "+ "];
453-
itemized_start.iter().any(|s| trimmed.starts_with(s))
479+
if itemized_start.iter().any(|s| trimmed.starts_with(s)) {
480+
return Some(2); // All items in `itemized_start` have length 2.
481+
}
482+
483+
// https://spec.commonmark.org/0.30/#ordered-list-marker, where at most 2 digits are
484+
// allowed.
485+
for suffix in [". ", ") "] {
486+
if let Some((prefix, _)) = trimmed.split_once(suffix) {
487+
if prefix.len() <= 2 && prefix.chars().all(|c| char::is_ascii_digit(&c)) {
488+
return Some(prefix.len() + suffix.len());
489+
}
490+
}
491+
}
492+
493+
None // No markers found.
454494
}
455495

456-
/// Creates a new ItemizedBlock described with the given line.
457-
/// The `is_itemized_line` needs to be called first.
458-
fn new(line: &str) -> ItemizedBlock {
459-
let space_to_sigil = line.chars().take_while(|c| c.is_whitespace()).count();
460-
// +2 = '* ', which will add the appropriate amount of whitespace to keep itemized
461-
// content formatted correctly.
462-
let mut indent = space_to_sigil + 2;
496+
/// Creates a new `ItemizedBlock` described with the given `line`.
497+
/// Returns `None` if `line` doesn't start an item.
498+
fn new(line: &str) -> Option<ItemizedBlock> {
499+
let marker_length = ItemizedBlock::get_marker_length(line.trim_start())?;
500+
let space_to_marker = line.chars().take_while(|c| c.is_whitespace()).count();
501+
let mut indent = space_to_marker + marker_length;
463502
let mut line_start = " ".repeat(indent);
464503

465504
// Markdown blockquote start with a "> "
466505
if line.trim_start().starts_with(">") {
467506
// remove the original +2 indent because there might be multiple nested block quotes
468507
// and it's easier to reason about the final indent by just taking the length
469-
// of th new line_start. We update the indent because it effects the max width
508+
// of the new line_start. We update the indent because it effects the max width
470509
// of each formatted line.
471510
line_start = itemized_block_quote_start(line, line_start, 2);
472511
indent = line_start.len();
473512
}
474-
ItemizedBlock {
513+
Some(ItemizedBlock {
475514
lines: vec![line[indent..].to_string()],
476515
indent,
477516
opener: line[..indent].to_string(),
478517
line_start,
479-
}
518+
})
480519
}
481520

482521
/// Returns a `StringFormat` used for formatting the content of an item.
@@ -495,7 +534,7 @@ impl ItemizedBlock {
495534
/// Returns `true` if the line is part of the current itemized block.
496535
/// If it is, then it is added to the internal lines list.
497536
fn add_line(&mut self, line: &str) -> bool {
498-
if !ItemizedBlock::is_itemized_line(line)
537+
if ItemizedBlock::get_marker_length(line.trim_start()).is_none()
499538
&& self.indent <= line.chars().take_while(|c| c.is_whitespace()).count()
500539
{
501540
self.lines.push(line.to_string());
@@ -766,10 +805,11 @@ impl<'a> CommentRewrite<'a> {
766805
self.item_block = None;
767806
if let Some(stripped) = line.strip_prefix("```") {
768807
self.code_block_attr = Some(CodeBlockAttribute::new(stripped))
769-
} else if self.fmt.config.wrap_comments() && ItemizedBlock::is_itemized_line(line) {
770-
let ib = ItemizedBlock::new(line);
771-
self.item_block = Some(ib);
772-
return false;
808+
} else if self.fmt.config.wrap_comments() {
809+
if let Some(ib) = ItemizedBlock::new(line) {
810+
self.item_block = Some(ib);
811+
return false;
812+
}
773813
}
774814

775815
if self.result == self.opener {
@@ -2020,4 +2060,96 @@ fn main() {
20202060
"#;
20212061
assert_eq!(s, filter_normal_code(s_with_comment));
20222062
}
2063+
2064+
#[test]
2065+
fn test_itemized_block_first_line_handling() {
2066+
fn run_test(
2067+
test_input: &str,
2068+
expected_line: &str,
2069+
expected_indent: usize,
2070+
expected_opener: &str,
2071+
expected_line_start: &str,
2072+
) {
2073+
let block = ItemizedBlock::new(test_input).unwrap();
2074+
assert_eq!(1, block.lines.len(), "test_input: {:?}", test_input);
2075+
assert_eq!(
2076+
expected_line, &block.lines[0],
2077+
"test_input: {:?}",
2078+
test_input
2079+
);
2080+
assert_eq!(
2081+
expected_indent, block.indent,
2082+
"test_input: {:?}",
2083+
test_input
2084+
);
2085+
assert_eq!(
2086+
expected_opener, &block.opener,
2087+
"test_input: {:?}",
2088+
test_input
2089+
);
2090+
assert_eq!(
2091+
expected_line_start, &block.line_start,
2092+
"test_input: {:?}",
2093+
test_input
2094+
);
2095+
}
2096+
2097+
run_test("- foo", "foo", 2, "- ", " ");
2098+
run_test("* foo", "foo", 2, "* ", " ");
2099+
run_test("> foo", "foo", 2, "> ", "> ");
2100+
2101+
run_test("1. foo", "foo", 3, "1. ", " ");
2102+
run_test("12. foo", "foo", 4, "12. ", " ");
2103+
run_test("1) foo", "foo", 3, "1) ", " ");
2104+
run_test("12) foo", "foo", 4, "12) ", " ");
2105+
2106+
run_test(" - foo", "foo", 6, " - ", " ");
2107+
2108+
// https://spec.commonmark.org/0.30 says: "A start number may begin with 0s":
2109+
run_test("0. foo", "foo", 3, "0. ", " ");
2110+
run_test("01. foo", "foo", 4, "01. ", " ");
2111+
}
2112+
2113+
#[test]
2114+
fn test_itemized_block_nonobvious_markers_are_rejected() {
2115+
let test_inputs = vec![
2116+
// Non-numeric item markers (e.g. `a.` or `iv.`) are not allowed by
2117+
// https://spec.commonmark.org/0.30/#ordered-list-marker. We also note that allowing
2118+
// them would risk misidentifying regular words as item markers. See also the
2119+
// discussion in https://talk.commonmark.org/t/blank-lines-before-lists-revisited/1990
2120+
"word. rest of the paragraph.",
2121+
"a. maybe this is a list item? maybe not?",
2122+
"iv. maybe this is a list item? maybe not?",
2123+
// Numbers with 3 or more digits are not recognized as item markers, to avoid
2124+
// formatting the following example as a list:
2125+
//
2126+
// ```
2127+
// The Captain died in
2128+
// 1868. He was buried in...
2129+
// ```
2130+
"123. only 2-digit numbers are recognized as item markers.",
2131+
// Parens:
2132+
"123) giving some coverage to parens as well.",
2133+
"a) giving some coverage to parens as well.",
2134+
// https://spec.commonmark.org/0.30 says that "at least one space or tab is needed
2135+
// between the list marker and any following content":
2136+
"1.Not a list item.",
2137+
"1.2.3. Not a list item.",
2138+
"1)Not a list item.",
2139+
"-Not a list item.",
2140+
"+Not a list item.",
2141+
"+1 not a list item.",
2142+
// https://spec.commonmark.org/0.30 says: "A start number may not be negative":
2143+
"-1. Not a list item.",
2144+
"-1 Not a list item.",
2145+
];
2146+
for line in test_inputs.iter() {
2147+
let maybe_block = ItemizedBlock::new(line);
2148+
assert!(
2149+
maybe_block.is_none(),
2150+
"The following line shouldn't be classified as a list item: {}",
2151+
line
2152+
);
2153+
}
2154+
}
20232155
}

Diff for: tests/source/itemized-blocks/no_wrap.rs

+35-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// rustfmt-normalize_comments: true
22
// rustfmt-format_code_in_doc_comments: true
33

4-
//! This is a list:
4+
//! This is an itemized markdown list (see also issue #3224):
55
//! * Outer
66
//! * Outer
77
//! * Inner
@@ -13,6 +13,40 @@
1313
//! - when the log level is info, the level name is green and the rest of the line is white
1414
//! - when the log level is debug, the whole line is white
1515
//! - when the log level is trace, the whole line is gray ("bright black")
16+
//!
17+
//! This is a numbered markdown list (see also issue #5416):
18+
//! 1. Long long long long long long long long long long long long long long long long long line
19+
//! 2. Another very long long long long long long long long long long long long long long long line
20+
//! 3. Nested list
21+
//! 1. Long long long long long long long long long long long long long long long long line
22+
//! 2. Another very long long long long long long long long long long long long long long line
23+
//! 4. Last item
24+
//!
25+
//! Using the ')' instead of '.' character after the number:
26+
//! 1) Long long long long long long long long long long long long long long long long long line
27+
//! 2) Another very long long long long long long long long long long long long long long long line
28+
//!
29+
//! Deep list that mixes various bullet and number formats:
30+
//! 1. First level with a long long long long long long long long long long long long long long
31+
//! long long long line
32+
//! 2. First level with another very long long long long long long long long long long long long
33+
//! long long long line
34+
//! * Second level with a long long long long long long long long long long long long long
35+
//! long long long line
36+
//! * Second level with another very long long long long long long long long long long long
37+
//! long long long line
38+
//! 1) Third level with a long long long long long long long long long long long long long
39+
//! long long long line
40+
//! 2) Third level with another very long long long long long long long long long long
41+
//! long long long long line
42+
//! - Forth level with a long long long long long long long long long long long long
43+
//! long long long long line
44+
//! - Forth level with another very long long long long long long long long long long
45+
//! long long long long line
46+
//! 3) One more item at the third level
47+
//! 4) Last item of the third level
48+
//! * Last item of second level
49+
//! 3. Last item of first level
1650
1751
/// All the parameters ***except for `from_theater`*** should be inserted as sent by the remote
1852
/// theater, i.e., as passed to [`Theater::send`] on the remote actor:

Diff for: tests/source/itemized-blocks/wrap.rs

+35-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// rustfmt-format_code_in_doc_comments: true
33
// rustfmt-max_width: 50
44

5-
//! This is a list:
5+
//! This is an itemized markdown list (see also issue #3224):
66
//! * Outer
77
//! * Outer
88
//! * Inner
@@ -14,6 +14,40 @@
1414
//! - when the log level is info, the level name is green and the rest of the line is white
1515
//! - when the log level is debug, the whole line is white
1616
//! - when the log level is trace, the whole line is gray ("bright black")
17+
//!
18+
//! This is a numbered markdown list (see also issue #5416):
19+
//! 1. Long long long long long long long long long long long long long long long long long line
20+
//! 2. Another very long long long long long long long long long long long long long long long line
21+
//! 3. Nested list
22+
//! 1. Long long long long long long long long long long long long long long long long line
23+
//! 2. Another very long long long long long long long long long long long long long long line
24+
//! 4. Last item
25+
//!
26+
//! Using the ')' instead of '.' character after the number:
27+
//! 1) Long long long long long long long long long long long long long long long long long line
28+
//! 2) Another very long long long long long long long long long long long long long long long line
29+
//!
30+
//! Deep list that mixes various bullet and number formats:
31+
//! 1. First level with a long long long long long long long long long long long long long long
32+
//! long long long line
33+
//! 2. First level with another very long long long long long long long long long long long long
34+
//! long long long line
35+
//! * Second level with a long long long long long long long long long long long long long
36+
//! long long long line
37+
//! * Second level with another very long long long long long long long long long long long
38+
//! long long long line
39+
//! 1) Third level with a long long long long long long long long long long long long long
40+
//! long long long line
41+
//! 2) Third level with another very long long long long long long long long long long
42+
//! long long long long line
43+
//! - Forth level with a long long long long long long long long long long long long
44+
//! long long long long line
45+
//! - Forth level with another very long long long long long long long long long long
46+
//! long long long long line
47+
//! 3) One more item at the third level
48+
//! 4) Last item of the third level
49+
//! * Last item of second level
50+
//! 3. Last item of first level
1751
1852
// This example shows how to configure fern to output really nicely colored logs
1953
// - when the log level is error, the whole line is red

Diff for: tests/target/itemized-blocks/no_wrap.rs

+35-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// rustfmt-normalize_comments: true
22
// rustfmt-format_code_in_doc_comments: true
33

4-
//! This is a list:
4+
//! This is an itemized markdown list (see also issue #3224):
55
//! * Outer
66
//! * Outer
77
//! * Inner
@@ -13,6 +13,40 @@
1313
//! - when the log level is info, the level name is green and the rest of the line is white
1414
//! - when the log level is debug, the whole line is white
1515
//! - when the log level is trace, the whole line is gray ("bright black")
16+
//!
17+
//! This is a numbered markdown list (see also issue #5416):
18+
//! 1. Long long long long long long long long long long long long long long long long long line
19+
//! 2. Another very long long long long long long long long long long long long long long long line
20+
//! 3. Nested list
21+
//! 1. Long long long long long long long long long long long long long long long long line
22+
//! 2. Another very long long long long long long long long long long long long long long line
23+
//! 4. Last item
24+
//!
25+
//! Using the ')' instead of '.' character after the number:
26+
//! 1) Long long long long long long long long long long long long long long long long long line
27+
//! 2) Another very long long long long long long long long long long long long long long long line
28+
//!
29+
//! Deep list that mixes various bullet and number formats:
30+
//! 1. First level with a long long long long long long long long long long long long long long
31+
//! long long long line
32+
//! 2. First level with another very long long long long long long long long long long long long
33+
//! long long long line
34+
//! * Second level with a long long long long long long long long long long long long long
35+
//! long long long line
36+
//! * Second level with another very long long long long long long long long long long long
37+
//! long long long line
38+
//! 1) Third level with a long long long long long long long long long long long long long
39+
//! long long long line
40+
//! 2) Third level with another very long long long long long long long long long long
41+
//! long long long long line
42+
//! - Forth level with a long long long long long long long long long long long long
43+
//! long long long long line
44+
//! - Forth level with another very long long long long long long long long long long
45+
//! long long long long line
46+
//! 3) One more item at the third level
47+
//! 4) Last item of the third level
48+
//! * Last item of second level
49+
//! 3. Last item of first level
1650
1751
/// All the parameters ***except for `from_theater`*** should be inserted as sent by the remote
1852
/// theater, i.e., as passed to [`Theater::send`] on the remote actor:

0 commit comments

Comments
 (0)