md-stream: fix code block detection after paragraph with single newline - notedeck

commit 14387f3fc0668827d0d1978f95d6c09564c0f5f6
parent 05b9ff75c6cdeee71eaa65ff7246edac979e712b
Author: William Casarin <jb55@jb55.com>
Date:   Sun, 15 Feb 2026 20:06:06 -0800

md-stream: fix code block detection after paragraph with single newline

Refactor StreamParser from Vec<String> token storage to a single
contiguous String buffer, eliminating cross-token boundary issues
that prevented code fence detection in char-by-char streaming.

Key fixes:
- Buffer consolidation: push() is now zero-alloc (push_str)
- could_be_block_start() defers ambiguous prefixes (` -> ```)
- at_line_start properly cleared on non-newline accumulation
- Paragraph partial saved before try_block_start to prevent clobbering
- Code fence closing detection defers on partial fence chars

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
M crates/md-stream/src/parser.rs  | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M crates/md-stream/src/partial.rs  | 22 ++++------------------
M crates/md-stream/src/tests.rs  | 48 ++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 195 insertions(+), 94 deletions(-)
diff --git a/crates/md-stream/src/parser.rs b/crates/md-stream/src/parser.rs
@@ -6,14 +6,11 @@ use crate::partial::{Partial, PartialKind};
 
 /// Incremental markdown parser for streaming input.
 ///
-/// Maintains a buffer of incoming tokens and tracks parsing state
-/// to allow progressive rendering as content streams in.
+/// Maintains a single contiguous buffer of incoming text and tracks
+/// a processing cursor to allow progressive rendering as content streams in.
 pub struct StreamParser {
-    /// Raw token chunks from the stream
-    tokens: Vec<String>,
-
-    /// Total bytes in tokens (for efficient length tracking)
-    total_bytes: usize,
+    /// Contiguous buffer of all pushed text
+    buffer: String,
 
     /// Completed markdown elements
     parsed: Vec<MdElement>,
@@ -21,11 +18,8 @@ pub struct StreamParser {
     /// Current in-progress element (if any)
     partial: Option<Partial>,
 
-    /// Index of first unprocessed token
-    process_idx: usize,
-
-    /// Byte offset within the token at process_idx
-    process_offset: usize,
+    /// Byte offset of first unprocessed content in buffer
+    process_pos: usize,
 
     /// Are we at the start of a line? (for block-level detection)
     at_line_start: bool,
@@ -34,12 +28,10 @@ pub struct StreamParser {
 impl StreamParser {
     pub fn new() -> Self {
         Self {
-            tokens: Vec::new(),
-            total_bytes: 0,
+            buffer: String::new(),
             parsed: Vec::new(),
             partial: None,
-            process_idx: 0,
-            process_offset: 0,
+            process_pos: 0,
             at_line_start: true,
         }
     }
@@ -50,8 +42,7 @@ impl StreamParser {
             return;
         }
 
-        self.tokens.push(token.to_string());
-        self.total_bytes += token.len();
+        self.buffer.push_str(token);
         self.process_new_content();
     }
 
@@ -84,20 +75,15 @@ impl StreamParser {
         )
     }
 
+    /// Get the unprocessed portion of the buffer.
+    fn remaining(&self) -> &str {
+        &self.buffer[self.process_pos..]
+    }
+
     /// Process newly added content.
     fn process_new_content(&mut self) {
-        while self.process_idx < self.tokens.len() {
-            // Clone the remaining text to avoid borrow conflicts
-            let remaining = {
-                let token = &self.tokens[self.process_idx];
-                let slice = &token[self.process_offset..];
-                if slice.is_empty() {
-                    self.process_idx += 1;
-                    self.process_offset = 0;
-                    continue;
-                }
-                slice.to_string()
-            };
+        while self.process_pos < self.buffer.len() {
+            let remaining = self.remaining().to_string();
 
             // Handle based on current partial state
             let partial_kind = self.partial.as_ref().map(|p| p.kind.clone());
@@ -122,17 +108,31 @@ impl StreamParser {
                     PartialKind::Paragraph => {
                         // For paragraphs, check if we're at a line start that could be a block element
                         if self.at_line_start {
+                            // Take the paragraph partial first — try_block_start may
+                            // replace self.partial with the new block element
+                            let para_partial = self.partial.take();
+
                             if let Some(consumed) = self.try_block_start(&remaining) {
-                                // Emit the current paragraph before starting the new block
-                                if let Some(partial) = self.partial.take() {
-                                    if !partial.content.trim().is_empty() {
-                                        let inline_elements = parse_inline(partial.content.trim());
+                                // Emit the saved paragraph before the new block
+                                if let Some(partial) = para_partial {
+                                    let trimmed = partial.content.trim();
+                                    if !trimmed.is_empty() {
+                                        let inline_elements = parse_inline(trimmed);
                                         self.parsed.push(MdElement::Paragraph(inline_elements));
                                     }
                                 }
                                 self.advance(consumed);
                                 continue;
                             }
+
+                            // Block start failed — restore the paragraph partial
+                            self.partial = para_partial;
+                            // If remaining could be the start of a block element but we
+                            // don't have enough chars yet, wait for more input rather than
+                            // consuming into the paragraph (e.g. "`" could become "```")
+                            if self.could_be_block_start(&remaining) {
+                                return;
+                            }
                         }
                         // Continue with inline processing
                         if self.process_inline(&remaining) {
@@ -156,6 +156,9 @@ impl StreamParser {
                     self.advance(consumed);
                     continue;
                 }
+                if self.could_be_block_start(&remaining) {
+                    return;
+                }
             }
 
             // Fall back to inline processing
@@ -166,6 +169,35 @@ impl StreamParser {
         }
     }
 
+    /// Check if text could be the start of a block element but we don't
+    /// have enough characters to confirm yet. Used to defer consuming
+    /// ambiguous prefixes like "`" or "``" that might become "```".
+    fn could_be_block_start(&self, text: &str) -> bool {
+        let trimmed = text.trim_start();
+        if trimmed.is_empty() {
+            return false;
+        }
+
+        // Could be a code fence: need at least 3 backticks or tildes
+        if trimmed.len() < 3 {
+            let first = trimmed.as_bytes()[0];
+            if first == b'`' || first == b'~' {
+                // All chars so far are the same fence char
+                return trimmed.bytes().all(|b| b == first);
+            }
+        }
+
+        // Could be a thematic break: need "---", "***", or "___"
+        if trimmed.len() < 3 {
+            let first = trimmed.as_bytes()[0];
+            if first == b'-' || first == b'*' || first == b'_' {
+                return trimmed.bytes().all(|b| b == first);
+            }
+        }
+
+        false
+    }
+
     /// Try to detect a block-level element at line start.
     /// Returns bytes consumed if successful.
     fn try_block_start(&mut self, text: &str) -> Option<usize> {
@@ -180,7 +212,7 @@ impl StreamParser {
                     if rest.starts_with(' ') || rest.is_empty() {
                         self.partial = Some(Partial::new(
                             PartialKind::Heading { level: level as u8 },
-                            self.process_idx,
+                            self.process_pos,
                         ));
                         self.at_line_start = false;
                         return Some(leading_space + level + rest.starts_with(' ') as usize);
@@ -225,7 +257,7 @@ impl StreamParser {
                         fence_len,
                         language,
                     },
-                    self.process_idx,
+                    self.process_pos,
                 ));
                 self.at_line_start = false;
                 return Some(leading_space + fence_len + consumed_lang);
@@ -255,34 +287,52 @@ impl StreamParser {
     /// Process content inside a code fence.
     /// Returns true if we should continue processing, false if we need more input.
     fn process_code_fence(&mut self, fence_char: char, fence_len: usize, text: &str) -> bool {
-        let closing = std::iter::repeat_n(fence_char, fence_len).collect::<String>();
-
-        // Look for closing fence at start of line
         let partial = self.partial.as_mut().unwrap();
 
         for line in text.split_inclusive('\n') {
-            let trimmed = line.trim_start();
-            if trimmed.starts_with(&closing) {
-                // Check it's a valid closing fence (only fence chars and whitespace after)
-                let after_fence = &trimmed[fence_len..];
-                if after_fence.trim().is_empty() || after_fence.starts_with('\n') {
-                    // Found closing fence! Complete the code block
-                    let language = if let PartialKind::CodeFence { language, .. } = &partial.kind {
-                        language.clone()
-                    } else {
-                        None
-                    };
-
-                    let content = std::mem::take(&mut partial.content);
-                    self.parsed
-                        .push(MdElement::CodeBlock(CodeBlock { language, content }));
-                    self.partial = None;
-                    self.at_line_start = true;
+            // Check if we're at a line start within the code fence
+            let at_content_line_start =
+                partial.content.is_empty() || partial.content.ends_with('\n');
+
+            if at_content_line_start {
+                let trimmed = line.trim_start();
+
+                // Check for closing fence
+                if trimmed.len() >= fence_len
+                    && trimmed.as_bytes().iter().take(fence_len).all(|&b| b == fence_char as u8)
+                {
+                    let after_fence = &trimmed[fence_len..];
+                    if after_fence.trim().is_empty() || after_fence.starts_with('\n') {
+                        // Found closing fence! Complete the code block
+                        let language =
+                            if let PartialKind::CodeFence { language, .. } = &partial.kind {
+                                language.clone()
+                            } else {
+                                None
+                            };
+
+                        let content = std::mem::take(&mut partial.content);
+                        self.parsed
+                            .push(MdElement::CodeBlock(CodeBlock { language, content }));
+                        self.partial = None;
+                        self.at_line_start = true;
+
+                        // Advance past the closing fence line
+                        let consumed = text.find(line).unwrap() + line.len();
+                        self.advance(consumed);
+                        return true;
+                    }
+                }
 
-                    // Advance past the closing fence line
-                    let consumed = text.find(line).unwrap() + line.len();
-                    self.advance(consumed);
-                    return true;
+                // If this could be the start of a closing fence but we don't
+                // have enough chars yet, wait for more input
+                if !trimmed.is_empty()
+                    && trimmed.len() < fence_len
+                    && trimmed.bytes().all(|b| b == fence_char as u8)
+                    && !line.contains('\n')
+                {
+                    // Don't advance — wait for more chars
+                    return false;
                 }
             }
 
@@ -360,6 +410,35 @@ impl StreamParser {
         }
 
         if let Some(nl_pos) = text.find('\n') {
+            let after_nl = &text[nl_pos + 1..];
+
+            // Check if text after the newline starts a block element (code fence, heading, etc.)
+            // If so, emit the current paragraph and let the block parser handle the rest.
+            if !after_nl.is_empty() {
+                let trimmed_after = after_nl.trim_start();
+                let is_block_start = trimmed_after.starts_with("```")
+                    || trimmed_after.starts_with("~~~")
+                    || trimmed_after.starts_with('#');
+                if is_block_start {
+                    // Accumulate text before the newline into the paragraph
+                    let para_text = if let Some(ref mut partial) = self.partial {
+                        partial.content.push_str(&text[..nl_pos]);
+                        std::mem::take(&mut partial.content)
+                    } else {
+                        text[..nl_pos].to_string()
+                    };
+                    self.partial = None;
+
+                    if !para_text.trim().is_empty() {
+                        let inline_elements = parse_inline(para_text.trim());
+                        self.parsed.push(MdElement::Paragraph(inline_elements));
+                    }
+                    self.at_line_start = true;
+                    self.advance(nl_pos + 1);
+                    return true;
+                }
+            }
+
             // Single newline - continue accumulating but track position
             if let Some(ref mut partial) = self.partial {
                 partial.content.push_str(&text[..=nl_pos]);
@@ -368,8 +447,7 @@ impl StreamParser {
                 let content = text[..=nl_pos].to_string();
                 self.partial = Some(Partial {
                     kind: PartialKind::Paragraph,
-                    start_idx: self.process_idx,
-                    byte_offset: self.process_offset,
+                    start_pos: self.process_pos,
                     content,
                 });
             }
@@ -384,29 +462,18 @@ impl StreamParser {
         } else {
             self.partial = Some(Partial {
                 kind: PartialKind::Paragraph,
-                start_idx: self.process_idx,
-                byte_offset: self.process_offset,
+                start_pos: self.process_pos,
                 content: text.to_string(),
             });
         }
+        self.at_line_start = false;
         self.advance(text.len());
         false
     }
 
     /// Advance the processing position by n bytes.
     fn advance(&mut self, n: usize) {
-        let mut remaining = n;
-        while remaining > 0 && self.process_idx < self.tokens.len() {
-            let token_remaining = self.tokens[self.process_idx].len() - self.process_offset;
-            if remaining >= token_remaining {
-                remaining -= token_remaining;
-                self.process_idx += 1;
-                self.process_offset = 0;
-            } else {
-                self.process_offset += remaining;
-                remaining = 0;
-            }
-        }
+        self.process_pos += n;
     }
 
     /// Finalize parsing (call when stream ends).
diff --git a/crates/md-stream/src/partial.rs b/crates/md-stream/src/partial.rs
@@ -7,32 +7,18 @@ pub struct Partial {
     /// What kind of element we're building
     pub kind: PartialKind,
 
-    /// Index into the token buffer where this element starts.
-    /// Used to resume parsing from the right spot.
-    pub start_idx: usize,
-
-    /// Byte offset within the starting token (for mid-token starts)
-    pub byte_offset: usize,
+    /// Byte offset into the buffer where this element starts
+    pub start_pos: usize,
 
     /// Accumulated content so far (for elements that need it)
     pub content: String,
 }
 
 impl Partial {
-    pub fn new(kind: PartialKind, start_idx: usize) -> Self {
-        Self {
-            kind,
-            start_idx,
-            byte_offset: 0,
-            content: String::new(),
-        }
-    }
-
-    pub fn with_offset(kind: PartialKind, start_idx: usize, byte_offset: usize) -> Self {
+    pub fn new(kind: PartialKind, start_pos: usize) -> Self {
         Self {
             kind,
-            start_idx,
-            byte_offset,
+            start_pos,
             content: String::new(),
         }
     }
diff --git a/crates/md-stream/src/tests.rs b/crates/md-stream/src/tests.rs
@@ -497,6 +497,54 @@ fn test_streaming_multiple_code_spans_with_angle_brackets() {
 }
 
 #[test]
+fn test_code_block_after_paragraph_single_newline() {
+    // Reproduces: paragraph text ending with ":\n" then "```\n" code block
+    // This is the common pattern: "All events share these common tags:\n```\n..."
+    let mut parser = StreamParser::new();
+    let input = "All events share these common tags:\n```\n[\"d\", \"<session-id>\"]\n```\n";
+    parser.push(input);
+
+    eprintln!("Before finalize - parsed: {:#?}", parser.parsed());
+    eprintln!("Before finalize - partial: {:#?}", parser.partial());
+
+    parser.finalize();
+
+    eprintln!("After finalize - parsed: {:#?}", parser.parsed());
+
+    // Should have: paragraph + code block
+    let has_paragraph = parser.parsed().iter().any(|e| matches!(e, MdElement::Paragraph(_)));
+    let has_code_block = parser.parsed().iter().any(|e| matches!(e, MdElement::CodeBlock(_)));
+
+    assert!(has_paragraph, "Missing paragraph element");
+    assert!(has_code_block, "Missing code block element");
+}
+
+#[test]
+fn test_code_block_after_paragraph_single_newline_streaming() {
+    // Same test but streaming char-by-char (how LLM tokens arrive)
+    let mut parser = StreamParser::new();
+    let input = "All events share these common tags:\n```\n[\"d\", \"<session-id>\"]\n```\n";
+
+    for ch in input.chars() {
+        parser.push(&ch.to_string());
+    }
+
+    eprintln!("Before finalize - parsed: {:#?}", parser.parsed());
+    eprintln!("Before finalize - partial: {:#?}", parser.partial());
+    eprintln!("Before finalize - in_code_block: {}", parser.in_code_block());
+
+    parser.finalize();
+
+    eprintln!("After finalize - parsed: {:#?}", parser.parsed());
+
+    let has_paragraph = parser.parsed().iter().any(|e| matches!(e, MdElement::Paragraph(_)));
+    let has_code_block = parser.parsed().iter().any(|e| matches!(e, MdElement::CodeBlock(_)));
+
+    assert!(has_paragraph, "Missing paragraph element");
+    assert!(has_code_block, "Missing code block element");
+}
+
+#[test]
 fn test_heading_partial_kind_distinct_from_paragraph() {
     let mut parser = StreamParser::new();
     parser.push("# Heading without newline");

	notedeck One damus client to rule them all
	git clone git://jb55.com/notedeck
	Log \| Files \| Refs \| README \| LICENSE

M	crates/md-stream/src/parser.rs	\|	219	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M	crates/md-stream/src/partial.rs	\|	22	++++------------------
M	crates/md-stream/src/tests.rs	\|	48	++++++++++++++++++++++++++++++++++++++++++++++++