notedeck

One damus client to rule them all
git clone git://jb55.com/notedeck
Log | Files | Refs | README | LICENSE

commit 5e47747c39f7d8cabc4fd2d2047c7393002608dc
parent 39f877e4e48530769802fb289671cb60ef0ab829
Author: William Casarin <jb55@jb55.com>
Date:   Wed, 18 Feb 2026 14:30:14 -0800

md-stream: fix truncated code fence language when streaming

When the opening fence line arrived in chunks (e.g. "```ru" then
"st\n"), the parser committed a premature language span that was
never corrected. Also fix re-processing of content lines when a
partial closing fence caused the parser to wait for more input.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
M crates/md-stream/src/parser.rs | 70 +++++++++++++++++++++++++++++++++++-----------------------------------
M crates/md-stream/src/tests.rs | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 138 insertions(+), 36 deletions(-)

diff --git a/crates/md-stream/src/parser.rs b/crates/md-stream/src/parser.rs @@ -234,12 +234,18 @@ impl StreamParser { } // Could be a code fence: need at least 3 backticks or tildes - if trimmed.len() < 3 { - let first = trimmed.as_bytes()[0]; - if first == b'`' || first == b'~' { + let first = trimmed.as_bytes()[0]; + if first == b'`' || first == b'~' { + if trimmed.len() < 3 { // All chars so far are the same fence char return trimmed.bytes().all(|b| b == first); } + // Have 3+ fence chars — still need the newline to finalize + // the opening line (language tag may be incomplete) + let fence_len = trimmed.bytes().take_while(|&b| b == first).count(); + if fence_len >= 3 && !trimmed[fence_len..].contains('\n') { + return true; + } } // Could be a thematic break: need "---", "***", or "___" @@ -294,7 +300,7 @@ impl StreamParser { if fence_len >= 3 { let after_fence = &trimmed[fence_len..]; - let (language, consumed_lang) = if let Some(nl_pos) = after_fence.find('\n') { + if let Some(nl_pos) = after_fence.find('\n') { let lang = after_fence[..nl_pos].trim(); let lang_span = if lang.is_empty() { None @@ -308,37 +314,28 @@ impl StreamParser { self.process_pos + leading_space + fence_len + lang_start_in_after; Some(Span::new(abs_start, abs_start + lang.len())) }; - (lang_span, nl_pos + 1) - } else { - // No newline yet - language might be incomplete - let lang = after_fence.trim(); - let lang_span = if lang.is_empty() { - None - } else { - let lang_start_in_after = - after_fence.len() - after_fence.trim_start().len(); - let abs_start = - self.process_pos + leading_space + fence_len + lang_start_in_after; - Some(Span::new(abs_start, abs_start + lang.len())) - }; - (lang_span, after_fence.len()) - }; + let consumed_lang = nl_pos + 1; - let consumed = leading_space + fence_len + consumed_lang; - let content_start = self.process_pos + consumed; - let mut partial = Partial::new( - PartialKind::CodeFence { - fence_char, - fence_len, - language, - }, - self.process_pos, - ); 
- partial.content_start = content_start; - partial.content_end = content_start; - self.partial = Some(partial); - self.at_line_start = false; - return Some(consumed); + let consumed = leading_space + fence_len + consumed_lang; + let content_start = self.process_pos + consumed; + let mut partial = Partial::new( + PartialKind::CodeFence { + fence_char, + fence_len, + language: lang_span, + }, + self.process_pos, + ); + partial.content_start = content_start; + partial.content_end = content_start; + self.partial = Some(partial); + self.at_line_start = false; + return Some(consumed); + } else { + // No newline yet — the language tag may be incomplete. + // Wait for more input so we don't commit a truncated span. + return None; + } } } @@ -449,7 +446,10 @@ impl StreamParser { && trimmed.bytes().all(|b| b == fence_char as u8) && !line.contains('\n') { - // Don't advance — wait for more chars + // Advance past content lines we already processed, + // but stop before the partial fence so we re-check it + // when more data arrives. 
+ self.advance(pos - text_start); return false; } } diff --git a/crates/md-stream/src/tests.rs b/crates/md-stream/src/tests.rs @@ -67,9 +67,11 @@ fn test_code_block_streaming() { let mut parser = StreamParser::new(); parser.push("```py"); - assert!(parser.in_code_block() || parser.partial().is_some()); + // No partial yet — language tag may be incomplete without newline + assert!(parser.partial().is_none()); parser.push("thon\n"); + // Now the full opening fence line is available assert!(parser.in_code_block()); parser.push("print('hello')\n"); @@ -938,3 +940,103 @@ fn test_table_partial_shows_during_streaming() { partial.kind ); } + +#[test] +fn test_code_fence_partial_has_language() { + // While streaming a code block, the partial should expose the language + let mut parser = StreamParser::new(); + parser.push("```rust\nfn main() {\n"); + + let partial = parser + .partial() + .expect("Should have partial while code block is open"); + match &partial.kind { + PartialKind::CodeFence { language, .. } => { + let lang = language.expect("Language should be set during partial"); + assert_eq!(lang.resolve(parser.buffer()), "rust"); + } + other => panic!("Expected CodeFence partial, got: {:?}", other), + } + // Content should be available too + assert_eq!(partial.content(parser.buffer()), "fn main() {\n"); +} + +#[test] +fn test_code_fence_partial_language_streamed_char_by_char() { + // Simulate LLM token-by-token streaming + let mut parser = StreamParser::new(); + let input = "```python\ndef hello():\n print(\"hi\")\n"; + + for ch in input.chars() { + parser.push(&ch.to_string()); + } + + // Should still be partial (no closing fence) + assert_eq!( + parser.parsed().len(), + 0, + "Should not have finalized any elements" + ); + let partial = parser.partial().expect("Should have partial"); + match &partial.kind { + PartialKind::CodeFence { language, .. 
} => { + let lang = language.expect("Language should be set"); + assert_eq!(lang.resolve(parser.buffer()), "python"); + } + other => panic!("Expected CodeFence partial, got: {:?}", other), + } + assert_eq!( + partial.content(parser.buffer()), + "def hello():\n print(\"hi\")\n" + ); +} + +#[test] +fn test_consecutive_code_blocks_preserve_language() { + // Multiple code blocks back-to-back, as an LLM would produce + let mut parser = StreamParser::new(); + let input = "```rust\nlet x = 1;\n```\n\n```python\nx = 1\n```\n\n```c\nint x = 1;\n```\n"; + + // Stream in small chunks to simulate LLM output + let chunks: Vec<&str> = input + .as_bytes() + .chunks(5) + .map(|c| std::str::from_utf8(c).unwrap()) + .collect(); + for chunk in &chunks { + parser.push(chunk); + } + + let code_blocks: Vec<_> = parser + .parsed() + .iter() + .filter_map(|e| match e { + MdElement::CodeBlock(cb) => Some(cb), + _ => None, + }) + .collect(); + + assert!( + code_blocks.len() >= 3, + "Expected 3 code blocks, got {} (parsed: {:?})", + code_blocks.len(), + parser.parsed() + ); + + assert_eq!( + code_blocks[0].language.map(|s| r(&s, parser.buffer())), + Some("rust") + ); + assert_eq!( + code_blocks[1].language.map(|s| r(&s, parser.buffer())), + Some("python") + ); + assert_eq!( + code_blocks[2].language.map(|s| r(&s, parser.buffer())), + Some("c") + ); + + assert_eq!(r(&code_blocks[0].content, parser.buffer()), "let x = 1;\n"); + assert_eq!(r(&code_blocks[1].content, parser.buffer()), "x = 1\n"); + assert_eq!(r(&code_blocks[2].content, parser.buffer()), "int x = 1;\n"); +}