commit 5e47747c39f7d8cabc4fd2d2047c7393002608dc
parent 39f877e4e48530769802fb289671cb60ef0ab829
Author: William Casarin <jb55@jb55.com>
Date: Wed, 18 Feb 2026 14:30:14 -0800
md-stream: fix truncated code fence language when streaming
When the opening fence line arrived in chunks (e.g. "```ru" then
"st\n"), the parser committed a premature language span that was
never corrected. Also fix re-processing of content lines when a
partial closing fence caused the parser to wait for more input.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat:
2 files changed, 138 insertions(+), 36 deletions(-)
diff --git a/crates/md-stream/src/parser.rs b/crates/md-stream/src/parser.rs
@@ -234,12 +234,18 @@ impl StreamParser {
}
// Could be a code fence: need at least 3 backticks or tildes
- if trimmed.len() < 3 {
- let first = trimmed.as_bytes()[0];
- if first == b'`' || first == b'~' {
+ let first = trimmed.as_bytes()[0];
+ if first == b'`' || first == b'~' {
+ if trimmed.len() < 3 {
// All chars so far are the same fence char
return trimmed.bytes().all(|b| b == first);
}
+ // Have 3+ fence chars — still need the newline to finalize
+ // the opening line (language tag may be incomplete)
+ let fence_len = trimmed.bytes().take_while(|&b| b == first).count();
+ if fence_len >= 3 && !trimmed[fence_len..].contains('\n') {
+ return true;
+ }
}
// Could be a thematic break: need "---", "***", or "___"
@@ -294,7 +300,7 @@ impl StreamParser {
if fence_len >= 3 {
let after_fence = &trimmed[fence_len..];
- let (language, consumed_lang) = if let Some(nl_pos) = after_fence.find('\n') {
+ if let Some(nl_pos) = after_fence.find('\n') {
let lang = after_fence[..nl_pos].trim();
let lang_span = if lang.is_empty() {
None
@@ -308,37 +314,28 @@ impl StreamParser {
self.process_pos + leading_space + fence_len + lang_start_in_after;
Some(Span::new(abs_start, abs_start + lang.len()))
};
- (lang_span, nl_pos + 1)
- } else {
- // No newline yet - language might be incomplete
- let lang = after_fence.trim();
- let lang_span = if lang.is_empty() {
- None
- } else {
- let lang_start_in_after =
- after_fence.len() - after_fence.trim_start().len();
- let abs_start =
- self.process_pos + leading_space + fence_len + lang_start_in_after;
- Some(Span::new(abs_start, abs_start + lang.len()))
- };
- (lang_span, after_fence.len())
- };
+ let consumed_lang = nl_pos + 1;
- let consumed = leading_space + fence_len + consumed_lang;
- let content_start = self.process_pos + consumed;
- let mut partial = Partial::new(
- PartialKind::CodeFence {
- fence_char,
- fence_len,
- language,
- },
- self.process_pos,
- );
- partial.content_start = content_start;
- partial.content_end = content_start;
- self.partial = Some(partial);
- self.at_line_start = false;
- return Some(consumed);
+ let consumed = leading_space + fence_len + consumed_lang;
+ let content_start = self.process_pos + consumed;
+ let mut partial = Partial::new(
+ PartialKind::CodeFence {
+ fence_char,
+ fence_len,
+ language: lang_span,
+ },
+ self.process_pos,
+ );
+ partial.content_start = content_start;
+ partial.content_end = content_start;
+ self.partial = Some(partial);
+ self.at_line_start = false;
+ return Some(consumed);
+ } else {
+ // No newline yet — the language tag may be incomplete.
+ // Wait for more input so we don't commit a truncated span.
+ return None;
+ }
}
}
@@ -449,7 +446,10 @@ impl StreamParser {
&& trimmed.bytes().all(|b| b == fence_char as u8)
&& !line.contains('\n')
{
- // Don't advance — wait for more chars
+ // Advance past content lines we already processed,
+ // but stop before the partial fence so we re-check it
+ // when more data arrives.
+ self.advance(pos - text_start);
return false;
}
}
diff --git a/crates/md-stream/src/tests.rs b/crates/md-stream/src/tests.rs
@@ -67,9 +67,11 @@ fn test_code_block_streaming() {
let mut parser = StreamParser::new();
parser.push("```py");
- assert!(parser.in_code_block() || parser.partial().is_some());
+ // No partial yet — language tag may be incomplete without newline
+ assert!(parser.partial().is_none());
parser.push("thon\n");
+ // Now the full opening fence line is available
assert!(parser.in_code_block());
parser.push("print('hello')\n");
@@ -938,3 +940,103 @@ fn test_table_partial_shows_during_streaming() {
partial.kind
);
}
+
+#[test]
+fn test_code_fence_partial_has_language() {
+ // An unclosed fence pushed in a single chunk must surface as a CodeFence partial whose language span resolves to "rust"
+ let mut parser = StreamParser::new();
+ parser.push("```rust\nfn main() {\n");
+
+ let partial = parser
+ .partial()
+ .expect("Should have partial while code block is open");
+ match &partial.kind {
+ PartialKind::CodeFence { language, .. } => {
+ let lang = language.expect("Language should be set during partial");
+ assert_eq!(lang.resolve(parser.buffer()), "rust");
+ }
+ other => panic!("Expected CodeFence partial, got: {:?}", other),
+ }
+ // The body text received so far must also be readable through the partial
+ assert_eq!(partial.content(parser.buffer()), "fn main() {\n");
+}
+
+#[test]
+fn test_code_fence_partial_language_streamed_char_by_char() {
+ // Push one character per call so the opening fence line ("```python\n") is split at every possible boundary
+ let mut parser = StreamParser::new();
+ let input = "```python\ndef hello():\n print(\"hi\")\n";
+
+ for ch in input.chars() {
+ parser.push(&ch.to_string());
+ }
+
+ // The fence is never closed, so nothing may be finalized; the block must still be exposed as a partial
+ assert_eq!(
+ parser.parsed().len(),
+ 0,
+ "Should not have finalized any elements"
+ );
+ let partial = parser.partial().expect("Should have partial");
+ match &partial.kind {
+ PartialKind::CodeFence { language, .. } => {
+ let lang = language.expect("Language should be set");
+ assert_eq!(lang.resolve(parser.buffer()), "python");
+ }
+ other => panic!("Expected CodeFence partial, got: {:?}", other),
+ }
+ assert_eq!(
+ partial.content(parser.buffer()),
+ "def hello():\n print(\"hi\")\n"
+ );
+}
+
+#[test]
+fn test_consecutive_code_blocks_preserve_language() {
+ // Three fenced blocks separated by blank lines, as an LLM would produce; each must keep its own language tag
+ let mut parser = StreamParser::new();
+ let input = "```rust\nlet x = 1;\n```\n\n```python\nx = 1\n```\n\n```c\nint x = 1;\n```\n";
+
+ // Push in 5-byte chunks so fence lines are split across pushes (input is ASCII, so every chunk is valid UTF-8)
+ let chunks: Vec<&str> = input
+ .as_bytes()
+ .chunks(5)
+ .map(|c| std::str::from_utf8(c).unwrap())
+ .collect();
+ for chunk in &chunks {
+ parser.push(chunk);
+ }
+
+ let code_blocks: Vec<_> = parser
+ .parsed()
+ .iter()
+ .filter_map(|e| match e {
+ MdElement::CodeBlock(cb) => Some(cb),
+ _ => None,
+ })
+ .collect();
+
+ assert_eq!(
+ code_blocks.len(),
+ 3,
+ "Expected exactly 3 code blocks (parsed: {:?})",
+ parser.parsed()
+ );
+
+ assert_eq!(
+ code_blocks[0].language.map(|s| r(&s, parser.buffer())),
+ Some("rust")
+ );
+ assert_eq!(
+ code_blocks[1].language.map(|s| r(&s, parser.buffer())),
+ Some("python")
+ );
+ assert_eq!(
+ code_blocks[2].language.map(|s| r(&s, parser.buffer())),
+ Some("c")
+ );
+
+ assert_eq!(r(&code_blocks[0].content, parser.buffer()), "let x = 1;\n");
+ assert_eq!(r(&code_blocks[1].content, parser.buffer()), "x = 1\n");
+ assert_eq!(r(&code_blocks[2].content, parser.buffer()), "int x = 1;\n");
+}