cellstate_core/
lint.rs

1// Copyright 2024-2026 CELLSTATE Contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Markdown semantic lint — deterministic checks for context hazards.
5//!
6//! Moved from the standalone `cellstate-server` crate into core because
7//! `ContextValidator::check_artifact_semantics()` calls it at runtime.
8
9use crate::pcp::{LintIssueType, MarkdownSemanticIssue};
10use regex::Regex;
11use std::sync::OnceLock;
12
13fn reserved_at_regex() -> &'static Regex {
14    static RE: OnceLock<Regex> = OnceLock::new();
15    RE.get_or_init(|| {
16        Regex::new(r"(^|[^\w\\])@([A-Za-z0-9_./-]+)")
17            .expect("reserved @ mention regex must compile")
18    })
19}
20
21fn markdown_table_separator_regex() -> &'static Regex {
22    static RE: OnceLock<Regex> = OnceLock::new();
23    RE.get_or_init(|| {
24        Regex::new(r"^:?-{3,}:?$").expect("markdown table separator regex must compile")
25    })
26}
27
/// Counts the cells in a markdown table row.
///
/// At most one leading and one trailing delimiter pipe are treated as the
/// row's outer border and ignored. A pipe preceded by an escaping backslash
/// (`\|`) is cell content, not a delimiter, so `| a \| b | c |` has two
/// cells. Empty cells at the edges are counted, so `| a | b ||` has three.
///
/// Returns `0` when the line contains no `|` at all, or when nothing but
/// whitespace remains between the outer pipes.
fn markdown_table_column_count(line: &str) -> usize {
    let trimmed = line.trim();
    if !trimmed.contains('|') {
        return 0;
    }

    // Single pass: count delimiter pipes and remember where the last one sits.
    // A backslash escapes the character that follows it, so `\|` is skipped
    // while `\\|` (escaped backslash, then pipe) still ends a cell.
    let mut delimiters = 0usize;
    let mut last_delimiter = None;
    let mut escaped = false;
    for (pos, ch) in trimmed.char_indices() {
        if escaped {
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '|' => {
                delimiters += 1;
                last_delimiter = Some(pos);
            }
            _ => {}
        }
    }

    // The first character can never be escaped, so a leading pipe is always
    // an outer delimiter.
    let leading = trimmed.starts_with('|');
    // A trailing pipe is an outer delimiter only when it is unescaped and is
    // not the very same pipe already consumed as the leading one.
    let trailing = trimmed.len() > 1 && last_delimiter == Some(trimmed.len() - 1);

    // `|` is a single byte, so both offsets fall on character boundaries.
    let start = usize::from(leading);
    let end = trimmed.len() - usize::from(trailing);
    if trimmed[start..end].trim().is_empty() {
        return 0;
    }

    // N inner delimiters separate N + 1 cells.
    delimiters - usize::from(leading) - usize::from(trailing) + 1
}
39
40fn is_markdown_separator_row(line: &str) -> bool {
41    let core = line.trim().trim_matches('|').trim();
42    if core.is_empty() {
43        return false;
44    }
45    core.split('|')
46        .map(str::trim)
47        .all(|seg| markdown_table_separator_regex().is_match(seg))
48}
49
50/// Lint markdown/text for semantic hazards that commonly break context tooling.
51///
52/// This is intentionally lightweight and deterministic:
53/// - rejects unescaped mention-like `@` tokens (`@foo`, `@path/to/file`)
54/// - flags unterminated fenced code blocks
55/// - flags malformed markdown table rows with inconsistent column counts
56pub fn lint_markdown_semantics(content: &str) -> Vec<MarkdownSemanticIssue> {
57    let mut issues = Vec::new();
58    let lines: Vec<&str> = content.lines().collect();
59
60    // Reserved `@` leak detection (mention-like tokens, not emails).
61    for (idx, line) in lines.iter().enumerate() {
62        if reserved_at_regex().is_match(line) {
63            issues.push(MarkdownSemanticIssue {
64                issue_type: LintIssueType::ReservedCharacterLeak,
65                line: idx + 1,
66                message: "unescaped '@' token detected; escape as '\\@' to avoid agent import side-effects".to_string(),
67            });
68        }
69    }
70
71    // Unterminated fenced block detection.
72    let mut open_fence_line: Option<usize> = None;
73    for (idx, line) in lines.iter().enumerate() {
74        if line.trim_start().starts_with("```") {
75            if open_fence_line.is_some() {
76                open_fence_line = None;
77            } else {
78                open_fence_line = Some(idx + 1);
79            }
80        }
81    }
82    if let Some(line) = open_fence_line {
83        issues.push(MarkdownSemanticIssue {
84            issue_type: LintIssueType::SyntaxError,
85            line,
86            message: format!("unterminated fenced code block opened at line {}", line),
87        });
88    }
89
90    // Markdown table shape validation.
91    let mut i = 0usize;
92    while i + 1 < lines.len() {
93        if !lines[i].contains('|') || !is_markdown_separator_row(lines[i + 1]) {
94            i += 1;
95            continue;
96        }
97
98        let expected_cols = markdown_table_column_count(lines[i]);
99        if expected_cols == 0 {
100            i += 1;
101            continue;
102        }
103
104        let mut j = i + 2;
105        while j < lines.len() && lines[j].contains('|') {
106            let cols = markdown_table_column_count(lines[j]);
107            if cols != expected_cols {
108                issues.push(MarkdownSemanticIssue {
109                    issue_type: LintIssueType::SyntaxError,
110                    line: j + 1,
111                    message: format!(
112                        "malformed markdown table row: expected {} columns, found {}",
113                        expected_cols, cols
114                    ),
115                });
116                break;
117            }
118            j += 1;
119        }
120
121        i = j;
122    }
123
124    issues
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    /// Lints `content` and reports whether any issue of type `kind` came back.
    fn flags(content: &str, kind: LintIssueType) -> bool {
        lint_markdown_semantics(content)
            .iter()
            .any(|issue| issue.issue_type == kind)
    }

    /// Plain headings and prose produce no findings.
    #[test]
    fn test_lint_markdown_semantics_clean() {
        assert!(lint_markdown_semantics("# Hello\n\nSome text.").is_empty());
    }

    /// A fence that is opened but never closed is a syntax error.
    #[test]
    fn test_lint_markdown_semantics_unterminated_fence() {
        assert!(flags(
            "```rust\nfn main() {}\n",
            LintIssueType::SyntaxError
        ));
    }

    /// A bare `@token` in prose is reported as a reserved character leak.
    #[test]
    fn test_lint_markdown_semantics_reserved_at_mention() {
        assert!(flags(
            "Please use @my_tool for this.",
            LintIssueType::ReservedCharacterLeak
        ));
    }

    /// A body row with more cells than the header is a syntax error.
    #[test]
    fn test_lint_markdown_semantics_detects_table_shape_mismatch() {
        let table = r#"
| a | b |
| --- | --- |
| 1 | 2 | 3 |
"#;
        assert!(flags(table, LintIssueType::SyntaxError));
    }
}
167}