cellstate_core/
redaction.rs

1//! PII Redaction Types and Verification Registry
2//!
3//! This module provides the core types for PII redaction and the server-side
4//! verification registry. Under Option A, the client scrubs + encrypts;
5//! the server verifies consistency and stores ciphertext.
6//!
7//! # Key types
8//!
9//! - [`ScrubbedText`]: Text that has been through redaction. Private constructor.
10//! - [`ScrubbedPayload`]: JSON payload that has been through redaction. Private constructor.
11//! - [`RedactionManifest`]: Describes what was redacted (spans + vault inserts).
12//! - [`RedactionRegistry`]: Server-side verifier + egress guard + redact-only fallback.
13
14use serde::{Deserialize, Serialize};
15use std::fmt;
16use std::sync::OnceLock;
17use uuid::Uuid;
18
19// ============================================================================
20// PII TYPE CLASSIFICATION
21// ============================================================================
22
23/// PII type classification for detected sensitive data.
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
25#[serde(rename_all = "snake_case")]
26pub enum PiiType {
27    Ssn,
28    CreditCard,
29    Email,
30    Phone,
31    AwsKey,
32    PrivateKey,
33    ConnectionString,
34    Password,
35    ApiKey,
36    ServerPath,
37    Custom,
38}
39
40impl fmt::Display for PiiType {
41    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
42        match self {
43            Self::Ssn => write!(f, "ssn"),
44            Self::CreditCard => write!(f, "credit_card"),
45            Self::Email => write!(f, "email"),
46            Self::Phone => write!(f, "phone"),
47            Self::AwsKey => write!(f, "aws_key"),
48            Self::PrivateKey => write!(f, "private_key"),
49            Self::ConnectionString => write!(f, "connection_string"),
50            Self::Password => write!(f, "password"),
51            Self::ApiKey => write!(f, "api_key"),
52            Self::ServerPath => write!(f, "server_path"),
53            Self::Custom => write!(f, "custom"),
54        }
55    }
56}
57
58// ============================================================================
59// REDACTION SPAN
60// ============================================================================
61
62/// A single detected PII span within text.
63#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
64pub struct RedactionSpan {
65    /// Unique token identifying this redacted value (links to vault).
66    pub token_id: Uuid,
67    /// Classification of the detected PII.
68    pub pii_type: PiiType,
69    /// Placeholder string inserted into the scrubbed text (e.g. `[REDACTED:ssn:abc12345]`).
70    pub placeholder: String,
71    /// Confidence score from the detector (0.0–1.0).
72    pub confidence: f32,
73}
74
75// ============================================================================
76// VAULT INSERT
77// ============================================================================
78
79/// Client-encrypted value for vault storage.
80///
81/// Produced by client-side WebCrypto. The server stores these as opaque bytes
82/// and **never** generates ciphertext or holds decryption keys.
83#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
84pub struct VaultInsert {
85    /// Token ID linking this ciphertext to a [`RedactionSpan`].
86    pub token_id: Uuid,
87    /// Classification of the encrypted PII.
88    pub pii_type: PiiType,
89    /// Client-side WebCrypto-encrypted original value.
90    pub ciphertext: Vec<u8>,
91}
92
93// ============================================================================
94// REDACTION MANIFEST
95// ============================================================================
96
97/// Manifest describing what was redacted. Accompanies scrubbed text.
98///
99/// NOTE: No `source_hash`. Hashing originals leaks low-entropy secrets
100/// (SSN, phone numbers) via brute-force. Dropped by design.
101#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
102pub struct RedactionManifest {
103    /// All PII spans detected and redacted.
104    pub spans: Vec<RedactionSpan>,
105    /// Client-encrypted vault inserts for spans that require reveal capability.
106    pub vault_inserts: Vec<VaultInsert>,
107}
108
109impl RedactionManifest {
110    /// Create an empty manifest (no redactions applied).
111    pub fn empty() -> Self {
112        Self {
113            spans: Vec::new(),
114            vault_inserts: Vec::new(),
115        }
116    }
117
118    /// Whether any PII was detected and redacted.
119    pub fn has_redactions(&self) -> bool {
120        !self.spans.is_empty()
121    }
122
123    /// Count of redacted spans.
124    pub fn span_count(&self) -> usize {
125        self.spans.len()
126    }
127
128    /// Distinct PII types found.
129    pub fn pii_types(&self) -> Vec<PiiType> {
130        let mut types: Vec<PiiType> = self.spans.iter().map(|s| s.pii_type).collect();
131        types.sort_by_key(|t| *t as u8);
132        types.dedup();
133        types
134    }
135
136    /// Token IDs for all redacted spans.
137    pub fn token_ids(&self) -> Vec<Uuid> {
138        self.spans.iter().map(|s| s.token_id).collect()
139    }
140}
141
142// ============================================================================
143// SCRUBBED TEXT
144// ============================================================================
145
146/// Text that has been through PII redaction.
147///
148/// **No public constructor.** Only produced by:
149/// - Client-side `RedactionEngine` (primary path under Option A)
150/// - Server-side `RedactionRegistry::scrub_redact_only()` (fallback for server-originated text)
151/// - `ScrubbedText::from_verified()` (for text verified clean by the registry)
152///
153/// Implements `Clone` (once scrubbed, it's safe). Does NOT implement `AsRef<str>`.
154#[derive(Clone, PartialEq, Serialize, Deserialize)]
155pub struct ScrubbedText {
156    text: String,
157    manifest: RedactionManifest,
158}
159
160impl ScrubbedText {
161    /// Access the redacted text content.
162    pub fn as_redacted_str(&self) -> &str {
163        &self.text
164    }
165
166    /// Access the redaction manifest.
167    pub fn manifest(&self) -> &RedactionManifest {
168        &self.manifest
169    }
170
171    /// Consume into the inner redacted string.
172    pub fn into_string(self) -> String {
173        self.text
174    }
175
176    /// Construct from text that has been verified clean by the registry.
177    ///
178    /// This is a `pub(crate)` constructor — only `RedactionRegistry` methods
179    /// and tests within this crate can call it.
180    pub(crate) fn new_verified(text: String, manifest: RedactionManifest) -> Self {
181        Self { text, manifest }
182    }
183}
184
185impl fmt::Debug for ScrubbedText {
186    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
187        write!(f, "ScrubbedText(\"{}\")", self.text)
188    }
189}
190
191impl fmt::Display for ScrubbedText {
192    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
193        f.write_str(&self.text)
194    }
195}
196
197// ============================================================================
198// SCRUBBED PAYLOAD
199// ============================================================================
200
201/// JSON payload that has been through PII redaction.
202///
203/// **No public constructor.** Same enforcement as [`ScrubbedText`].
204#[derive(Clone, PartialEq, Serialize, Deserialize)]
205pub struct ScrubbedPayload {
206    value: serde_json::Value,
207    manifest: RedactionManifest,
208}
209
210impl ScrubbedPayload {
211    /// Access the redacted JSON value.
212    pub fn as_value(&self) -> &serde_json::Value {
213        &self.value
214    }
215
216    /// Access the redaction manifest.
217    pub fn manifest(&self) -> &RedactionManifest {
218        &self.manifest
219    }
220
221    /// Consume into the inner JSON value.
222    pub fn into_value(self) -> serde_json::Value {
223        self.value
224    }
225
226    /// Construct from a payload that has been verified clean.
227    pub(crate) fn new_verified(value: serde_json::Value, manifest: RedactionManifest) -> Self {
228        Self { value, manifest }
229    }
230}
231
232impl fmt::Debug for ScrubbedPayload {
233    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
234        write!(
235            f,
236            "ScrubbedPayload({} redactions)",
237            self.manifest.span_count()
238        )
239    }
240}
241
242// ============================================================================
243// VIOLATION TYPES
244// ============================================================================
245
246/// A PII pattern detected during verification (text claimed scrubbed but isn't).
247#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
248pub struct PiiViolation {
249    /// The type of PII that was detected.
250    pub pii_type: PiiType,
251    /// Byte offset where the violation starts (for diagnostics, never logged with value).
252    pub byte_offset: usize,
253    /// Length of the matched region in bytes.
254    pub byte_len: usize,
255    /// Confidence score of the detection.
256    pub confidence: f32,
257}
258
259/// A consistency violation between manifest and payload.
260#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
261pub struct ManifestViolation {
262    /// Human-readable description of the violation.
263    pub reason: String,
264    /// Token ID involved (if applicable).
265    pub token_id: Option<Uuid>,
266}
267
268// ============================================================================
269// PII DETECTOR (compiled regex)
270// ============================================================================
271
272/// A single PII detector: a compiled regex + metadata.
273struct PiiDetector {
274    pii_type: PiiType,
275    regex: regex::Regex,
276    confidence: f32,
277}
278
279/// Compiled detectors, built once at startup.
280fn default_detectors() -> &'static [PiiDetector] {
281    static DETECTORS: OnceLock<Vec<PiiDetector>> = OnceLock::new();
282    DETECTORS.get_or_init(|| {
283        vec![
284            // SSN: 123-45-6789
285            PiiDetector {
286                pii_type: PiiType::Ssn,
287                regex: regex::Regex::new(r"\b\d{3}-\d{2}-\d{4}\b").expect("valid SSN regex"),
288                confidence: 0.95,
289            },
290            // Credit card: Visa (4xxx) or Mastercard (51-55xx)
291            PiiDetector {
292                pii_type: PiiType::CreditCard,
293                regex: regex::Regex::new(r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14})\b")
294                    .expect("valid credit card regex"),
295                confidence: 0.90,
296            },
297            // Email
298            PiiDetector {
299                pii_type: PiiType::Email,
300                regex: regex::Regex::new(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
301                    .expect("valid email regex"),
302                confidence: 0.85,
303            },
304            // Phone: US formats (xxx-xxx-xxxx, (xxx) xxx-xxxx, +1xxxxxxxxxx)
305            PiiDetector {
306                pii_type: PiiType::Phone,
307                regex: regex::Regex::new(r"\b(?:\+1[-.]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b")
308                    .expect("valid phone regex"),
309                confidence: 0.80,
310            },
311            // AWS access key: AKIA followed by 16 uppercase alphanumeric
312            PiiDetector {
313                pii_type: PiiType::AwsKey,
314                regex: regex::Regex::new(r"AKIA[0-9A-Z]{16}").expect("valid AWS key regex"),
315                confidence: 0.95,
316            },
317            // Private key headers
318            PiiDetector {
319                pii_type: PiiType::PrivateKey,
320                regex: regex::Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")
321                    .expect("valid private key regex"),
322                confidence: 0.99,
323            },
324            // Connection strings: postgres://, mysql://, mongodb://, redis://
325            PiiDetector {
326                pii_type: PiiType::ConnectionString,
327                regex: regex::Regex::new(
328                    r"(?i)(?:postgres|mysql|mongodb|redis)://[^:]+:[^@]+@[^/\s]+",
329                )
330                .expect("valid connection string regex"),
331                confidence: 0.95,
332            },
333            // Password in key=value or key: "value" patterns
334            PiiDetector {
335                pii_type: PiiType::Password,
336                regex: regex::Regex::new(r#"(?i)password["']?\s*[:=]\s*["'][^"']+["']"#)
337                    .expect("valid password pattern regex"),
338                confidence: 0.85,
339            },
340            // API key in key=value patterns
341            PiiDetector {
342                pii_type: PiiType::ApiKey,
343                regex: regex::Regex::new(r#"(?i)api[_-]?key["']?\s*[:=]\s*["'][^"']+["']"#)
344                    .expect("valid API key pattern regex"),
345                confidence: 0.85,
346            },
347            // Server paths (Unix)
348            PiiDetector {
349                pii_type: PiiType::ServerPath,
350                regex: regex::Regex::new(r"/(?:home|var|usr|etc)/[a-zA-Z0-9_-]+/")
351                    .expect("valid server path regex"),
352                confidence: 0.70,
353            },
354        ]
355    })
356}
357
358// ============================================================================
359// REDACTION REGISTRY
360// ============================================================================
361
/// Server-side PII verifier and egress guard.
///
/// Under Option A the client does the primary scrubbing; the registry then:
/// 1. **Verifies** that text claimed to be scrubbed contains no surviving PII.
/// 2. **Verifies the manifest** against the payload it accompanies.
/// 3. **Scrubs (redact-only)** server-originated text that the client never saw.
///    This path produces no vault inserts, so the redaction is permanent.
pub struct RedactionRegistry {
    /// Threshold supplied at construction. The verify/scrub methods take their
    /// threshold as an explicit argument; within this file the stored value is
    /// only read by the `Debug` impl.
    confidence_threshold: f32,
}
372
373impl RedactionRegistry {
374    /// Create a registry with default detectors and the given confidence threshold.
375    pub fn with_defaults(confidence_threshold: f32) -> Self {
376        // Force lazy init of compiled regexes at construction time
377        let _ = default_detectors();
378        Self {
379            confidence_threshold,
380        }
381    }
382
383    /// Default confidence threshold (0.8).
384    pub fn default_threshold() -> f32 {
385        0.8
386    }
387
388    /// VERIFY: Check that text claimed to be scrubbed has no surviving PII.
389    ///
390    /// Returns a list of violations (empty = text is clean).
391    pub fn verify(&self, text: &str, confidence_threshold: f32) -> Vec<PiiViolation> {
392        let mut violations = Vec::new();
393        for detector in default_detectors() {
394            if detector.confidence < confidence_threshold {
395                continue;
396            }
397            for m in detector.regex.find_iter(text) {
398                violations.push(PiiViolation {
399                    pii_type: detector.pii_type,
400                    byte_offset: m.start(),
401                    byte_len: m.len(),
402                    confidence: detector.confidence,
403                });
404            }
405        }
406        violations
407    }
408
409    /// VERIFY MANIFEST: Check that a manifest is consistent with a JSON payload.
410    ///
411    /// Checks:
412    /// - Every span's placeholder string exists in the serialized payload.
413    /// - Every span whose PII type requires encryption has a matching vault_insert.
414    pub fn verify_manifest(
415        &self,
416        payload: &serde_json::Value,
417        manifest: &RedactionManifest,
418    ) -> Result<(), Vec<ManifestViolation>> {
419        let mut violations = Vec::new();
420        let payload_text = payload.to_string();
421
422        for span in &manifest.spans {
423            // Check placeholder exists in payload
424            if !payload_text.contains(&span.placeholder) {
425                violations.push(ManifestViolation {
426                    reason: format!("placeholder '{}' not found in payload", span.placeholder),
427                    token_id: Some(span.token_id),
428                });
429            }
430
431            // Check vault insert exists for encrypted types
432            if requires_vault_insert(span.pii_type) {
433                let has_insert = manifest
434                    .vault_inserts
435                    .iter()
436                    .any(|vi| vi.token_id == span.token_id);
437                if !has_insert {
438                    violations.push(ManifestViolation {
439                        reason: format!(
440                            "span {} ({}) requires vault insert but none provided",
441                            span.token_id, span.pii_type
442                        ),
443                        token_id: Some(span.token_id),
444                    });
445                }
446            }
447        }
448
449        // Check for orphan vault inserts (insert without matching span)
450        for vi in &manifest.vault_inserts {
451            let has_span = manifest.spans.iter().any(|s| s.token_id == vi.token_id);
452            if !has_span {
453                violations.push(ManifestViolation {
454                    reason: format!("vault insert {} has no matching span", vi.token_id),
455                    token_id: Some(vi.token_id),
456                });
457            }
458        }
459
460        if violations.is_empty() {
461            Ok(())
462        } else {
463            Err(violations)
464        }
465    }
466
467    /// SCRUB (server-side, redact-only): For server-originated text where
468    /// client-side scrubbing wasn't possible.
469    ///
470    /// No vault inserts are produced — these redactions are **permanent**
471    /// (no reveal possible). Use this only for server-generated content.
472    pub fn scrub_redact_only(&self, text: &str, threshold: f32) -> ScrubbedText {
473        let detectors = default_detectors();
474        let mut matches = Vec::new();
475
476        for detector in detectors {
477            if detector.confidence < threshold {
478                continue;
479            }
480            for m in detector.regex.find_iter(text) {
481                matches.push(DetectedMatch {
482                    pii_type: detector.pii_type,
483                    start: m.start(),
484                    end: m.end(),
485                    confidence: detector.confidence,
486                });
487            }
488        }
489
490        // Sort by start position, then longest match wins for overlaps
491        matches.sort_by(|a, b| a.start.cmp(&b.start).then_with(|| b.end.cmp(&a.end)));
492
493        // Remove overlapping matches (longest match wins)
494        let resolved = resolve_overlaps(&matches);
495
496        // Build scrubbed text and manifest
497        let mut result = String::with_capacity(text.len());
498        let mut spans = Vec::with_capacity(resolved.len());
499        let mut last_end = 0;
500
501        for dm in &resolved {
502            // Append text before this match
503            result.push_str(&text[last_end..dm.start]);
504
505            // Generate placeholder
506            let token_id = Uuid::new_v4();
507            let short_id = &token_id.to_string()[..8];
508            let placeholder = format!("[REDACTED:{}:{}]", dm.pii_type, short_id);
509
510            result.push_str(&placeholder);
511            spans.push(RedactionSpan {
512                token_id,
513                pii_type: dm.pii_type,
514                placeholder,
515                confidence: dm.confidence,
516            });
517            last_end = dm.end;
518        }
519        result.push_str(&text[last_end..]);
520
521        let manifest = RedactionManifest {
522            spans,
523            vault_inserts: Vec::new(), // redact-only: no vault inserts
524        };
525
526        ScrubbedText::new_verified(result, manifest)
527    }
528
529    /// Wrap text that has been externally verified as PII-clean into a `ScrubbedText`.
530    ///
531    /// Use this for text that arrived pre-scrubbed from the client SDK and passed
532    /// server-side verification (verify + verify_manifest). The registry reference
533    /// acts as a capability token — you can only produce `ScrubbedText` if you have
534    /// access to a registry.
535    pub fn mark_verified(&self, text: String, manifest: RedactionManifest) -> ScrubbedText {
536        ScrubbedText::new_verified(text, manifest)
537    }
538
539    /// Wrap text known to be Cellstate-generated (no user data) as clean.
540    ///
541    /// Use for server-originated text that cannot contain user PII by construction
542    /// (e.g. error messages, system prompt templates assembled from static strings).
543    pub fn mark_static_clean(&self, text: String) -> ScrubbedText {
544        ScrubbedText::new_verified(text, RedactionManifest::empty())
545    }
546
547    /// Scrub all string values in a JSON payload (redact-only, no vault inserts).
548    pub fn scrub_payload_redact_only(
549        &self,
550        value: serde_json::Value,
551        threshold: f32,
552    ) -> ScrubbedPayload {
553        let mut all_spans = Vec::new();
554        let scrubbed_value = self.scrub_json_value(value, threshold, &mut all_spans);
555        let manifest = RedactionManifest {
556            spans: all_spans,
557            vault_inserts: Vec::new(),
558        };
559        ScrubbedPayload::new_verified(scrubbed_value, manifest)
560    }
561
562    /// Recursively scrub string values in a JSON value.
563    fn scrub_json_value(
564        &self,
565        value: serde_json::Value,
566        threshold: f32,
567        spans: &mut Vec<RedactionSpan>,
568    ) -> serde_json::Value {
569        match value {
570            serde_json::Value::String(s) => {
571                let scrubbed = self.scrub_redact_only(&s, threshold);
572                spans.extend(scrubbed.manifest.spans);
573                serde_json::Value::String(scrubbed.text)
574            }
575            serde_json::Value::Array(arr) => {
576                let scrubbed_arr: Vec<serde_json::Value> = arr
577                    .into_iter()
578                    .map(|v| self.scrub_json_value(v, threshold, spans))
579                    .collect();
580                serde_json::Value::Array(scrubbed_arr)
581            }
582            serde_json::Value::Object(map) => {
583                let scrubbed_map: serde_json::Map<String, serde_json::Value> = map
584                    .into_iter()
585                    .map(|(k, v)| (k, self.scrub_json_value(v, threshold, spans)))
586                    .collect();
587                serde_json::Value::Object(scrubbed_map)
588            }
589            other => other,
590        }
591    }
592}
593
594impl fmt::Debug for RedactionRegistry {
595    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
596        f.debug_struct("RedactionRegistry")
597            .field("confidence_threshold", &self.confidence_threshold)
598            .field("detector_count", &default_detectors().len())
599            .finish()
600    }
601}
602
603// ============================================================================
604// HELPERS
605// ============================================================================
606
607/// Whether a PII type requires a vault insert for reveal capability.
608fn requires_vault_insert(pii_type: PiiType) -> bool {
609    matches!(
610        pii_type,
611        PiiType::Ssn
612            | PiiType::CreditCard
613            | PiiType::Email
614            | PiiType::Phone
615            | PiiType::AwsKey
616            | PiiType::PrivateKey
617            | PiiType::ConnectionString
618            | PiiType::Password
619            | PiiType::ApiKey
620    )
621}
622
623/// Internal: a detected match before overlap resolution.
624#[derive(Debug, Clone)]
625struct DetectedMatch {
626    pii_type: PiiType,
627    start: usize,
628    end: usize,
629    confidence: f32,
630}
631
632/// Resolve overlapping matches: longest match wins.
633/// Input must be sorted by start position.
634fn resolve_overlaps(matches: &[DetectedMatch]) -> Vec<DetectedMatch> {
635    let mut resolved: Vec<DetectedMatch> = Vec::with_capacity(matches.len());
636    for m in matches {
637        if let Some(last) = resolved.last() {
638            if m.start < last.end {
639                // Overlap: keep the longer one
640                if (m.end - m.start) > (last.end - last.start) {
641                    resolved.pop();
642                    resolved.push(m.clone());
643                }
644                // else: skip this shorter match
645                continue;
646            }
647        }
648        resolved.push(m.clone());
649    }
650    resolved
651}
652
653// ============================================================================
654// TESTS
655// ============================================================================
656
657#[cfg(test)]
658mod tests {
659    use super::*;
660
661    fn registry() -> RedactionRegistry {
662        RedactionRegistry::with_defaults(0.8)
663    }
664
665    // ========================================================================
666    // PiiType Display
667    // ========================================================================
668
669    #[test]
670    fn pii_type_display() {
671        assert_eq!(PiiType::Ssn.to_string(), "ssn");
672        assert_eq!(PiiType::CreditCard.to_string(), "credit_card");
673        assert_eq!(PiiType::Email.to_string(), "email");
674        assert_eq!(PiiType::ConnectionString.to_string(), "connection_string");
675    }
676
677    // ========================================================================
678    // RedactionManifest
679    // ========================================================================
680
681    #[test]
682    fn empty_manifest() {
683        let m = RedactionManifest::empty();
684        assert!(!m.has_redactions());
685        assert_eq!(m.span_count(), 0);
686        assert!(m.pii_types().is_empty());
687        assert!(m.token_ids().is_empty());
688    }
689
690    // ========================================================================
691    // ScrubbedText: no public constructor
692    // ========================================================================
693
694    #[test]
695    fn scrubbed_text_debug_shows_redacted() {
696        let st = ScrubbedText::new_verified(
697            "hello [REDACTED:ssn:abc12345]".to_string(),
698            RedactionManifest::empty(),
699        );
700        let debug = format!("{:?}", st);
701        assert!(debug.contains("ScrubbedText"));
702        assert!(debug.contains("[REDACTED:ssn:abc12345]"));
703    }
704
705    #[test]
706    fn scrubbed_text_display() {
707        let st = ScrubbedText::new_verified("clean text".to_string(), RedactionManifest::empty());
708        assert_eq!(st.to_string(), "clean text");
709    }
710
711    #[test]
712    fn scrubbed_text_clone() {
713        let st = ScrubbedText::new_verified("text".to_string(), RedactionManifest::empty());
714        let cloned = st.clone();
715        assert_eq!(st.as_redacted_str(), cloned.as_redacted_str());
716    }
717
718    // ========================================================================
719    // Verify: detect surviving PII
720    // ========================================================================
721
722    #[test]
723    fn verify_clean_text_returns_empty() {
724        let r = registry();
725        let violations = r.verify("Hello world, nothing sensitive here", 0.8);
726        assert!(violations.is_empty());
727    }
728
729    #[test]
730    fn verify_detects_ssn() {
731        let r = registry();
732        let violations = r.verify("My SSN is 123-45-6789", 0.8);
733        assert_eq!(violations.len(), 1);
734        assert_eq!(violations[0].pii_type, PiiType::Ssn);
735    }
736
737    #[test]
738    fn verify_detects_credit_card() {
739        let r = registry();
740        let violations = r.verify("Card: 4111111111111111", 0.8);
741        assert_eq!(violations.len(), 1);
742        assert_eq!(violations[0].pii_type, PiiType::CreditCard);
743    }
744
745    #[test]
746    fn verify_detects_email() {
747        let r = registry();
748        let violations = r.verify("Contact me at user@example.com", 0.8);
749        assert_eq!(violations.len(), 1);
750        assert_eq!(violations[0].pii_type, PiiType::Email);
751    }
752
753    #[test]
754    fn verify_detects_aws_key() {
755        let r = registry();
756        let violations = r.verify("Key: AKIAIOSFODNN7EXAMPLE", 0.8);
757        assert_eq!(violations.len(), 1);
758        assert_eq!(violations[0].pii_type, PiiType::AwsKey);
759    }
760
761    #[test]
762    fn verify_detects_private_key() {
763        let r = registry();
764        let violations = r.verify("-----BEGIN RSA PRIVATE KEY-----\nMIIEow...", 0.8);
765        assert_eq!(violations.len(), 1);
766        assert_eq!(violations[0].pii_type, PiiType::PrivateKey);
767    }
768
769    #[test]
770    fn verify_detects_connection_string() {
771        let r = registry();
772        let violations = r.verify("postgres://user:pass@host/db", 0.8);
773        assert_eq!(violations.len(), 1);
774        assert_eq!(violations[0].pii_type, PiiType::ConnectionString);
775    }
776
777    #[test]
778    fn verify_detects_password_pattern() {
779        let r = registry();
780        let violations = r.verify(r#"password: "hunter2""#, 0.8);
781        assert_eq!(violations.len(), 1);
782        assert_eq!(violations[0].pii_type, PiiType::Password);
783    }
784
785    #[test]
786    fn verify_detects_api_key_pattern() {
787        let r = registry();
788        let violations = r.verify(r#"api_key: "sk-abc123def456""#, 0.8);
789        assert_eq!(violations.len(), 1);
790        assert_eq!(violations[0].pii_type, PiiType::ApiKey);
791    }
792
793    #[test]
794    fn verify_detects_multiple() {
795        let r = registry();
796        let text = "SSN: 123-45-6789, Card: 4111111111111111";
797        let violations = r.verify(text, 0.8);
798        assert_eq!(violations.len(), 2);
799        let types: Vec<PiiType> = violations.iter().map(|v| v.pii_type).collect();
800        assert!(types.contains(&PiiType::Ssn));
801        assert!(types.contains(&PiiType::CreditCard));
802    }
803
804    #[test]
805    fn verify_respects_threshold() {
806        let r = registry();
807        // Server paths have 0.70 confidence — should be filtered at 0.8 threshold
808        let violations = r.verify("/home/user/data/", 0.8);
809        assert!(violations.is_empty());
810        // But detected at 0.5 threshold
811        let violations = r.verify("/home/user/data/", 0.5);
812        assert_eq!(violations.len(), 1);
813        assert_eq!(violations[0].pii_type, PiiType::ServerPath);
814    }
815
816    // ========================================================================
817    // Verify manifest consistency
818    // ========================================================================
819
/// A manifest whose single SSN span has a matching vault insert, and whose
/// placeholder appears in the payload, passes `verify_manifest`.
#[test]
fn verify_manifest_valid() {
    let reg = registry();
    let token_id = Uuid::new_v4();
    let placeholder = "[REDACTED:ssn:abc12345]";

    let span = RedactionSpan {
        token_id,
        pii_type: PiiType::Ssn,
        placeholder: placeholder.to_string(),
        confidence: 0.95,
    };
    // Same token id as the span, so the two entries pair up.
    let insert = VaultInsert {
        token_id,
        pii_type: PiiType::Ssn,
        ciphertext: vec![1, 2, 3],
    };
    let manifest = RedactionManifest {
        spans: vec![span],
        vault_inserts: vec![insert],
    };

    let payload = serde_json::json!({"name": "test", "ssn": placeholder});
    let outcome = reg.verify_manifest(&payload, &manifest);
    assert!(outcome.is_ok());
}
840
/// When the manifest declares a placeholder that the payload does not
/// contain, verification fails with a single "not found in payload" error.
#[test]
fn verify_manifest_missing_placeholder() {
    let reg = registry();
    let token_id = Uuid::new_v4();

    let span = RedactionSpan {
        token_id,
        pii_type: PiiType::Ssn,
        placeholder: "[REDACTED:ssn:abc12345]".to_string(),
        confidence: 0.95,
    };
    let insert = VaultInsert {
        token_id,
        pii_type: PiiType::Ssn,
        ciphertext: vec![1, 2, 3],
    };
    let manifest = RedactionManifest {
        spans: vec![span],
        vault_inserts: vec![insert],
    };

    // The "ssn" field holds plain text instead of the declared placeholder.
    let payload = serde_json::json!({"name": "test", "ssn": "not redacted"});

    let errors = reg.verify_manifest(&payload, &manifest).unwrap_err();
    assert_eq!(errors.len(), 1);
    assert!(errors[0].reason.contains("not found in payload"));
}
864
/// An SSN span with no accompanying vault insert is rejected with a single
/// "requires vault insert" error.
#[test]
fn verify_manifest_missing_vault_insert() {
    let reg = registry();

    let span = RedactionSpan {
        token_id: Uuid::new_v4(),
        pii_type: PiiType::Ssn,
        placeholder: "[REDACTED:ssn:abc12345]".to_string(),
        confidence: 0.95,
    };
    let manifest = RedactionManifest {
        spans: vec![span],
        // Deliberately empty: this is the condition under test.
        vault_inserts: Vec::new(),
    };
    let payload = serde_json::json!({"ssn": "[REDACTED:ssn:abc12345]"});

    let errors = reg.verify_manifest(&payload, &manifest).unwrap_err();
    assert_eq!(errors.len(), 1);
    assert!(errors[0].reason.contains("requires vault insert"));
}
883
/// A vault insert whose token has no corresponding span is rejected with a
/// single "no matching span" error.
#[test]
fn verify_manifest_orphan_vault_insert() {
    let reg = registry();

    let orphan = VaultInsert {
        token_id: Uuid::new_v4(),
        pii_type: PiiType::Email,
        ciphertext: vec![4, 5, 6],
    };
    let manifest = RedactionManifest {
        // No spans at all, so the insert above cannot be matched.
        spans: Vec::new(),
        vault_inserts: vec![orphan],
    };
    let payload = serde_json::json!({"clean": "data"});

    let errors = reg.verify_manifest(&payload, &manifest).unwrap_err();
    assert_eq!(errors.len(), 1);
    assert!(errors[0].reason.contains("no matching span"));
}
901
/// A `ServerPath` span is accepted even though the manifest carries no
/// vault insert for it.
#[test]
fn verify_manifest_server_path_no_vault_required() {
    let reg = registry();
    let placeholder = "[REDACTED:server_path:abc12345]";

    let span = RedactionSpan {
        token_id: Uuid::new_v4(),
        pii_type: PiiType::ServerPath,
        placeholder: placeholder.to_string(),
        confidence: 0.70,
    };
    let manifest = RedactionManifest {
        spans: vec![span],
        // Empty on purpose: ServerPath is expected not to need a vault entry.
        vault_inserts: Vec::new(),
    };
    let payload = serde_json::json!({"path": placeholder});

    let outcome = reg.verify_manifest(&payload, &manifest);
    assert!(outcome.is_ok());
}
919
920    // ========================================================================
921    // Scrub redact-only
922    // ========================================================================
923
/// Text without any detectable PII passes through `scrub_redact_only`
/// unchanged and produces a manifest with no redactions.
#[test]
fn scrub_redact_only_clean_text() {
    let reg = registry();
    let input = "Hello world";

    let scrubbed = reg.scrub_redact_only(input, 0.8);

    assert_eq!(scrubbed.as_redacted_str(), "Hello world");
    assert!(!scrubbed.manifest().has_redactions());
}
931
/// Redact-only scrubbing replaces an SSN with a `[REDACTED:ssn:...]`
/// placeholder, records one SSN span, and emits no vault inserts.
#[test]
fn scrub_redact_only_ssn() {
    let reg = registry();
    let scrubbed = reg.scrub_redact_only("SSN: 123-45-6789", 0.8);

    // Output text: raw value gone, placeholder present.
    let redacted = scrubbed.as_redacted_str();
    assert!(!redacted.contains("123-45-6789"));
    assert!(redacted.contains("[REDACTED:ssn:"));

    // Manifest: a single SSN span and, being redact-only, no vault entries.
    let manifest = scrubbed.manifest();
    assert_eq!(manifest.span_count(), 1);
    assert_eq!(manifest.spans[0].pii_type, PiiType::Ssn);
    assert!(manifest.vault_inserts.is_empty());
}
943
/// Both an SSN and an email address in the same text are removed, and the
/// manifest records two spans.
#[test]
fn scrub_redact_only_multiple() {
    let reg = registry();
    let scrubbed = reg.scrub_redact_only("SSN: 123-45-6789, email: user@example.com", 0.8);

    let redacted = scrubbed.as_redacted_str();
    assert!(!redacted.contains("123-45-6789"));
    assert!(!redacted.contains("user@example.com"));
    assert_eq!(scrubbed.manifest().span_count(), 2);
}
953
/// Text on either side of a redacted SSN is left intact.
#[test]
fn scrub_redact_only_preserves_surrounding_text() {
    let reg = registry();
    let input = "before 123-45-6789 after";

    let scrubbed = reg.scrub_redact_only(input, 0.8);

    // Only the prefix and suffix are checked; the middle is the placeholder.
    assert!(scrubbed.as_redacted_str().starts_with("before "));
    assert!(scrubbed.as_redacted_str().ends_with(" after"));
}
962
963    // ========================================================================
964    // Overlap resolution: longest match wins
965    // ========================================================================
966
/// Feeds `resolve_overlaps` two overlapping ranges directly and checks that
/// only the longer `ConnectionString` match is kept.
#[test]
fn overlap_resolution_longest_wins() {
    // Shorter range 10..30 lies entirely inside the longer range 5..50.
    let mut candidates = vec![
        DetectedMatch {
            pii_type: PiiType::ServerPath,
            start: 10,
            end: 30,
            confidence: 0.70,
        },
        DetectedMatch {
            pii_type: PiiType::ConnectionString,
            start: 5,
            end: 50,
            confidence: 0.95,
        },
    ];

    // resolve_overlaps expects input ordered by ascending start, with ties
    // broken by descending end.
    candidates.sort_by(|lhs, rhs| lhs.start.cmp(&rhs.start).then(rhs.end.cmp(&lhs.end)));

    let survivors = resolve_overlaps(&candidates);
    assert_eq!(survivors.len(), 1);
    assert_eq!(survivors[0].pii_type, PiiType::ConnectionString);
}
992
/// Matches that do not overlap (an SSN and an email) are both kept, giving
/// two spans in the manifest.
#[test]
fn overlap_resolution_non_overlapping_preserved() {
    let reg = registry();
    let input = "SSN: 123-45-6789 email: user@example.com";

    let scrubbed = reg.scrub_redact_only(input, 0.8);

    let span_total = scrubbed.manifest().span_count();
    assert_eq!(span_total, 2);
}
1001
1002    // ========================================================================
1003    // Scrub JSON payload
1004    // ========================================================================
1005
/// Scrubbing a JSON payload redacts PII in string fields and in strings
/// nested inside arrays, leaves non-PII strings alone, and records one span
/// per redaction.
#[test]
fn scrub_payload_redact_only() {
    let reg = registry();
    let input = serde_json::json!({
        "name": "John",
        "ssn": "123-45-6789",
        "notes": ["Contact at user@example.com"]
    });

    let scrubbed = reg.scrub_payload_redact_only(input, 0.8);
    let value = scrubbed.as_value();

    // Top-level string field: SSN replaced by its placeholder.
    let ssn_field = value["ssn"].as_str().unwrap();
    assert!(!ssn_field.contains("123-45-6789"));
    assert!(ssn_field.contains("[REDACTED:ssn:"));

    // String nested in an array: email replaced by its placeholder.
    let first_note = value["notes"][0].as_str().unwrap();
    assert!(!first_note.contains("user@example.com"));
    assert!(first_note.contains("[REDACTED:email:"));

    // A plain name is not treated as PII and stays as-is.
    assert_eq!(value["name"].as_str().unwrap(), "John");

    // One span for the SSN, one for the email.
    assert_eq!(scrubbed.manifest().span_count(), 2);
}
1029
/// Numbers, booleans and nulls in a payload come out of scrubbing
/// unchanged, and no redactions are recorded.
#[test]
fn scrub_payload_preserves_non_string_values() {
    let reg = registry();
    let input = serde_json::json!({
        "count": 42,
        "active": true,
        "data": null
    });

    let scrubbed = reg.scrub_payload_redact_only(input, 0.8);

    let value = scrubbed.as_value();
    assert_eq!(value["count"], 42);
    assert_eq!(value["active"], true);
    assert!(value["data"].is_null());
    assert!(!scrubbed.manifest().has_redactions());
}
1045
1046    // ========================================================================
1047    // Serde round-trip
1048    // ========================================================================
1049
/// Serializing a `RedactionManifest` to JSON and parsing it back yields a
/// value equal to the original.
#[test]
fn manifest_serde_roundtrip() {
    let token_id = Uuid::new_v4();

    let span = RedactionSpan {
        token_id,
        pii_type: PiiType::Email,
        placeholder: "[REDACTED:email:abc12345]".to_string(),
        confidence: 0.85,
    };
    let insert = VaultInsert {
        token_id,
        pii_type: PiiType::Email,
        ciphertext: vec![10, 20, 30],
    };
    let manifest = RedactionManifest {
        spans: vec![span],
        vault_inserts: vec![insert],
    };

    let encoded = serde_json::to_string(&manifest).unwrap();
    let deserialized = serde_json::from_str::<RedactionManifest>(&encoded).unwrap();
    assert_eq!(manifest, deserialized);
}
1070
/// `PiiType::CreditCard` serializes to the snake_case JSON string
/// `"credit_card"` and parses back to the same variant.
#[test]
fn pii_type_serde_snake_case() {
    let encoded = serde_json::to_string(&PiiType::CreditCard).unwrap();
    // Raw string: the expected JSON text includes the surrounding quotes.
    assert_eq!(encoded, r#""credit_card""#);

    let decoded = serde_json::from_str::<PiiType>(&encoded).unwrap();
    assert_eq!(decoded, PiiType::CreditCard);
}
1078}