1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::macro_rules_attribute;
use serde::{Deserialize, Serialize};
use unicode_normalization_alignments::char::is_combining_mark;

#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")]
#[non_exhaustive]
pub struct Strip {
    pub strip_left: bool,
    pub strip_right: bool,
}

impl Strip {
    pub fn new(strip_left: bool, strip_right: bool) -> Self {
        Self {
            strip_left,
            strip_right,
        }
    }
}

impl Normalizer for Strip {
    /// Strip the normalized string inplace
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        if self.strip_left && self.strip_right {
            // Fast path
            normalized.strip();
        } else {
            if self.strip_left {
                normalized.lstrip();
            }

            if self.strip_right {
                normalized.rstrip();
            }
        }

        Ok(())
    }
}

// This normalizer removes combining marks from a normalized string
// It's different from unidecode as it does not attempt to modify
// non ascii languages.
#[derive(Copy, Clone, Debug)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct StripAccents;

impl Normalizer for StripAccents {
    /// Strip the normalized string inplace
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        normalized.filter(|c| !is_combining_mark(c));
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::normalizer::NormalizedString;
    use crate::normalizers::Lowercase;
    use crate::normalizers::NFKD;
    use unicode_normalization_alignments::UnicodeNormalization;

    #[test]
    fn test_strip_accents() {
        // Unicode combining char
        let original: String = "Me llamó".nfkd().map(|(c, _)| c).collect();
        let normalized = "Me llamo";
        assert_ne!(original, normalized);
        let mut n = NormalizedString::from(original);
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);

        // Ignores regular ascii
        let original = "Me llamo";
        let normalized = "Me llamo";
        assert_eq!(original, normalized);
        let mut n = NormalizedString::from(original);
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);

        // Does not change chinese
        let original: String = "这很简单".nfkd().map(|(c, _)| c).collect();
        let normalized = "这很简单";
        assert_eq!(original, normalized);
        let mut n = NormalizedString::from(original);
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);
    }

    #[test]
    fn test_vietnamese_bug() {
        let original: String = "ậ…".to_string();
        let normalized = "a...".to_string();
        assert_ne!(original, normalized);
        let mut n = NormalizedString::from(original);
        NFKD.normalize(&mut n).unwrap();
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);
        Lowercase.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);

        let original: String = "Cụ thể, bạn sẽ tham gia một nhóm các giám đốc điều hành tổ chức, các nhà lãnh đạo doanh nghiệp, các học giả, chuyên gia phát triển và tình nguyện viên riêng biệt trong lĩnh vực phi lợi nhuận…".to_string();
        let normalized = "cu the, ban se tham gia mot nhom cac giam đoc đieu hanh to chuc, cac nha lanh đao doanh nghiep, cac hoc gia, chuyen gia phat trien va tinh nguyen vien rieng biet trong linh vuc phi loi nhuan...".to_string();
        let mut n = NormalizedString::from(original);
        NFKD.normalize(&mut n).unwrap();
        StripAccents.normalize(&mut n).unwrap();
        Lowercase.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);
    }

    #[test]
    fn test_thai_bug() {
        let original = "ำน\u{e49}ำ3ลำ".to_string();
        let normalized = "านา3ลา".to_string();
        assert_ne!(original, normalized);
        let mut n = NormalizedString::from(original);
        NFKD.normalize(&mut n).unwrap();
        StripAccents.normalize(&mut n).unwrap();
        Lowercase.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);
    }

    #[test]
    fn test_strip_accents_multiple() {
        let original = "e\u{304}\u{304}\u{304}o";
        let normalized = "eo";
        assert_ne!(original, normalized);
        let mut n = NormalizedString::from(original);
        StripAccents.normalize(&mut n).unwrap();
        assert_eq!(&n.get(), &normalized);
        assert_eq!(
            n,
            NormalizedString::new(
                original.to_string(),
                normalized.to_string(),
                vec![(0, 1), (7, 8)],
                0
            )
        );
        assert_eq!(
            n.alignments_original(),
            vec![
                (0, 1),
                (1, 1),
                (1, 1),
                (1, 1),
                (1, 1),
                (1, 1),
                (1, 1),
                (1, 2)
            ]
        );
    }
}