use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::macro_rules_attribute;
/// Stateless normalizer whose [`Normalizer`] implementation calls
/// `NormalizedString::nfd` (Unicode Normalization Form D, going by the
/// method name).
///
/// The type has no fields, so any two values of it are interchangeable.
#[derive(Default, Copy, Clone, Debug)]
// NOTE(review): `impl_serde_type!` is defined outside this file; presumably it
// generates the serde (de)serialization impls for this type — confirm there.
#[macro_rules_attribute(impl_serde_type!)]
pub struct NFD;
impl Normalizer for NFD {
    /// Runs the `nfd` pass on `normalized` through its mutable reference.
    ///
    /// This implementation has no failure path: it always returns `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        // Whatever `nfd` returns is not needed; only its effect on the string is.
        normalized.nfd();

        Ok(())
    }
}
/// Stateless normalizer whose [`Normalizer`] implementation calls
/// `NormalizedString::nfkd` (Unicode Normalization Form KD, going by the
/// method name).
///
/// The type has no fields, so any two values of it are interchangeable.
#[derive(Default, Copy, Clone, Debug)]
// NOTE(review): `impl_serde_type!` is defined outside this file; presumably it
// generates the serde (de)serialization impls for this type — confirm there.
#[macro_rules_attribute(impl_serde_type!)]
pub struct NFKD;
impl Normalizer for NFKD {
    /// Runs the `nfkd` pass on `normalized` through its mutable reference.
    ///
    /// This implementation has no failure path: it always returns `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        // Whatever `nfkd` returns is not needed; only its effect on the string is.
        normalized.nfkd();

        Ok(())
    }
}
/// Stateless normalizer whose [`Normalizer`] implementation calls
/// `NormalizedString::nfc` (Unicode Normalization Form C, going by the
/// method name).
///
/// The type has no fields, so any two values of it are interchangeable.
#[derive(Default, Copy, Clone, Debug)]
// NOTE(review): `impl_serde_type!` is defined outside this file; presumably it
// generates the serde (de)serialization impls for this type — confirm there.
#[macro_rules_attribute(impl_serde_type!)]
pub struct NFC;
impl Normalizer for NFC {
    /// Runs the `nfc` pass on `normalized` through its mutable reference.
    ///
    /// This implementation has no failure path: it always returns `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        // Whatever `nfc` returns is not needed; only its effect on the string is.
        normalized.nfc();

        Ok(())
    }
}
/// Stateless normalizer whose [`Normalizer`] implementation calls
/// `NormalizedString::nfkc` (Unicode Normalization Form KC, going by the
/// method name).
///
/// The unit test in this file shows it turning the single code point
/// U+FB01 into the two-character string `"fi"`.
#[derive(Default, Copy, Clone, Debug)]
// NOTE(review): `impl_serde_type!` is defined outside this file; presumably it
// generates the serde (de)serialization impls for this type — confirm there.
#[macro_rules_attribute(impl_serde_type!)]
pub struct NFKC;
impl Normalizer for NFKC {
    /// Runs the `nfkc` pass on `normalized` through its mutable reference.
    ///
    /// This implementation has no failure path: it always returns `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        // Whatever `nfkc` returns is not needed; only its effect on the string is.
        normalized.nfkc();

        Ok(())
    }
}
/// Cleans up `normalized` with two chained passes, `filter` then `map`.
///
/// 1. The `filter` predicate is `false` for U+0001..=U+0008, U+000B,
///    U+000E..=U+001F, U+007F, U+008F and U+009F, and `true` for everything
///    else (assuming `filter` keeps characters whose predicate is `true`, as
///    `Iterator::filter` does, those code points are removed).
/// 2. The `map` closure turns U+0009, U+000A, U+000C, U+000D, U+1680,
///    U+200B..=U+200F, U+2028, U+2029, U+2581, U+FEFF and U+FFFD into an ASCII
///    space (`' '`) and returns every other character unchanged.
///
/// U+0000 is in neither table, so both closures leave it alone.
fn do_nmt(normalized: &mut NormalizedString) {
    /// `true` for the code points the filter pass rejects.
    fn is_rejected(code: u32) -> bool {
        matches!(
            code,
            0x0001..=0x0008 | 0x000B | 0x000E..=0x001F | 0x007F | 0x008F | 0x009F
        )
    }

    /// `true` for the code points the map pass rewrites to `' '`.
    fn becomes_space(code: u32) -> bool {
        matches!(
            code,
            0x0009
                | 0x000A
                | 0x000C
                | 0x000D
                | 0x1680
                | 0x200B..=0x200F
                | 0x2028
                | 0x2029
                | 0x2581
                | 0xFEFF
                | 0xFFFD
        )
    }

    normalized
        .filter(|ch| !is_rejected(ch as u32))
        .map(|ch| if becomes_space(ch as u32) { ' ' } else { ch });
}
/// Stateless normalizer whose [`Normalizer`] implementation runs `do_nmt`
/// on the string: a filter pass over a fixed set of control code points
/// followed by a map pass that rewrites a fixed set of code points to `' '`.
///
/// The type has no fields, so any two values of it are interchangeable.
#[derive(Default, Copy, Clone, Debug)]
// NOTE(review): `impl_serde_type!` is defined outside this file; presumably it
// generates the serde (de)serialization impls for this type — confirm there.
#[macro_rules_attribute(impl_serde_type!)]
pub struct Nmt;
impl Normalizer for Nmt {
    /// Hands `normalized` to `do_nmt`, which edits it through the mutable
    /// reference.
    ///
    /// This implementation has no failure path: it always returns `Ok(())`.
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        do_nmt(normalized);

        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `NFKC` must turn the single code point U+FB01 (3 bytes in UTF-8) into
    /// the two ASCII characters `"fi"`, with the expected alignments in both
    /// directions.
    #[test]
    fn test_nfkc() {
        let source = String::from("\u{fb01}");
        let expected_text = String::from("fi");

        let mut actual = NormalizedString::from(source.clone());
        NFKC.normalize(&mut actual).unwrap();

        // NOTE(review): presumably one `(0, 3)` entry per normalized byte, each
        // pointing at the 3-byte original — confirm against `NormalizedString::new`.
        let expected = NormalizedString::new(source, expected_text, vec![(0, 3), (0, 3)], 0);
        assert_eq!(actual, expected);

        // NOTE(review): presumably one `(0, 2)` entry per original byte, each
        // pointing at the 2-byte normalized text — confirm against
        // `alignments_original`.
        assert_eq!(
            actual.alignments_original(),
            vec![(0, 2), (0, 2), (0, 2)]
        );
    }
}