use crate::tokenizer::{NormalizedString, Normalizer, Result};
pub use spm_precompiled::Precompiled;
use std::cmp::Ordering;
use unicode_segmentation::UnicodeSegmentation;
fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &str) {
let old_count = old_part.chars().count() as isize;
let new_count = new_part.chars().count() as isize;
let diff = new_count - old_count;
transformations.extend(new_part.chars().map(|c| (c, 0)));
match diff.cmp(&0) {
Ordering::Greater => {
transformations
.iter_mut()
.rev()
.take(diff as usize)
.for_each(|(_, cs)| *cs = 1);
}
Ordering::Less => {
if let Some((_, cs)) = transformations.last_mut() {
*cs += diff;
}
}
_ => {}
}
}
impl Normalizer for Precompiled {
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
let mut transformations = Vec::with_capacity(normalized.get().len());
let mut modified = false;
normalized.get().graphemes(true).for_each(|grapheme| {
if grapheme.len() < 6 {
if let Some(norm) = self.transform(grapheme) {
modified = true;
replace(&mut transformations, grapheme, norm);
return;
}
}
for (char_index, c) in grapheme.char_indices() {
let part = &grapheme[char_index..char_index + c.len_utf8()];
if let Some(norm) = self.transform(part) {
modified = true;
replace(&mut transformations, part, norm);
} else {
transformations.push((c, 0));
}
}
});
if modified {
normalized.transform(transformations, 0);
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn expansion_followed_by_removal() {
let mut transformations = vec![];
let mut n = NormalizedString::from("™\x1eg");
replace(&mut transformations, "™", "TM");
replace(&mut transformations, "\x1e", "");
transformations.push(('g', 0));
n.transform(transformations, 0);
assert_eq!(n.get(), "TMg");
}
}