use regex::Regex;
use crate::tokenizer::{
pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
};
use crate::utils::macro_rules_attribute;
#[derive(Clone, Debug, PartialEq, Eq)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct Whitespace;
impl Default for Whitespace {
fn default() -> Self {
Self
}
}
impl PreTokenizer for Whitespace {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
lazy_static! {
static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
}
let re_ref: &Regex = &RE;
pretokenized.split(|_, normalized| {
normalized.split(Invert(re_ref), SplitDelimiterBehavior::Removed)
})
}
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
#[macro_rules_attribute(impl_serde_type!)]
pub struct WhitespaceSplit;
impl PreTokenizer for WhitespaceSplit {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
pretokenized.split(|_, normalized| {
normalized.split(char::is_whitespace, SplitDelimiterBehavior::Removed)
})
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{OffsetReferential, OffsetType, PreTokenizer};
#[test]
fn basic() {
let tests = vec![
(
"Hey man!",
vec![("Hey", (0, 3)), ("man", (4, 7)), ("!", (7, 8))],
),
(
"How are you doing?",
vec![
("How", (0, 3)),
("are", (4, 7)),
("you", (8, 11)),
("doing", (12, 17)),
("?", (17, 18)),
],
),
("\n", vec![]),
];
let pretok = Whitespace {};
for (s, res) in tests {
let mut pretokenized = PreTokenizedString::from(s);
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Original, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
res
);
}
}
#[test]
fn whitespace_split() {
let tests = vec![
("Hey man!", vec![("Hey", (0, 3)), ("man!", (4, 8))]),
(
"Hey, man, Good?",
vec![("Hey,", (0, 4)), ("man,", (5, 9)), ("Good?", (10, 15))],
),
];
let pretok = WhitespaceSplit;
for (s, res) in tests {
let mut pretokenized = PreTokenizedString::from(s);
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Original, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
res
);
}
}
}