use crate::tokenizer::{NormalizedString, Normalizer, Result};
use serde::{Deserialize, Serialize};
use unicode_categories::UnicodeCategories;

/// Checks whether a character is whitespace
fn is_whitespace(c: char) -> bool {
    // These are technically control characters but we count them as whitespace
    match c {
        '\t' | '\n' | '\r' => true,
        _ => c.is_whitespace(),
    }
}

/// Checks whether a character is a control character
fn is_control(c: char) -> bool {
    match c {
        // These are technically control characters, but we count them as whitespace
        '\t' | '\n' | '\r' => false,
        // The definition of `is_control` here is quite broad: `is_other` matches
        // the whole `Other` category, i.e. Cc as well as Cf, Cn and Co
        // cf. https://unicode.org/reports/tr44/ (Table 12)
        _ => c.is_other(),
    }
}
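
// A quick sanity sketch (as a unit test, so it stays out of release builds)
// of the two helpers above: tab, newline and carriage return count as
// whitespace rather than control characters, while other `Cc` characters
// such as NUL do not.
#[cfg(test)]
mod char_class_sketch {
    use super::{is_control, is_whitespace};

    #[test]
    fn whitespace_vs_control() {
        assert!(is_whitespace('\t') && !is_control('\t'));
        assert!(is_whitespace('\n') && !is_control('\n'));
        // NUL is a control character, not whitespace
        assert!(!is_whitespace('\u{0}') && is_control('\u{0}'));
        // NO-BREAK SPACE is one of the "all sorts of whitespace" cases
        assert!(is_whitespace('\u{00A0}') && !is_control('\u{00A0}'));
    }
}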

/// Checks whether a character is Chinese.
/// This defines a "chinese character" as anything in the CJK Unicode block:
///   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
///
/// Note that the CJK Unicode block is NOT all Japanese and Korean characters,
/// despite its name. The modern Korean Hangul alphabet is a different block,
/// as are Japanese Hiragana and Katakana. Those alphabets are used to write
/// space-separated words, so they are not treated specially and are handled
/// like all of the other languages.
fn is_chinese_char(c: char) -> bool {
    matches!(
        c as usize,
        // CJK Unified Ideographs
        0x4E00..=0x9FFF |
        // CJK Unified Ideographs Extension A
        0x3400..=0x4DBF |
        // CJK Unified Ideographs Extension B
        0x20000..=0x2A6DF |
        // CJK Unified Ideographs Extension C
        0x2A700..=0x2B73F |
        // CJK Unified Ideographs Extension D
        0x2B740..=0x2B81F |
        // CJK Unified Ideographs Extension E (U+2B820..U+2CEAF)
        0x2B820..=0x2CEAF |
        // CJK Compatibility Ideographs
        0xF900..=0xFAFF |
        // CJK Compatibility Ideographs Supplement
        0x2F800..=0x2FA1F
    )
}
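
// A sketch of the block boundaries above: a Han ideograph matches, while
// Hiragana and Hangul (which live in other Unicode blocks, as the doc
// comment notes) do not.
#[cfg(test)]
mod cjk_sketch {
    use super::is_chinese_char;

    #[test]
    fn han_ideographs_only() {
        assert!(is_chinese_char('中')); // U+4E2D, CJK Unified Ideographs
        assert!(!is_chinese_char('あ')); // U+3042, Hiragana
        assert!(!is_chinese_char('한')); // U+D55C, Hangul Syllables
    }
}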

#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")]
#[non_exhaustive]
pub struct BertNormalizer {
    /// Whether to do the BERT basic cleaning:
    ///   1. Remove any control characters
    ///   2. Replace all sorts of whitespace with the classic one ` `
    pub clean_text: bool,
    /// Whether to put spaces around Chinese characters so they get split
    pub handle_chinese_chars: bool,
    /// Whether to strip accents
    pub strip_accents: Option<bool>,
    /// Whether to lowercase the input
    pub lowercase: bool,
}

impl Default for BertNormalizer {
    fn default() -> Self {
        Self {
            clean_text: true,
            handle_chinese_chars: true,
            strip_accents: None,
            lowercase: true,
        }
    }
}

impl BertNormalizer {
    pub fn new(
        clean_text: bool,
        handle_chinese_chars: bool,
        strip_accents: Option<bool>,
        lowercase: bool,
    ) -> Self {
        Self {
            clean_text,
            handle_chinese_chars,
            strip_accents,
            lowercase,
        }
    }

    /// Strips the NUL byte, U+FFFD and control characters, and folds all
    /// whitespace variants into a single ASCII space
    fn do_clean_text(&self, normalized: &mut NormalizedString) {
        normalized
            .filter(|c| !(c as usize == 0 || c as usize == 0xfffd || is_control(c)))
            .map(|c| if is_whitespace(c) { ' ' } else { c });
    }

    /// Surrounds every CJK ideograph with spaces so it later gets split into
    /// its own token
    fn do_handle_chinese_chars(&self, normalized: &mut NormalizedString) {
        let mut new_chars: Vec<(char, isize)> = vec![];
        normalized.for_each(|c| {
            if is_chinese_char(c) {
                // `0` marks a char aligned with the original one,
                // `1` marks an inserted char
                new_chars.extend([(' ', 0), (c, 1), (' ', 1)]);
            } else {
                new_chars.push((c, 0));
            }
        });
        normalized.transform(new_chars, 0);
    }

    /// Decomposes to NFD, then drops the combining (non-spacing) marks
    fn do_strip_accents(&self, normalized: &mut NormalizedString) {
        normalized.nfd().filter(|c| !c.is_mark_nonspacing());
    }

    fn do_lowercase(&self, normalized: &mut NormalizedString) {
        normalized.lowercase();
    }
}
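
// A construction sketch: `default()` mirrors the original BERT settings,
// while `new` lets callers opt out of individual steps. A `strip_accents`
// of `None` means "follow `lowercase`" (see `normalize` below).
#[cfg(test)]
mod construction_sketch {
    use super::BertNormalizer;

    #[test]
    fn defaults_and_overrides() {
        let default = BertNormalizer::default();
        assert!(default.clean_text && default.handle_chinese_chars && default.lowercase);
        assert!(default.strip_accents.is_none());

        // Keep casing but still strip accents
        let keep_case = BertNormalizer::new(true, true, Some(true), false);
        assert!(!keep_case.lowercase && keep_case.strip_accents == Some(true));
    }
}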

impl Normalizer for BertNormalizer {
    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
        if self.clean_text {
            self.do_clean_text(normalized);
        }
        if self.handle_chinese_chars {
            self.do_handle_chinese_chars(normalized);
        }
        // Unless explicitly set, accents are stripped whenever we lowercase
        let strip_accents = self.strip_accents.unwrap_or(self.lowercase);
        if strip_accents {
            self.do_strip_accents(normalized);
        }
        if self.lowercase {
            self.do_lowercase(normalized);
        }
        Ok(())
    }
}
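
// An end-to-end sketch of the pipeline, assuming the `From<&str>` and
// `get()` conveniences of `NormalizedString` used elsewhere in this crate:
// the NUL byte is dropped, the ideograph is padded with spaces, the accent
// is stripped and the result is lowercased.
#[cfg(test)]
mod pipeline_sketch {
    use super::*;

    #[test]
    fn full_pipeline() {
        let normalizer = BertNormalizer::default();
        let mut normalized = NormalizedString::from("Héllo\u{0}好");
        normalizer.normalize(&mut normalized).unwrap();
        assert_eq!(normalized.get(), "hello 好 ");
    }
}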