Struct tokenizers::tokenizer::TokenizerImpl  
source · pub struct TokenizerImpl<M, N, PT, PP, D> { /* private fields */ }Expand description
A Tokenizer is capable of encoding/decoding any text.
Implementations§
source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
 
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
sourcepub fn with_normalizer(&mut self, normalizer: impl Into<N>) -> &mut Self
 
pub fn with_normalizer(&mut self, normalizer: impl Into<N>) -> &mut Self
Set the normalizer
sourcepub fn get_normalizer(&self) -> Option<&N>
 
pub fn get_normalizer(&self) -> Option<&N>
Get the normalizer
sourcepub fn with_pre_tokenizer(&mut self, pre_tokenizer: impl Into<PT>) -> &mut Self
 
pub fn with_pre_tokenizer(&mut self, pre_tokenizer: impl Into<PT>) -> &mut Self
Set the pre tokenizer
sourcepub fn get_pre_tokenizer(&self) -> Option<&PT>
 
pub fn get_pre_tokenizer(&self) -> Option<&PT>
Get the pre tokenizer
sourcepub fn with_post_processor(
    &mut self,
    post_processor: impl Into<PP>
) -> &mut Self
 
pub fn with_post_processor( &mut self, post_processor: impl Into<PP> ) -> &mut Self
Set the post processor
sourcepub fn get_post_processor(&self) -> Option<&PP>
 
pub fn get_post_processor(&self) -> Option<&PP>
Get the post processor
sourcepub fn with_decoder(&mut self, decoder: impl Into<D>) -> &mut Self
 
pub fn with_decoder(&mut self, decoder: impl Into<D>) -> &mut Self
Set the decoder
sourcepub fn get_decoder(&self) -> Option<&D>
 
pub fn get_decoder(&self) -> Option<&D>
Get the decoder
sourcepub fn with_model(&mut self, model: impl Into<M>) -> &mut Self
 
pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self
Set the model
sourcepub fn with_truncation(
    &mut self,
    trunc: Option<TruncationParams>
) -> Result<&mut Self>
 
pub fn with_truncation( &mut self, trunc: Option<TruncationParams> ) -> Result<&mut Self>
Set the truncation parameters
Fails if stride is too high relative to max_length and post_processor.added_tokens()
sourcepub fn get_truncation(&self) -> Option<&TruncationParams>
 
pub fn get_truncation(&self) -> Option<&TruncationParams>
Get the currently set truncation parameters
sourcepub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
 
pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>
Get a mutable reference to the currently set truncation parameters
sourcepub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self
 
pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self
Set the padding parameters
sourcepub fn get_padding(&self) -> Option<&PaddingParams>
 
pub fn get_padding(&self) -> Option<&PaddingParams>
Get the currently set padding parameters
sourcepub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
 
pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>
Get a mutable reference to the currently set padding parameters
sourcepub fn get_added_tokens_decoder(&self) -> HashMap<u32, AddedToken>
 
pub fn get_added_tokens_decoder(&self) -> HashMap<u32, AddedToken>
Get the added tokens decoder
sourcepub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
 
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize
Get the size of the vocabulary
sourcepub fn token_to_id(&self, token: &str) -> Option<u32>
 
pub fn token_to_id(&self, token: &str) -> Option<u32>
Converts a token into the corresponding id.
sourcepub fn id_to_token(&self, id: u32) -> Option<String>
 
pub fn id_to_token(&self, id: u32) -> Option<String>
Converts an id to the corresponding token.
sourcepub fn set_encode_special_tokens(&mut self, value: bool)
 
pub fn set_encode_special_tokens(&mut self, value: bool)
Set the added vocab’s splitting scheme
sourcepub fn get_encode_special_tokens(&self) -> bool
 
pub fn get_encode_special_tokens(&self) -> bool
Get the current value of the encode_special_tokens flag
sourcepub fn encode<'s, E>(
    &self,
    input: E,
    add_special_tokens: bool
) -> Result<Encoding>where
    E: Into<EncodeInput<'s>>,
 
pub fn encode<'s, E>(
    &self,
    input: E,
    add_special_tokens: bool
) -> Result<Encoding>where
    E: Into<EncodeInput<'s>>,
Encode the given input. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
    &["Sequence", "A"][..],
    &["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);sourcepub fn encode_char_offsets<'s, E>(
    &self,
    input: E,
    add_special_tokens: bool
) -> Result<Encoding>where
    E: Into<EncodeInput<'s>>,
 
pub fn encode_char_offsets<'s, E>(
    &self,
    input: E,
    add_special_tokens: bool
) -> Result<Encoding>where
    E: Into<EncodeInput<'s>>,
Encode the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences, as well as pair sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);
// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
    &["Sequence", "A"][..],
    &["Sequence", "B"][..]
), false);
// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    N: Normalizer,
    M: Model,
 
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    N: Normalizer,
    M: Model,
sourcepub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
 
pub fn add_special_tokens(&mut self, tokens: &[AddedToken]) -> usize
Register the given tokens as special tokens. This is especially useful for removing these special tokens while decoding
sourcepub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
 
pub fn add_tokens(&mut self, tokens: &[AddedToken]) -> usize
Add the given tokens to the added vocabulary
source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    PP: PostProcessor,
 
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    PP: PostProcessor,
source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    M: Model + Send + Sync,
    N: Normalizer + Send + Sync,
    PT: PreTokenizer + Send + Sync,
    PP: PostProcessor + Send + Sync,
    D: Decoder + Send + Sync,
 
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    M: Model + Send + Sync,
    N: Normalizer + Send + Sync,
    PT: PreTokenizer + Send + Sync,
    PP: PostProcessor + Send + Sync,
    D: Decoder + Send + Sync,
sourcepub fn encode_batch<'s, E>(
    &self,
    inputs: Vec<E>,
    add_special_tokens: bool
) -> Result<Vec<Encoding>>
 
pub fn encode_batch<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool ) -> Result<Vec<Encoding>>
Encode all the sentences in parallel, using multiple threads
sourcepub fn encode_batch_char_offsets<'s, E>(
    &self,
    inputs: Vec<E>,
    add_special_tokens: bool
) -> Result<Vec<Encoding>>
 
pub fn encode_batch_char_offsets<'s, E>( &self, inputs: Vec<E>, add_special_tokens: bool ) -> Result<Vec<Encoding>>
Encode all the sentences in parallel, using multiple threads.
The offsets on each Encoding will be relative to chars instead of bytes.
sourcepub fn decode_batch(
    &self,
    sentences: &[&[u32]],
    skip_special_tokens: bool
) -> Result<Vec<String>>
 
pub fn decode_batch( &self, sentences: &[&[u32]], skip_special_tokens: bool ) -> Result<Vec<String>>
Decode all sentences in parallel
source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    M: DeserializeOwned + Model,
    N: DeserializeOwned + Normalizer,
    PT: DeserializeOwned + PreTokenizer,
    PP: DeserializeOwned + PostProcessor,
    D: DeserializeOwned + Decoder,
 
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    M: DeserializeOwned + Model,
    N: DeserializeOwned + Normalizer,
    PT: DeserializeOwned + PreTokenizer,
    PP: DeserializeOwned + PostProcessor,
    D: DeserializeOwned + Decoder,
source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    M: DeserializeOwned + Model,
    N: DeserializeOwned + Normalizer,
    PT: DeserializeOwned + PreTokenizer,
    PP: DeserializeOwned + PostProcessor,
    D: DeserializeOwned + Decoder,
 
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>where
    M: DeserializeOwned + Model,
    N: DeserializeOwned + Normalizer,
    PT: DeserializeOwned + PreTokenizer,
    PP: DeserializeOwned + PostProcessor,
    D: DeserializeOwned + Decoder,
sourcepub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>
 
pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>
Instantiate a new Tokenizer from bytes
source§impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
 
impl<M, N, PT, PP, D> TokenizerImpl<M, N, PT, PP, D>
Trait Implementations§
source§impl<M: Clone, N: Clone, PT: Clone, PP: Clone, D: Clone> Clone for TokenizerImpl<M, N, PT, PP, D>
 
impl<M: Clone, N: Clone, PT: Clone, PP: Clone, D: Clone> Clone for TokenizerImpl<M, N, PT, PP, D>
source§fn clone(&self) -> TokenizerImpl<M, N, PT, PP, D>
 
fn clone(&self) -> TokenizerImpl<M, N, PT, PP, D>
1.0.0 · source§fn clone_from(&mut self, source: &Self)
 
fn clone_from(&mut self, source: &Self)
source. Read more