Struct tokenizers::tokenizer::Tokenizer

source ·

pub struct Tokenizer(/* private fields */);

Implementations§

source §

impl Tokenizer

source

pub fn new(model: impl Into<ModelWrapper>) -> Self

Construct a new Tokenizer based on the model.

source

pub fn into_inner( self ) -> TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>

Unwrap the TokenizerImpl.

source

pub fn from_file<P: AsRef<Path>>(file: P) -> Result<Self>

source

pub fn from_bytes<P: AsRef<[u8]>>(bytes: P) -> Result<Self>

Methods from Deref<Target = TokenizerImpl<ModelWrapper, NormalizerWrapper, PreTokenizerWrapper, PostProcessorWrapper, DecoderWrapper>>§

source

pub fn with_normalizer(&mut self, normalizer: impl Into<N>) -> &mut Self

Set the normalizer

source

pub fn get_normalizer(&self) -> Option<&N>

Get the normalizer

source

pub fn with_pre_tokenizer(&mut self, pre_tokenizer: impl Into<PT>) -> &mut Self

Set the pre tokenizer

source

pub fn get_pre_tokenizer(&self) -> Option<&PT>

Get the pre tokenizer

source

pub fn with_post_processor( &mut self, post_processor: impl Into<PP> ) -> &mut Self

Set the post processor

source

pub fn get_post_processor(&self) -> Option<&PP>

Get the post processor

source

pub fn with_decoder(&mut self, decoder: impl Into<D>) -> &mut Self

Set the decoder

source

pub fn get_decoder(&self) -> Option<&D>

Get the decoder

source

pub fn with_model(&mut self, model: impl Into<M>) -> &mut Self

Set the model

source

pub fn get_model(&self) -> &M

Get the model

source

pub fn with_truncation( &mut self, trunc: Option<TruncationParams> ) -> Result<&mut Self>

Set the truncation parameters

Fails if stride is too high relative to max_length and post_processor.added_tokens()

source

pub fn get_truncation(&self) -> Option<&TruncationParams>

Get the currently set truncation parameters

source

pub fn get_truncation_mut(&mut self) -> Option<&mut TruncationParams>

Get a mutable reference to the currently set truncation parameters

source

pub fn with_padding(&mut self, padding: Option<PaddingParams>) -> &mut Self

Set the padding parameters

source

pub fn get_padding(&self) -> Option<&PaddingParams>

Get the currently set padding parameters

source

pub fn get_padding_mut(&mut self) -> Option<&mut PaddingParams>

Get a mutable reference to the currently set padding parameters

source

pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32>

Get the vocabulary

source

pub fn get_added_tokens_decoder(&self) -> HashMap<u32, AddedToken>

Get the added tokens decoder

source

pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize

Get the size of the vocabulary

source

pub fn token_to_id(&self, token: &str) -> Option<u32>

Converts a token to the corresponding id.

source

pub fn id_to_token(&self, id: u32) -> Option<String>

Converts an id to the corresponding token.

source

pub fn set_encode_special_tokens(&mut self, value: bool)

Set the added vocab’s splitting scheme

source

pub fn get_encode_special_tokens(&self) -> bool

Get the current value of the added vocab’s encode_special_tokens setting (the flag set by set_encode_special_tokens)

source

pub fn encode<'s, E>( &self, input: E, add_special_tokens: bool ) -> Result<Encoding>
where E: Into<EncodeInput<'s>>,

Encode the given input. This method accepts both single sequences and pairs of sequences. Each sequence can be either a raw string or an already pre-tokenized input:

// Sequences:
tokenizer.encode("Single sequence", false);
tokenizer.encode(("Sequence A", "Sequence B"), false);

// Pre-tokenized sequences:
tokenizer.encode(&["Single", "sequence"][..], false);
tokenizer.encode((
    &["Sequence", "A"][..],
    &["Sequence", "B"][..]
), false);

// or even both types together:
tokenizer.encode(("A complete sequence", &["And", "a", "tokenized"][..]), false);

source

pub fn encode_char_offsets<'s, E>( &self, input: E, add_special_tokens: bool ) -> Result<Encoding>
where E: Into<EncodeInput<'s>>,

Encode the given input, using offsets relative to chars instead of bytes. This method accepts both single sequences and pairs of sequences. Each sequence can be either a raw string or an already pre-tokenized input:

// Sequences:
tokenizer.encode_char_offsets("Single sequence", false);
tokenizer.encode_char_offsets(("Sequence A", "Sequence B"), false);

// Pre-tokenized sequences:
tokenizer.encode_char_offsets(&["Single", "sequence"][..], false);
tokenizer.encode_char_offsets((
    &["Sequence", "A"][..],
    &["Sequence", "B"][..]
), false);

// or even both types together:
tokenizer.encode_char_offsets(("A complete sequence", &["And", "a", "tokenized"][..]), false);