Struct tokenizers::tokenizer::Encoding
source · pub struct Encoding { /* private fields */ }
Expand description
Represents the output of a `Tokenizer`.
Implementations§
source · impl Encoding
impl Encoding
pub fn new( ids: Vec<u32>, type_ids: Vec<u32>, tokens: Vec<String>, words: Vec<Option<u32>>, offsets: Vec<Offsets>, special_tokens_mask: Vec<u32>, attention_mask: Vec<u32>, overflowing: Vec<Self>, sequence_ranges: HashMap<usize, Range<usize>> ) -> Self
pub fn with_capacity(len: usize) -> Self
pub fn from_tokens(tokens: Vec<Token>, type_id: u32) -> Self
source · pub fn n_sequences(&self) -> usize
pub fn n_sequences(&self) -> usize
Return the number of sequences combined in this Encoding
source · pub fn set_sequence_id(&mut self, sequence_id: usize)
pub fn set_sequence_id(&mut self, sequence_id: usize)
Set the given sequence id for the whole range of tokens contained in this Encoding
pub fn get_tokens(&self) -> &[String]
pub fn get_word_ids(&self) -> &[Option<u32>]
pub fn get_word_ids_mut(&mut self) -> &mut [Option<u32>]
pub fn get_sequence_ids(&self) -> Vec<Option<usize>>
pub fn get_ids(&self) -> &[u32]
pub fn get_type_ids(&self) -> &[u32]
pub fn set_type_ids(&mut self, type_ids: Vec<u32>)
pub fn get_offsets(&self) -> &[Offsets]
pub fn get_offsets_mut(&mut self) -> &mut [Offsets]
pub fn get_special_tokens_mask(&self) -> &[u32]
pub fn get_attention_mask(&self) -> &[u32]
pub fn get_overflowing(&self) -> &Vec<Encoding>
pub fn set_overflowing(&mut self, overflowing: Vec<Encoding>)
pub fn get_overflowing_mut(&mut self) -> &mut Vec<Encoding>
pub fn take_overflowing(&mut self) -> Vec<Encoding>
source · pub fn token_to_sequence(&self, token: usize) -> Option<usize>
pub fn token_to_sequence(&self, token: usize) -> Option<usize>
Returns the index of the sequence containing the given token
source · pub fn word_to_tokens(
&self,
word: u32,
sequence_id: usize
) -> Option<(usize, usize)>
pub fn word_to_tokens( &self, word: u32, sequence_id: usize ) -> Option<(usize, usize)>
Get the encoded tokens corresponding to the word at the given index in the input sequence, with the form (start_token, end_token + 1)
source · pub fn word_to_chars(&self, word: u32, sequence_id: usize) -> Option<Offsets>
pub fn word_to_chars(&self, word: u32, sequence_id: usize) -> Option<Offsets>
Get the offsets of the word at the given index in the input sequence.
source · pub fn token_to_chars(&self, token: usize) -> Option<(usize, Offsets)>
pub fn token_to_chars(&self, token: usize) -> Option<(usize, Offsets)>
Get the offsets of the token at the given index.
source · pub fn token_to_word(&self, token: usize) -> Option<(usize, u32)>
pub fn token_to_word(&self, token: usize) -> Option<(usize, u32)>
Get the word that contains the token at the given index.
source · pub fn char_to_token(&self, pos: usize, sequence_id: usize) -> Option<usize>
pub fn char_to_token(&self, pos: usize, sequence_id: usize) -> Option<usize>
Get the token that contains the given char.
source · pub fn char_to_word(&self, pos: usize, sequence_id: usize) -> Option<u32>
pub fn char_to_word(&self, pos: usize, sequence_id: usize) -> Option<u32>
Get the word that contains the given char.
source · pub fn truncate(
&mut self,
max_len: usize,
stride: usize,
direction: TruncationDirection
)
pub fn truncate( &mut self, max_len: usize, stride: usize, direction: TruncationDirection )
Truncate the current `Encoding`.
Panics if `stride >= max_len`.
source · pub fn merge<I: IntoIterator<Item = Encoding>>(
encodings: I,
growing_offsets: bool
) -> Self
pub fn merge<I: IntoIterator<Item = Encoding>>( encodings: I, growing_offsets: bool ) -> Self
Merge all Encodings together
source · pub fn merge_with(&mut self, pair: Encoding, growing_offsets: bool)
pub fn merge_with(&mut self, pair: Encoding, growing_offsets: bool)
Merge this `Encoding` with the given `Encoding`. Happens in place.