Trait tokenizers::tokenizer::Model
source · pub trait Model {
type Trainer: Trainer + Sync;
// Required methods
fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>;
fn token_to_id(&self, token: &str) -> Option<u32>;
fn id_to_token(&self, id: u32) -> Option<String>;
fn get_vocab(&self) -> HashMap<String, u32>;
fn get_vocab_size(&self) -> usize;
fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>;
fn get_trainer(&self) -> <Self as Model>::Trainer;
}
Expand description
Represents a model used during Tokenization (like BPE or Word or Unigram).
Required Associated Types§
Required Methods§
sourcefn tokenize(&self, sequence: &str) -> Result<Vec<Token>>
fn tokenize(&self, sequence: &str) -> Result<Vec<Token>>
Tokenize the given sequence into multiple underlying `Token`s. The offsets
on each `Token` are expected to be relative to the given sequence.
sourcefn token_to_id(&self, token: &str) -> Option<u32>
fn token_to_id(&self, token: &str) -> Option<u32>
Find the ID associated with a string token.
sourcefn id_to_token(&self, id: u32) -> Option<String>
fn id_to_token(&self, id: u32) -> Option<String>
Find the string token associated with an ID.
sourcefn get_vocab(&self) -> HashMap<String, u32>
fn get_vocab(&self) -> HashMap<String, u32>
Retrieve the entire vocabulary mapping (token -> ID)
sourcefn get_vocab_size(&self) -> usize
fn get_vocab_size(&self) -> usize
Retrieve the size of the vocabulary
sourcefn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>
fn save(&self, folder: &Path, prefix: Option<&str>) -> Result<Vec<PathBuf>>
Save the current `Model` in the given `folder`, using the given `prefix`
for the various files that need to be saved.
sourcefn get_trainer(&self) -> <Self as Model>::Trainer
fn get_trainer(&self) -> <Self as Model>::Trainer
Get an instance of a `Trainer` capable of training this `Model`.