Struct tokenizers::tokenizer::pre_tokenizer::PreTokenizedString
pub struct PreTokenizedString { /* private fields */ }
The PreTokenizedString is in charge of splitting an underlying string, keeping track of offsets while doing so, and providing ways to normalize and tokenize these splits.
Once everything has been normalized and tokenized, the PreTokenizedString is able to build an Encoding with all the relevant offsets and word ids, relative to the original string.
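A minimal end-to-end sketch of that flow, using the methods documented below. The closure signatures, the NormalizedString helpers (split, lowercase, get), the OffsetType::Byte variant, and the top-level re-exports in the imports are assumptions drawn from this page and the rest of the crate's documentation, not a verbatim recipe:

```rust
use tokenizers::{OffsetType, PreTokenizedString, Result, SplitDelimiterBehavior, Token};

fn main() -> Result<()> {
    // Wrap the raw input; all offsets stay relative to this original string.
    let mut pretokenized = PreTokenizedString::from("Hello there, World!");

    // 1. Split every current substring on whitespace, dropping the whitespace.
    pretokenized.split(|_, normalized| {
        normalized.split(char::is_whitespace, SplitDelimiterBehavior::Removed)
    })?;

    // 2. Normalize the splits that have no tokens attached yet.
    pretokenized.normalize(|normalized| {
        normalized.lowercase();
        Ok(())
    })?;

    // 3. Attach a Token to every split (dummy id 0 here; a real model
    //    would look the value up in its vocabulary).
    pretokenized.tokenize(|normalized| {
        let value = normalized.get().to_string();
        let len = value.len();
        Ok(vec![Token::new(0, value, (0, len))])
    })?;

    // 4. Build the Encoding, letting the splits define the word ids.
    let encoding = pretokenized.into_encoding(None, 0, OffsetType::Byte)?;
    println!("{:?}", encoding.get_tokens());
    Ok(())
}
```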
Implementations
impl PreTokenizedString
pub fn split<F, U, R>(&mut self, split_fn: F) -> Result<()>
Split the PreTokenizedString by providing a split_fn in charge of splitting each substring (NormalizedString) into multiple parts.
split_fn takes a NormalizedString and returns an iterator over the produced NormalizedString. split_fn is free to modify these NormalizedString as relevant, as long as it respects the constraint stated below.
There is only one constraint that MUST be respected: the produced NormalizedString, if combined back together, must have the same original string as the one given to split_fn. Concretely, this means that for offset tracking to work as expected, split_fn must produce “splits” of the original string.
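A sketch of one possible split_fn, assuming NormalizedString::split and SplitDelimiterBehavior work as in the crate's own pre-tokenizers. It isolates ASCII punctuation into its own splits while keeping every character, so the produced pieces still cover the original string:

```rust
use tokenizers::{PreTokenizedString, Result, SplitDelimiterBehavior};

// Hypothetical helper: every current substring gets re-split so that each
// punctuation character becomes its own split (closure signature assumed:
// it receives the split index and the NormalizedString to subdivide).
fn isolate_punctuation(pretokenized: &mut PreTokenizedString) -> Result<()> {
    pretokenized.split(|_idx, normalized| {
        normalized.split(
            |c: char| c.is_ascii_punctuation(),
            SplitDelimiterBehavior::Isolated,
        )
    })
}
```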
pub fn normalize<F>(&mut self, normalize: F) -> Result<()>
Normalize all the splits that do not have attached Tokens, using the provided normalize function.
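A sketch of a normalize call, assuming the closure receives an &mut NormalizedString and that NormalizedString exposes an nfc() helper as used by the crate's Unicode normalizers:

```rust
use tokenizers::{PreTokenizedString, Result};

// Apply Unicode NFC composition to every split that has no tokens yet.
fn nfc_normalize(pretokenized: &mut PreTokenizedString) -> Result<()> {
    pretokenized.normalize(|normalized| {
        normalized.nfc();
        Ok(())
    })
}
```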
pub fn tokenize<F>(&mut self, tokenize: F) -> Result<()>
Tokenize all the splits that do not have attached Tokens, using the provided tokenize function.
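A sketch of a tokenize closure backed by a toy, hypothetical vocabulary. Token::new(id, value, offsets) is assumed, with offsets relative to the split being tokenized:

```rust
use std::collections::HashMap;
use tokenizers::{PreTokenizedString, Result, Token};

// Attach one Token per split by looking its text up in a toy vocabulary.
// Unknown splits get id 0, and the token's offsets span the whole split.
fn tokenize_with_vocab(
    pretokenized: &mut PreTokenizedString,
    vocab: &HashMap<String, u32>,
) -> Result<()> {
    pretokenized.tokenize(|normalized| {
        let value = normalized.get().to_string();
        let id = vocab.get(&value).copied().unwrap_or(0);
        let len = value.len();
        Ok(vec![Token::new(id, value, (0, len))])
    })
}
```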
pub fn into_encoding(
    self,
    word_idx: Option<u32>,
    type_id: u32,
    offset_type: OffsetType
) -> Result<Encoding>
Transform the current PreTokenizedString into an Encoding.
If a word_idx is provided, any word in the generated Encoding will be set to this value. This is generally used with pre-tokenized input that does not need the PreTokenizedString to generate word ids.
This method will fail if some splits do not have associated Tokens.
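A sketch of the pre-tokenized-input case described above, where the caller already knows which word this string belongs to and passes it as word_idx (here word 0). OffsetType::Byte is an assumed variant name:

```rust
use tokenizers::{Encoding, OffsetType, PreTokenizedString, Result};

// Consume a PreTokenizedString whose splits already carry Tokens.
// Every generated token is assigned to word 0, gets type id 0, and
// its offsets are reported in bytes. Fails if any split has no tokens.
fn encode_single_word(pretokenized: PreTokenizedString) -> Result<Encoding> {
    pretokenized.into_encoding(Some(0), 0, OffsetType::Byte)
}
```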
pub fn get_splits(
    &self,
    offset_ref: OffsetReferential,
    offset_type: OffsetType
) -> Vec<(&str, Offsets, &Option<Vec<Token>>)>
Returns a list of splits, each of them being a slice of the normalized string, the associated offsets (in either the original or the normalized referential), as well as the potential tokens.
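A sketch that dumps the current splits for inspection, assuming OffsetReferential::Original and OffsetType::Byte are the variants for "offsets against the original string, in bytes":

```rust
use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString};

// Print every split with its offsets in the original string (in bytes),
// together with whatever tokens have been attached so far, if any.
fn dump_splits(pretokenized: &PreTokenizedString) {
    for (text, (start, end), tokens) in
        pretokenized.get_splits(OffsetReferential::Original, OffsetType::Byte)
    {
        println!("{text:?} @ {start}..{end}: {tokens:?}");
    }
}
```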
Trait Implementations
impl Clone for PreTokenizedString
fn clone(&self) -> PreTokenizedString
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source.
impl Debug for PreTokenizedString
impl From<&str> for PreTokenizedString
impl From<NormalizedString> for PreTokenizedString
fn from(s: NormalizedString) -> Self
impl From<String> for PreTokenizedString
impl PartialEq for PreTokenizedString
fn eq(&self, other: &PreTokenizedString) -> bool
This method tests for self and other values to be equal, and is used by ==.