Struct tokenizers::pre_tokenizers::byte_level::ByteLevel

source ·

#[non_exhaustive]
pub struct ByteLevel {
    pub add_prefix_space: bool,
    pub trim_offsets: bool,
    pub use_regex: bool,
}

Expand description

Provides all the necessary steps to handle BPE tokenization at the byte level. Takes care of all the required processing steps to transform a UTF-8 string as needed before and after the BPE model does its job.

Fields (Non-exhaustive)§

This struct is marked as non-exhaustive

Non-exhaustive structs could have additional fields added in future. Therefore, non-exhaustive structs cannot be constructed in external crates using the traditional Struct { .. } syntax; cannot be matched against without a wildcard ..; and struct update syntax will not work.

§add_prefix_space: bool

Whether to add a leading space to the first word. This allows the leading word to be treated just like any other word.

§trim_offsets: bool

Whether the post-processing step should trim offsets to avoid including whitespace.

§use_regex: bool

Whether to use the standard GPT2 regex for whitespace splitting. Set it to `false` if you want to use your own splitting.

Struct tokenizers::pre_tokenizers::byte_level::ByteLevel

Fields (Non-exhaustive)§

Implementations§

impl ByteLevel

pub fn new(add_prefix_space: bool, trim_offsets: bool, use_regex: bool) -> Self

pub fn alphabet() -> HashSet<char>

pub fn add_prefix_space(self, v: bool) -> Self

pub fn trim_offsets(self, v: bool) -> Self

pub fn use_regex(self, v: bool) -> Self

Trait Implementations§

impl Clone for ByteLevel

fn clone(&self) -> ByteLevel

fn clone_from(&mut self, source: &Self)

impl Debug for ByteLevel

fn fmt(&self, f: &mut Formatter<'_>) -> Result

impl Decoder for ByteLevel

fn decode_chain(&self, tokens: Vec<String>) -> Result<Vec<String>>

fn decode(&self, tokens: Vec<String>) -> Result<String>

impl Default for ByteLevel

fn default() -> Self

impl<'de> Deserialize<'de> for ByteLevel

fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error> where __D: Deserializer<'de>,

impl From<ByteLevel> for DecoderWrapper

fn from(from: ByteLevel) -> Self

impl From<ByteLevel> for PostProcessorWrapper

fn from(from: ByteLevel) -> Self

impl From<ByteLevel> for PreTokenizerWrapper

fn from(from: ByteLevel) -> Self

impl PartialEq for ByteLevel

fn eq(&self, other: &ByteLevel) -> bool

fn ne(&self, other: &Rhs) -> bool

impl PostProcessor for ByteLevel

fn added_tokens(&self, _is_pair: bool) -> usize

fn process_encodings( &self, encodings: Vec<Encoding>, _add_special_tokens: bool ) -> Result<Vec<Encoding>>

fn process( &self, encoding: Encoding, pair_encoding: Option<Encoding>, add_special_tokens: bool ) -> Result<Encoding>

impl PreTokenizer for ByteLevel

fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()>

impl Serialize for ByteLevel

fn serialize<__S>(&self, __serializer: __S) -> Result<__S::Ok, __S::Error> where __S: Serializer,

impl Copy for ByteLevel

impl Eq for ByteLevel

impl StructuralPartialEq for ByteLevel

Auto Trait Implementations§

impl Freeze for ByteLevel

impl RefUnwindSafe for ByteLevel

impl Send for ByteLevel

impl Sync for ByteLevel

impl Unpin for ByteLevel

impl UnwindSafe for ByteLevel

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> Pointable for T

const ALIGN: usize = _

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> DeserializeOwned for T where T: for<'de> Deserialize<'de>,

Struct tokenizers::pre_tokenizers::byte_level::ByteLevel

fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>,

fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

impl<T> DeserializeOwned for T
where T: for<'de> Deserialize<'de>,