pub fn build_clip_transformer<P: AsRef<Path>>(
    clip: &Config,
    clip_weights: P,
    device: &Device,
    dtype: DType
) -> Result<ClipTextTransformer>