1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
use super::model::Unigram;
use serde::{
    de::{Error, MapAccess, Visitor},
    ser::SerializeStruct,
    Deserialize, Deserializer, Serialize, Serializer,
};

impl Serialize for Unigram {
    /// Serializes the model as a struct named `"Unigram"` with four fields,
    /// written in this order: `type` (always the string `"Unigram"`),
    /// `unk_id`, `vocab` and `byte_fallback`.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by the underlying serializer.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // The length hint must equal the number of `serialize_field` calls
        // below; serializers are allowed to rely on it.
        let mut model = serializer.serialize_struct("Unigram", 4)?;

        model.serialize_field("type", "Unigram")?;
        model.serialize_field("unk_id", &self.unk_id)?;
        model.serialize_field("vocab", &self.vocab)?;
        model.serialize_field("byte_fallback", &self.byte_fallback())?;

        model.end()
    }
}

impl<'de> Deserialize<'de> for Unigram {
    /// Deserializes a `Unigram` by asking the deserializer for a struct named
    /// `"Unigram"` and handing the result to [`UnigramVisitor`].
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        // Field names announced to the deserializer.
        const FIELDS: &[&str] = &["type", "vocab", "unk_id", "byte_fallback"];

        deserializer.deserialize_struct("Unigram", FIELDS, UnigramVisitor)
    }
}

struct UnigramVisitor;
impl<'de> Visitor<'de> for UnigramVisitor {
    type Value = Unigram;

    fn expecting(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(fmt, "struct Unigram")
    }

    fn visit_map<V>(self, mut map: V) -> std::result::Result<Self::Value, V::Error>
    where
        V: MapAccess<'de>,
    {
        let mut vocab: Option<Vec<(String, f64)>> = None;
        let mut unk_id: Option<usize> = None;
        let mut byte_fallback: bool = false;
        while let Some(key) = map.next_key::<String>()? {
            match key.as_ref() {
                "unk_id" => {
                    unk_id = map.next_value()?;
                }
                "byte_fallback" => byte_fallback = map.next_value()?,
                "vocab" => vocab = Some(map.next_value()?),
                "type" => match map.next_value()? {
                    "Unigram" => {}
                    u => {
                        return Err(serde::de::Error::invalid_value(
                            serde::de::Unexpected::Str(u),
                            &"Unigram",
                        ))
                    }
                },
                _ => (),
            }
        }
        match (vocab, unk_id, byte_fallback) {
            (Some(vocab), unk_id, byte_fallback) => Ok(Unigram::from(vocab, unk_id, byte_fallback)
                .map_err(|err| Error::custom(format!("Unable to load vocab {:?}", err)))?),
            (None, _, _) => Err(Error::custom("Missing vocab")),
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Builds a model from `vocab` and `unk_id` (byte fallback disabled),
    /// serializes it to JSON, parses it back and asserts that the result
    /// equals the original model.
    fn assert_round_trip(vocab: Vec<(String, f64)>, unk_id: Option<usize>) {
        let original = Unigram::from(vocab, unk_id, false).unwrap();

        let json = serde_json::to_string(&original).unwrap();
        let restored: Unigram = serde_json::from_str(&json).unwrap();

        assert_eq!(original, restored);
    }

    // Unknown token stored at index 0.
    #[test]
    fn test_serialization() {
        assert_round_trip(
            vec![("<unk>".to_string(), 0.0), ("a".to_string(), -0.5)],
            Some(0),
        );
    }

    // Unknown token stored at a non-zero index.
    #[test]
    fn test_serialization_unk_id_not_zero() {
        assert_round_trip(
            vec![("a".to_string(), -0.5), ("<unk>".to_string(), 0.0)],
            Some(1),
        );
    }

    // No unknown token configured at all.
    #[test]
    fn test_serialization_no_unk_id() {
        assert_round_trip(vec![("a".to_string(), -0.5)], None);
    }
}