Add Non-Human-Readable Deserialization for Felt Using Serde #76

Open · wants to merge 8 commits into base: main
226 changes: 120 additions & 106 deletions crates/starknet-types-core/src/felt/mod.rs
@@ -913,6 +913,8 @@ mod serde_impl {

use super::*;

const COMPRESSED: u8 = 0b1000_0000;

impl Serialize for Felt {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
@@ -921,11 +923,23 @@ mod serde_impl {
if serializer.is_human_readable() {
serializer.serialize_str(&format!("{:#x}", self))
} else {
let mut seq = serializer.serialize_seq(Some(32))?;
for b in self.to_bytes_be() {
seq.serialize_element(&b)?;
let bytes = self.to_bytes_be();
Collaborator:

You can avoid the if/else nesting on first_significant_byte > 1 by doing the following:

let be_bytes = self.to_bytes_be();
let mut i = match be_bytes.iter().position(|&b| b != 0) {
    Some(i) => i,
    None => { /* handle Felt::ZERO and return */ }
};
let mut seq = serializer.serialize_seq(None)?;
if i != 0 {
    seq.serialize_element(&((32 - i) as u8 | COMPRESSED))?;
}
while i < 32 {
    seq.serialize_element(&be_bytes[i])?;
    i += 1;
}
seq.end();

let first_significant_byte = bytes.iter().position(|&b| b != 0).unwrap_or(31);
if first_significant_byte > 1 {
let len = 32 - first_significant_byte;
let mut seq = serializer.serialize_seq(Some(len + 1))?;
Collaborator:

Because we are already storing the size, we should call serialize_seq with None in order to spare the extra byte which contains the len at the beginning.
Otherwise there is no point in storing the len a second time ourselves.

Collaborator:

It makes our worst case 33 bytes and our best 3 bytes (value, our len, their len). We want 32 and 2.
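
For illustration, a rough sketch of the resulting layouts, assuming a binary format that writes the Some(n) sequence length as a single byte (the real width depends on the serializer and its configuration):

// Felt::TWO, compressed branch, serialize_seq(Some(len + 1)):
//   [0x02, 0x81, 0x02]        // format's len, flag | our len, value  -> 3 bytes
// A felt with 32 significant bytes, uncompressed branch, serialize_seq(Some(32)):
//   [0x20, b0, b1, .., b31]   // format's len + 32 value bytes        -> 33 bytes
// With serialize_seq(None), where the format supports it, the format's own
// prefix could be dropped, giving 2 and 32 bytes respectively.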

seq.serialize_element(&(len as u8 | COMPRESSED))?;
for b in &bytes[first_significant_byte..] {
seq.serialize_element(b)?;
}
seq.end()
} else {
let mut seq = serializer.serialize_seq(Some(32))?;
for b in &bytes {
seq.serialize_element(b)?;
}
seq.end()
}
seq.end()
}
}
}
@@ -935,7 +949,11 @@ mod serde_impl {
where
D: ::serde::Deserializer<'de>,
{
deserializer.deserialize_str(FeltVisitor)
if deserializer.is_human_readable() {
deserializer.deserialize_str(FeltVisitor)
} else {
deserializer.deserialize_seq(FeltVisitor)
}
}
}

@@ -960,6 +978,36 @@ mod serde_impl {
.ok_or(String::from("Expected hex string to be prefixed by '0x'"))
.map_err(de::Error::custom)
}

fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where
A: de::SeqAccess<'de>,
{
let mut bytes = [0u8; 32];
let first = seq
.next_element::<u8>()?
.ok_or(de::Error::invalid_length(0, &"more bytes"))?;
if first & COMPRESSED != 0 {
let len = first & !COMPRESSED;
for (i, byte) in bytes.iter_mut().skip(32 - len as usize).enumerate() {
if let Some(b) = seq.next_element()? {
*byte = b;
} else {
return Err(de::Error::invalid_length(i + 1, &"more bytes"));
}
}
} else {
bytes[0] = first;
for byte in bytes.iter_mut().skip(1) {
if let Some(b) = seq.next_element()? {
*byte = b;
} else {
return Err(de::Error::invalid_length(0, &"32 bytes"));
}
}
}
Ok(Felt::from_bytes_be(&bytes))
}
}
}

@@ -1088,7 +1136,9 @@ mod test {
use proptest::prelude::*;
use regex::Regex;
#[cfg(feature = "serde")]
use serde_test::{assert_de_tokens, assert_ser_tokens, Configure, Token};
use serde_test::{
assert_de_tokens, assert_de_tokens_error, assert_ser_tokens, Compact, Configure, Token,
};

#[test]
fn test_debug_format() {
@@ -1622,15 +1672,60 @@ mod test {
#[test]
#[cfg(feature = "serde")]
fn deserialize() {
assert_de_tokens(&Felt::ZERO, &[Token::String("0x0")]);
assert_de_tokens(&Felt::TWO, &[Token::String("0x2")]);
assert_de_tokens(&Felt::THREE, &[Token::String("0x3")]);
assert_de_tokens(&Felt::ZERO.readable(), &[Token::String("0x0")]);
assert_de_tokens(&Felt::TWO.readable(), &[Token::String("0x2")]);
assert_de_tokens(&Felt::THREE.readable(), &[Token::String("0x3")]);
assert_de_tokens(
&Felt::MAX,
&Felt::MAX.readable(),
&[Token::String(
"0x800000000000011000000000000000000000000000000000000000000000000",
)],
);

assert_de_tokens(
&Felt::ZERO.compact(),
&[
Token::Seq { len: Some(2) },
Token::U8(1 | 0x80),
Collaborator:

Do we really need this first element containing the len and the compressed flag?
Can't we just encode Felt::ONE as

&[
Token::Seq { len: Some(1) },
Token::U8(1),
]

And know that, because we are using a big-endian repr, missing bytes are 0s, and just be happy with it?

Collaborator:

Serialization can be:

let be_bytes = self.to_bytes_be();
let mut i = match be_bytes.iter().position(|&b| b != 0) {
    Some(i) => i,
    None => { /* handle Felt::ZERO and return */ }
};
let mut seq = serializer.serialize_seq(Some(32 - i))?;
while i < 32 {
    seq.serialize_element(&be_bytes[i])?;
    i += 1;
}
seq.end();

And deserialization:

let mut buffer = [0; 32];
let mut i = 32;
while i > 0 {
    if let Some(b) = seq.next_element()? {
        buffer[i - 1] = b;
    } else {
        break;
    }
    i -= 1;
}
if i == 0 && seq.next_element()?.is_some() {
    // Return error: invalid len
}

Collaborator:

Why wouldn't this much simpler impl work?

Author:

Using seq.next_element()? to infer lengths and handle missing bytes as zeros isn’t reliable for all serializers. Some serializers may not support this implicit handling, leading to incorrect deserialization. Explicitly encoding the length and compressed flag ensures that the deserializer knows exactly how many bytes to expect and process, making the format self-descriptive and robust across different implementations.

Collaborator:

But isn't that what we are already doing with Some(32 - i)?

And how exactly could it fail?

@cchudant (Jun 26, 2024):

The idea @jbcaron and I had here was to make a binary format that never exceeds 32 bytes in the worst case (arbitrary hash) and compresses down to a few bytes in the best case (small felt).

To do that, we think of the input as a byte stream and use the felt's unused bits in the first byte to encode a flag telling us whether the felt is compressed or not.

When the felt is compressed, we treat that first byte as the length, so we read len more bytes from the wire;
when it's not compressed, we treat it as the first byte of the hash and complete it by reading 31 more bytes.

However, we found out that there is an issue: using serde serialize_seq (or serialize_bytes), bincode will prepend the length of the sequence before the felt, and we end up with >32-byte felts :(
This is not bincode's problem though; it's fundamental to how serde works.

Taking inspiration from the fixed-size array implementation in serde: arrays are serialized using serialize_tuple, which requires the length to be known beforehand... and we don't know the length before reading the first byte.

That gave me an idea though, which I tried here: https://gist.github.com/cchudant/9bbb025c60a9fe23ca56b895fdcff13c
It works by treating the Felt as a tuple of (u8, <rest>), with <rest> being another tuple that is deserialized with a DeserializeSeed carrying the length.
Nesting tuples like this has no effect on the actual bytes on the wire, so we are free to do that.
The resulting encoding never exceeds 32 bytes as intended and compresses stuff correctly.

@tdelabro what do you think of that?
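
For concreteness, a minimal deserialization sketch of that nested-tuple idea (this is not the code from the gist; RemainingBytes, HeaderVisitor and deserialize_compact_be_bytes are names made up for illustration, and a real implementation would validate the header length and mirror this on the serialize side with serialize_tuple):

use serde::de::{self, DeserializeSeed, SeqAccess, Visitor};

const COMPRESSED: u8 = 0b1000_0000; // same flag bit as in the PR

// Inner tuple: reads a known number of bytes into the tail of a 32-byte
// big-endian buffer. Being a fixed-size tuple, it adds no length prefix
// to the wire in formats like bincode.
struct RemainingBytes {
    len: usize, // taken from the header byte; must be <= 32
}

impl<'de> DeserializeSeed<'de> for RemainingBytes {
    type Value = [u8; 32];

    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
        D: de::Deserializer<'de>,
    {
        struct RemainingBytesVisitor(usize);

        impl<'de> Visitor<'de> for RemainingBytesVisitor {
            type Value = [u8; 32];

            fn expecting(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                write!(f, "{} more bytes", self.0)
            }

            fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
                let mut bytes = [0u8; 32];
                // Fill only the tail; the leading bytes stay zero.
                for (i, slot) in bytes.iter_mut().skip(32 - self.0).enumerate() {
                    *slot = seq
                        .next_element::<u8>()?
                        .ok_or_else(|| de::Error::invalid_length(i, &"more bytes"))?;
                }
                Ok(bytes)
            }
        }

        deserializer.deserialize_tuple(self.0, RemainingBytesVisitor(self.0))
    }
}

// Outer tuple: (header, rest). The header's high bit is the compression
// flag; its low bits are the length of `rest` when compressed.
fn deserialize_compact_be_bytes<'de, D>(deserializer: D) -> Result<[u8; 32], D::Error>
where
    D: de::Deserializer<'de>,
{
    struct HeaderVisitor;

    impl<'de> Visitor<'de> for HeaderVisitor {
        type Value = [u8; 32];

        fn expecting(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
            f.write_str("a compact big-endian felt")
        }

        fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
            let header: u8 = seq
                .next_element()?
                .ok_or_else(|| de::Error::invalid_length(0, &self))?;
            if header & COMPRESSED != 0 {
                // Compressed: read only the significant tail bytes.
                let len = (header & !COMPRESSED) as usize;
                seq.next_element_seed(RemainingBytes { len })?
                    .ok_or_else(|| de::Error::invalid_length(1, &self))
            } else {
                // Uncompressed: `header` is already the first of 32 bytes.
                let mut bytes = seq
                    .next_element_seed(RemainingBytes { len: 31 })?
                    .ok_or_else(|| de::Error::invalid_length(1, &self))?;
                bytes[0] = header;
                Ok(bytes)
            }
        }
    }

    // Tuples carry no length on the wire, so the worst case stays 32 bytes.
    deserializer.deserialize_tuple(2, HeaderVisitor)
}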

Collaborator:

Okay, I understand better.
I will redo my review keeping this in mind

Collaborator:

Still, what do you need the flag for?
Get rid of it: if you have 32 elements, it's not compressed; if you have fewer, it is compressed.
The flag doesn't add anything there.

Collaborator:

You only need the byte that contains flag+len when you serialize [Felt].
Then, because the size in bytes of a Felt can change, you have to do:
[flag+len, byte, .., byte, flag+len, byte, flag+len, byte, .., byte, ...]

@tdelabro (Collaborator, Jun 27, 2024):

So my recommendation would be:

  1. impl Serde on Felt the way I described.
  2. Create public methods serialize_sequence_of_felt_packed() and deserialize_sequence_of_felt_packed() that can be called inside #[serde(serialize_with = "path")] / #[serde(deserialize_with = "path")] and implement your self-descriptive packed representation there (a usage sketch follows this comment).

This way, values that are a single felt will always be between 1 and 32 bytes.
Values that are a sequence of felts will be serialized as:

  • by default:
[
  SerdeLen, // Number of felt in sequence
  [SerdeLen, u8, u8, ...], // felt 1
  [SerdeLen, u8, u8, ...], // felt 2
  [SerdeLen, u8, u8, ...], // felt 3
]
  • using our custom methods:
[
  SerdeLen, // Number of felt in sequence
  OurLenAndFlagOrUnflaggedFirstByte // First Byte of the first felt
  u8,
  ...,
  u8, // Last byte of the first felt 
  OurLenAndFlagOrUnflaggedFirstByte // First Byte of the second felt
  ...,
]

This saves you one byte in the case of felts that are 32 bytes long and nothing for all the others. Not gonna lie, it's not a huge optimization, but it may still be worth it for some people.
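
A hypothetical usage sketch of point 2 (the helper functions and the Block struct are assumed names, not existing API in the crate):

use serde::{Deserialize, Serialize};
use starknet_types_core::felt::Felt;

// The helpers referenced below are the ones proposed above; they do not exist yet.

#[derive(Serialize, Deserialize)]
struct Block {
    // Packed representation: one flag+len (or unflagged first byte) per felt,
    // followed by that felt's significant bytes, back to back.
    #[serde(
        serialize_with = "serialize_sequence_of_felt_packed",
        deserialize_with = "deserialize_sequence_of_felt_packed"
    )]
    transaction_hashes: Vec<Felt>,
}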

Token::U8(0),
Token::SeqEnd,
],
);
assert_de_tokens(
&Felt::ONE.compact(),
&[
Token::Seq { len: Some(2) },
Token::U8(1 | 0x80),
Token::U8(1),
Token::SeqEnd,
],
);
assert_de_tokens(
&Felt::TWO.compact(),
&[
Token::Seq { len: Some(2) },
Token::U8(1 | 0x80),
Token::U8(2),
Token::SeqEnd,
],
);
assert_de_tokens(
&Felt::THREE.compact(),
&[
Token::Seq { len: Some(2) },
Token::U8(1 | 0x80),
Token::U8(3),
Token::SeqEnd,
],
);
assert_de_tokens_error::<Compact<Felt>>(
&[
Token::Seq { len: Some(1) },
Token::U8(1 | 0x80),
Token::SeqEnd,
],
"invalid length 1, expected more bytes",
);
}

#[test]
@@ -1649,116 +1744,35 @@
assert_ser_tokens(
&Felt::ZERO.compact(),
&[
Token::Seq { len: Some(32) },
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::Seq { len: Some(2) },
Token::U8(1 | 0x80),
Token::U8(0),
Token::SeqEnd,
],
);
assert_ser_tokens(
&Felt::ONE.compact(),
&[
Token::Seq { len: Some(2) },
Token::U8(1 | 0x80),
Token::U8(1),
Token::SeqEnd,
],
);
assert_ser_tokens(
&Felt::TWO.compact(),
&[
Token::Seq { len: Some(32) },
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::Seq { len: Some(2) },
Token::U8(1 | 0x80),
Token::U8(2),
Token::SeqEnd,
],
);
assert_ser_tokens(
&Felt::THREE.compact(),
&[
Token::Seq { len: Some(32) },
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::U8(0),
Token::Seq { len: Some(2) },
Token::U8(1 | 0x80),
Token::U8(3),
Token::SeqEnd,
],