From 85225ca88c89147e7068ca920e805820fdb862dc Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 8 Jan 2024 09:56:00 -0500 Subject: [PATCH] introduce rust --- appendable-rs/.gitignore | 1 + appendable-rs/Cargo.lock | 23 +++ appendable-rs/Cargo.toml | 20 +++ appendable-rs/README.md | 262 ++++++++++++++++++++++++++++ appendable-rs/USAGE.md | 14 ++ appendable-rs/appendable/Cargo.toml | 8 + appendable-rs/appendable/src/lib.rs | 14 ++ appendable-rs/btree/Cargo.toml | 8 + appendable-rs/btree/src/lib.rs | 14 ++ appendable-rs/cmd/Cargo.toml | 8 + appendable-rs/cmd/src/main.rs | 3 + appendable-rs/encoding/Cargo.toml | 8 + appendable-rs/encoding/src/lib.rs | 14 ++ appendable-rs/protocol/Cargo.toml | 8 + appendable-rs/protocol/src/lib.rs | 14 ++ 15 files changed, 419 insertions(+) create mode 100644 appendable-rs/.gitignore create mode 100644 appendable-rs/Cargo.lock create mode 100644 appendable-rs/Cargo.toml create mode 100644 appendable-rs/README.md create mode 100644 appendable-rs/USAGE.md create mode 100644 appendable-rs/appendable/Cargo.toml create mode 100644 appendable-rs/appendable/src/lib.rs create mode 100644 appendable-rs/btree/Cargo.toml create mode 100644 appendable-rs/btree/src/lib.rs create mode 100644 appendable-rs/cmd/Cargo.toml create mode 100644 appendable-rs/cmd/src/main.rs create mode 100644 appendable-rs/encoding/Cargo.toml create mode 100644 appendable-rs/encoding/src/lib.rs create mode 100644 appendable-rs/protocol/Cargo.toml create mode 100644 appendable-rs/protocol/src/lib.rs diff --git a/appendable-rs/.gitignore b/appendable-rs/.gitignore new file mode 100644 index 00000000..9f970225 --- /dev/null +++ b/appendable-rs/.gitignore @@ -0,0 +1 @@ +target/ \ No newline at end of file diff --git a/appendable-rs/Cargo.lock b/appendable-rs/Cargo.lock new file mode 100644 index 00000000..d1722bab --- /dev/null +++ b/appendable-rs/Cargo.lock @@ -0,0 +1,23 @@ +# This file is automatically @generated by Cargo. 
+# It is not intended for manual editing. +version = 3 + +[[package]] +name = "appendable" +version = "0.1.0" + +[[package]] +name = "btree" +version = "0.1.0" + +[[package]] +name = "cmd" +version = "0.1.0" + +[[package]] +name = "encoding" +version = "0.1.0" + +[[package]] +name = "protocol" +version = "0.1.0" diff --git a/appendable-rs/Cargo.toml b/appendable-rs/Cargo.toml new file mode 100644 index 00000000..325ed34e --- /dev/null +++ b/appendable-rs/Cargo.toml @@ -0,0 +1,20 @@ + +[workspace] +members=[ + "cmd", + "appendable", + "btree", + "encoding", + "protocol" +] + + +# Use Cargo's new feature resolver, which can handle target-specific features. +# Explicit opt-in is required even with the 2021 edition because we use a +# virtual workspace. +# See: https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver +resolver = "2" + +[workspace.package] +edition = "2021" +# TODO(friendlymatthew | kevmo314) sync up on Rust version | i.e. rust-version = "1.75.0" \ No newline at end of file diff --git a/appendable-rs/README.md b/appendable-rs/README.md new file mode 100644 index 00000000..b6606376 --- /dev/null +++ b/appendable-rs/README.md @@ -0,0 +1,262 @@ +# Appendable + +Appendable is an append-only, schemaless, daemon-less database. + +Appendable doesn't require a conventional server, instead it generates an index +file that you can host on your favorite static hosting site. + +Appendable currently supports data files in the following formats: + +- [x] [JSON Lines](https://jsonlines.org/) `.jsonl` +- [ ] Parquet +- [ ] CSV +- [ ] TSV +- [ ] RecordIO + +with more formats coming soon. + +> [!CAUTION] +> This README is currently somewhat aspirational and many features are not implemented yet. +> Check out the technical demo for the functionality. 
+ +## Motivation + +A smart friend of mine once said + +> _The problem with databases is that everybody cares about a different killer feature_ + +Appendable's primary goals are + +- Cost-optimized serving. Leverage your favorite static content host instead of + maintaining and provisioning resources for a dedicated database server. +- Speed-optimized (O(1)) index updating for appends. Index file updates are + fast and deterministic, making them suitable for real-time and streaming data + updates. + +## Demonstration + +Check out this repository's GitHub pages for an example querying the server. + +```ts +import Appendable from "appendable"; + +const db = Appendable.init("data.jsonl", "index.dat"); + +const results = await db + .where("timestamp", ">=", "2023-11-01T00:00:00Z") + .where("count", "<=", 15) + .orderBy("count", "DESC") + .orderBy("timestamp", "ASC") + .limit(20) + .get(); + +console.log(results); // contains data.jsonl queried with the above query. +``` + +## Getting Started + +TODO: implement a mini walkthrough. + +## Advanced Usage + +### Real-time updates + +Appendable indexes are intended to be very cheap to produce incrementally. It is so +cheap that it is not unreasonable to generate the index on demand. That is, you can +run a server such that `index.dat` produces the output from running +`./appendable -i index.dat` and cache the latest version on your CDN. Couple this with +a signalling channel to indicate that a version update has occurred to subscribe to +updates. For example, + +```ts +import Appendable from "appendable"; + +const db = Appendable.init("data.jsonl", "index.dat"); + +const unsubscribe = db + .where("timestamp", ">=", "2023-11-01T00:00:00Z") + .where("count", "<=", 15) + .orderBy("count", "DESC") + .orderBy("timestamp", "ASC") + .limit(20) + .onSnapshot((results) => { + console.log(results); + }); + +// then elsewhere + +db.dirty(); +``` + +Snapshot updates will only occur when the underlying data has changed. 
Therefore, `.dirty()` +can be called without too much concern. + +### Schemas + +A schema file is not required to use Appendable, however if you wish to ensure that +your data follows certain types, pass a JSON Schema file with `-s schema.json` and +Appendable will throw an error instead of inferring the type from the data. This +can be useful for detecting consistency issues or enforcing field restrictions. + +A word of caution, if you add a non-nullable field to your JSON schema, this will cause +all your previous data to be invalidated requiring an index regeneration. To avoid this, +pass `--imply-nullable` to indicate that previous data is ok to be null but new data +should validate. Be aware that this has implications on the generated types, in particular +your client will see the field as nullable despite the schema saying non-nullable. + +### Generated types + +Appendable can also emit TypeScript type definitions. Pass `-t output.d.ts` to produce +an inferred type definition file to make your queries type-safe. This can be used with + +```ts +import Appendable from "appendable"; +import DBTypes from 'output.d.ts'; + +const db = Appendable.init("data.jsonl", "index.dat"); + +... +``` + +Note that if a schema file is provided, it is guaranteed that the generated type definition +file is stable. That is, if the schema file does not change, the type definition file will +not change. + +### Complex queries + +The demonstration example uses a simple query, however the query builder is syntactic sugar over +a `.query()` call. If you wish to perform more advanced queries, you can do so by calling `.query()` +directly. 
For example, + +```ts +import Appendable from "appendable"; + +const db = Appendable.init("data.jsonl", "index.dat"); + +const results = await db.query({ + where: [ + { operation: ">=", key: "timestamp", value: "2023-11-01T00:00:00Z" }, + { operation: "<=", key: "count", value: 15 }, + ], + orderBy: [ + { key: "count", direction: "DESC" }, + { key: "timestamp", direction: "ASC" }, + ], +}); +``` + +### Permissioning and sharding + +Appendable does not support permissioning because it assumes that the data is publicly +readable. To accommodate permissions, we recommend guarding the access of your data files +via your preferred authentication scheme. That is, create an index file for each user's +data. For example, your static file content may look something like + +``` +/users/alice/data.jsonl +/users/alice/index.dat +/users/bob/data.jsonl +/users/bob/index.dat +/users/catherine/data.jsonl +/users/catherine/index.dat +``` + +Where each user has access to their own data and index file. + +### Mutability and skew + +Appendable is geared towards data that is immutable, however in practice this might not +be ideal. In order to accommodate data mutations, a data integrity hash is maintained so +when data is mutated, the data will be reindexed. Reindexing is O(n) in the age +of the oldest mutation (hence why appending is O(1) for updating the index!) so mutating +data early on in the data file will be more expensive to update. + +Mutations must be carefully performed because they will cause the previous index +to be corrupted. Therefore, when updating the files on your server, the data and index +files must be done atomically. This is tricky to do right, however one approach is to +version your data and index files. + +When a mutation is performed, you will need to create a new version of the data +file that includes the mutation along with the updated index file. 
Host the two +separately under a different version number so clients that started by querying +the previous version can continue to access it. + +Note that the performance limitations of indexing make this intractable at any +appreciable scale so generally speaking, it's strongly recommended to keep your +data immutable and append-only within a data file. + +### Custom `fetch()` API + +For convenience, Appendable uses the browser's `fetch()` for fetching data files if +the data and index files are specified as a string. If you wish to use your own library +or wish to add your own headers, pass a callback. + +The callback must correctly return a byte slice representing the start and end parameters. + +For example, + +```ts +import Appendable from "appendable"; + +const db = Appendable.init( + async (start: number, end: number) => { + const response = await fetch("data.jsonl", { + headers: { Range: `bytes=${start}-${end}` }, + }); + return await response.arrayBuffer(); + }, + async (start: number, end: number) => { + const response = await fetch("index.dat", { + headers: { Range: `bytes=${start}-${end}` }, + }); + return await response.arrayBuffer(); + } +); +``` + +## Peanut gallery + +### Why not query a SQLite database with range requests? + +Dumping all the data into a SQLite database, hosting it, and then querying with +something like [sql.js](https://sql.js.org/) _could_ (and probably would) work, +but I find it particularly elegant that Appendable doesn't change the raw data. +In other words, besides producing an index file the `jsonl` file provided stays +untouched which means updates to the index file can lag behind data changes and +remain valid because the database doesn't need to shuffle the data around. + +### My data is never append-only, what's the point of this? + +A lot of data isn't actually append-only but can often be restructured as if it +were append-only. 
For example, creating a sequence of changelogs or deltas lets +you see the history and evolution of a document. + +Of course, not all data can be structured like this but Appendable started from +me observing that a decent chunk of my data _was_ written to an appending table +and not mutating any existing data and that it avoided some performance issues, +but the underlying database didn't take advantage of them. For example, with an +append-only data file, Appendable doesn't have to worry about row locking. That +means that there are no tail latency issues when querying and the database can be +scaled horizontally with conventional CDNs. This isn't possible (well ok, it is +but it's [very expensive](https://cloud.google.com/spanner)) with the usual set +of databases. + +If you're not convinced, think of Appendable as more geared towards time-series +datasets. Somewhat like a [kdb+](https://en.wikipedia.org/wiki/Kdb%2B) database +but meant for applications instead of more specialized use cases. + +### How do I deal with deletion requests if I can only append? + +It's recommended to shard your data in a way that, if you need to delete any of +it, the entire data file and index file are deleted together. For example, data +representing a document can be deleted by deleting the document's corresponding +data file and its index file. + +If, for example, you wish to delete records associated with a given user within +a data file and a mutation must be performed, consult the _Mutability and skew_ +section above for the associated caveats. 
+ +## Limitations + +- Max field size: 8793945536512 bytes +- Max number of rows: 2^64-1 diff --git a/appendable-rs/USAGE.md b/appendable-rs/USAGE.md new file mode 100644 index 00000000..1ee8864f --- /dev/null +++ b/appendable-rs/USAGE.md @@ -0,0 +1,14 @@ +## Appendable-rs Usage +*notes:* +- when we say root directory, we mean from the `appendable-rs` root level + + +```shell +cargo run +``` +You can invoke the cli by running the command above from the root directory. + +To be specific you can run +```shell +cargo run -p cmd +``` \ No newline at end of file diff --git a/appendable-rs/appendable/Cargo.toml b/appendable-rs/appendable/Cargo.toml new file mode 100644 index 00000000..0d7d91a3 --- /dev/null +++ b/appendable-rs/appendable/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "appendable" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/appendable-rs/appendable/src/lib.rs b/appendable-rs/appendable/src/lib.rs new file mode 100644 index 00000000..7d12d9af --- /dev/null +++ b/appendable-rs/appendable/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} diff --git a/appendable-rs/btree/Cargo.toml b/appendable-rs/btree/Cargo.toml new file mode 100644 index 00000000..e289749a --- /dev/null +++ b/appendable-rs/btree/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "btree" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/appendable-rs/btree/src/lib.rs b/appendable-rs/btree/src/lib.rs new file mode 100644 index 00000000..7d12d9af --- /dev/null +++ b/appendable-rs/btree/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: usize, right: usize) -> usize { + left + right +} + 
+#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} diff --git a/appendable-rs/cmd/Cargo.toml b/appendable-rs/cmd/Cargo.toml new file mode 100644 index 00000000..1be91123 --- /dev/null +++ b/appendable-rs/cmd/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "cmd" +version = "0.1.0" +edition.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/appendable-rs/cmd/src/main.rs b/appendable-rs/cmd/src/main.rs new file mode 100644 index 00000000..9767e1fa --- /dev/null +++ b/appendable-rs/cmd/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("this is the command binary"); +} diff --git a/appendable-rs/encoding/Cargo.toml b/appendable-rs/encoding/Cargo.toml new file mode 100644 index 00000000..44f246e8 --- /dev/null +++ b/appendable-rs/encoding/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "encoding" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/appendable-rs/encoding/src/lib.rs b/appendable-rs/encoding/src/lib.rs new file mode 100644 index 00000000..7d12d9af --- /dev/null +++ b/appendable-rs/encoding/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} diff --git a/appendable-rs/protocol/Cargo.toml b/appendable-rs/protocol/Cargo.toml new file mode 100644 index 00000000..77c82607 --- /dev/null +++ b/appendable-rs/protocol/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "protocol" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/appendable-rs/protocol/src/lib.rs b/appendable-rs/protocol/src/lib.rs 
new file mode 100644 index 00000000..7d12d9af --- /dev/null +++ b/appendable-rs/protocol/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +}