From 01e6935cb69dd08725ce33d21c8e97589619787c Mon Sep 17 00:00:00 2001 From: Zolisa Bleki Date: Tue, 27 Aug 2024 12:12:37 +0200 Subject: [PATCH] Add an example implementing a custom ZipStore. This adds an example showing how one can use the library to create their own custom Zarr store and the minimum requirements needed to be satisfied. --- README.md | 4 +- examples/data/testdata.zip | Bin 0 -> 6407 bytes examples/dune | 13 +++ examples/inmemory_zipstore.ml | 188 ++++++++++++++++++++++++++++++++++ examples/readonly_zipstore.ml | 113 ++++++++++++++++++++ 5 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 examples/data/testdata.zip create mode 100644 examples/dune create mode 100644 examples/inmemory_zipstore.ml create mode 100644 examples/readonly_zipstore.ml diff --git a/README.md b/README.md index e687eeb..8ee0cb6 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,8 @@ arrays, designed for use in parallel computing. - Compresses chunks using a variety of supported compression codecs. - Supports indexing operations to read/write views of a Zarr array. - Supports storing arrays in-memory or the local filesystem. It is also - extensible, allowing users to create and use their own custom storage backends. + extensible, allowing users to easily create and use their own custom storage + backends. See the example implementing a [Zip file store][9] for more details. - Supports both synchronous and concurrent I/O via [Lwt][4] and [Eio][8]. - Leverages the strong type system of Ocaml to create a type-safe API; making it impossible to create, read or write malformed arrays. @@ -132,3 +133,4 @@ FilesystemStore.erase_group_node store group_node;; [6]: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html [7]: https://zoj613.github.io/zarr-ml/zarr/Zarr/index.html#examples [8]: https://github.com/ocaml-multicore/eio +[9]: https://github.com/zoj613/zarr-ml/tree/main/examples/inmemory_zipstore.ml diff --git a/examples/data/testdata.zip b/examples/data/testdata.zip new file mode 100644 index 0000000000000000000000000000000000000000..e88469a35d9bbea0140a513d1e132980d2ab19b0 GIT binary patch literal 6407 zcmb_gdpy(YAGe(9WTK+EEi{B&x1^>?O2~4FOY!?I$Du{9b6&6Cv)5kV=a0|l{rx=e=ly*?pXXzN=Kn>MkMCDL zzTi?xJHAC@B``nf=Y!R-_{XG(#{@iYavbLuxk?##%efa1#F$oLYgA$$MDm>ei;d>Mca`o^tl2_vuo}j@E9Y zsE?0OL3jl2qD}8?@!f1?pZChP(HH#v`O8W_k7%CAe9?5P2bUwUPtrY44TC|e4%r{D zoDP0-T#U&Y9!yN?eM2diHWy<*ta-O}+h5qR`e$O3R%@<*zD~?`n_2&==|O6tBEd5q z#U5x1ChaX9x?c8P^@_(+QF###c}ISMv|?^8KE{yfK(u(}aLQq9VV{UR`?T(RHho>c z+f`oc4NF+N&+io=6kdBkK665uQ^7ZMcMZ*-ctDo%SnvT}!?&5QIxx=l6QGC*;4ikU zk0xjWBse1z%k=@qY|2wz<0G!(t6m{2X&XfTQvOx0sQ7o8@;xfa#dlg(NSSPo!T+Ju ztbIq4cscb~_=^2o4Sr2CzIGAn6(einUHn$q6CNjPpSQ>2=BWPLtcKWWWDn;7vVFR~ zs3Brz;zVy(ciIMy;conne#MYOGby#$s4`+>e<<0361n?m=z&p!9>#HOBje`z+Cn-@ z4cXukaADBdPGR+I(apBk6YLylD!P9w$I^^Gf>7@z4z4GgKA_%39nr2RBb}J>e_8?L*iGy&Ct#sNBmQ$xkIXw2lvws?~K`!XM9p#^uxxo13zkX(wAzpy4DV6;^?U-#yX9P{)$EwK zH7Lvrm=w0`p~p-VLC& zNmL;^19RMjl#Y-BLrgLXRh7VCq*h5~@jLtao}uXvk<7jc=G4Uby&8>_XOX>nk1(Q0 z{ZYIMMu?Myg2mF#;Nqaa)<4|JFkmNV%jpqJG3uPG6SBfIdG;pqI;w4LDYb@F)`7_K7lf5xgB`xTcT^mjlum@#m0l3UESVx0&x>=IFR?t-)Az#HBQW4^- z!H}*7qNDa|^ZJhc)-P$O>LY9KH#pZxyKFE1x~fv-acI7Fp3hk?mn-AhMNb8p@D8(M z7d?ML8jmKw7|7pd-QIcZg?=GE)1}5+_mPxzse<(`r4OZM(p~0WBhtro_IiP@9^laL zywCT#n&G*s#K}U;>NeEWUDp@5jT zyRkO}q|+X_JKWs7v0)&@Z1?Q+@N9Vm#DQdFM%TBK>*`NHrQOFJ{@D6kS&9frpV^vy zBDo!Btlb(yhNWRrhVeVtho92j$y@bUC*Hi#VK{XvyYy$*6TC@IW;D%-xWPI>-Dp2H z_5N29tOrvHrG<6mdDe!d*dW^KkH3KJU=={q2sj4_>ueZaiiAEwA4gYYsZ+5MLX&?@ zqJ@sLnUnSxOQcPO`w1w5J4yAfQ?N(q=6zB!h2aQCssbbb$sRA+?Rd#g8f(M&A9l6$ zE#~y!rUdwZP6_ZuX=LszhfSW@&P@lS%kCW4M-wy(TEA%L!k?OYvw-tAl9HzKRqy_TZEg!-$=y>YP`#o!K>2s7i}xr-i8m z)5PP&X>{!zyc8zvWQpbo+$Yr?v9(u_L1t%lZ`Gip*{6H;Dz!$EgZmF6ohYjHh&lm64>x}*U8&7fE5(qz<)c}iWSa!4?UzSW+3^FkEfUsR z0yx1ls?`@RxNKBWOek{rSdLf3md2TgS2r>#KI}S2)ymdlCj8|g_rw>q^4B&{jMS?F zu5>GUv%Dl~fAcB4f=hAS+~;^H*7QlK-N$&a;o<6FZ6P%V{EzTLBaCWwdm)feqGoF1 z`=HtGp@xP@8OJ0Uo4`B0Uc#q$rv#iD+i_d5z4iHYz~kGW7^emNR)Elp)i|esgg)dWM2tWtE7=}ze81|uq9y{15{z$wJ;TLYz9hg z2i1sVHA+~?F%(LX>lY0<;!I?)Hig)#RO=-OGSbalI__x>LK?I3eBBzV@Tl0j+=w&O0B|-El4XS-UfRm zP8-)0*bt{4`cg}0Jb>ZCE{YR7!}!xtJsIvgn-0O$c58U|n{f^^Q}$ufQ1aIV;I4f& zM)5a7N*qY~&dGEPYC@k88nRcLK7k{m6!qvsd7j-z z5G(cdqoFqo9AZhKydilG3sBM_8o|*~GW1W4he!v6oYjugz__z&8EWy& zNk$3orX$P&Lp+l{D2cSA3_)M2o)UFIO%=q^A}CrWkaw?wR~xm*99b6?9OBzXJYHCp z_!V<&NMo9hqFXlnqd;=%x@Q#tamfBuAPA5wsArKtMxHdP{1qU+di++)DP`=#0%A*> z;T5S%k$dFdwXCg>3yKh}I$#)upax3hD}f`=uC?pH?nw?*Z7mS}5IV#Efn%0yGv?P5 zn$tcJHo@sR&KjFMxib|@mQBN?)hAh#H}Aq4LRgvIDVsBx7(umYzA|{DjCK+UQ^c4(pdaYmdbb#r@f4f>+@d*KBO30YeLH1@ zWORcDA!~ryCONGV66%xpC-+vg29Oo9*v3Lsi@i^^;yP;vW4{nxUGJWb&OIo8(N`}B zCe#0WH<7{Ka|B}>AI$KNashIH>U7Q^Q83m$ov4HV_8X7fHNLjed{A~Vv;Q^`ApUbA zK>UyhR{Ijd{eg1-1n^(_Zou`?B#gZ|$`>`8Ra?7livvbVJi;wE`-^P5#j(~~HkMXT z6=>#m!E{@>kKVYZ3ad4sI_Gd_3tY|p_pIx)_tW-jRBJem+EhTjP z^Ku1H9cffUiF~UIFne}iaj(P|6(_e~|81JQHK~1Lo|hjGp!ZLMgf9fSg9LFz8(&WP zFzDS~o+LjMr!gWH zV%(;~CGy4l>znlaYn9Z1_~yBDvbDekW%i-VD-$$oa*z#or;?|MJ79WpDvI)d(08GR7MCZRqA4_1bNMl>(3B zz<0(YhIBd}kn5CQ9W@@PStE<8_`T^UCAV{@JMody^Yp*!L4DY9R5=RK4JUaEu)S(bjT2`|fdoOI#&&VIw4;C7*De|pj##ub!cc)6uJj`?oa=f_aL zwYgxIn)nA=rxGt?Vtb~)pr_x=9?3U5-Vr?Vt}d{^T7)hQ?J6?fCj!Y$go&sr)yl@6 z6)QQc2jX661b~&#HDgLH9p)$ibMBw#j{Of6XJFi^%P~Kkk82=IXPYX_*AIhr2@LVa z!gOHY$b*G14=V&ff1~;T1-J$P{I6J`6{7shx1az(V65Y7(&s+^Zb&V-EY*_Y8qT@M z0nbG;vOq8I$Zb*04eq-R0@Fo~3k@sK3TbYz%e1WiM&N5Y7rr#WKin)1g}0%?ZP&~V zZYXr%b66D0-_4tah46ZF+cyi|oF%-u&79@^xvN|7R|cAKylbIeAg#c8wbW9UAeFb`S^#+` zxE#oTAay@*nEefmGh&U;)HJcXH#g0XrgQ!C2>;5Ln}+5` b)4A=yk+{fhCGZJzD+~!fe_(d*+j;pvbM^fF literal 0 HcmV?d00001 diff --git a/examples/dune b/examples/dune new file mode 100644 index 0000000..ea6e9e1 --- /dev/null +++ b/examples/dune @@ -0,0 +1,13 @@ +(executable + (name readonly_zipstore) + (modules readonly_zipstore) + (ocamlopt_flags (:standard -O3)) + (libraries zarr-eio camlzip)) + +(executable + (name inmemory_zipstore) + (modules inmemory_zipstore) + (ocamlopt_flags (:standard -O3)) + (libraries zarr-lwt zipc) + (preprocess + (pps ppx_deriving.show))) diff --git a/examples/inmemory_zipstore.ml b/examples/inmemory_zipstore.ml new file mode 100644 index 0000000..b464c87 --- /dev/null +++ b/examples/inmemory_zipstore.ml @@ -0,0 +1,188 @@ +(* This module implements a Zip file zarr store that is Lwt-aware. + It supports both read and write operations. This is because the + underlying Zip library used reads all Zip file bytes into memory. All + store updates are done in-memory and thus to update the actual zip file + we must persist the data using `MemoryZipStore.write_to_file`. + + The main requirement is to implement the signature of Zarr.Types.IO. + We use Zarr_lwt Deferred module for `Deferred` so that the store can be + Lwt-aware. + + To compile & run this example execute the command + dune exec -- examples/inmemory_zipstore.exe + in your shell at the root of this project. *) + +module MemoryZipStore : sig + include Zarr.Storage.STORE with type 'a Deferred.t = 'a Lwt.t + (*val create : ?level:Zipc_deflate.level -> string -> t *) + val open_store : ?level:Zipc_deflate.level -> string -> t + val write_to_file : t -> unit Deferred.t +end = struct + module M = Map.Make(String) + + module Z = struct + module Deferred = Zarr_lwt.Deferred + open Deferred.Infix + + type t = + {mutable ic : Zipc.t + ;mutex : Lwt_mutex.t + ;level : Zipc_deflate.level + ;path : string} + + let is_member t key = + Deferred.return @@ Zipc.mem key t.ic + + let size t key = + Deferred.return @@ + match Zipc.find key t.ic with + | None -> 0 + | Some m -> + match Zipc.Member.kind m with + | Zipc.Member.Dir -> 0 + | Zipc.Member.File f -> Zipc.File.decompressed_size f + + let get t key = + Deferred.return @@ + match Zipc.find key t.ic with + | None -> raise (Zarr.Storage.Key_not_found key) + | Some m -> + match Zipc.Member.kind m with + | Zipc.Member.Dir -> failwith "cannot get size of directory." + | Zipc.Member.File f -> + match Zipc.File.to_binary_string f with + | Error e -> failwith e + | Ok s -> s + + let get_partial_values t key ranges = + get t key >>= fun data -> + let size = String.length data in + ranges |> Lwt_list.map_p @@ fun (offset, len) -> + Deferred.return + (match len with + | None -> String.sub data offset (size - offset) + | Some l -> String.sub data offset l) + + let list t = + Deferred.return @@ Zipc.fold + (fun m acc -> + match Zipc.Member.kind m with + | Zipc.Member.Dir -> acc + | Zipc.Member.File _ -> Zipc.Member.path m :: acc) t.ic [] + + let list_dir t prefix = + let module StrSet = Zarr.Util.StrSet in + let n = String.length prefix in + let m = Zipc.to_string_map t.ic in + let prefs, keys = + M.fold + (fun key v ((l, r) as acc) -> + match Zipc.Member.kind v with + | Zipc.Member.Dir -> acc + | Zipc.Member.File _ -> + let pred = String.starts_with ~prefix key in + match key with + | k when pred && String.contains_from k n '/' -> + StrSet.add String.(sub k 0 @@ 1 + index_from k n '/') l, r + | k when pred -> l, k :: r + | _ -> acc) m (StrSet.empty, []) + in Deferred.return (keys, StrSet.elements prefs) + + let set t key value = + match Zipc.File.deflate_of_binary_string ~level:t.level value with + | Error e -> failwith e + | Ok f -> + match Zipc.Member.(make ~path:key @@ File f) with + | Error e -> failwith e + | Ok m -> + Lwt_mutex.with_lock + t.mutex + (fun () -> + t.ic <- Zipc.add m t.ic; + Deferred.return_unit) + + let set_partial_values t key ?(append=false) rv = + let f = + if append then + fun acc (_, v) -> + Deferred.return @@ acc ^ v + else + fun acc (rs, v) -> + let s = Bytes.of_string acc in + String.(length v |> Bytes.blit_string v 0 s rs); + Deferred.return @@ Bytes.to_string s + in + match Zipc.Member.kind (Option.get @@ Zipc.find key t.ic) with + | Zipc.Member.Dir -> Deferred.return_unit + | Zipc.Member.File file -> + match Zipc.File.to_binary_string file with + | Error e -> failwith e + | Ok s -> Deferred.fold_left f s rv >>= set t key + + let erase t key = + Lwt_mutex.with_lock + t.mutex + (fun () -> + t.ic <- Zipc.remove key t.ic; + Deferred.return_unit) + + let erase_prefix t prefix = + let m = Zipc.to_string_map t.ic in + let m' = + M.filter_map + (fun k v -> + if String.starts_with ~prefix k then None else Some v) m in + Lwt_mutex.with_lock + t.mutex + (fun () -> + t.ic <- Zipc.of_string_map m'; + Deferred.return_unit) + end + + (* this functor generates the public signature of our Zip file store. *) + include Zarr.Storage.Make(Z) + + (* now we create functions to open and close the store. *) + + (*let create ?(level=`Default) path = Z.{ic = Zipc.empty; level; path} *) + + let open_store ?(level=`Default) path = + match Zipc.of_binary_string In_channel.(with_open_bin path input_all) with + | Ok ic -> Z.{ic; level; path; mutex = Lwt_mutex.create ()} + | Error e -> failwith e + + let write_to_file (t : Z.t) = + Lwt_io.with_file + ~flags:Unix.[O_WRONLY; O_TRUNC; O_CREAT; O_NONBLOCK] + ~mode:Lwt_io.Output + t.path + (fun oc -> + let open Lwt.Infix in + match Zipc.to_binary_string t.ic with + | Error e -> failwith e + | Ok s -> Lwt_io.write oc s >>= fun () -> Lwt_io.flush oc) +end + +let _ = + Lwt_main.run @@ begin + let open Zarr.Node in + let open MemoryZipStore.Deferred.Infix in + + let printlist = [%show: string list] in + let store = MemoryZipStore.open_store "examples/data/testdata.zip" in + MemoryZipStore.find_all_nodes store >>= fun (xs, _) -> + print_endline @@ "All array nodes: " ^ printlist (List.map ArrayNode.to_path xs); + let anode = List.hd @@ List.filter + (fun node -> ArrayNode.to_path node = "/some/group/name") xs in + let slice = Owl_types.[|R [0; 20]; I 10; R []|] in + MemoryZipStore.read_array store anode slice Bigarray.Char >>= fun x -> + print_string @@ "BEFORE: " ^ Owl_pretty.dsnda_to_string x; + let x' = + Owl.Dense.Ndarray.Generic.map + (fun _ -> Owl_stats_dist.uniform_int_rvs ~a:0 ~b:255 |> Char.chr) x in + MemoryZipStore.write_array store anode slice x' >>= fun () -> + MemoryZipStore.read_array store anode slice Bigarray.Char >>= fun y -> + print_string @@ "AFTER: " ^ Owl_pretty.dsnda_to_string y; + MemoryZipStore.write_to_file store >>| fun () -> + print_endline "Zip store has been update." + end diff --git a/examples/readonly_zipstore.ml b/examples/readonly_zipstore.ml new file mode 100644 index 0000000..1570a5a --- /dev/null +++ b/examples/readonly_zipstore.ml @@ -0,0 +1,113 @@ +(* This module implements a Read-only Zip file zarr store that is Eio-aware. + The main requirement is to implement the signature of Zarr.Types.IO. + We use Zarr_eio's Deferred module for `Deferred` so that the store can be + Eio-aware. Since Zip stores cannot have files updated or removed, we only + implement the get_* and list_* family of functions and raise an + Not_implemented exception for the set_* and erase_* family of functions. + This effectively allows us to create a read-only store since calling any + of the following functions would result in an `Not_implemented` exception: + - ReadOnlyZipStore.create_group + - ReadOnlyZipStore.create_array + - ReadOnlyZipStore.erase_group_node + - ReadOnlyZipStore.erase_array_node + - ReadOnlyZipStore.erase_all_nodes + - ReadOnlyZipStore.write_array + - ReadOnlyZipStore.reshape + Below we show how to implement this custom Zarr Store. + + To compile & run this example execute the command + dune exec -- examples/zipstore.exe + in your shell at the root of this project. *) + +module ReadOnlyZipStore : sig + exception Not_implemented + + include Zarr.Storage.STORE with type 'a Deferred.t = 'a + val open_store : string -> t + val close : t -> unit + +end = struct + exception Not_implemented + + module Z = struct + module Deferred = Zarr_eio.Deferred + open Deferred.Infix + + type t = Zip.in_file + + let is_member t key = + match Zip.find_entry t key with + | exception Not_found -> false + | _ -> true + + let size t key = + match Zip.find_entry t key with + | e -> e.uncompressed_size + | exception Not_found -> 0 + + let get t key = + match Zip.find_entry t key with + | e -> Zip.read_entry t e + | exception Not_found -> raise (Zarr.Storage.Key_not_found key) + + let get_partial_values t key ranges = + get t key >>= fun data -> + let size = String.length data in + ranges |> Eio.Fiber.List.map @@ fun (offset, len) -> + match len with + | None -> String.sub data offset (size - offset) + | Some l -> String.sub data offset l + + let list t = + Zip.entries t |> Eio.Fiber.List.filter_map @@ function + | (e : Zip.entry) when not e.is_directory -> Some e.filename + | _ -> None + + let list_dir t prefix = + let module StrSet = Zarr.Util.StrSet in + let n = String.length prefix in + let prefs, keys = + List.fold_left + (fun ((l, r) as acc) -> function + | (e : Zip.entry) when e.is_directory -> acc + | e when not @@ String.starts_with ~prefix e.filename -> acc + | e when String.contains_from e.filename n '/' -> + let key = e.filename in + let pre = String.sub key 0 @@ 1 + String.index_from key n '/' in + StrSet.add pre l, r + | e -> l, e.filename :: r) (StrSet.empty, []) @@ Zip.entries t + in keys, StrSet.elements prefs + + let set _ = raise Not_implemented + + let set_partial_values _ = raise Not_implemented + + let erase _ = raise Not_implemented + + let erase_prefix _ = raise Not_implemented + end + + (* this functor generates the public signature of our Zip file store. *) + include Zarr.Storage.Make(Z) + + (* now we create functions to open and close the store. *) + let open_store path = Zip.open_in path + let close = Zip.close_in +end + +let _ = + Eio_main.run @@ fun _ -> + let open Zarr.Metadata in + let open Zarr.Node in + + let store = ReadOnlyZipStore.open_store "examples/data/testdata.zip" in + let xs, _ = ReadOnlyZipStore.find_all_nodes store in + let anode = List.hd @@ Eio.Fiber.List.filter + (fun node -> ArrayNode.to_path node = "/some/group/name") xs in + let meta = ReadOnlyZipStore.array_metadata store anode in + let slice = Array.map (Fun.const @@ Owl_types.R []) (ArrayMetadata.shape meta) in + let arr = ReadOnlyZipStore.read_array store anode slice Bigarray.Char in + print_string @@ Owl_pretty.dsnda_to_string arr; + try ReadOnlyZipStore.write_array store anode slice arr with + | ReadOnlyZipStore.Not_implemented -> print_endline "Store is read-only"; + ReadOnlyZipStore.close store