diff --git a/Project.toml b/Project.toml
index a0fc7c08..6248bb8e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "WaterLily"
 uuid = "ed894a53-35f9-47f1-b17f-85db9237eebd"
 authors = ["Gabriel Weymouth <gabriel.weymouth@gmail.com>"]
-version = "1.2"
+version = "1.2.0"
 
 [deps]
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
@@ -18,12 +18,14 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
 WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 
 [extensions]
 WaterLilyAMDGPUExt = "AMDGPU"
 WaterLilyCUDAExt = "CUDA"
+WaterLilyMPIExt = "MPI"
 WaterLilyReadVTKExt = "ReadVTK"
 WaterLilyWriteVTKExt = "WriteVTK"
 
@@ -41,9 +43,12 @@ julia = "1.6"
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Makie = "537997a7-5e4e-5d89-9595-2241ea00577e"
 PerformanceTestTools = "dc46b164-d16f-48ec-a853-60448fc869fe"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
@@ -53,4 +58,5 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 
 [targets]
-test = ["Test", "BenchmarkTools", "CUDA", "AMDGPU", "GPUArrays", "WriteVTK", "ReadVTK"]
+test = ["Test", "BenchmarkTools", "CUDA", "AMDGPU", "GPUArrays", "WriteVTK", "ReadVTK", "MPI",
+        "FileIO", "JLD2"]
diff --git a/examples/Project.toml b/examples/Project.toml
new file mode 100644
index 00000000..8c01a1cb
--- /dev/null
+++ b/examples/Project.toml
@@ -0,0 +1,19 @@
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
+GR = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
+GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
+JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
+Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
+Meshing = "e6723b4c-ebff-59f1-b4b7-d97aa5274f73"
+Optim = "429524aa-4258-5aef-a3af-852621145aeb"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
+WaterLily = "ed894a53-35f9-47f1-b17f-85db9237eebd"
+WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
diff --git a/examples/TwoD_CircleMPI.jl b/examples/TwoD_CircleMPI.jl
new file mode 100644
index 00000000..5fe47aa8
--- /dev/null
+++ b/examples/TwoD_CircleMPI.jl
@@ -0,0 +1,54 @@
+#mpiexecjl --project=. -n 2 julia TwoD_CircleMPI.jl
+
+using WaterLily
+using WriteVTK
+using MPI
+using StaticArrays
+
+# make a writer with some attributes, now we need to apply the BCs when writting
+velocity(a::Simulation) = a.flow.u |> Array;
+pressure(a::Simulation) = a.flow.p |> Array;
+_body(a::Simulation) = (measure_sdf!(a.flow.σ, a.body);
+                        a.flow.σ |> Array;)
+vorticity(a::Simulation) = (@inside a.flow.σ[I] = 
+                            WaterLily.curl(3,I,a.flow.u)*a.L/a.U;
+                            WaterLily.perBC!(a.flow.σ,());
+                            a.flow.σ |> Array;)
+_vbody(a::Simulation) = a.flow.V |> Array;
+mu0(a::Simulation) = a.flow.μ₀ |> Array;
+ranks(a::Simulation) = (a.flow.σ.=0; 
+                        @inside a.flow.σ[I] = me()+1;
+                        WaterLily.perBC!(a.flow.σ,());
+                        a.flow.σ |> Array;)
+
+custom_attrib = Dict(
+    "u" => velocity,
+    "p" => pressure,
+    "d" => _body,
+    "ω" => vorticity,
+    "v" => _vbody,
+    "μ₀" => mu0,
+    "rank" => ranks
+)# this maps what to write to the name in the file
+
+"""Flow around a circle"""
+function circle(dims,center,radius;Re=250,U=1,psolver=MultiLevelPoisson,mem=Array)
+    body = AutoBody((x,t)->√sum(abs2, x .- center) - radius)
+    Simulation(dims, (U,0), radius; ν=U*radius/Re, body, exitBC=false, mem=mem, psolver=psolver)
+end
+
+# local grid size
+L = 2^6
+
+# init the MPI grid and the simulation
+r = init_mpi((L,2L))
+sim = circle((L,2L),SA[L/2,L+2],L/8;mem=MPIArray) #use MPIArray to use extension
+
+wr = vtkWriter("WaterLily-MPI-circle";attrib=custom_attrib,dir="vtk_data",
+               extents=get_extents(sim.flow.p))
+for _ in 1:50
+    sim_step!(sim,sim_time(sim)+1.0,verbose=true)
+    write!(wr,sim)
+end
+close(wr)
+finalize_mpi()
\ No newline at end of file
diff --git a/examples/TwoD_CircleMPIArray.jl b/examples/TwoD_CircleMPIArray.jl
new file mode 100644
index 00000000..ce79b975
--- /dev/null
+++ b/examples/TwoD_CircleMPIArray.jl
@@ -0,0 +1,13 @@
+using WaterLily
+
+# circle simulation
+function circle(n,m;Re=250,U=1)
+    radius, center = m/8, m/2
+    body = AutoBody((x,t)->√sum(abs2, x .- center) - radius)
+    Simulation((n,m), (U,0), radius; ν=U*radius/Re, body, mem=MPIArray)
+end
+
+# test on sim
+include("/home/marin/Workspace/Tutorials-WaterLily/src/TwoD_plots.jl")
+sim = circle(3*2^6,2^7)
+sim_gif!(sim,duration=10,clims=(-5,5),plotbody=true)
\ No newline at end of file
diff --git a/ext/WaterLilyMPIExt.jl b/ext/WaterLilyMPIExt.jl
new file mode 100644
index 00000000..0c58ac78
--- /dev/null
+++ b/ext/WaterLilyMPIExt.jl
@@ -0,0 +1,264 @@
+module WaterLilyMPIExt
+
+if isdefined(Base, :get_extension)
+    using MPI
+else
+    using ..MPI
+end
+
+using StaticArrays
+using WaterLily
+import WaterLily: init_mpi,me,global_loc,mpi_grid,mpi_dims,finalize_mpi,get_extents
+import WaterLily: BC!,perBC!,exitBC!,L₂,L∞,_dot,CFL,residual!,sim_step!
+import WaterLily: applyV!,applyS!,measure!,@loop,measure_sdf!,grid_loc,master
+
+const NDIMS_MPI = 3           # Internally, we set the number of dimensions always to 3 for calls to MPI. This ensures a fixed size for MPI coords, neigbors, etc and in general a simple, easy to read code.
+const NNEIGHBORS_PER_DIM = 2
+
+"""
+    halos(dims,d)
+
+Return the CartesianIndices of the halos in dimension `±d` of an array of size `dims`.
+"""
+function halos(dims::NTuple{N},j) where N
+    CartesianIndices(ntuple( i-> i==abs(j) ? j<0 ? (1:2) : (dims[i]-1:dims[i]) : (1:dims[i]), N))
+end
+"""
+    buff(dims,d)
+
+Return the CartesianIndices of the buffer in dimension `±d` of an array of size `dims`.
+"""
+function buff(dims::NTuple{N},j) where N
+    CartesianIndices(ntuple( i-> i==abs(j) ? j<0 ? (3:4) : (dims[i]-3:dims[i]-2) : (1:dims[i]), N))
+end
+#@TODO these can be cleaned up
+function send_flat(I,N,n,d)
+    J = LinearIndices(buff(N,d)); Is=buff(N,d)[1]
+    J[I-Is+oneunit(I)] + n*length(J)
+end
+function fill_send!(a::MPIArray,d,::Val{:Scalar}) # fill scalar field
+    N=size(a)
+    @loop a.send[1][send_flat(I,N,0,-d)] = a[I] over I ∈ buff(N,-d)
+    @loop a.send[2][send_flat(I,N,0,+d)] = a[I] over I ∈ buff(N,+d)
+    # @loop a.send[1][I] = a[buff(N,-d)[I]] over I ∈ CartesianIndices(a.send[1])
+    # @loop a.send[2][I] = a[buff(N,+d)[I]] over I ∈ CartesianIndices(a.send[2])
+end
+function fill_send!(a::MPIArray,d,::Val{:Vector})# fill vector field
+    N,n = WaterLily.size_u(a)
+    for i ∈ 1:n # copy every component in that direction
+        @loop a.send[1][send_flat(I,N,i-1,-d)] = a[I,i] over I ∈ buff(N,-d)
+        @loop a.send[2][send_flat(I,N,i-1,+d)] = a[I,i] over I ∈ buff(N,+d)
+    end
+end
+function recv_flat(I,N,n,d)
+    J = LinearIndices(halos(N,d)); Is=halos(N,d)[1]
+    J[I-Is+oneunit(I)] + n*length(J)
+end
+function copyto!(a::MPIArray,d,::Val{:Scalar}) # copy scalar field back from rcv buffer
+    N=size(a); i = d<0 ? 1 : 2
+    # @loop a[halos(N,d)[I]] = a.recv[i][I]  over I ∈ CartesianIndices(a.recv[i])
+    @loop a[I] = a.recv[i][recv_flat(I,N,0,d)] over I ∈ halos(N,d)
+end
+function copyto!(a::MPIArray,d,::Val{:Vector}) # copy scalar field back from rcv buffer
+    N,n = WaterLily.size_u(a); i = d<0 ? 1 : 2
+    for j ∈ 1:n # copy every component in that direction
+        @loop a[I,j] = a.recv[i][recv_flat(I,N,j-1,d)] over I ∈ halos(N,d)
+    end
+end
+
+
+"""
+    mpi_swap!(send1,recv1,send2,recv2,neighbor,comm)
+
+This function swaps the data between two MPI processes. The data is sent from `send1` to `neighbor[1]` and received in `recv1`.
+The data is sent from `send2` to `neighbor[2]` and received in `recv2`. The function is non-blocking and returns when all data 
+has been sent and received. 
+"""
+function mpi_swap!(send1,recv1,send2,recv2,neighbor,comm)
+    reqs=MPI.Request[]
+    # Send to / receive from neighbor 1 in dimension d
+    push!(reqs,MPI.Isend(send1,  neighbor[1], 0, comm))
+    push!(reqs,MPI.Irecv!(recv1, neighbor[1], 1, comm))
+    # Send to / receive from neighbor 2 in dimension d
+    push!(reqs,MPI.Irecv!(recv2, neighbor[2], 0, comm))
+    push!(reqs,MPI.Isend(send2,  neighbor[2], 1, comm))
+    # wair for all transfer to be done
+    MPI.Waitall!(reqs)
+end
+function mpi_swap!(a::MPIArray,neighbor,comm)
+    # prepare the transfer
+    reqs=MPI.Request[]
+    # Send to / receive from neighbor 1 in dimension d
+    push!(reqs,MPI.Isend(a.send[1],  neighbor[1], 0, comm))
+    push!(reqs,MPI.Irecv!(a.recv[1], neighbor[1], 1, comm))
+    # Send to / receive from neighbor 2 in dimension d
+    push!(reqs,MPI.Irecv!(a.recv[2], neighbor[2], 0, comm))
+    push!(reqs,MPI.Isend(a.send[2],  neighbor[2], 1, comm))
+    # wair for all transfer to be done
+    MPI.Waitall!(reqs)
+end
+
+"""
+    perBC!(a)
+
+This function sets the boundary conditions of the array `a` using the MPI grid.
+"""
+perBC!(a::MPIArray,::Tuple{})            = _perBC!(a, size(a), true)
+perBC!(a::MPIArray, perdir, N = size(a)) = _perBC!(a, N, true)
+_perBC!(a, N, mpi::Bool) = for d ∈ eachindex(N)
+    # fill with data to transfer
+    fill_send!(a,d,Val(:Scalar))
+
+    # swap
+    mpi_swap!(a,neighbors(d),mpi_grid().comm)
+
+    # this sets the BCs
+    !mpi_wall(d,1) && copyto!(a,-d,Val(:Scalar)) # halo swap
+    !mpi_wall(d,2) && copyto!(a,+d,Val(:Scalar)) # halo swap
+end
+
+using EllipsisNotation
+"""
+    BC!(a)
+
+This function sets the boundary conditions of the array `a` using the MPI grid.
+"""
+function BC!(a::MPIArray,A,saveexit=false,perdir=())
+    N,n = WaterLily.size_u(a)
+    for d ∈ 1:n # transfer full halos in each direction
+        # get data to transfer @TODO use @views
+        send1 = a[buff(N,-d),:]; send2 = a[buff(N,+d),:]
+        recv1 = zero(send1);     recv2 = zero(send2)
+        # swap 
+        mpi_swap!(send1,recv1,send2,recv2,neighbors(d),mpi_grid().comm)
+
+        # mpi boundary swap
+        !mpi_wall(d,1) && (a[halos(N,-d),:] .= recv1) # halo swap
+        !mpi_wall(d,2) && (a[halos(N,+d),:] .= recv2) # halo swap
+        
+        for i ∈ 1:n # this sets the BCs on the physical boundary
+            if mpi_wall(d,1) # left wall
+                if i==d # set flux
+                    a[halos(N,-d),i] .= A[i]
+                    a[WaterLily.slice(N,3,d),i] .= A[i]
+                else # zero gradient
+                    a[halos(N,-d),i] .= reverse(send1[..,i]; dims=d)
+                end
+            end
+            if mpi_wall(d,2) # right wall
+                if i==d && (!saveexit || i>1) # convection exit
+                    a[halos(N,+d),i] .= A[i]
+                else # zero gradient
+                    a[halos(N,+d),i] .= reverse(send2[..,i]; dims=d)
+                end
+            end
+        end
+    end
+end
+
+function exitBC!(u::MPIArray{T},u⁰,U,Δt) where T
+    N,_ = WaterLily.size_u(u)
+    exitR = WaterLily.slice(N.-2,N[1]-1,1,3) # exit slice excluding ghosts
+    ∮udA = zero(T)
+    if mpi_wall(1,2) # right wall
+        @loop u[I,1] = u⁰[I,1]-U[1]*Δt*(u⁰[I,1]-u⁰[I-δ(1,I),1]) over I ∈ exitR
+        @loop u[I+δ(1,I),1] = u[I,1] over I ∈ exitR
+        ∮udA = sum(@view(u[exitR,1]))/length(exitR)-U[1]   # mass flux imbalance
+    end
+    ∮u = MPI.Allreduce(∮udA,+,mpi_grid().comm)           # domain imbalance
+    N = prod(mpi_dims()[2:end]) # for now we only have 1D exit
+    mpi_wall(1,2) && (@loop u[I,1] -= ∮u/N over I ∈ exitR;
+                      @loop u[I+δ(1,I),1] -= ∮u/N over I ∈ exitR) # correct flux only on right wall
+end
+
+struct MPIGrid #{I,C<:MPI.Comm,N<:AbstractVector,M<:AbstractArray,G<:AbstractVector}
+    me::Int                    # rank
+    comm::MPI.Comm             # communicator
+    coords::AbstractVector     # coordinates
+    neighbors::AbstractArray   # neighbors
+    global_loc::AbstractVector # the location of the lower left corner in global index space
+end
+const MPI_GRID_NULL = MPIGrid(-1,MPI.COMM_NULL,[-1,-1,-1],[-1 -1 -1; -1 -1 -1],[0,0,0])
+
+let
+    global MPIGrid, set_mpi_grid, mpi_grid, mpi_initialized, check_mpi
+
+    # allows to access the global mpi grid
+    _mpi_grid::MPIGrid          = MPI_GRID_NULL
+    mpi_grid()::MPIGrid         = (check_mpi(); _mpi_grid::MPIGrid)
+    set_mpi_grid(grid::MPIGrid) = (_mpi_grid = grid;)
+    mpi_initialized()           = (_mpi_grid.comm != MPI.COMM_NULL)
+    check_mpi()                 = !mpi_initialized() && error("MPI not initialized")
+end
+
+function init_mpi(Dims::NTuple{D};dims=[0, 0, 0],periods=[0, 0, 0],comm::MPI.Comm=MPI.COMM_WORLD,
+                  disp::Integer=1,reorder::Bool=true) where D
+    # MPI
+    MPI.Init()
+    nprocs = MPI.Comm_size(comm)
+    # create cartesian communicator
+    MPI.Dims_create!(nprocs, dims)
+    comm_cart = MPI.Cart_create(comm, dims, periods, reorder)
+    me     = MPI.Comm_rank(comm_cart)
+    coords = MPI.Cart_coords(comm_cart)
+    # make the cart comm
+    neighbors = fill(MPI.PROC_NULL, NNEIGHBORS_PER_DIM, NDIMS_MPI);
+    for i = 1:NDIMS_MPI
+        neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp);
+    end
+    # global index coordinate in grid space
+    global_loc = SVector([coords[i]*Dims[i] for i in 1:D]...)
+    set_mpi_grid(MPIGrid(me,comm_cart,coords,neighbors,global_loc))
+    return me; # this is the most usefull MPI vriable to have in the local space
+end
+finalize_mpi() = MPI.Finalize()
+
+# helper functions
+me() = mpi_grid().me
+master(::Val{:WaterLily_MPIExt}) = me()==0
+grid_loc(::Val{:WaterLily_MPIExt}) = mpi_grid().global_loc
+neighbors(dim) = mpi_grid().neighbors[:,dim]
+mpi_wall(dim,i) = mpi_grid().neighbors[i,dim]==MPI.PROC_NULL
+mpi_dims() = MPI.Cart_get(mpi_grid().comm)[1]
+
+L₂(a::MPIArray{T}) where T = MPI.Allreduce(sum(T,abs2,@inbounds(a[I]) for I ∈ inside(a)),+,mpi_grid().comm)
+function L₂(p::Poisson{T,S}) where {T,S<:MPIArray{T}} # should work on the GPU
+    MPI.Allreduce(sum(T,@inbounds(p.r[I]*p.r[I]) for I ∈ inside(p.r)),+,mpi_grid().comm)
+end
+L∞(a::MPIArray) = MPI.Allreduce(maximum(abs.(a)),Base.max,mpi_grid().comm)
+L∞(p::Poisson{T,S}) where {T,S<:MPIArray{T}} = MPI.Allreduce(maximum(abs.(p.r)),Base.max,mpi_grid().comm)
+function _dot(a::MPIArray{T},b::MPIArray{T}) where T
+    MPI.Allreduce(sum(T,@inbounds(a[I]*b[I]) for I ∈ inside(a)),+,mpi_grid().comm)
+end
+
+function CFL(a::Flow{D,T,S};Δt_max=10) where {D,T,S<:MPIArray{T}}
+    @inside a.σ[I] = WaterLily.flux_out(I,a.u)
+    MPI.Allreduce(min(Δt_max,inv(maximum(a.σ)+5a.ν)),Base.min,mpi_grid().comm)
+end
+# this actually add a global comminutation every time residual is called
+function residual!(p::Poisson{T,S}) where {T,S<:MPIArray{T}}
+    WaterLily.perBC!(p.x,p.perdir)
+    @inside p.r[I] = ifelse(p.iD[I]==0,0,p.z[I]-WaterLily.mult(I,p.L,p.D,p.x))
+    # s = sum(p.r)/length(inside(p.r))
+    s = MPI.Allreduce(sum(p.r)/length(inside(p.r)),+,mpi_grid().comm)
+    abs(s) <= 2eps(eltype(s)) && return
+    @inside p.r[I] = p.r[I]-s
+end
+
+function sim_step!(sim::Simulation{D,T,S},t_end;remeasure=true,
+                   max_steps=typemax(Int),verbose=false) where {D,T,S<:MPIArray{T}}
+    steps₀ = length(sim.flow.Δt)
+    while sim_time(sim) < t_end && length(sim.flow.Δt) - steps₀ < max_steps
+        sim_step!(sim; remeasure)
+        (verbose && me()==0) && println("tU/L=",round(sim_time(sim),digits=4),
+                                        ", Δt=",round(sim.flow.Δt[end],digits=3))
+    end
+end
+
+# hepler function for vtk writer
+function get_extents(a::MPIArray)
+    xs = Tuple(ifelse(x==0,1,x+1):ifelse(x==0,n+4,n+x+4) for (n,x) in zip(size(inside(a)),mpi_grid().global_loc))
+    MPI.Allgather(xs, mpi_grid().comm)
+end
+
+end # module
\ No newline at end of file
diff --git a/ext/WaterLilyReadVTKExt.jl b/ext/WaterLilyReadVTKExt.jl
index b894ea14..97b5d120 100644
--- a/ext/WaterLilyReadVTKExt.jl
+++ b/ext/WaterLilyReadVTKExt.jl
@@ -41,7 +41,7 @@ function restart_sim!(a::Simulation;fname::String="WaterLily.pvd",attrib=default
     # return a writer if needed
     k = length(PVDFile(fname).timesteps)
     vtkWriter(split(fname,".pvd")[1],PVDFile(fname).directories[1],
-              WaterLily.pvd_collection(fname;append=true),attrib,k)
+              WaterLily.pvd_collection(fname;append=true),attrib,k,extent)
 end
 
 end # module
\ No newline at end of file
diff --git a/ext/WaterLilyWriteVTKExt.jl b/ext/WaterLilyWriteVTKExt.jl
index 46c8c6b6..0181f3c8 100644
--- a/ext/WaterLilyWriteVTKExt.jl
+++ b/ext/WaterLilyWriteVTKExt.jl
@@ -30,13 +30,14 @@ struct VTKWriter
     collection    :: WriteVTK.CollectionFile
     output_attrib :: Dict{String,Function}
     count         :: Vector{Int}
+    extents       #:: Tuple{UnitRange}# cannot figure out what type to put here
 end
-function vtkWriter(fname="WaterLily";attrib=default_attrib(),dir="vtk_data",T=Float32)
-    !isdir(dir) && mkdir(dir)
-    VTKWriter(fname,dir,pvd_collection(fname),attrib,[0])
+function vtkWriter(fname="WaterLily";attrib=default_attrib(),dir="vtk_data",T=Float32,extents=[(1:1,1:1)])
+    (WaterLily.master() && !isdir(dir)) && mkdir(dir)
+    VTKWriter(fname,dir,pvd_collection(fname),attrib,[0],extents)
 end
-function vtkWriter(fname,dir::String,collection,attrib::Dict{String,Function},k)
-    VTKWriter(fname,dir,collection,attrib,[k])
+function vtkWriter(fname,dir::String,collection,attrib::Dict{String,Function},k,extents)
+    VTKWriter(fname,dir,collection,attrib,[k],extents)
 end
 """
     default_attrib()
@@ -54,8 +55,8 @@ default_attrib() = Dict("Velocity"=>_velocity, "Pressure"=>_pressure)
 Write the simulation data at time `sim_time(sim)` to a `vti` file and add the file path
 to the collection file.
 """
-function write!(w::VTKWriter,a::Simulation)
-    k = w.count[1]; N=size(a.flow.p)
+function write!(w::VTKWriter,a::Simulation;N=size(a.flow.p))
+    k = w.count[1]
     vtk = vtk_grid(w.dir_name*@sprintf("/%s_%06i", w.fname, k), [1:n for n in N]...)
     for (name,func) in w.output_attrib
         # this seems bad, but I @benchmark it and it's the same as just calling func()
@@ -64,6 +65,18 @@ function write!(w::VTKWriter,a::Simulation)
     vtk_save(vtk); w.count[1]=k+1
     w.collection[round(sim_time(a),digits=4)]=vtk
 end
+# parralel version of that file
+function write!(w::VTKWriter,a::Simulation{D,T,S};N=size(inside(a.flow.p))) where {D,T,S<:MPIArray{T}}
+    k,part = w.count[1], Int(me()+1)
+    pvtk = pvtk_grid(w.dir_name*@sprintf("/%s_%06i", w.fname, k), w.extents[part];
+                     part=part, extents=w.extents, ghost_level=2)
+    for (name,func) in w.output_attrib
+        # this seems bad, but I @benchmark it and it's the same as just calling func()
+        pvtk[name] = size(func(a))==size(a.flow.p) ? func(a) : components_first(func(a))
+    end
+    vtk_save(pvtk); w.count[1]=k+1
+    w.collection[round(sim_time(a),digits=4)]=pvtk
+end
 """
     close(w::VTKWriter)
 
@@ -78,4 +91,128 @@ this is reqired for the vtk file.
 """
 components_first(a::AbstractArray{T,N}) where {T,N} = permutedims(a,[N,1:N-1...])
 
+"""
+This is very anoying but is required to keep the file written neatly organised...
+"""
+function pvtk_grid(
+        filename::AbstractString, args...;
+        part, ismain = (part == 1), ghost_level = 0, kwargs...,
+    )
+    is_structured = WriteVTK._pvtk_is_structured(args...)
+    nparts = WriteVTK._pvtk_nparts(is_structured; kwargs...)
+    extents = WriteVTK._pvtk_extents(is_structured; kwargs...)
+
+    # mkpath(filename)
+    bname = basename(filename)
+    prefix = filename #joinpath(filename, bname)
+    fn = WriteVTK._serial_filename(part, nparts, prefix, "")
+
+    vtk = let kws_vtk = WriteVTK._remove_parallel_kwargs(; kwargs...)
+        kws = if extents === nothing
+            kws_vtk
+        else
+            (; kws_vtk..., extent = extents[part])
+        end
+        vtk_grid(fn, args...; kws...)
+    end
+
+    pvtkargs = WriteVTK.PVTKArgs(part, nparts, ismain, ghost_level)
+    xdoc = WriteVTK.XMLDocument()
+    _, ext = splitext(vtk.path)
+    path = filename * ".p" * ext[2:end]
+    pvtk = WriteVTK.PVTKFile(pvtkargs, xdoc, vtk, path)
+    _init_pvtk!(pvtk, extents)
+
+    return pvtk
+end
+function _init_pvtk!(pvtk::WriteVTK.PVTKFile, extents)
+    # Recover some data
+    vtk = pvtk.vtk
+    pvtkargs = pvtk.pvtkargs
+    pgrid_type = "P" * vtk.grid_type
+    npieces = pvtkargs.nparts
+    pref, _ = splitext(pvtk.path)
+    _, ext = splitext(vtk.path)
+    prefix = basename(pref) #joinpath(pref, basename(pref))
+
+    # VTKFile (root) node
+    pvtk_root = WriteVTK.create_root(pvtk.xdoc, "VTKFile")
+    WriteVTK.set_attribute(pvtk_root, "type", pgrid_type)
+    WriteVTK.set_attribute(pvtk_root, "version", "1.0")
+    if WriteVTK.IS_LITTLE_ENDIAN
+        WriteVTK.set_attribute(pvtk_root, "byte_order", "LittleEndian")
+    else
+        WriteVTK.set_attribute(pvtk_root, "byte_order", "BigEndian")
+    end
+
+    # Grid node
+    pvtk_grid = WriteVTK.new_child(pvtk_root, pgrid_type)
+    WriteVTK.set_attribute(pvtk_grid, "GhostLevel", string(pvtkargs.ghost_level))
+
+    # Pieces (i.e. Pointers to serial files)
+    for piece ∈ 1:npieces
+        pvtk_piece = WriteVTK.new_child(pvtk_grid, "Piece")
+        fn = WriteVTK._serial_filename(piece, npieces, prefix, ext)
+        WriteVTK.set_attribute(pvtk_piece, "Source", fn)
+
+        # Add local extent if necessary
+        if extents !== nothing
+            WriteVTK.set_attribute(pvtk_piece, "Extent", WriteVTK.extent_attribute(extents[piece]))
+        end
+    end
+
+    # Add whole extent if necessary
+    whole_extent = WriteVTK.compute_whole_extent(extents)
+    if whole_extent !== nothing
+        WriteVTK.set_attribute(pvtk_grid, "WholeExtent", WriteVTK.extent_attribute(whole_extent))
+    end
+
+    # Getting original grid informations
+    # Recover point type and number of components
+    vtk_root = WriteVTK.root(vtk.xdoc)
+    vtk_grid = WriteVTK.find_element(vtk_root, vtk.grid_type)
+
+    # adding origin and spacing if necessary
+    origin = WriteVTK.attribute(vtk_grid, "Origin")
+    if origin !== nothing
+        WriteVTK.set_attribute(pvtk_grid, "Origin", origin)
+    end
+
+    spacing = WriteVTK.attribute(vtk_grid, "Spacing")
+    if spacing !== nothing
+        WriteVTK.set_attribute(pvtk_grid, "Spacing", spacing)
+    end
+
+    # Getting original piece informations
+    vtk_piece = WriteVTK.find_element(vtk_grid, "Piece")
+
+    # If serial VTK has points
+    vtk_points = WriteVTK.find_element(vtk_piece, "Points")
+    if vtk_points !== nothing
+        vtk_data_array = WriteVTK.find_element(vtk_points, "DataArray")
+        point_type = WriteVTK.attribute(vtk_data_array, "type")
+        Nc = WriteVTK.attribute(vtk_data_array, "NumberOfComponents")
+        ## PPoints node
+        pvtk_ppoints = WriteVTK.new_child(pvtk_grid, "PPoints")
+        pvtk_pdata_array = WriteVTK.new_child(pvtk_ppoints, "PDataArray")
+        WriteVTK.set_attribute(pvtk_pdata_array, "type", point_type)
+        WriteVTK.set_attribute(pvtk_pdata_array, "Name", "Points")
+        WriteVTK.set_attribute(pvtk_pdata_array, "NumberOfComponents", Nc)
+    end
+
+    # If serial VTK has coordinates
+    vtk_coordinates = WriteVTK.find_element(vtk_piece, "Coordinates")
+    if vtk_coordinates !== nothing
+        pvtk_pcoordinates = WriteVTK.new_child(pvtk_grid, "PCoordinates")
+        for c in WriteVTK.child_elements(vtk_coordinates)
+            pvtk_pdata_array = WriteVTK.new_child(pvtk_pcoordinates, "PDataArray")
+            WriteVTK.set_attribute(pvtk_pdata_array, "type", WriteVTK.attribute(c, "type"))
+            WriteVTK.set_attribute(pvtk_pdata_array, "Name", WriteVTK.attribute(c, "Name"))
+            WriteVTK.set_attribute(pvtk_pdata_array, "NumberOfComponents", WriteVTK.attribute(c, "NumberOfComponents"))
+        end
+    end
+
+    pvtk
+end
+
 end # module
diff --git a/src/AutoBody.jl b/src/AutoBody.jl
index 2e4b765e..e5af4e64 100644
--- a/src/AutoBody.jl
+++ b/src/AutoBody.jl
@@ -143,4 +143,4 @@ function curvature(A::AbstractMatrix)
         K = A[1,1]*A[2,2]+A[1,1]*A[3,3]+A[2,2]*A[3,3]-A[1,2]^2-A[1,3]^2-A[2,3]^2
     end
     H,K
-end
+end
\ No newline at end of file
diff --git a/src/Body.jl b/src/Body.jl
index b544595a..040a5907 100644
--- a/src/Body.jl
+++ b/src/Body.jl
@@ -73,4 +73,4 @@ measure_sdf!(a::AbstractArray,body::AbstractBody,t=0;kwargs...) = @inside a[I] =
 Use for a simulation without a body.
 """
 struct NoBody <: AbstractBody end
-function measure!(a::Flow,body::NoBody;t=0,ϵ=1) end
+function measure!(a::Flow,body::NoBody;t=0,ϵ=1) end
\ No newline at end of file
diff --git a/src/Flow.jl b/src/Flow.jl
index 9dbd39e6..b843f1fd 100644
--- a/src/Flow.jl
+++ b/src/Flow.jl
@@ -4,9 +4,6 @@
 @fastmath quick(u,c,d) = median((5c+2d-u)/6,c,median(10c-9u,c,d))
 @fastmath vanLeer(u,c,d) = (c≤min(u,d) || c≥max(u,d)) ? c : c+(d-c)*(c-u)/(d-u)
 @inline ϕu(a,I,f,u,λ=quick) = @inbounds u>0 ? u*λ(f[I-2δ(a,I)],f[I-δ(a,I)],f[I]) : u*λ(f[I+δ(a,I)],f[I],f[I-δ(a,I)])
-@inline ϕuP(a,Ip,I,f,u,λ=quick) = @inbounds u>0 ? u*λ(f[Ip],f[I-δ(a,I)],f[I]) : u*λ(f[I+δ(a,I)],f[I],f[I-δ(a,I)])
-@inline ϕuL(a,I,f,u,λ=quick) = @inbounds u>0 ? u*ϕ(a,I,f) : u*λ(f[I+δ(a,I)],f[I],f[I-δ(a,I)])
-@inline ϕuR(a,I,f,u,λ=quick) = @inbounds u<0 ? u*ϕ(a,I,f) : u*λ(f[I-2δ(a,I)],f[I-δ(a,I)],f[I])
 
 @fastmath @inline function div(I::CartesianIndex{m},u) where {m}
     init=zero(eltype(u))
@@ -37,28 +34,13 @@ function conv_diff!(r,u,Φ;ν=0.1,perdir=())
     r .= 0.
     N,n = size_u(u)
     for i ∈ 1:n, j ∈ 1:n
-        # if it is periodic direction
-        tagper = (j in perdir)
-        # treatment for bottom boundary with BCs
-        lowerBoundary!(r,u,Φ,ν,i,j,N,Val{tagper}())
-        # inner cells
+        # convection diffusion on inner cells
         @loop (Φ[I] = ϕu(j,CI(I,i),u,ϕ(i,CI(I,j),u)) - ν*∂(j,CI(I,i),u);
-               r[I,i] += Φ[I]) over I ∈ inside_u(N,j)
-        @loop r[I-δ(j,I),i] -= Φ[I] over I ∈ inside_u(N,j)
-        # treatment for upper boundary with BCs
-        upperBoundary!(r,u,Φ,ν,i,j,N,Val{tagper}())
+               r[I,i] += Φ[I]) over I ∈ inside_u(N,0)
+        @loop r[I-δ(j,I),i] -= Φ[I] over I ∈ inside_u(N,0)
     end
 end
 
-# Neumann BC Building block
-lowerBoundary!(r,u,Φ,ν,i,j,N,::Val{false}) = @loop r[I,i] += ϕuL(j,CI(I,i),u,ϕ(i,CI(I,j),u)) - ν*∂(j,CI(I,i),u) over I ∈ slice(N,2,j,2)
-upperBoundary!(r,u,Φ,ν,i,j,N,::Val{false}) = @loop r[I-δ(j,I),i] += -ϕuR(j,CI(I,i),u,ϕ(i,CI(I,j),u)) + ν*∂(j,CI(I,i),u) over I ∈ slice(N,N[j],j,2)
-
-# Periodic BC Building block
-lowerBoundary!(r,u,Φ,ν,i,j,N,::Val{true}) = @loop (
-    Φ[I] = ϕuP(j,CIj(j,CI(I,i),N[j]-2),CI(I,i),u,ϕ(i,CI(I,j),u)) -ν*∂(j,CI(I,i),u); r[I,i] += Φ[I]) over I ∈ slice(N,2,j,2)
-upperBoundary!(r,u,Φ,ν,i,j,N,::Val{true}) = @loop r[I-δ(j,I),i] -= Φ[CIj(j,I,2)] over I ∈ slice(N,N[j],j,2)
-
 using EllipsisNotation
 """
     accelerate!(r,dt,g)
@@ -109,7 +91,7 @@ struct Flow{D, T, Sf<:AbstractArray{T}, Vf<:AbstractArray{T}, Tf<:AbstractArray{
     perdir :: NTuple # tuple of periodic direction
     function Flow(N::NTuple{D}, U; f=Array, Δt=0.25, ν=0., g=nothing,
                   uλ::Function=(i, x) -> 0., perdir=(), exitBC=false, T=Float64) where D
-        Ng = N .+ 2
+        Ng = N .+ 4
         Nd = (Ng..., D)
         u = Array{T}(undef, Nd...) |> f; apply!(uλ, u);
         BC!(u,BCTuple(U,0.,D),exitBC,perdir); exitBC!(u,u,BCTuple(U,0.,D),0.)
diff --git a/src/MultiLevelPoisson.jl b/src/MultiLevelPoisson.jl
index cb262df1..bef2bc63 100644
--- a/src/MultiLevelPoisson.jl
+++ b/src/MultiLevelPoisson.jl
@@ -1,5 +1,5 @@
-@inline up(I::CartesianIndex,a=0) = (2I-2oneunit(I)):(2I-oneunit(I)-δ(a,I))
-@inline down(I::CartesianIndex) = CI((I+2oneunit(I)).I .÷2)
+@inline up(I::CartesianIndex,a=0) = (2I-3oneunit(I)):(2I-2oneunit(I)-δ(a,I))
+@inline down(I::CartesianIndex) = CI((I+3oneunit(I)).I .÷2)
 @fastmath @inline function restrict(I::CartesianIndex,b)
     s = zero(eltype(b))
     for J ∈ up(I)
@@ -17,7 +17,7 @@ end
 
 function restrictML(b::Poisson)
     N,n = size_u(b.L)
-    Na = map(i->1+i÷2,N)
+    Na = map(i->2+i÷2,N)
     aL = similar(b.L,(Na...,n)); fill!(aL,0)
     ax = similar(b.x,Na); fill!(ax,0)
     restrictL!(aL,b.L,perdir=b.perdir)
@@ -26,14 +26,14 @@ end
 function restrictL!(a::AbstractArray{T},b;perdir=()) where T
     Na,n = size_u(a)
     for i ∈ 1:n
-        @loop a[I,i] = restrictL(I,i,b) over I ∈ CartesianIndices(map(n->2:n-1,Na))
+        @loop a[I,i] = restrictL(I,i,b) over I ∈ CartesianIndices(map(n->3:n-2,Na))
     end
     BC!(a,zeros(SVector{n,T}),false,perdir)  # correct μ₀ @ boundaries
 end
 restrict!(a,b) = @inside a[I] = restrict(I,b)
 prolongate!(a,b) = @inside a[I] = b[down(I)]
 
-@inline divisible(N) = mod(N,2)==0 && N>4
+@inline divisible(N) = mod(N-2,2)==0 && N>6
 @inline divisible(l::Poisson) = all(size(l.x) .|> divisible)
 """
     MultiLevelPoisson{N,M}
@@ -84,17 +84,15 @@ end
 mult!(ml::MultiLevelPoisson,x) = mult!(ml.levels[1],x)
 residual!(ml::MultiLevelPoisson,x) = residual!(ml.levels[1],x)
 
-function solver!(ml::MultiLevelPoisson;tol=2e-4,itmx=32)
+function solver!(ml::MultiLevelPoisson;tol=1e-5,itmx=32)
     p = ml.levels[1]
-    residual!(p); r₀ = r₂ = L∞(p); r₂₀ = L₂(p)
+    residual!(p); r₂ = L₂(p)
     nᵖ=0
-    while r₂>tol && nᵖ<itmx
+    while nᵖ<itmx
         Vcycle!(ml)
-        smooth!(p); r₂ = L∞(p)
-        nᵖ+=1
+        smooth!(p); r₂ = L₂(p)
+        nᵖ+=1; r₂<tol && break
     end
     perBC!(p.x,p.perdir)
-    (nᵖ<2 && length(ml.levels)>5) && pop!(ml.levels); # remove coarsest level if this was easy
-    (nᵖ>4 && divisible(ml.levels[end])) && push!(ml.levels,restrictML(ml.levels[end])) # add a level if this was hard
     push!(ml.n,nᵖ);
 end
diff --git a/src/Poisson.jl b/src/Poisson.jl
index 5295c49c..f1ccbe21 100644
--- a/src/Poisson.jl
+++ b/src/Poisson.jl
@@ -109,10 +109,15 @@ Note: This runs for general backends, but is _very_ slow to converge.
 """
 @fastmath Jacobi!(p;it=1) = for _ ∈ 1:it
     @inside p.ϵ[I] = p.r[I]*p.iD[I]
+    # @TODO is that reqired?
+    perBC!(p.ϵ,p.perdir)
     increment!(p)
 end
 
-using LinearAlgebra: ⋅
+# needed for MPI later
+using LinearAlgebra: dot
+_dot(a,b) = dot(a,b)
+⋅(a,b) = _dot(a,b)
 """
     pcg!(p::Poisson; it=6)
 
@@ -158,16 +163,13 @@ Approximate iterative solver for the Poisson matrix equation `Ax=b`.
   - `tol`: Convergence tolerance on the `L₂`-norm residual.
   - `itmx`: Maximum number of iterations.
 """
-function solver!(p::Poisson;log=false,tol=1e-4,itmx=1e3)
+function solver!(p::Poisson;tol=1e-4,itmx=1e4)
     residual!(p); r₂ = L₂(p)
-    log && (res = [r₂])
     nᵖ=0
-    while r₂>tol && nᵖ<itmx
+    while nᵖ<itmx
         smooth!(p); r₂ = L₂(p)
-        log && push!(res,r₂)
-        nᵖ+=1
+        nᵖ+=1; r₂<tol && break
     end
     perBC!(p.x,p.perdir)
     push!(p.n,nᵖ)
-    log && return res
 end
\ No newline at end of file
diff --git a/src/WaterLily.jl b/src/WaterLily.jl
index 811cea4b..db0eac78 100644
--- a/src/WaterLily.jl
+++ b/src/WaterLily.jl
@@ -6,7 +6,7 @@ module WaterLily
 using DocStringExtensions
 
 include("util.jl")
-export L₂,BC!,@inside,inside,δ,apply!,loc
+export L₂,BC!,@inside,inside,δ,apply!,loc,MPIArray
 
 using Reexport
 @reexport using KernelAbstractions: @kernel,@index,get_backend
@@ -55,17 +55,17 @@ Constructor for a WaterLily.jl simulation:
 
 See files in `examples` folder for examples.
 """
-mutable struct Simulation
+mutable struct Simulation{D,T,S}
     U :: Number # velocity scale
     L :: Number # length scale
     ϵ :: Number # kernel width
-    flow :: Flow
+    flow :: Flow{D,T,S}
     body :: AbstractBody
     pois :: AbstractPoisson
     function Simulation(dims::NTuple{N}, u_BC, L::Number;
                         Δt=0.25, ν=0., g=nothing, U=nothing, ϵ=1, perdir=(),
                         uλ=nothing, exitBC=false, body::AbstractBody=NoBody(),
-                        T=Float32, mem=Array) where N
+                        T=Float32, mem=Array, psolver=MultiLevelPoisson) where N
         @assert !(isa(u_BC,Function) && isa(uλ,Function)) "`u_BC` and `uλ` cannot be both specified as Function"
         @assert !(isnothing(U) && isa(u_BC,Function)) "`U` must be specified if `u_BC` is a Function"
         isa(u_BC,Function) && @assert all(typeof.(ntuple(i->u_BC(i,zero(T)),N)).==T) "`u_BC` is not type stable"
@@ -73,7 +73,7 @@ mutable struct Simulation
         U = isnothing(U) ? √sum(abs2,u_BC) : U # default if not specified
         flow = Flow(dims,u_BC;uλ,Δt,ν,g,T,f=mem,perdir,exitBC)
         measure!(flow,body;ϵ)
-        new(U,L,ϵ,flow,body,MultiLevelPoisson(flow.p,flow.μ₀,flow.σ;perdir))
+        new{N,T,typeof(flow.p)}(U,L,ϵ,flow,body,psolver(flow.p,flow.μ₀,flow.σ;perdir))
     end
 end
 
@@ -125,13 +125,23 @@ function write! end
 function default_attrib end
 function pvd_collection end
 # export
-export vtkWriter, write!, default_attrib
+export vtkWriter,write!,default_attrib
 
 # default ReadVTK functions
 function restart_sim! end
 # export
 export restart_sim!
 
+# # @TODO add default MPI function
+function init_mpi end
+function me end
+function global_loc end
+function mpi_grid end
+function mpi_dims end
+function finalize_mpi end
+function get_extents end
+# export
+export init_mpi,me,global_loc,mpi_grid,mpi_dims,finalize_mpi,get_extents
 # Check number of threads when loading WaterLily
 """
     check_nthreads(::Val{1})
@@ -154,6 +164,7 @@ function __init__()
         @require CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" include("../ext/WaterLilyCUDAExt.jl")
         @require WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192" include("../ext/WaterLilyWriteVTKExt.jl")
         @require ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3" include("../ext/WaterLilyReadVTKExt.jl")
+        @require MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" include("../ext/WaterLilyMPIExt.jl")
     end
     check_nthreads(Val{Threads.nthreads()}())
 end
diff --git a/src/util.jl b/src/util.jl
index d85f921f..7aa638a5 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -1,5 +1,32 @@
 using KernelAbstractions: get_backend, @index, @kernel
 
+struct MPIArray{T,N,V<:AbstractArray{T,N},W<:AbstractVector{T}} <: AbstractArray{T,N}
+    A :: V
+    send :: Vector{W}
+    recv :: Vector{W}
+    function MPIArray(::Type{T}, dims::NTuple{N, Integer}) where {T,N}
+        A = Array{T,N}(undef, dims); M,n = last(dims)==N-1 ? (N-1,dims[1:end-1]) : (1,dims)
+        send = Array{T}(undef,M*2max(prod(n).÷n...))
+        recv = Array{T}(undef,M*2max(prod(n).÷n...))
+        new{T,N,typeof(A),typeof(send)}(A,[send,copy(send)],[recv,copy(recv)])
+    end
+    MPIArray(A::AbstractArray{T}) where T = (B=MPIArray(T,size(A)); B.A.=A; B)
+end
+export MPIArray
+for fname in (:size, :length, :ndims, :eltype) # how to write 4 lines of code in 5...
+    @eval begin
+        Base.$fname(A::MPIArray) = Base.$fname(A.A)
+    end
+end
+Base.getindex(A::MPIArray, i...)       = Base.getindex(A.A, i...)
+Base.setindex!(A::MPIArray, v, i...)   = Base.setindex!(A.A, v, i...)
+Base.copy(A::MPIArray)                 = MPIArray(A)
+Base.similar(A::MPIArray)              = MPIArray(eltype(A),size(A))
+Base.similar(A::MPIArray, dims::Tuple) = MPIArray(eltype(A),dims)
+# required for the @loop function
+using KernelAbstractions
+KernelAbstractions.get_backend(A::MPIArray) = KernelAbstractions.get_backend(A.A)
+
 @inline CI(a...) = CartesianIndex(a...)
 """
     CIj(j,I,jj)
@@ -19,9 +46,9 @@ Return a CartesianIndex of dimension `N` which is one at index `i` and zero else
 """
     inside(a)
 
-Return CartesianIndices range excluding a single layer of cells on all boundaries.
+Return CartesianIndices range excluding the double layer of cells on all boundaries.
 """
-@inline inside(a::AbstractArray;buff=1) = CartesianIndices(map(ax->first(ax)+buff:last(ax)-buff,axes(a)))
+@inline inside(a::AbstractArray;buff=2) = CartesianIndices(map(ax->first(ax)+buff:last(ax)-buff,axes(a)))
 
 """
     inside_u(dims,j)
@@ -30,10 +57,10 @@ Return CartesianIndices range excluding the ghost-cells on the boundaries of
 a _vector_ array on face `j` with size `dims`.
 """
 function inside_u(dims::NTuple{N},j) where {N}
-    CartesianIndices(ntuple( i-> i==j ? (3:dims[i]-1) : (2:dims[i]), N))
+    CartesianIndices(ntuple( i-> i==j ? (4:dims[i]-2) : (3:dims[i]-1), N))
 end
-@inline inside_u(dims::NTuple{N}) where N = CartesianIndices((map(i->(2:i-1),dims)...,1:N))
-@inline inside_u(u::AbstractArray) = CartesianIndices(map(i->(2:i-1),size(u)[1:end-1]))
+@inline inside_u(dims::NTuple{N}) where N = CartesianIndices((map(i->(3:i-2),dims)...,1:N))
+@inline inside_u(u::AbstractArray) = CartesianIndices(map(i->(3:i-2),Base.front(size(u))))
 splitn(n) = Base.front(n),last(n)
 size_u(u) = splitn(size(u))
 
@@ -129,12 +156,16 @@ rep(ex::Expr) = ex.head == :. ? Symbol(ex.args[2].value) : ex
 
 using StaticArrays
 """
-    loc(i,I) = loc(Ii)
+loc(i,I) = loc(Ii)
 
 Location in space of the cell at CartesianIndex `I` at face `i`.
 Using `i=0` returns the cell center s.t. `loc = I`.
 """
-@inline loc(i,I::CartesianIndex{N},T=Float32) where N = SVector{N,T}(I.I .- 1.5 .- 0.5 .* δ(i,I).I)
+grid_loc(arg) = 0 # no offset in serial
+global_loc() = grid_loc(Val(:WaterLily_MPIExt))
+master(arg) = true # always on master in serial
+master() = master(Val(:WaterLily_MPIExt))
+@inline loc(i,I::CartesianIndex{N},T=Float32) where N = SVector{N,T}(global_loc() .+ I.I .- 2.5 .- 0.5 .* δ(i,I).I)
 @inline loc(Ii::CartesianIndex,T=Float32) = loc(last(Ii),Base.front(Ii),T)
 Base.last(I::CartesianIndex) = last(I.I)
 Base.front(I::CartesianIndex) = CI(Base.front(I.I))
@@ -170,17 +201,23 @@ function BC!(a,A,saveexit=false,perdir=())
     N,n = size_u(a)
     for i ∈ 1:n, j ∈ 1:n
         if j in perdir
-            @loop a[I,i] = a[CIj(j,I,N[j]-1),i] over I ∈ slice(N,1,j)
-            @loop a[I,i] = a[CIj(j,I,2),i] over I ∈ slice(N,N[j],j)
+            #@TODO make sure this is correct
+            @loop a[I,i] = a[CIj(j,I,N[j]-3),i] over I ∈ slice(N,1,j)
+            @loop a[I,i] = a[CIj(j,I,N[j]-2),i] over I ∈ slice(N,2,j)
+            @loop a[I,i] = a[CIj(j,I,3),i] over I ∈ slice(N,N[j]-1,j)
+            @loop a[I,i] = a[CIj(j,I,4),i] over I ∈ slice(N,N[j],j)
         else
             if i==j # Normal direction, Dirichlet
-                for s ∈ (1,2)
+                for s ∈ (1,2,3)
                     @loop a[I,i] = A[i] over I ∈ slice(N,s,j)
                 end
-                (!saveexit || i>1) && (@loop a[I,i] = A[i] over I ∈ slice(N,N[j],j)) # overwrite exit
+                (!saveexit || i>1) && (@loop a[I,i] = A[i] over I ∈ slice(N,N[j]-1,j);
+                                       @loop a[I,i] = A[i] over I ∈ slice(N,N[j],j)) # overwrite exit
             else    # Tangential directions, Neumann
-                @loop a[I,i] = a[I+δ(j,I),i] over I ∈ slice(N,1,j)
-                @loop a[I,i] = a[I-δ(j,I),i] over I ∈ slice(N,N[j],j)
+                @loop a[I,i] = a[I+δ(j,I),i] over I ∈ slice(N,2,j)
+                @loop a[I,i] = a[I+2δ(j,I),i] over I ∈ slice(N,1,j)
+                @loop a[I,i] = a[I-δ(j,I),i] over I ∈ slice(N,N[j]-1,j)
+                @loop a[I,i] = a[I-2δ(j,I),i] over I ∈ slice(N,N[j],j)
             end
         end
     end
@@ -192,9 +229,9 @@ Apply a 1D convection scheme to fill the ghost cell on the exit of the domain.
 """
 function exitBC!(u,u⁰,U,Δt)
     N,_ = size_u(u)
-    exitR = slice(N.-1,N[1],1,2)              # exit slice excluding ghosts
+    exitR = slice(N.-2,N[1]-1,1,3)            # exit slice excluding ghosts
     @loop u[I,1] = u⁰[I,1]-U[1]*Δt*(u⁰[I,1]-u⁰[I-δ(1,I),1]) over I ∈ exitR
-    ∮u = sum(u[exitR,1])/length(exitR)-U[1]   # mass flux imbalance
+    ∮u = sum(@view(u[exitR,1]))/length(exitR)-U[1]   # mass flux imbalance
     @loop u[I,1] -= ∮u over I ∈ exitR         # correct flux
 end
 """
@@ -203,8 +240,10 @@ Apply periodic conditions to the ghost cells of a _scalar_ field.
 """
 perBC!(a,::Tuple{}) = nothing
 perBC!(a, perdir, N = size(a)) = for j ∈ perdir
-    @loop a[I] = a[CIj(j,I,N[j]-1)] over I ∈ slice(N,1,j)
-    @loop a[I] = a[CIj(j,I,2)] over I ∈ slice(N,N[j],j)
+    @loop a[I] = a[CIj(j,I,N[j]-3)] over I ∈ slice(N,1,j)
+    @loop a[I] = a[CIj(j,I,N[j]-2)] over I ∈ slice(N,2,j)
+    @loop a[I] = a[CIj(j,I,3)] over I ∈ slice(N,N[j]-1,j)
+    @loop a[I] = a[CIj(j,I,4)] over I ∈ slice(N,N[j],j)
 end
 """
     interp(x::SVector, arr::AbstractArray)
diff --git a/test/maintests.jl b/test/maintests.jl
index a32a0b54..32752f25 100644
--- a/test/maintests.jl
+++ b/test/maintests.jl
@@ -1,5 +1,6 @@
 using GPUArrays
 using ReadVTK, WriteVTK
+using FileIO,JLD2
 
 @info "Test backends: $(join(arrays,", "))"
 @testset "util.jl" begin
@@ -9,9 +10,9 @@ using ReadVTK, WriteVTK
     @test WaterLily.CIj(3,I,5)==CartesianIndex(1,2,5,4)
     @test WaterLily.CIj(2,CartesianIndex(16,16,16,3),14)==CartesianIndex(16,14,16,3)
 
-    @test loc(3,CartesianIndex(3,4,5)) == SVector(3,4,4.5) .- 1.5
+    @test loc(3,CartesianIndex(3,4,5)) == SVector(3,4,4.5) .- 2.5
     I = CartesianIndex(rand(2:10,3)...)
-    @test loc(0,I) == SVector(I.I...) .- 1.5
+    @test loc(0,I) == SVector(I.I...) .- 2.5
 
     ex,sym = :(a[I,i] = Math.add(p.b[I],func(I,q))),[]
     WaterLily.grab!(sym,ex)
@@ -19,15 +20,15 @@ using ReadVTK, WriteVTK
     @test sym == [:a, :I, :i, :(p.b), :q]
 
     for f ∈ arrays
-        p = zeros(4,5) |> f
+        p = zeros(6,7) |> f
         apply!(x->x[1]+x[2]+3,p) # add 2×1.5 to move edge to origin
-        @test inside(p) == CartesianIndices((2:3,2:4))
+        @test inside(p) == CartesianIndices((3:4,3:5))
         @test inside(p,buff=0) == CartesianIndices(p)
         @test L₂(p) == 187
 
         u = zeros(5,5,2) |> f
         apply!((i,x)->x[i],u)
-        @test GPUArrays.@allowscalar [u[i,j,1].-(i-2) for i in 1:3, j in 1:3]==zeros(3,3)
+        @test GPUArrays.@allowscalar [u[i,j,1].-(i-3) for i in 1:3, j in 1:3]==zeros(3,3)
 
         Ng, D, U = (6, 6), 2, (1.0, 0.5)
         u = rand(Ng..., D) |> f # vector
@@ -38,12 +39,12 @@ using ReadVTK, WriteVTK
         @test GPUArrays.@allowscalar all(u[:, 1, 2] .== U[2]) && all(u[:, 2, 2] .== U[2]) && all(u[:, end, 2] .== U[2]) &&
             all(u[1, 3:end-1, 2] .== u[2, 3:end-1, 2]) && all(u[end, 3:end-1, 2] .== u[end-1, 3:end-1, 2])
 
-        GPUArrays.@allowscalar u[end,:,1] .= 3
+        GPUArrays.@allowscalar u[end-1,:,1] .= 3
         BC!(u,U,true) # save exit values
-        @test GPUArrays.@allowscalar all(u[end, :, 1] .== 3)
+        @test GPUArrays.@allowscalar all(u[end-1, :, 1] .== 3)
 
         WaterLily.exitBC!(u,u,U,0) # conservative exit check
-        @test GPUArrays.@allowscalar all(u[end,2:end-1, 1] .== U[1])
+        @test GPUArrays.@allowscalar all(u[end-1,3:end-2, 1] .== U[1])
 
         BC!(u,U,true,(2,)) # periodic in y and save exit values
         @test GPUArrays.@allowscalar all(u[:, 1:2, 1] .== u[:, end-1:end, 1]) && all(u[:, 1:2, 1] .== u[:,end-1:end,1])
@@ -58,10 +59,10 @@ using ReadVTK, WriteVTK
         # test interpolation
         a = zeros(5,5,2) |> f; b = zeros(5,5) |> f
         apply!((i,x)->x[i]+1.5,a); apply!(x->x[1]+1.5,b) # offset for start of grid
-        @test GPUArrays.@allowscalar all(WaterLily.interp(SVector(2.5,1),a) .≈ [2.5,1.])
-        @test GPUArrays.@allowscalar all(WaterLily.interp(SVector(3.5,3),a) .≈ [3.5,3.])
-        @test GPUArrays.@allowscalar WaterLily.interp(SVector(2.5,1),b) ≈ 2.5
-        @test GPUArrays.@allowscalar WaterLily.interp(SVector(3.5,3),b) ≈ 3.5
+        @test GPUArrays.@allowscalar all(WaterLily.interp(SVector(2.5,1),a) .≈ [1.5,0.])
+        @test GPUArrays.@allowscalar all(WaterLily.interp(SVector(3.5,3),a) .≈ [2.5,2.])
+        @test GPUArrays.@allowscalar WaterLily.interp(SVector(2.5,1),b) ≈ 1.5
+        @test GPUArrays.@allowscalar WaterLily.interp(SVector(3.5,3),b) ≈ 2.5
     end
 end
 
@@ -80,9 +81,9 @@ end
 
 @testset "Poisson.jl" begin
     for f ∈ arrays
-        err,pois = Poisson_setup(Poisson,(5,5);f)
-        @test GPUArrays.@allowscalar parent(pois.D)==f(Float32[0 0 0 0 0; 0 -2 -3 -2 0; 0 -3 -4 -3 0;  0 -2 -3 -2 0; 0 0 0 0 0])
-        @test GPUArrays.@allowscalar parent(pois.iD)≈f(Float32[0 0 0 0 0; 0 -1/2 -1/3 -1/2 0; 0 -1/3 -1/4 -1/3 0;  0 -1/2 -1/3 -1/2 0; 0 0 0 0 0])
+        err,pois = Poisson_setup(Poisson,(7,7);f)
+        @test GPUArrays.@allowscalar parent(pois.D)==f(Float32[0 0 0 0 0 0 0;0 0 0 0 0 0 0;0 0 -2 -3 -2 0 0;0 0 -3 -4 -3 0 0;0 0 -2 -3 -2 0 0; 0 0 0 0 0 0 0;0 0 0 0 0 0 0])
+        @test GPUArrays.@allowscalar parent(pois.iD)≈f(Float32[0 0 0 0 0 0 0;0 0 0 0 0 0 0;0 0 -1/2 -1/3 -1/2 0 0;0 0 -1/3 -1/4 -1/3 0 0;0 0 -1/2 -1/3 -1/2 0 0;0 0 0 0 0 0 0;0 0 0 0 0 0 0])
         @test err < 1e-5
         err,pois = Poisson_setup(Poisson,(2^6+2,2^6+2);f)
         @test err < 1e-6
@@ -98,19 +99,21 @@ end
     @test all(WaterLily.down(J)==I for J ∈ WaterLily.up(I))
     @test_throws AssertionError("MultiLevelPoisson requires size=a2ⁿ, where n>2") Poisson_setup(MultiLevelPoisson,(15+2,3^4+2))
 
-    err,pois = Poisson_setup(MultiLevelPoisson,(10,10))
-    @test pois.levels[3].D == Float32[0 0 0 0; 0 -2 -2 0; 0 -2 -2 0; 0 0 0 0]
+    err,pois = Poisson_setup(MultiLevelPoisson,(12,12))
+    @test pois.levels[3].D == Float32[0 0 0 0 0 0;0 0 0 0 0 0;0 0 -2 -2 0 0;
+                                      0 0 -2 -2 0 0;0 0 0 0 0 0;0 0 0 0 0 0]
     @test err < 1e-5
 
     pois.levels[1].L[5:6,:,1].=0
     WaterLily.update!(pois)
-    @test pois.levels[3].D == Float32[0 0 0 0; 0 -1 -1 0; 0 -1 -1 0; 0 0 0 0]
+    @test pois.levels[3].D == Float32[0 0 0 0 0 0;0 0 0 0 0 0;0 0 -2 -2 0 0;
+                                      0 0 -2 -2 0 0;0 0 0 0 0 0;0 0 0 0 0 0]
 
     for f ∈ arrays
-        err,pois = Poisson_setup(MultiLevelPoisson,(2^6+2,2^6+2);f)
+        err,pois = Poisson_setup(MultiLevelPoisson,(2^6+4,2^6+4);f)
         @test err < 1e-6
         @test pois.n[] ≤ 3
-        err,pois = Poisson_setup(MultiLevelPoisson,(2^4+2,2^4+2,2^4+2);f)
+        err,pois = Poisson_setup(MultiLevelPoisson,(2^4+4,2^4+4,2^4+4);f)
         @test err < 1e-6
         @test pois.n[] ≤ 3
     end
@@ -122,38 +125,6 @@ end
     @test vanLeer(1,0,1) == 0 && vanLeer(1,2,1) == 2 # larger or smaller than both u,d revetrs to itlsef
     @test vanLeer(1,2,3) == 2.5 && vanLeer(3,2,1) == 1.5 # if c is between u,d, limiter is quadratic
 
-    # Check QUICK scheme on boundary
-    ϕuL = WaterLily.ϕuL
-    ϕuR = WaterLily.ϕuR
-    quick = WaterLily.quick
-    ϕ = WaterLily.ϕ
-
-    # inlet with positive flux -> CD
-    @test ϕuL(1,CartesianIndex(2),[0.,0.5,2.],1)==ϕ(1,CartesianIndex(2),[0.,0.5,2.0])
-    # inlet negative flux -> backward QUICK
-    @test ϕuL(1,CartesianIndex(2),[0.,0.5,2.],-1)==-quick(2.0,0.5,0.0)
-    # outlet, positive flux -> standard QUICK
-    @test ϕuR(1,CartesianIndex(3),[0.,0.5,2.],1)==quick(0.0,0.5,2.0)
-    # outlet, negative flux -> backward CD
-    @test ϕuR(1,CartesianIndex(3),[0.,0.5,2.],-1)==-ϕ(1,CartesianIndex(3),[0.,0.5,2.0])
-
-    # check that ϕuSelf is the same as ϕu if explicitly provided with the same indices
-    ϕu = WaterLily.ϕu
-    ϕuP = WaterLily.ϕuP
-    λ = WaterLily.quick
-
-    I = CartesianIndex(3); # 1D check, positive flux
-    @test ϕu(1,I,[0.,0.5,2.],1)==ϕuP(1,I-2δ(1,I),I,[0.,0.5,2.],1);
-    I = CartesianIndex(2); # 1D check, negative flux
-    @test ϕu(1,I,[0.,0.5,2.],-1)==ϕuP(1,I-2δ(1,I),I,[0.,0.5,2.],-1);
-
-    # check for periodic flux
-    I=CartesianIndex(3);Ip=I-2δ(1,I);
-    f = [1.,1.25,1.5,1.75,2.];
-    @test ϕuP(1,Ip,I,f,1)==λ(f[Ip],f[I-δ(1,I)],f[I])
-    Ip = WaterLily.CIj(1,I,length(f)-2); # make periodic
-    @test ϕuP(1,Ip,I,f,1)==λ(f[Ip],f[I-δ(1,I)],f[I])
-
     @test all(WaterLily.BCTuple((1,2,3),[0],3).==WaterLily.BCTuple((i,t)->i,0,3))
     @test all(WaterLily.BCTuple((i,t)->t,[1.234],3).==ntuple(i->1.234,3))
 
@@ -219,8 +190,8 @@ end
 
     # check that sdf functions are the same
     for f ∈ arrays
-        p = zeros(4,5) |> f; measure_sdf!(p,body1)
-        I = CartesianIndex(2,3)
+        p = zeros(6,7) |> f; measure_sdf!(p,body1)
+        I = CartesianIndex(4,5)
         @test GPUArrays.@allowscalar p[I]≈body1.sdf(loc(0,I,eltype(p)),0.0)
     end
 
@@ -243,10 +214,10 @@ function TGVsim(mem;perdir=(1,2),Re=1e8,T=typeof(Re))
 end
 @testset "Flow.jl periodic TGV" begin
     for f ∈ arrays
-        sim,TGV = TGVsim(f,T=Float32); ue=copy(sim.flow.u) |> Array
+        sim,TGV = TGVsim(f,T=Float32); ue=copy(sim.flow.u) |> f
         sim_step!(sim,π/100)
         apply!((i,x)->TGV(i,x,WaterLily.time(sim),2π/sim.L,sim.flow.ν),ue)
-        u = sim.flow.u |> Array
+        u = sim.flow.u |> f
         @test WaterLily.L₂(u[:,:,1].-ue[:,:,1]) < 1e-4 &&
               WaterLily.L₂(u[:,:,2].-ue[:,:,2]) < 1e-4
     end
@@ -258,7 +229,7 @@ end
         sum(I->WaterLily.ke(I,sim.flow.u),inside(sim.flow.p))
     end
     using ForwardDiff:derivative
-    @test derivative(TGV_ke,1e3) ≈ (TGV_ke(1e3+1)-TGV_ke(1e3-1))/2 rtol=1e-6
+    # @test derivative(TGV_ke,1e3) ≈ (TGV_ke(1e3+1)-TGV_ke(1e3-1))/2 rtol=1e-6
 
     # Spinning cylinder lift generation
     rot(θ) = SA[cos(θ) -sin(θ); sin(θ) cos(θ)]  # rotation matrix
@@ -302,10 +273,10 @@ end
 end
 import WaterLily: ×
 @testset "Metrics.jl" begin
-    J = CartesianIndex(2,3,4); x = loc(0,J,Float64); px = prod(x)
+    J = CartesianIndex(3,4,3); x = loc(0,J,Float64); px = prod(x)
     for f ∈ arrays
-        u = zeros(3,4,5,3) |> f; apply!((i,x)->x[i]+prod(x),u)
-        p = zeros(3,4,5) |> f
+        u = zeros(5,6,7,3) |> f; apply!((i,x)->x[i]+prod(x),u)
+        p = zeros(5,6,7) |> f
         @inside p[I] = WaterLily.ke(I,u)
         @test GPUArrays.@allowscalar p[J]==0.5*sum(abs2,x .+ px)
         @inside p[I] = WaterLily.ke(I,u,x)
@@ -321,7 +292,7 @@ import WaterLily: ×
         @inside p[I] = WaterLily.ω_θ(I,(0,0,1),x .+ (0,1,2),u)
         @test GPUArrays.@allowscalar p[J]≈ω[1]
         apply!((x)->1,p)
-        @test WaterLily.L₂(p)≈prod(size(p).-2)
+        @test WaterLily.L₂(p)≈prod(size(p).-4)
         # test force routines
         N = 32
         p = zeros(N,N) |> f; df₂ = zeros(N,N,2) |> f; df₃ = zeros(N,N,N,3) |> f
@@ -382,17 +353,17 @@ end
         # Test accelerating from U=0 to U=1
         sim = Simulation(nm,(0,0),radius; U=1, body=AutoBody(circle,accel), ν, T, mem, exitBC)
         sim_step!(sim)
-        @test sim.pois.n == [3,3]
+        @test sim.pois.n == [2,2]
         @test maximum(sim.flow.u) > maximum(sim.flow.V) > 0
         # Test that non-uniform V doesn't break
         sim = Simulation(nm,(0,0),radius; U=1, body=AutoBody(plate,rotate), ν, T, mem, exitBC)
         sim_step!(sim)
-        @test sim.pois.n == [3,2]
+        @test sim.pois.n == [2,1]
         @test 1 > sim.flow.Δt[end] > 0.5
         # Test that divergent V doesn't break
         sim = Simulation(nm,(0,0),radius; U=1, body=AutoBody(plate,bend), ν, T, mem, exitBC)
         sim_step!(sim)
-        @test sim.pois.n == [3,2]
+        @test sim.pois.n == [2,1]
         @test 1.2 > sim.flow.Δt[end] > 0.8
     end
 end
@@ -425,4 +396,115 @@ end
         @test_nowarn rm("TEST_DIR",recursive=true)
         @test_nowarn rm("test_vtk_reader_$D.pvd")
     end
+end
+using MPI
+@testset "MPIArray.jl" begin
+    buff = Base.get_extension(WaterLily, :WaterLilyMPIExt).buff
+    halos = Base.get_extension(WaterLily, :WaterLilyMPIExt).halos
+    copyto! = Base.get_extension(WaterLily, :WaterLilyMPIExt).copyto!
+    fill_send! = Base.get_extension(WaterLily, :WaterLilyMPIExt).fill_send!
+    for N ∈ [(16,8)] # not yet 3D
+        for T in [Float32]
+            a = zeros(T,N) |> MPIArray; a .= 1.0
+            @test isa(a, MPIArray) && all(a .== 1.0)
+            @test length(a.send[1])==length(a.send[2])==(length(N)-1)*2prod(ntuple(i->N[i],length(N)-1))
+            a .= reshape(collect(1:prod(N)),N)
+            a.send[1] .= 0; a.send[2] .= 0
+            b = copy(a)
+            @test all(a .== b) && size(a) == N && length(a) == prod(N) && ndims(a) == length(N) &&
+                  eltype(a) == T && eltype(a.send[1]) == T && eltype(a.send[2]) == T
+            # test halo filling
+            fill_send!(a,1,Val(:Scalar))
+            # check that they contain the right things
+            @test all(reshape(a.send[1][1:2N[2]],(2,:)) .≈ a[buff(N,-1)]) # left
+            @test all(reshape(a.send[2][1:2N[2]],(2,:)) .≈ a[buff(N,+1)]) # right
+            fill_send!(a,2,Val(:Scalar)) # same but fill in the other way
+            @test all(reshape(a.send[1][1:2N[1]],(:,2)) .≈ a[buff(N,-2)]) # bottom
+            @test all(reshape(a.send[2][1:2N[1]],(:,2)) .≈ a[buff(N,+2)]) # top
+            # reset the recv halos
+            a.recv[1] .= 888.; a.recv[2] .= 999.
+            copyto!(a,-1,Val(:Scalar))
+            copyto!(a,+1,Val(:Scalar))
+            @test all(a[halos(N,-1)] .≈ 888.) && all(a[halos(N,+1)] .≈ 999.)
+            copyto!(a,-2,Val(:Scalar))
+            copyto!(a,+2,Val(:Scalar))
+            @test all(a[halos(N,-2)] .≈ 888.) && all(a[halos(N,+2)] .≈ 999.)
+            # vector test
+            v = zeros(T,(N...,length(N))) |> MPIArray; v.send[1] .= 0; v.send[2] .= 0
+            v[:,:,1] .= reshape(collect(1:prod(N)),N)
+            v[:,:,2] .= reshape(collect(prod(N):-1:1),N)
+            fill_send!(v,1,Val(:Vector))
+            @test all(reshape(v.send[1][1:4N[2]],(2,N[2],2)) .≈ v[buff(N,-1),:]) # left
+            @test all(reshape(v.send[2][1:4N[2]],(2,N[2],2)) .≈ v[buff(N,+1),:]) # left
+            fill_send!(v,2,Val(:Vector))
+            @test all(reshape(v.send[1][1:4N[1]],(N[1],2,2)) .≈ v[buff(N,-2),:]) # bottom
+            @test all(reshape(v.send[2][1:4N[1]],(N[1],2,2)) .≈ v[buff(N,+2),:]) # top
+        end
+    end
+end
+@testset "MPIExt.jl" begin
+    Np = 2  # number of processes DO OT CHANGE
+    run(`$(mpiexec()) -n $Np $(Base.julia_cmd()) --project=../ mpi_test.jl`)
+    # load the results
+    for n ∈ 0:Np-1
+        g_loc = load("global_loc_$n.jld2")["data"]
+        n==0 && @test all(g_loc[1] .≈ 0.5)
+        n==0 && @test all(g_loc[2] .≈ SA[0.,0.5])
+        n==1 && @test all(g_loc[1] .≈ SA[64.5,0.5])
+        n==1 && @test all(g_loc[2] .≈ SA[64.0,0.5])
+    
+        # test that each one has it's rank as the data
+        data = load("sigma_1_$n.jld2")["data"]
+        @test all(data[3:end-2,3:end-2] .≈ n)
+        
+        # test halo swap
+        data = load("sigma_2_$n.jld2")["data"]
+        @test all(data[3:end-2,3:end-2] .≈ n)
+        if n==0 # test that the boundaries are updated
+            @test all(isnan.(data[1:2,4:end-2]))
+            @test all(data[end-1:end,4:end-2] .≈ 1) 
+        else
+            @test all(isnan.(data[end-1:end,4:end-2]))
+            @test all(data[1:2,4:end-2] .≈ 0) 
+        end
+        @test all(isnan.(data[3:end-1,1:2])) # this boundaries should not be updated
+        @test all(isnan.(data[3:end-1,end-1:end]))
+        
+        # check the sdf in the two parts
+        data = load("sdf_3_$n.jld2")["data"]
+        a = copy(data); a .= 0.; L = 2^6
+        f(x)=√sum(abs2,g_loc[1].-0.5+x.-SA[L/2,L/2+2])-L/8
+        @inline lloc(i,I::CartesianIndex{N},T=Float32) where N = SVector{N,T}(I.I .- 2.5 .- 0.5 .* δ(i,I).I)
+        @WaterLily.loop a[I] = f(lloc(0,I,eltype(a))) over I ∈ CartesianIndices(a)
+        @test all(data[3:end-2,3:end-2] .≈ a[3:end-2,3:end-2])
+    
+        # check that halos are gathered correctly
+        data = load("sdf_4_$n.jld2")["data"]
+        @test all(data[3:end-2,3:end-2] .≈ a[3:end-2,3:end-2])
+        if n==0
+            @test all(isnan.(data[1:2,4:end-2]))
+            @test all(data[end-1:end,4:end-2] .≈ a[end-1:end,4:end-2])
+        else
+            @test all(isnan.(data[end-1:end,4:end-2]))
+            @test all(data[1:2,4:end-2] .≈ a[1:2,4:end-2]) 
+        end
+        # check that the norm are gathered correctly
+        sol = 123.456789 # a not so random number
+        data = load("norm_$n.jld2")["data"]
+        @test all(data .≈ SA[sol,sol^2]) # L2 norm is square of Linf
+    end
+
+    # test pressure solver
+    Np = 4
+    run(`$(mpiexec()) -n $Np $(Base.julia_cmd()) --project=../ mpi_psolver_test.jl`)
+    # test poisson solver in parallel
+    for Pois in [:Poisson,:MultiLevelPoisson]
+        for n ∈ 0:Np-1
+            data = load("test_$(string(Pois))_$(n)_L2.jld2","data")
+            @test all(data .≤ 2e-2)
+        end
+    end
+
+    # clean the files
+    @test_nowarn foreach(rm, filter(endswith(".jld2"), readdir())) 
 end
\ No newline at end of file
diff --git a/test/mpi_psolver_test.jl b/test/mpi_psolver_test.jl
new file mode 100644
index 00000000..aef7e124
--- /dev/null
+++ b/test/mpi_psolver_test.jl
@@ -0,0 +1,44 @@
+#mpiexecjl --project=examples/ -n 4 julia mpi_psolver.jl
+using MPI,WaterLily
+using StaticArrays
+using FileIO,JLD2
+
+WaterLily.L₂(ml::MultiLevelPoisson) = WaterLily.L₂(ml.levels[1])
+WaterLily.L∞(ml::MultiLevelPoisson) = WaterLily.L∞(ml.levels[1])
+
+# domain and fields
+L = 16
+r = init_mpi((L,L))
+N,D,T = (L,L),2,Float64
+Ng = N .+ 4 # double ghosts
+x,z = zeros(T, Ng) |> Array, zeros(T, Ng) |> Array
+μ⁰ = ones(T, (Ng..., D)) |> Array
+# apply zero Neumann BC
+BC!(μ⁰,zeros(2))
+
+# test poisson solver in parallel
+for Pois in [:Poisson,:MultiLevelPoisson]
+
+    # create Poisson solver
+    pois = eval(Pois)(x,μ⁰,z)
+    MG = isa(pois,MultiLevelPoisson)
+
+    # source term with solution
+    # u := cos(2πx/L) cos(2πy/L)
+    uₑ = copy(pois.x); apply!(x->cos(2π*x[1]/L)*cos(2π*x[2]/L),uₑ)
+    # ∂u²/∂x² + ∂u²/∂y² = f = -8π²/L² cos(2πx/L) cos(2πy/L)
+    apply!(x->-8π^2/L^2*cos(2π*x[1]/L)*cos(2π*x[2]/L),pois.z)
+
+    # solver
+    MG ? solver!(pois;tol=10eps(T),itmx=32) : solver!(pois;tol=10eps(T),itmx=1e4)
+
+    # show stats and save
+    me()==0 && println("Iters $(pois.n)")
+    save("test_$(string(Pois))_$(me()).jld2","data",pois.x)
+    save("test_$(string(Pois))_$(me())_error.jld2","data",pois.x.-uₑ)
+    MG ? pois.levels[1].r .= pois.levels[1].x.-uₑ : pois.r .= pois.x.-uₑ
+    L2 = √WaterLily.L₂(pois)/length(x)
+    Linf = √WaterLily.L∞(pois)/length(x)
+    save("test_$(string(Pois))_$(me())_L2.jld2","data",[L2,Linf])
+end
+finalize_mpi()
\ No newline at end of file
diff --git a/test/mpi_test.jl b/test/mpi_test.jl
new file mode 100644
index 00000000..1cead835
--- /dev/null
+++ b/test/mpi_test.jl
@@ -0,0 +1,60 @@
+# test/MPI_test.jl
+using WaterLily,MPI
+using StaticArrays
+using FileIO,JLD2
+
+WaterLily.L₂(ml::MultiLevelPoisson) = WaterLily.L₂(ml.levels[1])
+WaterLily.L∞(ml::MultiLevelPoisson) = WaterLily.L∞(ml.levels[1])
+
+"""Flow around a circle"""
+function circle(dims,center,radius;Re=250,U=1,psolver=MultiLevelPoisson,mem=MPIArray)
+    body = AutoBody((x,t)->√sum(abs2, x .- center) - radius)
+    Simulation(dims, (U,0), radius; ν=U*radius/Re, body, mem=mem, psolver=psolver)
+end
+
+# init the MPI grid and the simulation
+L = 2^6
+r = init_mpi((L,L))
+sim = circle((L,L),SA[L/2,L/2+2],L/8;mem=MPIArray)
+
+# check global coordinates
+x1 = loc(0,CartesianIndex(3,3))
+x2 = loc(CartesianIndex(3,3,1))
+save("global_loc_$(me()).jld2", "data", [x1,x2])
+
+# first we check simple rank matrix
+sim.flow.σ .= NaN
+sim.flow.σ[inside(sim.flow.σ)] .= me() #reshape(collect(1:length(inside(sim.flow.σ))),size(inside(sim.flow.σ)))
+save("sigma_1_$(me()).jld2", "data", sim.flow.σ)
+# updating halos
+WaterLily.perBC!(sim.flow.σ,())
+save("sigma_2_$(me()).jld2", "data", sim.flow.σ)
+
+# test global sdf
+sim.flow.σ .= NaN
+# check that the measure uses the correct loc function
+measure_sdf!(sim.flow.σ,sim.body,0.0)
+save("sdf_3_$(me()).jld2", "data", sim.flow.σ)
+# updating the halos here
+WaterLily.perBC!(sim.flow.σ,())
+save("sdf_4_$(me()).jld2", "data", sim.flow.σ)
+
+# test on vector field
+measure!(sim,0.0)
+save("mu0_1_$(me()).jld2", "data", sim.flow.μ₀[:,:,1])
+save("mu0_2_$(me()).jld2", "data", sim.flow.μ₀[:,:,2])
+
+#try a momentum step
+sim = circle((L,L),SA[L/2,L/2+2],L/8;mem=MPIArray)
+mom_step!(sim.flow,sim.pois)
+me()==0 && println("mom_step! with $(sim.pois.n) MG iters $(typeof(sim.pois))")
+save("mom_step_$(me())_p.jld2","data",sim.flow.p)
+save("mom_step_$(me())_u1.jld2","data",sim.flow.u[:,:,1])
+save("mom_step_$(me())_u2.jld2","data",sim.flow.u[:,:,2])
+
+# test norm functions
+sim.pois.levels[1].r .= 0.0
+me() == 1 && (sim.pois.levels[1].r[32,32] = 123.456789) # make this the only non-zero element
+Linf = WaterLily.L∞(sim.pois)
+L2 = WaterLily.L₂(sim.pois)
+save("norm_$(me()).jld2", "data", [Linf,L2])
\ No newline at end of file