From 40527c2b8ee044e2aa61770eaa2046dbc0cd5299 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 May 2023 03:00:59 +0000 Subject: [PATCH 001/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 94bdca6f19..b57d1d67c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 94bdca6f1997a9c46a2b58eea187da132151d8ed +Subproject commit b57d1d67c1d3f4d2ad73830993dfccaca5f43682 From d1de13949c3919240a4e5fe237ce85a2bfba8c60 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 May 2023 21:47:34 +0000 Subject: [PATCH 002/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 12acf92588..b57d1d67c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 12acf9258896626d7ebd51b159a70c6dc3f16c1b +Subproject commit b57d1d67c1d3f4d2ad73830993dfccaca5f43682 From 3d6da71f4ac818cb7d4dfae2a931048c6f103e83 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 23 May 2023 21:57:49 +0000 Subject: [PATCH 003/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 12acf92588..b57d1d67c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 12acf9258896626d7ebd51b159a70c6dc3f16c1b +Subproject commit b57d1d67c1d3f4d2ad73830993dfccaca5f43682 From 2ccdc3f028021c9ce800fb58785796e7fd6c775b Mon Sep 17 00:00:00 2001 From: Peixin Date: Wed, 24 May 2023 09:07:53 +0800 Subject: [PATCH 004/113] Init version 23.08.0-SNAPSHOT (#1159) * Init version 23.08.0-SNAPSHOT Signed-off-by: Peixin Li * include doc update Signed-off-by: Peixin Li --------- Signed-off-by: Peixin Li --- .gitmodules | 2 +- CONTRIBUTING.md | 6 +++--- pom.xml | 4 ++-- src/main/cpp/CMakeLists.txt | 2 +- thirdparty/cudf | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.gitmodules b/.gitmodules index fb30be9970..3c32c9d1a2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "thirdparty/cudf"] path = thirdparty/cudf url = https://github.com/rapidsai/cudf.git - branch = branch-23.06 + branch = branch-23.08 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a5c092f98e..a0846edbc1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,8 +35,8 @@ Maven `package` goal can be used to build the RAPIDS Accelerator JNI jar. After build the RAPIDS Accelerator JNI jar will be in the `spark-rapids-jni/target/` directory. Be sure to select the jar with the CUDA classifier. -When building spark-rapids-jni, the pom.xml in the submodule thirdparty/cudf is completely -bypassed. For a detailed explanation please read +When building spark-rapids-jni, the pom.xml in the submodule thirdparty/cudf is completely +bypassed. For a detailed explanation please read [this](https://github.com/NVIDIA/spark-rapids-jni/issues/1084#issuecomment-1513471739). 
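As a rough sketch of the Maven flow described above (the skip-tests flag and the classifier glob below are assumptions for illustration, not taken from this patch):

```bash
# Build the RAPIDS Accelerator JNI jar with the Maven package goal.
$ mvn clean package -DskipTests

# The jar ends up under spark-rapids-jni/target/; pick the one with the CUDA
# classifier (the exact classifier string, e.g. cuda11, is assumed here).
$ ls spark-rapids-jni/target/spark-rapids-jni-*-cuda*.jar
```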
### Building in the Docker Container @@ -148,7 +148,7 @@ $ ./build/build-in-docker install ... ``` Now cd to ~/repos/NVIDIA/spark-rapids and build with one of the options from -[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-23.06/CONTRIBUTING.md#building-from-source). +[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-23.08/CONTRIBUTING.md#building-from-source). ```bash $ ./build/buildall diff --git a/pom.xml b/pom.xml index 8c3af0fb9f..58adcd624a 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ com.nvidia spark-rapids-jni - 23.06.0-SNAPSHOT + 23.08.0-SNAPSHOT jar RAPIDS Accelerator JNI for Apache Spark @@ -259,7 +259,7 @@ test - diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index d3b46dc1a7..8239ab1211 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -32,7 +32,7 @@ rapids_cuda_init_architectures(SPARK_RAPIDS_JNI) project( SPARK_RAPIDS_JNI - VERSION 23.06.00 + VERSION 23.08.00 LANGUAGES C CXX CUDA ) diff --git a/thirdparty/cudf b/thirdparty/cudf index b57d1d67c1..0b566b9400 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit b57d1d67c1d3f4d2ad73830993dfccaca5f43682 +Subproject commit 0b566b940029bb44cc2158be79b57194d3aafc5a From 96b15fbce08eb92f6364d377454d2edcc8dd0fe6 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 25 May 2023 07:58:19 +0000 Subject: [PATCH 005/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 12acf92588..0b566b9400 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 12acf9258896626d7ebd51b159a70c6dc3f16c1b +Subproject commit 0b566b940029bb44cc2158be79b57194d3aafc5a From 6c48f5d2eeb8b1e97012839ef08be37711b5543e Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 May 2023 04:06:14 +0000 Subject: [PATCH 006/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 12acf92588..0b566b9400 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 12acf9258896626d7ebd51b159a70c6dc3f16c1b +Subproject commit 0b566b940029bb44cc2158be79b57194d3aafc5a From b79e00979761a737ba84560a5f4a7bc3a3d0c03e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 May 2023 12:51:18 +0800 Subject: [PATCH 007/113] [submodule-sync] bot-submodule-sync-branch-23.08 to branch-23.08 [skip ci] [bot] (#1166) * Update submodule cudf to 097b828c21772a2399e9bae0d4f8c7234cc0f456 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 9a0f87c320c5322bc88732dde3c4147792d607e0 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to fd13c877e10e9fa69fa63ec7cd5b64bf6a6805d5 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to a03da13ceb294db766c2bc6ade400471584656ba Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 
126fa3515b22315e145bf8b921f4869e98665499 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 37f76c820ddf833a80b4ce706b9c3e84908e51ff Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to 5b3e3abeaacc6069bc6ea9d92ebc28408b82ff37 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0b566b9400..5b3e3abeaa 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0b566b940029bb44cc2158be79b57194d3aafc5a +Subproject commit 5b3e3abeaacc6069bc6ea9d92ebc28408b82ff37 From 2568aadc1dd08894ac4f1d07c8e1d8c8317d3860 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 May 2023 04:56:32 +0000 Subject: [PATCH 008/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 85699df8fa..5b3e3abeaa 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 85699df8fac14358a83aa319dce9aa1400ae4f1e +Subproject commit 5b3e3abeaacc6069bc6ea9d92ebc28408b82ff37 From b92db50218686ea979bb4f86076a2a4b3b773ee7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 26 May 2023 20:59:55 +0800 Subject: [PATCH 009/113] Update submodule cudf to cc317edbe48923c4a71673e1ef294d4050a29418 (#1176) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5b3e3abeaa..cc317edbe4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5b3e3abeaacc6069bc6ea9d92ebc28408b82ff37 +Subproject commit cc317edbe48923c4a71673e1ef294d4050a29418 From e11c71b47f693b59a8865cd0879457db5c075e9f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 27 May 2023 11:02:23 +0800 Subject: [PATCH 010/113] Update submodule cudf to 90bb88738d1fb2133f9f003139d4df712ec0b128 (#1177) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index cc317edbe4..90bb88738d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit cc317edbe48923c4a71673e1ef294d4050a29418 +Subproject commit 90bb88738d1fb2133f9f003139d4df712ec0b128 From afa5b9c5f32bd8798334f4884f0dd8519210f1a7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 30 May 2023 11:01:43 +0800 Subject: [PATCH 011/113] Update submodule cudf to 4384c3bbf692042d2b7f0fe2869815f5741f0ea7 (#1178) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 90bb88738d..4384c3bbf6 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 90bb88738d1fb2133f9f003139d4df712ec0b128 +Subproject commit 
4384c3bbf692042d2b7f0fe2869815f5741f0ea7 From 0e295caddfa5e3a2d7204f722ef81805e3bb79b2 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 30 May 2023 04:18:06 +0000 Subject: [PATCH 012/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 85699df8fa..4384c3bbf6 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 85699df8fac14358a83aa319dce9aa1400ae4f1e +Subproject commit 4384c3bbf692042d2b7f0fe2869815f5741f0ea7 From fb0cf774f144f412564ec2e97267e77e65fdce96 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 31 May 2023 04:17:21 +0800 Subject: [PATCH 013/113] Update submodule cudf to 5e12c25ca40c41f21fc80d4cd36713c514fabaa4 (#1180) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4384c3bbf6..5e12c25ca4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4384c3bbf692042d2b7f0fe2869815f5741f0ea7 +Subproject commit 5e12c25ca40c41f21fc80d4cd36713c514fabaa4 From 1239ab13e893d3c7b49e31c1bc11358ccea48a55 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 30 May 2023 22:31:44 +0000 Subject: [PATCH 014/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5541e64bde..5e12c25ca4 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5541e64bdebdc3a4a22a9eba0601908ba7b3c616 +Subproject commit 5e12c25ca40c41f21fc80d4cd36713c514fabaa4 From 85b98874e9d03ec039c22aa9e69428e594e5088d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 31 May 2023 10:39:19 +0800 Subject: [PATCH 015/113] Update submodule cudf to 87a8ede8dcd9b6cd6e38c41f74daec316f48e7db (#1183) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5e12c25ca4..87a8ede8dc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5e12c25ca40c41f21fc80d4cd36713c514fabaa4 +Subproject commit 87a8ede8dcd9b6cd6e38c41f74daec316f48e7db From 939268c9d33638ae3f0f43a4505b37b415f21049 Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 31 May 2023 16:02:57 +0000 Subject: [PATCH 016/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5541e64bde..87a8ede8dc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5541e64bdebdc3a4a22a9eba0601908ba7b3c616 +Subproject commit 87a8ede8dcd9b6cd6e38c41f74daec316f48e7db From 0280ada798458e49662b09ac2c5b33064cc84f34 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 1 Jun 
2023 17:01:16 +0800 Subject: [PATCH 017/113] Update submodule cudf to ebc68df255b9f304ee19b969645a10d5ab9f8bea (#1185) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 87a8ede8dc..ebc68df255 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 87a8ede8dcd9b6cd6e38c41f74daec316f48e7db +Subproject commit ebc68df255b9f304ee19b969645a10d5ab9f8bea From 1f18426986bdf2e1648d0485883f586737c940b7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 2 Jun 2023 04:17:27 +0800 Subject: [PATCH 018/113] Update submodule cudf to 8c6c087deb3ab30e1df761952c29b703580a6382 (#1186) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ebc68df255..8c6c087deb 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ebc68df255b9f304ee19b969645a10d5ab9f8bea +Subproject commit 8c6c087deb3ab30e1df761952c29b703580a6382 From b61a32e1cd9879cc2eeac8b91adaedd7962dbe22 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 2 Jun 2023 11:00:58 +0800 Subject: [PATCH 019/113] Update submodule cudf to e01e497fe0d1f884ad2966497d1c5f3d6b5fc90b (#1187) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8c6c087deb..e01e497fe0 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8c6c087deb3ab30e1df761952c29b703580a6382 +Subproject commit e01e497fe0d1f884ad2966497d1c5f3d6b5fc90b From a2c0b44ad5b9f557820015643019c139b6a9d972 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 2 Jun 2023 23:00:14 +0800 Subject: [PATCH 020/113] Update submodule cudf to 87c69cbcfe5f6879274d091a3dd389802b207815 (#1188) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e01e497fe0..87c69cbcfe 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e01e497fe0d1f884ad2966497d1c5f3d6b5fc90b +Subproject commit 87c69cbcfe5f6879274d091a3dd389802b207815 From bcf476152b26bc1aebe6d85536d3aa051d9acf0f Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 2 Jun 2023 17:32:11 +0000 Subject: [PATCH 021/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6bb0328148..87c69cbcfe 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6bb0328148cb3f66693487da5c714b11d1051831 +Subproject commit 87c69cbcfe5f6879274d091a3dd389802b207815 From f2babd469e848f8c21c423668efa666d5c213d4a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 3 Jun 2023 05:01:14 +0800 Subject: [PATCH 022/113] Update submodule cudf to 4b7b6d59a855f93c2879325935dc5946f7f9702d (#1191) Signed-off-by: 
spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 87c69cbcfe..4b7b6d59a8 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 87c69cbcfe5f6879274d091a3dd389802b207815 +Subproject commit 4b7b6d59a855f93c2879325935dc5946f7f9702d From 5ebcd8bd9be352b9541c4d7b53b3ffa8e652866e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 6 Jun 2023 05:01:42 +0800 Subject: [PATCH 023/113] Update submodule cudf to 9092b56c2c663c0377a501b2c7c0d5951f33458d (#1192) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 4b7b6d59a8..9092b56c2c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4b7b6d59a855f93c2879325935dc5946f7f9702d +Subproject commit 9092b56c2c663c0377a501b2c7c0d5951f33458d From 7fcb3b4f8c370879e7f8d466d2acec5d593849cb Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 6 Jun 2023 11:01:13 +0800 Subject: [PATCH 024/113] Update submodule cudf to 74024eb769031716f619377223239c568a7e5994 (#1193) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9092b56c2c..74024eb769 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9092b56c2c663c0377a501b2c7c0d5951f33458d +Subproject commit 74024eb769031716f619377223239c568a7e5994 From c9fc83aca186707d4c7e4d14c55ef753c9c261f5 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 7 Jun 2023 05:04:00 +0800 Subject: [PATCH 025/113] Update submodule cudf to b8c0501d184a63c59cd70aca189b4887ba4f13fd (#1194) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 74024eb769..b8c0501d18 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 74024eb769031716f619377223239c568a7e5994 +Subproject commit b8c0501d184a63c59cd70aca189b4887ba4f13fd From 5fc1b89726e5cb195dfa557df83f87cfb3bfb717 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 7 Jun 2023 11:00:47 +0800 Subject: [PATCH 026/113] Update submodule cudf to bb92f162a8082b6717f04930cff5f372f7f9d42b (#1196) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index b8c0501d18..bb92f162a8 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit b8c0501d184a63c59cd70aca189b4887ba4f13fd +Subproject commit bb92f162a8082b6717f04930cff5f372f7f9d42b From 3453adb5a1095f1b558c76b634d3e2044e7f3b2f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 7 Jun 2023 17:00:51 +0800 Subject: [PATCH 027/113] Update submodule cudf to 89dcca5a36f1eb92bf41aaa9a474db79e2190803 (#1197) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index bb92f162a8..89dcca5a36 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit bb92f162a8082b6717f04930cff5f372f7f9d42b +Subproject commit 89dcca5a36f1eb92bf41aaa9a474db79e2190803 From f4bd4986e46a1fcecf2f5bdd1915b60f301f48ab Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 7 Jun 2023 17:33:54 +0000 Subject: [PATCH 028/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 36f7f4dfdd..89dcca5a36 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 36f7f4dfdd648d72ec4b77ea6658ad45333e36d2 +Subproject commit 89dcca5a36f1eb92bf41aaa9a474db79e2190803 From 6fb77b67c41021cbd15b4738fe2167eace4b900a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 8 Jun 2023 05:27:55 +0800 Subject: [PATCH 029/113] Update submodule cudf to a99f31318cac1065a588c92b0a2f2943aa10b70d (#1200) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 89dcca5a36..a99f31318c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 89dcca5a36f1eb92bf41aaa9a474db79e2190803 +Subproject commit a99f31318cac1065a588c92b0a2f2943aa10b70d From 88cea8cd5b432951ca64f32278ebead55896891d Mon Sep 17 00:00:00 2001 From: spark-rapids automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 8 Jun 2023 02:22:53 +0000 Subject: [PATCH 030/113] Auto-merge use submodule in BASE ref Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f881d40c63..a99f31318c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f881d40c634b02004a544a15be4fbcf4f3858c68 +Subproject commit a99f31318cac1065a588c92b0a2f2943aa10b70d From cd9713c1684229f3180c44335b1d523d6466de4e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 8 Jun 2023 11:01:42 +0800 Subject: [PATCH 031/113] Update submodule cudf to 8055c2db6f80286b64d36f3927510bcf2e0eec02 (#1203) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index a99f31318c..8055c2db6f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit a99f31318cac1065a588c92b0a2f2943aa10b70d +Subproject commit 8055c2db6f80286b64d36f3927510bcf2e0eec02 From d1f600877d944b566e8548b294e68ffbc0b04e68 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 9 Jun 2023 04:25:58 +0800 Subject: [PATCH 032/113] Update submodule cudf to 7e0e44b4536bea9ae6e786df729cb87266babaaf (#1204) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8055c2db6f..7e0e44b453 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ 
-Subproject commit 8055c2db6f80286b64d36f3927510bcf2e0eec02 +Subproject commit 7e0e44b4536bea9ae6e786df729cb87266babaaf From 35af9df3d21e06c44a3511ccc7b66df59fe04af7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 9 Jun 2023 11:02:41 +0800 Subject: [PATCH 033/113] Update submodule cudf to 9be38d299748af3c29be29c6faa62cb3296bdf8d (#1205) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7e0e44b453..9be38d2997 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7e0e44b4536bea9ae6e786df729cb87266babaaf +Subproject commit 9be38d299748af3c29be29c6faa62cb3296bdf8d From e438a282ffc11d4bfb8f2d9a0ee5fee10d558e8a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 10 Jun 2023 05:00:36 +0800 Subject: [PATCH 034/113] Update submodule cudf to be501f53f19b9e73436f91ed0046ac47d80b1d4a (#1206) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9be38d2997..be501f53f1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9be38d299748af3c29be29c6faa62cb3296bdf8d +Subproject commit be501f53f19b9e73436f91ed0046ac47d80b1d4a From a5d4cc6509f6fb7822e789b4dda8b280b1cb4cbb Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Fri, 9 Jun 2023 20:05:54 -0700 Subject: [PATCH 035/113] Update cmake (#1207) Signed-off-by: Nghia Truong --- ci/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 4dcb69dd63..7d59fef5f5 100755 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -36,7 +36,7 @@ RUN scl enable rh-python38 "pip install requests 'urllib3<2.0'" RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && chmod 777 /rapids # 3.22.3: CUDA architecture 'native' support + flexible CMAKE__*_LAUNCHER for ccache -ARG CMAKE_VERSION=3.23.3 +ARG CMAKE_VERSION=3.26.4 RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ tar zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ From 319b2d487373e1e0f9d3d0fd8b8ef370ab333471 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 10 Jun 2023 16:16:54 +0800 Subject: [PATCH 036/113] [submodule-sync] bot-submodule-sync-branch-23.08 to branch-23.08 [skip ci] [bot] (#1208) * Update submodule cudf to ded6122db67ad4676b4d2c1f89f181fa281ba66c Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> * Update submodule cudf to c733cc35071805414a26d750805baedab65a35f9 Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --------- Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index be501f53f1..c733cc3507 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit be501f53f19b9e73436f91ed0046ac47d80b1d4a +Subproject commit c733cc35071805414a26d750805baedab65a35f9 From b84eed4a472b6d4cef9c21e011842aa1bfd045fa Mon Sep 17 00:00:00 2001 From: 
Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 10 Jun 2023 23:00:37 +0800 Subject: [PATCH 037/113] Update submodule cudf to 572420467704301a8856f1b5550059062cc1018b (#1209) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c733cc3507..5724204677 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c733cc35071805414a26d750805baedab65a35f9 +Subproject commit 572420467704301a8856f1b5550059062cc1018b From 413e2106549837dbaff83767d6a1fe5bd2b21cc8 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 12 Jun 2023 23:00:58 +0800 Subject: [PATCH 038/113] Update submodule cudf to deec3f8f981fd89f1aa46c6aea3714fd7c7355b9 (#1210) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5724204677..deec3f8f98 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 572420467704301a8856f1b5550059062cc1018b +Subproject commit deec3f8f981fd89f1aa46c6aea3714fd7c7355b9 From 5f97292ce261987e292c27fb4a4e11c0ac17219a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 13 Jun 2023 05:02:17 +0800 Subject: [PATCH 039/113] Update submodule cudf to 13f4805598615593325b21e0d152edb840d5a8a2 (#1211) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index deec3f8f98..13f4805598 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit deec3f8f981fd89f1aa46c6aea3714fd7c7355b9 +Subproject commit 13f4805598615593325b21e0d152edb840d5a8a2 From e8d27dcb34f26f03edabac8f597a9fd337361517 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 13 Jun 2023 11:01:29 +0800 Subject: [PATCH 040/113] Update submodule cudf to 59238c1fb8b8ccdcba8dcff48544477ad1e8560c (#1212) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 13f4805598..59238c1fb8 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 13f4805598615593325b21e0d152edb840d5a8a2 +Subproject commit 59238c1fb8b8ccdcba8dcff48544477ad1e8560c From 24044e4db61f022f1b3a9895471de2fea927d83a Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 13 Jun 2023 16:16:33 +0800 Subject: [PATCH 041/113] Update submodule cudf to 1a712edffdcc1ce9a8eca80c9aeb7eec5556c1f5 (#1213) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 59238c1fb8..1a712edffd 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 59238c1fb8b8ccdcba8dcff48544477ad1e8560c +Subproject commit 1a712edffdcc1ce9a8eca80c9aeb7eec5556c1f5 From 87ed8daa79e4c9f24537ada13e099c6143b415a1 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 14 Jun 2023 05:01:27 +0800 
Subject: [PATCH 042/113] Update submodule cudf to e4ac05f00b5a9195aa4718630295f545db9efba5 (#1214) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1a712edffd..e4ac05f00b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1a712edffdcc1ce9a8eca80c9aeb7eec5556c1f5 +Subproject commit e4ac05f00b5a9195aa4718630295f545db9efba5 From 70c45e141f44fb9b4ac32213c4c66b46c27aa67d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 14 Jun 2023 11:01:39 +0800 Subject: [PATCH 043/113] Update submodule cudf to f83c1fdb633498322ef8673be971e19605cddc88 (#1215) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e4ac05f00b..f83c1fdb63 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e4ac05f00b5a9195aa4718630295f545db9efba5 +Subproject commit f83c1fdb633498322ef8673be971e19605cddc88 From 4589b1b6cdbb83b1bbb1d0c2b9609a1e170fd8fc Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 14 Jun 2023 20:16:27 +0800 Subject: [PATCH 044/113] Update submodule cudf to ceeb39270cac6bbcead175e39482bada45134e24 (#1216) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f83c1fdb63..ceeb39270c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f83c1fdb633498322ef8673be971e19605cddc88 +Subproject commit ceeb39270cac6bbcead175e39482bada45134e24 From 79b0c4b45011c753f8ac019932d72f92bf465c46 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 15 Jun 2023 05:07:11 +0800 Subject: [PATCH 045/113] Update submodule cudf to 649cf5e2c1a249472ff6c666e754edde2bcb7409 (#1218) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ceeb39270c..649cf5e2c1 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ceeb39270cac6bbcead175e39482bada45134e24 +Subproject commit 649cf5e2c1a249472ff6c666e754edde2bcb7409 From 06aee7657810479c0519c5c19c7786f637634911 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:02:53 +0800 Subject: [PATCH 046/113] Update submodule cudf to 67efaf62c7314fbda56d39ea5f60bb893c406084 (#1219) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 649cf5e2c1..67efaf62c7 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 649cf5e2c1a249472ff6c666e754edde2bcb7409 +Subproject commit 67efaf62c7314fbda56d39ea5f60bb893c406084 From 392bea3ff6abbca2505c4edafbfddafa006bc70b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 21 Jun 2023 05:02:33 +0800 Subject: [PATCH 047/113] Update submodule cudf to ea8bd0c73126baedbff41d3270f2f34af7492303 (#1220) 
Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 67efaf62c7..ea8bd0c731 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 67efaf62c7314fbda56d39ea5f60bb893c406084 +Subproject commit ea8bd0c73126baedbff41d3270f2f34af7492303 From cbc1b221bfe9566db2b88c64d7c349d0c76af70b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 21 Jun 2023 11:03:47 +0800 Subject: [PATCH 048/113] Update submodule cudf to 5a7e3c712fef0fe736e7baa877980f2d17d03a37 (#1222) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index ea8bd0c731..5a7e3c712f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit ea8bd0c73126baedbff41d3270f2f34af7492303 +Subproject commit 5a7e3c712fef0fe736e7baa877980f2d17d03a37 From 76d3a281bd9f6ecf20fb1f965241822ac745d91f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 21 Jun 2023 21:02:23 +0800 Subject: [PATCH 049/113] Update submodule cudf to f71929adccc1d2a7e12278a9ce46590fefdeeb63 (#1223) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5a7e3c712f..f71929adcc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5a7e3c712fef0fe736e7baa877980f2d17d03a37 +Subproject commit f71929adccc1d2a7e12278a9ce46590fefdeeb63 From 21ce3953ea09adc4287edf4b997d7fbcf7e2b2e8 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 21 Jun 2023 23:03:56 +0800 Subject: [PATCH 050/113] Update submodule cudf to 1854ac86d08e545376704959436ec370bdd8117a (#1224) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index f71929adcc..1854ac86d0 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit f71929adccc1d2a7e12278a9ce46590fefdeeb63 +Subproject commit 1854ac86d08e545376704959436ec370bdd8117a From 0ea009dc3151663a91e26009a3e831eba7e7b88e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 22 Jun 2023 11:03:28 +0800 Subject: [PATCH 051/113] Update submodule cudf to 6aa5bd3759ed929c0da9a462b1c4a084f79377f2 (#1225) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 1854ac86d0..6aa5bd3759 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 1854ac86d08e545376704959436ec370bdd8117a +Subproject commit 6aa5bd3759ed929c0da9a462b1c4a084f79377f2 From 867d2d875b268c9f474cde862da5057b786018d2 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 22 Jun 2023 17:02:39 +0800 Subject: [PATCH 052/113] Update submodule cudf to 708ee59d7c26a95e980d3f3d22c5b9289716bc58 (#1226) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6aa5bd3759..708ee59d7c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6aa5bd3759ed929c0da9a462b1c4a084f79377f2 +Subproject commit 708ee59d7c26a95e980d3f3d22c5b9289716bc58 From 5731ee6f548247e7e9cb879b7f6c42b2a4adafcf Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 22 Jun 2023 23:01:15 +0800 Subject: [PATCH 053/113] Update submodule cudf to 7cbef2a184f734d3e88fa61942652ad97d537da3 (#1227) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 708ee59d7c..7cbef2a184 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 708ee59d7c26a95e980d3f3d22c5b9289716bc58 +Subproject commit 7cbef2a184f734d3e88fa61942652ad97d537da3 From 3d4cac14e51ef0442e4667552a2b2a7ea7b474ea Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 23 Jun 2023 05:01:22 +0800 Subject: [PATCH 054/113] Update submodule cudf to 8735a02c405d4b6f6e380444aeddc6a412e93958 (#1230) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 7cbef2a184..8735a02c40 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 7cbef2a184f734d3e88fa61942652ad97d537da3 +Subproject commit 8735a02c405d4b6f6e380444aeddc6a412e93958 From e7dca071b6a53ef002f8e6aebb817f50bb523451 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 23 Jun 2023 11:01:56 +0800 Subject: [PATCH 055/113] Update submodule cudf to 0846494da393c07a13d6040927b71014dc1a03c4 (#1231) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 8735a02c40..0846494da3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 8735a02c405d4b6f6e380444aeddc6a412e93958 +Subproject commit 0846494da393c07a13d6040927b71014dc1a03c4 From 01ad606e15b5a7ec59028dba9db3a94213e141f6 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 23 Jun 2023 17:01:11 +0800 Subject: [PATCH 056/113] Update submodule cudf to 4f8afef295d47c6d13d2a13ba5356fb9231cf658 (#1232) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0846494da3..4f8afef295 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0846494da393c07a13d6040927b71014dc1a03c4 +Subproject commit 4f8afef295d47c6d13d2a13ba5356fb9231cf658 From 916131194d00acb5ca9f37a51601686b82fb524f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 23 Jun 2023 23:00:55 +0800 Subject: [PATCH 057/113] Update submodule cudf to 6aad528eadae14423f2b4ebe85791da8d2e7933a (#1233) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 
4f8afef295..6aad528ead 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4f8afef295d47c6d13d2a13ba5356fb9231cf658 +Subproject commit 6aad528eadae14423f2b4ebe85791da8d2e7933a From d260813707707af9217979d5082a874dc5a58b05 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 24 Jun 2023 05:01:05 +0800 Subject: [PATCH 058/113] Update submodule cudf to 0688872200d0a0c70d5e4d1c17e2e29ecd00af8f (#1234) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 6aad528ead..0688872200 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 6aad528eadae14423f2b4ebe85791da8d2e7933a +Subproject commit 0688872200d0a0c70d5e4d1c17e2e29ecd00af8f From e8c87610db75b52e481ad34d98d643f739c29434 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:13:57 +0800 Subject: [PATCH 059/113] Update submodule cudf to c7e9405f939d9750c6a6321fbd66c14674951042 (#1235) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0688872200..c7e9405f93 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0688872200d0a0c70d5e4d1c17e2e29ecd00af8f +Subproject commit c7e9405f939d9750c6a6321fbd66c14674951042 From 247985e9ee8118cf5df2662b120cbad2cd8d5c4f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Mon, 26 Jun 2023 16:16:51 +0800 Subject: [PATCH 060/113] Update submodule cudf to 9a3f3a9f0d67f71bf7376e0251d67fdb725f512f (#1236) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index c7e9405f93..9a3f3a9f0d 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c7e9405f939d9750c6a6321fbd66c14674951042 +Subproject commit 9a3f3a9f0d67f71bf7376e0251d67fdb725f512f From c9906bf02ff3e6e61833637fb05ae9168e91aade Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 26 Jun 2023 13:11:55 -0500 Subject: [PATCH 061/113] Adds in a metric for computation time lost due to retry. 
(#1217) Signed-off-by: Robert (Bobby) Evans --- src/main/cpp/src/SparkResourceAdaptorJni.cpp | 142 ++++++++++++++++-- .../com/nvidia/spark/rapids/jni/RmmSpark.java | 42 ++++++ .../rapids/jni/SparkResourceAdaptor.java | 16 +- .../spark/rapids/jni/RmmSparkMonteCarlo.java | 22 ++- .../nvidia/spark/rapids/jni/RmmSparkTest.java | 9 ++ 5 files changed, 218 insertions(+), 13 deletions(-) diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp index 2733397ca6..13a7a50a95 100644 --- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp +++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp @@ -186,6 +186,15 @@ class full_thread_state { int num_times_retry_throw = 0; int num_times_split_retry_throw = 0; long time_blocked_nanos = 0; + // The amount of time that this thread has lost due to retries (not including blocked time) + long time_lost_nanos = 0; + // The amount of time that this thread has spent in the current retry block (not including block time) + long time_retry_running_nanos = 0; + // When did the retry time for this thread start, or when did the block time end. + std::chrono::time_point<std::chrono::steady_clock> retry_start_or_block_end; + // Is this thread currently in a marked retry block. This is only used for metrics. + bool is_in_retry = false; + std::chrono::time_point<std::chrono::steady_clock> block_start; @@ -204,12 +213,50 @@ class full_thread_state { state = new_state; } - void before_block() { block_start = std::chrono::steady_clock::now(); } + void before_block() { + block_start = std::chrono::steady_clock::now(); + // Don't record running time lost while we are blocked... + record_and_reset_pending_retry_time(); + } void after_block() { auto end = std::chrono::steady_clock::now(); auto diff = end - block_start; time_blocked_nanos += std::chrono::duration_cast<std::chrono::nanoseconds>(diff).count(); + if (is_in_retry) { + retry_start_or_block_end = end; + } + } + + long get_and_reset_failed_retry_time() { + long ret = time_lost_nanos; + time_lost_nanos = 0; + return ret; + } + + void record_failed_retry_time() { + if (is_in_retry) { + record_and_reset_pending_retry_time(); + time_lost_nanos += time_retry_running_nanos; + time_retry_running_nanos = 0; + } + } + + void record_and_reset_pending_retry_time() { + if (is_in_retry) { + auto end = std::chrono::steady_clock::now(); + auto diff = end - retry_start_or_block_end; + time_retry_running_nanos += std::chrono::duration_cast<std::chrono::nanoseconds>(diff).count(); + retry_start_or_block_end = end; + } + } + + void reset_retry_state(bool is_in_retry) { + time_retry_running_nanos = 0; + if (is_in_retry) { + retry_start_or_block_end = std::chrono::steady_clock::now(); + } + this->is_in_retry = is_in_retry; } /** @@ -290,6 +337,37 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } + void start_retry_block(long thread_id) { + std::unique_lock<std::mutex> lock(state_mutex); + auto thread = threads.find(thread_id); + if (thread != threads.end()) { + thread->second.reset_retry_state(true); + } + } + + void end_retry_block(long thread_id) { + std::unique_lock<std::mutex> lock(state_mutex); + auto thread = threads.find(thread_id); + if (thread != threads.end()) { + thread->second.reset_retry_state(false); + } + } + + long get_and_reset_lost_time(long task_id) { + std::unique_lock<std::mutex> lock(state_mutex); + long ret = 0; + auto task_at = task_to_threads.find(task_id); + if (task_at != task_to_threads.end()) { + for (auto thread_id : task_at->second) { + auto threads_at = threads.find(thread_id); + if (threads_at != threads.end()) { + ret += threads_at->second.get_and_reset_failed_retry_time();
} + } + } + return ret; + } + /** * Update the internal state so that a specific thread is associated with shuffle. * This may be called multiple times for a given thread and if the thread is already @@ -433,7 +511,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the number of times a retry was thrown and reset the value to 0. */ - int get_n_reset_num_retry(long task_id) { + int get_and_reset_num_retry(long task_id) { std::unique_lock lock(state_mutex); int ret = 0; auto task_at = task_to_threads.find(task_id); @@ -452,7 +530,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the number of times a split and retry was thrown and reset the value to 0. */ - int get_n_reset_num_split_retry(long task_id) { + int get_and_reset_num_split_retry(long task_id) { std::unique_lock lock(state_mutex); int ret = 0; auto task_at = task_to_threads.find(task_id); @@ -471,7 +549,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the time in ns that the task was blocked for. */ - long get_n_reset_block_time(long task_id) { + long get_and_reset_block_time(long task_id) { std::unique_lock lock(state_mutex); long ret = 0; auto task_at = task_to_threads.find(task_id); @@ -640,6 +718,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // In testing it looks like it is a few ms if in a tight loop, not including spill // overhead if (state.num_times_retried + 1 > 500) { + state.record_failed_retry_time(); throw_java_exception(cudf::jni::OOM_CLASS, "GPU OutOfMemory: retry limit exceeded"); } state.num_times_retried++; @@ -649,13 +728,15 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { const std::unique_lock &lock) { state.num_times_retry_throw++; check_before_oom(state, lock); + state.record_failed_retry_time(); throw_java_exception(RETRY_OOM_CLASS, "GPU OutOfMemory"); } - void throw_split_n_retry_oom(const char *msg, full_thread_state &state, + void throw_split_and_retry_oom(const char *msg, full_thread_state &state, const std::unique_lock &lock) { state.num_times_split_retry_throw++; check_before_oom(state, lock); + state.record_failed_retry_time(); throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "GPU OutOfMemory"); } @@ -700,11 +781,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { break; case SHUFFLE_THROW: transition(thread->second, thread_state::SHUFFLE_RUNNING); + thread->second.record_failed_retry_time(); throw_java_exception(cudf::jni::OOM_CLASS, "GPU OutOfMemory: could not allocate enough for shuffle"); break; case TASK_BUFN_THROW: transition(thread->second, thread_state::TASK_BUFN_WAIT); + thread->second.record_failed_retry_time(); throw_retry_oom("rollback and retry operation", thread->second, lock); break; case TASK_BUFN_WAIT: @@ -728,7 +811,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { break; case TASK_SPLIT_THROW: transition(thread->second, thread_state::TASK_RUNNING); - throw_split_n_retry_oom("rollback, split input, and retry operation", thread->second, + thread->second.record_failed_retry_time(); + throw_split_and_retry_oom("rollback, split input, and retry operation", thread->second, lock); break; case TASK_REMOVE_THROW: @@ -736,6 +820,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { case SHUFFLE_REMOVE_THROW: log_transition(thread_id, thread->second.task_id, thread->second.state, thread_state::UNKNOWN); + 
// don't need to record failed time metric the thread is already gone... threads.erase(thread); task_has_woken_condition.notify_all(); throw std::runtime_error("thread removed while blocked"); @@ -865,6 +950,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { thread->second.retry_oom_injected--; thread->second.num_times_retry_throw++; log_status("INJECTED_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state); + thread->second.record_failed_retry_time(); throw_java_exception(RETRY_OOM_CLASS, "injected RetryOOM"); } @@ -872,6 +958,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { thread->second.cudf_exception_injected--; log_status("INJECTED_CUDF_EXCEPTION", thread_id, thread->second.task_id, thread->second.state); + thread->second.record_failed_retry_time(); throw_java_exception(cudf::jni::CUDF_ERROR_CLASS, "injected CudfException"); } @@ -880,6 +967,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { thread->second.num_times_split_retry_throw++; log_status("INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state); + thread->second.record_failed_retry_time(); throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); } @@ -1470,7 +1558,7 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetRetryThrowInter try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - return mr->get_n_reset_num_retry(task_id); + return mr->get_and_reset_num_retry(task_id); } CATCH_STD(env, 0) } @@ -1482,7 +1570,7 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetSplitRetryThrow try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - return mr->get_n_reset_num_split_retry(task_id); + return mr->get_and_reset_num_split_retry(task_id); } CATCH_STD(env, 0) } @@ -1496,8 +1584,44 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetBlockTimeIntern try { cudf::jni::auto_set_device(env); auto mr = reinterpret_cast(ptr); - return mr->get_n_reset_block_time(task_id); + return mr->get_and_reset_block_time(task_id); } CATCH_STD(env, 0) } + +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetComputeTimeLostToRetry(JNIEnv *env, + jclass, + jlong ptr, + jlong task_id) { + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + return mr->get_and_reset_lost_time(task_id); + } + CATCH_STD(env, 0) +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_startRetryBlock( + JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->start_retry_block(thread_id); + } + CATCH_STD(env, ) +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_endRetryBlock( + JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); + try { + cudf::jni::auto_set_device(env); + auto mr = reinterpret_cast(ptr); + mr->end_retry_block(thread_id); + } + CATCH_STD(env, ) +} } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java index 29801b4738..3132dd9cd0 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java @@ -153,6 
+153,32 @@ public static void associateCurrentThreadWithShuffle() { associateThreadWithShuffle(getCurrentThreadId()); } + + + public static void startRetryBlock(long threadId) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.startRetryBlock(threadId); + } + } + } + + public static void currentThreadStartRetryBlock() { + startRetryBlock(getCurrentThreadId()); + } + + public static void endRetryBlock(long threadId) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + sra.endRetryBlock(threadId); + } + } + } + + public static void currentThreadEndRetryBlock() { + startRetryBlock(getCurrentThreadId()); + } + /** * Remove the given thread ID from any association. * @param threadId the ID of the thread that is no longer a part of a task or shuffle @@ -381,4 +407,20 @@ public static long getAndResetBlockTimeNs(long taskId) { } } } + + /** + * Get how long, in nanoseconds, that this task lost in computation time due to retries. + * @param taskId the id of the task to get the metric for. + * @return the time the task did computation that was lost. + */ + public static long getAndResetComputeTimeLostToRetryNs(long taskId) { + synchronized (Rmm.class) { + if (sra != null && sra.isOpen()) { + return sra.getAndResetComputeTimeLostToRetry(taskId); + } else { + // sra is not set so the value is by definition 0 + return 0; + } + } + } } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java index 581578d696..8d98729dfc 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java @@ -83,6 +83,14 @@ public void associateThreadWithTask(long threadId, long taskId) { associateThreadWithTask(getHandle(), threadId, taskId); } + public void startRetryBlock(long threadId) { + startRetryBlock(getHandle(), threadId); + } + + public void endRetryBlock(long threadId) { + endRetryBlock(getHandle(), threadId); + } + /** * Associate a thread with shuffle. * @param threadId the thread ID to associate (not java thread id). @@ -174,6 +182,10 @@ public long getAndResetBlockTime(long taskId) { return getAndResetBlockTimeInternal(getHandle(), taskId); } + public long getAndResetComputeTimeLostToRetry(long taskId) { + return getAndResetComputeTimeLostToRetry(getHandle(), taskId); + } + /** * Get the ID of the current thread that can be used with the other SparkResourceAdaptor APIs. * Don't use the java thread ID. They are not related. 
@@ -196,5 +208,7 @@ public long getAndResetBlockTime(long taskId) { private static native int getAndResetRetryThrowInternal(long handle, long taskId); private static native int getAndResetSplitRetryThrowInternal(long handle, long taskId); private static native long getAndResetBlockTimeInternal(long handle, long taskId); - + private static native long getAndResetComputeTimeLostToRetry(long handle, long taskId); + private static native void startRetryBlock(long handle, long threadId); + private static native void endRetryBlock(long handle, long threadId); } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java index 5f5dc4a20a..e7d4c2a4da 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkMonteCarlo.java @@ -316,6 +316,7 @@ public void run() { while (!done) { // situations loop waitForSitToBeSet(); Task t = getNextTask(); + long timeLost = 0; while (t != null) { // task loop Task backup = t.cloneForRetry(); long start = System.nanoTime(); @@ -324,12 +325,14 @@ public void run() { t.run(shuffle); success = true; } catch (OutOfMemoryError oom) { + timeLost += System.nanoTime() - start; if (runner.debugOoms) { System.err.println("OOM for task: " + t.taskId + " and thread: " + RmmSpark.getCurrentThreadId() + " " + oom); } // ignored } + timeLost += t.getTimeLost(); Cuda.DEFAULT_STREAM.sync(); if (!success) { long stopTime = System.nanoTime() + 50; @@ -344,7 +347,7 @@ public void run() { } } long end = System.nanoTime(); - runner.updateTaskStats(success, end - start); + runner.updateTaskStats(success, end - start, timeLost); t = getNextTask(); } try { // situation is done so wait for all others too @@ -402,6 +405,7 @@ public static class SituationRunner { volatile int failedTasks; volatile int successTasks; volatile long totalTaskTime; + volatile long totalTimeLost; volatile boolean sitFailed; volatile boolean didThisSitFail = false; @@ -446,6 +450,7 @@ public int run(List situations) throws InterruptedException { failedTasks = 0; successTasks = 0; totalTaskTime = 0; + totalTimeLost = 0; } int numSits = 0; long totalSitTime = 0; @@ -480,7 +485,8 @@ public int run(List situations) throws InterruptedException { System.out.println("Situations: " + numSits + " total, " + successSits + " successful, " + failedSits + " failed. " + asTimeStr(totalSitTime)); System.out.println("Tasks: " + numTasks + " total, " + successTasks + " successful, " + - failedTasks + " failed. " + asTimeStr(totalTaskTime)); + failedTasks + " failed. 
" + asTimeStr(totalTaskTime) + " taskTime " + + asTimeStr(totalTimeLost) + " lost task computation"); System.out.println("Exceptions: " + numSplitAndRetry.get() + " splits, " + numRetry.get() + " retries."); } @@ -520,7 +526,7 @@ public void finish() { } } - public synchronized void updateTaskStats(boolean success, long timeNs) { + public synchronized void updateTaskStats(boolean success, long timeNs, long timeLost) { if (success) { successTasks++; } else { @@ -528,6 +534,7 @@ public synchronized void updateTaskStats(boolean success, long timeNs) { } numTasks++; totalTaskTime += timeNs; + totalTimeLost += timeLost; } public synchronized void setSitFailed() { @@ -814,6 +821,8 @@ public static class Task { public final int retryCount; LinkedList toDo = new LinkedList<>(); + long timeLost = 0; + public Task(Random r, long taskMaxMiB, int maxTaskAllocs, int maxTaskSleep) { toDo.add(new TaskOpSet(r, taskMaxMiB, maxTaskAllocs, maxTaskSleep)); retryCount = 0; @@ -829,10 +838,15 @@ public Task cloneForRetry() { return new Task(cloned, retryCount + 1); } + public long getTimeLost() { + return timeLost; + } + public void run(ExecutorService shuffle) { Thread.currentThread().setName("TASK RUNNER FOR " + taskId); RmmSpark.associateCurrentThreadWithTask(taskId); try { + RmmSpark.currentThreadStartRetryBlock(); while (!toDo.isEmpty()) { TaskOpSet tos = toDo.pollFirst(); try { @@ -845,6 +859,8 @@ public void run(ExecutorService shuffle) { } } } finally { + RmmSpark.currentThreadEndRetryBlock(); + timeLost += RmmSpark.getAndResetComputeTimeLostToRetryNs(taskId); RmmSpark.taskDone(taskId); } } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java index 53cbe4874b..cd11da05ae 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java @@ -314,18 +314,27 @@ public void testInsertOOMs() { assertEquals(RmmSparkThreadState.UNKNOWN, RmmSpark.getStateOf(threadId)); assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid)); assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid)); + assertEquals(0, RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid)); RmmSpark.associateThreadWithTask(threadId, taskid); assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); try { + RmmSpark.startRetryBlock(threadId); // Allocate something small and verify that it works... Rmm.alloc(100).close(); assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); + try { + Thread.sleep(1); // Just in case we run on a really fast system in the future where + // all of this is sub-nanosecond... 
+ } catch (InterruptedException e) { + // Ignored + } // Force an exception RmmSpark.forceRetryOOM(threadId); // No change in the state after a force assertEquals(RmmSparkThreadState.TASK_RUNNING, RmmSpark.getStateOf(threadId)); assertThrows(RetryOOM.class, () -> Rmm.alloc(100).close()); + assert(RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid) > 0); // Verify that injecting OOM does not cause the block to actually happen or // the state to change From 9e658bb9d8666804a0be16e21452a3e712241bac Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 27 Jun 2023 05:01:56 +0800 Subject: [PATCH 062/113] Update submodule cudf to cdb08fc6c8ebc2a406d505a50e236c162564d76a (#1237) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9a3f3a9f0d..cdb08fc6c8 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9a3f3a9f0d67f71bf7376e0251d67fdb725f512f +Subproject commit cdb08fc6c8ebc2a406d505a50e236c162564d76a From 8edf770ebca8fc66609c16e0eb89de1f27dc872e Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 27 Jun 2023 11:03:19 +0800 Subject: [PATCH 063/113] Update submodule cudf to aed7174eae6c6eb38fbf186938df44f88787cf29 (#1238) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index cdb08fc6c8..aed7174eae 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit cdb08fc6c8ebc2a406d505a50e236c162564d76a +Subproject commit aed7174eae6c6eb38fbf186938df44f88787cf29 From 701c84e11bd5e3a0c442e3f602c612a90560a0d7 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 28 Jun 2023 21:00:56 +0800 Subject: [PATCH 064/113] Update submodule cudf to 0a52c5211bedf82b81a37660ae94c998e596d475 (#1239) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index aed7174eae..0a52c5211b 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit aed7174eae6c6eb38fbf186938df44f88787cf29 +Subproject commit 0a52c5211bedf82b81a37660ae94c998e596d475 From cb80c0135e226d4b342ce2116bd3e11f88c25df1 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 29 Jun 2023 11:01:17 +0800 Subject: [PATCH 065/113] Update submodule cudf to 5c615cc1325372a8041378b83be73f65142568ff (#1240) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 0a52c5211b..5c615cc132 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 0a52c5211bedf82b81a37660ae94c998e596d475 +Subproject commit 5c615cc1325372a8041378b83be73f65142568ff From 35aca646b408337cfc4f140d3f0f51b21b764d23 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 29 Jun 2023 23:01:52 +0800 Subject: [PATCH 066/113] Update submodule cudf to de4a20ec9089a77e7c0a106794ba863e3d0083df (#1241) Signed-off-by: spark-rapids automation 
<70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5c615cc132..de4a20ec90 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5c615cc1325372a8041378b83be73f65142568ff +Subproject commit de4a20ec9089a77e7c0a106794ba863e3d0083df From a0f920d2237e0cc32a7f1d69b5787ae628bfa072 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 30 Jun 2023 05:01:56 +0800 Subject: [PATCH 067/113] Update submodule cudf to 08b09c20a70477d7515f705a07a6897ffc807066 (#1242) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index de4a20ec90..08b09c20a7 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit de4a20ec9089a77e7c0a106794ba863e3d0083df +Subproject commit 08b09c20a70477d7515f705a07a6897ffc807066 From 33107160b85cb1a9053d313071f9a33de928cdd4 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Fri, 30 Jun 2023 11:07:18 +0800 Subject: [PATCH 068/113] Update submodule cudf to e7a1448816b9d83c3f69676cf23fa38f45891ab1 (#1243) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 08b09c20a7..e7a1448816 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 08b09c20a70477d7515f705a07a6897ffc807066 +Subproject commit e7a1448816b9d83c3f69676cf23fa38f45891ab1 From 7dffedc4a71a23b347e771f7dedd7c1d14475326 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 1 Jul 2023 05:03:04 +0800 Subject: [PATCH 069/113] Update submodule cudf to d14b6cce9cc39793c118f065b113c83d0210ceb6 (#1245) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e7a1448816..d14b6cce9c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e7a1448816b9d83c3f69676cf23fa38f45891ab1 +Subproject commit d14b6cce9cc39793c118f065b113c83d0210ceb6 From c23b86452fd2c60a7f804f6e5ffc97b436fdcd7f Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Sat, 1 Jul 2023 10:17:27 +0800 Subject: [PATCH 070/113] Update submodule cudf to 62c4f99f79852a95f61f241c884e598c9164331d (#1247) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index d14b6cce9c..62c4f99f79 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d14b6cce9cc39793c118f065b113c83d0210ceb6 +Subproject commit 62c4f99f79852a95f61f241c884e598c9164331d From 90ae63c08d5811b86e514448ade87241fda49f0d Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 4 Jul 2023 05:02:08 +0800 Subject: [PATCH 071/113] Update submodule cudf to d078cff8fc8b3e38c59ca74ab975f6a1ecc49cfb (#1249) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 62c4f99f79..d078cff8fc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 62c4f99f79852a95f61f241c884e598c9164331d +Subproject commit d078cff8fc8b3e38c59ca74ab975f6a1ecc49cfb From 10274073c98ffcef87b96c14bbd58d82912c370b Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Tue, 4 Jul 2023 21:01:41 +0800 Subject: [PATCH 072/113] Update submodule cudf to 55b9bfcb18525360c2f30b032de63ea90913ee54 (#1250) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index d078cff8fc..55b9bfcb18 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit d078cff8fc8b3e38c59ca74ab975f6a1ecc49cfb +Subproject commit 55b9bfcb18525360c2f30b032de63ea90913ee54 From c1c9d038f34a66488652eedecb2296f95c1d0e73 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 5 Jul 2023 23:02:09 +0800 Subject: [PATCH 073/113] Update submodule cudf to 9da347e3b006b72efbe9c55733048dcbb1bc721f (#1251) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 55b9bfcb18..9da347e3b0 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 55b9bfcb18525360c2f30b032de63ea90913ee54 +Subproject commit 9da347e3b006b72efbe9c55733048dcbb1bc721f From 3b3ced76308b5e03bfef4311c09fa127c15dc93f Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Wed, 5 Jul 2023 15:06:46 -0500 Subject: [PATCH 074/113] Pre-commit formatting pass on cpp files. (#1244) * Run pre-commit to format files. We were behind a bit. * Update pre-commit config to 16.0.1 to match cudf. Re-ran formatting. 
* Reformat of code via pre-commit Signed-off-by: db --------- Signed-off-by: db --- .pre-commit-config.yaml | 2 +- .../cpp/benchmarks/cast_string_to_float.cpp | 16 +- .../cpp/benchmarks/common/generate_input.hpp | 3 +- src/main/cpp/benchmarks/row_conversion.cpp | 68 +- src/main/cpp/src/DecimalUtilsJni.cpp | 91 +- src/main/cpp/src/MapUtilsJni.cpp | 5 +- src/main/cpp/src/NativeParquetJni.cpp | 971 ++++++----- src/main/cpp/src/RowConversionJni.cpp | 53 +- src/main/cpp/src/SparkResourceAdaptorJni.cpp | 567 ++++--- src/main/cpp/src/ZOrderJni.cpp | 13 +- src/main/cpp/src/cast_decimal_to_string.cu | 8 +- src/main/cpp/src/cast_string.cu | 27 +- src/main/cpp/src/cast_string_to_float.cu | 6 +- src/main/cpp/src/decimal_utils.cu | 807 +++++---- src/main/cpp/src/decimal_utils.hpp | 60 +- src/main/cpp/src/map_utils.cu | 418 +++-- src/main/cpp/src/map_utils.hpp | 9 +- src/main/cpp/src/map_utils_debug.cuh | 89 +- src/main/cpp/src/row_conversion.cu | 1503 ++++++++++------- src/main/cpp/src/row_conversion.hpp | 2 +- src/main/cpp/src/zorder.cu | 222 +-- src/main/cpp/src/zorder.hpp | 2 +- src/main/cpp/tests/cast_decimal_to_string.cpp | 3 +- src/main/cpp/tests/cast_string.cpp | 18 +- src/main/cpp/tests/row_conversion.cpp | 69 +- 25 files changed, 2825 insertions(+), 2207 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7d785ad65..6773e68a14 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v14.0.6 + rev: v16.0.1 hooks: - id: clang-format files: \.(cu|cuh|h|hpp|cpp|inl)$ diff --git a/src/main/cpp/benchmarks/cast_string_to_float.cpp b/src/main/cpp/benchmarks/cast_string_to_float.cpp index a231775d01..d94f9d26a0 100644 --- a/src/main/cpp/benchmarks/cast_string_to_float.cpp +++ b/src/main/cpp/benchmarks/cast_string_to_float.cpp @@ -27,16 +27,18 @@ void string_to_float(nvbench::state& state) { cudf::size_type const n_rows{(cudf::size_type)state.get_int64("num_rows")}; - auto const float_tbl = create_random_table({cudf::type_id::FLOAT32}, row_count{n_rows}); - auto const float_col = float_tbl->get_column(0); + auto const float_tbl = create_random_table({cudf::type_id::FLOAT32}, row_count{n_rows}); + auto const float_col = float_tbl->get_column(0); auto const string_col = cudf::strings::from_floats(float_col.view()); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { - auto rows = spark_rapids_jni::string_to_float(cudf::data_type{cudf::type_id::FLOAT32}, string_col->view(), false, cudf::get_default_stream()); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto rows = spark_rapids_jni::string_to_float(cudf::data_type{cudf::type_id::FLOAT32}, + string_col->view(), + false, + cudf::get_default_stream()); }); } NVBENCH_BENCH(string_to_float) - .set_name("Strings to Float Cast") - .add_int64_axis("num_rows", {1 * 1024 * 1024, 100 * 1024 * 1024}); + .set_name("Strings to Float Cast") + .add_int64_axis("num_rows", {1 * 1024 * 1024, 100 * 1024 * 1024}); diff --git a/src/main/cpp/benchmarks/common/generate_input.hpp b/src/main/cpp/benchmarks/common/generate_input.hpp index a5be50d3f9..207ad00200 100644 --- a/src/main/cpp/benchmarks/common/generate_input.hpp +++ b/src/main/cpp/benchmarks/common/generate_input.hpp @@ -183,8 +183,7 @@ struct distribution_params -struct distribution_params()>> { -}; +struct distribution_params()>> {}; /** * @brief Returns a vector of types, corresponding to the input type or a type group. 
diff --git a/src/main/cpp/benchmarks/row_conversion.cpp b/src/main/cpp/benchmarks/row_conversion.cpp index 46ce39a7aa..c625342867 100644 --- a/src/main/cpp/benchmarks/row_conversion.cpp +++ b/src/main/cpp/benchmarks/row_conversion.cpp @@ -28,15 +28,15 @@ void fixed_width(nvbench::state& state) { cudf::size_type const n_rows{(cudf::size_type)state.get_int64("num_rows")}; auto const direction = state.get_string("direction"); - auto const table = create_random_table(cycle_dtypes({cudf::type_id::INT8, - cudf::type_id::INT32, - cudf::type_id::INT16, - cudf::type_id::INT64, - cudf::type_id::INT32, - cudf::type_id::BOOL8, - cudf::type_id::UINT16, - cudf::type_id::UINT8, - cudf::type_id::UINT64}, + auto const table = create_random_table(cycle_dtypes({cudf::type_id::INT8, + cudf::type_id::INT32, + cudf::type_id::INT16, + cudf::type_id::INT64, + cudf::type_id::INT32, + cudf::type_id::BOOL8, + cudf::type_id::UINT16, + cudf::type_id::UINT8, + cudf::type_id::UINT64}, 212), row_count{n_rows}); @@ -50,16 +50,15 @@ void fixed_width(nvbench::state& state) auto rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { - if (direction == "to row") { - auto _rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); - } else { - for (auto const &r : rows) { - cudf::lists_column_view const l(r->view()); - auto out = spark_rapids_jni::convert_from_rows_fixed_width_optimized(l, schema); - } + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if (direction == "to row") { + auto _rows = spark_rapids_jni::convert_to_rows_fixed_width_optimized(table->view()); + } else { + for (auto const& r : rows) { + cudf::lists_column_view const l(r->view()); + auto out = spark_rapids_jni::convert_from_rows_fixed_width_optimized(l, schema); } + } }); state.add_buffer_size(n_rows, "trc", "Total Rows"); @@ -69,7 +68,7 @@ void fixed_width(nvbench::state& state) static void variable_or_fixed_width(nvbench::state& state) { cudf::size_type const n_rows{(cudf::size_type)state.get_int64("num_rows")}; - auto const direction = state.get_string("direction"); + auto const direction = state.get_string("direction"); auto const include_strings = state.get_string("strings"); if (n_rows > 1 * 1024 * 1024 && include_strings == "include strings") { @@ -120,17 +119,16 @@ static void variable_or_fixed_width(nvbench::state& state) auto rows = spark_rapids_jni::convert_to_rows(table->view()); - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto new_rows = spark_rapids_jni::convert_to_rows(table->view()); - if (direction == "to row") { - auto _rows = spark_rapids_jni::convert_to_rows(table->view()); - } else { - for (auto const &r : rows) { - cudf::lists_column_view const l(r->view()); - auto out = spark_rapids_jni::convert_from_rows(l, schema); - } + if (direction == "to row") { + auto _rows = spark_rapids_jni::convert_to_rows(table->view()); + } else { + for (auto const& r : rows) { + cudf::lists_column_view const l(r->view()); + auto out = spark_rapids_jni::convert_from_rows(l, schema); } + } }); state.add_buffer_size(n_rows, "trc", "Total Rows"); @@ -138,12 +136,12 @@ static void variable_or_fixed_width(nvbench::state& state) } NVBENCH_BENCH(fixed_width) - .set_name("Fixed Width Only") - .add_int64_axis("num_rows", {1 * 1024 * 1024, 4 * 1024 * 1024}) - .add_string_axis("direction", {"to row", "from row"}); + .set_name("Fixed Width 
Only") + .add_int64_axis("num_rows", {1 * 1024 * 1024, 4 * 1024 * 1024}) + .add_string_axis("direction", {"to row", "from row"}); NVBENCH_BENCH(variable_or_fixed_width) - .set_name("Fixed or Variable Width") - .add_int64_axis("num_rows", {1 * 1024 * 1024, 4 * 1024 * 1024}) - .add_string_axis("direction", {"to row", "from row"}) - .add_string_axis("strings", {"include strings", "no strings"}); + .set_name("Fixed or Variable Width") + .add_int64_axis("num_rows", {1 * 1024 * 1024, 4 * 1024 * 1024}) + .add_string_axis("direction", {"to row", "from row"}) + .add_string_axis("strings", {"include strings", "no strings"}); diff --git a/src/main/cpp/src/DecimalUtilsJni.cpp b/src/main/cpp/src/DecimalUtilsJni.cpp index 25045aa94e..f732276817 100644 --- a/src/main/cpp/src/DecimalUtilsJni.cpp +++ b/src/main/cpp/src/DecimalUtilsJni.cpp @@ -14,98 +14,95 @@ * limitations under the License. */ -#include "decimal_utils.hpp" #include "cudf_jni_apis.hpp" +#include "decimal_utils.hpp" extern "C" { -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_multiply128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_product_scale) { +JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_multiply128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_product_scale) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto view_a = reinterpret_cast(j_view_a); - auto view_b = reinterpret_cast(j_view_b); - auto scale = static_cast(j_product_scale); - return cudf::jni::convert_table_for_return(env, cudf::jni::multiply_decimal128(*view_a, *view_b, - scale)); + auto view_a = reinterpret_cast(j_view_a); + auto view_b = reinterpret_cast(j_view_b); + auto scale = static_cast(j_product_scale); + return cudf::jni::convert_table_for_return( + env, cudf::jni::multiply_decimal128(*view_a, *view_b, scale)); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_divide128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_quotient_scale, - jboolean j_is_int_div) { +JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_divide128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_quotient_scale, jboolean j_is_int_div) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto view_a = reinterpret_cast(j_view_a); - auto view_b = reinterpret_cast(j_view_b); - auto scale = static_cast(j_quotient_scale); + auto view_a = reinterpret_cast(j_view_a); + auto view_b = reinterpret_cast(j_view_b); + auto scale = static_cast(j_quotient_scale); auto is_int_division = static_cast(j_is_int_div); if (is_int_division) { - return cudf::jni::convert_table_for_return(env, cudf::jni::integer_divide_decimal128(*view_a, *view_b, scale)); + return cudf::jni::convert_table_for_return( + env, cudf::jni::integer_divide_decimal128(*view_a, *view_b, scale)); } else { - return cudf::jni::convert_table_for_return(env, cudf::jni::divide_decimal128(*view_a, *view_b, scale)); + return cudf::jni::convert_table_for_return( + env, cudf::jni::divide_decimal128(*view_a, *view_b, scale)); } } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_remainder128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_remainder_scale) { +JNIEXPORT jlongArray JNICALL 
Java_com_nvidia_spark_rapids_jni_DecimalUtils_remainder128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_remainder_scale) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto view_a = reinterpret_cast(j_view_a); - auto view_b = reinterpret_cast(j_view_b); - auto scale = static_cast(j_remainder_scale); - return cudf::jni::convert_table_for_return(env, cudf::jni::remainder_decimal128(*view_a, *view_b, scale)); + auto view_a = reinterpret_cast(j_view_a); + auto view_b = reinterpret_cast(j_view_b); + auto scale = static_cast(j_remainder_scale); + return cudf::jni::convert_table_for_return( + env, cudf::jni::remainder_decimal128(*view_a, *view_b, scale)); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_add128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_target_scale) { +JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_add128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_target_scale) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto const view_a= reinterpret_cast(j_view_a); - auto const view_b= reinterpret_cast(j_view_b); - auto const scale = static_cast(j_target_scale); - return cudf::jni::convert_table_for_return(env, cudf::jni::add_decimal128(*view_a, *view_b, - scale)); + auto const view_a = reinterpret_cast(j_view_a); + auto const view_b = reinterpret_cast(j_view_b); + auto const scale = static_cast(j_target_scale); + return cudf::jni::convert_table_for_return(env, + cudf::jni::add_decimal128(*view_a, *view_b, scale)); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_subtract128(JNIEnv *env, jclass, - jlong j_view_a, - jlong j_view_b, - jint j_target_scale) { +JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_DecimalUtils_subtract128( + JNIEnv* env, jclass, jlong j_view_a, jlong j_view_b, jint j_target_scale) +{ JNI_NULL_CHECK(env, j_view_a, "column is null", 0); JNI_NULL_CHECK(env, j_view_b, "column is null", 0); try { cudf::jni::auto_set_device(env); - auto const view_a = reinterpret_cast(j_view_a); - auto const view_b = reinterpret_cast(j_view_b); - auto const scale = static_cast(j_target_scale); - return cudf::jni::convert_table_for_return(env, cudf::jni::sub_decimal128(*view_a, *view_b, - scale)); + auto const view_a = reinterpret_cast(j_view_a); + auto const view_b = reinterpret_cast(j_view_b); + auto const scale = static_cast(j_target_scale); + return cudf::jni::convert_table_for_return(env, + cudf::jni::sub_decimal128(*view_a, *view_b, scale)); } CATCH_STD(env, 0); } -} // extern "C" +} // extern "C" diff --git a/src/main/cpp/src/MapUtilsJni.cpp b/src/main/cpp/src/MapUtilsJni.cpp index fbbcdd889f..dc02d04370 100644 --- a/src/main/cpp/src/MapUtilsJni.cpp +++ b/src/main/cpp/src/MapUtilsJni.cpp @@ -22,12 +22,13 @@ extern "C" { JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_MapUtils_extractRawMapFromJsonString( - JNIEnv *env, jclass, jlong input_handle) { + JNIEnv* env, jclass, jlong input_handle) +{ JNI_NULL_CHECK(env, input_handle, "json_column_handle is null", 0); try { cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(input_handle); + auto const input = reinterpret_cast(input_handle); return cudf::jni::ptr_as_jlong(spark_rapids_jni::from_json(*input).release()); 
} CATCH_STD(env, 0); diff --git a/src/main/cpp/src/NativeParquetJni.cpp b/src/main/cpp/src/NativeParquetJni.cpp index 5d51857dcb..c6d90be0cc 100644 --- a/src/main/cpp/src/NativeParquetJni.cpp +++ b/src/main/cpp/src/NativeParquetJni.cpp @@ -14,15 +14,15 @@ * limitations under the License. */ -#include +#include #include +#include #include #include -#include // TCompactProtocol requires some #defines to work right. // This came from the parquet code itself... -#define SIGNED_RIGHT_SHIFT_IS 1 +#define SIGNED_RIGHT_SHIFT_IS 1 #define ARITHMETIC_RIGHT_SHIFT 1 #include #include @@ -42,17 +42,16 @@ namespace jni { * and may not produce the exact same result as the JVM does. This is probably good enough * for now. */ -std::string unicode_to_lower(std::string const& input) { +std::string unicode_to_lower(std::string const& input) +{ std::mbstate_t to_wc_state = std::mbstate_t(); - const char * mbstr = input.data(); + const char* mbstr = input.data(); // get the size of the wide character result std::size_t wide_size = std::mbsrtowcs(nullptr, &mbstr, 0, &to_wc_state); - if (wide_size < 0) { - throw std::invalid_argument("invalid character sequence"); - } + if (wide_size < 0) { throw std::invalid_argument("invalid character sequence"); } std::vector wide(wide_size + 1); - // Set a null so we can get a proper output size from wcstombs. This is because + // Set a null so we can get a proper output size from wcstombs. This is because // we pass in a max length of 0, so it will only stop when it see the null character. wide.back() = 0; if (std::mbsrtowcs(wide.data(), &mbstr, wide_size, &to_wc_state) != wide_size) { @@ -63,11 +62,9 @@ std::string unicode_to_lower(std::string const& input) { } // Get the multi-byte result size std::mbstate_t from_wc_state = std::mbstate_t(); - const wchar_t * wcstr = wide.data(); - std::size_t mb_size = std::wcsrtombs(nullptr, &wcstr, 0, &from_wc_state); - if (mb_size < 0) { - throw std::invalid_argument("unsupported wide character sequence"); - } + const wchar_t* wcstr = wide.data(); + std::size_t mb_size = std::wcsrtombs(nullptr, &wcstr, 0, &from_wc_state); + if (mb_size < 0) { throw std::invalid_argument("unsupported wide character sequence"); } // We are allocating a fixed size string so we can put the data directly into it // instead of going through a NUL terminated char* first. The NUL fill char is // just because we need to pass in a fill char. The value does not matter @@ -102,351 +99,451 @@ struct column_pruning_maps { * lets us match the Spark schema to the schema in the Parquet file. Different * versions of parquet had different layouts for various nested types. */ -enum class Tag { - VALUE = 0, - STRUCT, - LIST, - MAP -}; +enum class Tag { VALUE = 0, STRUCT, LIST, MAP }; /** * This class will handle processing column pruning for a schema. It is written as a class because - * of JNI we are sending the names of the columns as a depth first list, like parquet does internally. + * of JNI we are sending the names of the columns as a depth first list, like parquet does + * internally. */ class column_pruner { -public: - /** - * Create pruning filter from a depth first flattened tree of names and num_children. - * The root entry is not included in names or in num_children, but parent_num_children - * should hold how many entries there are in it. 
- */ - column_pruner(std::vector const & names, - std::vector const & num_children, - std::vector const & tags, - int const parent_num_children): children(), tag(Tag::STRUCT) { - add_depth_first(names, num_children, tags, parent_num_children); - } + public: + /** + * Create pruning filter from a depth first flattened tree of names and num_children. + * The root entry is not included in names or in num_children, but parent_num_children + * should hold how many entries there are in it. + */ + column_pruner(std::vector const& names, + std::vector const& num_children, + std::vector const& tags, + int const parent_num_children) + : children(), tag(Tag::STRUCT) + { + add_depth_first(names, num_children, tags, parent_num_children); + } - column_pruner(Tag const in_tag): children(), tag(in_tag) { - } + column_pruner(Tag const in_tag) : children(), tag(in_tag) {} - column_pruner(): children(), tag(Tag::STRUCT) { - } + column_pruner() : children(), tag(Tag::STRUCT) {} - /** - * Given a schema from a parquet file create a set of pruning maps to prune columns from the rest of the footer - */ - column_pruning_maps filter_schema(std::vector const & schema, bool const ignore_case) const { - CUDF_FUNC_RANGE(); + /** + * Given a schema from a parquet file create a set of pruning maps to prune columns from the rest + * of the footer + */ + column_pruning_maps filter_schema(std::vector const& schema, + bool const ignore_case) const + { + CUDF_FUNC_RANGE(); - // These are the outputs of the computation. - std::vector chunk_map; - std::vector schema_map; - std::vector schema_num_children; - std::size_t current_input_schema_index = 0; - std::size_t next_input_chunk_index = 0; + // These are the outputs of the computation. + std::vector chunk_map; + std::vector schema_map; + std::vector schema_num_children; + std::size_t current_input_schema_index = 0; + std::size_t next_input_chunk_index = 0; + + filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + + return column_pruning_maps{ + std::move(schema_map), std::move(schema_num_children), std::move(chunk_map)}; + } - filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); + private: + std::string get_name(parquet::format::SchemaElement& elem, + const bool normalize_case = false) const + { + return normalize_case ? unicode_to_lower(elem.name) : elem.name; + } - return column_pruning_maps{std::move(schema_map), - std::move(schema_num_children), - std::move(chunk_map)}; - } + int get_num_children(parquet::format::SchemaElement& elem) const + { + return elem.__isset.num_children ? elem.num_children : 0; + } -private: - std::string get_name(parquet::format::SchemaElement & elem, const bool normalize_case = false) const { - return normalize_case ? unicode_to_lower(elem.name) : elem.name; - } + void skip(std::vector const& schema, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index) const + { + // We want to skip everything referenced by the current_input_schema_index and its children. + // But we do have to update the chunk indexes as we go. 
+ int num_to_skip = 1; + while (num_to_skip > 0 && current_input_schema_index < schema.size()) { + auto schema_item = schema[current_input_schema_index]; + bool is_leaf = schema_item.__isset.type; + if (is_leaf) { ++next_input_chunk_index; } + + if (schema_item.__isset.num_children) { + num_to_skip = num_to_skip + schema_item.num_children; + } - int get_num_children(parquet::format::SchemaElement & elem) const { - return elem.__isset.num_children ? elem.num_children : 0; + --num_to_skip; + ++current_input_schema_index; } + } - void skip(std::vector const & schema, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index) const { - // We want to skip everything referenced by the current_input_schema_index and its children. - // But we do have to update the chunk indexes as we go. - int num_to_skip = 1; - while (num_to_skip > 0 && current_input_schema_index < schema.size()) { - auto schema_item = schema[current_input_schema_index]; - bool is_leaf = schema_item.__isset.type; - if (is_leaf) { - ++next_input_chunk_index; - } - - if (schema_item.__isset.num_children) { - num_to_skip = num_to_skip + schema_item.num_children; - } - - --num_to_skip; - ++current_input_schema_index; + /** + * filter_schema, but specific to Tag::STRUCT. + */ + void filter_schema_struct(std::vector const& schema, + bool const ignore_case, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + // First verify that we found a struct, like we expected to find. + auto struct_schema_item = schema.at(current_input_schema_index); + bool is_leaf = struct_schema_item.__isset.type; + if (is_leaf) { throw std::runtime_error("Found a leaf node, but expected to find a struct"); } + + int num_children = get_num_children(struct_schema_item); + // Now that everything looks good add ourselves into the maps, and move to the next entry to + // look at. + schema_map.push_back(current_input_schema_index); + // We will update the num_children each time we find one... + int our_num_children_index = schema_num_children.size(); + schema_num_children.push_back(0); + ++current_input_schema_index; + + // For a STRUCT we want to look for all of the children that match the name and let each of them + // handle updating things themselves. + for (int child_id = 0; child_id < num_children && current_input_schema_index < schema.size(); + child_id++) { + auto schema_item = schema[current_input_schema_index]; + std::string name = get_name(schema_item, ignore_case); + auto found = children.find(name); + + if (found != children.end()) { + // found a match so update the number of children that passed the filter and ask it to + // filter itself. + ++schema_num_children[our_num_children_index]; + found->second.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + } else { + // No match was found so skip the child. + skip(schema, current_input_schema_index, next_input_chunk_index); } } + } - /** - * filter_schema, but specific to Tag::STRUCT. - */ - void filter_schema_struct(std::vector const & schema, bool const ignore_case, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - // First verify that we found a struct, like we expected to find. 
- auto struct_schema_item = schema.at(current_input_schema_index); - bool is_leaf = struct_schema_item.__isset.type; - if (is_leaf) { - throw std::runtime_error("Found a leaf node, but expected to find a struct"); - } - - int num_children = get_num_children(struct_schema_item); - // Now that everything looks good add ourselves into the maps, and move to the next entry to look at. - schema_map.push_back(current_input_schema_index); - // We will update the num_children each time we find one... - int our_num_children_index = schema_num_children.size(); - schema_num_children.push_back(0); - ++current_input_schema_index; - - // For a STRUCT we want to look for all of the children that match the name and let each of them handle updating things - // themselves. - for (int child_id = 0; child_id < num_children && current_input_schema_index < schema.size(); child_id++) { - auto schema_item = schema[current_input_schema_index]; - std::string name = get_name(schema_item, ignore_case); - auto found = children.find(name); - - if (found != children.end()) { - // found a match so update the number of children that passed the filter and ask it to filter itself. - ++schema_num_children[our_num_children_index]; - found->second.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - } else { - // No match was found so skip the child. - skip(schema, current_input_schema_index, next_input_chunk_index); - } - } + /** + * filter_schema, but specific to Tag::VALUE. + */ + void filter_schema_value(std::vector const& schema, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + auto schema_item = schema.at(current_input_schema_index); + bool is_leaf = schema_item.__isset.type; + if (!is_leaf) { throw std::runtime_error("found a non-leaf entry when reading a leaf value"); } + if (get_num_children(schema_item) != 0) { + throw std::runtime_error("found an entry with children when reading a leaf value"); } + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(0); + ++current_input_schema_index; + chunk_map.push_back(next_input_chunk_index); + ++next_input_chunk_index; + } - /** - * filter_schema, but specific to Tag::VALUE. - */ - void filter_schema_value(std::vector const & schema, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - auto schema_item = schema.at(current_input_schema_index); - bool is_leaf = schema_item.__isset.type; - if (!is_leaf) { - throw std::runtime_error("found a non-leaf entry when reading a leaf value"); - } - if (get_num_children(schema_item) != 0) { - throw std::runtime_error("found an entry with children when reading a leaf value"); - } - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(0); - ++current_input_schema_index; - chunk_map.push_back(next_input_chunk_index); - ++next_input_chunk_index; + /** + * filter_schema, but specific to Tag::LIST. + */ + void filter_schema_list(std::vector const& schema, + bool const ignore_case, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + // By convention with the java code the child is always called "element"... 
+ auto found = children.at("element"); + // A list starts out as a group element(not leaf) with a ConvertedType that is a LIST + // Under it will be a repeated element + auto list_schema_item = schema.at(current_input_schema_index); + std::string list_name = list_schema_item.name; + bool is_group = !list_schema_item.__isset.type; + + // Rules for how to parse lists from the parquet format docs + // 1. If the repeated field is not a group, then its type is the element type and elements are + // required. + // 2. If the repeated field is a group with multiple fields, then its type is the element type + // and elements are required. + // 3. If the repeated field is a group with one field and is named either array or uses the + // LIST-annotated group's name + // with _tuple appended then the repeated type is the element type and elements are required. + // 4. Otherwise, the repeated field's type is the element type with the repeated field's + // repetition. + + if (!is_group) { + if (!list_schema_item.__isset.repetition_type || + list_schema_item.repetition_type != parquet::format::FieldRepetitionType::REPEATED) { + throw std::runtime_error("expected list item to be repeating"); + } + return filter_schema_value(schema, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); } - - /** - * filter_schema, but specific to Tag::LIST. - */ - void filter_schema_list(std::vector const & schema, bool const ignore_case, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - // By convention with the java code the child is always called "element"... - auto found = children.at("element"); - // A list starts out as a group element(not leaf) with a ConvertedType that is a LIST - // Under it will be a repeated element - auto list_schema_item = schema.at(current_input_schema_index); - std::string list_name = list_schema_item.name; - bool is_group = !list_schema_item.__isset.type; - - // Rules for how to parse lists from the parquet format docs - // 1. If the repeated field is not a group, then its type is the element type and elements are required. - // 2. If the repeated field is a group with multiple fields, then its type is the element type and elements are required. - // 3. If the repeated field is a group with one field and is named either array or uses the LIST-annotated group's name - // with _tuple appended then the repeated type is the element type and elements are required. - // 4. Otherwise, the repeated field's type is the element type with the repeated field's repetition. 
- - if (!is_group) { - if (!list_schema_item.__isset.repetition_type || - list_schema_item.repetition_type != parquet::format::FieldRepetitionType::REPEATED) { - throw std::runtime_error("expected list item to be repeating"); - } - return filter_schema_value(schema, current_input_schema_index, next_input_chunk_index, - chunk_map, schema_map, schema_num_children); - } - if (!list_schema_item.__isset.converted_type || list_schema_item.converted_type != parquet::format::ConvertedType::LIST) { - throw std::runtime_error("expected a list type, but it was not found."); - } - if (get_num_children(list_schema_item) != 1) { - throw std::runtime_error("the structure of the outer list group is not standard"); - } - - // Now that the top level group looks good add it into the maps, and then start to look at the children - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(1); - ++current_input_schema_index; - - auto repeated_field_schema_item = schema.at(current_input_schema_index); - if (!repeated_field_schema_item.__isset.repetition_type || repeated_field_schema_item.repetition_type != parquet::format::FieldRepetitionType::REPEATED) { - throw std::runtime_error("the structure of the list's child is not standard (non repeating)"); - } - - bool repeated_field_is_group = !repeated_field_schema_item.__isset.type; - int repeated_field_num_children = get_num_children(repeated_field_schema_item); - std::string repeated_field_name = repeated_field_schema_item.name; - if (repeated_field_is_group && repeated_field_num_children == 1 && - repeated_field_name != "array" && repeated_field_name != (list_name + "_tuple")) { - // This is the "standard" format where there are two groups and then a child under the the second group that holds the data. - // so add in the middle repeated group to the map - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(1); - ++current_input_schema_index; - - // And let the child filter itself. - found.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - } else { - // This is for an older format that is some times used where it is just two levels - found.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - } + if (!list_schema_item.__isset.converted_type || + list_schema_item.converted_type != parquet::format::ConvertedType::LIST) { + throw std::runtime_error("expected a list type, but it was not found."); + } + if (get_num_children(list_schema_item) != 1) { + throw std::runtime_error("the structure of the outer list group is not standard"); } - /** - * filter_schema, but specific to Tag::MAP. - */ - void filter_schema_map(std::vector const & schema, bool const ignore_case, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - // By convention with the java code the children are always called "key" and "value"... - auto key_found = children.at("key"); - auto value_found = children.at("value"); - auto map_schema_item = schema.at(current_input_schema_index); - - // Maps are two levels. An outer group that has a ConvertedType of MAP or MAP_KEY_VALUE - // and then an inner group that has two fields a key (that is required) and a value, that is optional. 
- - bool is_map_group = !map_schema_item.__isset.type; - if (!is_map_group) { - throw std::runtime_error("expected a map item, but found a single value"); - } - if (!map_schema_item.__isset.converted_type || - (map_schema_item.converted_type != parquet::format::ConvertedType::MAP && - map_schema_item.converted_type != parquet::format::ConvertedType::MAP_KEY_VALUE)) { - throw std::runtime_error("expected a map type, but it was not found."); - } - if (get_num_children(map_schema_item) != 1) { - throw std::runtime_error("the structure of the outer map group is not standard"); - } + // Now that the top level group looks good add it into the maps, and then start to look at the + // children + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(1); + ++current_input_schema_index; + + auto repeated_field_schema_item = schema.at(current_input_schema_index); + if (!repeated_field_schema_item.__isset.repetition_type || + repeated_field_schema_item.repetition_type != + parquet::format::FieldRepetitionType::REPEATED) { + throw std::runtime_error("the structure of the list's child is not standard (non repeating)"); + } - // The outer group looks good so lets add it in. - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(1); - ++current_input_schema_index; + bool repeated_field_is_group = !repeated_field_schema_item.__isset.type; + int repeated_field_num_children = get_num_children(repeated_field_schema_item); + std::string repeated_field_name = repeated_field_schema_item.name; + if (repeated_field_is_group && repeated_field_num_children == 1 && + repeated_field_name != "array" && repeated_field_name != (list_name + "_tuple")) { + // This is the "standard" format where there are two groups and then a child under the the + // second group that holds the data. so add in the middle repeated group to the map + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(1); + ++current_input_schema_index; + + // And let the child filter itself. + found.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + } else { + // This is for an older format that is some times used where it is just two levels + found.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + } + } - // Now lets look at the repeated child. - auto repeated_field_schema_item = schema.at(current_input_schema_index); - if (!repeated_field_schema_item.__isset.repetition_type || repeated_field_schema_item.repetition_type != parquet::format::FieldRepetitionType::REPEATED) { - throw std::runtime_error("found non repeating map child"); - } + /** + * filter_schema, but specific to Tag::MAP. + */ + void filter_schema_map(std::vector const& schema, + bool const ignore_case, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + // By convention with the java code the children are always called "key" and "value"... + auto key_found = children.at("key"); + auto value_found = children.at("value"); + auto map_schema_item = schema.at(current_input_schema_index); + + // Maps are two levels. An outer group that has a ConvertedType of MAP or MAP_KEY_VALUE + // and then an inner group that has two fields a key (that is required) and a value, that is + // optional. 
+ + bool is_map_group = !map_schema_item.__isset.type; + if (!is_map_group) { + throw std::runtime_error("expected a map item, but found a single value"); + } + if (!map_schema_item.__isset.converted_type || + (map_schema_item.converted_type != parquet::format::ConvertedType::MAP && + map_schema_item.converted_type != parquet::format::ConvertedType::MAP_KEY_VALUE)) { + throw std::runtime_error("expected a map type, but it was not found."); + } + if (get_num_children(map_schema_item) != 1) { + throw std::runtime_error("the structure of the outer map group is not standard"); + } - int repeated_field_num_children = get_num_children(repeated_field_schema_item); + // The outer group looks good so lets add it in. + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(1); + ++current_input_schema_index; + + // Now lets look at the repeated child. + auto repeated_field_schema_item = schema.at(current_input_schema_index); + if (!repeated_field_schema_item.__isset.repetition_type || + repeated_field_schema_item.repetition_type != + parquet::format::FieldRepetitionType::REPEATED) { + throw std::runtime_error("found non repeating map child"); + } - if (repeated_field_num_children != 1 && repeated_field_num_children != 2) { - throw std::runtime_error("found map with wrong number of children"); - } + int repeated_field_num_children = get_num_children(repeated_field_schema_item); - schema_map.push_back(current_input_schema_index); - schema_num_children.push_back(repeated_field_num_children); - ++current_input_schema_index; + if (repeated_field_num_children != 1 && repeated_field_num_children != 2) { + throw std::runtime_error("found map with wrong number of children"); + } - // Process the key... - key_found.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - if (repeated_field_num_children == 2) { - // Process the value... - value_found.filter_schema(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - } + schema_map.push_back(current_input_schema_index); + schema_num_children.push_back(repeated_field_num_children); + ++current_input_schema_index; + + // Process the key... + key_found.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + if (repeated_field_num_children == 2) { + // Process the value... + value_found.filter_schema(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); } + } - /** - * Recursive method to parse and update the maps to filter out columns in the schema and chunks. - * Each column_pruner is responsible to parse out from schema what it holds and skip anything - * that does not match. chunk_map, schema_map, and schema_num_children are the final outputs. - * current_input_schema_index and next_input_chunk_index are also outputs but are state that is - * passed to each child and returned when it comsumes comething. 
- */ - void filter_schema(std::vector const & schema, bool const ignore_case, - std::size_t & current_input_schema_index, std::size_t & next_input_chunk_index, - std::vector & chunk_map, std::vector & schema_map, std::vector & schema_num_children) const { - switch(tag) { - case Tag::STRUCT: - filter_schema_struct(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - break; - case Tag::VALUE: - filter_schema_value(schema, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - break; - case Tag::LIST: - filter_schema_list(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - break; - case Tag::MAP: - filter_schema_map(schema, ignore_case, current_input_schema_index, next_input_chunk_index, chunk_map, schema_map, schema_num_children); - break; - default: - throw std::runtime_error(std::string("INTERNAL ERROR UNEXPECTED TAG FOUND ") + std::to_string(static_cast(tag))); - } + /** + * Recursive method to parse and update the maps to filter out columns in the schema and chunks. + * Each column_pruner is responsible to parse out from schema what it holds and skip anything + * that does not match. chunk_map, schema_map, and schema_num_children are the final outputs. + * current_input_schema_index and next_input_chunk_index are also outputs but are state that is + * passed to each child and returned when it comsumes comething. + */ + void filter_schema(std::vector const& schema, + bool const ignore_case, + std::size_t& current_input_schema_index, + std::size_t& next_input_chunk_index, + std::vector& chunk_map, + std::vector& schema_map, + std::vector& schema_num_children) const + { + switch (tag) { + case Tag::STRUCT: + filter_schema_struct(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + break; + case Tag::VALUE: + filter_schema_value(schema, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + break; + case Tag::LIST: + filter_schema_list(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + break; + case Tag::MAP: + filter_schema_map(schema, + ignore_case, + current_input_schema_index, + next_input_chunk_index, + chunk_map, + schema_map, + schema_num_children); + break; + default: + throw std::runtime_error(std::string("INTERNAL ERROR UNEXPECTED TAG FOUND ") + + std::to_string(static_cast(tag))); } + } - /** - * Do a depth first traversal to build up column_pruner into a tree that matches the schema we want to filter using. 
- */ - void add_depth_first(std::vector const& names, - std::vector const& num_children, - std::vector const& tags, - int parent_num_children) { - CUDF_FUNC_RANGE(); - if (parent_num_children == 0) { - // There is no point in doing more the tree is empty, and it lets us avoid some corner cases - // in the code below - return; - } - auto num = names.size(); - std::vector tree_stack; - std::vector num_children_stack; - tree_stack.push_back(this); - num_children_stack.push_back(parent_num_children); - for(uint64_t i = 0; i < num; ++i) { - auto name = names[i]; - auto num_c = num_children[i]; - auto t = tags[i]; - tree_stack.back()->children.try_emplace(name, t); - if (num_c > 0) { - tree_stack.push_back(&tree_stack.back()->children[name]); - num_children_stack.push_back(num_c); - } else { - // go back up the stack/tree removing children until we hit one with more children - bool done = false; - while (!done) { - int parent_children_left = num_children_stack.back() - 1; - if (parent_children_left > 0) { - num_children_stack.back() = parent_children_left; - done = true; - } else { - tree_stack.pop_back(); - num_children_stack.pop_back(); - } - - if (tree_stack.size() <= 0) { - done = true; - } + /** + * Do a depth first traversal to build up column_pruner into a tree that matches the schema we + * want to filter using. + */ + void add_depth_first(std::vector const& names, + std::vector const& num_children, + std::vector const& tags, + int parent_num_children) + { + CUDF_FUNC_RANGE(); + if (parent_num_children == 0) { + // There is no point in doing more the tree is empty, and it lets us avoid some corner cases + // in the code below + return; + } + auto num = names.size(); + std::vector tree_stack; + std::vector num_children_stack; + tree_stack.push_back(this); + num_children_stack.push_back(parent_num_children); + for (uint64_t i = 0; i < num; ++i) { + auto name = names[i]; + auto num_c = num_children[i]; + auto t = tags[i]; + tree_stack.back()->children.try_emplace(name, t); + if (num_c > 0) { + tree_stack.push_back(&tree_stack.back()->children[name]); + num_children_stack.push_back(num_c); + } else { + // go back up the stack/tree removing children until we hit one with more children + bool done = false; + while (!done) { + int parent_children_left = num_children_stack.back() - 1; + if (parent_children_left > 0) { + num_children_stack.back() = parent_children_left; + done = true; + } else { + tree_stack.pop_back(); + num_children_stack.pop_back(); } + + if (tree_stack.size() <= 0) { done = true; } } } - if (tree_stack.size() != 0 || num_children_stack.size() != 0) { - throw std::invalid_argument("DIDN'T CONSUME EVERYTHING..."); - } } + if (tree_stack.size() != 0 || num_children_stack.size() != 0) { + throw std::invalid_argument("DIDN'T CONSUME EVERYTHING..."); + } + } - std::map children; - Tag tag; + std::map children; + Tag tag; }; -static bool invalid_file_offset(long start_index, long pre_start_index, long pre_compressed_size) { +static bool invalid_file_offset(long start_index, long pre_start_index, long pre_compressed_size) +{ bool invalid = false; // checking the first rowGroup if (pre_start_index == 0 && start_index != 4) { @@ -454,7 +551,7 @@ static bool invalid_file_offset(long start_index, long pre_start_index, long pre return invalid; } - //calculate start index for other blocks + // calculate start index for other blocks int64_t min_start_index = pre_start_index + pre_compressed_size; if (start_index < min_start_index) { // a bad offset detected, try first column's offset @@ 
-465,8 +562,9 @@ static bool invalid_file_offset(long start_index, long pre_start_index, long pre return invalid; } -static int64_t get_offset(parquet::format::ColumnChunk const& column_chunk) { - auto md = column_chunk.meta_data; +static int64_t get_offset(parquet::format::ColumnChunk const& column_chunk) +{ + auto md = column_chunk.meta_data; int64_t offset = md.data_page_offset; if (md.__isset.dictionary_page_offset && offset > md.dictionary_page_offset) { offset = md.dictionary_page_offset; @@ -474,73 +572,75 @@ static int64_t get_offset(parquet::format::ColumnChunk const& column_chunk) { return offset; } -static std::vector filter_groups(parquet::format::FileMetaData const& meta, - int64_t part_offset, int64_t part_length) { - CUDF_FUNC_RANGE(); - // This is based off of the java parquet_mr code to find the groups in a range... - auto num_row_groups = meta.row_groups.size(); - int64_t pre_start_index = 0; - int64_t pre_compressed_size = 0; - bool first_column_with_metadata = true; - if (num_row_groups > 0) { - first_column_with_metadata = meta.row_groups[0].columns[0].__isset.meta_data; - } +static std::vector filter_groups( + parquet::format::FileMetaData const& meta, int64_t part_offset, int64_t part_length) +{ + CUDF_FUNC_RANGE(); + // This is based off of the java parquet_mr code to find the groups in a range... + auto num_row_groups = meta.row_groups.size(); + int64_t pre_start_index = 0; + int64_t pre_compressed_size = 0; + bool first_column_with_metadata = true; + if (num_row_groups > 0) { + first_column_with_metadata = meta.row_groups[0].columns[0].__isset.meta_data; + } - std::vector filtered_groups; - for (uint64_t rg_i = 0; rg_i < num_row_groups; ++rg_i) { - parquet::format::RowGroup const& row_group = meta.row_groups[rg_i]; - int64_t total_size = 0; - int64_t start_index; - auto column_chunk = row_group.columns[0]; - if (first_column_with_metadata) { - start_index = get_offset(column_chunk); + std::vector filtered_groups; + for (uint64_t rg_i = 0; rg_i < num_row_groups; ++rg_i) { + parquet::format::RowGroup const& row_group = meta.row_groups[rg_i]; + int64_t total_size = 0; + int64_t start_index; + auto column_chunk = row_group.columns[0]; + if (first_column_with_metadata) { + start_index = get_offset(column_chunk); + } else { + // the file_offset of first block always holds the truth, while other blocks don't : + // see PARQUET-2078 for details + start_index = row_group.file_offset; + if (invalid_file_offset(start_index, pre_start_index, pre_compressed_size)) { + // first row group's offset is always 4 + if (pre_start_index == 0) { + start_index = 4; } else { - //the file_offset of first block always holds the truth, while other blocks don't : - //see PARQUET-2078 for details - start_index = row_group.file_offset; - if (invalid_file_offset(start_index, pre_start_index, pre_compressed_size)) { - //first row group's offset is always 4 - if (pre_start_index == 0) { - start_index = 4; - } else { - // use minStartIndex(imprecise in case of padding, but good enough for filtering) - start_index = pre_start_index + pre_compressed_size; - } - } - pre_start_index = start_index; - pre_compressed_size = row_group.total_compressed_size; - } - if (row_group.__isset.total_compressed_size) { - total_size = row_group.total_compressed_size; - } else { - auto num_columns = row_group.columns.size(); - for (uint64_t cc_i = 0; cc_i < num_columns; ++cc_i) { - parquet::format::ColumnChunk const& col = row_group.columns[cc_i]; - total_size += col.meta_data.total_compressed_size; + // use 
minStartIndex(imprecise in case of padding, but good enough for filtering) + start_index = pre_start_index + pre_compressed_size; } } - - int64_t mid_point = start_index + total_size / 2; - if (mid_point >= part_offset && mid_point < (part_offset + part_length)) { - filtered_groups.push_back(row_group); + pre_start_index = start_index; + pre_compressed_size = row_group.total_compressed_size; + } + if (row_group.__isset.total_compressed_size) { + total_size = row_group.total_compressed_size; + } else { + auto num_columns = row_group.columns.size(); + for (uint64_t cc_i = 0; cc_i < num_columns; ++cc_i) { + parquet::format::ColumnChunk const& col = row_group.columns[cc_i]; + total_size += col.meta_data.total_compressed_size; } } - return filtered_groups; + + int64_t mid_point = start_index + total_size / 2; + if (mid_point >= part_offset && mid_point < (part_offset + part_length)) { + filtered_groups.push_back(row_group); + } + } + return filtered_groups; } -void deserialize_parquet_footer(uint8_t * buffer, uint32_t len, parquet::format::FileMetaData * meta) { +void deserialize_parquet_footer(uint8_t* buffer, uint32_t len, parquet::format::FileMetaData* meta) +{ using ThriftBuffer = apache::thrift::transport::TMemoryBuffer; CUDF_FUNC_RANGE(); - // A lot of this came from the parquet source code... - // Deserialize msg bytes into c++ thrift msg using memory transport. - #if PARQUET_THRIFT_VERSION_MAJOR > 0 || PARQUET_THRIFT_VERSION_MINOR >= 14 +// A lot of this came from the parquet source code... +// Deserialize msg bytes into c++ thrift msg using memory transport. +#if PARQUET_THRIFT_VERSION_MAJOR > 0 || PARQUET_THRIFT_VERSION_MINOR >= 14 auto conf = std::make_shared(); conf->setMaxMessageSize(std::numeric_limits::max()); auto tmem_transport = std::make_shared(buffer, len, ThriftBuffer::OBSERVE, conf); - #else +#else auto tmem_transport = std::make_shared(buffer, len); - #endif +#endif apache::thrift::protocol::TCompactProtocolFactoryT tproto_factory; // Protect against CPU and memory bombs @@ -549,7 +649,7 @@ void deserialize_parquet_footer(uint8_t * buffer, uint32_t len, parquet::format: // This limits total memory to the same order of magnitude as stringSize. 
tproto_factory.setContainerSizeLimit(1000 * 1000); std::shared_ptr tproto = - tproto_factory.getProtocol(tmem_transport); + tproto_factory.getProtocol(tmem_transport); try { meta->read(tproto.get()); } catch (std::exception& e) { @@ -559,7 +659,8 @@ void deserialize_parquet_footer(uint8_t * buffer, uint32_t len, parquet::format: } } -void filter_columns(std::vector & groups, std::vector & chunk_filter) { +void filter_columns(std::vector& groups, std::vector& chunk_filter) +{ CUDF_FUNC_RANGE(); for (auto group_it = groups.begin(); group_it != groups.end(); ++group_it) { std::vector new_chunks; @@ -570,24 +671,27 @@ void filter_columns(std::vector & groups, std::vector } } -} -} +} // namespace jni +} // namespace rapids extern "C" { -JNIEXPORT long JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFilter(JNIEnv * env, jclass, - jlong buffer, - jlong buffer_length, - jlong part_offset, - jlong part_length, - jobjectArray filter_col_names, - jintArray num_children, - jintArray tags, - jint parent_num_children, - jboolean ignore_case) { +JNIEXPORT long JNICALL +Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFilter(JNIEnv* env, + jclass, + jlong buffer, + jlong buffer_length, + jlong part_offset, + jlong part_length, + jobjectArray filter_col_names, + jintArray num_children, + jintArray tags, + jint parent_num_children, + jboolean ignore_case) +{ CUDF_FUNC_RANGE(); try { - auto meta = std::make_unique(); + auto meta = std::make_unique(); uint32_t len = static_cast(buffer_length); // We don't support encrypted parquet... rapids::jni::deserialize_parquet_footer(reinterpret_cast(buffer), len, meta.get()); @@ -603,18 +707,18 @@ JNIEXPORT long JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFil } rapids::jni::column_pruner pruner(n_filter_col_names.as_cpp_vector(), - std::vector(n_num_children.begin(), n_num_children.end()), - tags, - parent_num_children); + std::vector(n_num_children.begin(), n_num_children.end()), + tags, + parent_num_children); auto filter = pruner.filter_schema(meta->schema, ignore_case); // start by filtering the schema and the chunks std::size_t new_schema_size = filter.schema_map.size(); std::vector new_schema(new_schema_size); for (std::size_t i = 0; i < new_schema_size; ++i) { - int orig_index = filter.schema_map[i]; - int new_num_children = filter.schema_num_children[i]; - new_schema[i] = meta->schema[orig_index]; + int orig_index = filter.schema_map[i]; + int new_num_children = filter.schema_num_children[i]; + new_schema[i] = meta->schema[orig_index]; new_schema[i].num_children = new_num_children; } meta->schema = std::move(new_schema); @@ -636,21 +740,25 @@ JNIEXPORT long JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_readAndFil CATCH_STD(env, 0); } -JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_close(JNIEnv * env, jclass, - jlong handle) { +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_close(JNIEnv* env, + jclass, + jlong handle) +{ try { - parquet::format::FileMetaData * ptr = reinterpret_cast(handle); + parquet::format::FileMetaData* ptr = reinterpret_cast(handle); delete ptr; } CATCH_STD(env, ); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumRows(JNIEnv * env, jclass, - jlong handle) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumRows(JNIEnv* env, + jclass, + jlong handle) +{ try { - parquet::format::FileMetaData * ptr = reinterpret_cast(handle); - long ret = 0; - for(auto it = ptr->row_groups.begin(); it != 
ptr->row_groups.end(); ++it) { + parquet::format::FileMetaData* ptr = reinterpret_cast(handle); + long ret = 0; + for (auto it = ptr->row_groups.begin(); it != ptr->row_groups.end(); ++it) { ret = ret + it->num_rows; } return ret; @@ -658,55 +766,56 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumRow CATCH_STD(env, -1); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumColumns(JNIEnv * env, jclass, - jlong handle) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_getNumColumns(JNIEnv* env, + jclass, + jlong handle) +{ try { - parquet::format::FileMetaData * ptr = reinterpret_cast(handle); - int ret = 0; + parquet::format::FileMetaData* ptr = reinterpret_cast(handle); + int ret = 0; if (ptr->schema.size() > 0) { - if (ptr->schema[0].__isset.num_children) { - ret = ptr->schema[0].num_children; - } + if (ptr->schema[0].__isset.num_children) { ret = ptr->schema[0].num_children; } } return ret; } CATCH_STD(env, -1); } -JNIEXPORT jobject JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_serializeThriftFile(JNIEnv * env, jclass, - jlong handle) { +JNIEXPORT jobject JNICALL Java_com_nvidia_spark_rapids_jni_ParquetFooter_serializeThriftFile( + JNIEnv* env, jclass, jlong handle) +{ CUDF_FUNC_RANGE(); try { - parquet::format::FileMetaData * meta = reinterpret_cast(handle); + parquet::format::FileMetaData* meta = reinterpret_cast(handle); std::shared_ptr transportOut( - new apache::thrift::transport::TMemoryBuffer()); - apache::thrift::protocol::TCompactProtocolFactoryT factory; + new apache::thrift::transport::TMemoryBuffer()); + apache::thrift::protocol::TCompactProtocolFactoryT + factory; auto protocolOut = factory.getProtocol(transportOut); meta->write(protocolOut.get()); - uint8_t * buf_ptr; + uint8_t* buf_ptr; uint32_t buf_size; transportOut->getBuffer(&buf_ptr, &buf_size); // 12 extra is for the MAGIC thrift_footer length MAGIC - jobject ret = cudf::jni::allocate_host_buffer(env, buf_size + 12, false); + jobject ret = cudf::jni::allocate_host_buffer(env, buf_size + 12, false); uint8_t* ret_addr = reinterpret_cast(cudf::jni::get_host_buffer_address(env, ret)); - ret_addr[0] = 'P'; - ret_addr[1] = 'A'; - ret_addr[2] = 'R'; - ret_addr[3] = '1'; + ret_addr[0] = 'P'; + ret_addr[1] = 'A'; + ret_addr[2] = 'R'; + ret_addr[3] = '1'; std::memcpy(ret_addr + 4, buf_ptr, buf_size); - uint8_t * after = ret_addr + buf_size + 4; - after[0] = static_cast(0xFF & buf_size); - after[1] = static_cast(0xFF & (buf_size >> 8)); - after[2] = static_cast(0xFF & (buf_size >> 16)); - after[3] = static_cast(0xFF & (buf_size >> 24)); - after[4] = 'P'; - after[5] = 'A'; - after[6] = 'R'; - after[7] = '1'; + uint8_t* after = ret_addr + buf_size + 4; + after[0] = static_cast(0xFF & buf_size); + after[1] = static_cast(0xFF & (buf_size >> 8)); + after[2] = static_cast(0xFF & (buf_size >> 16)); + after[3] = static_cast(0xFF & (buf_size >> 24)); + after[4] = 'P'; + after[5] = 'A'; + after[6] = 'R'; + after[7] = '1'; return ret; } CATCH_STD(env, nullptr); } - } diff --git a/src/main/cpp/src/RowConversionJni.cpp b/src/main/cpp/src/RowConversionJni.cpp index 2d1da7e453..1fdb8a86b5 100644 --- a/src/main/cpp/src/RowConversionJni.cpp +++ b/src/main/cpp/src/RowConversionJni.cpp @@ -21,37 +21,42 @@ extern "C" { JNIEXPORT jlongArray JNICALL -Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, - jlong input_table) { +Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRowsFixedWidthOptimized(JNIEnv* 
env, + jclass, + jlong input_table) +{ JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view const *n_input_table = reinterpret_cast(input_table); + cudf::table_view const* n_input_table = reinterpret_cast(input_table); std::vector> cols = - spark_rapids_jni::convert_to_rows_fixed_width_optimized(*n_input_table); + spark_rapids_jni::convert_to_rows_fixed_width_optimized(*n_input_table); int const num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::transform(cols.begin(), cols.end(), outcol_handles.begin(), - [](auto &col) { return cudf::jni::release_as_jlong(col); }); + std::transform(cols.begin(), cols.end(), outcol_handles.begin(), [](auto& col) { + return cudf::jni::release_as_jlong(col); + }); return outcol_handles.get_jArray(); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRows( - JNIEnv *env, jclass, jlong input_table) { +JNIEXPORT jlongArray JNICALL +Java_com_nvidia_spark_rapids_jni_RowConversion_convertToRows(JNIEnv* env, jclass, jlong input_table) +{ JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view const *n_input_table = reinterpret_cast(input_table); + cudf::table_view const* n_input_table = reinterpret_cast(input_table); std::vector> cols = - spark_rapids_jni::convert_to_rows(*n_input_table); + spark_rapids_jni::convert_to_rows(*n_input_table); int const num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - std::transform(cols.begin(), cols.end(), outcol_handles.begin(), - [](auto &col) { return cudf::jni::release_as_jlong(col); }); + std::transform(cols.begin(), cols.end(), outcol_handles.begin(), [](auto& col) { + return cudf::jni::release_as_jlong(col); + }); return outcol_handles.get_jArray(); } CATCH_STD(env, 0); @@ -59,46 +64,54 @@ JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_conv JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_convertFromRowsFixedWidthOptimized( - JNIEnv *env, jclass, jlong input_column, jintArray types, jintArray scale) { + JNIEnv* env, jclass, jlong input_column, jintArray types, jintArray scale) +{ JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); try { cudf::jni::auto_set_device(env); - cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); if (n_types.size() != n_scale.size()) { JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", NULL); } std::vector types_vec; - std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), + std::transform(n_types.begin(), + n_types.end(), + n_scale.begin(), + std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = - spark_rapids_jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); + spark_rapids_jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_RowConversion_convertFromRows( - JNIEnv *env, jclass, jlong input_column, 
jintArray types, jintArray scale) { + JNIEnv* env, jclass, jlong input_column, jintArray types, jintArray scale) +{ JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); try { cudf::jni::auto_set_device(env); - cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); if (n_types.size() != n_scale.size()) { JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", NULL); } std::vector types_vec; - std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), + std::transform(n_types.begin(), + n_types.end(), + n_scale.begin(), + std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = - spark_rapids_jni::convert_from_rows(list_input, types_vec); + spark_rapids_jni::convert_from_rows(list_input, types_vec); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp index 13a7a50a95..bcbe8080ca 100644 --- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp +++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp @@ -32,8 +32,8 @@ namespace { -constexpr char const *RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/RetryOOM"; -constexpr char const *SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/SplitAndRetryOOM"; +constexpr char const* RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/RetryOOM"; +constexpr char const* SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/SplitAndRetryOOM"; // In the task states BUFN means Block Until Further Notice. // Meaning the thread should be blocked until another task finishes. @@ -43,33 +43,34 @@ constexpr char const *SPLIT_AND_RETRY_OOM_CLASS = "com/nvidia/spark/rapids/jni/S // in the future to know when a retry section has passed, which would // probably be a preferable time to restart all BUFN threads. enum thread_state { - UNKNOWN = -1, // unknown state, this is really here for logging and anything transitioning to - // this state should actually be accomplished by deleting the thread from the state - TASK_RUNNING = 0, // task thread running normally - TASK_WAIT_ON_SHUFFLE = 1, // task thread waiting on shuffle - TASK_BUFN_WAIT_ON_SHUFFLE = 2, // task thread waiting on shuffle, but marked as BUFN - TASK_ALLOC = 3, // task thread in the middle of doing an allocation - TASK_ALLOC_FREE = 4, // task thread in the middle of doing an allocation and a free happened - TASK_BLOCKED = 5, // task thread that is temporarily blocked - TASK_BUFN_THROW = 6, // task thread that should throw an exception to roll back before blocking - TASK_BUFN_WAIT = 7, // task thread that threw an exception to roll back and now should - // block the next time alloc or block_until_ready is called - TASK_BUFN = 8, // task thread that is blocked until higher priority tasks start to succeed - TASK_SPLIT_THROW = 9, // task thread that should throw an exception to split input and retry - TASK_REMOVE_THROW = 10, // task thread that is being removed and needs to throw an exception - // to start the blocked thread running again. 
- SHUFFLE_RUNNING = 11, // shuffle thread that is running normally - SHUFFLE_ALLOC = 12, // shuffle thread that is in the middle of doing an alloc - SHUFFLE_ALLOC_FREE = 13, // shuffle thread that is doing an alloc and a free happened. - SHUFFLE_BLOCKED = 14, // shuffle thread that is temporarily blocked - SHUFFLE_THROW = 15, // shuffle thread that needs to throw an OOM - SHUFFLE_REMOVE_THROW = 16 // shuffle thread that is being removed and needs to throw an exception + UNKNOWN = -1, // unknown state, this is really here for logging and anything transitioning to + // this state should actually be accomplished by deleting the thread from the state + TASK_RUNNING = 0, // task thread running normally + TASK_WAIT_ON_SHUFFLE = 1, // task thread waiting on shuffle + TASK_BUFN_WAIT_ON_SHUFFLE = 2, // task thread waiting on shuffle, but marked as BUFN + TASK_ALLOC = 3, // task thread in the middle of doing an allocation + TASK_ALLOC_FREE = 4, // task thread in the middle of doing an allocation and a free happened + TASK_BLOCKED = 5, // task thread that is temporarily blocked + TASK_BUFN_THROW = 6, // task thread that should throw an exception to roll back before blocking + TASK_BUFN_WAIT = 7, // task thread that threw an exception to roll back and now should + // block the next time alloc or block_until_ready is called + TASK_BUFN = 8, // task thread that is blocked until higher priority tasks start to succeed + TASK_SPLIT_THROW = 9, // task thread that should throw an exception to split input and retry + TASK_REMOVE_THROW = 10, // task thread that is being removed and needs to throw an exception + // to start the blocked thread running again. + SHUFFLE_RUNNING = 11, // shuffle thread that is running normally + SHUFFLE_ALLOC = 12, // shuffle thread that is in the middle of doing an alloc + SHUFFLE_ALLOC_FREE = 13, // shuffle thread that is doing an alloc and a free happened. + SHUFFLE_BLOCKED = 14, // shuffle thread that is temporarily blocked + SHUFFLE_THROW = 15, // shuffle thread that needs to throw an OOM + SHUFFLE_REMOVE_THROW = 16 // shuffle thread that is being removed and needs to throw an exception }; /** * Convert a state to a string representation for logging. */ -const char *as_str(thread_state state) { +const char* as_str(thread_state state) +{ switch (state) { case TASK_RUNNING: return "TASK_RUNNING"; case TASK_WAIT_ON_SHUFFLE: return "TASK_WAIT_ON_SHUFFLE"; @@ -92,20 +93,23 @@ const char *as_str(thread_state state) { } } -static std::shared_ptr make_logger(std::ostream &stream) { +static std::shared_ptr make_logger(std::ostream& stream) +{ return std::make_shared("SPARK_RMM", std::make_shared(stream)); } -static std::shared_ptr make_logger() { +static std::shared_ptr make_logger() +{ return std::make_shared("SPARK_RMM", std::make_shared()); } -static auto make_logger(std::string const &filename) { +static auto make_logger(std::string const& filename) +{ return std::make_shared( - "SPARK_RMM", - std::make_shared(filename, true /*truncate file*/)); + "SPARK_RMM", + std::make_shared(filename, true /*truncate file*/)); } /** @@ -121,15 +125,16 @@ static auto make_logger(std::string const &filename) { * will be MAX_LONG - (task_id + 1). 
*/ class thread_priority { -public: + public: thread_priority(long tsk_id, long t_id) : task_id(tsk_id), thread_id(t_id) {} long get_thread_id() const { return thread_id; } long get_task_id() const { return task_id; } - bool operator<(const thread_priority &other) const { - long task_priority = this->task_priority(); + bool operator<(const thread_priority& other) const + { + long task_priority = this->task_priority(); long other_task_priority = other.task_priority(); if (task_priority < other_task_priority) { return true; @@ -139,8 +144,9 @@ class thread_priority { return false; } - bool operator>(const thread_priority &other) const { - long task_priority = this->task_priority(); + bool operator>(const thread_priority& other) const + { + long task_priority = this->task_priority(); long other_task_priority = other.task_priority(); if (task_priority > other_task_priority) { return true; @@ -150,12 +156,13 @@ class thread_priority { return false; } - void operator=(const thread_priority &other) { - task_id = other.task_id; + void operator=(const thread_priority& other) + { + task_id = other.task_id; thread_id = other.thread_id; } -private: + private: long task_id; long thread_id; @@ -169,72 +176,77 @@ class thread_priority { * this should be accessed with a lock held. */ class full_thread_state { -public: + public: full_thread_state(thread_state state, long thread_id) : state(state), thread_id(thread_id) {} full_thread_state(thread_state state, long thread_id, long task_id) - : state(state), thread_id(thread_id), task_id(task_id) {} + : state(state), thread_id(thread_id), task_id(task_id) + { + } thread_state state; long thread_id; - long task_id = -1; - int retry_oom_injected = 0; + long task_id = -1; + int retry_oom_injected = 0; int split_and_retry_oom_injected = 0; - int cudf_exception_injected = 0; + int cudf_exception_injected = 0; // watchdog limit on maximum number of retries to avoid unexpected live lock situations int num_times_retried = 0; // metric for being able to report how many times each type of exception was thrown, // and some timings - int num_times_retry_throw = 0; + int num_times_retry_throw = 0; int num_times_split_retry_throw = 0; - long time_blocked_nanos = 0; + long time_blocked_nanos = 0; // The amount of time that this thread has lost due to retries (not inclduing blocked time) long time_lost_nanos = 0; - // The amount of time that this thread has spent in the current retry block (not inclucing block time) + // The amount of time that this thread has spent in the current retry block (not inclucing block + // time) long time_retry_running_nanos = 0; // When did the retry time for this thread start, or when did the block time end. std::chrono::time_point retry_start_or_block_end; // Is this thread currently in a marked retry block. This is only used for metrics. bool is_in_retry = false; - std::chrono::time_point block_start; std::unique_ptr wake_condition = - std::make_unique(); + std::make_unique(); /** * Transition to a new state. Ideally this is what is called when doing a state transition instead * of setting the state directly. 
*/ - void transition_to(thread_state new_state) { + void transition_to(thread_state new_state) + { if (new_state == thread_state::UNKNOWN) { throw std::runtime_error( - "Going to UNKNOWN state should delete the thread state, not call transition_to"); + "Going to UNKNOWN state should delete the thread state, not call transition_to"); } state = new_state; } - void before_block() { + void before_block() + { block_start = std::chrono::steady_clock::now(); // Don't record running time lost while we are blocked... record_and_reset_pending_retry_time(); } - void after_block() { - auto end = std::chrono::steady_clock::now(); + void after_block() + { + auto end = std::chrono::steady_clock::now(); auto diff = end - block_start; time_blocked_nanos += std::chrono::duration_cast(diff).count(); - if (is_in_retry) { - retry_start_or_block_end = end; - } + if (is_in_retry) { retry_start_or_block_end = end; } } - long get_and_reset_failed_retry_time() { - long ret = time_lost_nanos; + long get_and_reset_failed_retry_time() + { + long ret = time_lost_nanos; time_lost_nanos = 0; return ret; } - void record_failed_retry_time() { + void record_failed_retry_time() + { if (is_in_retry) { record_and_reset_pending_retry_time(); time_lost_nanos += time_retry_running_nanos; @@ -242,20 +254,21 @@ class full_thread_state { } } - void record_and_reset_pending_retry_time() { + void record_and_reset_pending_retry_time() + { if (is_in_retry) { - auto end = std::chrono::steady_clock::now(); + auto end = std::chrono::steady_clock::now(); auto diff = end - retry_start_or_block_end; - time_retry_running_nanos += std::chrono::duration_cast(diff).count(); + time_retry_running_nanos += + std::chrono::duration_cast(diff).count(); retry_start_or_block_end = end; } } - void reset_retry_state(bool is_in_retry) { + void reset_retry_state(bool is_in_retry) + { time_retry_running_nanos = 0; - if (is_in_retry) { - retry_start_or_block_end = std::chrono::steady_clock::now(); - } + if (is_in_retry) { retry_start_or_block_end = std::chrono::steady_clock::now(); } this->is_in_retry = is_in_retry; } @@ -273,20 +286,20 @@ class full_thread_state { * memory error. */ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { -public: - spark_resource_adaptor(JNIEnv *env, rmm::mr::device_memory_resource *mr, - std::shared_ptr &logger) - : resource{mr}, logger{logger} { - if (env->GetJavaVM(&jvm) < 0) { - throw std::runtime_error("GetJavaVM failed"); - } + public: + spark_resource_adaptor(JNIEnv* env, + rmm::mr::device_memory_resource* mr, + std::shared_ptr& logger) + : resource{mr}, logger{logger} + { + if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } logger->flush_on(spdlog::level::info); logger->set_pattern("%v"); logger->info("time,op,current thread,op thread,op task,from state,to state,notes"); logger->set_pattern("%H:%M:%S.%f,%v"); } - rmm::mr::device_memory_resource *get_wrapped_resource() { return resource; } + rmm::mr::device_memory_resource* get_wrapped_resource() { return resource; } bool supports_get_mem_info() const noexcept override { return resource->supports_get_mem_info(); } @@ -302,13 +315,12 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * was an error and the entire executor is shutting down. So there should be no * reuse. 
*/ - void associate_thread_with_task(long thread_id, long task_id) { + void associate_thread_with_task(long thread_id, long task_id) + { std::unique_lock lock(state_mutex); - if (shutting_down) { - throw std::runtime_error("spark_resource_adaptor is shutting down"); - } - auto was_threads_inserted = threads.emplace( - thread_id, full_thread_state(thread_state::TASK_RUNNING, thread_id, task_id)); + if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } + auto was_threads_inserted = + threads.emplace(thread_id, full_thread_state(thread_state::TASK_RUNNING, thread_id, task_id)); if (was_threads_inserted.second == false) { if (was_threads_inserted.first->second.task_id != task_id) { throw std::invalid_argument("a thread can only be associated with a single task."); @@ -325,7 +337,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // task_to_threads already has a task_id for this, so insert the thread_id was_inserted.first->second.insert(thread_id); } - } catch (const std::exception &) { + } catch (const std::exception&) { if (was_threads_inserted.second == true) { // roll back the thread insertion threads.erase(thread_id); @@ -337,25 +349,24 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } - void start_retry_block(long thread_id) { + void start_retry_block(long thread_id) + { std::unique_lock lock(state_mutex); auto thread = threads.find(thread_id); - if (thread != threads.end()) { - thread->second.reset_retry_state(true); - } + if (thread != threads.end()) { thread->second.reset_retry_state(true); } } - void end_retry_block(long thread_id) { + void end_retry_block(long thread_id) + { std::unique_lock lock(state_mutex); auto thread = threads.find(thread_id); - if (thread != threads.end()) { - thread->second.reset_retry_state(false); - } + if (thread != threads.end()) { thread->second.reset_retry_state(false); } } - long get_and_reset_lost_time(long task_id) { + long get_and_reset_lost_time(long task_id) + { std::unique_lock lock(state_mutex); - long ret = 0; + long ret = 0; auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { for (auto thread_id : task_at->second) { @@ -375,14 +386,13 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * this is an error is if the thread is already marked as shutting down and has * not completed that transition yet. */ - void associate_thread_with_shuffle(long thread_id) { + void associate_thread_with_shuffle(long thread_id) + { std::unique_lock lock(state_mutex); - if (shutting_down) { - throw std::runtime_error("spark_resource_adaptor is shutting down"); - } + if (shutting_down) { throw std::runtime_error("spark_resource_adaptor is shutting down"); } auto was_inserted = - threads.emplace(thread_id, full_thread_state(thread_state::SHUFFLE_RUNNING, thread_id)); + threads.emplace(thread_id, full_thread_state(thread_state::SHUFFLE_RUNNING, thread_id)); if (was_inserted.second == true) { log_transition(thread_id, -1, thread_state::UNKNOWN, thread_state::SHUFFLE_RUNNING); } else if (was_inserted.first->second.task_id != -1) { @@ -399,11 +409,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * up and throw an exception. At that point the thread's state will be completely * removed. 
*/ - void remove_thread_association(long thread_id) { + void remove_thread_association(long thread_id) + { std::unique_lock lock(state_mutex); - if (remove_thread_association(thread_id, lock)) { - wake_up_threads_after_task_finishes(lock); - } + if (remove_thread_association(thread_id, lock)) { wake_up_threads_after_task_finishes(lock); } } /** @@ -412,19 +421,18 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * threads are currently blocked/waiting then the state will not be totally * removed until the thread is woken. */ - void task_done(long task_id) { + void task_done(long task_id) + { std::unique_lock lock(state_mutex); auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { // we want to make a copy so there is no conflict here... std::set threads_to_remove = task_at->second; - bool run_checks = false; + bool run_checks = false; for (auto thread_id : threads_to_remove) { run_checks = remove_thread_association(thread_id, lock) || run_checks; } - if (run_checks) { - wake_up_threads_after_task_finishes(lock); - } + if (run_checks) { wake_up_threads_after_task_finishes(lock); } } task_to_threads.erase(task_id); } @@ -434,7 +442,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * to shut down everything in an orderly way and wait for all of the * threads to be done. */ - void all_done() { + void all_done() + { { std::unique_lock lock(state_mutex); // 1. Mark all threads that need to be removed as such @@ -470,7 +479,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Force a specific thread to throw one or more RetryOOM exceptions when an * alloc is called. This is intended only for testing. */ - void force_retry_oom(long thread_id, int num_ooms) { + void force_retry_oom(long thread_id, int num_ooms) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -484,7 +494,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Force a specific thread to throw one or more SplitAndRetryOOM exceptions * when an alloc is called. This is intended only for testing. */ - void force_split_and_retry_oom(long thread_id, int num_ooms) { + void force_split_and_retry_oom(long thread_id, int num_ooms) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -498,7 +509,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * force a specific thread to throw one or more CudfExceptions when an * alloc is called. This is intended only for testing. */ - void force_cudf_exception(long thread_id, int num_times) { + void force_cudf_exception(long thread_id, int num_times) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -511,9 +523,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the number of times a retry was thrown and reset the value to 0. 
*/ - int get_and_reset_num_retry(long task_id) { + int get_and_reset_num_retry(long task_id) + { std::unique_lock lock(state_mutex); - int ret = 0; + int ret = 0; auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { for (auto thread_id : task_at->second) { @@ -530,9 +543,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the number of times a split and retry was thrown and reset the value to 0. */ - int get_and_reset_num_split_retry(long task_id) { + int get_and_reset_num_split_retry(long task_id) + { std::unique_lock lock(state_mutex); - int ret = 0; + int ret = 0; auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { for (auto thread_id : task_at->second) { @@ -549,9 +563,10 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * get the time in ns that the task was blocked for. */ - long get_and_reset_block_time(long task_id) { + long get_and_reset_block_time(long task_id) + { std::unique_lock lock(state_mutex); - long ret = 0; + long ret = 0; auto task_at = task_to_threads.find(task_id); if (task_at != task_to_threads.end()) { for (auto thread_id : task_at->second) { @@ -569,7 +584,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * Update the internal state so that this thread is known that it is going to enter a * shuffle stage and could indirectly block on a shuffle thread (UCX). */ - void thread_could_block_on_shuffle(long thread_id) { + void thread_could_block_on_shuffle(long thread_id) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -601,7 +617,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * Indicate that the thread no longer will block indirectly on a shuffle thread. */ - void thread_done_with_shuffle(long thread_id) { + void thread_done_with_shuffle(long thread_id) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -635,7 +652,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * before an alloc is called. If this is not called alloc will also call into the * same code and block if needed until the task is ready to keep going. */ - void block_thread_until_ready() { + void block_thread_until_ready() + { auto thread_id = static_cast(pthread_self()); std::unique_lock lock(state_mutex); block_thread_until_ready(thread_id, lock); @@ -645,7 +663,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * This is really here just for testing. It provides a way to look at the * current state of a thread. 
*/ - int get_thread_state_as_int(long thread_id) { + int get_thread_state_as_int(long thread_id) + { std::unique_lock lock(state_mutex); auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { @@ -655,9 +674,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } -private: - rmm::mr::device_memory_resource *const resource; - std::shared_ptr logger; ///< spdlog logger object + private: + rmm::mr::device_memory_resource* const resource; + std::shared_ptr logger; ///< spdlog logger object // The state mutex must be held when modifying the state of threads or tasks // it must never be held when calling into the child resource or after returning @@ -667,26 +686,38 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { std::map threads; std::map> task_to_threads; bool shutting_down = false; - JavaVM *jvm; + JavaVM* jvm; /** * log a status change that does not involve a state transition. */ - void log_status(const char *op, long thread_id, long task_id, thread_state state, - const char *notes = nullptr) { + void log_status( + const char* op, long thread_id, long task_id, thread_state state, const char* notes = nullptr) + { auto this_id = static_cast(pthread_self()); - logger->info("{},{},{},{},{},,{}", op, this_id, thread_id, task_id, as_str(state), + logger->info("{},{},{},{},{},,{}", + op, + this_id, + thread_id, + task_id, + as_str(state), (notes == nullptr ? "" : notes)); } /** * log that a state transition happened. */ - void log_transition(long thread_id, long task_id, thread_state from, thread_state to, - const char *notes = nullptr) { + void log_transition( + long thread_id, long task_id, thread_state from, thread_state to, const char* notes = nullptr) + { auto this_id = static_cast(pthread_self()); - logger->info("TRANSITION,{},{},{},{},{},{}", this_id, thread_id, task_id, as_str(from), - as_str(to), (notes == nullptr ? "" : notes)); + logger->info("TRANSITION,{},{},{},{},{},{}", + this_id, + thread_id, + task_id, + as_str(from), + as_str(to), + (notes == nullptr ? "" : notes)); } /** @@ -694,7 +725,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * of setting the state directly. This will log the transition and do a little bit of * verification. */ - void transition(full_thread_state &state, thread_state new_state, const char *message = nullptr) { + void transition(full_thread_state& state, thread_state new_state, const char* message = nullptr) + { thread_state original = state.state; state.transition_to(new_state); log_transition(state.thread_id, state.task_id, original, new_state, message); @@ -703,8 +735,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * throw a java exception using the cached jvm/env. */ - void throw_java_exception(const char *ex_class_name, const char *msg) { - JNIEnv *env = cudf::jni::get_jni_env(jvm); + void throw_java_exception(const char* ex_class_name, const char* msg) + { + JNIEnv* env = cudf::jni::get_jni_env(jvm); cudf::jni::throw_java_exception(env, ex_class_name, msg); } @@ -712,7 +745,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * This is a watchdog to prevent us from live locking. It should be called before we throw an * RetryOOM or a SplitAndRetryOOM to know if we actually should throw something else. 
*/ - void check_before_oom(full_thread_state &state, const std::unique_lock &lock) { + void check_before_oom(full_thread_state& state, const std::unique_lock& lock) + { // The limit is an arbitrary number, large enough that we should not hit it in "normal" // operation, but also small enough that we can detect a livelock fairly quickly. // In testing it looks like it is a few ms if in a tight loop, not including spill @@ -724,23 +758,28 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { state.num_times_retried++; } - void throw_retry_oom(const char *msg, full_thread_state &state, - const std::unique_lock &lock) { + void throw_retry_oom(const char* msg, + full_thread_state& state, + const std::unique_lock& lock) + { state.num_times_retry_throw++; check_before_oom(state, lock); state.record_failed_retry_time(); throw_java_exception(RETRY_OOM_CLASS, "GPU OutOfMemory"); } - void throw_split_and_retry_oom(const char *msg, full_thread_state &state, - const std::unique_lock &lock) { + void throw_split_and_retry_oom(const char* msg, + full_thread_state& state, + const std::unique_lock& lock) + { state.num_times_split_retry_throw++; check_before_oom(state, lock); state.record_failed_retry_time(); throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "GPU OutOfMemory"); } - bool is_blocked(thread_state state) { + bool is_blocked(thread_state state) + { switch (state) { case TASK_BLOCKED: // fall through @@ -754,8 +793,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { /** * Internal implementation that will block a thread until it is ready to continue. */ - void block_thread_until_ready(long thread_id, std::unique_lock &lock) { - bool done = false; + void block_thread_until_ready(long thread_id, std::unique_lock& lock) + { + bool done = false; bool first_time = true; // Because this is called from alloc as well as from the public facing block_thread_until_ready // there are states that should only show up in relation to alloc failing. These include @@ -812,14 +852,14 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { case TASK_SPLIT_THROW: transition(thread->second, thread_state::TASK_RUNNING); thread->second.record_failed_retry_time(); - throw_split_and_retry_oom("rollback, split input, and retry operation", thread->second, - lock); + throw_split_and_retry_oom( + "rollback, split input, and retry operation", thread->second, lock); break; case TASK_REMOVE_THROW: // fall through case SHUFFLE_REMOVE_THROW: - log_transition(thread_id, thread->second.task_id, thread->second.state, - thread_state::UNKNOWN); + log_transition( + thread_id, thread->second.task_id, thread->second.state, thread_state::UNKNOWN); // don't need to record failed time metric the thread is already gone... threads.erase(thread); task_has_woken_condition.notify_all(); @@ -845,7 +885,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * and if there are no blocked threads, then we wake up all BUFN threads. * Hopefully the frees have already woken up all the blocked threads anyways. 
*/ - void wake_up_threads_after_task_finishes(const std::unique_lock &lock) { + void wake_up_threads_after_task_finishes(const std::unique_lock& lock) + { bool are_any_tasks_just_blocked = false; for (auto thread = threads.begin(); thread != threads.end(); thread++) { switch (thread->second.state) { @@ -885,16 +926,15 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * returns true if the thread that ended was a normally running task thread. * This should be used to decide if wake_up_threads_after_task_finishes is called or not. */ - bool remove_thread_association(long thread_id, const std::unique_lock &lock) { - bool ret = false; + bool remove_thread_association(long thread_id, const std::unique_lock& lock) + { + bool ret = false; auto threads_at = threads.find(thread_id); if (threads_at != threads.end()) { auto task_id = threads_at->second.task_id; if (task_id >= 0) { auto task_at = task_to_threads.find(task_id); - if (task_at != task_to_threads.end()) { - task_at->second.erase(thread_id); - } + if (task_at != task_to_threads.end()) { task_at->second.erase(thread_id); } } switch (threads_at->second.state) { @@ -912,8 +952,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { ret = true; // fall through; default: - log_transition(thread_id, threads_at->second.task_id, threads_at->second.state, - thread_state::UNKNOWN); + log_transition( + thread_id, threads_at->second.task_id, threads_at->second.state, thread_state::UNKNOWN); threads.erase(threads_at); } } @@ -929,7 +969,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * entered the state machine. The only known case is GPU memory required for setup in * cuDF for a spill operation. */ - bool pre_alloc(long thread_id) { + bool pre_alloc(long thread_id) + { std::unique_lock lock(state_mutex); auto thread = threads.find(thread_id); @@ -956,8 +997,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (thread->second.cudf_exception_injected > 0) { thread->second.cudf_exception_injected--; - log_status("INJECTED_CUDF_EXCEPTION", thread_id, thread->second.task_id, - thread->second.state); + log_status( + "INJECTED_CUDF_EXCEPTION", thread_id, thread->second.task_id, thread->second.state); thread->second.record_failed_retry_time(); throw_java_exception(cudf::jni::CUDF_ERROR_CLASS, "injected CudfException"); } @@ -965,8 +1006,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (thread->second.split_and_retry_oom_injected > 0) { thread->second.split_and_retry_oom_injected--; thread->second.num_times_split_retry_throw++; - log_status("INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, - thread->second.state); + log_status( + "INJECTED_SPLIT_AND_RETRY_OOM", thread_id, thread->second.task_id, thread->second.state); thread->second.record_failed_retry_time(); throw_java_exception(SPLIT_AND_RETRY_OOM_CLASS, "injected SplitAndRetryOOM"); } @@ -1001,7 +1042,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * `likely_spill` if this allocation should be treated differently, because * we detected recursion while handling a prior allocation in this thread. 
*/ - void post_alloc_success(long thread_id, bool likely_spill) { + void post_alloc_success(long thread_id, bool likely_spill) + { std::unique_lock lock(state_mutex); // pre allocate checks auto thread = threads.find(thread_id); @@ -1027,7 +1069,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * This is typically called when a free happens, or an alloc succeeds. * @param is_from_free true if a free happen. */ - void wake_next_highest_priority_blocked(const std::unique_lock &lock, bool is_from_free) { + void wake_next_highest_priority_blocked(const std::unique_lock& lock, + bool is_from_free) + { // 1. Find the highest priority blocked thread, including shuffle. thread_priority to_wake(-1, -1); bool is_to_wake_set = false; @@ -1036,7 +1080,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { if (state == thread_state::TASK_BLOCKED || state == thread_state::SHUFFLE_BLOCKED) { thread_priority current = thread->second.priority(); if (!is_to_wake_set || to_wake < current) { - to_wake = current; + to_wake = current; is_to_wake_set = true; } } @@ -1066,7 +1110,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } else if (is_from_free) { // 3. Otherwise look to see if we are in a BUFN deadlock state. // - // Memory was freed and if all of the tasks are in a BUFN state, + // Memory was freed and if all of the tasks are in a BUFN state, // then we want to wake up the highest priority one so it can make progress // instead of trying to split its input. But we only do this if it // is a different thread that is freeing memory from the one we want to wake up. @@ -1079,24 +1123,21 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { thread_priority to_wake(-1, -1); bool is_to_wake_set = false; for (auto thread = threads.begin(); thread != threads.end(); thread++) { - if (thread->second.task_id >= 0) { - tasks_with_threads.insert(thread->second.task_id); - } + if (thread->second.task_id >= 0) { tasks_with_threads.insert(thread->second.task_id); } switch (thread->second.state) { case TASK_BUFN_THROW: - // fall through + // fall through case TASK_BUFN_WAIT: - // fall through + // fall through case TASK_BUFN: { - tasks_with_threads_bufn.insert(thread->second.task_id); - thread_priority current = thread->second.priority(); - if (!is_to_wake_set || to_wake < current) { - to_wake = current; - is_to_wake_set = true; - } + tasks_with_threads_bufn.insert(thread->second.task_id); + thread_priority current = thread->second.priority(); + if (!is_to_wake_set || to_wake < current) { + to_wake = current; + is_to_wake_set = true; } - break; + } break; default: break; } } @@ -1108,7 +1149,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { // Don't wake up yourself on a free. 
It is not adding more memory for this thread // to use on a retry and we might need a split instead to break a deadlock auto this_id = static_cast(pthread_self()); - auto thread = threads.find(thread_id_to_wake); + auto thread = threads.find(thread_id_to_wake); if (thread != threads.end() && thread->first != this_id) { switch (thread->second.state) { case TASK_BUFN: @@ -1126,8 +1167,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { break; default: { std::stringstream ss; - ss << "internal error expected to only wake up blocked threads " << thread_id_to_wake - << " " << as_str(thread->second.state); + ss << "internal error expected to only wake up blocked threads " + << thread_id_to_wake << " " << as_str(thread->second.state); throw std::runtime_error(ss.str()); } } @@ -1142,7 +1183,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * called when a task or shuffle thread becomes blocked so that we can * check to see if one of them needs to become BUFN or do a split and rollback. */ - void check_and_update_for_bufn(const std::unique_lock &lock) { + void check_and_update_for_bufn(const std::unique_lock& lock) + { // We want to know if all active tasks have at least one thread that // is effectively blocked or not. We could change the definitions here, // but for now this sounds like a good starting point. @@ -1162,9 +1204,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } for (auto thread = threads.begin(); thread != threads.end(); thread++) { - if (thread->second.task_id >= 0) { - tasks_with_threads.insert(thread->second.task_id); - } + if (thread->second.task_id >= 0) { tasks_with_threads.insert(thread->second.task_id); } switch (thread->second.state) { case TASK_WAIT_ON_SHUFFLE: @@ -1184,7 +1224,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } bool need_to_break_deadlock = - tasks_with_threads.size() == tasks_with_threads_effectively_blocked.size(); + tasks_with_threads.size() == tasks_with_threads_effectively_blocked.size(); if (need_to_break_deadlock) { // Find the task thread with the lowest priority that is not already BUFN thread_priority to_bufn(-1, -1); @@ -1194,7 +1234,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { case TASK_BLOCKED: { thread_priority current = thread->second.priority(); if (!is_to_bufn_set || current < to_bufn) { - to_bufn = current; + to_bufn = current; is_to_bufn_set = true; } } break; @@ -1203,7 +1243,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } if (is_to_bufn_set) { long thread_id_to_bufn = to_bufn.get_thread_id(); - auto thread = threads.find(thread_id_to_bufn); + auto thread = threads.find(thread_id_to_bufn); if (thread != threads.end()) { transition(thread->second, thread_state::TASK_BUFN_THROW); thread->second.wake_condition->notify_all(); @@ -1221,21 +1261,19 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { case TASK_BUFN: { thread_priority current = thread->second.priority(); if (!is_to_wake_set || to_wake < current) { - to_wake = current; + to_wake = current; is_to_wake_set = true; } } break; case TASK_WAIT_ON_SHUFFLE: - if (!is_any_shuffle_thread_blocked) { - all_bufn_or_shuffle = false; - } + if (!is_any_shuffle_thread_blocked) { all_bufn_or_shuffle = false; } break; default: all_bufn_or_shuffle = false; break; } } } if (all_bufn_or_shuffle) { - long thread_id = to_wake.get_thread_id(); + long thread_id = 
to_wake.get_thread_id(); auto found_thread = threads.find(thread_id); if (found_thread != threads.end()) { transition(found_thread->second, thread_state::TASK_SPLIT_THROW); @@ -1264,7 +1302,8 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { * typically happen after this has run, and we loop around to retry the alloc * if the state says we should. */ - bool post_alloc_failed(long thread_id, bool is_oom, bool likely_spill) { + bool post_alloc_failed(long thread_id, bool is_oom, bool likely_spill) + { std::unique_lock lock(state_mutex); auto thread = threads.find(thread_id); // only retry if this was due to an out of memory exception. @@ -1304,19 +1343,18 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { return ret; } - void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { + void* do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override + { auto tid = static_cast(pthread_self()); while (true) { bool likely_spill = pre_alloc(tid); try { - void *ret = resource->allocate(num_bytes, stream); + void* ret = resource->allocate(num_bytes, stream); post_alloc_success(tid, likely_spill); return ret; - } catch (const std::bad_alloc &e) { - if (!post_alloc_failed(tid, true, likely_spill)) { - throw; - } - } catch (const std::exception &e) { + } catch (const std::bad_alloc& e) { + if (!post_alloc_failed(tid, true, likely_spill)) { throw; } + } catch (const std::exception& e) { post_alloc_failed(tid, false, likely_spill); throw; } @@ -1325,13 +1363,14 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { throw std::bad_alloc(); } - void do_deallocate(void *p, std::size_t size, rmm::cuda_stream_view stream) override { + void do_deallocate(void* p, std::size_t size, rmm::cuda_stream_view stream) override + { resource->deallocate(p, size, stream); // deallocate success if (size > 0) { std::unique_lock lock(state_mutex); - auto tid = static_cast(pthread_self()); + auto tid = static_cast(pthread_self()); auto thread = threads.find(tid); if (thread != threads.end()) { log_status("DEALLOC", tid, thread->second.task_id, thread->second.state); @@ -1363,17 +1402,19 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource { } } - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { + std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override + { return resource->get_mem_info(stream); } }; -} // namespace +} // namespace extern "C" { JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getCurrentThreadId(JNIEnv *env, jclass) { +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getCurrentThreadId(JNIEnv* env, jclass) +{ try { cudf::jni::auto_set_device(env); return static_cast(pthread_self()); @@ -1382,11 +1423,12 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getCurrentThreadId(JNIEnv } JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_createNewAdaptor( - JNIEnv *env, jclass, jlong child, jstring log_loc) { + JNIEnv* env, jclass, jlong child, jstring log_loc) +{ JNI_NULL_CHECK(env, child, "child is null", 0); try { cudf::jni::auto_set_device(env); - auto wrapped = reinterpret_cast(child); + auto wrapped = reinterpret_cast(child); cudf::jni::native_jstring nlogloc(env, log_loc); std::shared_ptr logger; if (nlogloc.is_null()) { @@ -1408,11 +1450,12 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_cr CATCH_STD(env, 0) } -JNIEXPORT void 
JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_releaseAdaptor( - JNIEnv *env, jclass, jlong ptr) { +JNIEXPORT void JNICALL +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_releaseAdaptor(JNIEnv* env, jclass, jlong ptr) +{ try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->all_done(); delete mr; } @@ -1420,144 +1463,159 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_rel } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithTask(JNIEnv *env, jclass, - jlong ptr, - jlong thread_id, - jlong task_id) { +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithTask( + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->associate_thread_with_task(thread_id, task_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithShuffle(JNIEnv *env, - jclass, jlong ptr, - jlong thread_id) { +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_associateThreadWithShuffle(JNIEnv* env, + jclass, + jlong ptr, + jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->associate_thread_with_shuffle(thread_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_removeThreadAssociation(JNIEnv *env, jclass, +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_removeThreadAssociation(JNIEnv* env, + jclass, jlong ptr, - jlong thread_id) { + jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->remove_thread_association(thread_id); } CATCH_STD(env, ) } -JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_taskDone( - JNIEnv *env, jclass, jlong ptr, jlong task_id) { +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_taskDone(JNIEnv* env, + jclass, + jlong ptr, + jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->task_done(task_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadCouldBlockOnShuffle(JNIEnv *env, jclass, +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadCouldBlockOnShuffle(JNIEnv* env, + jclass, jlong ptr, - jlong thread_id) { + jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->thread_could_block_on_shuffle(thread_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_threadDoneWithShuffle( - JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->thread_done_with_shuffle(thread_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL 
Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceRetryOOM( - JNIEnv *env, jclass, jlong ptr, jlong thread_id, jint num_ooms) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->force_retry_oom(thread_id, num_ooms); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceSplitAndRetryOOM( - JNIEnv *env, jclass, jlong ptr, jlong thread_id, jint num_ooms) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_ooms) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->force_split_and_retry_oom(thread_id, num_ooms); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_forceCudfException( - JNIEnv *env, jclass, jlong ptr, jlong thread_id, jint num_times) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id, jint num_times) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->force_cudf_exception(thread_id, num_times); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_blockThreadUntilReady( - JNIEnv *env, jclass, jlong ptr) { + JNIEnv* env, jclass, jlong ptr) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->block_thread_until_ready(); } CATCH_STD(env, ) } JNIEXPORT jint JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getStateOf( - JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_thread_state_as_int(thread_id); } CATCH_STD(env, 0) } JNIEXPORT jint JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetRetryThrowInternal(JNIEnv *env, +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetRetryThrowInternal(JNIEnv* env, jclass, jlong ptr, - jlong task_id) { + jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_and_reset_num_retry(task_id); } CATCH_STD(env, 0) @@ -1565,61 +1623,64 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetRetryThrowInter JNIEXPORT jint JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetSplitRetryThrowInternal( - JNIEnv *env, jclass, jlong ptr, jlong task_id) { + JNIEnv* env, jclass, jlong ptr, jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_and_reset_num_split_retry(task_id); } CATCH_STD(env, 0) } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetBlockTimeInternal(JNIEnv *env, +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetBlockTimeInternal(JNIEnv* env, jclass, jlong ptr, - jlong task_id) { + jlong task_id) +{ 
JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_and_reset_block_time(task_id); } CATCH_STD(env, 0) } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetComputeTimeLostToRetry(JNIEnv *env, - jclass, - jlong ptr, - jlong task_id) { +Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetComputeTimeLostToRetry( + JNIEnv* env, jclass, jlong ptr, jlong task_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); return mr->get_and_reset_lost_time(task_id); } CATCH_STD(env, 0) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_startRetryBlock( - JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->start_retry_block(thread_id); } CATCH_STD(env, ) } JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_endRetryBlock( - JNIEnv *env, jclass, jlong ptr, jlong thread_id) { + JNIEnv* env, jclass, jlong ptr, jlong thread_id) +{ JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", ); try { cudf::jni::auto_set_device(env); - auto mr = reinterpret_cast(ptr); + auto mr = reinterpret_cast(ptr); mr->end_retry_block(thread_id); } CATCH_STD(env, ) diff --git a/src/main/cpp/src/ZOrderJni.cpp b/src/main/cpp/src/ZOrderJni.cpp index 20d5ba92a4..37925f86d6 100644 --- a/src/main/cpp/src/ZOrderJni.cpp +++ b/src/main/cpp/src/ZOrderJni.cpp @@ -21,8 +21,9 @@ extern "C" { -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_ZOrder_interleaveBits(JNIEnv *env, jclass, jlongArray input_columns) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ZOrder_interleaveBits( + JNIEnv* env, jclass, jlongArray input_columns) +{ JNI_NULL_CHECK(env, input_columns, "input is null", 0); try { @@ -35,8 +36,9 @@ Java_com_nvidia_spark_rapids_jni_ZOrder_interleaveBits(JNIEnv *env, jclass, jlon CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_ZOrder_hilbertIndex(JNIEnv *env, jclass, jint num_bits, jlongArray input_columns) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ZOrder_hilbertIndex( + JNIEnv* env, jclass, jint num_bits, jlongArray input_columns) +{ JNI_NULL_CHECK(env, input_columns, "input is null", 0); try { @@ -48,7 +50,4 @@ Java_com_nvidia_spark_rapids_jni_ZOrder_hilbertIndex(JNIEnv *env, jclass, jint n } CATCH_STD(env, 0); } - - - } diff --git a/src/main/cpp/src/cast_decimal_to_string.cu b/src/main/cpp/src/cast_decimal_to_string.cu index 88e8a6fdb8..a256e8e917 100644 --- a/src/main/cpp/src/cast_decimal_to_string.cu +++ b/src/main/cpp/src/cast_decimal_to_string.cu @@ -87,9 +87,9 @@ struct decimal_to_non_ansi_string_fn { } else { // positive scale or adjusted exponent < -6 means scientific notation auto const extra_digits = abs_value_digits > 1 ? 
3 : 2; - return static_cast(value < 0) + // sign if negative - abs_value_digits + // number of digits - extra_digits + // decimal point if exists, E, +/- + return static_cast(value < 0) + // sign if negative + abs_value_digits + // number of digits + extra_digits + // decimal point if exists, E, +/- strings::detail::count_digits( numeric::detail::abs(adjusted_exponent)); // exponent portion } @@ -127,7 +127,7 @@ struct decimal_to_non_ansi_string_fn { d_buffer += strings::detail::integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part if (scale != 0) { - *d_buffer++ = '.'; // add decimal point + *d_buffer++ = '.'; // add decimal point thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); // add zeros d_buffer += num_zeros; diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu index 2cfcc62630..6f9de63d10 100644 --- a/src/main/cpp/src/cast_string.cu +++ b/src/main/cpp/src/cast_string.cu @@ -653,8 +653,8 @@ struct string_to_integer_impl { rmm::mr::device_memory_resource* mr) { if (string_col.size() == 0) { - return std::make_unique(data_type{type_to_id()}, 0, rmm::device_buffer{}, - rmm::device_buffer{}, 0); + return std::make_unique( + data_type{type_to_id()}, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } rmm::device_uvector data(string_col.size(), stream, mr); @@ -676,8 +676,11 @@ struct string_to_integer_impl { auto null_count = cudf::detail::null_count(null_mask.data(), 0, string_col.size(), stream); - auto col = std::make_unique( - data_type{type_to_id()}, string_col.size(), data.release(), null_mask.release(), null_count); + auto col = std::make_unique(data_type{type_to_id()}, + string_col.size(), + data.release(), + null_mask.release(), + null_count); if (ansi_mode) { validate_ansi_column(col->view(), string_col, stream); } @@ -743,9 +746,8 @@ struct string_to_decimal_impl { auto null_count = cudf::detail::null_count(null_mask.data(), 0, string_col.size(), stream); - auto col = - std::make_unique(dtype, string_col.size(), data.release(), - null_mask.release(), null_count); + auto col = std::make_unique( + dtype, string_col.size(), data.release(), null_mask.release(), null_count); if (ansi_mode) { validate_ansi_column(col->view(), string_col, stream); } @@ -829,8 +831,15 @@ std::unique_ptr string_to_decimal(int32_t precision, return std::make_unique(dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } - return type_dispatcher( - dtype, detail::string_to_decimal_impl{}, dtype, precision, string_col, ansi_mode, strip, stream, mr); + return type_dispatcher(dtype, + detail::string_to_decimal_impl{}, + dtype, + precision, + string_col, + ansi_mode, + strip, + stream, + mr); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/cast_string_to_float.cu b/src/main/cpp/src/cast_string_to_float.cu index 3c23c98e0a..8eab9eef16 100644 --- a/src/main/cpp/src/cast_string_to_float.cu +++ b/src/main/cpp/src/cast_string_to_float.cu @@ -362,9 +362,9 @@ class string_to_float { __ballot_sync(0xffffffff, _warp_lane < num_chars && !is_digit(_c)); auto const first_non_digit = __ffs(non_digit_mask); - // first non-digit after location 1 means there is something valid here, note ffs is 0 with no set bits, - // so 1 is the 0th character is not a digit. - // first non-digit of 0 means all digits, and that means we have seen a valid digit as well. + // first non-digit after location 1 means there is something valid here, note ffs is 0 with no + // set bits, so 1 is the 0th character is not a digit. 
first non-digit of 0 means all digits, + // and that means we have seen a valid digit as well. seen_valid_digit |= (num_chars > 0 && first_non_digit != 1); num_chars = min(num_chars, first_non_digit > 0 ? first_non_digit - 1 : num_chars); diff --git a/src/main/cpp/src/decimal_utils.cu b/src/main/cpp/src/decimal_utils.cu index 625bd4e711..392fb495b4 100644 --- a/src/main/cpp/src/decimal_utils.cu +++ b/src/main/cpp/src/decimal_utils.cu @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include @@ -32,15 +32,17 @@ struct chunked256 { inline chunked256() = default; // sign-extend a 128-bit value into a chunked 256-bit value - inline __device__ explicit chunked256(__int128_t x) { - chunks[0] = static_cast(x); + inline __device__ explicit chunked256(__int128_t x) + { + chunks[0] = static_cast(x); __int128_t x_shifted = x >> 64; - chunks[1] = static_cast(x_shifted); - chunks[2] = static_cast(x_shifted >> 64); - chunks[3] = chunks[2]; + chunks[1] = static_cast(x_shifted); + chunks[2] = static_cast(x_shifted >> 64); + chunks[3] = chunks[2]; } - inline __device__ explicit chunked256(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + inline __device__ explicit chunked256(uint64_t a, uint64_t b, uint64_t c, uint64_t d) + { chunks[0] = d; chunks[1] = c; chunks[2] = b; @@ -48,14 +50,13 @@ struct chunked256 { } inline __device__ uint64_t operator[](int i) const { return chunks[i]; } - inline __device__ uint64_t &operator[](int i) { return chunks[i]; } + inline __device__ uint64_t& operator[](int i) { return chunks[i]; } inline __device__ int64_t sign() const { return static_cast(chunks[3]) >> 63; } - inline __device__ void add(int a) { - add(chunked256(static_cast<__int128_t>(a))); - } + inline __device__ void add(int a) { add(chunked256(static_cast<__int128_t>(a))); } - inline __device__ void add(chunked256 const &a) { + inline __device__ void add(chunked256 const& a) + { __uint128_t carry_and_sum = 0; for (int i = 0; i < 4; ++i) { carry_and_sum += static_cast<__uint128_t>(chunks[i]) + a.chunks[i]; @@ -64,14 +65,16 @@ struct chunked256 { } } - inline __device__ void negate() { + inline __device__ void negate() + { for (int i = 0; i < 4; i++) { chunks[i] = ~chunks[i]; } add(1); } - inline __device__ bool lt_unsigned(chunked256 const &other) const { + inline __device__ bool lt_unsigned(chunked256 const& other) const + { for (int i = 3; i >= 0; i--) { if (chunks[i] < other.chunks[i]) { return true; @@ -82,11 +85,10 @@ struct chunked256 { return false; } - inline __device__ bool gte_unsigned(chunked256 const &other) const { - return !lt_unsigned(other); - } + inline __device__ bool gte_unsigned(chunked256 const& other) const { return !lt_unsigned(other); } - inline __device__ int leading_zeros() const { + inline __device__ int leading_zeros() const + { if (sign() < 0) { chunked256 tmp = *this; tmp.negate(); @@ -104,15 +106,14 @@ struct chunked256 { } } - inline __device__ __int128_t as_128_bits() const { + inline __device__ __int128_t as_128_bits() const + { return (static_cast<__int128_t>(chunks[1]) << 64) | chunks[0]; } - inline __device__ uint64_t as_64_bits() const { - return chunks[0]; - } + inline __device__ uint64_t as_64_bits() const { return chunks[0]; } -private: + private: uint64_t chunks[4]; }; @@ -122,49 +123,52 @@ struct divmod256 { }; // Perform a 256-bit multiply in 64-bit chunks -__device__ chunked256 multiply(chunked256 const &a, chunked256 const &b) { +__device__ chunked256 multiply(chunked256 const& a, chunked256 const& b) +{ chunked256 r; __uint128_t mul; 
uint64_t carry = 0; for (int a_idx = 0; a_idx < 4; ++a_idx) { - mul = static_cast<__uint128_t>(a[a_idx]) * b[0] + carry; + mul = static_cast<__uint128_t>(a[a_idx]) * b[0] + carry; r[a_idx] = static_cast(mul); - carry = static_cast(mul >> 64); + carry = static_cast(mul >> 64); } for (int b_idx = 1; b_idx < 4; ++b_idx) { carry = 0; for (int a_idx = 0; a_idx < 4 - b_idx; ++a_idx) { int r_idx = a_idx + b_idx; - mul = static_cast<__uint128_t>(a[a_idx]) * b[b_idx] + r[r_idx] + carry; - r[r_idx] = static_cast(mul); - carry = static_cast(mul >> 64); + mul = static_cast<__uint128_t>(a[a_idx]) * b[b_idx] + r[r_idx] + carry; + r[r_idx] = static_cast(mul); + carry = static_cast(mul >> 64); } } return r; } -__device__ divmod256 divide_unsigned(chunked256 const &n, __int128_t const &d) { +__device__ divmod256 divide_unsigned(chunked256 const& n, __int128_t const& d) +{ // TODO: FIXME this is long division, and so it is likely very slow... chunked256 q(0); __uint128_t r = 0; for (int i = 255; i >= 0; i--) { int block = i / 64; - int bit = i % 64; - int read = (int)((n[block] >> bit) & 0x01); - r = r << 1; - r = r | read; + int bit = i % 64; + int read = (int)((n[block] >> bit) & 0x01); + r = r << 1; + r = r | read; if (r >= d) { - r = r - d; + r = r - d; int64_t bit_set = 1L << bit; - q[block] = q[block] | bit_set; + q[block] = q[block] | bit_set; } } return divmod256{q, static_cast<__int128_t>(r)}; } -__device__ divmod256 divide(chunked256 const &n, __int128_t const &d) { +__device__ divmod256 divide(chunked256 const& n, __int128_t const& d) +{ // We assume that d is not 0. This is because we do the zero check, // if needed before calling divide so we can set an overflow properly. bool const is_n_neg = n.sign() < 0; @@ -173,26 +177,23 @@ __device__ divmod256 divide(chunked256 const &n, __int128_t const &d) { // beause we are dealing with decimal numbers that should not go to // the maximum value that can be held by d or n chunked256 abs_n = n; - if (is_n_neg) { - abs_n.negate(); - } + if (is_n_neg) { abs_n.negate(); } __int128_t abs_d = is_d_neg ? -d : d; divmod256 result = divide_unsigned(abs_n, abs_d); - if (is_d_neg != is_n_neg) { - result.quotient.negate(); - } + if (is_d_neg != is_n_neg) { result.quotient.negate(); } - if (is_n_neg) { - result.remainder = -result.remainder; - } + if (is_n_neg) { result.remainder = -result.remainder; } return result; } -__device__ chunked256 round_from_remainder(chunked256 const &q, __int128_t const &r, - chunked256 const & n, __int128_t const &d) { +__device__ chunked256 round_from_remainder(chunked256 const& q, + __int128_t const& r, + chunked256 const& n, + __int128_t const& d) +{ // We are going to round if the abs value of the remainder is >= half of the divisor // but if we divide the divisor in half, we can lose data so instead we are going to // multiply the remainder by 2 @@ -204,19 +205,20 @@ __device__ chunked256 round_from_remainder(chunked256 const &q, __int128_t const // is in a range that would have us round because the divisor has to fit within // an __int128_t. - bool const need_inc = ((double_remainder >> 1) != r) || // if we lost info or - (double_remainder < 0 ? -double_remainder : double_remainder) >= // abs remainder is >= - (d < 0 ? -d : d); // abs divisor + bool const need_inc = + ((double_remainder >> 1) != r) || // if we lost info or + (double_remainder < 0 ? -double_remainder : double_remainder) >= // abs remainder is >= + (d < 0 ? 
-d : d); // abs divisor // To know which way to round, more specifically when the quotient is 0 // we need to know what the sign of the quotient would have been. In this // case that happens if only one of the inputs was negative (xor) - bool const is_n_neg = n.sign() < 0; - bool const is_d_neg = d < 0; + bool const is_n_neg = n.sign() < 0; + bool const is_d_neg = d < 0; bool const round_down = is_n_neg != is_d_neg; int const round_inc = (need_inc ? (round_down ? -1 : 1) : 0); - chunked256 ret = q; + chunked256 ret = q; ret.add(round_inc); return ret; } @@ -224,7 +226,8 @@ __device__ chunked256 round_from_remainder(chunked256 const &q, __int128_t const /** * Divide n by d and do half up rounding based off of the remainder returned. */ -__device__ chunked256 divide_and_round(chunked256 const &n, __int128_t const &d) { +__device__ chunked256 divide_and_round(chunked256 const& n, __int128_t const& d) +{ divmod256 div_result = divide(n, d); return round_from_remainder(div_result.quotient, div_result.remainder, n, d); @@ -234,13 +237,15 @@ __device__ chunked256 divide_and_round(chunked256 const &n, __int128_t const &d) * Divide n by d and return the quotient. This is essentially what `DOWN` rounding does * in Java */ -__device__ chunked256 integer_divide(chunked256 const &n, __int128_t const &d) { +__device__ chunked256 integer_divide(chunked256 const& n, __int128_t const& d) +{ divmod256 div_result = divide(n, d); - //drop the remainder and only return the quotient + // drop the remainder and only return the quotient return div_result.quotient; } -inline __device__ chunked256 pow_ten(int exp) { +inline __device__ chunked256 pow_ten(int exp) +{ // Note that the body of this was generated using the following scala script /* import java.math.BigInteger @@ -265,287 +270,287 @@ inline __device__ chunked256 pow_ten(int exp) { printAsInt128s(ret) System.out.println(");") } - */ - switch(exp) { + */ + switch (exp) { case 0: - //1 + // 1 return chunked256(0x0, 0x0, 0x0, 0x1); case 1: - //10 + // 10 return chunked256(0x0, 0x0, 0x0, 0xa); case 2: - //100 + // 100 return chunked256(0x0, 0x0, 0x0, 0x64); case 3: - //1000 + // 1000 return chunked256(0x0, 0x0, 0x0, 0x3e8); case 4: - //10000 + // 10000 return chunked256(0x0, 0x0, 0x0, 0x2710); case 5: - //100000 + // 100000 return chunked256(0x0, 0x0, 0x0, 0x186a0); case 6: - //1000000 + // 1000000 return chunked256(0x0, 0x0, 0x0, 0xf4240); case 7: - //10000000 + // 10000000 return chunked256(0x0, 0x0, 0x0, 0x989680); case 8: - //100000000 + // 100000000 return chunked256(0x0, 0x0, 0x0, 0x5f5e100); case 9: - //1000000000 + // 1000000000 return chunked256(0x0, 0x0, 0x0, 0x3b9aca00); case 10: - //10000000000 + // 10000000000 return chunked256(0x0, 0x0, 0x0, 0x2540be400); case 11: - //100000000000 + // 100000000000 return chunked256(0x0, 0x0, 0x0, 0x174876e800); case 12: - //1000000000000 + // 1000000000000 return chunked256(0x0, 0x0, 0x0, 0xe8d4a51000); case 13: - //10000000000000 + // 10000000000000 return chunked256(0x0, 0x0, 0x0, 0x9184e72a000); case 14: - //100000000000000 + // 100000000000000 return chunked256(0x0, 0x0, 0x0, 0x5af3107a4000); case 15: - //1000000000000000 + // 1000000000000000 return chunked256(0x0, 0x0, 0x0, 0x38d7ea4c68000); case 16: - //10000000000000000 + // 10000000000000000 return chunked256(0x0, 0x0, 0x0, 0x2386f26fc10000); case 17: - //100000000000000000 + // 100000000000000000 return chunked256(0x0, 0x0, 0x0, 0x16345785d8a0000); case 18: - //1000000000000000000 + // 1000000000000000000 return chunked256(0x0, 0x0, 0x0, 
0xde0b6b3a7640000); case 19: - //10000000000000000000 + // 10000000000000000000 return chunked256(0x0, 0x0, 0x0, 0x8ac7230489e80000); case 20: - //100000000000000000000 + // 100000000000000000000 return chunked256(0x0, 0x0, 0x5, 0x6bc75e2d63100000); case 21: - //1000000000000000000000 + // 1000000000000000000000 return chunked256(0x0, 0x0, 0x36, 0x35c9adc5dea00000); case 22: - //10000000000000000000000 + // 10000000000000000000000 return chunked256(0x0, 0x0, 0x21e, 0x19e0c9bab2400000); case 23: - //100000000000000000000000 + // 100000000000000000000000 return chunked256(0x0, 0x0, 0x152d, 0x2c7e14af6800000); case 24: - //1000000000000000000000000 + // 1000000000000000000000000 return chunked256(0x0, 0x0, 0xd3c2, 0x1bcecceda1000000); case 25: - //10000000000000000000000000 + // 10000000000000000000000000 return chunked256(0x0, 0x0, 0x84595, 0x161401484a000000); case 26: - //100000000000000000000000000 + // 100000000000000000000000000 return chunked256(0x0, 0x0, 0x52b7d2, 0xdcc80cd2e4000000); case 27: - //1000000000000000000000000000 + // 1000000000000000000000000000 return chunked256(0x0, 0x0, 0x33b2e3c, 0x9fd0803ce8000000); case 28: - //10000000000000000000000000000 + // 10000000000000000000000000000 return chunked256(0x0, 0x0, 0x204fce5e, 0x3e25026110000000); case 29: - //100000000000000000000000000000 + // 100000000000000000000000000000 return chunked256(0x0, 0x0, 0x1431e0fae, 0x6d7217caa0000000); case 30: - //1000000000000000000000000000000 + // 1000000000000000000000000000000 return chunked256(0x0, 0x0, 0xc9f2c9cd0, 0x4674edea40000000); case 31: - //10000000000000000000000000000000 + // 10000000000000000000000000000000 return chunked256(0x0, 0x0, 0x7e37be2022, 0xc0914b2680000000); case 32: - //100000000000000000000000000000000 + // 100000000000000000000000000000000 return chunked256(0x0, 0x0, 0x4ee2d6d415b, 0x85acef8100000000); case 33: - //1000000000000000000000000000000000 + // 1000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x314dc6448d93, 0x38c15b0a00000000); case 34: - //10000000000000000000000000000000000 + // 10000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x1ed09bead87c0, 0x378d8e6400000000); case 35: - //100000000000000000000000000000000000 + // 100000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x13426172c74d82, 0x2b878fe800000000); case 36: - //1000000000000000000000000000000000000 + // 1000000000000000000000000000000000000 return chunked256(0x0, 0x0, 0xc097ce7bc90715, 0xb34b9f1000000000); case 37: - //10000000000000000000000000000000000000 + // 10000000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x785ee10d5da46d9, 0xf436a000000000); case 38: - //100000000000000000000000000000000000000 + // 100000000000000000000000000000000000000 return chunked256(0x0, 0x0, 0x4b3b4ca85a86c47a, 0x98a224000000000); case 39: - //1000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000 return chunked256(0x0, 0x2, 0xf050fe938943acc4, 0x5f65568000000000); case 40: - //10000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000 return chunked256(0x0, 0x1d, 0x6329f1c35ca4bfab, 0xb9f5610000000000); case 41: - //100000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000 return chunked256(0x0, 0x125, 0xdfa371a19e6f7cb5, 0x4395ca0000000000); case 42: - //1000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000 return chunked256(0x0, 0xb7a, 0xbc627050305adf14, 0xa3d9e40000000000); case 43: - 
//10000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000 return chunked256(0x0, 0x72cb, 0x5bd86321e38cb6ce, 0x6682e80000000000); case 44: - //100000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000 return chunked256(0x0, 0x47bf1, 0x9673df52e37f2410, 0x11d100000000000); case 45: - //1000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000 return chunked256(0x0, 0x2cd76f, 0xe086b93ce2f768a0, 0xb22a00000000000); case 46: - //10000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000 return chunked256(0x0, 0x1c06a5e, 0xc5433c60ddaa1640, 0x6f5a400000000000); case 47: - //100000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000 return chunked256(0x0, 0x118427b3, 0xb4a05bc8a8a4de84, 0x5986800000000000); case 48: - //1000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000 return chunked256(0x0, 0xaf298d05, 0xe4395d69670b12b, 0x7f41000000000000); case 49: - //10000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x6d79f8232, 0x8ea3da61e066ebb2, 0xf88a000000000000); case 50: - //100000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x446c3b15f9, 0x926687d2c40534fd, 0xb564000000000000); case 51: - //1000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x2ac3a4edbbf, 0xb8014e3ba83411e9, 0x15e8000000000000); case 52: - //10000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x1aba4714957d, 0x300d0e549208b31a, 0xdb10000000000000); case 53: - //100000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x10b46c6cdd6e3, 0xe0828f4db456ff0c, 0x8ea0000000000000); case 54: - //1000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0xa70c3c40a64e6, 0xc51999090b65f67d, 0x9240000000000000); case 55: - //10000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x6867a5a867f103, 0xb2fffa5a71fba0e7, 0xb680000000000000); case 56: - //100000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x4140c78940f6a24, 0xfdffc78873d4490d, 0x2100000000000000); case 57: - //1000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000 return chunked256(0x0, 0x28c87cb5c89a2571, 0xebfdcb54864ada83, 0x4a00000000000000); case 58: - //10000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000 return chunked256(0x1, 0x97d4df19d6057673, 0x37e9f14d3eec8920, 0xe400000000000000); case 59: - //100000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000 return chunked256(0xf, 0xee50b7025c36a080, 0x2f236d04753d5b48, 0xe800000000000000); case 60: - //1000000000000000000000000000000000000000000000000000000000000 + // 
1000000000000000000000000000000000000000000000000000000000000 return chunked256(0x9f, 0x4f2726179a224501, 0xd762422c946590d9, 0x1000000000000000); case 61: - //10000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000 return chunked256(0x639, 0x17877cec0556b212, 0x69d695bdcbf7a87a, 0xa000000000000000); case 62: - //100000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000 return chunked256(0x3e3a, 0xeb4ae1383562f4b8, 0x2261d969f7ac94ca, 0x4000000000000000); case 63: - //1000000000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x26e4d, 0x30eccc3215dd8f31, 0x57d27e23acbdcfe6, 0x8000000000000000); case 64: - //10000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x184f03, 0xe93ff9f4daa797ed, 0x6e38ed64bf6a1f01, 0x0); case 65: - //100000000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000000 return chunked256(0xf31627, 0x1c7fc3908a8bef46, 0x4e3945ef7a25360a, 0x0); case 66: - //1000000000000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x97edd87, 0x1cfda3a5697758bf, 0xe3cbb5ac5741c64, 0x0); case 67: - //10000000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x5ef4a747, 0x21e864761ea97776, 0x8e5f518bb6891be8, 0x0); case 68: - //100000000000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x3b58e88c7, 0x5313ec9d329eaaa1, 0x8fb92f75215b1710, 0x0); case 69: - //1000000000000000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x25179157c9, 0x3ec73e23fa32aa4f, 0x9d3bda934d8ee6a0, 0x0); case 70: - //10000000000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x172ebad6ddc, 0x73c86d67c5faa71c, 0x245689c107950240, 0x0); case 71: - //100000000000000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0xe7d34c64a9c, 0x85d4460dbbca8719, 0x6b61618a4bd21680, 0x0); case 72: - //1000000000000000000000000000000000000000000000000000000000000000000000000 + // 1000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x90e40fbeea1d, 0x3a4abc8955e946fe, 0x31cdcf66f634e100, 0x0); case 73: - //10000000000000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x5a8e89d752524, 0x46eb5d5d5b1cc5ed, 0xf20a1a059e10ca00, 0x0); case 74: - //100000000000000000000000000000000000000000000000000000000000000000000000000 + // 100000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x3899162693736a, 0xc531a5a58f1fbb4b, 0x746504382ca7e400, 0x0); case 75: - //1000000000000000000000000000000000000000000000000000000000000000000000000000 + // 
1000000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x235fadd81c2822b, 0xb3f07877973d50f2, 0x8bf22a31be8ee800, 0x0); case 76: - //10000000000000000000000000000000000000000000000000000000000000000000000000000 + // 10000000000000000000000000000000000000000000000000000000000000000000000000000 return chunked256(0x161bcca7119915b5, 0x764b4abe8652979, 0x7775a5f171951000, 0x0); default: // This is not a supported value... assert(0); - } + } } // check that the divide is going to do the right thing -void check_scale_divisor(int source_scale, int target_scale) { +void check_scale_divisor(int source_scale, int target_scale) +{ int exponent = target_scale - source_scale; CUDF_EXPECTS(exponent <= cuda::std::numeric_limits<__int128_t>::digits10, "divisor too big"); } -inline __device__ int precision10(chunked256 value) { - if (value.sign() < 0) { - // we want to do this on positive numbers - value.negate(); - } - // TODO this is a horrible way to do this. We should at least - // be able to approximate the log10 using the leading zeros similar to - // http://graphics.stanford.edu/~seander/bithacks.html and then start - // the search around the guess. - for (int i = 0; i <= 76; i++) { - chunked256 tmp = pow_ten(i); - if (tmp.gte_unsigned(value)) { - return i; - } - } - return -1; +inline __device__ int precision10(chunked256 value) +{ + if (value.sign() < 0) { + // we want to do this on positive numbers + value.negate(); + } + // TODO this is a horrible way to do this. We should at least + // be able to approximate the log10 using the leading zeros similar to + // http://graphics.stanford.edu/~seander/bithacks.html and then start + // the search around the guess. + for (int i = 0; i <= 76; i++) { + chunked256 tmp = pow_ten(i); + if (tmp.gte_unsigned(value)) { return i; } + } + return -1; } -__device__ bool is_greater_than_decimal_38(chunked256 a) { +__device__ bool is_greater_than_decimal_38(chunked256 a) +{ auto const max_number_for_precision = pow_ten(38); - if (a.sign() != 0) { - a.negate(); - } + if (a.sign() != 0) { a.negate(); } return a.gte_unsigned(max_number_for_precision); } -__device__ chunked256 set_scale_and_round(chunked256 data, int old_scale, int new_scale) { +__device__ chunked256 set_scale_and_round(chunked256 data, int old_scale, int new_scale) +{ if (old_scale != new_scale) { if (new_scale < old_scale) { - int const raise = old_scale - new_scale; + int const raise = old_scale - new_scale; int const multiplier = pow_ten(raise).as_128_bits(); - data = multiply(data, chunked256(multiplier)); + data = multiply(data, chunked256(multiplier)); } else { - int const drop = new_scale - old_scale; + int const drop = new_scale - old_scale; int const divisor = pow_ten(drop).as_128_bits(); - data = divide_and_round(data, divisor); + data = divide_and_round(data, divisor); } } return data; @@ -553,102 +558,118 @@ __device__ chunked256 set_scale_and_round(chunked256 data, int old_scale, int ne // Functor to add two DECIMAL128 columns with rounding and overflow detection. 
struct dec128_add_sub { - dec128_add_sub(bool *overflows, cudf::mutable_column_view const &result_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), - result_data(result_view.data<__int128_t>()), - a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - result_scale(result_view.type().scale()) {} - - __device__ void add(chunked256 &a, chunked256 &b) const { - do_add_sub(a, b, false); + dec128_add_sub(bool* overflows, + cudf::mutable_column_view const& result_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : overflows(overflows), + a_data(a_col.data<__int128_t>()), + b_data(b_col.data<__int128_t>()), + result_data(result_view.data<__int128_t>()), + a_scale(a_col.type().scale()), + b_scale(b_col.type().scale()), + result_scale(result_view.type().scale()) + { } - __device__ void sub(chunked256 &a, chunked256 &b) const { - do_add_sub(a, b, true); - } + __device__ void add(chunked256& a, chunked256& b) const { do_add_sub(a, b, false); } -private: + __device__ void sub(chunked256& a, chunked256& b) const { do_add_sub(a, b, true); } - __device__ void do_add_sub(chunked256 &a, chunked256 &b, bool sub) const { - int intermediate_scale = min(a_scale, b_scale); - if (a_scale != intermediate_scale) { - a = set_scale_and_round(a, a_scale, intermediate_scale); - } - if (b_scale != intermediate_scale) { - b = set_scale_and_round(b, b_scale, intermediate_scale); - } - if (sub) { - // Get 2's complement - b.negate(); - } - a.add(b); + private: + __device__ void do_add_sub(chunked256& a, chunked256& b, bool sub) const + { + int intermediate_scale = min(a_scale, b_scale); + if (a_scale != intermediate_scale) { a = set_scale_and_round(a, a_scale, intermediate_scale); } + if (b_scale != intermediate_scale) { b = set_scale_and_round(b, b_scale, intermediate_scale); } + if (sub) { + // Get 2's complement + b.negate(); + } + a.add(b); - if (result_scale != intermediate_scale) { - a = set_scale_and_round(a, intermediate_scale, result_scale); - } + if (result_scale != intermediate_scale) { + a = set_scale_and_round(a, intermediate_scale, result_scale); + } } -protected: - + protected: // output column for overflow detected - bool * const overflows; + bool* const overflows; // input data - __int128_t const * const a_data; - __int128_t const * const b_data; - __int128_t * const result_data; + __int128_t const* const a_data; + __int128_t const* const b_data; + __int128_t* const result_data; int const a_scale; int const b_scale; int const result_scale; }; // Functor to add two DECIMAL128 columns with rounding and overflow detection. 
-struct dec128_add: public dec128_add_sub { - dec128_add(bool *overflows, cudf::mutable_column_view const &sum_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : dec128_add_sub(overflows, sum_view, a_col, b_col) {} +struct dec128_add : public dec128_add_sub { + dec128_add(bool* overflows, + cudf::mutable_column_view const& sum_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : dec128_add_sub(overflows, sum_view, a_col, b_col) + { + } - __device__ void operator()(cudf::size_type const i) const { + __device__ void operator()(cudf::size_type const i) const + { chunked256 a(a_data[i]); chunked256 b(b_data[i]); - chunked256 &sum = a; + chunked256& sum = a; add(a, b); - overflows[i] = is_greater_than_decimal_38(sum); + overflows[i] = is_greater_than_decimal_38(sum); result_data[i] = sum.as_128_bits(); } }; // Functor to sub two DECIMAL128 columns with rounding and overflow detection. -struct dec128_sub: public dec128_add_sub { - dec128_sub(bool *overflows, cudf::mutable_column_view const &sub_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : dec128_add_sub(overflows, sub_view, a_col, b_col) {} +struct dec128_sub : public dec128_add_sub { + dec128_sub(bool* overflows, + cudf::mutable_column_view const& sub_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : dec128_add_sub(overflows, sub_view, a_col, b_col) + { + } - __device__ void operator()(cudf::size_type const i) const { + __device__ void operator()(cudf::size_type const i) const + { chunked256 a(a_data[i]); chunked256 b(b_data[i]); - chunked256 &res = a; + chunked256& res = a; sub(a, b); - overflows[i] = is_greater_than_decimal_38(res); + overflows[i] = is_greater_than_decimal_38(res); result_data[i] = res.as_128_bits(); } }; // Functor to multiply two DECIMAL128 columns with rounding and overflow detection. struct dec128_multiplier { - dec128_multiplier(bool *overflows, cudf::mutable_column_view const &product_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), - product_data(product_view.data<__int128_t>()), - a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - prod_scale(product_view.type().scale()) {} - - __device__ void operator()(cudf::size_type const i) const { + dec128_multiplier(bool* overflows, + cudf::mutable_column_view const& product_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : overflows(overflows), + a_data(a_col.data<__int128_t>()), + b_data(b_col.data<__int128_t>()), + product_data(product_view.data<__int128_t>()), + a_scale(a_col.type().scale()), + b_scale(b_col.type().scale()), + prod_scale(product_view.type().scale()) + { + } + + __device__ void operator()(cudf::size_type const i) const + { chunked256 const a(a_data[i]); chunked256 const b(b_data[i]); @@ -659,13 +680,13 @@ struct dec128_multiplier { // But to match Spark we need to first round the result to a precision of 38 // and this is specific to the value in the result of the multiply. // Then we need to round the result to the final scale that we care about. 
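    // Worked example (illustration only): two 38-digit operands can produce a product with
    // up to 76 significant digits. If precision10(product) is, say, 40, then
    // first_div_precision is 40 - 38 = 2, so the product is first divided by 10^2 with
    // half-up rounding and the working scale becomes a_scale + b_scale + 2, before the
    // final rescale to prod_scale below.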
- int dec_precision = precision10(product); + int dec_precision = precision10(product); int first_div_precision = dec_precision - 38; int mult_scale = a_scale + b_scale; if (first_div_precision > 0) { auto const first_div_scale_divisor = pow_ten(first_div_precision).as_128_bits(); - product = divide_and_round(product, first_div_scale_divisor); + product = divide_and_round(product, first_div_scale_divisor); // a_scale and b_scale are negative. first_div_precision is not mult_scale = a_scale + b_scale + first_div_precision; @@ -680,37 +701,33 @@ struct dec128_multiplier { overflows[i] = true; return; } else { - auto const scale_mult = pow_ten( -exponent).as_128_bits(); - product = multiply(product, chunked256(scale_mult)); + auto const scale_mult = pow_ten(-exponent).as_128_bits(); + product = multiply(product, chunked256(scale_mult)); } } else { auto const scale_divisor = pow_ten(exponent).as_128_bits(); // scale and round to target scale - if (scale_divisor != 1) { - product = divide_and_round(product, scale_divisor); - } + if (scale_divisor != 1) { product = divide_and_round(product, scale_divisor); } } - overflows[i] = is_greater_than_decimal_38(product); + overflows[i] = is_greater_than_decimal_38(product); product_data[i] = product.as_128_bits(); } -private: - + private: // output column for overflow detected - bool * const overflows; + bool* const overflows; // input data for multiply - __int128_t const * const a_data; - __int128_t const * const b_data; - __int128_t * const product_data; + __int128_t const* const a_data; + __int128_t const* const b_data; + __int128_t* const product_data; int const a_scale; int const b_scale; int const prod_scale; }; - /** * Functor to divide two DECIMAL128 columns with rounding and overflow detection. * This functor should be used for a 128-bit regular division or a 64-bit integer division only @@ -720,21 +737,29 @@ private: template struct dec128_divider { static_assert((sizeof(T) == sizeof(uint64_t) && is_int_div) || - (sizeof(T) == sizeof(__int128_t) && !is_int_div)); - dec128_divider(bool *overflows, cudf::mutable_column_view const "ient_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), - quotient_data(quotient_view.data()), - a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - quot_scale(quotient_view.type().scale()) {} - - __device__ void operator()(cudf::size_type const i) const { + (sizeof(T) == sizeof(__int128_t) && !is_int_div)); + dec128_divider(bool* overflows, + cudf::mutable_column_view const& quotient_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : overflows(overflows), + a_data(a_col.data<__int128_t>()), + b_data(b_col.data<__int128_t>()), + quotient_data(quotient_view.data()), + a_scale(a_col.type().scale()), + b_scale(b_col.type().scale()), + quot_scale(quotient_view.type().scale()) + { + } + + __device__ void operator()(cudf::size_type const i) const + { chunked256 n(a_data[i]); __int128_t const d(b_data[i]); // Divide by zero, not sure if we care or not, but... if (d == 0) { - overflows[i] = true; + overflows[i] = true; quotient_data[i] = 0; return; } @@ -755,10 +780,10 @@ struct dec128_divider { // The second divide gets the result into the scale that we care about and does the rounding. 
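      // Note: the two instantiations of this functor diverge here. The regular decimal
      // divide (128-bit T, is_int_div = false) uses divide_and_round, i.e. half-up rounding,
      // and stores a 128-bit quotient, while the integer divide (64-bit T, is_int_div = true)
      // uses integer_divide, which simply truncates, and stores the low 64 bits of the
      // quotient.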
chunked256 result; if constexpr (is_int_div) { - result = integer_divide(first_div_result.quotient, scale_divisor); + result = integer_divide(first_div_result.quotient, scale_divisor); quotient_data[i] = result.as_64_bits(); } else { - result = divide_and_round(first_div_result.quotient, scale_divisor); + result = divide_and_round(first_div_result.quotient, scale_divisor); quotient_data[i] = result.as_128_bits(); } overflows[i] = is_greater_than_decimal_38(result); @@ -774,10 +799,10 @@ struct dec128_divider { auto const first_div_result = divide(n, d); chunked256 const first_div_r(first_div_result.remainder); - //now we have to multiply each of these by how much is left + // now we have to multiply each of these by how much is left int const remaining_exp = (-n_shift_exp) - 38; - auto const scale_mult = pow_ten(remaining_exp); - auto result = multiply(first_div_result.quotient, scale_mult); + auto const scale_mult = pow_ten(remaining_exp); + auto result = multiply(first_div_result.quotient, scale_mult); auto const scaled_div_r = multiply(first_div_r, scale_mult); // Now do a second divide on what is left @@ -785,7 +810,7 @@ struct dec128_divider { result.add(second_div_result.quotient); if constexpr (is_int_div) { - overflows[i] = is_greater_than_decimal_38(result); + overflows[i] = is_greater_than_decimal_38(result); quotient_data[i] = result.as_64_bits(); } else { // and finally round @@ -795,60 +820,65 @@ struct dec128_divider { overflows[i] = is_greater_than_decimal_38(result); } else { // Regular multiply followed by a divide - if (n_shift_exp < 0) { - n = multiply(n, pow_ten(-n_shift_exp)); - } + if (n_shift_exp < 0) { n = multiply(n, pow_ten(-n_shift_exp)); } chunked256 result; if constexpr (is_int_div) { - result = integer_divide(n, d); + result = integer_divide(n, d); quotient_data[i] = result.as_64_bits(); } else { - result = divide_and_round(n, d); + result = divide_and_round(n, d); quotient_data[i] = result.as_128_bits(); } overflows[i] = is_greater_than_decimal_38(result); } } -private: - + private: // output column for overflow detected - bool * const overflows; + bool* const overflows; // input data for multiply - __int128_t const * const a_data; - __int128_t const * const b_data; - T * const quotient_data; + __int128_t const* const a_data; + __int128_t const* const b_data; + T* const quotient_data; int const a_scale; int const b_scale; int const quot_scale; }; struct dec128_remainder { - dec128_remainder(bool *overflows, cudf::mutable_column_view const &remainder_view, - cudf::column_view const &a_col, cudf::column_view const &b_col) - : overflows(overflows), a_data(a_col.data<__int128_t>()), b_data(b_col.data<__int128_t>()), - remainder_data(remainder_view.data<__int128_t>()), - a_scale(a_col.type().scale()), b_scale(b_col.type().scale()), - rem_scale(remainder_view.type().scale()) {} - - __device__ void operator()(cudf::size_type const i) const { + dec128_remainder(bool* overflows, + cudf::mutable_column_view const& remainder_view, + cudf::column_view const& a_col, + cudf::column_view const& b_col) + : overflows(overflows), + a_data(a_col.data<__int128_t>()), + b_data(b_col.data<__int128_t>()), + remainder_data(remainder_view.data<__int128_t>()), + a_scale(a_col.type().scale()), + b_scale(b_col.type().scale()), + rem_scale(remainder_view.type().scale()) + { + } + + __device__ void operator()(cudf::size_type const i) const + { chunked256 n(a_data[i]); __int128_t const d(b_data[i]); // Divide by zero, not sure if we care or not, but... 
if (d == 0) { - overflows[i] = true; + overflows[i] = true; remainder_data[i] = 0; return; } // This implementation of remainder uses the JAVA definition of remainder - // that Spark relies on. It's *not* the most efficient way of calculating + // that Spark relies on. It's *not* the most efficient way of calculating // remainder, but we use this to be consistent with CPU Spark. // The algorithm is: - // a % b = a - (a // b) * b - // Basically we substract the integral_divide result times the divisor from + // a % b = a - (a // b) * b + // Basically we substract the integral_divide result times the divisor from // the dividend bool const is_n_neg = n.sign() < 0; @@ -862,33 +892,31 @@ struct dec128_remainder { // Then, we have to shift the dividend to compute integer divide // We use the formula from dec128_divider // Start with: quot_scale - (a_scale - b_scale) - // Then substitute 0 for quot_scale (integer divide), and rem_scale for b_scale + // Then substitute 0 for quot_scale (integer divide), and rem_scale for b_scale // (since we updated the divisor scale) // 0 - (a_scale - rem_scale) // rem_scale - a_scale - int n_shift_exp = rem_scale - a_scale; + int n_shift_exp = rem_scale - a_scale; __int128_t abs_d = is_d_neg ? -d : d; // Unlike in divide, where we can scale the dividend to get the right result - // remainder relies on the scale on the divisor, so we might have to shift the + // remainder relies on the scale on the divisor, so we might have to shift the // divisor itself. if (d_shift_exp > 0) { - // We need to shift the the scale of the divisor to rem_scale, but - // we actual need to round because of how precision is to be handled, + // We need to shift the the scale of the divisor to rem_scale, but + // we actual need to round because of how precision is to be handled, // since the new scale is smaller than the old scale auto const scale_divisor = pow_ten(d_shift_exp).as_128_bits(); - abs_d = divide_and_round(chunked256(abs_d), scale_divisor).as_128_bits(); + abs_d = divide_and_round(chunked256(abs_d), scale_divisor).as_128_bits(); } else { // otherwise we are multiplying the bottom by a power of 10, which divides the numerator - // by the same power of ten, so we accomodate that in our original n-shift like + // by the same power of ten, so we accomodate that in our original n-shift like // divide did before n_shift_exp -= d_shift_exp; } // For remainder, we should do the computation using positive numbers only, and then // switch the sign based on [n] *only*. chunked256 abs_n = n; - if (is_n_neg) { - abs_n.negate(); - } + if (is_n_neg) { abs_n.negate(); } chunked256 int_div_result; if (n_shift_exp > 0) { divmod256 const first_div_result = divide(abs_n, abs_d); @@ -899,178 +927,241 @@ struct dec128_remainder { // The second divide gets the result into the scale that we care about and does the rounding. 
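      // Worked example (illustration only): integer_divide truncates toward zero, which is
      // what the a % b = a - (a // b) * b identity above requires when applied to absolute
      // values. E.g. for n = -7 and d = 3 at scale 0: 7 // 3 = 2, 7 - 2 * 3 = 1, and the
      // dividend's sign is re-applied at the end, giving -1, matching Java's -7 % 3.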
int_div_result = integer_divide(first_div_result.quotient, scale_divisor); } else { - if (n_shift_exp < 0) { - abs_n = multiply(abs_n, pow_ten(-n_shift_exp)); - } + if (n_shift_exp < 0) { abs_n = multiply(abs_n, pow_ten(-n_shift_exp)); } int_div_result = integer_divide(abs_n, abs_d); } // Multiply the integer divide result by abs(divisor) chunked256 less_n = multiply(int_div_result, chunked256(abs_d)); if (d_shift_exp < 0) { - // scale less_n up to equal it to same scale since we were technically scaling up + // scale less_n up to equal it to same scale since we were technically scaling up // the divisor earlier (even though we only shifted n) less_n = multiply(less_n, pow_ten(-d_shift_exp)); } - // Subtract our integer divide result from n by adding the negated + // Subtract our integer divide result from n by adding the negated less_n.negate(); abs_n.add(less_n); // This should almost never overflow, but we check anyways overflows[i] = is_greater_than_decimal_38(abs_n); - result = abs_n.as_128_bits(); + result = abs_n.as_128_bits(); // Change the sign of the result based on n - if (is_n_neg) { - result = -result; - } + if (is_n_neg) { result = -result; } remainder_data[i] = result; } -private: + private: // output column for overflow detected - bool * const overflows; + bool* const overflows; // input data for multiply - __int128_t const * const a_data; - __int128_t const * const b_data; - __int128_t * const remainder_data; + __int128_t const* const a_data; + __int128_t const* const b_data; + __int128_t* const remainder_data; int const a_scale; int const b_scale; int const rem_scale; }; -} // anonymous namespace +} // anonymous namespace namespace cudf::jni { -std::unique_ptr -multiply_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t product_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr multiply_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t product_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, product_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, product_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto product_view = columns[1]->mutable_view(); + auto product_view = columns[1]->mutable_view(); check_scale_divisor(a.type().scale() + b.type().scale(), product_scale); - 
thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), dec128_multiplier(overflows_view.begin(), product_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -divide_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr divide_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, quotient_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, quotient_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto quotient_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - dec128_divider<__int128_t, false>(overflows_view.begin(), quotient_view, a, b)); + auto quotient_view = columns[1]->mutable_view(); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + dec128_divider<__int128_t, false>(overflows_view.begin(), quotient_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -integer_divide_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr integer_divide_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - 
rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT64}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT64}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto quotient_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - dec128_divider(overflows_view.begin(), quotient_view, a, b)); + auto quotient_view = columns[1]->mutable_view(); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + dec128_divider(overflows_view.begin(), quotient_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -remainder_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t remainder_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr remainder_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t remainder_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, remainder_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, remainder_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); auto remainder_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), dec128_remainder(overflows_view.begin(), remainder_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -add_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t target_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr add_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t target_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a 
DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, target_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, target_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto sum_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + auto sum_view = columns[1]->mutable_view(); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), dec128_add(overflows_view.begin(), sum_view, a, b)); return std::make_unique(std::move(columns)); } -std::unique_ptr -sub_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t target_scale, - rmm::cuda_stream_view stream) { +std::unique_ptr sub_decimal128(cudf::column_view const& a, + cudf::column_view const& b, + int32_t target_scale, + rmm::cuda_stream_view stream) +{ CUDF_EXPECTS(a.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); CUDF_EXPECTS(b.type().id() == cudf::type_id::DECIMAL128, "not a DECIMAL128 column"); auto const num_rows = a.size(); CUDF_EXPECTS(num_rows == b.size(), "inputs have mismatched row counts"); auto [result_null_mask, result_null_count] = cudf::detail::bitmask_and( - cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); + cudf::table_view{{a, b}}, stream, rmm::mr::get_current_device_resource()); std::vector> columns; // copy the null mask here, as it will be used again later - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows, - rmm::device_buffer(result_null_mask, stream), result_null_count, stream)); - columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, target_scale}, num_rows, std::move(result_null_mask), result_null_count, stream)); + columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + num_rows, + rmm::device_buffer(result_null_mask, stream), + result_null_count, + stream)); + columns.push_back( + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::DECIMAL128, target_scale}, + num_rows, + std::move(result_null_mask), + result_null_count, + stream)); auto overflows_view = columns[0]->mutable_view(); - auto sub_view = columns[1]->mutable_view(); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + auto sub_view = columns[1]->mutable_view(); + thrust::for_each(rmm::exec_policy(stream), + 
thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), dec128_sub(overflows_view.begin(), sub_view, a, b)); return std::make_unique(std::move(columns)); } -} // namespace cudf::jni +} // namespace cudf::jni diff --git a/src/main/cpp/src/decimal_utils.hpp b/src/main/cpp/src/decimal_utils.hpp index 1c7c30ed01..1011a0a574 100644 --- a/src/main/cpp/src/decimal_utils.hpp +++ b/src/main/cpp/src/decimal_utils.hpp @@ -22,27 +22,39 @@ namespace cudf::jni { -std::unique_ptr -multiply_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t product_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -divide_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -integer_divide_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -remainder_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t remainder_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -add_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - -std::unique_ptr -sub_decimal128(cudf::column_view const &a, cudf::column_view const &b, int32_t quotient_scale, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -} // namespace cudf::jni +std::unique_ptr multiply_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t product_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr divide_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr integer_divide_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr remainder_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t remainder_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr add_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +std::unique_ptr sub_decimal128( + cudf::column_view const& a, + cudf::column_view const& b, + int32_t quotient_scale, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); +} // namespace cudf::jni diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index d0367206ae..c5384e41b0 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -64,30 +64,32 @@ namespace { // 1. Append one comma character (',') to the end of each input string, except the last one. // 2. Concatenate all input strings into one string. // 3. Add a pair of bracket characters ('[' and ']') to the beginning and the end of the output. 
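A small host-side sketch (not part of this patch) of the three unification steps listed above, with null rows replaced by "{}" to match the join_strings call that follows:

  #include <optional>
  #include <string>
  #include <vector>

  // Builds one JSON array string from per-row JSON objects: null -> "{}",
  // rows joined with ',', and the whole result wrapped in '[' and ']'.
  std::string unify_json_strings_host(std::vector<std::optional<std::string>> const& rows)
  {
    std::string out = "[";
    for (std::size_t i = 0; i < rows.size(); ++i) {
      out += rows[i].value_or("{}");
      if (i + 1 < rows.size()) { out += ","; }
    }
    return out + "]";
  }
  // e.g. {R"({"a":1})", std::nullopt, R"({"b":2})"}  ->  R"([{"a":1},{},{"b":2}])"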
-rmm::device_uvector unify_json_strings(cudf::column_view const &input, - rmm::cuda_stream_view stream) { +rmm::device_uvector unify_json_strings(cudf::column_view const& input, + rmm::cuda_stream_view stream) +{ if (input.is_empty()) { - return cudf::detail::make_device_uvector_async(std::vector{'[', ']'}, stream, - rmm::mr::get_current_device_resource()); + return cudf::detail::make_device_uvector_async( + std::vector{'[', ']'}, stream, rmm::mr::get_current_device_resource()); } - auto const d_strings = cudf::column_device_view::create(input, stream); + auto const d_strings = cudf::column_device_view::create(input, stream); auto const chars_size = input.child(cudf::strings_column_view::chars_column_index).size(); auto const output_size = - 2l + // two extra bracket characters '[' and ']' - static_cast(chars_size) + - static_cast(input.size() - 1) + // append `,` character between input rows - static_cast(input.null_count()) * 2l; // replace null with "{}" + 2l + // two extra bracket characters '[' and ']' + static_cast(chars_size) + + static_cast(input.size() - 1) + // append `,` character between input rows + static_cast(input.null_count()) * 2l; // replace null with "{}" CUDF_EXPECTS(output_size <= static_cast(std::numeric_limits::max()), "The input json column is too large and causes overflow."); auto const joined_input = cudf::strings::detail::join_strings( - cudf::strings_column_view{input}, - cudf::string_scalar(","), // append `,` character between the input rows - cudf::string_scalar("{}"), // replacement for null rows - stream, rmm::mr::get_current_device_resource()); + cudf::strings_column_view{input}, + cudf::string_scalar(","), // append `,` character between the input rows + cudf::string_scalar("{}"), // replacement for null rows + stream, + rmm::mr::get_current_device_resource()); auto const joined_input_child = - joined_input->child(cudf::strings_column_view::chars_column_index); + joined_input->child(cudf::strings_column_view::chars_column_index); auto const joined_input_size_bytes = joined_input_child.size(); CUDF_EXPECTS(joined_input_size_bytes + 2 == output_size, "Incorrect output size computation."); @@ -95,10 +97,13 @@ rmm::device_uvector unify_json_strings(cudf::column_view const &input, // For efficiency, let's use memcpy instead of `cudf::strings::detail::concatenate`. auto output = rmm::device_uvector(joined_input_size_bytes + 2, stream); CUDF_CUDA_TRY(cudaMemsetAsync(output.data(), static_cast('['), 1, stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(output.data() + 1, joined_input_child.view().data(), - joined_input_size_bytes, cudaMemcpyDefault, stream.value())); - CUDF_CUDA_TRY(cudaMemsetAsync(output.data() + joined_input_size_bytes + 1, static_cast(']'), - 1, stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync(output.data() + 1, + joined_input_child.view().data(), + joined_input_size_bytes, + cudaMemcpyDefault, + stream.value())); + CUDF_CUDA_TRY(cudaMemsetAsync( + output.data() + joined_input_size_bytes + 1, static_cast(']'), 1, stream.value())); #ifdef DEBUG_FROM_JSON print_debug(output, "Processed json string", "", stream); @@ -107,29 +112,33 @@ rmm::device_uvector unify_json_strings(cudf::column_view const &input, } // Check and throw exception if there is any parsing error. 
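A quick check of the output_size formula above on the same three-row example, assuming null rows contribute zero bytes to the chars child of the input strings column:

  // chars_size                 = 7 + 0 + 7 = 14   ({"a":1}, null, {"b":2})
  // separators (num_rows - 1)  = 2
  // null rows as "{}"          = 2 * null_count = 2
  // brackets '[' and ']'       = 2
  // output_size                = 14 + 2 + 2 + 2 = 20 == strlen("[{\"a\":1},{},{\"b\":2}]")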
-void throw_if_error(rmm::device_uvector const &input_json, - rmm::device_uvector const &tokens, - rmm::device_uvector const &token_indices, - rmm::cuda_stream_view stream) { +void throw_if_error(rmm::device_uvector const& input_json, + rmm::device_uvector const& tokens, + rmm::device_uvector const& token_indices, + rmm::cuda_stream_view stream) +{ auto const error_count = - thrust::count(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + thrust::count(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); if (error_count > 0) { auto const error_location = - thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); SymbolOffsetT error_index; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &error_index, token_indices.data() + thrust::distance(tokens.begin(), error_location), - sizeof(SymbolOffsetT), cudaMemcpyDeviceToHost, stream.value())); + CUDF_CUDA_TRY( + cudaMemcpyAsync(&error_index, + token_indices.data() + thrust::distance(tokens.begin(), error_location), + sizeof(SymbolOffsetT), + cudaMemcpyDeviceToHost, + stream.value())); stream.synchronize(); - constexpr auto extension = 100; + constexpr auto extension = 100; auto const begin_print_idx = std::max(error_index - extension, SymbolOffsetT{0}); auto const end_print_idx = - std::min(error_index + extension, static_cast(input_json.size())); - auto const print_size = end_print_idx - begin_print_idx; + std::min(error_index + extension, static_cast(input_json.size())); + auto const print_size = end_print_idx - begin_print_idx; auto const h_input_json = cudf::detail::make_host_vector_sync( - cudf::device_span{input_json.data() + begin_print_idx, print_size}, stream); + cudf::device_span{input_json.data() + begin_print_idx, print_size}, stream); std::cerr << "Substring of the input json with " + std::to_string(extension) << " characters before+after the error location:\n"; std::cerr << std::string(h_input_json.data(), h_input_json.size()) << std::endl; @@ -141,7 +150,8 @@ void throw_if_error(rmm::device_uvector const &input_json, // Check if a token is a json node. struct is_node { - __host__ __device__ bool operator()(PdaTokenT const token) const { + __host__ __device__ bool operator()(PdaTokenT const token) const + { switch (token) { case token_t::StructBegin: case token_t::ListBegin: @@ -159,8 +169,9 @@ struct is_node { // Each row in the input column should have levels starting from 1. // This is copied from cudf's `json_tree.cu`. rmm::device_uvector compute_node_levels(int64_t num_nodes, - rmm::device_uvector const &tokens, - rmm::cuda_stream_view stream) { + rmm::device_uvector const& tokens, + rmm::cuda_stream_view stream) +{ auto token_levels = rmm::device_uvector(tokens.size(), stream); // Whether the token pops from the parent node stack. 
@@ -184,16 +195,19 @@ rmm::device_uvector compute_node_levels(int64_t num_nodes, }; auto const push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { - return does_push(token) - does_pop(token); - }); - thrust::exclusive_scan(rmm::exec_policy(stream), push_pop_it, push_pop_it + tokens.size(), - token_levels.begin()); - - auto node_levels = rmm::device_uvector(num_nodes, stream); - auto const copy_end = - cudf::detail::copy_if_safe(token_levels.begin(), token_levels.end(), tokens.begin(), - node_levels.begin(), is_node{}, stream); + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + tokens.size(), token_levels.begin()); + + auto node_levels = rmm::device_uvector(num_nodes, stream); + auto const copy_end = cudf::detail::copy_if_safe(token_levels.begin(), + token_levels.end(), + tokens.begin(), + node_levels.begin(), + is_node{}, + stream); CUDF_EXPECTS(thrust::distance(node_levels.begin(), copy_end) == num_nodes, "Node level count mismatch"); @@ -204,14 +218,17 @@ rmm::device_uvector compute_node_levels(int64_t num_nodes, } // Compute the map from nodes to their indices in the list of all tokens. -rmm::device_uvector -compute_node_to_token_index_map(int64_t num_nodes, rmm::device_uvector const &tokens, - rmm::cuda_stream_view stream) { - auto node_token_ids = rmm::device_uvector(num_nodes, stream); +rmm::device_uvector compute_node_to_token_index_map( + int64_t num_nodes, rmm::device_uvector const& tokens, rmm::cuda_stream_view stream) +{ + auto node_token_ids = rmm::device_uvector(num_nodes, stream); auto const node_id_it = thrust::counting_iterator(0); - auto const copy_end = - cudf::detail::copy_if_safe(node_id_it, node_id_it + tokens.size(), tokens.begin(), - node_token_ids.begin(), is_node{}, stream); + auto const copy_end = cudf::detail::copy_if_safe(node_id_it, + node_id_it + tokens.size(), + tokens.begin(), + node_token_ids.begin(), + is_node{}, + stream); CUDF_EXPECTS(thrust::distance(node_token_ids.begin(), copy_end) == num_nodes, "Invalid computation for node-to-token-index map"); @@ -223,8 +240,9 @@ compute_node_to_token_index_map(int64_t num_nodes, rmm::device_uvector -std::pair, rmm::device_uvector> -stable_sorted_key_order(rmm::device_uvector const &keys, rmm::cuda_stream_view stream) { +std::pair, rmm::device_uvector> stable_sorted_key_order( + rmm::device_uvector const& keys, rmm::cuda_stream_view stream) +{ // Buffers used for storing intermediate results during sorting. 
rmm::device_uvector keys_buffer1(keys.size(), stream); rmm::device_uvector keys_buffer2(keys.size(), stream); @@ -237,43 +255,52 @@ stable_sorted_key_order(rmm::device_uvector const &keys, rmm::cuda_stre thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_buffer, order_buffer, - keys.size()); + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), temp_storage_bytes, keys_buffer, - order_buffer, keys.size(), 0, sizeof(KeyType) * 8, + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + keys_buffer, + order_buffer, + keys.size(), + 0, + sizeof(KeyType) * 8, stream.value()); - return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) : - std::move(keys_buffer2), - order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) : - std::move(order_buffer2)}; + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; } // This is copied from cudf's `json_tree.cu`. -void propagate_parent_to_siblings(rmm::device_uvector const &node_levels, - rmm::device_uvector &parent_node_ids, - rmm::cuda_stream_view stream) { +void propagate_parent_to_siblings(rmm::device_uvector const& node_levels, + rmm::device_uvector& parent_node_ids, + rmm::cuda_stream_view stream) +{ auto const [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); // Instead of gather, using permutation_iterator, which is ~17% faster. thrust::inclusive_scan_by_key( - rmm::exec_policy(stream), sorted_node_levels.begin(), sorted_node_levels.end(), - thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), - thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), - thrust::equal_to{}, thrust::maximum{}); + rmm::exec_policy(stream), + sorted_node_levels.begin(), + sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, + thrust::maximum{}); } // This is copied from cudf's `json_tree.cu`. 
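A rough sketch (not from the diffs) of what the scan in propagate_parent_to_siblings accomplishes: nodes are ordered by level, the level acts as the scan key, and a running maximum spreads the first sibling's parent id over later siblings that still hold the -1 sentinel:

  // node level (sort key)  :  1   1   1     2   2   2
  // parent id before scan  :  0  -1  -1     1  -1  -1
  // inclusive_scan_by_key with thrust::maximum{}:
  // parent id after scan   :  0   0   0     1   1   1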
-rmm::device_uvector -compute_parent_node_ids(int64_t num_nodes, rmm::device_uvector const &tokens, - rmm::device_uvector const &node_token_ids, - rmm::cuda_stream_view stream) { +rmm::device_uvector compute_parent_node_ids( + int64_t num_nodes, + rmm::device_uvector const& tokens, + rmm::device_uvector const& node_token_ids, + rmm::cuda_stream_view stream) +{ auto const first_childs_parent_token_id = [tokens = - tokens.begin()] __device__(auto i) -> NodeIndexT { - if (i <= 0) { - return -1; - } + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } if (tokens[i - 1] == token_t::StructBegin || tokens[i - 1] == token_t::ListBegin) { return i - 1; } else if (tokens[i - 1] == token_t::FieldNameEnd) { @@ -287,16 +314,19 @@ compute_parent_node_ids(int64_t num_nodes, rmm::device_uvector const }; auto parent_node_ids = rmm::device_uvector(num_nodes, stream); - thrust::transform(rmm::exec_policy(stream), node_token_ids.begin(), node_token_ids.end(), - parent_node_ids.begin(), - [node_ids_gpu = node_token_ids.begin(), num_nodes, - first_childs_parent_token_id] __device__(NodeIndexT const tid) -> NodeIndexT { - auto const pid = first_childs_parent_token_id(tid); - return pid < 0 ? cudf::io::json::parent_node_sentinel : - thrust::lower_bound(thrust::seq, node_ids_gpu, - node_ids_gpu + num_nodes, pid) - - node_ids_gpu; - }); + thrust::transform( + rmm::exec_policy(stream), + node_token_ids.begin(), + node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto const pid = first_childs_parent_token_id(tid); + return pid < 0 + ? cudf::io::json::parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + }); // Propagate parent node to siblings from first sibling - inplace. auto const node_levels = compute_node_levels(num_nodes, tokens, stream); @@ -312,27 +342,30 @@ constexpr int8_t key_sentinel{1}; constexpr int8_t value_sentinel{2}; // Check for each node if it is a key or a value field. 
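A small illustration (assuming node 0 is the wrapping list created by unify_json_strings) of the key/value classification performed below, for the unified input [{"k":1}]:

  // node 0: '[' wrapping list   parent = sentinel (-1)
  // node 1: '{' row object      parent = 0
  // node 2: "k" field name      parent = 1, parent(parent) == 0           -> key_sentinel
  // node 3: 1   field value     parent = 2, parent(parent(parent)) == 0   -> value_sentinel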
-rmm::device_uvector -check_key_or_value_nodes(rmm::device_uvector const &parent_node_ids, - rmm::cuda_stream_view stream) { - auto key_or_value = rmm::device_uvector(parent_node_ids.size(), stream); +rmm::device_uvector check_key_or_value_nodes( + rmm::device_uvector const& parent_node_ids, rmm::cuda_stream_view stream) +{ + auto key_or_value = rmm::device_uvector(parent_node_ids.size(), stream); auto const transform_it = thrust::counting_iterator(0); thrust::transform( - rmm::exec_policy(stream), transform_it, transform_it + parent_node_ids.size(), - key_or_value.begin(), - [key_sentinel = key_sentinel, value_sentinel = value_sentinel, - parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { - if (parent_ids[node_id] > 0) { - auto const grand_parent = parent_ids[parent_ids[node_id]]; - if (grand_parent == 0) { - return key_sentinel; - } else if (parent_ids[grand_parent] == 0) { - return value_sentinel; - } + rmm::exec_policy(stream), + transform_it, + transform_it + parent_node_ids.size(), + key_or_value.begin(), + [key_sentinel = key_sentinel, + value_sentinel = value_sentinel, + parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { + if (parent_ids[node_id] > 0) { + auto const grand_parent = parent_ids[parent_ids[node_id]]; + if (grand_parent == 0) { + return key_sentinel; + } else if (parent_ids[grand_parent] == 0) { + return value_sentinel; } + } - return 0; - }); + return 0; + }); #ifdef DEBUG_FROM_JSON print_debug(key_or_value, "Nodes are key/value (1==key, 2==value)", ", ", stream); @@ -351,7 +384,8 @@ struct node_ranges_fn { // Whether the extracted string values from json map will have the quote character. static const bool include_quote_char{false}; - __device__ thrust::pair operator()(cudf::size_type node_id) const { + __device__ thrust::pair operator()(cudf::size_type node_id) const + { [[maybe_unused]] auto const is_begin_of_section = [] __device__(PdaTokenT const token) { switch (token) { case token_t::StructBegin: @@ -387,7 +421,7 @@ struct node_ranges_fn { }; auto const get_token_index = [include_quote_char = include_quote_char] __device__( - PdaTokenT const token, SymbolOffsetT const token_index) { + PdaTokenT const token, SymbolOffsetT const token_index) { constexpr SymbolOffsetT quote_char_size = 1; switch (token) { // Strip off quote char included for StringBegin @@ -405,18 +439,18 @@ struct node_ranges_fn { } auto const token_idx = node_token_ids[node_id]; - auto const token = tokens[token_idx]; + auto const token = tokens[token_idx]; cudf_assert(is_begin_of_section(token) && "Invalid node category."); // The section from the original JSON input that this token demarcates. auto const range_begin = get_token_index(token, token_indices[token_idx]); - auto range_end = range_begin + 1; // non-leaf, non-field nodes ignore this value. + auto range_end = range_begin + 1; // non-leaf, non-field nodes ignore this value. 
if ((token_idx + 1) < tokens.size() && end_of_partner(token) == tokens[token_idx + 1]) { // Update the range_end for this pair of tokens range_end = get_token_index(tokens[token_idx + 1], token_indices[token_idx + 1]); } else { - auto nested_range_value = nested_node_to_value(token); // iterate until this is zero - auto end_idx = token_idx + 1; + auto nested_range_value = nested_node_to_value(token); // iterate until this is zero + auto end_idx = token_idx + 1; while (end_idx < tokens.size()) { nested_range_value += nested_node_to_value(tokens[end_idx]); if (nested_range_value == 0) { @@ -434,18 +468,24 @@ struct node_ranges_fn { // Compute index range for each node. // These ranges identify positions to extract nodes from the unified json string. -rmm::device_uvector> -compute_node_ranges(int64_t num_nodes, rmm::device_uvector const &tokens, - rmm::device_uvector const &token_indices, - rmm::device_uvector const &node_token_ids, - rmm::device_uvector const &parent_node_ids, - rmm::device_uvector const &key_or_value, rmm::cuda_stream_view stream) { +rmm::device_uvector> compute_node_ranges( + int64_t num_nodes, + rmm::device_uvector const& tokens, + rmm::device_uvector const& token_indices, + rmm::device_uvector const& node_token_ids, + rmm::device_uvector const& parent_node_ids, + rmm::device_uvector const& key_or_value, + rmm::cuda_stream_view stream) +{ auto node_ranges = - rmm::device_uvector>(num_nodes, stream); + rmm::device_uvector>(num_nodes, stream); auto const transform_it = thrust::counting_iterator(0); thrust::transform( - rmm::exec_policy(stream), transform_it, transform_it + num_nodes, node_ranges.begin(), - node_ranges_fn{tokens, token_indices, node_token_ids, parent_node_ids, key_or_value}); + rmm::exec_policy(stream), + transform_it, + transform_it + num_nodes, + node_ranges.begin(), + node_ranges_fn{tokens, token_indices, node_token_ids, parent_node_ids, key_or_value}); #ifdef DEBUG_FROM_JSON print_pair_debug(node_ranges, "Node ranges", stream); @@ -460,12 +500,13 @@ struct substring_fn { cudf::device_span const d_string; cudf::device_span const> const d_ranges; - cudf::offset_type *d_offsets{}; - char *d_chars{}; + cudf::offset_type* d_offsets{}; + char* d_chars{}; - __device__ void operator()(cudf::size_type const idx) { + __device__ void operator()(cudf::size_type const idx) + { auto const range = d_ranges[idx]; - auto const size = range.second - range.first; + auto const size = range.second - range.first; if (d_chars) { memcpy(d_chars + d_offsets[idx], d_string.data() + range.first, size); } else { @@ -476,11 +517,14 @@ struct substring_fn { // Extract key-value string pairs from the input json string. 
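substring_fn above follows the usual two-pass strings-building idiom; assuming make_strings_children behaves as in other cudf string factories, it invokes the functor once for sizing and once more for copying:

  // pass 1: d_chars == nullptr  -> operator() writes each substring's size into d_offsets[idx]
  //         (make_strings_children then turns the sizes into offsets with an exclusive scan)
  // pass 2: d_chars != nullptr  -> operator() memcpy's the bytes to d_chars + d_offsets[idx]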
std::unique_ptr extract_keys_or_values( - bool extract_key, int64_t num_nodes, - rmm::device_uvector> const &node_ranges, - rmm::device_uvector const &key_or_value, - rmm::device_uvector const &unified_json_buff, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + bool extract_key, + int64_t num_nodes, + rmm::device_uvector> const& node_ranges, + rmm::device_uvector const& key_or_value, + rmm::device_uvector const& unified_json_buff, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto const is_key = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { return key_or_value[node_id] == key_sentinel; }; @@ -490,35 +534,47 @@ std::unique_ptr extract_keys_or_values( }; auto extract_ranges = - rmm::device_uvector>(num_nodes, stream, mr); - auto const stencil_it = thrust::make_counting_iterator(0); - auto const range_end = - extract_key ? cudf::detail::copy_if_safe(node_ranges.begin(), node_ranges.end(), stencil_it, - extract_ranges.begin(), is_key, stream) : - cudf::detail::copy_if_safe(node_ranges.begin(), node_ranges.end(), stencil_it, - extract_ranges.begin(), is_value, stream); + rmm::device_uvector>(num_nodes, stream, mr); + auto const stencil_it = thrust::make_counting_iterator(0); + auto const range_end = extract_key ? cudf::detail::copy_if_safe(node_ranges.begin(), + node_ranges.end(), + stencil_it, + extract_ranges.begin(), + is_key, + stream) + : cudf::detail::copy_if_safe(node_ranges.begin(), + node_ranges.end(), + stencil_it, + extract_ranges.begin(), + is_value, + stream); auto const num_extract = thrust::distance(extract_ranges.begin(), range_end); auto children = cudf::strings::detail::make_strings_children( - substring_fn{unified_json_buff, extract_ranges}, num_extract, stream, mr); - return cudf::make_strings_column(num_extract, std::move(children.first), - std::move(children.second), 0, rmm::device_buffer{}); + substring_fn{unified_json_buff, extract_ranges}, num_extract, stream, mr); + return cudf::make_strings_column( + num_extract, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); } // Compute the offsets for the final lists of Struct. -rmm::device_uvector -compute_list_offsets(cudf::size_type n_lists, - rmm::device_uvector const &parent_node_ids, - rmm::device_uvector const &key_or_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +rmm::device_uvector compute_list_offsets( + cudf::size_type n_lists, + rmm::device_uvector const& parent_node_ids, + rmm::device_uvector const& key_or_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ // Count the number of children nodes for the json object nodes. // These object nodes are given as one row of the input json strings column. auto node_child_counts = rmm::device_uvector(parent_node_ids.size(), stream); // For the nodes having parent_id == 0 (they are json object given by one input row), set their // child counts to zero. Otherwise, set child counts to `-1` (a sentinel number). - thrust::transform(rmm::exec_policy(stream), parent_node_ids.begin(), parent_node_ids.end(), - node_child_counts.begin(), [] __device__(auto const parent_id) -> NodeIndexT { + thrust::transform(rmm::exec_policy(stream), + parent_node_ids.begin(), + parent_node_ids.end(), + node_child_counts.begin(), + [] __device__(auto const parent_id) -> NodeIndexT { return parent_id == 0 ? 
0 : std::numeric_limits::lowest(); }); @@ -528,9 +584,12 @@ compute_list_offsets(cudf::size_type n_lists, // Count the number of keys for each json object using `atomicAdd`. auto const transform_it = thrust::counting_iterator(0); - thrust::for_each(rmm::exec_policy(stream), transform_it, transform_it + parent_node_ids.size(), - [is_key, child_counts = node_child_counts.begin(), - parent_ids = parent_node_ids.begin()] __device__(auto const node_id) { + thrust::for_each(rmm::exec_policy(stream), + transform_it, + transform_it + parent_node_ids.size(), + [is_key, + child_counts = node_child_counts.begin(), + parent_ids = parent_node_ids.begin()] __device__(auto const node_id) { if (is_key(node_id)) { auto const parent_id = parent_ids[node_id]; atomicAdd(&child_counts[parent_id], 1); @@ -540,29 +599,33 @@ compute_list_offsets(cudf::size_type n_lists, print_debug(node_child_counts, "Nodes' child keys counts", ", ", stream); #endif - auto list_offsets = rmm::device_uvector(n_lists + 1, stream, mr); + auto list_offsets = rmm::device_uvector(n_lists + 1, stream, mr); auto const copy_end = cudf::detail::copy_if_safe( - node_child_counts.begin(), node_child_counts.end(), list_offsets.begin(), - [] __device__(auto const count) { return count >= 0; }, stream); + node_child_counts.begin(), + node_child_counts.end(), + list_offsets.begin(), + [] __device__(auto const count) { return count >= 0; }, + stream); CUDF_EXPECTS(thrust::distance(list_offsets.begin(), copy_end) == static_cast(n_lists), "Invalid list size computation."); #ifdef DEBUG_FROM_JSON print_debug(list_offsets, "Output list sizes (except the last one)", ", ", stream); #endif - thrust::exclusive_scan(rmm::exec_policy(stream), list_offsets.begin(), list_offsets.end(), - list_offsets.begin()); + thrust::exclusive_scan( + rmm::exec_policy(stream), list_offsets.begin(), list_offsets.end(), list_offsets.begin()); #ifdef DEBUG_FROM_JSON print_debug(list_offsets, "Output list offsets", ", ", stream); #endif return list_offsets; } -} // namespace +} // namespace -std::unique_ptr from_json(cudf::column_view const &input, +std::unique_ptr from_json(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ CUDF_EXPECTS(input.type().id() == cudf::type_id::STRING, "Invalid input format"); // Firstly, concatenate all the input json strings into one giant input json string. @@ -574,8 +637,10 @@ std::unique_ptr from_json(cudf::column_view const &input, static_assert(sizeof(SymbolT) == sizeof(char), "Invalid internal data for nested json tokenizer."); auto const [tokens, token_indices] = cudf::io::json::detail::get_token_stream( - cudf::device_span{unified_json_buff.data(), unified_json_buff.size()}, - cudf::io::json_reader_options{}, stream, rmm::mr::get_current_device_resource()); + cudf::device_span{unified_json_buff.data(), unified_json_buff.size()}, + cudf::io::json_reader_options{}, + stream, + rmm::mr::get_current_device_resource()); #ifdef DEBUG_FROM_JSON print_debug(tokens, "Tokens", ", ", stream); @@ -586,7 +651,7 @@ std::unique_ptr from_json(cudf::column_view const &input, throw_if_error(unified_json_buff, tokens, token_indices, stream); auto const num_nodes = - thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_node{}); + thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_node{}); // Compute the map from nodes to their indices in the list of all tokens. 
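A worked example (not from the diffs) of how compute_list_offsets turns per-row key counts into list offsets, for three input rows holding 2, 1, and 3 key-value pairs:

  // key counts per row object (via atomicAdd)       : 2, 1, 3
  // copied into list_offsets (n_lists + 1 entries)  : [2, 1, 3, _]
  // exclusive_scan                                  : [0, 2, 3, 6]   -> final list offsets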
auto const node_token_ids = compute_node_to_token_index_map(num_nodes, tokens, stream); @@ -599,23 +664,23 @@ std::unique_ptr from_json(cudf::column_view const &input, // Compute index range for each node. // These ranges identify positions to extract nodes from the unified json string. - auto const node_ranges = compute_node_ranges(num_nodes, tokens, token_indices, node_token_ids, - parent_node_ids, key_or_value_node, stream); + auto const node_ranges = compute_node_ranges( + num_nodes, tokens, token_indices, node_token_ids, parent_node_ids, key_or_value_node, stream); // // From below are variables for returning output. // - auto extracted_keys = extract_keys_or_values(true /*key*/, num_nodes, node_ranges, - key_or_value_node, unified_json_buff, stream, mr); - auto extracted_values = extract_keys_or_values(false /*value*/, num_nodes, node_ranges, - key_or_value_node, unified_json_buff, stream, mr); + auto extracted_keys = extract_keys_or_values( + true /*key*/, num_nodes, node_ranges, key_or_value_node, unified_json_buff, stream, mr); + auto extracted_values = extract_keys_or_values( + false /*value*/, num_nodes, node_ranges, key_or_value_node, unified_json_buff, stream, mr); CUDF_EXPECTS(extracted_keys->size() == extracted_values->size(), "Invalid key-value pair extraction."); // Compute the offsets of the final output lists column. auto list_offsets = - compute_list_offsets(input.size(), parent_node_ids, key_or_value_node, stream, mr); + compute_list_offsets(input.size(), parent_node_ids, key_or_value_node, stream, mr); #ifdef DEBUG_FROM_JSON print_output_spark_map(list_offsets, extracted_keys, extracted_values, stream); @@ -625,15 +690,18 @@ std::unique_ptr from_json(cudf::column_view const &input, std::vector> out_keys_vals; out_keys_vals.emplace_back(std::move(extracted_keys)); out_keys_vals.emplace_back(std::move(extracted_values)); - auto structs_col = cudf::make_structs_column(num_pairs, std::move(out_keys_vals), 0, - rmm::device_buffer{}, stream, mr); - - auto offsets = std::make_unique(std::move(list_offsets), - rmm::device_buffer{}, 0); - - return cudf::make_lists_column( - input.size(), std::move(offsets), std::move(structs_col), - input.null_count(), cudf::detail::copy_bitmask(input, stream, mr), stream, mr); + auto structs_col = cudf::make_structs_column( + num_pairs, std::move(out_keys_vals), 0, rmm::device_buffer{}, stream, mr); + + auto offsets = std::make_unique(std::move(list_offsets), rmm::device_buffer{}, 0); + + return cudf::make_lists_column(input.size(), + std::move(offsets), + std::move(structs_col), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr), + stream, + mr); } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils.hpp b/src/main/cpp/src/map_utils.hpp index ddf66b07de..445bc89c7b 100644 --- a/src/main/cpp/src/map_utils.hpp +++ b/src/main/cpp/src/map_utils.hpp @@ -24,8 +24,9 @@ namespace spark_rapids_jni { -std::unique_ptr -from_json(cudf::column_view const &input, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); +std::unique_ptr from_json( + cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils_debug.cuh b/src/main/cpp/src/map_utils_debug.cuh index 652fa84672..39446b2971 
100644 --- a/src/main/cpp/src/map_utils_debug.cuh +++ b/src/main/cpp/src/map_utils_debug.cuh @@ -16,7 +16,7 @@ #pragma once -//#define DEBUG_FROM_JSON +// #define DEBUG_FROM_JSON #ifdef DEBUG_FROM_JSON @@ -36,7 +36,8 @@ namespace spark_rapids_jni { using namespace cudf::io::json; // Convert the token value into string name, for debugging purpose. -std::string token_to_string(PdaTokenT const token_type) { +std::string token_to_string(PdaTokenT const token_type) +{ switch (token_type) { case token_t::StructBegin: return "StructBegin"; case token_t::StructEnd: return "StructEnd"; @@ -57,27 +58,30 @@ std::string token_to_string(PdaTokenT const token_type) { // Print the content of the input device vector. template -void print_debug(rmm::device_uvector const &input, std::string const &name, - std::string const &separator, rmm::cuda_stream_view stream) { +void print_debug(rmm::device_uvector const& input, + std::string const& name, + std::string const& separator, + rmm::cuda_stream_view stream) +{ auto const h_input = cudf::detail::make_host_vector_sync( - cudf::device_span{input.data(), input.size()}, stream); + cudf::device_span{input.data(), input.size()}, stream); std::stringstream ss; ss << name << ":\n"; for (size_t i = 0; i < h_input.size(); ++i) { ss << static_cast(h_input[i]); - if (separator.size() > 0 && i + 1 < h_input.size()) { - ss << separator; - } + if (separator.size() > 0 && i + 1 < h_input.size()) { ss << separator; } } std::cerr << ss.str() << std::endl; } // Print the content of the input map given by a device vector. template -void print_map_debug(rmm::device_uvector const &input, std::string const &name, - rmm::cuda_stream_view stream) { +void print_map_debug(rmm::device_uvector const& input, + std::string const& name, + rmm::cuda_stream_view stream) +{ auto const h_input = cudf::detail::make_host_vector_sync( - cudf::device_span{input.data(), input.size()}, stream); + cudf::device_span{input.data(), input.size()}, stream); std::stringstream ss; ss << name << ":\n"; for (size_t i = 0; i < h_input.size(); ++i) { @@ -88,10 +92,12 @@ void print_map_debug(rmm::device_uvector const &input, std::string const &nam // Print the content of the input pairs given by a device vector. template -void print_pair_debug(rmm::device_uvector const &input, std::string const &name, - rmm::cuda_stream_view stream) { +void print_pair_debug(rmm::device_uvector const& input, + std::string const& name, + rmm::cuda_stream_view stream) +{ auto const h_input = cudf::detail::make_host_vector_sync( - cudf::device_span{input.data(), input.size()}, stream); + cudf::device_span{input.data(), input.size()}, stream); std::stringstream ss; ss << name << ":\n"; for (size_t i = 0; i < h_input.size(); ++i) { @@ -102,36 +108,37 @@ void print_pair_debug(rmm::device_uvector const &input, std::string const &na } // Print the final output map data (Spark's MapType, i.e., List>). 
-void print_output_spark_map(rmm::device_uvector const &list_offsets, - std::unique_ptr const &extracted_keys, - std::unique_ptr const &extracted_values, - rmm::cuda_stream_view stream) { - auto const keys_child = extracted_keys->child(cudf::strings_column_view::chars_column_index); +void print_output_spark_map(rmm::device_uvector const& list_offsets, + std::unique_ptr const& extracted_keys, + std::unique_ptr const& extracted_values, + rmm::cuda_stream_view stream) +{ + auto const keys_child = extracted_keys->child(cudf::strings_column_view::chars_column_index); auto const keys_offsets = extracted_keys->child(cudf::strings_column_view::offsets_column_index); auto const values_child = extracted_values->child(cudf::strings_column_view::chars_column_index); auto const values_offsets = - extracted_values->child(cudf::strings_column_view::offsets_column_index); + extracted_values->child(cudf::strings_column_view::offsets_column_index); auto const h_extracted_keys_child = cudf::detail::make_host_vector_sync( - cudf::device_span{keys_child.view().data(), - static_cast(keys_child.size())}, - stream); + cudf::device_span{keys_child.view().data(), + static_cast(keys_child.size())}, + stream); auto const h_extracted_keys_offsets = cudf::detail::make_host_vector_sync( - cudf::device_span{keys_offsets.view().data(), - static_cast(keys_offsets.size())}, - stream); + cudf::device_span{keys_offsets.view().data(), + static_cast(keys_offsets.size())}, + stream); auto const h_extracted_values_child = cudf::detail::make_host_vector_sync( - cudf::device_span{values_child.view().data(), - static_cast(values_child.size())}, - stream); + cudf::device_span{values_child.view().data(), + static_cast(values_child.size())}, + stream); auto const h_extracted_values_offsets = cudf::detail::make_host_vector_sync( - cudf::device_span{values_offsets.view().data(), - static_cast(values_offsets.size())}, - stream); + cudf::device_span{values_offsets.view().data(), + static_cast(values_offsets.size())}, + stream); auto const h_list_offsets = cudf::detail::make_host_vector_sync( - cudf::device_span{list_offsets.data(), list_offsets.size()}, stream); + cudf::device_span{list_offsets.data(), list_offsets.size()}, stream); CUDF_EXPECTS(h_list_offsets.back() == extracted_keys->size(), "Invalid list offsets computation."); @@ -144,16 +151,16 @@ void print_output_spark_map(rmm::device_uvector const &list_o ++string_idx) { { auto const string_begin = h_extracted_keys_offsets[string_idx]; - auto const string_end = h_extracted_keys_offsets[string_idx + 1]; - auto const size = string_end - string_begin; - auto const ptr = &h_extracted_keys_child[string_begin]; + auto const string_end = h_extracted_keys_offsets[string_idx + 1]; + auto const size = string_end - string_begin; + auto const ptr = &h_extracted_keys_child[string_begin]; ss << "\t\"" << std::string(ptr, size) << "\" : "; } { auto const string_begin = h_extracted_values_offsets[string_idx]; - auto const string_end = h_extracted_values_offsets[string_idx + 1]; - auto const size = string_end - string_begin; - auto const ptr = &h_extracted_values_child[string_begin]; + auto const string_end = h_extracted_values_offsets[string_idx + 1]; + auto const size = string_end - string_begin; + auto const ptr = &h_extracted_values_child[string_begin]; ss << "\"" << std::string(ptr, size) << "\"\n"; } } @@ -161,6 +168,6 @@ void print_output_spark_map(rmm::device_uvector const &list_o std::cerr << ss.str() << std::endl; } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni -#endif 
// DEBUG_FROM_JSON +#endif // DEBUG_FROM_JSON diff --git a/src/main/cpp/src/row_conversion.cu b/src/main/cpp/src/row_conversion.cu index aa772908dc..1960ce392b 100644 --- a/src/main/cpp/src/row_conversion.cu +++ b/src/main/cpp/src/row_conversion.cu @@ -50,7 +50,7 @@ #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) #include -#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) +#endif // #if !defined(__CUDA_ARCH__) || defined(ASYNC_MEMCPY_SUPPORTED) #include #include @@ -68,14 +68,14 @@ constexpr auto JCUDF_ROW_ALIGNMENT = 8; constexpr auto MAX_BATCH_SIZE = std::numeric_limits::max(); // Number of rows each block processes in the two kernels. Tuned via nsight -constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; +constexpr auto NUM_STRING_ROWS_PER_BLOCK_TO_ROWS = 1024; constexpr auto NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS = 64; -constexpr auto MIN_STRING_BLOCKS = 32; -constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; +constexpr auto MIN_STRING_BLOCKS = 32; +constexpr auto MAX_STRING_BLOCKS = MAX_BATCH_SIZE; constexpr auto NUM_WARPS_IN_BLOCK = 32; -} // anonymous namespace +} // anonymous namespace // needed to suppress warning about cuda::barrier #pragma nv_diag_suppress static_var_with_dynamic_init @@ -87,8 +87,9 @@ using rmm::device_uvector; #ifdef ASYNC_MEMCPY_SUPPORTED using cuda::aligned_size_t; #else -template using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. -#endif // ASYNC_MEMCPY_SUPPORTED +template +using aligned_size_t = size_t; // Local stub for cuda::aligned_size_t. +#endif // ASYNC_MEMCPY_SUPPORTED namespace spark_rapids_jni { namespace detail { @@ -156,8 +157,9 @@ struct tile_info { int end_row; int batch_number; - __device__ inline size_type get_shared_row_size(size_type const *const col_offsets, - size_type const *const col_sizes) const { + __device__ inline size_type get_shared_row_size(size_type const* const col_offsets, + size_type const* const col_sizes) const + { // this calculation is invalid if there are holes in the data such as a variable-width column. // It is wrong in a safe way in that it will say this row size is larger than it should be, so // we are not losing data we are just not as efficient as we could be with shared memory. 
This @@ -179,9 +181,9 @@ struct tile_info { * */ struct row_batch { - size_type num_bytes; // number of bytes in this batch - size_type row_count; // number of rows in the batch - device_uvector row_offsets; // offsets column of output cudf column + size_type num_bytes; // number of bytes in this batch + size_type row_count; // number of rows in the batch + device_uvector row_offsets; // offsets column of output cudf column }; /** @@ -189,11 +191,11 @@ struct row_batch { * */ struct batch_data { - device_uvector batch_row_offsets; // offsets to each row in incoming data - device_uvector d_batch_row_boundaries; // row numbers for the start of each batch + device_uvector batch_row_offsets; // offsets to each row in incoming data + device_uvector d_batch_row_boundaries; // row numbers for the start of each batch std::vector - batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 - std::vector row_batches; // information about each batch such as byte count + batch_row_boundaries; // row numbers for the start of each batch: 0, 1500, 2700 + std::vector row_batches; // information about each batch such as byte count }; /** @@ -206,8 +208,10 @@ struct batch_data { * offsets into the string column */ std::pair, rmm::device_uvector> -build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validity_size, - rmm::cuda_stream_view stream) { +build_string_row_offsets(table_view const& tbl, + size_type fixed_width_and_validity_size, + rmm::cuda_stream_view stream) +{ auto const num_rows = tbl.num_rows(); rmm::device_uvector d_row_sizes(num_rows, stream); thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); @@ -215,37 +219,44 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi auto d_offsets_iterators = [&]() { std::vector offsets_iterators; auto offsets_iter = thrust::make_transform_iterator( - tbl.begin(), [](auto const &col) -> strings_column_view::offset_iterator { - if (!is_fixed_width(col.type())) { - CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); - return strings_column_view(col).offsets_begin(); - } else { - return nullptr; - } - }); - std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), + tbl.begin(), [](auto const& col) -> strings_column_view::offset_iterator { + if (!is_fixed_width(col.type())) { + CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); + return strings_column_view(col).offsets_begin(); + } else { + return nullptr; + } + }); + std::copy_if(offsets_iter, + offsets_iter + tbl.num_columns(), std::back_inserter(offsets_iterators), - [](auto const &offset_ptr) { return offset_ptr != nullptr; }); - return make_device_uvector_async(offsets_iterators, stream, - rmm::mr::get_current_device_resource()); + [](auto const& offset_ptr) { return offset_ptr != nullptr; }); + return make_device_uvector_async( + offsets_iterators, stream, rmm::mr::get_current_device_resource()); }(); auto const num_columns = static_cast(d_offsets_iterators.size()); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_columns * num_rows), - [d_offsets_iterators = d_offsets_iterators.data(), num_columns, num_rows, + [d_offsets_iterators = d_offsets_iterators.data(), + num_columns, + num_rows, d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { auto const row = 
element_idx % num_rows; auto const col = element_idx / num_rows; auto const val = - d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; + d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; atomicAdd(&d_row_sizes[row], val); }); // transform the row sizes to include fixed width size and alignment - thrust::transform(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), - d_row_sizes.begin(), [fixed_width_and_validity_size] __device__(auto row_size) { + thrust::transform(rmm::exec_policy(stream), + d_row_sizes.begin(), + d_row_sizes.end(), + d_row_sizes.begin(), + [fixed_width_and_validity_size] __device__(auto row_size) { return util::round_up_unsafe(fixed_width_and_validity_size + row_size, JCUDF_ROW_ALIGNMENT); }); @@ -259,9 +270,10 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi */ struct string_row_offset_functor { string_row_offset_functor(device_span d_row_offsets) - : d_row_offsets(d_row_offsets){}; + : d_row_offsets(d_row_offsets){}; - __device__ inline size_type operator()(int row_number, int) const { + __device__ inline size_type operator()(int row_number, int) const + { return d_row_offsets[row_number]; } @@ -274,9 +286,10 @@ struct string_row_offset_functor { */ struct fixed_width_row_offset_functor { fixed_width_row_offset_functor(size_type fixed_width_only_row_size) - : _fixed_width_only_row_size(fixed_width_only_row_size){}; + : _fixed_width_only_row_size(fixed_width_only_row_size){}; - __device__ inline size_type operator()(int row_number, int tile_row_start) const { + __device__ inline size_type operator()(int row_number, int tile_row_start) const + { return (row_number - tile_row_start) * _fixed_width_only_row_size; } @@ -298,11 +311,15 @@ struct fixed_width_row_offset_functor { * @param output_nm array of pointers to the output null masks * @param input_data pointing to the incoming row data */ -__global__ void -copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type num_columns, - const size_type row_size, const size_type *input_offset_in_row, - const size_type *num_bytes, int8_t **output_data, - bitmask_type **output_nm, const int8_t *input_data) { +__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* input_offset_in_row, + const size_type* num_bytes, + int8_t** output_data, + bitmask_type** output_nm, + const int8_t* input_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -317,30 +334,30 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // are controlled by the x dimension (there are multiple blocks in the x // dimension). 
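/*
 * A minimal standalone CUDA sketch of the two-pass pattern used by the
 * fixed-width kernels in this file: pass 1 stages a contiguous group of rows
 * into shared memory, pass 2 scatters the staged values out column by column.
 * For brevity every column is assumed to be one byte wide; the kernel name and
 * parameters are illustrative only (the real kernels use 8-byte staging loads,
 * per-column sizes/offsets and validity handling).
 */
__global__ void two_pass_copy_sketch(int8_t const* rows_in,  // row-major input, row_size bytes per row
                                     int num_rows,
                                     int row_size,
                                     int8_t* cols_out)       // column-major output, one byte per column
{
  extern __shared__ int8_t staged[];  // launch with blockDim.x * row_size bytes of shared memory
  int const first_row = blockIdx.x * blockDim.x;
  if (first_row >= num_rows) { return; }
  int const rows_here = min(static_cast<int>(blockDim.x), num_rows - first_row);

  // Pass 1: all threads cooperate on one coalesced copy of whole rows.
  for (int b = threadIdx.x; b < rows_here * row_size; b += blockDim.x) {
    staged[b] = rows_in[first_row * row_size + b];
  }
  __syncthreads();

  // Pass 2: each thread owns one staged row and writes its bytes out to the columns.
  if (static_cast<int>(threadIdx.x) < rows_here) {
    for (int col = 0; col < row_size; ++col) {
      cols_out[col * num_rows + first_row + threadIdx.x] = staged[threadIdx.x * row_size + col];
    }
  }
}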
- size_type const rows_per_group = blockDim.x; - size_type const row_group_start = blockIdx.x; + size_type const rows_per_group = blockDim.x; + size_type const row_group_start = blockIdx.x; size_type const row_group_stride = gridDim.x; - size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type const row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; // Because we are copying fixed width only data and we stride the rows // this thread will always start copying from shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (auto row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { // Step 1: Copy the data into shared memory // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t const *long_input = reinterpret_cast(input_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t const* long_input = reinterpret_cast(input_data); - auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + auto const shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); auto const shared_output_stride = blockDim.x * blockDim.y; - auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); - auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - auto const shared_length = row_size * num_rows_in_group; + auto const row_index_end = std::min(num_rows, ((row_group_index + 1) * rows_per_group)); + auto const num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + auto const shared_length = row_size * num_rows_in_group; size_type const shared_output_end = shared_length / sizeof(int64_t); @@ -363,31 +380,31 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n // because we may need them to copy data in for the next row group. 
uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); if (row_index < num_rows) { - auto const col_index_start = threadIdx.y; + auto const col_index_start = threadIdx.y; auto const col_index_stride = blockDim.y; for (auto col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - auto const col_size = num_bytes[col_index]; - int8_t const *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t *col_output = output_data[col_index]; + auto const col_size = num_bytes[col_index]; + int8_t const* col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t* col_output = output_data[col_index]; switch (col_size) { case 1: { col_output[row_index] = *col_tmp; break; } case 2: { - int16_t *short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); + int16_t* short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 4: { - int32_t *int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); + int32_t* int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); break; } case 8: { - int64_t *long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); + int64_t* long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); break; } default: { @@ -400,25 +417,29 @@ copy_from_rows_fixed_width_optimized(const size_type num_rows, const size_type n } } - bitmask_type *nm = output_nm[col_index]; - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + bitmask_type* nm = output_nm[col_index]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { nm[word_index(row_index)] = bitmask; } + } // end column loop + } // end row copy // wait for the row_group to be totally copied before starting on the next row group __syncthreads(); } } -__global__ void copy_to_rows_fixed_width_optimized( - const size_type start_row, const size_type num_rows, const size_type num_columns, - const size_type row_size, const size_type *output_offset_in_row, const size_type *num_bytes, - const int8_t **input_data, const bitmask_type **input_nm, int8_t *output_data) { +__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* output_offset_in_row, + const size_type* num_bytes, + const int8_t** input_data, + const bitmask_type** input_nm, + int8_t* output_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -435,18 +456,18 @@ __global__ void copy_to_rows_fixed_width_optimized( // are controlled by the x dimension (there are multiple blocks in the x // dimension). 
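/*
 * A minimal device-side sketch of the __ballot_sync idiom used above to turn 32
 * per-row validity flags into a single null-mask word: each lane of the warp
 * contributes one bit, and only the lane holding the first row of the 32-row
 * span writes the assembled word. Names are illustrative only; `active_mask`
 * must cover exactly the lanes that evaluated `valid`.
 */
__device__ inline void write_validity_word_sketch(unsigned int active_mask,
                                                  bool valid,
                                                  int row_index,
                                                  unsigned int* null_mask_words)
{
  unsigned int const word = __ballot_sync(active_mask, valid);  // lane i supplies bit i
  if (row_index % 32 == 0) { null_mask_words[row_index / 32] = word; }
}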
- size_type rows_per_group = blockDim.x; - size_type row_group_start = blockIdx.x; + size_type rows_per_group = blockDim.x; + size_type row_group_start = blockIdx.x; size_type row_group_stride = gridDim.x; - size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; extern __shared__ int8_t shared_data[]; // Because we are copying fixed width only data and we stride the rows // this thread will always start copying to shared data in the same place - int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t *row_vld_tmp = - &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + int8_t* row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t* row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; for (size_type row_group_index = row_group_start; row_group_index < row_group_end; row_group_index += row_group_stride) { @@ -457,31 +478,31 @@ __global__ void copy_to_rows_fixed_width_optimized( // evenly into the thread count. We don't want those threads to exit yet // because we may need them to copy data back out. if (row_index < (start_row + num_rows)) { - size_type col_index_start = threadIdx.y; + size_type col_index_start = threadIdx.y; size_type col_index_stride = blockDim.y; for (size_type col_index = col_index_start; col_index < num_columns; col_index += col_index_stride) { - size_type col_size = num_bytes[col_index]; - int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t *col_input = input_data[col_index]; + size_type col_size = num_bytes[col_index]; + int8_t* col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t* col_input = input_data[col_index]; switch (col_size) { case 1: { *col_tmp = col_input[row_index]; break; } case 2: { - const int16_t *short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; + const int16_t* short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; + const int32_t* int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; + const int64_t* long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; break; } default: { @@ -495,11 +516,11 @@ __global__ void copy_to_rows_fixed_width_optimized( } // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + int8_t* valid_byte = &row_vld_tmp[col_index / 8]; size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); - size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t* valid_int = reinterpret_cast(valid_byte - fixup_bytes); + size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); // Now copy validity for the column if (input_nm[col_index]) { if (bit_is_set(input_nm[col_index], row_index)) { @@ -511,24 +532,22 
@@ __global__ void copy_to_rows_fixed_width_optimized( // It is valid so just set the bit atomicOr_block(valid_int, 1 << int_bit_offset); } - } // end column loop - } // end row copy + } // end column loop + } // end row copy // wait for the row_group to be totally copied into shared memory __syncthreads(); // Step 2: Copy the data back out // We know row_size is always aligned with and a multiple of int64_t; - int64_t *long_shared = reinterpret_cast(shared_data); - int64_t *long_output = reinterpret_cast(output_data); + int64_t* long_shared = reinterpret_cast(shared_data); + int64_t* long_output = reinterpret_cast(output_data); - size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); size_type shared_input_stride = blockDim.x * blockDim.y; - size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } + size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { row_index_end = num_rows; } size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - size_type shared_length = row_size * num_rows_in_group; + size_type shared_length = row_size * num_rows_in_group; size_type shared_input_end = shared_length / sizeof(int64_t); @@ -547,7 +566,7 @@ __global__ void copy_to_rows_fixed_width_optimized( #define MEMCPY(dst, src, size, barrier) cuda::memcpy_async(dst, src, size, barrier) #else #define MEMCPY(dst, src, size, barrier) memcpy(dst, src, size) -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED /** * @brief copy data from cudf columns into JCUDF format, which is row-based @@ -566,12 +585,17 @@ __global__ void copy_to_rows_fixed_width_optimized( * */ template -__global__ void copy_to_rows(const size_type num_rows, const size_type num_columns, +__global__ void copy_to_rows(const size_type num_rows, + const size_type num_columns, const size_type shmem_used_per_tile, - device_span tile_infos, const int8_t **input_data, - const size_type *col_sizes, const size_type *col_offsets, - RowOffsetFunctor row_offsets, size_type const *batch_row_boundaries, - int8_t **output_data) { + device_span tile_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -581,21 +605,19 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // any calculation to do here, but it is important to note. 
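/*
 * A standalone sketch of the cooperative-groups pattern the tiled kernels below
 * rely on: the block is partitioned into warps, warp.meta_group_rank()/size()
 * stride the warps over independent work items, and warp.thread_rank()/size()
 * stride the lanes within one item. Kernel name and parameters are illustrative
 * only.
 */
#include <cooperative_groups.h>
namespace cg_sketch = cooperative_groups;

__global__ void warp_per_item_sketch(float const* in, float* out, int num_items, int item_len)
{
  auto const block = cg_sketch::this_thread_block();
  auto const warp  = cg_sketch::tiled_partition<32>(block);

  // warps across the whole grid take items round-robin
  for (int item = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); item < num_items;
       item += gridDim.x * warp.meta_group_size()) {
    // lanes of this warp cooperate on a single item
    for (int i = warp.thread_rank(); i < item_len; i += warp.size()) {
      out[item * item_len + i] = 2.0f * in[item * item_len + i];
    }
  }
}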
auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared_data[]; #ifdef ASYNC_MEMCPY_SUPPORTED __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { - init(&tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto const tile = tile_infos[blockIdx.x]; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); - auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); + auto const tile = tile_infos[blockIdx.x]; + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); + auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); auto const starting_column_offset = col_offsets[tile.start_col]; // to do the copy we need to do n column copies followed by m element copies OR we have to do m @@ -610,12 +632,11 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum // works on a row for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; relative_col += warp.meta_group_size()) { - - auto const absolute_col = relative_col + tile.start_col; - auto const col_size = col_sizes[absolute_col]; - auto const col_offset = col_offsets[absolute_col]; + auto const absolute_col = relative_col + tile.start_col; + auto const col_size = col_sizes[absolute_col]; + auto const col_offset = col_offsets[absolute_col]; auto const relative_col_offset = col_offset - starting_column_offset; - auto const col_ptr = input_data[absolute_col]; + auto const col_ptr = input_data[absolute_col]; if (col_ptr == nullptr) { // variable-width data column @@ -624,7 +645,6 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum for (int relative_row = warp.thread_rank(); relative_row < num_tile_rows; relative_row += warp.size()) { - if (relative_row >= num_tile_rows) { // out of bounds continue; @@ -632,23 +652,23 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto const absolute_row = relative_row + tile.start_row; auto const shared_offset = relative_row * tile_row_size + relative_col_offset; - auto const input_src = col_ptr + col_size * absolute_row; + auto const input_src = col_ptr + col_size * absolute_row; // copy the element from global memory switch (col_size) { case 2: { - const int16_t *short_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; + const int16_t* short_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *short_col_input; break; } case 4: { - const int32_t *int_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; + const int32_t* int_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *int_col_input; break; } case 8: { - const int64_t *long_col_input = reinterpret_cast(input_src); - *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; + const int64_t* long_col_input = reinterpret_cast(input_src); + *reinterpret_cast(&shared_data[shared_offset]) = *long_col_input; break; } case 1: shared_data[shared_offset] = *input_src; break; @@ -689,7 +709,7 @@ __global__ void copy_to_rows(const 
size_type num_rows, const size_type num_colum tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -708,58 +728,60 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum * */ template -__global__ void -copy_validity_to_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, int8_t **output_data, - const size_type validity_offset, device_span tile_infos, - const bitmask_type **input_nm) { +__global__ void copy_validity_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type validity_offset, + device_span tile_infos, + const bitmask_type** input_nm) +{ extern __shared__ int8_t shared_data[]; // each thread of warp reads a single int32 of validity - so we read 128 bytes then ballot_sync // the bits and write the result to shmem after we fill shared mem memcpy it out in a blob. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { - init(&shared_tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto tile = tile_infos[blockIdx.x]; + auto tile = tile_infos[blockIdx.x]; auto const num_tile_cols = tile.num_cols(); auto const num_tile_rows = tile.num_rows(); auto const threads_per_warp = warp.size(); - auto const rows_per_read = cudf::detail::size_in_bits(); + auto const rows_per_read = cudf::detail::size_in_bits(); auto const num_sections_x = util::div_rounding_up_unsafe(num_tile_cols, threads_per_warp); auto const num_sections_y = util::div_rounding_up_unsafe(num_tile_rows, rows_per_read); auto const validity_data_row_length = util::round_up_unsafe( - util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); + util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const total_sections = num_sections_x * num_sections_y; // the tile is divided into sections. A warp operates on a section at a time. 
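/*
 * A standalone sketch of the cuda::barrier + cuda::memcpy_async pattern behind
 * the ASYNC_MEMCPY_SUPPORTED branches in these kernels: copies into shared
 * memory are issued asynchronously against a block-scoped barrier, and
 * arrive_and_wait() is the point at which every copy bound to the barrier is
 * guaranteed to have landed. Kernel name and parameters are illustrative only;
 * the async path assumes a GPU/toolkit combination where these libcu++
 * facilities are available.
 */
#include <cooperative_groups.h>
#include <cuda/barrier>

__global__ void staged_copy_sketch(int const* src, int* dst, int chunk)
{
  extern __shared__ int staging[];  // launch with chunk * sizeof(int) bytes of shared memory
  auto group = cooperative_groups::this_thread_block();

  __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
  if (group.thread_rank() == 0) { init(&barrier, group.size()); }
  group.sync();

  // the whole block cooperates on one asynchronous copy bound to the barrier
  cuda::memcpy_async(group, staging, src + blockIdx.x * chunk, sizeof(int) * chunk, barrier);

  barrier.arrive_and_wait();  // all copies bound to the barrier are now visible

  for (int i = group.thread_rank(); i < chunk; i += group.size()) {
    dst[blockIdx.x * chunk + i] = staging[i];
  }
}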
for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; my_section_idx += warp.meta_group_size()) { // convert to rows and cols - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; - auto const relative_col = section_x * threads_per_warp + warp.thread_rank(); - auto const relative_row = section_y * rows_per_read; - auto const absolute_col = relative_col + tile.start_col; - auto const absolute_row = relative_row + tile.start_row; - auto const participating = absolute_col < num_columns && absolute_row < num_rows; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; + auto const relative_col = section_x * threads_per_warp + warp.thread_rank(); + auto const relative_row = section_y * rows_per_read; + auto const absolute_col = relative_col + tile.start_col; + auto const absolute_row = relative_row + tile.start_row; + auto const participating = absolute_col < num_columns && absolute_row < num_rows; auto const participation_mask = __ballot_sync(0xFFFFFFFF, participating); if (participating) { - auto my_data = input_nm[absolute_col] != nullptr ? - input_nm[absolute_col][word_index(absolute_row)] : - std::numeric_limits::max(); + auto my_data = input_nm[absolute_col] != nullptr + ? input_nm[absolute_col][word_index(absolute_row)] + : std::numeric_limits::max(); // every thread that is participating in the warp has 4 bytes, but it's column-based data and // we need it in row-based. So we shuffle the bits around with ballot_sync to make the bytes @@ -769,19 +791,19 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, auto validity_data = __ballot_sync(participation_mask, my_data & dw_mask); // lead thread in each warp writes data auto const validity_write_offset = - validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); + validity_data_row_length * (relative_row + i) + (relative_col / CHAR_BIT); if (warp.thread_rank() == 0) { - *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; + *reinterpret_cast(&shared_data[validity_write_offset]) = validity_data; } } } } auto const output_data_base = - output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; + output_data[tile.batch_number] + validity_offset + tile.start_col / CHAR_BIT; // each warp copies a row at a time - auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); + auto const row_bytes = util::div_rounding_up_unsafe(num_tile_cols, CHAR_BIT); auto const row_batch_start = tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; // make sure entire tile has finished copy @@ -807,7 +829,7 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, shared_tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -826,42 +848,46 @@ copy_validity_to_rows(const size_type num_rows, const size_type num_columns, * */ template -__global__ void copy_strings_to_rows(size_type const num_rows, size_type const num_variable_columns, - int8_t const **variable_input_data, - size_type const *variable_col_output_offsets, - size_type const **variable_col_offsets, - size_type fixed_width_row_size, RowOffsetFunctor row_offsets, - size_type const batch_row_offset, int8_t *output_data) { +__global__ void copy_strings_to_rows(size_type const num_rows, + size_type const num_variable_columns, + int8_t const** variable_input_data, + size_type const* variable_col_output_offsets, + size_type const** variable_col_offsets, + size_type fixed_width_row_size, + RowOffsetFunctor row_offsets, + size_type const batch_row_offset, + int8_t* output_data) +{ // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp // will copy a row at a time. The base thread will first go through column data and fill out // offset/length information for the column. Then all threads of the warp will participate in the // memcpy of the string data. auto const my_block = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(my_block); + auto const warp = cooperative_groups::tiled_partition(my_block); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; #endif auto const start_row = - blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; + blockIdx.x * NUM_STRING_ROWS_PER_BLOCK_TO_ROWS + warp.meta_group_rank() + batch_row_offset; auto const end_row = - std::min(num_rows, static_cast(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS)); + std::min(num_rows, static_cast(start_row + NUM_STRING_ROWS_PER_BLOCK_TO_ROWS)); for (int row = start_row; row < end_row; row += warp.meta_group_size()) { - auto offset = fixed_width_row_size; // initial offset to variable-width data + auto offset = fixed_width_row_size; // initial offset to variable-width data auto const base_row_offset = row_offsets(row, 0); for (int col = 0; col < num_variable_columns; ++col) { auto const string_start_offset = variable_col_offsets[col][row]; - auto const string_length = variable_col_offsets[col][row + 1] - string_start_offset; + auto const string_length = variable_col_offsets[col][row + 1] - string_start_offset; if (warp.thread_rank() == 0) { // write the offset/length to column - uint32_t *output_dest = reinterpret_cast( - &output_data[base_row_offset + variable_col_output_offsets[col]]); + uint32_t* output_dest = reinterpret_cast( + &output_data[base_row_offset + variable_col_output_offsets[col]]); output_dest[0] = offset; output_dest[1] = string_length; } auto string_output_dest = &output_data[base_row_offset + offset]; - auto string_output_src = &variable_input_data[col][string_start_offset]; + auto string_output_src = &variable_input_data[col][string_start_offset]; warp.sync(); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, string_output_dest, string_output_src, string_length, block_barrier); @@ -891,11 +917,17 @@ __global__ void copy_strings_to_rows(size_type const num_rows, size_type const n * */ template -__global__ void 
copy_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, int8_t **output_data, - const size_type *col_sizes, const size_type *col_offsets, - device_span tile_infos, const int8_t *input_data) { +__global__ void copy_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type* col_sizes, + const size_type* col_offsets, + device_span tile_infos, + const int8_t* input_data) +{ // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. // The second pass copies that chunk from shared memory out to the final location. @@ -908,31 +940,30 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // memory for each of the tiles that we work on auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); extern __shared__ int8_t shared[]; #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. __shared__ cuda::barrier tile_barrier; - if (group.thread_rank() == 0) { - init(&tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED { - auto const fetch_tile = tile_infos[blockIdx.x]; + auto const fetch_tile = tile_infos[blockIdx.x]; auto const fetch_tile_start_row = fetch_tile.start_row; - auto const starting_col_offset = col_offsets[fetch_tile.start_col]; - auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); + auto const starting_col_offset = col_offsets[fetch_tile.start_col]; + auto const fetch_tile_row_size = fetch_tile.get_shared_row_size(col_offsets, col_sizes); auto const row_batch_start = - fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; + fetch_tile.batch_number == 0 ? 0 : batch_row_boundaries[fetch_tile.batch_number]; for (int absolute_row = warp.meta_group_rank() + fetch_tile.start_row; - absolute_row <= fetch_tile.end_row; absolute_row += warp.meta_group_size()) { + absolute_row <= fetch_tile.end_row; + absolute_row += warp.meta_group_size()) { warp.sync(); auto shared_offset = (absolute_row - fetch_tile_start_row) * fetch_tile_row_size; - auto dst = &shared[shared_offset]; + auto dst = &shared[shared_offset]; auto src = &input_data[row_offsets(absolute_row, row_batch_start) + starting_col_offset]; // copy the data #ifdef ASYNC_MEMCPY_SUPPORTED @@ -946,9 +977,9 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col } { - auto const tile = tile_infos[blockIdx.x]; - auto const rows_in_tile = tile.num_rows(); - auto const cols_in_tile = tile.num_cols(); + auto const tile = tile_infos[blockIdx.x]; + auto const rows_in_tile = tile.num_rows(); + auto const cols_in_tile = tile.num_cols(); auto const tile_row_size = tile.get_shared_row_size(col_offsets, col_sizes); #ifdef ASYNC_MEMCPY_SUPPORTED @@ -956,7 +987,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED // Now we copy from shared memory to final destination. 
The data is laid out in rows in shared // memory, so the reads for a column will be "vertical". Because of this and the different sizes @@ -965,8 +996,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col // than rows, we do a global index instead of a double for loop with col/row. for (int relative_row = warp.thread_rank(); relative_row < rows_in_tile; relative_row += warp.size()) { - - auto const absolute_row = relative_row + tile.start_row; + auto const absolute_row = relative_row + tile.start_row; auto const shared_memory_row_offset = tile_row_size * relative_row; for (int relative_col = warp.meta_group_rank(); relative_col < cols_in_tile; @@ -974,11 +1004,11 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col auto const absolute_col = relative_col + tile.start_col; auto const shared_memory_offset = - col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; + col_offsets[absolute_col] - col_offsets[tile.start_col] + shared_memory_row_offset; auto const column_size = col_sizes[absolute_col]; - int8_t *shmem_src = &shared[shared_memory_offset]; - int8_t *dst = &output_data[absolute_col][absolute_row * column_size]; + int8_t* shmem_src = &shared[shared_memory_offset]; + int8_t* dst = &output_data[absolute_col][absolute_row * column_size]; MEMCPY(dst, shmem_src, column_size, tile_barrier); } @@ -990,7 +1020,7 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1009,12 +1039,16 @@ __global__ void copy_from_rows(const size_type num_rows, const size_type num_col * */ template -__global__ void -copy_validity_from_rows(const size_type num_rows, const size_type num_columns, - const size_type shmem_used_per_tile, RowOffsetFunctor row_offsets, - size_type const *batch_row_boundaries, bitmask_type **output_nm, - const size_type validity_offset, device_span tile_infos, - const int8_t *input_data) { +__global__ void copy_validity_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + bitmask_type** output_nm, + const size_type validity_offset, + device_span tile_infos, + const int8_t* input_data) +{ extern __shared__ int8_t shared[]; using cudf::detail::warp_size; @@ -1034,44 +1068,42 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // __ballot_sync, representing 32 rows of that column. auto const group = cooperative_groups::this_thread_block(); - auto const warp = cooperative_groups::tiled_partition(group); + auto const warp = cooperative_groups::tiled_partition(group); #ifdef ASYNC_MEMCPY_SUPPORTED // Initialize cuda barriers for each tile. 
__shared__ cuda::barrier shared_tile_barrier; - if (group.thread_rank() == 0) { - init(&shared_tile_barrier, group.size()); - } + if (group.thread_rank() == 0) { init(&shared_tile_barrier, group.size()); } group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED - auto const tile = tile_infos[blockIdx.x]; + auto const tile = tile_infos[blockIdx.x]; auto const tile_start_col = tile.start_col; auto const tile_start_row = tile.start_row; - auto const num_tile_cols = tile.num_cols(); - auto const num_tile_rows = tile.num_rows(); + auto const num_tile_cols = tile.num_cols(); + auto const num_tile_rows = tile.num_rows(); auto const threads_per_warp = warp.size(); - auto const cols_per_read = CHAR_BIT; + auto const cols_per_read = CHAR_BIT; - auto const rows_per_read = static_cast(threads_per_warp); - auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); - auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); - auto const validity_data_col_length = num_sections_y * 4; // words to bytes - auto const total_sections = num_sections_x * num_sections_y; + auto const rows_per_read = static_cast(threads_per_warp); + auto const num_sections_x = util::div_rounding_up_safe(num_tile_cols, cols_per_read); + auto const num_sections_y = util::div_rounding_up_safe(num_tile_rows, rows_per_read); + auto const validity_data_col_length = num_sections_y * 4; // words to bytes + auto const total_sections = num_sections_x * num_sections_y; // the tile is divided into sections. A warp operates on a section at a time. for (int my_section_idx = warp.meta_group_rank(); my_section_idx < total_sections; my_section_idx += warp.meta_group_size()) { // convert section to row and col - auto const section_x = my_section_idx % num_sections_x; - auto const section_y = my_section_idx / num_sections_x; + auto const section_x = my_section_idx % num_sections_x; + auto const section_y = my_section_idx / num_sections_x; auto const relative_col = section_x * cols_per_read; auto const relative_row = section_y * rows_per_read + warp.thread_rank(); auto const absolute_col = relative_col + tile_start_col; auto const absolute_row = relative_row + tile_start_row; auto const row_batch_start = - tile.batch_number == 0 ? 0 : batch_row_boundaries[tile.batch_number]; + tile.batch_number == 0 ? 
0 : batch_row_boundaries[tile.batch_number]; auto const participation_mask = __ballot_sync(0xFFFFFFFF, absolute_row < num_rows); @@ -1088,8 +1120,8 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, // lead thread in each warp writes data if (warp.thread_rank() == 0) { auto const validity_write_offset = - validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; - *reinterpret_cast(&shared[validity_write_offset]) = validity_data; + validity_data_col_length * (relative_col + i) + relative_row / cols_per_read; + *reinterpret_cast(&shared[validity_write_offset]) = validity_data; } } } @@ -1104,13 +1136,13 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, for (int relative_col = warp.meta_group_rank(); relative_col < num_tile_cols; relative_col += warp.meta_group_size()) { auto const absolute_col = relative_col + tile_start_col; - auto dst = output_nm[absolute_col] + word_index(tile_start_row); + auto dst = output_nm[absolute_col] + word_index(tile_start_row); auto const src = - reinterpret_cast(&shared[validity_data_col_length * relative_col]); + reinterpret_cast(&shared[validity_data_col_length * relative_col]); #ifdef ASYNC_MEMCPY_SUPPORTED - cuda::memcpy_async(warp, dst, src, aligned_size_t<4>(validity_data_col_length), - shared_tile_barrier); + cuda::memcpy_async( + warp, dst, src, aligned_size_t<4>(validity_data_col_length), shared_tile_barrier); #else for (int b = warp.thread_rank(); b < col_words; b += warp.size()) { dst[b] = src[b]; @@ -1123,7 +1155,7 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, shared_tile_barrier.arrive_and_wait(); #else group.sync(); -#endif // ASYNC_MEMCPY_SUPPORTED +#endif // ASYNC_MEMCPY_SUPPORTED } /** @@ -1140,38 +1172,42 @@ copy_validity_from_rows(const size_type num_rows, const size_type num_columns, * @param num_string_columns number of string columns in the table */ template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, int32_t **string_row_offsets, - int32_t **string_lengths, size_type **string_column_offsets, - char **string_col_data, int8_t const *row_data, +__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, + int32_t** string_row_offsets, + int32_t** string_lengths, + size_type** string_column_offsets, + char** string_col_data, + int8_t const* row_data, size_type const num_rows, - size_type const num_string_columns) { + size_type const num_string_columns) +{ // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. // Traversing in row-major order to coalesce the offsets and size reads. 
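/*
 * A small host-side sketch of how a string is laid out inside a JCUDF row, as
 * written by copy_strings_to_rows above and read back by copy_strings_from_rows
 * below: the fixed-width section of the row holds a 4-byte offset plus a 4-byte
 * length per string column, and the characters live later in the same row at
 * that offset. The function name is illustrative only.
 */
#include <cstdint>
#include <cstring>
#include <string>

inline std::string sketch_read_string(int8_t const* row_base,  // start of one JCUDF row
                                      int32_t pair_offset)     // column's slot in the fixed-width section
{
  uint32_t offset = 0;
  uint32_t length = 0;
  std::memcpy(&offset, row_base + pair_offset, sizeof(offset));
  std::memcpy(&length, row_base + pair_offset + sizeof(offset), sizeof(length));
  return std::string(reinterpret_cast<char const*>(row_base) + offset, length);
}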
auto my_block = cooperative_groups::this_thread_block(); - auto warp = cooperative_groups::tiled_partition(my_block); + auto warp = cooperative_groups::tiled_partition(my_block); #ifdef ASYNC_MEMCPY_SUPPORTED cuda::barrier block_barrier; #endif // workaround for not being able to take a reference to a constexpr host variable auto const ROWS_PER_BLOCK = NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS; - auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); - auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); - auto const num_tiles = tiles_per_col * num_string_columns; - auto const tile_stride = warp.meta_group_size() * gridDim.x; + auto const tiles_per_col = util::div_rounding_up_unsafe(num_rows, ROWS_PER_BLOCK); + auto const starting_tile = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); + auto const num_tiles = tiles_per_col * num_string_columns; + auto const tile_stride = warp.meta_group_size() * gridDim.x; // Each warp will copy strings in its tile. This is handled by all the threads of a warp passing // the same parameters to async_memcpy and all threads in the warp participating in the copy. for (auto my_tile = starting_tile; my_tile < num_tiles; my_tile += tile_stride) { auto const starting_row = (my_tile % tiles_per_col) * ROWS_PER_BLOCK; - auto const col = my_tile / tiles_per_col; - auto const str_len = string_lengths[col]; - auto const str_row_off = string_row_offsets[col]; - auto const str_col_off = string_column_offsets[col]; - auto str_col_data = string_col_data[col]; + auto const col = my_tile / tiles_per_col; + auto const str_len = string_lengths[col]; + auto const str_row_off = string_row_offsets[col]; + auto const str_col_off = string_column_offsets[col]; + auto str_col_data = string_col_data[col]; for (int row = starting_row; row < starting_row + ROWS_PER_BLOCK && row < num_rows; ++row) { auto const src = &row_data[row_offsets(row, 0) + str_row_off[row]]; - auto dst = &str_col_data[str_col_off[row]]; + auto dst = &str_col_data[str_col_off[row]]; #ifdef ASYNC_MEMCPY_SUPPORTED cuda::memcpy_async(warp, dst, src, str_len[row], block_barrier); @@ -1194,8 +1230,12 @@ __global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, int32_t **s * @param [out] threads the size of the threads for the kernel * @return the size in bytes of shared memory needed for each block. */ -static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_type num_rows, - const size_type size_per_row, dim3 &blocks, dim3 &threads) { +static int calc_fixed_width_kernel_dims(const size_type num_columns, + const size_type num_rows, + const size_type size_per_row, + dim3& blocks, + dim3& threads) +{ // We have found speed degrades when a thread handles more than 4 columns. // Each block is 2 dimensional. The y dimension indicates the columns. // We limit this to 32 threads in the y dimension so we can still @@ -1205,7 +1245,7 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ // in the x dimension because we use atomic operations at the block // level when writing validity data out to main memory, and that would // need to change if we split a word of validity data between blocks. 
- int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); + int const y_block_size = min(util::div_rounding_up_safe(num_columns, 4), 32); int const x_possible_block_size = 1024 / y_block_size; // 48KB is the default setting for shared memory per block according to the cuda tutorials // If someone configures the GPU to only have 16 KB this might not work. @@ -1228,9 +1268,9 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ // to try and future proof this a bit. int const num_blocks = std::clamp((num_rows + block_size - 1) / block_size, 1, 10240); - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; threads.x = block_size; threads.y = y_block_size; threads.z = 1; @@ -1244,12 +1284,19 @@ static int calc_fixed_width_kernel_dims(const size_type num_columns, const size_ * into this function are common between runs and should be calculated once. */ static std::unique_ptr fixed_width_convert_to_rows( - const size_type start_row, const size_type num_rows, const size_type num_columns, - const size_type size_per_row, rmm::device_uvector &column_start, - rmm::device_uvector &column_size, rmm::device_uvector &input_data, - rmm::device_uvector &input_nm, const scalar &zero, - const scalar &scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type size_per_row, + rmm::device_uvector& column_start, + rmm::device_uvector& column_size, + rmm::device_uvector& input_data, + rmm::device_uvector& input_nm, + const scalar& zero, + const scalar& scalar_size_per_row, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), @@ -1257,28 +1304,43 @@ static std::unique_ptr fixed_width_convert_to_rows( // Allocate and set the offsets row for the byte array std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); - std::unique_ptr data = - make_numeric_column(data_type(type_id::INT8), static_cast(total_allocation), - mask_state::UNALLOCATED, stream, mr); + std::unique_ptr data = make_numeric_column(data_type(type_id::INT8), + static_cast(total_allocation), + mask_state::UNALLOCATED, + stream, + mr); dim3 blocks; dim3 threads; int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); copy_to_rows_fixed_width_optimized<<>>( - start_row, num_rows, num_columns, size_per_row, column_start.data(), column_size.data(), - input_data.data(), input_nm.data(), data->mutable_view().data()); - - return make_lists_column(num_rows, std::move(offsets), std::move(data), 0, - rmm::device_buffer{0, cudf::get_default_stream(), mr}, stream, mr); + start_row, + num_rows, + num_columns, + size_per_row, + column_start.data(), + column_size.data(), + input_data.data(), + input_nm.data(), + data->mutable_view().data()); + + return make_lists_column(num_rows, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, cudf::get_default_stream(), mr}, + stream, + mr); } -static inline bool are_all_fixed_width(std::vector const &schema) { - 
return std::all_of(schema.begin(), schema.end(), - [](const data_type &t) { return is_fixed_width(t); }); +static inline bool are_all_fixed_width(std::vector const& schema) +{ + return std::all_of( + schema.begin(), schema.end(), [](const data_type& t) { return is_fixed_width(t); }); } /** @@ -1289,9 +1351,10 @@ static inline bool are_all_fixed_width(std::vector const &schema) { * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout(std::vector const &schema, - std::vector &column_start, - std::vector &column_size) { +static inline int32_t compute_fixed_width_layout(std::vector const& schema, + std::vector& column_start, + std::vector& column_size) +{ // We guarantee that the start of each column is 64-bit aligned so anything can go // there, but to make the code simple we will still do an alignment for it. int32_t at_offset = 0; @@ -1299,7 +1362,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s size_type s = size_of(*col); column_size.emplace_back(s); std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types at_offset = util::round_up_unsafe(at_offset, static_cast(alignment_needed)); column_start.emplace_back(at_offset); at_offset += allocation_needed; @@ -1309,7 +1372,7 @@ static inline int32_t compute_fixed_width_layout(std::vector const &s // Eventually we can think about nullable vs not nullable, but for now we will just always add // it in int32_t const validity_bytes_needed = - util::div_rounding_up_safe(schema.size(), CHAR_BIT); + util::div_rounding_up_safe(schema.size(), CHAR_BIT); // validity comes at the end and is byte aligned so we can pack more in. at_offset += validity_bytes_needed; // Now we need to pad the end so all rows are 64 bit aligned @@ -1325,8 +1388,8 @@ struct column_info_s { std::vector column_sizes; std::vector variable_width_column_starts; - column_info_s &operator=(column_info_s const &other) = delete; - column_info_s &operator=(column_info_s &&other) = delete; + column_info_s& operator=(column_info_s const& other) = delete; + column_info_s& operator=(column_info_s&& other) = delete; }; /** @@ -1340,7 +1403,8 @@ struct column_info_s { * @return size of the fixed_width data portion of a row. */ template -column_info_s compute_column_information(iterator begin, iterator end) { +column_info_s compute_column_information(iterator begin, iterator end) +{ size_type size_per_row = 0; std::vector column_starts; std::vector column_sizes; @@ -1358,10 +1422,8 @@ column_info_s compute_column_information(iterator begin, iterator end) { // align size for this type - They are the same for fixed width types and 4 bytes for variable // width length/offset combos size_type const alignment_needed = compound_type ? 
__alignof(uint32_t) : col_size; - size_per_row = util::round_up_unsafe(size_per_row, alignment_needed); - if (compound_type) { - variable_width_column_starts.push_back(size_per_row); - } + size_per_row = util::round_up_unsafe(size_per_row, alignment_needed); + if (compound_type) { variable_width_column_starts.push_back(size_per_row); } column_starts.push_back(size_per_row); column_sizes.push_back(col_size); size_per_row += col_size; @@ -1373,9 +1435,11 @@ column_info_s compute_column_information(iterator begin, iterator end) { // validity is byte-aligned in the JCUDF format size_per_row += - util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); - return {size_per_row, std::move(column_starts), std::move(column_sizes), + return {size_per_row, + std::move(column_starts), + std::move(column_sizes), std::move(variable_width_column_starts)}; } @@ -1388,34 +1452,35 @@ column_info_s compute_column_information(iterator begin, iterator end) { * @param row_batches batched row information for multiple output locations * @return vector of `tile_info` structs for validity data */ -std::vector -build_validity_tile_infos(size_type const &num_columns, size_type const &num_rows, - size_type const &shmem_limit_per_tile, - std::vector const &row_batches) { +std::vector build_validity_tile_infos(size_type const& num_columns, + size_type const& num_rows, + size_type const& shmem_limit_per_tile, + std::vector const& row_batches) +{ auto const desired_rows_and_columns = static_cast(sqrt(shmem_limit_per_tile)); - auto const column_stride = util::round_up_unsafe( - [&]() { - if (desired_rows_and_columns > num_columns) { - // not many columns, build a single tile for table width and ship it off - return num_columns; - } else { - return util::round_down_safe(desired_rows_and_columns, CHAR_BIT); - } - }(), - JCUDF_ROW_ALIGNMENT); + auto const column_stride = util::round_up_unsafe( + [&]() { + if (desired_rows_and_columns > num_columns) { + // not many columns, build a single tile for table width and ship it off + return num_columns; + } else { + return util::round_down_safe(desired_rows_and_columns, CHAR_BIT); + } + }(), + JCUDF_ROW_ALIGNMENT); // we fit as much as we can given the column stride note that an element in the table takes just 1 // bit, but a row with a single element still takes 8 bytes! 
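/*
 * A host-side sketch of the row-size arithmetic performed by
 * compute_fixed_width_layout / compute_column_information above: each
 * fixed-width column is aligned to its own size, one validity bit per column is
 * packed into bytes at the end, and the whole row is padded to the 8-byte JCUDF
 * alignment. Names are illustrative only.
 */
#include <cstdint>
#include <vector>

inline int32_t sketch_round_up(int32_t v, int32_t align) { return ((v + align - 1) / align) * align; }

inline int32_t sketch_fixed_width_row_size(std::vector<int32_t> const& col_sizes)
{
  int32_t offset = 0;
  for (auto const s : col_sizes) {
    offset = sketch_round_up(offset, s);  // fixed-width types align to their own size
    offset += s;
  }
  // validity: one bit per column, byte-aligned at the end of the row
  offset += (static_cast<int32_t>(col_sizes.size()) + 7) / 8;
  // every row starts on an 8-byte boundary
  return sketch_round_up(offset, 8);
}

// e.g. columns of 8, 4 and 1 bytes: 8 + 4 + 1 = 13 data bytes, + 1 validity byte = 14, padded to 16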
- auto const bytes_per_row = util::round_up_safe( - util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT); + auto const bytes_per_row = + util::round_up_safe(util::div_rounding_up_unsafe(column_stride, CHAR_BIT), JCUDF_ROW_ALIGNMENT); auto const row_stride = - std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); + std::min(num_rows, util::round_down_safe(shmem_limit_per_tile / bytes_per_row, 64)); std::vector validity_tile_infos; validity_tile_infos.reserve(num_columns / column_stride * num_rows / row_stride); for (int col = 0; col < num_columns; col += column_stride) { int current_tile_row_batch = 0; - int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; - int row = 0; + int rows_left_in_batch = row_batches[current_tile_row_batch].row_count; + int row = 0; while (row < num_rows) { if (rows_left_in_batch == 0) { current_tile_row_batch++; @@ -1423,8 +1488,11 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row } int const tile_height = std::min(row_stride, rows_left_in_batch); validity_tile_infos.emplace_back( - detail::tile_info{col, row, std::min(col + column_stride - 1, num_columns - 1), - row + tile_height - 1, current_tile_row_batch}); + detail::tile_info{col, + row, + std::min(col + column_stride - 1, num_columns - 1), + row + tile_height - 1, + current_tile_row_batch}); row += tile_height; rows_left_in_batch -= tile_height; } @@ -1439,11 +1507,15 @@ build_validity_tile_infos(size_type const &num_columns, size_type const &num_row * * @tparam RowSize iterator that returns the size of a specific row */ -template struct row_size_functor { +template +struct row_size_functor { row_size_functor(size_type row_end, RowSize row_sizes, size_type last_row_end) - : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) {} + : _row_end(row_end), _row_sizes(row_sizes), _last_row_end(last_row_end) + { + } - __device__ inline uint64_t operator()(int i) const { + __device__ inline uint64_t operator()(int i) const + { return i >= _row_end ? 0 : _row_sizes[i + _last_row_end]; } @@ -1465,11 +1537,15 @@ template struct row_size_functor { * device_uvector of row offsets */ template -batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +batch_data build_batches(size_type num_rows, + RowSize row_sizes, + bool all_fixed_width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); auto const num_batches = static_cast( - util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); + util::div_rounding_up_safe(total_size, static_cast(MAX_BATCH_SIZE))); auto const num_offsets = num_batches + 1; std::vector row_batches; std::vector batch_row_boundaries; @@ -1480,8 +1556,8 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w batch_row_boundaries.push_back(0); size_type last_row_end = 0; device_uvector cumulative_row_sizes(num_rows, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, - cumulative_row_sizes.begin()); + thrust::inclusive_scan( + rmm::exec_policy(stream), row_sizes, row_sizes + num_rows, cumulative_row_sizes.begin()); // This needs to be split this into 2 gig batches. Care must be taken to avoid a batch larger than // 2 gigs. Imagine a table with 900 meg rows. 
The batches should occur every 2 rows, but if a @@ -1495,21 +1571,21 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w while (last_row_end < num_rows) { auto offset_row_sizes = thrust::make_transform_iterator( - cumulative_row_sizes.begin(), - [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { - return i - cumulative_row_sizes[last_row_end]; - }); + cumulative_row_sizes.begin(), + [last_row_end, cumulative_row_sizes = cumulative_row_sizes.data()] __device__(auto i) { + return i - cumulative_row_sizes[last_row_end]; + }); auto search_start = offset_row_sizes + last_row_end; - auto search_end = offset_row_sizes + num_rows; + auto search_end = offset_row_sizes + num_rows; // find the next MAX_BATCH_SIZE boundary auto const lb = - thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); + thrust::lower_bound(rmm::exec_policy(stream), search_start, search_end, MAX_BATCH_SIZE); size_type const batch_size = lb - search_start; - size_type const row_end = lb == search_end ? - batch_size + last_row_end : - last_row_end + util::round_down_safe(batch_size, 32); + size_type const row_end = lb == search_end + ? batch_size + last_row_end + : last_row_end + util::round_down_safe(batch_size, 32); // build offset list for each row in this batch auto const num_rows_in_batch = row_end - last_row_end; @@ -1519,10 +1595,12 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w device_uvector output_batch_row_offsets(num_entries, stream, mr); auto row_size_iter_bounded = cudf::detail::make_counting_transform_iterator( - 0, row_size_functor(row_end, row_sizes, last_row_end)); + 0, row_size_functor(row_end, row_sizes, last_row_end)); - thrust::exclusive_scan(rmm::exec_policy(stream), row_size_iter_bounded, - row_size_iter_bounded + num_entries, output_batch_row_offsets.begin()); + thrust::exclusive_scan(rmm::exec_policy(stream), + row_size_iter_bounded, + row_size_iter_bounded + num_entries, + output_batch_row_offsets.begin()); auto const batch_bytes = output_batch_row_offsets.element(num_rows_in_batch, stream); @@ -1530,8 +1608,10 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w // needs to be individually allocated, but the kernel needs a contiguous array of offsets or // more global lookups are necessary. 
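/*
 * A host-side sketch of the batch-splitting logic in build_batches, using the
 * standard library in place of thrust: a prefix sum of the row sizes is searched
 * for the 2 GB limit, and every boundary except the final one is rounded down to
 * a multiple of 32 rows so a validity word never straddles two batches. Names
 * are illustrative only, and (like the code above) it assumes no batch ends up
 * shorter than 32 rows.
 */
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

inline std::vector<int> sketch_batch_boundaries(std::vector<uint64_t> const& row_sizes,
                                                uint64_t max_batch_bytes)
{
  std::vector<uint64_t> cumulative(row_sizes.size());
  std::inclusive_scan(row_sizes.begin(), row_sizes.end(), cumulative.begin());

  std::vector<int> boundaries{0};  // row numbers where each batch starts: 0, 1500, 2700, ...
  int const num_rows = static_cast<int>(row_sizes.size());
  uint64_t consumed  = 0;
  int last_row_end   = 0;
  while (last_row_end < num_rows) {
    // first row whose cumulative size would push this batch past the limit
    auto const it = std::lower_bound(
      cumulative.begin() + last_row_end, cumulative.end(), consumed + max_batch_bytes);
    int row_end = static_cast<int>(it - cumulative.begin());
    if (it != cumulative.end()) {
      row_end = last_row_end + ((row_end - last_row_end) / 32) * 32;  // keep validity words whole
    }
    boundaries.push_back(row_end);
    consumed     = cumulative[row_end - 1];
    last_row_end = row_end;
  }
  return boundaries;
}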
if (!all_fixed_width) { - cudaMemcpy(batch_row_offsets.data() + last_row_end, output_batch_row_offsets.data(), - num_rows_in_batch * sizeof(size_type), cudaMemcpyDeviceToDevice); + cudaMemcpy(batch_row_offsets.data() + last_row_end, + output_batch_row_offsets.data(), + num_rows_in_batch * sizeof(size_type), + cudaMemcpyDeviceToDevice); } batch_row_boundaries.push_back(row_end); @@ -1540,10 +1620,11 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w last_row_end = row_end; } - return {std::move(batch_row_offsets), - make_device_uvector_async(batch_row_boundaries, stream, - rmm::mr::get_current_device_resource()), - std::move(batch_row_boundaries), std::move(row_batches)}; + return { + std::move(batch_row_offsets), + make_device_uvector_async(batch_row_boundaries, stream, rmm::mr::get_current_device_resource()), + std::move(batch_row_boundaries), + std::move(row_batches)}; } /** @@ -1554,19 +1635,24 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w * @param stream stream to use * @return number of tiles necessary */ -int compute_tile_counts(device_span const &batch_row_boundaries, - int desired_tile_height, rmm::cuda_stream_view stream) { +int compute_tile_counts(device_span const& batch_row_boundaries, + int desired_tile_height, + rmm::cuda_stream_view stream) +{ size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), - [desired_tile_height, - batch_row_boundaries = - batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - - batch_row_boundaries[batch_index], - desired_tile_height); - }); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_batches, + num_tiles.begin(), + [desired_tile_height, + batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe( + batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], + desired_tile_height); + }); return thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); } @@ -1582,61 +1668,73 @@ int compute_tile_counts(device_span const &batch_row_boundaries * @param stream stream to use * @return number of tiles created */ -size_type -build_tiles(device_span tiles, - device_uvector const &batch_row_boundaries, // comes from build_batches - int column_start, int column_end, int desired_tile_height, int total_number_of_rows, - rmm::cuda_stream_view stream) { +size_type build_tiles( + device_span tiles, + device_uvector const& batch_row_boundaries, // comes from build_batches + int column_start, + int column_end, + int desired_tile_height, + int total_number_of_rows, + rmm::cuda_stream_view stream) +{ size_type const num_batches = batch_row_boundaries.size() - 1; device_uvector num_tiles(num_batches, stream); auto iter = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(stream), iter, iter + num_batches, num_tiles.begin(), - [desired_tile_height, - batch_row_boundaries = - batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { - return util::div_rounding_up_unsafe(batch_row_boundaries[batch_index + 1] - - batch_row_boundaries[batch_index], - desired_tile_height); - }); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + 
num_batches, + num_tiles.begin(), + [desired_tile_height, + batch_row_boundaries = batch_row_boundaries.data()] __device__(auto batch_index) -> size_type { + return util::div_rounding_up_unsafe( + batch_row_boundaries[batch_index + 1] - batch_row_boundaries[batch_index], + desired_tile_height); + }); size_type const total_tiles = - thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); + thrust::reduce(rmm::exec_policy(stream), num_tiles.begin(), num_tiles.end()); device_uvector tile_starts(num_batches + 1, stream); auto tile_iter = cudf::detail::make_counting_transform_iterator( - 0, [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { - return (i < num_batches) ? num_tiles[i] : 0; - }); - thrust::exclusive_scan(rmm::exec_policy(stream), tile_iter, tile_iter + num_batches + 1, - tile_starts.begin()); // in tiles + 0, [num_tiles = num_tiles.data(), num_batches] __device__(auto i) { + return (i < num_batches) ? num_tiles[i] : 0; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), + tile_iter, + tile_iter + num_batches + 1, + tile_starts.begin()); // in tiles thrust::transform( - rmm::exec_policy(stream), iter, iter + total_tiles, tiles.begin(), - [=, tile_starts = tile_starts.data(), - batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { - // what batch this tile falls in - auto const batch_index_iter = - thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); - auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; - // local index within the tile - int const local_tile_index = tile_index - tile_starts[batch_index]; - // the start row for this batch. - int const batch_row_start = batch_row_boundaries[batch_index]; - // the start row for this tile - int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); - // the end row for this tile - int const max_row = - std::min(total_number_of_rows - 1, - batch_index + 1 > num_batches ? - std::numeric_limits::max() : - static_cast(batch_row_boundaries[batch_index + 1]) - 1); - int const tile_row_end = - std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); - - // stuff the tile - return tile_info{column_start, tile_row_start, column_end, tile_row_end, - static_cast(batch_index)}; - }); + rmm::exec_policy(stream), + iter, + iter + total_tiles, + tiles.begin(), + [ =, + tile_starts = tile_starts.data(), + batch_row_boundaries = batch_row_boundaries.data()] __device__(size_type tile_index) { + // what batch this tile falls in + auto const batch_index_iter = + thrust::upper_bound(thrust::seq, tile_starts, tile_starts + num_batches, tile_index); + auto const batch_index = std::distance(tile_starts, batch_index_iter) - 1; + // local index within the tile + int const local_tile_index = tile_index - tile_starts[batch_index]; + // the start row for this batch. + int const batch_row_start = batch_row_boundaries[batch_index]; + // the start row for this tile + int const tile_row_start = batch_row_start + (local_tile_index * desired_tile_height); + // the end row for this tile + int const max_row = std::min(total_number_of_rows - 1, + batch_index + 1 > num_batches + ? 
std::numeric_limits::max() + : static_cast(batch_row_boundaries[batch_index + 1]) - 1); + int const tile_row_end = + std::min(batch_row_start + ((local_tile_index + 1) * desired_tile_height) - 1, max_row); + + // stuff the tile + return tile_info{ + column_start, tile_row_start, column_end, tile_row_end, static_cast(batch_index)}; + }); return total_tiles; } @@ -1654,13 +1752,16 @@ build_tiles(device_span tiles, * @param f callback function called when building a tile */ template -void determine_tiles(std::vector const &column_sizes, - std::vector const &column_starts, - size_type const first_row_batch_size, size_type const total_number_of_rows, - size_type const &shmem_limit_per_tile, TileCallback f) { +void determine_tiles(std::vector const& column_sizes, + std::vector const& column_starts, + size_type const first_row_batch_size, + size_type const total_number_of_rows, + size_type const& shmem_limit_per_tile, + TileCallback f) +{ // tile infos are organized with the tile going "down" the columns this provides the most // coalescing of memory access - int current_tile_width = 0; + int current_tile_width = 0; int current_tile_start_col = 0; // the ideal tile height has lots of 8-byte reads and 8-byte writes. The optimal read/write would @@ -1669,10 +1770,10 @@ void determine_tiles(std::vector const &column_sizes, // sizes. x * y = shared_mem_size. Which translates to x^2 = shared_mem_size since we want them // equal, so height and width are sqrt(shared_mem_size). The trick is that it's in bytes, not rows // or columns. - auto const square_bias = 32; // bias towards columns for performance reasons - auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); + auto const square_bias = 32; // bias towards columns for performance reasons + auto const optimal_square_len = static_cast(sqrt(shmem_limit_per_tile)); auto const desired_tile_height = util::round_up_safe( - std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); + std::min(optimal_square_len / square_bias, total_number_of_rows), cudf::detail::warp_size); auto const tile_height = std::clamp(desired_tile_height, 1, first_row_batch_size); int row_size = 0; @@ -1682,22 +1783,22 @@ void determine_tiles(std::vector const &column_sizes, auto const col_size = column_sizes[col]; // align size for this type - auto const alignment_needed = col_size; // They are the same for fixed width types - auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); + auto const alignment_needed = col_size; // They are the same for fixed width types + auto const row_size_aligned = util::round_up_unsafe(row_size, alignment_needed); auto const row_size_with_this_col = row_size_aligned + col_size; auto const row_size_with_end_pad = - util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); + util::round_up_unsafe(row_size_with_this_col, JCUDF_ROW_ALIGNMENT); if (row_size_with_end_pad * tile_height > shmem_limit_per_tile) { // too large, close this tile, generate vertical tiles and restart f(current_tile_start_col, col == 0 ? 
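// [Editorial aside — illustrative sketch, not part of the patch] compute_tile_counts and the
// prefix sum at the top of build_tiles amount to two small steps: ceil-divide each batch's row
// count by the tile height to get a per-batch tile count, then exclusive-scan those counts so
// each batch knows the index of its first tile. A host-side equivalent with hypothetical names:
#include <cstddef>
#include <numeric>
#include <vector>

std::vector<int> tile_start_indices(std::vector<int> const& batch_row_boundaries,
                                    int const tile_height)
{
  std::vector<int> counts;
  for (std::size_t b = 0; b + 1 < batch_row_boundaries.size(); ++b) {
    int const rows_in_batch = batch_row_boundaries[b + 1] - batch_row_boundaries[b];
    counts.push_back((rows_in_batch + tile_height - 1) / tile_height);  // ceil divide
  }
  std::vector<int> starts(counts.size() + 1, 0);
  std::partial_sum(counts.begin(), counts.end(), starts.begin() + 1);  // exclusive scan of counts
  return starts;  // starts.back() is the total tile count across all batches
}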
col : col - 1, tile_height); row_size = - util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); - row_size += col_size; // alignment required for shared memory tile boundary to match alignment - // of output row + util::round_up_unsafe((column_starts[col] + column_sizes[col]) & 7, alignment_needed); + row_size += col_size; // alignment required for shared memory tile boundary to match + // alignment of output row current_tile_start_col = col; - current_tile_width = 0; + current_tile_width = 0; } else { row_size = row_size_with_this_col; current_tile_width++; @@ -1725,155 +1826,196 @@ void determine_tiles(std::vector const &column_sizes, */ template std::vector> convert_to_rows( - table_view const &tbl, batch_data &batch_info, offsetFunctor offset_functor, - column_info_s const &column_info, - std::optional> variable_width_offsets, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { + table_view const& tbl, + batch_data& batch_info, + offsetFunctor offset_functor, + column_info_s const& column_info, + std::optional> variable_width_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#ifndef __CUDA_ARCH__ // __host__ code. +#ifndef __CUDA_ARCH__ // __host__ code. // Need to reduce total shmem available by the size of barriers in the kernel's shared memory total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ + util::round_up_unsafe(sizeof(cuda::barrier), 16ul); +#endif // __CUDA_ARCH__ auto const shmem_limit_per_tile = total_shmem_in_bytes; - auto const num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); auto const fixed_width_only = !variable_width_offsets.has_value(); - auto select_columns = [](auto const &tbl, auto column_predicate) { + auto select_columns = [](auto const& tbl, auto column_predicate) { std::vector cols; - std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), - [&](auto c) { return column_predicate(c); }); + std::copy_if(tbl.begin(), tbl.end(), std::back_inserter(cols), [&](auto c) { + return column_predicate(c); + }); return table_view(cols); }; - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, - rmm::mr::get_current_device_resource()); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, - rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async( + column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async( + column_info.column_starts, stream, rmm::mr::get_current_device_resource()); // Get the pointers to the input columnar data ready - auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { + auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return is_compound(c.type()) ? 
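// [Editorial aside — illustrative sketch, not part of the patch] The tile-height heuristic in
// determine_tiles above: aim for a roughly square tile in bytes (sqrt of the shared-memory
// budget), bias it toward wider tiles by a constant factor, round up to whole warps, and clamp
// to the first batch's row count. The constants mirror the patch; the function name is the
// editor's.
#include <algorithm>
#include <cmath>

int pick_tile_height(int const shmem_limit_per_tile, int const total_rows, int const first_batch_rows)
{
  constexpr int square_bias = 32;  // bias towards columns for performance reasons
  constexpr int warp_size   = 32;
  int const optimal_square_len =
    static_cast<int>(std::sqrt(static_cast<double>(shmem_limit_per_tile)));
  int const desired =
    ((std::min(optimal_square_len / square_bias, total_rows) + warp_size - 1) / warp_size) *
    warp_size;
  return std::clamp(desired, 1, first_batch_rows);
}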
nullptr : c.template data(); }); - std::vector input_data(data_begin, data_begin + tbl.num_columns()); + std::vector input_data(data_begin, data_begin + tbl.num_columns()); // validity code handles variable and fixed-width data, so give it everything auto const nm_begin = - thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { return c.null_mask(); }); - std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); + thrust::make_transform_iterator(tbl.begin(), [](auto const& c) { return c.null_mask(); }); + std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); auto dev_input_data = - make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); auto dev_input_nm = - make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; std::vector output_buffers; - std::vector output_data; + std::vector output_data; output_data.reserve(batch_info.row_batches.size()); output_buffers.reserve(batch_info.row_batches.size()); - std::transform(batch_info.row_batches.begin(), batch_info.row_batches.end(), - std::back_inserter(output_buffers), [&](auto const &batch) { - return rmm::device_buffer(batch.num_bytes, stream, mr); - }); - std::transform(output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), - [](auto &buf) { return static_cast(buf.data()); }); + std::transform( + batch_info.row_batches.begin(), + batch_info.row_batches.end(), + std::back_inserter(output_buffers), + [&](auto const& batch) { return rmm::device_buffer(batch.num_bytes, stream, mr); }); + std::transform( + output_buffers.begin(), output_buffers.end(), std::back_inserter(output_data), [](auto& buf) { + return static_cast(buf.data()); + }); auto dev_output_data = make_device_uvector_async(output_data, stream, mr); int info_count = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, first_batch_size, num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, - &stream](int const start_col, int const end_col, int const tile_height) { - int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - info_count += i; - }); + column_info.column_sizes, + column_info.column_starts, + first_batch_size, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const tile_height) { + int i = detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); + info_count += i; + }); // allocate space for tiles device_uvector gpu_tile_infos(info_count, stream); int tile_offset = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, first_batch_size, num_rows, - shmem_limit_per_tile, - [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, &gpu_tile_infos, num_rows, - &tile_offset, stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); - }); + column_info.column_sizes, + 
column_info.column_starts, + first_batch_size, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries = batch_info.d_batch_row_boundaries, + &gpu_tile_infos, + num_rows, + &tile_offset, + stream](int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, + start_col, + end_col, + tile_height, + num_rows, + stream); + }); // build validity tiles for ALL columns, variable and fixed width. auto validity_tile_infos = detail::build_validity_tile_infos( - tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); + tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, - rmm::mr::get_current_device_resource()); + auto dev_validity_tile_infos = + make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); auto const validity_offset = column_info.column_starts.back(); // blast through the entire table and convert it - detail::copy_to_rows<<>>( - num_rows, tbl.num_columns(), shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), - dev_col_sizes.data(), dev_col_starts.data(), offset_functor, - batch_info.d_batch_row_boundaries.data(), - reinterpret_cast(dev_output_data.data())); + detail::copy_to_rows<<>>(num_rows, + tbl.num_columns(), + shmem_limit_per_tile, + gpu_tile_infos, + dev_input_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + offset_functor, + batch_info.d_batch_row_boundaries.data(), + reinterpret_cast(dev_output_data.data())); // note that validity gets the entire table and not the fixed-width portion detail::copy_validity_to_rows<<>>( - num_rows, tbl.num_columns(), shmem_limit_per_tile, offset_functor, - batch_info.d_batch_row_boundaries.data(), dev_output_data.data(), validity_offset, - dev_validity_tile_infos, dev_input_nm.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + tbl.num_columns(), + shmem_limit_per_tile, + offset_functor, + batch_info.d_batch_row_boundaries.data(), + dev_output_data.data(), + validity_offset, + dev_validity_tile_infos, + dev_input_nm.data()); if (!fixed_width_only) { // build table view for variable-width data only auto const variable_width_table = - select_columns(tbl, [](auto col) { return is_compound(col.type()); }); + select_columns(tbl, [](auto col) { return is_compound(col.type()); }); CUDF_EXPECTS(!variable_width_table.is_empty(), "No variable-width columns when expected!"); CUDF_EXPECTS(variable_width_offsets.has_value(), "No variable width offset data!"); auto const variable_data_begin = - thrust::make_transform_iterator(variable_width_table.begin(), [](auto const &c) { - strings_column_view const scv{c}; - return is_compound(c.type()) ? scv.chars().template data() : nullptr; - }); - std::vector variable_width_input_data( - variable_data_begin, variable_data_begin + variable_width_table.num_columns()); + thrust::make_transform_iterator(variable_width_table.begin(), [](auto const& c) { + strings_column_view const scv{c}; + return is_compound(c.type()) ? 
scv.chars().template data() : nullptr; + }); + std::vector variable_width_input_data( + variable_data_begin, variable_data_begin + variable_width_table.num_columns()); auto dev_variable_input_data = make_device_uvector_async( - variable_width_input_data, stream, rmm::mr::get_current_device_resource()); + variable_width_input_data, stream, rmm::mr::get_current_device_resource()); auto dev_variable_col_output_offsets = make_device_uvector_async( - column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); + column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); for (uint i = 0; i < batch_info.row_batches.size(); i++) { auto const batch_row_offset = batch_info.batch_row_boundaries[i]; - auto const batch_num_rows = batch_info.row_batches[i].row_count; - - dim3 const string_blocks(std::min( - MAX_STRING_BLOCKS, - util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); - - detail::copy_strings_to_rows<<>>( - batch_num_rows, variable_width_table.num_columns(), dev_variable_input_data.data(), - dev_variable_col_output_offsets.data(), variable_width_offsets->data(), - column_info.size_per_row, offset_functor, batch_row_offset, - reinterpret_cast(output_data[i])); + auto const batch_num_rows = batch_info.row_batches[i].row_count; + + dim3 const string_blocks( + std::min(MAX_STRING_BLOCKS, + util::div_rounding_up_unsafe(batch_num_rows, NUM_STRING_ROWS_PER_BLOCK_TO_ROWS))); + + detail::copy_strings_to_rows<<>>(batch_num_rows, + variable_width_table.num_columns(), + dev_variable_input_data.data(), + dev_variable_col_output_offsets.data(), + variable_width_offsets->data(), + column_info.size_per_row, + offset_functor, + batch_row_offset, + reinterpret_cast(output_data[i])); } } @@ -1882,28 +2024,36 @@ std::vector> convert_to_rows( std::vector> ret; ret.reserve(batch_info.row_batches.size()); auto counting_iter = thrust::make_counting_iterator(0); - std::transform(counting_iter, counting_iter + batch_info.row_batches.size(), - std::back_inserter(ret), [&](auto batch) { + std::transform(counting_iter, + counting_iter + batch_info.row_batches.size(), + std::back_inserter(ret), + [&](auto batch) { auto const offset_count = batch_info.row_batches[batch].row_offsets.size(); - auto offsets = std::make_unique( - data_type{type_id::INT32}, (size_type)offset_count, - batch_info.row_batches[batch].row_offsets.release(), - rmm::device_buffer{}, 0); + auto offsets = + std::make_unique(data_type{type_id::INT32}, + (size_type)offset_count, + batch_info.row_batches[batch].row_offsets.release(), + rmm::device_buffer{}, + 0); auto data = std::make_unique(data_type{type_id::INT8}, batch_info.row_batches[batch].num_bytes, std::move(output_buffers[batch]), rmm::device_buffer{}, 0); - return make_lists_column( - batch_info.row_batches[batch].row_count, std::move(offsets), std::move(data), - 0, rmm::device_buffer{0, cudf::get_default_stream(), mr}, stream, mr); + return make_lists_column(batch_info.row_batches[batch].row_count, + std::move(offsets), + std::move(data), + 0, + rmm::device_buffer{0, cudf::get_default_stream(), mr}, + stream, + mr); }); return ret; } -} // namespace detail +} // namespace detail /** * @brief convert a cudf table to JCUDF row format @@ -1913,14 +2063,15 @@ std::vector> convert_to_rows( * @param mr memory resource used for returned data * @return vector of list columns containing byte columns of the JCUDF row data */ -std::vector> convert_to_rows(table_view const &tbl, +std::vector> 
convert_to_rows(table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ auto const num_columns = tbl.num_columns(); - auto const num_rows = tbl.num_rows(); + auto const num_rows = tbl.num_rows(); auto const fixed_width_only = std::all_of( - tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + tbl.begin(), tbl.end(), [](column_view const& c) { return is_fixed_width(c.type()); }); // Break up the work into tiles, which are a starting and ending row/col #. This tile size is // calculated based on the shared memory size available we want a single tile to fill up the @@ -1936,94 +2087,107 @@ std::vector> convert_to_rows(table_view const &tbl, // before building the tiles so the tiles can be properly cut around them. auto schema_column_iter = - thrust::make_transform_iterator(tbl.begin(), [](auto const &i) { return i.type(); }); + thrust::make_transform_iterator(tbl.begin(), [](auto const& i) { return i.type(); }); auto column_info = - detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns); + detail::compute_column_information(schema_column_iter, schema_column_iter + num_columns); auto const size_per_row = column_info.size_per_row; if (fixed_width_only) { // total encoded row size. This includes fixed-width data and validity only. It does not include // variable-width data since it isn't copied with the fixed-width and validity kernel. auto row_size_iter = thrust::make_constant_iterator( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); + util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); detail::fixed_width_row_offset_functor offset_functor( - util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); + util::round_up_unsafe(size_per_row, JCUDF_ROW_ALIGNMENT)); - return detail::convert_to_rows(tbl, batch_info, offset_functor, std::move(column_info), - std::nullopt, stream, mr); + return detail::convert_to_rows( + tbl, batch_info, offset_functor, std::move(column_info), std::nullopt, stream, mr); } else { auto offset_data = detail::build_string_row_offsets(tbl, size_per_row, stream); - auto &row_sizes = std::get<0>(offset_data); + auto& row_sizes = std::get<0>(offset_data); auto row_size_iter = cudf::detail::make_counting_transform_iterator( - 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); + 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); - return detail::convert_to_rows(tbl, batch_info, offset_functor, std::move(column_info), - std::make_optional(std::move(std::get<1>(offset_data))), stream, + return detail::convert_to_rows(tbl, + batch_info, + offset_functor, + std::move(column_info), + std::make_optional(std::move(std::get<1>(offset_data))), + stream, mr); } } -std::vector> -convert_to_rows_fixed_width_optimized(table_view const &tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { +std::vector> convert_to_rows_fixed_width_optimized( + table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ auto const num_columns = tbl.num_columns(); std::vector schema; schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), - [](auto i) -> data_type { return 
i.type(); }); + std::transform( + tbl.begin(), tbl.end(), schema.begin(), [](auto i) -> data_type { return i.type(); }); if (detail::are_all_fixed_width(schema)) { std::vector column_start; std::vector column_size; int32_t const size_per_row = - detail::compute_fixed_width_layout(schema, column_start, column_size); + detail::compute_fixed_width_layout(schema, column_start, column_size); auto dev_column_start = make_device_uvector_async(column_start, stream, mr); - auto dev_column_size = make_device_uvector_async(column_size, stream, mr); + auto dev_column_size = make_device_uvector_async(column_size, stream, mr); // Make the number of rows per batch a multiple of 32 so we don't have to worry about splitting // validity at a specific row offset. This might change in the future. auto const max_rows_per_batch = - util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); + util::round_down_safe(std::numeric_limits::max() / size_per_row, 32); auto const num_rows = tbl.num_rows(); // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; + std::vector input_data; + std::vector input_nm; for (size_type column_number = 0; column_number < num_columns; column_number++) { column_view cv = tbl.column(column_number); input_data.emplace_back(cv.data()); input_nm.emplace_back(cv.null_mask()); } auto dev_input_data = make_device_uvector_async(input_data, stream, mr); - auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); + auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); using ScalarType = scalar_type_t; - auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); + auto zero = make_numeric_scalar(data_type(type_id::INT32), stream.value()); zero->set_valid_async(true, stream); - static_cast(zero.get())->set_value(0, stream); + static_cast(zero.get())->set_value(0, stream); auto step = make_numeric_scalar(data_type(type_id::INT32), stream.value()); step->set_valid_async(true, stream); - static_cast(step.get())->set_value(static_cast(size_per_row), stream); + static_cast(step.get())->set_value(static_cast(size_per_row), stream); std::vector> ret; for (size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(detail::fixed_width_convert_to_rows( - row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, - dev_input_data, dev_input_nm, *zero, *step, stream, mr)); + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(detail::fixed_width_convert_to_rows(row_start, + row_count, + num_columns, + size_per_row, + dev_column_start, + dev_column_size, + dev_input_data, + dev_input_nm, + *zero, + *step, + stream, + mr)); } return ret; @@ -2036,14 +2200,14 @@ namespace { /// @brief Calculates and sets null counts for specified columns void fixup_null_counts(std::vector>& output_columns, - rmm::cuda_stream_view stream) { - for (auto &col : output_columns) { - col->set_null_count( - cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); + rmm::cuda_stream_view stream) +{ + for (auto& col : output_columns) { + col->set_null_count(cudf::detail::null_count(col->view().null_mask(), 0, col->size(), stream)); } } -} +} // namespace /** * @brief convert from JCUDF row format to cudf columns @@ -2054,12 +2218,13 @@ void fixup_null_counts(std::vector>& output_columns, * @param mr memory resource for returned data * @return cudf table of the data */ -std::unique_ptr convert_from_rows(lists_column_view const &input, - std::vector const &schema, +std::unique_ptr
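// [Editorial aside — illustrative sketch, not part of the patch] Both fixed-width paths above
// depend on the same row layout: each fixed-width column is aligned to its own byte width, a
// validity bit per column is packed after the data, and the whole row is padded to the 8-byte
// JCUDF row alignment. A host-side size calculation under those assumptions (function name is
// the editor's):
#include <cstdint>
#include <vector>

int32_t fixed_width_row_size(std::vector<int32_t> const& column_byte_widths)
{
  auto const align_up = [](int32_t v, int32_t a) { return ((v + a - 1) / a) * a; };
  int32_t offset = 0;
  for (auto const w : column_byte_widths) { offset = align_up(offset, w) + w; }
  // one validity bit per column, packed into whole bytes after the fixed-width data
  int32_t const validity_bytes = (static_cast<int32_t>(column_byte_widths.size()) + 7) / 8;
  return align_up(offset + validity_bytes, 8);  // pad the row to JCUDF_ROW_ALIGNMENT
}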
convert_from_rows(lists_column_view const& input, + std::vector const& schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { + rmm::mr::device_memory_resource* mr) +{ // verify that the types are what we expect - column_view child = input.child(); + column_view child = input.child(); auto const list_type = child.type().id(); CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); @@ -2079,19 +2244,19 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } auto const num_columns = string_schema.size(); - auto const num_rows = input.parent().size(); + auto const num_rows = input.parent().size(); int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; CUDF_CUDA_TRY( - cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); + cudaDeviceGetAttribute(&total_shmem_in_bytes, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); -#ifndef __CUDA_ARCH__ // __host__ code. +#ifndef __CUDA_ARCH__ // __host__ code. // Need to reduce total shmem available by the size of barriers in the kernel's shared memory total_shmem_in_bytes -= - util::round_up_unsafe(sizeof(cuda::barrier), 16ul); -#endif // __CUDA_ARCH__ + util::round_up_unsafe(sizeof(cuda::barrier), 16ul); +#endif // __CUDA_ARCH__ auto const shmem_limit_per_tile = total_shmem_in_bytes; @@ -2101,41 +2266,44 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, // Ideally we would check that the offsets are all the same, etc. but for now this is probably // fine CUDF_EXPECTS(size_per_row * num_rows <= child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, - rmm::mr::get_current_device_resource()); - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, - rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async( + column_info.column_starts, stream, rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async( + column_info.column_sizes, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; std::vector> string_row_offset_columns; std::vector> string_length_columns; - std::vector output_data; - std::vector output_nm; - std::vector string_row_offsets; - std::vector string_lengths; + std::vector output_data; + std::vector output_nm; + std::vector string_row_offsets; + std::vector string_lengths; for (auto i : schema) { - auto make_col = [&output_data, &output_nm](data_type type, size_type num_rows, bool include_nm, + auto make_col = [&output_data, &output_nm](data_type type, + size_type num_rows, + bool include_nm, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { - auto column = make_fixed_width_column( - type, num_rows, include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, - mr); + rmm::mr::device_memory_resource* mr) { + auto column = + make_fixed_width_column(type, + num_rows, + include_nm ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, + stream, + mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); - if (include_nm) { - output_nm.emplace_back(mut.null_mask()); - } + if (include_nm) { output_nm.emplace_back(mut.null_mask()); } return column; }; if (i.id() == type_id::STRING) { auto const int32type = data_type(type_id::INT32); auto offset_col = - make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); + make_col(int32type, num_rows, true, stream, rmm::mr::get_current_device_resource()); string_row_offsets.push_back(offset_col->mutable_view().data()); string_row_offset_columns.emplace_back(std::move(offset_col)); auto length_col = - make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); + make_col(int32type, num_rows, false, stream, rmm::mr::get_current_device_resource()); string_lengths.push_back(length_col->mutable_view().data()); string_length_columns.emplace_back(std::move(length_col)); // placeholder @@ -2146,138 +2314,191 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } auto dev_string_row_offsets = - make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); auto dev_string_lengths = - make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); // build the row_batches from the passed in list column std::vector row_batches; row_batches.push_back( - {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); + {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); auto dev_output_data = - make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); auto dev_output_nm = - make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); // only ever get a single batch when going from rows, so boundaries are 0, num_rows constexpr auto num_batches = 2; device_uvector gpu_batch_row_boundaries(num_batches, stream); - thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_batches), gpu_batch_row_boundaries.begin(), + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_batches), + gpu_batch_row_boundaries.begin(), [num_rows] __device__(auto i) { return i == 0 ? 0 : num_rows; }); int info_count = 0; - detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, num_rows, num_rows, shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &info_count, &stream](int const start_col, int const end_col, - int const tile_height) { - info_count += detail::compute_tile_counts(gpu_batch_row_boundaries, tile_height, stream); - }); + detail::determine_tiles(column_info.column_sizes, + column_info.column_starts, + num_rows, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &info_count, &stream]( + int const start_col, int const end_col, int const tile_height) { + info_count += detail::compute_tile_counts( + gpu_batch_row_boundaries, tile_height, stream); + }); // allocate space for tiles device_uvector gpu_tile_infos(info_count, stream); int tile_offset = 0; detail::determine_tiles( - column_info.column_sizes, column_info.column_starts, num_rows, num_rows, shmem_limit_per_tile, - [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, - stream](int const start_col, int const end_col, int const tile_height) { - tile_offset += detail::build_tiles( - {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, - gpu_batch_row_boundaries, start_col, end_col, tile_height, num_rows, stream); - }); + column_info.column_sizes, + column_info.column_starts, + num_rows, + num_rows, + shmem_limit_per_tile, + [&gpu_batch_row_boundaries, &gpu_tile_infos, num_rows, &tile_offset, stream]( + int const start_col, int const end_col, int const tile_height) { + tile_offset += detail::build_tiles( + {gpu_tile_infos.data() + tile_offset, gpu_tile_infos.size() - tile_offset}, + gpu_batch_row_boundaries, + start_col, + end_col, + tile_height, + num_rows, + stream); + }); dim3 const blocks(gpu_tile_infos.size()); // validity needs to be calculated based on the 
actual number of final table columns auto validity_tile_infos = - detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); + detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, - rmm::mr::get_current_device_resource()); + auto dev_validity_tile_infos = + make_device_uvector_async(validity_tile_infos, stream, rmm::mr::get_current_device_resource()); dim3 const validity_blocks(validity_tile_infos.size()); if (dev_string_row_offsets.size() == 0) { detail::fixed_width_row_offset_functor offset_functor(size_per_row); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_data.data(), dev_col_sizes.data(), - dev_col_starts.data(), gpu_tile_infos, child.data()); + detail::copy_from_rows<<>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + gpu_tile_infos, + child.data()); detail::copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_nm.data(), column_info.column_starts.back(), - dev_validity_tile_infos, child.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_nm.data(), + column_info.column_starts.back(), + dev_validity_tile_infos, + child.data()); } else { detail::string_row_offset_functor offset_functor(device_span{input.offsets()}); - detail::copy_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_data.data(), dev_col_sizes.data(), - dev_col_starts.data(), gpu_tile_infos, child.data()); + detail::copy_from_rows<<>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_data.data(), + dev_col_sizes.data(), + dev_col_starts.data(), + gpu_tile_infos, + child.data()); detail::copy_validity_from_rows<<>>( - num_rows, num_columns, shmem_limit_per_tile, offset_functor, - gpu_batch_row_boundaries.data(), dev_output_nm.data(), column_info.column_starts.back(), - dev_validity_tile_infos, child.data()); + total_shmem_in_bytes, + stream.value()>>>(num_rows, + num_columns, + shmem_limit_per_tile, + offset_functor, + gpu_batch_row_boundaries.data(), + dev_output_nm.data(), + column_info.column_starts.back(), + dev_validity_tile_infos, + child.data()); std::vector> string_col_offsets; std::vector> string_data_cols; - std::vector string_col_offset_ptrs; - std::vector string_data_col_ptrs; - for (auto &col_string_lengths : string_lengths) { + std::vector string_col_offset_ptrs; + std::vector string_data_col_ptrs; + for (auto& col_string_lengths : string_lengths) { device_uvector output_string_offsets(num_rows + 1, stream, mr); - auto tmp = [num_rows, col_string_lengths] __device__(auto const &i) { + auto tmp = [num_rows, col_string_lengths] __device__(auto const& i) { return i < num_rows ? 
col_string_lengths[i] : 0; }; auto bounded_iter = cudf::detail::make_counting_transform_iterator(0, tmp); - thrust::exclusive_scan(rmm::exec_policy(stream), bounded_iter, bounded_iter + num_rows + 1, + thrust::exclusive_scan(rmm::exec_policy(stream), + bounded_iter, + bounded_iter + num_rows + 1, output_string_offsets.begin()); // allocate destination string column - rmm::device_uvector string_data(output_string_offsets.element(num_rows, stream), stream, - mr); + rmm::device_uvector string_data( + output_string_offsets.element(num_rows, stream), stream, mr); string_col_offset_ptrs.push_back(output_string_offsets.data()); string_data_col_ptrs.push_back(string_data.data()); string_col_offsets.push_back(std::move(output_string_offsets)); string_data_cols.push_back(std::move(string_data)); } - auto dev_string_col_offsets = make_device_uvector_async(string_col_offset_ptrs, stream, - rmm::mr::get_current_device_resource()); - auto dev_string_data_cols = make_device_uvector_async(string_data_col_ptrs, stream, - rmm::mr::get_current_device_resource()); + auto dev_string_col_offsets = make_device_uvector_async( + string_col_offset_ptrs, stream, rmm::mr::get_current_device_resource()); + auto dev_string_data_cols = make_device_uvector_async( + string_data_col_ptrs, stream, rmm::mr::get_current_device_resource()); dim3 const string_blocks( - std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), - MAX_STRING_BLOCKS)); + std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), + MAX_STRING_BLOCKS)); - detail::copy_strings_from_rows<<>>( - offset_functor, dev_string_row_offsets.data(), dev_string_lengths.data(), - dev_string_col_offsets.data(), dev_string_data_cols.data(), child.data(), num_rows, - static_cast(string_col_offsets.size())); + offset_functor, + dev_string_row_offsets.data(), + dev_string_lengths.data(), + dev_string_col_offsets.data(), + dev_string_data_cols.data(), + child.data(), + num_rows, + static_cast(string_col_offsets.size())); // merge strings back into output_columns int string_idx = 0; for (int i = 0; i < static_cast(schema.size()); ++i) { if (schema[i].id() == type_id::STRING) { // stuff real string column - auto string_data = string_row_offset_columns[string_idx].release()->release(); - output_columns[i] = - make_strings_column(num_rows, std::move(string_col_offsets[string_idx]), - std::move(string_data_cols[string_idx]), - std::move(*string_data.null_mask.release()), 0); + auto string_data = string_row_offset_columns[string_idx].release()->release(); + output_columns[i] = make_strings_column(num_rows, + std::move(string_col_offsets[string_idx]), + std::move(string_data_cols[string_idx]), + std::move(*string_data.null_mask.release()), + 0); // Null count set to 0, temporarily. Will be fixed up before return. string_idx++; } @@ -2292,11 +2513,13 @@ std::unique_ptr
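// [Editorial aside — illustrative sketch, not part of the patch] The string reassembly above
// reduces, per string column, to turning the recovered per-row lengths into a standard cudf
// offsets array: scan the lengths into num_rows + 1 offsets, with the final entry giving the
// chars buffer size that copy_strings_from_rows then fills. A host-side equivalent:
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<int32_t> lengths_to_offsets(std::vector<int32_t> const& lengths)
{
  std::vector<int32_t> offsets(lengths.size() + 1, 0);
  std::inclusive_scan(lengths.begin(), lengths.end(), offsets.begin() + 1);
  return offsets;  // offsets.back() is the total chars size for this column
}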
<table> convert_from_rows(lists_column_view const &input, return std::make_unique<table>
(std::move(output_columns)); } -std::unique_ptr<table>
convert_from_rows_fixed_width_optimized( - lists_column_view const &input, std::vector<data_type> const &schema, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { +std::unique_ptr<table>
convert_from_rows_fixed_width_optimized(lists_column_view const& input, + std::vector const& schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ // verify that the types are what we expect - column_view child = input.child(); + column_view child = input.child(); auto const list_type = child.type().id(); CUDF_EXPECTS(list_type == type_id::INT8 || list_type == type_id::UINT8, "Only a list of bytes is supported as input"); @@ -2307,7 +2530,7 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( std::vector column_start; std::vector column_size; - auto const num_rows = input.parent().size(); + auto const num_rows = input.parent().size(); auto const size_per_row = detail::compute_fixed_width_layout(schema, column_start, column_size); // Ideally we would check that the offsets are all the same, etc. but for now this is probably @@ -2315,17 +2538,17 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); auto dev_column_start = - make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); auto dev_column_size = - make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); + make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; - std::vector output_data; - std::vector output_nm; + std::vector output_data; + std::vector output_nm; for (int i = 0; i < static_cast(num_columns); i++) { auto column = - make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); + make_fixed_width_column(schema[i], num_rows, mask_state::UNINITIALIZED, stream, mr); auto mut = column->mutable_view(); output_data.emplace_back(mut.data()); output_nm.emplace_back(mut.null_mask()); @@ -2333,16 +2556,22 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( } auto dev_output_data = make_device_uvector_async(output_data, stream, mr); - auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); + auto dev_output_nm = make_device_uvector_async(output_nm, stream, mr); dim3 blocks; dim3 threads; int shared_size = - detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + detail::calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); detail::copy_from_rows_fixed_width_optimized<<>>( - num_rows, num_columns, size_per_row, dev_column_start.data(), dev_column_size.data(), - dev_output_data.data(), dev_output_nm.data(), child.data()); + num_rows, + num_columns, + size_per_row, + dev_column_start.data(), + dev_column_size.data(), + dev_output_data.data(), + dev_output_nm.data(), + child.data()); // Set null counts, because output_columns are modified via mutable-view, // in the kernel above. @@ -2355,4 +2584,4 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( } } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/row_conversion.hpp b/src/main/cpp/src/row_conversion.hpp index 635960ad14..84ee729d55 100644 --- a/src/main/cpp/src/row_conversion.hpp +++ b/src/main/cpp/src/row_conversion.hpp @@ -48,4 +48,4 @@ std::unique_ptr convert_from_rows( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/zorder.cu b/src/main/cpp/src/zorder.cu index 028a30d66c..c0f21b9b3a 100644 --- a/src/main/cpp/src/zorder.cu +++ b/src/main/cpp/src/zorder.cu @@ -36,27 +36,32 @@ namespace { template struct uint_backed_array { uint_backed_array() = delete; - __device__ explicit uint_backed_array(int32_t num_bits_per_entry): data(0), - num_bits_per_entry(num_bits_per_entry), mask(static_cast((1L << num_bits_per_entry) - 1)) {} + __device__ explicit uint_backed_array(int32_t num_bits_per_entry) + : data(0), + num_bits_per_entry(num_bits_per_entry), + mask(static_cast((1L << num_bits_per_entry) - 1)) + { + } - __device__ uint32_t operator[](int32_t i) const { + __device__ uint32_t operator[](int32_t i) const + { int32_t offset = num_bits_per_entry * i; return (data >> offset) & mask; } - __device__ void set(int32_t i, uint32_t value) { - int32_t offset = i * num_bits_per_entry; + __device__ void set(int32_t i, uint32_t value) + { + int32_t offset = i * num_bits_per_entry; data_type masked_data = data & ~(static_cast(mask) << offset); - data = masked_data | (static_cast(value & mask) << offset); + data = masked_data | (static_cast(value & mask) << offset); } -private: + private: data_type data; int32_t const num_bits_per_entry; uint32_t const mask; }; - // Most of the hilbert index code is based off of the work done by David Moten at // https://github.com/davidmoten/hilbert-curve, which has the following Note in // the code too @@ -65,17 +70,17 @@ private: // With thanks also to Paul Chernoch who published a C# algorithm for Skilling's // work on StackOverflow and // GitHub. 
-__device__ uint64_t to_hilbert_index(uint_backed_array const & transposed_index, - int32_t const num_bits_per_entry, int32_t const num_dimensions) { - uint64_t b = 0; +__device__ uint64_t to_hilbert_index(uint_backed_array const& transposed_index, + int32_t const num_bits_per_entry, + int32_t const num_dimensions) +{ + uint64_t b = 0; int32_t const length = num_bits_per_entry * num_dimensions; - int32_t b_index = length - 1; - uint64_t mask = 1L << (num_bits_per_entry - 1); + int32_t b_index = length - 1; + uint64_t mask = 1L << (num_bits_per_entry - 1); for (int32_t i = 0; i < num_bits_per_entry; i++) { for (int32_t j = 0; j < num_dimensions; j++) { - if ((transposed_index[j] & mask) != 0) { - b |= 1L << b_index; - } + if ((transposed_index[j] & mask) != 0) { b |= 1L << b_index; } b_index--; } mask >>= 1; @@ -84,11 +89,14 @@ __device__ uint64_t to_hilbert_index(uint_backed_array const & transpo return b; } -__device__ uint_backed_array hilbert_transposed_index(uint_backed_array const & point, - int32_t const num_bits_per_entry, int32_t const num_dimensions) { +__device__ uint_backed_array hilbert_transposed_index( + uint_backed_array const& point, + int32_t const num_bits_per_entry, + int32_t const num_dimensions) +{ uint32_t const M = 1L << (num_bits_per_entry - 1); - int32_t const n = num_dimensions; - auto x = point; + int32_t const n = num_dimensions; + auto x = point; uint32_t p, q, t; uint32_t i; @@ -97,14 +105,14 @@ __device__ uint_backed_array hilbert_transposed_index(uint_backed_arra p = q - 1; for (i = 0; i < n; i++) { if ((x[i] & q) != 0) { - x.set(0, x[0] ^ p); // invert + x.set(0, x[0] ^ p); // invert } else { t = (x[0] ^ x[i]) & p; x.set(0, x[0] ^ t); x.set(i, x[i] ^ t); } } - } // exchange + } // exchange // Gray encode for (i = 1; i < n; i++) { @@ -112,9 +120,7 @@ __device__ uint_backed_array hilbert_transposed_index(uint_backed_arra } t = 0; for (q = M; q > 1; q >>= 1) { - if ((x[n - 1] & q) != 0) { - t ^= q - 1; - } + if ((x[n - 1] & q) != 0) { t ^= q - 1; } } for (i = 0; i < n; i++) { @@ -124,21 +130,19 @@ __device__ uint_backed_array hilbert_transposed_index(uint_backed_arra return x; } - -} // namespace +} // namespace namespace spark_rapids_jni { -std::unique_ptr interleave_bits( - cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { - +std::unique_ptr interleave_bits(cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ auto num_columns = tbl.num_columns(); CUDF_EXPECTS(num_columns > 0, "The input table must have at least one column."); CUDF_EXPECTS(is_fixed_width(tbl.begin()->type()), "Only fixed width columns can be used"); - auto const type_id = tbl.begin()->type().id(); + auto const type_id = tbl.begin()->type().id(); auto const data_type_size = cudf::size_of(tbl.begin()->type()); CUDF_EXPECTS( std::all_of(tbl.begin(), @@ -152,14 +156,14 @@ std::unique_ptr interleave_bits( const cudf::size_type max_bytes_allowed = std::numeric_limits::max(); int64_t total_output_size = static_cast(num_rows) * data_type_size * num_columns; - CUDF_EXPECTS (total_output_size <= max_bytes_allowed, "Input is too large to process"); + CUDF_EXPECTS(total_output_size <= max_bytes_allowed, "Input is too large to process"); cudf::size_type output_size = static_cast(total_output_size); auto input_dv = cudf::table_device_view::create(tbl, stream); auto output_data_col = cudf::make_numeric_column( - cudf::data_type{cudf::type_id::UINT8}, output_size, cudf::mask_state::UNALLOCATED, stream, mr); 
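// [Editorial aside — illustrative sketch, not part of the patch] to_hilbert_index above only
// flattens the "transposed" representation (one bit-plane per dimension) into a single integer,
// taking one bit from each dimension per level, most significant level first. A compact
// host-side equivalent, with a plain vector standing in for uint_backed_array:
#include <cstdint>
#include <vector>

uint64_t flatten_transposed_index(std::vector<uint32_t> const& transposed, int const bits_per_dim)
{
  uint64_t out = 0;
  for (int bit = bits_per_dim - 1; bit >= 0; --bit) {
    for (auto const dim_value : transposed) {
      out = (out << 1) | ((dim_value >> bit) & 1u);  // one bit per dimension at this level
    }
  }
  return out;
}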
+ cudf::data_type{cudf::type_id::UINT8}, output_size, cudf::mask_state::UNALLOCATED, stream, mr); auto output_dv_ptr = cudf::mutable_column_device_view::create(*output_data_col, stream); @@ -167,76 +171,80 @@ std::unique_ptr interleave_bits( rmm::exec_policy(stream), thrust::make_counting_iterator(0), output_size, - [col = *output_dv_ptr, - num_columns, - data_type_size, - input = *input_dv] __device__ (cudf::size_type ret_idx) { - // The most significant byte needs to come from the most significant column, so we switch the order of the output - // bytes to match that - cudf::size_type const flipped_start_byte_index = (ret_idx / num_columns) * num_columns; - cudf::size_type const flipped_ret_idx = flipped_start_byte_index + (num_columns - 1 - (ret_idx - flipped_start_byte_index)); - - uint8_t ret_byte = 0; - for (cudf::size_type output_bit_offset = 7; output_bit_offset >= 0; output_bit_offset--) { - // The index (in bits) of the output bit we are computing right now - int64_t const output_bit_index = flipped_ret_idx * 8L + output_bit_offset; - - // The most significant bit should come from the most significant column, but 0 is - // our most significant column, so switch the order of the columns. - cudf::size_type const column_idx = num_columns - 1 - (output_bit_index % num_columns); - auto column = input.column(column_idx); - - // Also we need to convert the endian byte order when we read the bytes. - int64_t const bit_index_within_column = output_bit_index / num_columns; - cudf::size_type const little_endian_read_byte_index = bit_index_within_column / 8; - cudf::size_type const read_bit_offset = bit_index_within_column % 8; - cudf::size_type const input_row_number = little_endian_read_byte_index / data_type_size; - cudf::size_type const start_row_byte_index = input_row_number * data_type_size; - cudf::size_type const read_byte_index = start_row_byte_index + (data_type_size - 1 - (little_endian_read_byte_index - start_row_byte_index)); - - uint32_t const byte_data = column.is_valid(input_row_number) ? column.data()[read_byte_index] : 0; - uint32_t const tmp = ((byte_data >> read_bit_offset) & 1) << output_bit_offset; - ret_byte = static_cast(ret_byte | tmp); - } - col.data()[ret_idx] = ret_byte; - }); - - auto offset_begin = thrust::make_constant_iterator(data_type_size * num_columns); - auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column( - offset_begin, offset_begin + num_rows, stream, mr)); + [col = *output_dv_ptr, num_columns, data_type_size, input = *input_dv] __device__( + cudf::size_type ret_idx) { + // The most significant byte needs to come from the most significant column, so we switch the + // order of the output bytes to match that + cudf::size_type const flipped_start_byte_index = (ret_idx / num_columns) * num_columns; + cudf::size_type const flipped_ret_idx = + flipped_start_byte_index + (num_columns - 1 - (ret_idx - flipped_start_byte_index)); + + uint8_t ret_byte = 0; + for (cudf::size_type output_bit_offset = 7; output_bit_offset >= 0; output_bit_offset--) { + // The index (in bits) of the output bit we are computing right now + int64_t const output_bit_index = flipped_ret_idx * 8L + output_bit_offset; + + // The most significant bit should come from the most significant column, but 0 is + // our most significant column, so switch the order of the columns. 
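// [Editorial aside — illustrative sketch, not part of the patch] The per-byte loop above is a
// most-significant-bit-first interleave: output bit k takes bit (31 - k / num_columns) of column
// (k % num_columns), with column 0 treated as the most significant. A host-side version for a
// single row of 32-bit keys, ignoring the null handling and byte-order conversion the kernel
// also performs (names are the editor's):
#include <cstdint>
#include <vector>

std::vector<uint8_t> interleave_bits_msb_first(std::vector<uint32_t> const& keys)
{
  int const num_keys = static_cast<int>(keys.size());
  int const out_bits = num_keys * 32;
  std::vector<uint8_t> out(out_bits / 8, 0);
  for (int out_bit = 0; out_bit < out_bits; ++out_bit) {
    int const key_idx  = out_bit % num_keys;         // rotate through the columns
    int const bit_idx  = 31 - (out_bit / num_keys);  // most significant bit of each key first
    uint32_t const bit = (keys[key_idx] >> bit_idx) & 1u;
    out[out_bit / 8] |= static_cast<uint8_t>(bit << (7 - (out_bit % 8)));  // MSB-first bytes
  }
  return out;
}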
+ cudf::size_type const column_idx = num_columns - 1 - (output_bit_index % num_columns); + auto column = input.column(column_idx); + + // Also we need to convert the endian byte order when we read the bytes. + int64_t const bit_index_within_column = output_bit_index / num_columns; + cudf::size_type const little_endian_read_byte_index = bit_index_within_column / 8; + cudf::size_type const read_bit_offset = bit_index_within_column % 8; + cudf::size_type const input_row_number = little_endian_read_byte_index / data_type_size; + cudf::size_type const start_row_byte_index = input_row_number * data_type_size; + cudf::size_type const read_byte_index = + start_row_byte_index + + (data_type_size - 1 - (little_endian_read_byte_index - start_row_byte_index)); + + uint32_t const byte_data = + column.is_valid(input_row_number) ? column.data()[read_byte_index] : 0; + uint32_t const tmp = ((byte_data >> read_bit_offset) & 1) << output_bit_offset; + ret_byte = static_cast(ret_byte | tmp); + } + col.data()[ret_idx] = ret_byte; + }); + + auto offset_begin = thrust::make_constant_iterator(data_type_size * num_columns); + auto offsets_column = std::get<0>( + cudf::detail::make_offsets_child_column(offset_begin, offset_begin + num_rows, stream, mr)); return cudf::make_lists_column(num_rows, - std::move(offsets_column), - std::move(output_data_col), - 0, - rmm::device_buffer(), - stream, - mr); + std::move(offsets_column), + std::move(output_data_col), + 0, + rmm::device_buffer(), + stream, + mr); } -std::unique_ptr hilbert_index( - int32_t const num_bits_per_entry, - cudf::table_view const& tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { - - auto const num_rows = tbl.num_rows(); +std::unique_ptr hilbert_index(int32_t const num_bits_per_entry, + cudf::table_view const& tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = tbl.num_rows(); auto const num_columns = tbl.num_columns(); - CUDF_EXPECTS(num_bits_per_entry > 0 && num_bits_per_entry <= 32, "the number of bits must be >0 and <= 32."); - CUDF_EXPECTS(num_bits_per_entry * num_columns <= 64, "we only support up to 64 bits of output right now."); + CUDF_EXPECTS(num_bits_per_entry > 0 && num_bits_per_entry <= 32, + "the number of bits must be >0 and <= 32."); + CUDF_EXPECTS(num_bits_per_entry * num_columns <= 64, + "we only support up to 64 bits of output right now."); CUDF_EXPECTS(num_columns > 0, "at least one column is required."); - CUDF_EXPECTS( - std::all_of(tbl.begin(), - tbl.end(), - [](cudf::column_view const& col) { return col.type().id() == cudf::type_id::INT32; }), - "All columns of the input table must be INT32."); + CUDF_EXPECTS(std::all_of(tbl.begin(), + tbl.end(), + [](cudf::column_view const& col) { + return col.type().id() == cudf::type_id::INT32; + }), + "All columns of the input table must be INT32."); auto const input_dv = cudf::table_device_view::create(tbl, stream); auto output_data_col = cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); + cudf::data_type{cudf::type_id::INT64}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); auto const output_dv_ptr = cudf::mutable_column_device_view::create(*output_data_col, stream); @@ -245,22 +253,20 @@ std::unique_ptr hilbert_index( thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_rows, output_dv_ptr->begin(), - [num_bits_per_entry, - num_columns, - input = *input_dv] __device__ (cudf::size_type row_index) { - 
uint_backed_array row(num_bits_per_entry); - for (cudf::size_type column_index = 0; column_index < num_columns; column_index++) { - auto const column = input.column(column_index); - uint32_t const data = column.is_valid(row_index) ? column.data()[row_index] : 0; - row.set(column_index, data); - } - - auto const transposed_index = hilbert_transposed_index(row, num_bits_per_entry, num_columns); - return static_cast( - to_hilbert_index(transposed_index, num_bits_per_entry, num_columns)); - }); + [num_bits_per_entry, num_columns, input = *input_dv] __device__(cudf::size_type row_index) { + uint_backed_array row(num_bits_per_entry); + for (cudf::size_type column_index = 0; column_index < num_columns; column_index++) { + auto const column = input.column(column_index); + uint32_t const data = column.is_valid(row_index) ? column.data()[row_index] : 0; + row.set(column_index, data); + } + + auto const transposed_index = hilbert_transposed_index(row, num_bits_per_entry, num_columns); + return static_cast( + to_hilbert_index(transposed_index, num_bits_per_entry, num_columns)); + }); return output_data_col; } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/zorder.hpp b/src/main/cpp/src/zorder.hpp index 975ff79df8..9268f5b71e 100644 --- a/src/main/cpp/src/zorder.hpp +++ b/src/main/cpp/src/zorder.hpp @@ -35,4 +35,4 @@ std::unique_ptr hilbert_index( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp index f59b77ca05..1a93354339 100644 --- a/src/main/cpp/tests/cast_decimal_to_string.cpp +++ b/src/main/cpp/tests/cast_decimal_to_string.cpp @@ -30,8 +30,7 @@ using namespace cudf; template -struct DecimalToStringTests : public test::BaseFixture { -}; +struct DecimalToStringTests : public test::BaseFixture {}; TYPED_TEST_SUITE(DecimalToStringTests, cudf::test::FixedPointTypes); diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp index 598d570611..c736d5971f 100644 --- a/src/main/cpp/tests/cast_string.cpp +++ b/src/main/cpp/tests/cast_string.cpp @@ -30,14 +30,12 @@ using namespace cudf; template -struct StringToIntegerTests : public test::BaseFixture { -}; +struct StringToIntegerTests : public test::BaseFixture {}; struct StringToDecimalTests : public test::BaseFixture {}; template -struct StringToFloatTests : public test::BaseFixture { -}; +struct StringToFloatTests : public test::BaseFixture {}; TYPED_TEST_SUITE(StringToIntegerTests, cudf::test::IntegralTypesNotBool); TYPED_TEST_SUITE(StringToFloatTests, cudf::test::FloatingPointTypes); @@ -240,8 +238,8 @@ TYPED_TEST(StringToIntegerTests, Overflow) TYPED_TEST(StringToIntegerTests, Empty) { - auto empty = std::make_unique(data_type{type_id::STRING}, 0, rmm::device_buffer{}, - rmm::device_buffer{}, 0); + auto empty = std::make_unique( + data_type{type_id::STRING}, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); auto result = spark_rapids_jni::string_to_integer(data_type{type_to_id()}, strings_column_view{empty->view()}, @@ -542,8 +540,8 @@ TEST_F(StringToDecimalTests, Edges) TEST_F(StringToDecimalTests, Empty) { - auto empty = std::make_unique(data_type{type_id::STRING}, 0, rmm::device_buffer{}, - rmm::device_buffer{}, 0); + auto empty = std::make_unique( + data_type{type_id::STRING}, 0, rmm::device_buffer{}, 
rmm::device_buffer{}, 0); auto const result = spark_rapids_jni::string_to_decimal( 8, 2, strings_column_view{empty->view()}, false, true, cudf::get_default_stream()); @@ -698,8 +696,8 @@ TYPED_TEST(StringToFloatTests, TrickyValues) TYPED_TEST(StringToFloatTests, Empty) { - auto empty = std::make_unique(data_type{type_id::STRING}, 0, rmm::device_buffer{}, - rmm::device_buffer{}, 0); + auto empty = std::make_unique( + data_type{type_id::STRING}, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); auto const result = spark_rapids_jni::string_to_float(data_type{type_to_id()}, strings_column_view{empty->view()}, diff --git a/src/main/cpp/tests/row_conversion.cpp b/src/main/cpp/tests/row_conversion.cpp index e140918f09..7e104c3871 100644 --- a/src/main/cpp/tests/row_conversion.cpp +++ b/src/main/cpp/tests/row_conversion.cpp @@ -35,10 +35,8 @@ #include -struct ColumnToRowTests : public cudf::test::BaseFixture { -}; -struct RowToColumnTests : public cudf::test::BaseFixture { -}; +struct ColumnToRowTests : public cudf::test::BaseFixture {}; +struct RowToColumnTests : public cudf::test::BaseFixture {}; TEST_F(ColumnToRowTests, Single) { @@ -51,7 +49,8 @@ TEST_F(ColumnToRowTests, Single) EXPECT_EQ(old_rows.size(), new_rows.size()); for (uint i = 0; i < old_rows.size(); ++i) { - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); @@ -212,7 +211,8 @@ TEST_F(ColumnToRowTests, Simple) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -234,7 +234,8 @@ TEST_F(ColumnToRowTests, Tall) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -260,7 +261,8 @@ TEST_F(ColumnToRowTests, Wide) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -288,7 +290,8 @@ TEST_F(ColumnToRowTests, SingleByteWide) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -319,7 
+322,8 @@ TEST_F(ColumnToRowTests, Non2Power) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); @@ -355,7 +359,8 @@ TEST_F(ColumnToRowTests, Big) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); @@ -390,7 +395,8 @@ TEST_F(ColumnToRowTests, Bigger) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); @@ -426,7 +432,8 @@ TEST_F(ColumnToRowTests, Biggest) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); for (int j = 0; j < old_tbl->num_columns(); ++j) { CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(old_tbl->get_column(j), new_tbl->get_column(j)); @@ -446,7 +453,8 @@ TEST_F(RowToColumnTests, Single) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -462,7 +470,8 @@ TEST_F(RowToColumnTests, Simple) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -484,7 +493,8 @@ TEST_F(RowToColumnTests, Tall) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + 
spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -511,7 +521,8 @@ TEST_F(RowToColumnTests, Wide) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -537,7 +548,8 @@ TEST_F(RowToColumnTests, SingleByteWide) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -578,7 +590,8 @@ TEST_F(RowToColumnTests, AllTypes) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -700,7 +713,8 @@ TEST_F(RowToColumnTests, AllTypesLarge) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -728,7 +742,8 @@ TEST_F(RowToColumnTests, Non2Power) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -757,7 +772,8 @@ TEST_F(RowToColumnTests, Big) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -786,7 +802,8 @@ TEST_F(RowToColumnTests, Bigger) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*old_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -816,7 +833,8 @@ 
TEST_F(RowToColumnTests, Biggest) for (uint i = 0; i < old_rows.size(); ++i) { auto old_tbl = spark_rapids_jni::convert_from_rows_fixed_width_optimized( cudf::lists_column_view(*old_rows[i]), schema); - auto new_tbl = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto new_tbl = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*old_tbl, *new_tbl); } @@ -870,7 +888,8 @@ TEST_F(RowToColumnTests, DoubleString) auto new_rows = spark_rapids_jni::convert_to_rows(in); for (uint i = 0; i < new_rows.size(); ++i) { - auto new_cols = spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); + auto new_cols = + spark_rapids_jni::convert_from_rows(cudf::lists_column_view(*new_rows[i]), schema); EXPECT_EQ(new_rows[0]->size(), 5); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); From 0b4e14176d9b41014127e4964c9d3769aa430db8 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 6 Jul 2023 11:02:39 +0800 Subject: [PATCH 075/113] Update submodule cudf to e907d3f00f889b8406fa5fd4afbbc2dcf84beb61 (#1253) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 9da347e3b0..e907d3f00f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 9da347e3b006b72efbe9c55733048dcbb1bc721f +Subproject commit e907d3f00f889b8406fa5fd4afbbc2dcf84beb61 From 2fc25478237ae19a004f16d0f7c0e8b1656df293 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Thu, 6 Jul 2023 21:02:14 +0800 Subject: [PATCH 076/113] Update submodule cudf to ecdaa91dc977ca58f94266d40a47ec8dffffbebd (#1254) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e907d3f00f..ecdaa91dc9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e907d3f00f889b8406fa5fd4afbbc2dcf84beb61 +Subproject commit ecdaa91dc977ca58f94266d40a47ec8dffffbebd From a8e0319f43a79aac7799abc6097b6fa9d519c432 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 6 Jul 2023 12:49:25 -0700 Subject: [PATCH 077/113] Added non-empty tests to it's own profile (#1252) Added an execution tag to run ColumnViewNonEmptyNullsTests for profile no-cufile-tests Signed-off-by: Raza Jafri --- pom.xml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pom.xml b/pom.xml index 58adcd624a..db0ab6832c 100644 --- a/pom.xml +++ b/pom.xml @@ -195,9 +195,20 @@ **/CuFileTest.java **/CudaFatalTest.java + **/ColumnViewNonEmptyNullsTest.java + + non-empty-null-test + + test + + + -da:ai.rapids.cudf.AssertEmptyNulls + ColumnViewNonEmptyNullsTest + + From f3588b9df24183c00ac4ce629aff57289f42105e Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Fri, 7 Jul 2023 16:36:33 -0500 Subject: [PATCH 078/113] Copy the spark-specific implementation of murmur32 hash from cudf into spark-rapids-jni (#1246) * Back port spark-specific murmur32 hash code from cudf. * Run pre-commit to format files. We were behind a bit. * Update pre-commit config to 16.0.1 to match cudf. Re-ran formatting. 
* Change jni bindings to use the spark-rapids-jni implementation of murmur hash instead of the cudf version. Brought over cpp and java tests. * Documentation fix. * Fix cpp tests to actually call the spark_rapids_jni murmur hash. * Moved murmur32 hash implementaion from cudf to spark-rapids-jni Signed-off-by: db * Add missing newlines. * PR review changes. --------- Signed-off-by: db --- src/main/cpp/CMakeLists.txt | 2 + src/main/cpp/src/HashJni.cpp | 39 + src/main/cpp/src/hash.cuh | 94 ++ src/main/cpp/src/murmur_hash.cu | 397 ++++++++ src/main/cpp/tests/CMakeLists.txt | 2 + src/main/cpp/tests/hash.cpp | 845 ++++++++++++++++++ .../com/nvidia/spark/rapids/jni/Hash.java | 55 ++ .../com/nvidia/spark/rapids/jni/HashTest.java | 264 ++++++ 8 files changed, 1698 insertions(+) create mode 100644 src/main/cpp/src/HashJni.cpp create mode 100644 src/main/cpp/src/hash.cuh create mode 100644 src/main/cpp/src/murmur_hash.cu create mode 100644 src/main/cpp/tests/hash.cpp create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/Hash.java create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/HashTest.java diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index b2f5c339dd..3e7c388057 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -148,6 +148,7 @@ add_library( spark_rapids_jni SHARED src/CastStringJni.cpp src/DecimalUtilsJni.cpp + src/HashJni.cpp src/MapUtilsJni.cpp src/NativeParquetJni.cpp src/RowConversionJni.cpp @@ -158,6 +159,7 @@ add_library( src/cast_string_to_float.cu src/decimal_utils.cu src/map_utils.cu + src/murmur_hash.cu src/row_conversion.cu src/zorder.cu ) diff --git a/src/main/cpp/src/HashJni.cpp b/src/main/cpp/src/HashJni.cpp new file mode 100644 index 0000000000..bcf72922d4 --- /dev/null +++ b/src/main/cpp/src/HashJni.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cudf_jni_apis.hpp" +#include "dtype_utils.hpp" +#include "jni_utils.hpp" + +#include "hash.cuh" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_Hash_murmurHash32( + JNIEnv* env, jclass, jint seed, jlongArray column_handles) +{ + JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto column_views = + cudf::jni::native_jpointerArray{env, column_handles}.get_dereferenced(); + return cudf::jni::release_as_jlong( + spark_rapids_jni::murmur_hash3_32(cudf::table_view{column_views}, seed)); + } + CATCH_STD(env, 0); +} +} diff --git a/src/main/cpp/src/hash.cuh b/src/main/cpp/src/hash.cuh new file mode 100644 index 0000000000..84a204d08f --- /dev/null +++ b/src/main/cpp/src/hash.cuh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +namespace spark_rapids_jni { + +/** + * @brief Converts a cudf decimal128 value to a java bigdecimal value. + * + * @param key The cudf decimal value + * + * @returns A 128 bit value containing the converted decimal bits and a length + * representing the relevant number of bytes in the value. + * + */ +__device__ __inline__ std::pair<__int128_t, cudf::size_type> to_java_bigdecimal( + numeric::decimal128 key) +{ + // java.math.BigDecimal.valueOf(unscaled_value, _scale).unscaledValue().toByteArray() + // https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L381 + __int128_t const val = key.value(); + constexpr cudf::size_type key_size = sizeof(__int128_t); + std::byte const* data = reinterpret_cast(&val); + + // Small negative values start with 0xff..., small positive values start with 0x00... + bool const is_negative = val < 0; + std::byte const zero_value = is_negative ? std::byte{0xff} : std::byte{0x00}; + + // If the value can be represented with a shorter than 16-byte integer, the + // leading bytes of the little-endian value are truncated and are not hashed. + auto const reverse_begin = thrust::reverse_iterator(data + key_size); + auto const reverse_end = thrust::reverse_iterator(data); + auto const first_nonzero_byte = + thrust::find_if_not(thrust::seq, reverse_begin, reverse_end, [zero_value](std::byte const& v) { + return v == zero_value; + }).base(); + // Max handles special case of 0 and -1 which would shorten to 0 length otherwise + cudf::size_type length = + std::max(1, static_cast(thrust::distance(data, first_nonzero_byte))); + + // Preserve the 2's complement sign bit by adding a byte back on if necessary. + // e.g. 0x0000ff would shorten to 0x00ff. The 0x00 byte is retained to + // preserve the sign bit, rather than leaving an "f" at the front which would + // change the sign bit. However, 0x00007f would shorten to 0x7f. No extra byte + // is needed because the leftmost bit matches the sign bit. Similarly for + // negative values: 0xffff00 --> 0xff00 and 0xffff80 --> 0x80. + if ((length < key_size) && (is_negative ^ bool(data[length - 1] & std::byte{0x80}))) { ++length; } + + // Convert to big endian by reversing the range of nonzero bytes. Only those bytes are hashed. + __int128_t big_endian_value = 0; + auto big_endian_data = reinterpret_cast(&big_endian_value); + thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); + + return {big_endian_value, length}; +} + +/** + * @brief Computes the murmur32 hash value of each row in the input set of columns. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a column from the input. 
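+ *
+ * A minimal usage sketch (illustrative only; `tbl` stands for any cudf::table_view of
+ * hashable columns, and the seed value shown is an example rather than a required default):
+ * @code
+ * auto hashes = spark_rapids_jni::murmur_hash3_32(tbl, 42);
+ * @endcode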
+ */ +std::unique_ptr murmur_hash3_32( + cudf::table_view const& input, + uint32_t seed = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/murmur_hash.cu b/src/main/cpp/src/murmur_hash.cu new file mode 100644 index 0000000000..fc8f2db8f6 --- /dev/null +++ b/src/main/cpp/src/murmur_hash.cu @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include "hash.cuh" + +#include +#include + +#include +#include + +namespace spark_rapids_jni { + +namespace { + +using spark_hash_value_type = int32_t; + +template ())> +struct SparkMurmurHash3_32 { + using result_type = spark_hash_value_type; + + constexpr SparkMurmurHash3_32() = default; + constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} + + [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const + { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; + } + + [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, + cudf::size_type offset) const + { + // Read a 4-byte value from the data pointer as individual bytes for safe + // unaligned access (very likely for string types). + auto block = reinterpret_cast(data + offset); + return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); + } + + [[nodiscard]] result_type __device__ inline operator()(Key const& key) const + { + return compute(key); + } + + template + result_type __device__ inline compute(T const& key) const + { + return compute_bytes(reinterpret_cast(&key), sizeof(T)); + } + + result_type __device__ inline compute_remaining_bytes(std::byte const* data, + cudf::size_type len, + cudf::size_type tail_offset, + result_type h) const + { + // Process remaining bytes that do not fill a four-byte chunk using Spark's approach + // (does not conform to normal MurmurHash3). + for (auto i = tail_offset; i < len; i++) { + // We require a two-step cast to get the k1 value from the byte. First, + // we must cast to a signed int8_t. Then, the sign bit is preserved when + // casting to uint32_t under 2's complement. Java preserves the sign when + // casting byte-to-int, but C++ does not. + uint32_t k1 = static_cast(std::to_integer(data[i])); + k1 *= c1; + k1 = cudf::detail::rotate_bits_left(k1, rot_c1); + k1 *= c2; + h ^= k1; + h = cudf::detail::rotate_bits_left(h, rot_c2); + h = h * 5 + c3; + } + return h; + } + + result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + { + constexpr cudf::size_type BLOCK_SIZE = 4; + cudf::size_type const nblocks = len / BLOCK_SIZE; + cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; + result_type h = m_seed; + + // Process all four-byte chunks. 
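+    // Each block goes through the standard MurmurHash3 x86_32 mixing step using the
+    // constants defined below: k1 = rotl(k1 * c1, 15) * c2 is XORed into h, and h is
+    // then rotated left by 13 and updated as h * 5 + c3.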
+ for (cudf::size_type i = 0; i < nblocks; i++) { + uint32_t k1 = getblock32(data, i * BLOCK_SIZE); + k1 *= c1; + k1 = cudf::detail::rotate_bits_left(k1, rot_c1); + k1 *= c2; + h ^= k1; + h = cudf::detail::rotate_bits_left(h, rot_c2); + h = h * 5 + c3; + } + + h = compute_remaining_bytes(data, len, tail_offset, h); + + // Finalize hash. + h ^= len; + h = fmix32(h); + return h; + } + + private: + uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; + static constexpr uint32_t c1 = 0xcc9e2d51; + static constexpr uint32_t c2 = 0x1b873593; + static constexpr uint32_t c3 = 0xe6546b64; + static constexpr uint32_t rot_c1 = 15; + static constexpr uint32_t rot_c2 = 13; +}; + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()(bool const& key) const +{ + return compute(key); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + int8_t const& key) const +{ + return compute(key); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + uint8_t const& key) const +{ + return compute(key); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + int16_t const& key) const +{ + return compute(key); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + uint16_t const& key) const +{ + return compute(key); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + float const& key) const +{ + return compute(cudf::detail::normalize_nans(key)); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + double const& key) const +{ + return compute(cudf::detail::normalize_nans(key)); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + cudf::string_view const& key) const +{ + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return compute_bytes(data, len); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal32 const& key) const +{ + return compute(key.value()); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal64 const& key) const +{ + return compute(key.value()); +} + +template <> +spark_hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal128 const& key) const +{ + auto [java_d, length] = to_java_bigdecimal(key); + auto bytes = reinterpret_cast(&java_d); + return compute_bytes(bytes, length); +} + +/** + * @brief Computes the hash value of a row in the given table. + * + * This functor uses Spark conventions for Murmur hashing, which differs from + * the Murmur implementation used in the rest of libcudf. These differences + * include: + * - Serially using the output hash as an input seed for the next item + * - Ignorance of null values + * + * The serial use of hashes as seeds means that data of different nested types + * can exhibit hash collisions. For example, a row of an integer column + * containing a 1 will have the same hash as a lists column of integers + * containing a list of [1] and a struct column of a single integer column + * containing a struct of {1}. + * + * As a consequence of ignoring null values, inputs like [1], [1, null], and + * [null, 1] have the same hash (an expected hash collision). 
This kind of + * collision can also occur across a table of nullable columns and with nulls + * in structs ({1, null} and {null, 1} have the same hash). The seed value (the + * previous element's hash value) is returned as the hash if an element is + * null. + * + * For additional differences such as special tail processing and decimal type + * handling, refer to the SparkMurmurHash3_32 functor. + * + * @tparam hash_function Hash functor to use for hashing elements. Must be SparkMurmurHash3_32. + * @tparam Nullate A cudf::nullate type describing whether to check for nulls. + */ +template