diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index c6de34764b..b1c3c2b32b 100755 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,12 +18,12 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-24.04 + - branch-24.06 types: [closed] env: - HEAD: branch-24.04 - BASE: branch-24.06 + HEAD: branch-24.06 + BASE: branch-24.08 jobs: auto-merge: diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index ece14b4201..33ccf50ea8 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -35,20 +35,16 @@ jobs: # This job only runs for pull request comments if: contains( '\ abellina,\ - andygrove,\ anfeng,\ firestarman,\ GaryShen2008,\ - jbrennan333, \ jlowe,\ - krajendrannv,\ mythrocks,\ nartal1,\ nvdbaranec,\ NvTimLiu,\ razajafri,\ revans2,\ - rongou,\ rwlee,\ sameerz,\ tgravescs,\ @@ -66,6 +62,10 @@ jobs: yinqingh,\ thirtiseven,\ parthosa,\ + liurenjie1024,\ + binmahone,\ + pmattione-nvidia,\ + Feng-Jiang28,\ ', format('{0},', github.actor)) && github.event.comment.body == 'build' steps: - name: Check if comment is issued by authorized person diff --git a/.gitmodules b/.gitmodules index 103a678946..12b07c5b18 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "thirdparty/cudf"] path = thirdparty/cudf url = https://github.com/rapidsai/cudf.git - branch = branch-24.04 + branch = branch-24.06 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 60dd78e5d8..1ada0b474b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -160,7 +160,7 @@ $ ./build/build-in-docker install ... ``` Now cd to ~/repos/NVIDIA/spark-rapids and build with one of the options from -[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-24.04/CONTRIBUTING.md#building-from-source). +[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-24.06/CONTRIBUTING.md#building-from-source). ```bash $ ./build/buildall @@ -355,7 +355,7 @@ conda install -c conda-forge pre-commit pip install pre-commit ``` -Then, run pre-commit hooks before committing your code. This wil reformat the stagged files: +Then, run pre-commit hooks before committing your code. This will reformat the staged files: ``` pre-commit run ``` diff --git a/NOTICE b/NOTICE index 5e01c7e14c..4c06d1da90 100644 --- a/NOTICE +++ b/NOTICE @@ -17,4 +17,24 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License. \ No newline at end of file +limitations under the License. + +-------------------------------------------------------------------------------- + +This project includes code from flatbuffers (https://github.com/google/flatbuffers). + +Copyright 2021 Google Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +-------------------------------------------------------------------------------- diff --git a/build/build-in-docker b/build/build-in-docker index 421cc1a855..49032185ba 100755 --- a/build/build-in-docker +++ b/build/build-in-docker @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,30 +24,27 @@ set -e SCRIPTDIR=$(cd $(dirname $0); pwd) LOCAL_MAVEN_REPO=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"} -CUDF_USE_PER_THREAD_DEFAULT_STREAM=${CUDF_USE_PER_THREAD_DEFAULT_STREAM:-ON} USE_GDS=${USE_GDS:-ON} export CMAKE_GENERATOR=${CMAKE_GENERATOR:-"Ninja"} +# Make CUDA_VERSION consistent with the file run-in-docker +export CUDA_VERSION=${CUDA_VERSION:-11.8.0} +CUDA_CLASSIFIER=cuda${CUDA_VERSION%%.*} +BUILD_FAULTINJ=${BUILD_FAULTINJ:-ON} if (( $# == 0 )); then echo "Usage: $0 " exit 1 fi -_CUDF_CLEAN_SKIP="" -# if ccache is enabled and libcudf.clean.skip not provided -# by the user remove the cpp build directory -# -if [[ "$CCACHE_DISABLE" != "1" ]]; then - if [[ ! "$*" =~ " -Dlibcudf.clean.skip=" ]]; then - # Don't skip clean if ccache is enabled - # unless the user overrides - _CUDF_CLEAN_SKIP="-Dlibcudf.clean.skip=false" - fi +# Set env for arm64 build, The possible values of 'uname -m' : [x86_64/i386/aarch64/mips/...] +if [ "$(uname -m)" == "aarch64" ]; then + USE_GDS="OFF" # The GDS cuFiles RDMA libraries are not included in the arm64 CUDA toolkit. + BUILD_FAULTINJ="OFF" # libcupti_static.a linked by cufaultinj, does not exist in the arm64 CUDA toolkit. fi $SCRIPTDIR/run-in-docker mvn \ -Dmaven.repo.local=$LOCAL_MAVEN_REPO \ - -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$CUDF_USE_PER_THREAD_DEFAULT_STREAM \ -DUSE_GDS=$USE_GDS \ - $_CUDF_CLEAN_SKIP \ + -DBUILD_FAULTINJ=${BUILD_FAULTINJ} \ + -Dcuda.version=$CUDA_CLASSIFIER \ "$@" diff --git a/build/run-in-docker b/build/run-in-docker index 62d40aac48..81152a1d9d 100755 --- a/build/run-in-docker +++ b/build/run-in-docker @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,11 +27,16 @@ REPODIR=$SCRIPTDIR/.. 
CUDA_VERSION=${CUDA_VERSION:-11.8.0} DOCKER_CMD=${DOCKER_CMD:-docker} DOCKER_BUILD_EXTRA_ARGS=${DOCKER_BUILD_EXTRA_ARGS:-""} +if [ "$(uname -m)" == "aarch64" ]; then + DOCKER_BUILD_EXTRA_ARGS="--build-arg TARGETPLATFORM=linux/arm64 --build-arg CMAKE_ARCH=aarch64 $DOCKER_BUILD_EXTRA_ARGS" +else + DOCKER_BUILD_EXTRA_ARGS="--build-arg TARGETPLATFORM=linux/amd64 --build-arg CMAKE_ARCH=x86_64 $DOCKER_BUILD_EXTRA_ARGS" +fi DOCKER_RUN_EXTRA_ARGS=${DOCKER_RUN_EXTRA_ARGS:-""} LOCAL_CCACHE_DIR=${LOCAL_CCACHE_DIR:-"$HOME/.ccache"} LOCAL_MAVEN_REPO=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"} -SPARK_IMAGE_NAME="spark-rapids-jni-build:${CUDA_VERSION}-devel-centos7" +SPARK_IMAGE_NAME="spark-rapids-jni-build:${CUDA_VERSION}-devel-rockylinux8" # ensure directories exist mkdir -p "$LOCAL_CCACHE_DIR" "$LOCAL_MAVEN_REPO" @@ -74,4 +79,4 @@ $DOCKER_CMD run $DOCKER_GPU_OPTS $DOCKER_RUN_EXTRA_ARGS -u $(id -u):$(id -g) --r -e VERBOSE \ $DOCKER_OPTS \ $SPARK_IMAGE_NAME \ - scl enable devtoolset-11 "$RUN_CMD" + scl enable gcc-toolset-11 "$RUN_CMD" diff --git a/ci/Dockerfile b/ci/Dockerfile old mode 100755 new mode 100644 index e3b703a11e..b3f4239dc6 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -17,31 +17,32 @@ ### # Build the image for spark-rapids-jni development environment. # -# Arguments: CUDA_VERSION=11.8.0 +# Arguments: CUDA_VERSION=[11.X.Y, 12.X.Y], OS_RELEASE=[8, 9], TARGETPLATFORM=[linux/amd64, linux/arm64] # ### ARG CUDA_VERSION=11.8.0 -FROM nvidia/cuda:$CUDA_VERSION-devel-centos7 -ARG DEVTOOLSET_VERSION=11 +ARG OS_RELEASE=8 +ARG TARGETPLATFORM=linux/amd64 +# multi-platform build with: docker buildx build --platform linux/arm64,linux/amd64 on either amd64 or arm64 host +# check available official arm-based docker images at https://hub.docker.com/r/nvidia/cuda/tags (OS/ARCH) +FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE +ARG TOOLSET_VERSION=11 ### Install basic requirements -RUN yum install -y centos-release-scl -RUN yum install -y devtoolset-${DEVTOOLSET_VERSION} rh-python38 epel-release -RUN yum install -y zlib-devel maven tar wget patch ninja-build -# require git 2.18+ to keep consistent submodule operations -RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo.x86_64.rpm && yum install -y git # pin urllib3<2.0 for https://github.com/psf/requests/issues/6432 -RUN scl enable rh-python38 "pip install requests 'urllib3<2.0'" - +RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERSION} python39 zlib-devel maven tar wget patch ninja-build git && \ + alternatives --set python /usr/bin/python3 && \ + python -m pip install requests 'urllib3<2.0' ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins -RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && chmod 777 /rapids +RUN mkdir -m 777 /usr/local/rapids /rapids # 3.22.3: CUDA architecture 'native' support + flexible CMAKE__*_LAUNCHER for ccache ARG CMAKE_VERSION=3.26.4 - -RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ - tar zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ - rm cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz -ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:$PATH +# default x86_64 from x86 build, aarch64 cmake for arm build +ARG CMAKE_ARCH=x86_64 +RUN cd /usr/local && wget --quiet 
https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ + tar zxf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ + rm cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz +ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}/bin:$PATH # ccache for interactive builds ARG CCACHE_VERSION=4.6 @@ -51,7 +52,7 @@ RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v cd ccache-${CCACHE_VERSION} && \ mkdir build && \ cd build && \ - scl enable devtoolset-${DEVTOOLSET_VERSION} \ + scl enable gcc-toolset-${TOOLSET_VERSION} \ "cmake .. \ -DCMAKE_BUILD_TYPE=Release \ -DZSTD_FROM_INTERNET=ON \ diff --git a/ci/Dockerfile.multi b/ci/Dockerfile.multi deleted file mode 100644 index d3b198530b..0000000000 --- a/ci/Dockerfile.multi +++ /dev/null @@ -1,76 +0,0 @@ -# -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -### -# JNI CI image for multi-platform build -# -# Arguments: CUDA_VERSION=11.8.0 -# -### -ARG CUDA_VERSION=11.8.0 -ARG OS_RELEASE=8 -# multi-platform build with: docker buildx build --platform linux/arm64,linux/amd64 on either amd64 or arm64 host -# check available offcial arm-based docker images at https://hub.docker.com/r/nvidia/cuda/tags (OS/ARCH) -FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE -ARG TOOLSET_VERSION=11 -### Install basic requirements -RUN dnf install -y scl-utils -RUN dnf install -y gcc-toolset-${TOOLSET_VERSION} python39 -RUN dnf --enablerepo=powertools install -y zlib-devel maven tar wget patch ninja-build -# require git 2.18+ to keep consistent submodule operations -RUN dnf install -y git -## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins -RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && chmod 777 /rapids - -# 3.22.3+: CUDA architecture 'native' support + flexible CMAKE__*_LAUNCHER for ccache -ARG CMAKE_VERSION=3.26.4 -# default as arm64 release -ARG CMAKE_ARCH=aarch64 -# aarch64 cmake for arm build -RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ - tar zxf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \ - rm cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz -ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}/bin:$PATH - -# ccache for interactive builds -ARG CCACHE_VERSION=4.6 -RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \ - tar zxf ccache-${CCACHE_VERSION}.tar.gz && \ - rm ccache-${CCACHE_VERSION}.tar.gz && \ - cd ccache-${CCACHE_VERSION} && \ - mkdir build && \ - cd build && \ - scl enable gcc-toolset-${TOOLSET_VERSION} \ - "cmake .. \ - -DCMAKE_BUILD_TYPE=Release \ - -DZSTD_FROM_INTERNET=ON \ - -DREDIS_STORAGE_BACKEND=OFF && \ - cmake --build . 
--parallel 4 --target install" && \ - cd ../.. && \ - rm -rf ccache-${CCACHE_VERSION} - -## install a version of boost that is needed for arrow/parquet to work -RUN cd /usr/local && wget --quiet https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz && \ - tar -xzf boost_1_79_0.tar.gz && \ - rm boost_1_79_0.tar.gz && \ - cd boost_1_79_0 && \ - ./bootstrap.sh --prefix=/usr/local && \ - ./b2 install --prefix=/usr/local --with-filesystem --with-system && \ - cd /usr/local && \ - rm -rf boost_1_79_0 - -# disable cuda container constraints to allow running w/ elder drivers on data-center GPUs -ENV NVIDIA_DISABLE_REQUIRE="true" diff --git a/ci/Jenkinsfile.premerge b/ci/Jenkinsfile.premerge index a59db1af9a..0a00eb6f1b 100644 --- a/ci/Jenkinsfile.premerge +++ b/ci/Jenkinsfile.premerge @@ -30,7 +30,7 @@ import ipp.blossom.* def githubHelper // blossom github helper def TEMP_IMAGE_BUILD = true -def IMAGE_PREMERGE = "${common.ARTIFACTORY_NAME}/sw-spark-docker/plugin-jni:centos7-cuda11.8.0-blossom" +def IMAGE_PREMERGE = "${common.ARTIFACTORY_NAME}/sw-spark-docker/plugin-jni:rockylinux8-cuda11.8.0-blossom" def cpuImage = pod.getCPUYAML(IMAGE_PREMERGE) def PREMERGE_DOCKERFILE = 'ci/Dockerfile' def PREMERGE_TAG @@ -150,7 +150,7 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true""" } if (TEMP_IMAGE_BUILD) { - PREMERGE_TAG = "centos7-cuda11.8.0-blossom-dev-${BUILD_TAG}" + PREMERGE_TAG = "rockylinux8-cuda11.8.0-blossom-dev-${BUILD_TAG}" IMAGE_PREMERGE = "${ARTIFACTORY_NAME}/sw-spark-docker-local/plugin-jni:${PREMERGE_TAG}" docker.build(IMAGE_PREMERGE, "--network=host -f ${PREMERGE_DOCKERFILE} -t $IMAGE_PREMERGE .") uploadDocker(IMAGE_PREMERGE) @@ -212,7 +212,7 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true""" container('gpu') { timeout(time: 3, unit: 'HOURS') { // step only timeout for test run common.resolveIncompatibleDriverIssue(this) - sh 'scl enable devtoolset-11 "ci/premerge-build.sh"' + sh 'scl enable gcc-toolset-11 "ci/premerge-build.sh"' sh 'bash ci/fuzz-test.sh' } } diff --git a/ci/submodule-sync.sh b/ci/submodule-sync.sh index 18119dc45d..1888696ba5 100755 --- a/ci/submodule-sync.sh +++ b/ci/submodule-sync.sh @@ -18,7 +18,7 @@ # NOTE: # this script is for jenkins only, and should not be used for local development # run with ci/Dockerfile in jenkins: -# scl enable devtoolset-11 rh-python38 "ci/submodule-sync.sh" +# scl enable gcc-toolset-11 ci/submodule-sync.sh set -ex diff --git a/pom.xml b/pom.xml index 8c366a946b..566c06b934 100644 --- a/pom.xml +++ b/pom.xml @@ -21,7 +21,7 @@ com.nvidia spark-rapids-jni - 24.04.0 + 24.06.0 jar RAPIDS Accelerator JNI for Apache Spark @@ -84,6 +84,7 @@ OFF OFF ON + ON false false cuda11 @@ -338,8 +339,15 @@ arm64 + + + aarch64 + + ${cuda.version}-arm64 + + OFF @@ -452,6 +460,7 @@ + libcufilejni.so + + ${native.build.path}/profiler + + libprofilerjni.so + + ${libcudfjni.build.path} diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 1fef6ffab8..88d48e1587 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -32,7 +32,7 @@ rapids_cuda_init_architectures(SPARK_RAPIDS_JNI) project( SPARK_RAPIDS_JNI - VERSION 24.04.00 + VERSION 24.06.00 LANGUAGES C CXX CUDA ) @@ -44,6 +44,7 @@ option(USE_GDS "Build with GPUDirect Storage (GDS)/cuFile support" OFF) option(BUILD_TESTS "Configure CMake to build tests" OFF) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) option(BUILD_FAULTINJ "Configure CMake to build fault injection" 
ON) +option(BUILD_PROFILER "Configure CMake to build profiler" ON) message( VERBOSE "SPARK_RAPIDS_JNI: Build with per-thread default stream: @@ -60,6 +61,12 @@ set(SPARK_RAPIDS_JNI_CUDA_DEFINITIONS "") set(SPARK_RAPIDS_JNI_BUILD_TESTS ${BUILD_TESTS}) set(SPARK_RAPIDS_JNI_BUILD_BENCHMARKS ${BUILD_BENCHMARKS}) set(SPARK_RAPIDS_JNI_BUILD_FAULTINJ ${BUILD_FAULTINJ}) +if(NOT SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR) + set(SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR ${SPARK_RAPIDS_JNI_BINARY_DIR}/generated/include) +endif() +if(NOT SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR) + set(SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR ${SPARK_RAPIDS_JNI_BINARY_DIR}/generated/src) +endif() # Set RMM logging level set(RMM_LOGGING_LEVEL @@ -94,6 +101,21 @@ include(cmake/Modules/ConfigureCUDA.cmake) # set other CUDA compilation flags # ################################################################################################## # * dependencies ---------------------------------------------------------------------------------- +# version header +find_package(Git REQUIRED) +execute_process(COMMAND + "${GIT_EXECUTABLE}" describe --abbrev=40 --always --dirty --long + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE SPARK_RAPIDS_JNI_COMMIT_DETAILS + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE +) +configure_file( + src/spark_rapids_jni_version.cpp.in + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp" + @ONLY +) + # find NVTX include(${CUDF_DIR}/cpp/cmake/thirdparty/get_nvtx.cmake) @@ -110,6 +132,8 @@ endif() # cudf if(BUILD_TESTS) + include(${rapids-cmake-dir}/cpm/gtest.cmake) + rapids_cpm_gtest(BUILD_STATIC) rapids_find_package(cudf REQUIRED COMPONENTS testing) else() rapids_find_package(cudf REQUIRED) @@ -160,6 +184,7 @@ add_library( src/MapUtilsJni.cpp src/NativeParquetJni.cpp src/ParseURIJni.cpp + src/RegexRewriteUtilsJni.cpp src/RowConversionJni.cpp src/SparkResourceAdaptorJni.cpp src/ZOrderJni.cpp @@ -176,6 +201,7 @@ add_library( src/map_utils.cu src/murmur_hash.cu src/parse_uri.cu + src/regex_rewrite_utils.cu src/row_conversion.cu src/timezones.cu src/utilities.cu @@ -229,7 +255,7 @@ target_link_libraries( -Wl,--whole-archive ${CUDFJNI_LIB} cudf::cudf - nvtx3-cpp + nvtx3::nvtx3-cpp -Wl,--no-whole-archive ${PARQUET_LIB} ${THRIFT_LIB} @@ -252,7 +278,7 @@ add_dependencies(cudfjnistub spark_rapids_jni) if(USE_GDS) include(${CUDF_DIR}/cpp/cmake/Modules/FindcuFile.cmake) find_library(CUFILEJNI_LIB "libcufilejni.a" REQUIRED NO_DEFAULT_PATH - HINTS "${PROJECT_BINARY_DIR}/../libcudfjni" + HINTS "${SPARK_RAPIDS_JNI_BINARY_DIR}/../libcudfjni" ) add_library(cufilejni SHARED src/emptyfile.cpp) set_target_properties( @@ -296,3 +322,7 @@ endif() if(SPARK_RAPIDS_JNI_BUILD_FAULTINJ) add_subdirectory(faultinj) endif() + +if(BUILD_PROFILER) + add_subdirectory(profiler) +endif() diff --git a/src/main/cpp/benchmarks/CMakeLists.txt b/src/main/cpp/benchmarks/CMakeLists.txt index 23d35b0bea..732777ef10 100644 --- a/src/main/cpp/benchmarks/CMakeLists.txt +++ b/src/main/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,7 +23,7 @@ target_compile_options( ) target_link_libraries( - spark_rapids_jni_datagen PUBLIC cudf::cudf + spark_rapids_jni_datagen PUBLIC cudf::cudf nvtx3::nvtx3-cpp ) target_include_directories( @@ -78,5 +78,8 @@ ConfigureBench(STRING_TO_FLOAT_BENCH ConfigureBench(BLOOM_FILTER_BENCH bloom_filter.cu) +ConfigureBench(GET_JSON_OBJECT_BENCH + get_json_object.cu) + ConfigureBench(PARSE_URI_BENCH parse_uri.cpp) diff --git a/src/main/cpp/benchmarks/common/generate_input.cu b/src/main/cpp/benchmarks/common/generate_input.cu index 75f0a8fca0..d0a61d05a0 100644 --- a/src/main/cpp/benchmarks/common/generate_input.cu +++ b/src/main/cpp/benchmarks/common/generate_input.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -358,12 +359,13 @@ rmm::device_uvector sample_indices_with_run_length(cudf::size_t // This is gather. auto avg_repeated_sample_indices_iterator = thrust::make_transform_iterator( thrust::make_counting_iterator(0), - [rb = run_lens.begin(), - re = run_lens.end(), - samples_indices = samples_indices.begin()] __device__(cudf::size_type i) { - auto sample_idx = thrust::upper_bound(thrust::seq, rb, re, i) - rb; - return samples_indices[sample_idx]; - }); + cuda::proclaim_return_type( + [rb = run_lens.begin(), + re = run_lens.end(), + samples_indices = samples_indices.begin()] __device__(cudf::size_type i) { + auto sample_idx = thrust::upper_bound(thrust::seq, rb, re, i) - rb; + return samples_indices[sample_idx]; + })); rmm::device_uvector repeated_sample_indices(num_rows, cudf::get_default_stream()); thrust::copy(thrust::device, @@ -519,10 +521,10 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons return cudf::make_strings_column( num_rows, - std::move(offsets), - std::move(chars->release().data.release()[0]), - profile.get_null_frequency().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}, - null_count); + std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), + chars.release(), + null_count, + profile.get_null_frequency().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); } /** diff --git a/src/main/cpp/benchmarks/get_json_object.cu b/src/main/cpp/benchmarks/get_json_object.cu new file mode 100644 index 0000000000..51f9299dba --- /dev/null +++ b/src/main/cpp/benchmarks/get_json_object.cu @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +// #define DEBUG_PRINT + +#ifdef DEBUG_PRINT + +#include + +#include + +namespace { + +// Copy from `cudf/cpp/tests/utilities/column_utilities.cu`. 
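+// Debug-only functor used by to_host_strings() below: it copies the offsets column to
+// the host and rebuilds each row as a std::string from an already host-copied chars
+// buffer. It is type-dispatched so both int32 and int64 offsets are handled; any other
+// offset type fails with CUDF_FAIL.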
+struct strings_to_host_fn { + template || + std::is_same_v>* = nullptr> + void operator()(std::vector& host_data, + char const* chars, + cudf::column_view const& offsets, + rmm::cuda_stream_view stream) + { + auto const h_offsets = cudf::detail::make_std_vector_sync( + cudf::device_span(offsets.data(), offsets.size()), stream); + // build std::string vector from chars and offsets + std::transform(std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + host_data.begin(), + [&](auto start, auto end) { return std::string(chars + start, end - start); }); + } + + template && + !std::is_same_v>* = nullptr> + void operator()(std::vector&, + char const*, + cudf::column_view const&, + rmm::cuda_stream_view) + { + CUDF_FAIL("invalid offsets type"); + } +}; + +template +std::vector to_host_strings(CV const& c) +{ + std::vector host_strs(c.size()); + auto stream = cudf::get_default_stream(); + auto const scv = cudf::strings_column_view(c); + auto const h_chars = cudf::detail::make_std_vector_sync( + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); + auto const offsets = + cudf::slice(scv.offsets(), {scv.offset(), scv.offset() + scv.size() + 1}).front(); + cudf::type_dispatcher( + offsets.type(), strings_to_host_fn{}, host_strs, h_chars.data(), offsets, stream); + return host_strs; +} + +} // namespace +#endif // #ifdef DEBUG_PRINT + +constexpr auto list_depth = 2; +constexpr auto min_width = 10; +constexpr auto max_width = 10; + +auto generate_input(std::size_t size_bytes, cudf::size_type max_depth) +{ + data_profile const table_profile = + data_profile_builder() + .no_validity() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) + .distribution(cudf::type_id::LIST, distribution_id::NORMAL, min_width, max_width) + .list_depth(list_depth) + .list_type(cudf::type_id::STRING) + .struct_depth(max_depth > list_depth ? max_depth - list_depth : 1) + .struct_types(std::vector{cudf::type_id::LIST}); + + auto const input_table = create_random_table( + std::vector{cudf::type_id::INT32, cudf::type_id::STRING, cudf::type_id::STRUCT}, + table_size_bytes{size_bytes}, + table_profile); + + std::vector buffer; + cudf::io::sink_info sink(&buffer); + cudf::io::table_metadata mt{{{"int32"}, {"string"}, {"struct"}}}; + auto write_opts = + cudf::io::json_writer_options::builder(sink, input_table->view()).lines(true).metadata(mt); + cudf::io::write_json(write_opts); + + // Split one JSON string into separate JSON objects. + auto const json_str = std::string{buffer.begin(), buffer.end()}; + auto const json_col = cudf::test::strings_column_wrapper{{json_str}}; + auto split_strs = + cudf::strings::split_record(cudf::strings_column_view{json_col}, cudf::string_scalar("\n")) + ->release(); + + // Note that split_strs is a list of strings thus we need to extract the strings column. 
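+  // The child at lists_column_view::child_column_index holds the flattened strings,
+  // i.e. one JSON object per row after the '\n' split above.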
+ auto& json_strings = split_strs.children[cudf::lists_column_view::child_column_index]; + +#ifdef DEBUG_PRINT + { + auto const strs = to_host_strings(json_strings->view()); + std::cout << "First input row: \n" << strs.front() << std::endl; + } +#endif // #ifdef DEBUG_PRINT + return std::move(json_strings); +} + +void BM_get_json_object(nvbench::state& state) +{ + auto const size_bytes = static_cast(state.get_int64("size_bytes")); + auto const max_depth = static_cast(state.get_int64("max_depth")); + + auto const json_strings = generate_input(size_bytes, max_depth); + + using path_instruction_type = spark_rapids_jni::path_instruction_type; + std::vector> instructions; + instructions.emplace_back(path_instruction_type::NAMED, "struct", -1); + for (int i = 0; i < max_depth - list_depth; ++i) { + instructions.emplace_back(path_instruction_type::NAMED, "0", -1); + } + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + // Can also verify at https://jsonpath.com/. + [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( + cudf::strings_column_view{json_strings->view()}, instructions); + +#ifdef DEBUG_PRINT + { + auto const strs = to_host_strings(output->view()); + std::cout << "First output row: \n" << strs.front() << std::endl << std::endl << std::endl; + } +#endif // #ifdef DEBUG_PRINT + }); + state.add_global_memory_reads(size_bytes); +} + +NVBENCH_BENCH(BM_get_json_object) + .set_name("get_json_object") + .add_int64_axis("size_bytes", {1'000'000, 10'000'000, 100'000'000, 1'000'000'000}) + .add_int64_axis("max_depth", {2, 4, 6, 8}); diff --git a/src/main/cpp/benchmarks/row_conversion.cpp b/src/main/cpp/benchmarks/row_conversion.cpp index ff2e11f838..d040715aff 100644 --- a/src/main/cpp/benchmarks/row_conversion.cpp +++ b/src/main/cpp/benchmarks/row_conversion.cpp @@ -113,7 +113,7 @@ static void variable_or_fixed_width(nvbench::state& state) bytes_per_row += cudf::size_of(t); } else if (t.id() == cudf::type_id::STRING) { auto sc = cudf::strings_column_view(table->get_column(i)); - string_bytes += sc.chars_size(); + string_bytes += sc.chars_size(cudf::get_default_stream()); } } diff --git a/src/main/cpp/cmake/get_flatbuffers.cmake b/src/main/cpp/cmake/get_flatbuffers.cmake new file mode 100644 index 0000000000..c7e0dfb549 --- /dev/null +++ b/src/main/cpp/cmake/get_flatbuffers.cmake @@ -0,0 +1,33 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# Use CPM to find or clone flatbuffers +function(find_and_configure_flatbuffers VERSION) + + rapids_cpm_find( + flatbuffers ${VERSION} + GLOBAL_TARGETS flatbuffers + CPM_ARGS + GIT_REPOSITORY https://github.com/google/flatbuffers.git + GIT_TAG v${VERSION} + GIT_SHALLOW TRUE + ) + + rapids_export_find_package_root( + BUILD flatbuffers "${flatbuffers_BINARY_DIR}" EXPORT_SET profilerjni-exports + ) + +endfunction() + +find_and_configure_flatbuffers(24.3.25) diff --git a/src/main/cpp/profiler/CMakeLists.txt b/src/main/cpp/profiler/CMakeLists.txt new file mode 100644 index 0000000000..03a552b3ea --- /dev/null +++ b/src/main/cpp/profiler/CMakeLists.txt @@ -0,0 +1,98 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +include(../cmake/get_flatbuffers.cmake) + +# ################################################################################################## +# * flatbuffer generation--------------------------------------------------------------------------- + +set(SPARK_RAPIDS_JNI_FBS_DIR "${SPARK_RAPIDS_JNI_SOURCE_DIR}/../fbs") +add_custom_command( + OUTPUT ${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h + DEPENDS "${SPARK_RAPIDS_JNI_FBS_DIR}/profiler.fbs" + WORKING_DIRECTORY "${SPARK_RAPIDS_JNI_FBS_DIR}" + VERBATIM + COMMAND ${CMAKE_COMMAND} -E make_directory "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" + COMMAND + $ --cpp -o "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" profiler.fbs + COMMENT "Generating profiler flatbuffer code" +) + +# ################################################################################################## +# * profiler JNI ----------------------------------------------------------------------------------- + +add_library(profilerjni SHARED + ProfilerJni.cpp + profiler_debug.cpp + profiler_serializer.cpp + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp" + "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h" +) + +set_target_properties( + profilerjni + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CXX_VISIBILITY_PRESET "hidden" + VISIBILITY_INLINES_HIDDEN TRUE +) + +target_include_directories( + profilerjni + PRIVATE "${JNI_INCLUDE_DIRS}" + "${CUDAToolkit_INCLUDE_DIRS}" + "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" + "${SPARK_RAPIDS_JNI_SOURCE_DIR}/src" +) + +find_library(CUPTI_LIBRARY_PATH cupti_static PATHS + "/usr/local/cuda/lib64" + "/usr/local/cuda/extras/CUPTI/lib64" +) + +target_link_libraries(profilerjni + PRIVATE ${CUPTI_LIBRARY_PATH} nvtx3::nvtx3-cpp flatbuffers::flatbuffers +) + +file(READ "${SPARK_RAPIDS_JNI_FBS_DIR}/profiler.fbs" SPARK_RAPIDS_JNI_PROFILER_SCHEMA) +configure_file( + profiler_schema.cpp.in + 
"${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/profiler_schema.cpp" + @ONLY +) + +add_executable(spark_rapids_profile_converter + spark_rapids_profile_converter.cpp + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/profiler_schema.cpp" + "${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp" + "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h" +) + +target_include_directories( + spark_rapids_profile_converter + PRIVATE + "${CUDAToolkit_INCLUDE_DIRS}" + "${SPARK_RAPIDS_JNI_SOURCE_DIR}/src" + "${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}" +) + +target_link_libraries(spark_rapids_profile_converter + "${CUPTI_LIBRARY_PATH}" + flatbuffers::flatbuffers + dl + pthread + rt) diff --git a/src/main/cpp/profiler/ProfilerJni.cpp b/src/main/cpp/profiler/ProfilerJni.cpp new file mode 100644 index 0000000000..1271b89d7b --- /dev/null +++ b/src/main/cpp/profiler/ProfilerJni.cpp @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "profiler_generated.h" +#include "profiler_serializer.hpp" +#include "spark_rapids_jni_version.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// Set this to true to have each CUPTI buffer dumped to stderr as it arrives. +#define PROFILER_DEBUG_LOG_BUFFER 0 + +#define JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val) \ + { \ + if (env->ExceptionOccurred()) { return ret_val; } \ + } + +#define JNI_THROW_NEW(env, class_name, message, ret_val) \ + { \ + jclass ex_class = env->FindClass(class_name); \ + if (ex_class == NULL) { return ret_val; } \ + env->ThrowNew(ex_class, message); \ + return ret_val; \ + } + +#define CATCH_STD_CLASS(env, class_name, ret_val) \ + catch (const std::exception& e) { JNI_THROW_NEW(env, class_name, e.what(), ret_val) } + +#define CATCH_STD(env, ret_val) CATCH_STD_CLASS(env, "java/lang/RuntimeException", ret_val) + +namespace spark_rapids_jni::profiler { + +namespace { + +// Encapsulates a buffer of profile data +struct profile_buffer { + explicit profile_buffer(size_t size) : size_(size), valid_size_(0) + { + auto err = posix_memalign(reinterpret_cast(&data_), ALIGN_BYTES, size_); + if (err != 0) { + std::cerr << "PROFILER: Failed to allocate CUPTI buffer: " << strerror(err) << std::endl; + data_ = nullptr; + size_ = 0; + } + } + + profile_buffer(uint8_t* data, size_t size, size_t valid_size) + : data_(data), size_(size), valid_size_(valid_size) + { + } + + // Disconnects the underlying buffer of memory from the instance. + // The caller is responsible for freeing the resulting buffer. 
+ void release(uint8_t** data_ptr_ptr, size_t* size_ptr) + { + *data_ptr_ptr = data_; + *size_ptr = size_; + data_ = nullptr; + size_ = 0; + } + + ~profile_buffer() + { + free(data_); + data_ = nullptr; + size_ = 0; + } + + uint8_t const* data() const { return data_; } + uint8_t* data() { return data_; } + size_t size() const { return size_; } + size_t valid_size() const { return valid_size_; } + void set_valid_size(size_t size) { valid_size_ = size; } + + private: + static constexpr size_t ALIGN_BYTES = 8; + uint8_t* data_; + size_t size_; + size_t valid_size_; +}; + +// Queue of profile buffers that have been filled with profile data. +struct completed_buffer_queue { + // Gets the next available buffer of profile data, blocking until a buffer is available + // or the queue is shutdown. If the queue is shutdown, a nullptr is returned. + std::unique_ptr get() + { + std::unique_lock lock(lock_); + cv_.wait(lock, [this] { return shutdown_ || buffers_.size() > 0; }); + if (buffers_.size() > 0) { + auto result = std::move(buffers_.front()); + buffers_.pop(); + return result; + } + return std::unique_ptr(nullptr); + } + + void put(std::unique_ptr&& buffer) + { + std::unique_lock lock(lock_); + if (!shutdown_) { + buffers_.push(std::move(buffer)); + lock.unlock(); + cv_.notify_one(); + } + } + + void shutdown() + { + std::unique_lock lock(lock_); + shutdown_ = true; + lock.unlock(); + cv_.notify_one(); + } + + private: + std::mutex lock_; + std::condition_variable cv_; + std::queue> buffers_; + bool shutdown_ = false; +}; + +// Stack of profile buffers that are ready to be filled with profile data. +struct free_buffer_tracker { + explicit free_buffer_tracker(size_t size) : buffer_size_(size) {} + + // Returns the next available profile buffer or creates one if none are available. 
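+  // Called from buffer_requested_callback; when the cache is empty a fresh buffer is
+  // allocated outside the lock.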
+ std::unique_ptr get() + { + { + std::lock_guard lock(lock_); + if (buffers_.size() > 0) { + auto result = std::move(buffers_.top()); + buffers_.pop(); + return result; + } + } + return std::make_unique(buffer_size_); + } + + void put(std::unique_ptr&& buffer) + { + buffer->set_valid_size(0); + std::lock_guard lock(lock_); + if (buffers_.size() < NUM_CACHED_BUFFERS) { + buffers_.push(std::move(buffer)); + } else { + buffer.reset(nullptr); + } + } + + private: + static constexpr size_t NUM_CACHED_BUFFERS = 2; + std::mutex lock_; + std::stack> buffers_; + size_t buffer_size_; +}; + +void writer_thread_process(JavaVM* vm, + jobject j_writer, + size_t buffer_size, + size_t flush_threshold); + +struct subscriber_state { + CUpti_SubscriberHandle subscriber_handle; + jobject j_writer; + std::thread writer_thread; + free_buffer_tracker free_buffers; + completed_buffer_queue completed_buffers; + bool has_cupti_callback_errored; + bool is_shutdown; + + subscriber_state(jobject writer, size_t buffer_size) + : j_writer(writer), + free_buffers(buffer_size), + has_cupti_callback_errored(false), + is_shutdown(false) + { + } +}; + +// Global variables +subscriber_state* State = nullptr; +uint64_t Flush_period_msec; +std::atomic_uint64_t Last_flush_time_msec; + +JavaVM* get_jvm(JNIEnv* env) +{ + JavaVM* vm; + if (env->GetJavaVM(&vm) != 0) { throw std::runtime_error("Unable to get JavaVM"); } + return vm; +} + +JNIEnv* attach_to_jvm(JavaVM* vm) +{ + JavaVMAttachArgs args; + args.version = JNI_VERSION_1_6; + args.name = const_cast("profiler writer"); + args.group = nullptr; + JNIEnv* env; + if (vm->AttachCurrentThread(reinterpret_cast(&env), &args) != JNI_OK) { + char const* msg = "PROFILER: unable to attach to JVM"; + std::cerr << msg << std::endl; + throw std::runtime_error(msg); + } + return env; +} + +char const* get_cupti_error(CUptiResult rc) +{ + char const* err; + if (cuptiGetResultString(rc, &err) != CUPTI_SUCCESS) { err = "UNKNOWN"; } + return err; +} + +void check_cupti(CUptiResult rc, std::string msg) +{ + if (rc != CUPTI_SUCCESS) { throw std::runtime_error(msg + ": " + get_cupti_error(rc)); } +} + +uint64_t timestamp_now() +{ + timespec info; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &info) != 0) { + static bool have_logged_error = false; + if (!have_logged_error) { + std::cerr << "PROFILER: Unable to determine current time!" << std::endl; + have_logged_error = true; + } + // No idea what time it is, so return the last flush time which will effectively + // disable periodic flushing but avoid pathologic flushing on every kernel launch. 
+ return Last_flush_time_msec; + } + return info.tv_sec * 1e3 + info.tv_nsec / 1e6; +} + +void on_driver_launch_exit() +{ + auto now = timestamp_now(); + if (now - Last_flush_time_msec >= Flush_period_msec) { + auto rc = cuptiActivityFlushAll(0); + if (rc != CUPTI_SUCCESS) { + std::cerr << "PROFILER: Error interval flushing records: " << get_cupti_error(rc) + << std::endl; + } + Last_flush_time_msec = now; + } +} + +void domain_driver_callback(CUpti_CallbackId callback_id, CUpti_CallbackData const* cb_data) +{ + if (cb_data->callbackSite == CUPTI_API_ENTER) { return; } + + switch (callback_id) { + case CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch: + case CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz: + case CUPTI_DRIVER_TRACE_CBID_cuLaunch: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: + case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz: on_driver_launch_exit(); break; + default: + std::cerr << "PROFILER: Unexpected driver API callback for " << callback_id << std::endl; + break; + } +} + +void domain_runtime_callback(CUpti_CallbackId callback_id, CUpti_CallbackData const* data_ptr) +{ + switch (callback_id) { + case CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020: + if (data_ptr->callbackSite == CUPTI_API_ENTER) { + auto rc = cuptiActivityFlushAll(0); + if (rc != CUPTI_SUCCESS) { + std::cerr << "PROFILER: Error flushing CUPTI activity on device reset: " + << get_cupti_error(rc) << std::endl; + } + } + break; + default: break; + } +} + +// Invoked by CUPTI when something occurs for which we previously requested a callback. +void CUPTIAPI callback_handler(void*, + CUpti_CallbackDomain domain, + CUpti_CallbackId callback_id, + const void* callback_data_ptr) +{ + auto rc = cuptiGetLastError(); + if (rc != CUPTI_SUCCESS && !State->has_cupti_callback_errored) { + // State->has_cupti_callback_errored = true; + std::cerr << "PROFILER: Error handling callback: " << get_cupti_error(rc) << std::endl; + return; + } + + auto cb_data = static_cast(callback_data_ptr); + switch (domain) { + case CUPTI_CB_DOMAIN_DRIVER_API: domain_driver_callback(callback_id, cb_data); break; + case CUPTI_CB_DOMAIN_RUNTIME_API: domain_runtime_callback(callback_id, cb_data); break; + default: break; + } +} + +// Invoked by CUPTI when a new buffer is needed to record CUPTI activity events. +void CUPTIAPI buffer_requested_callback(uint8_t** buffer_ptr_ptr, + size_t* size_ptr, + size_t* max_num_records_ptr) +{ + *max_num_records_ptr = 0; + if (!State->is_shutdown) { + auto buffer = State->free_buffers.get(); + buffer->release(buffer_ptr_ptr, size_ptr); + } else { + *buffer_ptr_ptr = nullptr; + *size_ptr = 0; + } +} + +// Invoked by CUPTI when an activity event buffer has completed. +void CUPTIAPI buffer_completed_callback( + CUcontext, uint32_t, uint8_t* buffer, size_t buffer_size, size_t valid_size) +{ + auto pb = std::make_unique(buffer, buffer_size, valid_size); + if (!State->is_shutdown) { State->completed_buffers.put(std::move(pb)); } +} + +// Setup the environment variables for NVTX library injection so we can capture NVTX events. 
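+// NVTX looks for NVTX_INJECTION64_PATH and loads the library at that path, which is
+// expected to export InitializeInjectionNvtx/InitializeInjectionNvtx2 (defined at the
+// bottom of this file) so NVTX ranges can be captured through CUPTI.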
+void setup_nvtx_env(JNIEnv* env, jstring j_lib_path) +{ + auto lib_path = env->GetStringUTFChars(j_lib_path, 0); + if (lib_path == NULL) { throw std::runtime_error("Error getting library path"); } + setenv("NVTX_INJECTION64_PATH", lib_path, 1); + env->ReleaseStringUTFChars(j_lib_path, lib_path); +} + +// Main processing loop for the background writer thread +void writer_thread_process(JavaVM* vm, jobject j_writer, size_t buffer_size, size_t flush_threshold) +{ + try { + JNIEnv* env = attach_to_jvm(vm); + profiler_serializer serializer(env, j_writer, buffer_size, flush_threshold); + auto buffer = State->completed_buffers.get(); + while (buffer) { + serializer.process_cupti_buffer(buffer->data(), buffer->valid_size()); + State->free_buffers.put(std::move(buffer)); + buffer = State->completed_buffers.get(); + } + serializer.flush(); + } catch (std::exception const& e) { + std::cerr << "PROFILER: WRITER THREAD ERROR: " << e.what() << std::endl; + // no-op process buffers + auto buffer = State->completed_buffers.get(); + while (buffer) { + State->free_buffers.put(std::move(buffer)); + buffer = State->completed_buffers.get(); + } + } + vm->DetachCurrentThread(); +} + +// Enable/disable capture of CUPTI activity events +void update_activity_enable(bool enable) +{ + CUpti_ActivityKind const activity_ids[] = {CUPTI_ACTIVITY_KIND_DEVICE, + CUPTI_ACTIVITY_KIND_DRIVER, + CUPTI_ACTIVITY_KIND_RUNTIME, + CUPTI_ACTIVITY_KIND_MEMCPY, + CUPTI_ACTIVITY_KIND_MEMSET, + CUPTI_ACTIVITY_KIND_NAME, + CUPTI_ACTIVITY_KIND_MARKER, + CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, + CUPTI_ACTIVITY_KIND_OVERHEAD}; + if (enable) { + for (CUpti_ActivityKind const id : activity_ids) { + check_cupti(cuptiActivityEnable(id), "Error enabling device activity"); + } + } else { + for (CUpti_ActivityKind const id : activity_ids) { + check_cupti(cuptiActivityDisable(id), "Error disabling device activity"); + } + check_cupti(cuptiActivityFlushAll(0), "Error flushing activity records"); + } +} + +} // anonymous namespace + +} // namespace spark_rapids_jni::profiler + +extern "C" { + +using namespace spark_rapids_jni::profiler; + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_Profiler_nativeInit(JNIEnv* env, + jclass, + jstring j_lib_path, + jobject j_writer, + jlong write_buffer_size, + jint flush_period_msec) +{ + try { + setup_nvtx_env(env, j_lib_path); + // grab a global reference to the writer instance so it isn't garbage collected + auto writer = static_cast(env->NewGlobalRef(j_writer)); + if (!writer) { throw std::runtime_error("Unable to create a global reference to writer"); } + State = new subscriber_state(writer, write_buffer_size); + State->writer_thread = std::thread( + writer_thread_process, get_jvm(env), writer, write_buffer_size, write_buffer_size); + auto rc = cuptiSubscribe(&State->subscriber_handle, callback_handler, nullptr); + check_cupti(rc, "Error initializing CUPTI"); + rc = cuptiEnableCallback(1, + State->subscriber_handle, + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020); + if (flush_period_msec > 0) { + std::cerr << "PROFILER: Flushing activity records every " << flush_period_msec + << " milliseconds" << std::endl; + Flush_period_msec = static_cast(flush_period_msec); + Last_flush_time_msec = timestamp_now(); + // CUPTI's periodic flush does not appear to work in this environment. As a workaround, + // register a callback for all the various ways a GPU kernel gets launched. 
The callback + // checks if the flush period has elapsed since we last flushed, and if so, forces a flush. + CUpti_CallbackId const driver_launch_callback_ids[] = { + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch, + CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz, + CUPTI_DRIVER_TRACE_CBID_cuLaunch, + CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel, + CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz, + CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice, + CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid, + CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync, + CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, + CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz}; + for (CUpti_CallbackId const id : driver_launch_callback_ids) { + rc = cuptiEnableCallback(1, State->subscriber_handle, CUPTI_CB_DOMAIN_DRIVER_API, id); + check_cupti(rc, "Error registering driver launch callbacks"); + } + } + check_cupti(rc, "Error enabling device reset callback"); + rc = cuptiActivityRegisterCallbacks(buffer_requested_callback, buffer_completed_callback); + check_cupti(rc, "Error registering activity buffer callbacks"); + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_Profiler_nativeStart(JNIEnv* env, jclass) +{ + try { + if (State && !State->is_shutdown) { update_activity_enable(true); } + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_Profiler_nativeStop(JNIEnv* env, jclass) +{ + try { + if (State && !State->is_shutdown) { update_activity_enable(false); } + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_Profiler_nativeShutdown(JNIEnv* env, jclass) +{ + try { + if (State && !State->is_shutdown) { + auto unsub_rc = cuptiUnsubscribe(State->subscriber_handle); + auto flush_rc = cuptiActivityFlushAll(1); + State->completed_buffers.shutdown(); + State->writer_thread.join(); + State->is_shutdown = true; + env->DeleteGlobalRef(State->j_writer); + // There can be late arrivals of CUPTI activity events and other callbacks, so it's safer + // and simpler to _not_ delete the State object on shutdown. + check_cupti(unsub_rc, "Error unsubscribing from CUPTI"); + check_cupti(flush_rc, "Error flushing CUPTI records"); + } + } + CATCH_STD(env, ); +} + +} // extern "C" + +// Extern the CUPTI NVTX initialization APIs. The APIs are thread-safe. +extern "C" CUptiResult CUPTIAPI cuptiNvtxInitialize(void* pfnGetExportTable); +extern "C" CUptiResult CUPTIAPI cuptiNvtxInitialize2(void* pfnGetExportTable); + +// Interface that may be called by NVTX to capture NVTX events +extern "C" JNIEXPORT int InitializeInjectionNvtx(void* p) +{ + CUptiResult res = cuptiNvtxInitialize(p); + return (res == CUPTI_SUCCESS) ? 1 : 0; +} + +// Interface that may be called by NVTX to capture NVTX events +extern "C" JNIEXPORT int InitializeInjectionNvtx2(void* p) +{ + CUptiResult res = cuptiNvtxInitialize2(p); + return (res == CUPTI_SUCCESS) ? 1 : 0; +} diff --git a/src/main/cpp/profiler/profiler_debug.cpp b/src/main/cpp/profiler/profiler_debug.cpp new file mode 100644 index 0000000000..3759b11e0d --- /dev/null +++ b/src/main/cpp/profiler/profiler_debug.cpp @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "profiler_debug.hpp" + +#include +#include + +namespace spark_rapids_jni::profiler { + +namespace { + +std::string marker_flags_to_string(CUpti_ActivityFlag flags) +{ + std::string s(""); + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS) { s += "INSTANTANEOUS "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_START) { s += "START "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_END) { s += "END "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE) { s += "SYNCACQUIRE "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS) { s += "SYNCACQUIRESUCCESS "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED) { s += "SYNCACQUIREFAILED "; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE) { s += "SYNCRELEASE "; } + return s; +} + +std::string activity_object_kind_to_string(CUpti_ActivityObjectKind kind) +{ + switch (kind) { + case CUPTI_ACTIVITY_OBJECT_PROCESS: return "PROCESS"; + case CUPTI_ACTIVITY_OBJECT_THREAD: return "THREAD"; + case CUPTI_ACTIVITY_OBJECT_DEVICE: return "DEVICE"; + case CUPTI_ACTIVITY_OBJECT_CONTEXT: return "CONTEXT"; + case CUPTI_ACTIVITY_OBJECT_STREAM: return "STREAM"; + case CUPTI_ACTIVITY_OBJECT_UNKNOWN: return "UNKNOWN"; + default: { + std::ostringstream oss; + oss << "UNRECOGNIZED(" << kind << ")"; + return oss.str(); + } + } +} + +} // anonymous namespace + +std::string activity_kind_to_string(CUpti_ActivityKind kind) +{ + switch (kind) { + case CUPTI_ACTIVITY_KIND_MEMCPY: return "CUPTI_ACTIVITY_KIND_MEMCPY"; + case CUPTI_ACTIVITY_KIND_MEMSET: return "CUPTI_ACTIVITY_KIND_MEMSET"; + case CUPTI_ACTIVITY_KIND_KERNEL: return "CUPTI_ACTIVITY_KIND_KERNEL"; + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: return "CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL"; + case CUPTI_ACTIVITY_KIND_DRIVER: return "CPUTI_ACTIVITY_KIND_DRIVER"; + case CUPTI_ACTIVITY_KIND_RUNTIME: return "CUPTI_ACTIVITY_KIND_RUNTIME"; + case CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API: return "CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API"; + case CUPTI_ACTIVITY_KIND_EVENT: return "CUPTI_ACTIVITY_KIND_EVENT"; + case CUPTI_ACTIVITY_KIND_METRIC: return "CUPTI_ACTIVITY_KIND_METRIC"; + case CUPTI_ACTIVITY_KIND_DEVICE: return "CUPTI_ACTIVITY_KIND_DEVICE"; + case CUPTI_ACTIVITY_KIND_CONTEXT: return "CUPTI_ACTIVITY_KIND_CONTEXT"; + case CUPTI_ACTIVITY_KIND_NAME: return "CUPTI_ACTIVITY_KIND_NAME"; + case CUPTI_ACTIVITY_KIND_MARKER: return "CUPTI_ACTIVITY_KIND_MARKER"; + case CUPTI_ACTIVITY_KIND_MARKER_DATA: return "CUPTI_ACTIVITY_KIND_MARKER_DATA"; + case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR: return "CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR"; + case CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS: return "CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS"; + case CUPTI_ACTIVITY_KIND_BRANCH: return "CUPTI_ACTIVITY_KIND_BRANCH"; + case CUPTI_ACTIVITY_KIND_OVERHEAD: return "CUPTI_ACTIVITY_KIND_OVERHEAD"; + case CUPTI_ACTIVITY_KIND_CDP_KERNEL: return "CUPTI_ACTIVITY_KIND_CDP_KERNEL"; + case CUPTI_ACTIVITY_KIND_PREEMPTION: return "CUPTI_ACTIVITY_KIND_PREEMPTION"; + case CUPTI_ACTIVITY_KIND_ENVIRONMENT: return "CUPTI_ACTIVITY_KIND_ENVIRONMENT"; + case CUPTI_ACTIVITY_KIND_EVENT_INSTANCE: return 
"CUPTI_ACTIVITY_KIND_EVENT_INSTANCE"; + case CUPTI_ACTIVITY_KIND_MEMCPY2: return "CUPTI_ACTIVITY_KIND_MEMCPY2"; + case CUPTI_ACTIVITY_KIND_METRIC_INSTANCE: return "CUPTI_ACTIVITY_KIND_METRIC_INSTANCE"; + case CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION: + return "CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION"; + case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER: + return "CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER"; + case CUPTI_ACTIVITY_KIND_FUNCTION: return "CUPTI_ACTIVITY_KIND_FUNCTION"; + case CUPTI_ACTIVITY_KIND_MODULE: return "CUPTI_ACTIVITY_KIND_MODULE"; + case CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE: return "CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE"; + case CUPTI_ACTIVITY_KIND_SHARED_ACCESS: return "CUPTI_ACTIVITY_KIND_SHARED_ACCESS"; + case CUPTI_ACTIVITY_KIND_PC_SAMPLING: return "CUPTI_ACTIVITY_KIND_PC_SAMPLING"; + case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO: + return "CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO"; + case CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION: + return "CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION"; + case CUPTI_ACTIVITY_KIND_OPENACC_DATA: return "CUPTI_ACTIVITY_KIND_OPENACC_DATA"; + case CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH: return "CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH"; + case CUPTI_ACTIVITY_KIND_OPENACC_OTHER: return "CUPTI_ACTIVITY_KIND_OPENACC_OTHER"; + case CUPTI_ACTIVITY_KIND_CUDA_EVENT: return "CUPTI_ACTIVITY_KIND_CUDA_EVENT"; + case CUPTI_ACTIVITY_KIND_STREAM: return "CUPTI_ACTIVITY_KIND_STREAM"; + case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION: return "CUPTI_ACTIVITY_KIND_SYNCHRONIZATION"; + case CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION: + return "CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION"; + case CUPTI_ACTIVITY_KIND_NVLINK: return "CUPTI_ACTIVITY_KIND_NVLINK"; + case CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT: return "CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT"; + case CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE: + return "CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE"; + case CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC: + return "CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC"; + case CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE: + return "CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE"; + case CUPTI_ACTIVITY_KIND_MEMORY: return "CUPTI_ACTIVITY_KIND_MEMORY"; + case CUPTI_ACTIVITY_KIND_PCIE: return "CUPTI_ACTIVITY_KIND_PCIE"; + case CUPTI_ACTIVITY_KIND_OPENMP: return "CUPTI_ACTIVITY_KIND_OPENMP"; + case CUPTI_ACTIVITY_KIND_MEMORY2: return "CUPTI_ACTIVITY_KIND_MEMORY2"; + case CUPTI_ACTIVITY_KIND_MEMORY_POOL: return "CUPTI_ACTIVITY_KIND_MEMORY_POOL"; + case CUPTI_ACTIVITY_KIND_GRAPH_TRACE: return "CUPTI_ACTIVITY_KIND_GRAPH_TRACE"; + case CUPTI_ACTIVITY_KIND_JIT: return "CUPTI_ACTIVITY_KIND_JIT"; + default: { + std::ostringstream oss; + oss << "UNRECOGNIZED(" << kind << ")"; + return oss.str(); + } + } +} + +void print_cupti_buffer(uint8_t* buffer, size_t valid_size) +{ + if (valid_size > 0) { + std::cerr << "PROFILER: CUPTI buffer size: " << valid_size << std::endl; + CUpti_Activity* record_ptr = nullptr; + auto rc = cuptiActivityGetNextRecord(buffer, valid_size, &record_ptr); + while (rc == CUPTI_SUCCESS) { + std::cerr << "RECORD: " << activity_kind_to_string(record_ptr->kind) << std::endl; + switch (record_ptr->kind) { + case CUPTI_ACTIVITY_KIND_DRIVER: { + auto api_record = reinterpret_cast(record_ptr); + char const* name = nullptr; + cuptiGetCallbackName(CUPTI_CB_DOMAIN_DRIVER_API, api_record->cbid, &name); + name = name ? 
name : "NULL"; + std::cerr << " NAME: " << name << " THREAD: " << api_record->threadId << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_DEVICE: { + auto device_record = reinterpret_cast(record_ptr); + char const* name = device_record->name != nullptr ? device_record->name : "NULL"; + std::cerr << " " << activity_kind_to_string(device_record->kind) << " " << name + << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_RUNTIME: { + auto api_record = reinterpret_cast(record_ptr); + char const* name = nullptr; + cuptiGetCallbackName(CUPTI_CB_DOMAIN_RUNTIME_API, api_record->cbid, &name); + name = name ? name : "NULL"; + std::cerr << " NAME: " << name << " THREAD: " << api_record->threadId << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_MARKER: { + auto marker_record = reinterpret_cast(record_ptr); + std::cerr << " FLAGS: " << marker_flags_to_string(marker_record->flags) + << " ID: " << marker_record->id + << " OBJECTKIND: " << activity_object_kind_to_string(marker_record->objectKind) + << " NAME: " << std::string(marker_record->name ? marker_record->name : "NULL") + << " DOMAIN: " + << std::string(marker_record->domain ? marker_record->domain : "NULL") + << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_MARKER_DATA: { + auto marker_record = reinterpret_cast(record_ptr); + std::cerr << " FLAGS: " << marker_flags_to_string(marker_record->flags) + << " ID: " << marker_record->id << " COLOR: " << marker_record->color + << " COLOR FLAG: " << marker_record->flags + << " CATEGORY: " << marker_record->category + << " DATA KIND: " << marker_record->payloadKind + << " DATA: " << marker_record->payload.metricValueUint64 << "/" + << marker_record->payload.metricValueDouble << std::endl; + break; + } + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { + auto kernel_record = reinterpret_cast(record_ptr); + std::cerr << " NAME: " << kernel_record->name << std::endl; + } + default: break; + } + rc = cuptiActivityGetNextRecord(buffer, valid_size, &record_ptr); + } + } +} + +} // namespace spark_rapids_jni::profiler diff --git a/src/main/cpp/profiler/profiler_debug.hpp b/src/main/cpp/profiler/profiler_debug.hpp new file mode 100644 index 0000000000..e44fdb87ff --- /dev/null +++ b/src/main/cpp/profiler/profiler_debug.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace spark_rapids_jni::profiler { + +std::string activity_kind_to_string(CUpti_ActivityKind kind); + +void print_cupti_buffer(uint8_t* buffer, size_t valid_size); + +} // namespace spark_rapids_jni::profiler diff --git a/src/main/cpp/profiler/profiler_schema.cpp.in b/src/main/cpp/profiler/profiler_schema.cpp.in new file mode 100644 index 0000000000..f2940a91bf --- /dev/null +++ b/src/main/cpp/profiler/profiler_schema.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace spark_rapids_jni::profiler { +char const* Profiler_Schema = R"raw(@SPARK_RAPIDS_JNI_PROFILER_SCHEMA@)raw"; +} diff --git a/src/main/cpp/profiler/profiler_serializer.cpp b/src/main/cpp/profiler/profiler_serializer.cpp new file mode 100644 index 0000000000..b47ff234ad --- /dev/null +++ b/src/main/cpp/profiler/profiler_serializer.cpp @@ -0,0 +1,559 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "profiler_serializer.hpp" + +#include "profiler_debug.hpp" +#include "profiler_generated.h" +#include "spark_rapids_jni_version.h" + +#include + +#include + +namespace spark_rapids_jni::profiler { + +namespace { + +constexpr uint32_t PROFILE_VERSION = 1; + +flatbuffers::Offset add_object_id(flatbuffers::FlatBufferBuilder& fbb, + CUpti_ActivityObjectKind kind, + CUpti_ActivityObjectKindId const& object_id) +{ + switch (kind) { + case CUPTI_ACTIVITY_OBJECT_PROCESS: + case CUPTI_ACTIVITY_OBJECT_THREAD: { + ActivityObjectIdBuilder aoib(fbb); + aoib.add_process_id(object_id.pt.processId); + if (kind == CUPTI_ACTIVITY_OBJECT_THREAD) { aoib.add_thread_id(object_id.pt.threadId); } + return aoib.Finish(); + } + case CUPTI_ACTIVITY_OBJECT_DEVICE: + case CUPTI_ACTIVITY_OBJECT_CONTEXT: + case CUPTI_ACTIVITY_OBJECT_STREAM: { + ActivityObjectIdBuilder aoib(fbb); + aoib.add_device_id(object_id.dcs.deviceId); + if (kind == CUPTI_ACTIVITY_OBJECT_CONTEXT || kind == CUPTI_ACTIVITY_OBJECT_STREAM) { + aoib.add_context_id(object_id.dcs.contextId); + if (kind == CUPTI_ACTIVITY_OBJECT_STREAM) { aoib.add_stream_id(object_id.dcs.streamId); } + } + return aoib.Finish(); + } + default: + std::cerr << "PROFILER: Unrecognized object kind: " << kind << std::endl; + return flatbuffers::Offset(); + } +} + +MarkerFlags marker_flags_to_fb(CUpti_ActivityFlag flags) +{ + uint8_t result = 0; + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS) { result |= MarkerFlags_Instantaneous; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_START) { result |= MarkerFlags_Start; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_END) { result |= MarkerFlags_End; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE) { result |= MarkerFlags_SyncAcquire; } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS) { + result |= MarkerFlags_SyncAcquireSuccess; + } + if (flags & CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED) { + result |= MarkerFlags_SyncAcquireFailed; + } + if (flags & 
CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE) { result |= MarkerFlags_SyncRelease; } + return static_cast(result); +} + +ChannelType to_channel_type(CUpti_ChannelType t) +{ + switch (t) { + case CUPTI_CHANNEL_TYPE_INVALID: return ChannelType_Invalid; + case CUPTI_CHANNEL_TYPE_COMPUTE: return ChannelType_Compute; + case CUPTI_CHANNEL_TYPE_ASYNC_MEMCPY: return ChannelType_AsyncMemcpy; + default: + std::cerr << "PROFILER: Unrecognized channel type: " << t << std::endl; + return ChannelType_Invalid; + } +} + +LaunchType to_launch_type(uint8_t t) +{ + switch (t) { + case CUPTI_ACTIVITY_LAUNCH_TYPE_REGULAR: return LaunchType_Regular; + case CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_SINGLE_DEVICE: + return LaunchType_CooperativeSingleDevice; + case CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_MULTI_DEVICE: + return LaunchType_CooperativeMultiDevice; + default: + std::cerr << "PROFILER: Unrecognized launch type: " << t << std::endl; + return LaunchType_Regular; + } +} + +MemcpyFlags to_memcpy_flags(uint32_t flags) +{ + uint8_t result = 0; + if (flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC) { result |= MemcpyFlags_Async; } + return static_cast(result); +} + +MemcpyKind to_memcpy_kind(uint8_t k) +{ + switch (k) { + case CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN: return MemcpyKind_Unknown; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD: return MemcpyKind_HtoD; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH: return MemcpyKind_DtoH; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA: return MemcpyKind_HtoA; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH: return MemcpyKind_AtoH; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA: return MemcpyKind_AtoA; + case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD: return MemcpyKind_AtoD; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA: return MemcpyKind_DtoA; + case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD: return MemcpyKind_DtoD; + case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH: return MemcpyKind_HtoH; + case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP: return MemcpyKind_PtoP; + default: + std::cerr << "PROFILER: Unrecognized memcpy kind: " << k << std::endl; + return MemcpyKind_Unknown; + } +} + +MemoryKind to_memory_kind(uint8_t k) +{ + switch (k) { + case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN: return MemoryKind_Unknown; + case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE: return MemoryKind_Pageable; + case CUPTI_ACTIVITY_MEMORY_KIND_PINNED: return MemoryKind_Pinned; + case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE: return MemoryKind_Device; + case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY: return MemoryKind_Array; + case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED: return MemoryKind_Managed; + case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC: return MemoryKind_DeviceStatic; + case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC: return MemoryKind_ManagedStatic; + default: + std::cerr << "PROFILER: Unrecognized memory kind: " << k << std::endl; + return MemoryKind_Unknown; + } +} + +MemsetFlags to_memset_flags(uint32_t flags) +{ + uint8_t result = 0; + if (flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC) { result |= MemsetFlags_Async; } + return static_cast(result); +} + +OverheadKind to_overhead_kind(CUpti_ActivityOverheadKind k) +{ + switch (k) { + case CUPTI_ACTIVITY_OVERHEAD_UNKNOWN: return OverheadKind_Unknown; + case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER: return OverheadKind_DriverCompiler; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH: return OverheadKind_CUptiBufferFlush; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION: return OverheadKind_CUptiInstrumentation; + case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE: return OverheadKind_CUptiResource; + default: + std::cerr << "PROFILER: Unrecognized overhead kind: " << k << 
std::endl; + return OverheadKind_Unknown; + } +} + +PartitionedGlobalCacheConfig to_partitioned_global_cache_config( + CUpti_ActivityPartitionedGlobalCacheConfig c) +{ + switch (c) { + case CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_UNKNOWN: + return PartitionedGlobalCacheConfig_Unknown; + case CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_NOT_SUPPORTED: + return PartitionedGlobalCacheConfig_NotSupported; + case CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_OFF: + return PartitionedGlobalCacheConfig_Off; + case CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_ON: return PartitionedGlobalCacheConfig_On; + default: + std::cerr << "PROFILER: Unrecognized partitioned global cache config: " << c << std::endl; + return PartitionedGlobalCacheConfig_Unknown; + } +} + +ShmemLimitConfig to_shmem_limit_config(CUpti_FuncShmemLimitConfig c) +{ + switch (c) { + case CUPTI_FUNC_SHMEM_LIMIT_DEFAULT: return ShmemLimitConfig_Default; + case CUPTI_FUNC_SHMEM_LIMIT_OPTIN: return ShmemLimitConfig_Optin; + default: + std::cerr << "PROFILER: Unrecognized shmem limit config: " << c << std::endl; + return ShmemLimitConfig_Default; + } +} + +} // anonymous namespace + +profiler_serializer::profiler_serializer(JNIEnv* env, + jobject writer, + size_t buffer_size, + size_t flush_threshold) + : env_(env), j_writer_(writer), flush_threshold_(flush_threshold), fbb_(buffer_size) +{ + auto writer_class = env->GetObjectClass(writer); + if (!writer_class) { throw std::runtime_error("Failed to locate class of data writer"); } + j_write_method_ = env->GetMethodID(writer_class, "write", "(Ljava/nio/ByteBuffer;)V"); + if (!j_write_method_) { throw std::runtime_error("Failed to locate data writer write method"); } + write_profile_header(); +} + +void profiler_serializer::write_profile_header() +{ + auto writer_version = fbb_.CreateString(spark_rapids_jni::Version); + auto magic = fbb_.CreateString("spark-rapids profile"); + auto header = CreateProfileHeader(fbb_, magic, PROFILE_VERSION, writer_version); + fbb_.FinishSizePrefixed(header); + write_current_fb(); +} + +void profiler_serializer::process_cupti_buffer(uint8_t* buffer, size_t valid_size) +{ + report_num_dropped_records(); + if (valid_size > 0) { + CUpti_Activity* record_ptr = nullptr; + auto rc = cuptiActivityGetNextRecord(buffer, valid_size, &record_ptr); + while (rc == CUPTI_SUCCESS) { + switch (record_ptr->kind) { + case CUPTI_ACTIVITY_KIND_DEVICE: { + auto device_record = reinterpret_cast(record_ptr); + process_device_activity(device_record); + break; + } + case CUPTI_ACTIVITY_KIND_DRIVER: + case CUPTI_ACTIVITY_KIND_RUNTIME: { + auto api_record = reinterpret_cast(record_ptr); + process_api_activity(api_record); + break; + } + case CUPTI_ACTIVITY_KIND_MARKER: { + auto marker = reinterpret_cast(record_ptr); + process_marker_activity(marker); + break; + } + case CUPTI_ACTIVITY_KIND_MARKER_DATA: { + auto marker = reinterpret_cast(record_ptr); + process_marker_data(marker); + break; + } + case CUPTI_ACTIVITY_KIND_MEMCPY: { + auto r = reinterpret_cast(record_ptr); + process_memcpy(r); + break; + } + case CUPTI_ACTIVITY_KIND_MEMSET: { + auto r = reinterpret_cast(record_ptr); + process_memset(r); + break; + } + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { + auto r = reinterpret_cast(record_ptr); + process_kernel(r); + break; + } + case CUPTI_ACTIVITY_KIND_OVERHEAD: { + auto r = reinterpret_cast(record_ptr); + process_overhead(r); + break; + } + default: + std::cerr << "PROFILER: Ignoring activity record " + << activity_kind_to_string(record_ptr->kind) << std::endl; + 
break; + } + if (fbb_.GetSize() >= flush_threshold_) { flush(); } + rc = cuptiActivityGetNextRecord(buffer, valid_size, &record_ptr); + } + } +} + +void profiler_serializer::flush() +{ + if (fbb_.GetSize() > 0) { + using flatbuffers::Offset; + using flatbuffers::Vector; + Offset>> api_vec; + Offset>> device_vec; + Offset>> dropped_vec; + Offset>> kernel_vec; + Offset>> marker_vec; + Offset>> marker_data_vec; + Offset>> memcpy_vec; + Offset>> memset_vec; + Offset>> overhead_vec; + if (api_offsets_.size() > 0) { api_vec = fbb_.CreateVector(api_offsets_); } + if (device_offsets_.size() > 0) { device_vec = fbb_.CreateVector(device_offsets_); } + if (dropped_offsets_.size() > 0) { dropped_vec = fbb_.CreateVector(dropped_offsets_); } + if (kernel_offsets_.size() > 0) { kernel_vec = fbb_.CreateVector(kernel_offsets_); } + if (marker_offsets_.size() > 0) { marker_vec = fbb_.CreateVector(marker_offsets_); } + if (marker_data_offsets_.size() > 0) { + marker_data_vec = fbb_.CreateVector(marker_data_offsets_); + } + if (memcpy_offsets_.size() > 0) { memcpy_vec = fbb_.CreateVector(memcpy_offsets_); } + if (memset_offsets_.size() > 0) { memset_vec = fbb_.CreateVector(memset_offsets_); } + if (overhead_offsets_.size() > 0) { overhead_vec = fbb_.CreateVector(overhead_offsets_); } + ActivityRecordsBuilder arb(fbb_); + arb.add_api(api_vec); + arb.add_device(device_vec); + arb.add_dropped(dropped_vec); + arb.add_kernel(kernel_vec); + arb.add_marker(marker_vec); + arb.add_marker_data(marker_data_vec); + arb.add_memcpy(memcpy_vec); + arb.add_memset(memset_vec); + arb.add_overhead(overhead_vec); + auto r = arb.Finish(); + fbb_.FinishSizePrefixed(r); + write_current_fb(); + } +} + +void profiler_serializer::process_api_activity(CUpti_ActivityAPI const* r) +{ + auto api_kind = ApiKind_Runtime; + if (r->kind == CUPTI_ACTIVITY_KIND_DRIVER) { + api_kind = ApiKind_Driver; + } else if (r->kind == CUPTI_ACTIVITY_KIND_RUNTIME) { + // skip some very common and uninteresting APIs to reduce the profile size + switch (r->cbid) { + case CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020: + case CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020: + case CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020: + case CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000: return; + default: break; + } + } else { + std::cerr << "PROFILER: Ignoring API activity record kind: " << activity_kind_to_string(r->kind) + << std::endl; + return; + } + ApiActivityBuilder aab(fbb_); + aab.add_kind(api_kind); + aab.add_cbid(r->cbid); + aab.add_start(r->start); + aab.add_end(r->end); + aab.add_process_id(r->processId); + aab.add_thread_id(r->threadId); + aab.add_correlation_id(r->correlationId); + aab.add_return_value(r->returnValue); + api_offsets_.push_back(aab.Finish()); +} + +void profiler_serializer::process_device_activity(CUpti_ActivityDevice4 const* r) +{ + auto name = fbb_.CreateSharedString(r->name); + DeviceActivityBuilder dab(fbb_); + dab.add_global_memory_bandwidth(r->globalMemoryBandwidth); + dab.add_global_memory_size(r->globalMemorySize); + dab.add_constant_memory_size(r->constantMemorySize); + dab.add_l2_cache_size(r->l2CacheSize); + dab.add_num_threads_per_warp(r->numThreadsPerWarp); + dab.add_core_clock_rate(r->coreClockRate); + dab.add_num_memcpy_engines(r->numMemcpyEngines); + dab.add_num_multiprocessors(r->numMultiprocessors); + dab.add_max_ipc(r->maxIPC); + dab.add_max_warps_per_multiprocessor(r->maxWarpsPerMultiprocessor); + dab.add_max_blocks_per_multiprocessor(r->maxBlocksPerMultiprocessor); + 
  dab.add_max_shared_memory_per_multiprocessor(r->maxSharedMemoryPerMultiprocessor);
+  dab.add_max_registers_per_multiprocessor(r->maxRegistersPerMultiprocessor);
+  dab.add_max_registers_per_block(r->maxRegistersPerBlock);
+  dab.add_max_shared_memory_per_block(r->maxSharedMemoryPerBlock);
+  dab.add_max_threads_per_block(r->maxThreadsPerBlock);
+  dab.add_max_block_dim_x(r->maxBlockDimX);
+  dab.add_max_block_dim_y(r->maxBlockDimY);
+  dab.add_max_block_dim_z(r->maxBlockDimZ);
+  dab.add_max_grid_dim_x(r->maxGridDimX);
+  dab.add_max_grid_dim_y(r->maxGridDimY);
+  dab.add_max_grid_dim_z(r->maxGridDimZ);
+  dab.add_compute_capability_major(r->computeCapabilityMajor);
+  dab.add_compute_capability_minor(r->computeCapabilityMinor);
+  dab.add_id(r->id);
+  dab.add_ecc_enabled(r->eccEnabled);
+  dab.add_name(name);
+  device_offsets_.push_back(dab.Finish());
+}
+
+void profiler_serializer::process_dropped_records(size_t num_dropped)
+{
+  auto dropped = CreateDroppedRecords(fbb_, num_dropped);
+  dropped_offsets_.push_back(dropped);
+}
+
+void profiler_serializer::process_kernel(CUpti_ActivityKernel8 const* r)
+{
+  auto name = fbb_.CreateSharedString(r->name);
+  KernelActivityBuilder kab(fbb_);
+  kab.add_requested(r->cacheConfig.config.requested);
+  kab.add_executed(r->cacheConfig.config.executed);
+  kab.add_shared_memory_config(r->sharedMemoryConfig);
+  kab.add_registers_per_thread(r->registersPerThread);
+  kab.add_partitioned_global_cache_requested(
+    to_partitioned_global_cache_config(r->partitionedGlobalCacheRequested));
+  kab.add_partitioned_global_cache_executed(
+    to_partitioned_global_cache_config(r->partitionedGlobalCacheExecuted));
+  kab.add_start(r->start);
+  kab.add_end(r->end);
+  kab.add_completed(r->completed);
+  kab.add_device_id(r->deviceId);
+  kab.add_context_id(r->contextId);
+  kab.add_stream_id(r->streamId);
+  kab.add_grid_x(r->gridX);
+  kab.add_grid_y(r->gridY);
+  kab.add_grid_z(r->gridZ);
+  kab.add_block_x(r->blockX);
+  kab.add_block_y(r->blockY);
+  kab.add_block_z(r->blockZ);
+  kab.add_static_shared_memory(r->staticSharedMemory);
+  kab.add_dynamic_shared_memory(r->dynamicSharedMemory);
+  kab.add_local_memory_per_thread(r->localMemoryPerThread);
+  kab.add_local_memory_total(r->localMemoryTotal);
+  kab.add_correlation_id(r->correlationId);
+  kab.add_grid_id(r->gridId);
+  kab.add_name(name);
+  kab.add_queued(r->queued);
+  kab.add_submitted(r->submitted);
+  kab.add_launch_type(to_launch_type(r->launchType));
+  kab.add_is_shared_memory_carveout_requested(r->isSharedMemoryCarveoutRequested);
+  kab.add_shared_memory_carveout_requested(r->sharedMemoryCarveoutRequested);
+  kab.add_shared_memory_executed(r->sharedMemoryExecuted);
+  kab.add_graph_node_id(r->graphNodeId);
+  kab.add_shmem_limit_config(to_shmem_limit_config(r->shmemLimitConfig));
+  kab.add_graph_id(r->graphId);
+  kab.add_channel_id(r->channelID);
+  kab.add_channel_type(to_channel_type(r->channelType));
+  kab.add_cluster_x(r->clusterX);
+  kab.add_cluster_y(r->clusterY);
+  kab.add_cluster_z(r->clusterZ);
+  kab.add_cluster_scheduling_policy(r->clusterSchedulingPolicy);
+  kab.add_local_memory_total_v2(r->localMemoryTotal_v2);
+  kernel_offsets_.push_back(kab.Finish());
+}
+
+void profiler_serializer::process_marker_activity(CUpti_ActivityMarker2 const* r)
+{
+  auto object_id  = add_object_id(fbb_, r->objectKind, r->objectId);
+  auto has_name   = r->name != nullptr;
+  auto has_domain = r->domain != nullptr;
+  flatbuffers::Offset<flatbuffers::String> name;
+  flatbuffers::Offset<flatbuffers::String> domain;
+  if (has_name) { name = fbb_.CreateSharedString(r->name); }
+  if (has_domain) {
domain = fbb_.CreateSharedString(r->domain); } + MarkerActivityBuilder mab(fbb_); + mab.add_flags(marker_flags_to_fb(r->flags)); + mab.add_timestamp(r->timestamp); + mab.add_id(r->id); + mab.add_object_id(object_id); + mab.add_name(name); + mab.add_domain(domain); + marker_offsets_.push_back(mab.Finish()); +} + +void profiler_serializer::process_marker_data(CUpti_ActivityMarkerData const* r) +{ + MarkerDataBuilder mdb(fbb_); + mdb.add_flags(marker_flags_to_fb(r->flags)); + mdb.add_id(r->id); + mdb.add_color(r->color); + mdb.add_category(r->category); + marker_data_offsets_.push_back(mdb.Finish()); +} + +void profiler_serializer::process_memcpy(CUpti_ActivityMemcpy5 const* r) +{ + MemcpyActivityBuilder mab(fbb_); + mab.add_copy_kind(to_memcpy_kind(r->copyKind)); + mab.add_src_kind(to_memory_kind(r->srcKind)); + mab.add_dst_kind(to_memory_kind(r->dstKind)); + mab.add_flags(to_memcpy_flags(r->flags)); + mab.add_bytes(r->bytes); + mab.add_start(r->start); + mab.add_end(r->end); + mab.add_device_id(r->deviceId); + mab.add_context_id(r->contextId); + mab.add_stream_id(r->streamId); + mab.add_correlation_id(r->correlationId); + mab.add_runtime_correlation_id(r->runtimeCorrelationId); + mab.add_graph_node_id(r->graphNodeId); + mab.add_graph_id(r->graphId); + mab.add_channel_id(r->channelID); + mab.add_channel_type(to_channel_type(r->channelType)); + memcpy_offsets_.push_back(mab.Finish()); +} + +void profiler_serializer::process_memset(CUpti_ActivityMemset4 const* r) +{ + MemsetActivityBuilder mab(fbb_); + mab.add_value(r->value); + mab.add_bytes(r->bytes); + mab.add_start(r->start); + mab.add_end(r->end); + mab.add_device_id(r->deviceId); + mab.add_context_id(r->contextId); + mab.add_stream_id(r->streamId); + mab.add_correlation_id(r->correlationId); + mab.add_flags(to_memset_flags(r->flags)); + mab.add_memory_kind(to_memory_kind(r->memoryKind)); + mab.add_graph_node_id(r->graphNodeId); + mab.add_graph_id(r->graphId); + mab.add_channel_id(r->channelID); + mab.add_channel_type(to_channel_type(r->channelType)); + memset_offsets_.push_back(mab.Finish()); +} + +void profiler_serializer::process_overhead(CUpti_ActivityOverhead const* r) +{ + auto object_id = add_object_id(fbb_, r->objectKind, r->objectId); + OverheadActivityBuilder oab(fbb_); + oab.add_overhead_kind(to_overhead_kind(r->overheadKind)); + oab.add_object_id(object_id); + oab.add_start(r->start); + oab.add_end(r->end); + overhead_offsets_.push_back(oab.Finish()); +} + +// Query CUPTI for dropped records, and if any, record in the current activity record +void profiler_serializer::report_num_dropped_records() +{ + size_t num_dropped = 0; + auto rc = cuptiActivityGetNumDroppedRecords(NULL, 0, &num_dropped); + if (rc == CUPTI_SUCCESS && num_dropped > 0) { process_dropped_records(num_dropped); } +} + +// Write out the current flatbuffer and reset state for the next flatbuffer. 
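+// The flatbuffer is handed to the Java writer as a direct ByteBuffer wrapping the
+// builder's memory, so no extra copy is made and the buffer is only valid for the
+// duration of the write call. If the ByteBuffer cannot be created, the batch is
+// dropped with a message on stderr. Afterwards the builder and all pending offset
+// vectors are cleared so the next batch starts empty.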
+void profiler_serializer::write_current_fb() +{ + auto fb_size = fbb_.GetSize(); + if (fb_size > 0) { + auto fb = fbb_.GetBufferPointer(); + auto bytebuf_obj = env_->NewDirectByteBuffer(fb, fb_size); + if (bytebuf_obj != nullptr) { + env_->CallVoidMethod(j_writer_, j_write_method_, bytebuf_obj); + } else { + std::cerr << "PROFILER: Unable to create ByteBuffer for writer" << std::endl; + } + } + fbb_.Clear(); + api_offsets_.clear(); + device_offsets_.clear(); + dropped_offsets_.clear(); + kernel_offsets_.clear(); + marker_offsets_.clear(); + marker_data_offsets_.clear(); + memcpy_offsets_.clear(); + memset_offsets_.clear(); + overhead_offsets_.clear(); +} + +} // namespace spark_rapids_jni::profiler diff --git a/src/main/cpp/profiler/profiler_serializer.hpp b/src/main/cpp/profiler/profiler_serializer.hpp new file mode 100644 index 0000000000..1feebf1b96 --- /dev/null +++ b/src/main/cpp/profiler/profiler_serializer.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "profiler_generated.h" + +#include +#include +#include + +#include +#include + +namespace spark_rapids_jni::profiler { + +// Serializes profile data as flatbuffers +struct profiler_serializer { + profiler_serializer(JNIEnv* env, jobject writer, size_t buffer_size, size_t flush_threshold); + void process_cupti_buffer(uint8_t* buffer, size_t valid_size); + void flush(); + + private: + void write_profile_header(); + void process_api_activity(CUpti_ActivityAPI const*); + void process_device_activity(CUpti_ActivityDevice4 const*); + void process_dropped_records(size_t num_dropped); + void process_marker_activity(CUpti_ActivityMarker2 const*); + void process_marker_data(CUpti_ActivityMarkerData const*); + void process_memcpy(CUpti_ActivityMemcpy5 const*); + void process_memset(CUpti_ActivityMemset4 const*); + void process_kernel(CUpti_ActivityKernel8 const*); + void process_overhead(CUpti_ActivityOverhead const*); + void report_num_dropped_records(); + void write_current_fb(); + + JNIEnv* env_; + jmethodID j_write_method_; + jobject j_writer_; + size_t flush_threshold_; + flatbuffers::FlatBufferBuilder fbb_; + std::vector> api_offsets_; + std::vector> device_offsets_; + std::vector> dropped_offsets_; + std::vector> kernel_offsets_; + std::vector> marker_offsets_; + std::vector> marker_data_offsets_; + std::vector> memcpy_offsets_; + std::vector> memset_offsets_; + std::vector> overhead_offsets_; +}; + +} // namespace spark_rapids_jni::profiler diff --git a/src/main/cpp/profiler/spark_rapids_profile_converter.cpp b/src/main/cpp/profiler/spark_rapids_profile_converter.cpp new file mode 100644 index 0000000000..b916020392 --- /dev/null +++ b/src/main/cpp/profiler/spark_rapids_profile_converter.cpp @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* A tool that converts a spark-rapids profile binary into other forms. */ + +#if 0 +#include +#define FLATBUFFERS_ASSERT(x) \ + do { \ + if (!(x)) { throw std::runtime_error("flatbuffers assert"); } \ + } while (0) +#define FLATBUFFERS_DEBUG_VERIFICATION_FAILURE +#endif + +#include "profiler_generated.h" +#include "spark_rapids_jni_version.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace spark_rapids_jni::profiler { +extern char const* Profiler_Schema; +} + +struct program_options { + std::optional output_path; + bool help = false; + bool json = false; + bool nvtxt = false; + int json_indent = 2; + bool version = false; +}; + +struct event { + enum struct type_id { API, DEVICE, KERNEL, MARKER, MARKER_DATA, MEMCPY, MEMSET, OVERHEAD }; + type_id id; + void const* fb_data; +}; + +struct thread_id { + uint32_t pid; + uint32_t tid; + + bool operator==(thread_id const& o) const { return pid == o.pid && tid == o.tid; } +}; + +template <> +struct std::hash { + std::size_t operator()(thread_id const& t) const + { + return std::hash{}(t.pid) ^ (std::hash{}(t.tid) << 1); + } +}; + +struct stream_id { + uint32_t device; + uint32_t context; + uint32_t stream; + + bool operator==(stream_id const& s) const + { + return device == s.device && context == s.context && stream == s.stream; + } +}; + +template <> +struct std::hash { + std::size_t operator()(stream_id const& s) const + { + return std::hash{}(s.device) ^ (std::hash{}(s.context) << 1) ^ + (std::hash{}(s.stream) << 2); + } +}; + +struct event_streams { + std::unordered_map> cpu; + std::unordered_map> gpu; +}; + +void print_usage() +{ + std::cout << "spark_rapids_profile_converter [OPTION]... profilebin" << std::endl; + std::cout << R"( +Converts the spark-rapids profile in profile.bin into other forms. + + -h, --help show this usage message + -j, --json convert to JSON, default output is stdout + -i, --json-indent=INDENT indentation to use for JSON. 0 is no indent, less than 0 also removes newlines + -o, --output=PATH use PATH as the output filename + -t. 
--nvtxt convert to NVTXT, default output is stdout + -V, --version print the version number + )" << std::endl; +} + +void print_version() +{ + std::cout << "spark_rapids_profile_converter " << spark_rapids_jni::Version << std::endl; +} + +std::pair> parse_options( + std::vector args) +{ + program_options opts{}; + std::string_view long_output("--output="); + std::string_view long_json_indent("--json-indent="); + bool seen_output = false; + bool seen_json_indent = false; + auto argp = args.begin(); + while (argp != args.end()) { + if (*argp == "-o" || *argp == "--output") { + if (seen_output) { throw std::runtime_error("output path cannot be specified twice"); } + seen_output = true; + if (++argp != args.end()) { + opts.output_path = std::make_optional(*argp++); + } else { + throw std::runtime_error("missing argument for output path"); + } + } else if (argp->substr(0, long_output.size()) == long_output) { + if (seen_output) { throw std::runtime_error("output path cannot be specified twice"); } + seen_output = true; + argp->remove_prefix(long_output.size()); + if (argp->empty()) { + throw std::runtime_error("missing argument for output path"); + } else { + opts.output_path = std::make_optional(*argp++); + } + } else if (*argp == "-h" || *argp == "--help") { + opts.help = true; + ++argp; + } else if (*argp == "-i" || *argp == "--json-indent") { + if (seen_json_indent) { throw std::runtime_error("JSON indent cannot be specified twice"); } + seen_json_indent = true; + if (++argp != args.end()) { + auto [ptr, err] = std::from_chars(argp->data(), argp->end(), opts.json_indent); + if (err != std::errc() || ptr != argp->end()) { + throw std::runtime_error("invalid JSON indent value"); + } + ++argp; + } else { + throw std::runtime_error("missing argument for JSON indent"); + } + } else if (argp->substr(0, long_json_indent.size()) == long_json_indent) { + if (seen_json_indent) { throw std::runtime_error("JSON indent cannot be specified twice"); } + seen_json_indent = true; + argp->remove_prefix(long_json_indent.size()); + if (argp->empty()) { + throw std::runtime_error("missing argument for JSON indent"); + } else { + auto [ptr, err] = std::from_chars(argp->data(), argp->end(), opts.json_indent); + if (err != std::errc() || ptr != argp->end()) { + throw std::runtime_error("invalid JSON indent value"); + } + ++argp; + } + } else if (*argp == "-j" || *argp == "--json") { + if (opts.nvtxt) { throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); } + opts.json = true; + ++argp; + } else if (*argp == "-t" || *argp == "--nvtxt") { + if (opts.json) { throw std::runtime_error("JSON and NVTXT output are mutually exclusive"); } + opts.nvtxt = true; + ++argp; + } else if (*argp == "-V" || *argp == "--version") { + opts.version = true; + ++argp; + } else if (argp->empty()) { + throw std::runtime_error("empty argument"); + } else if (argp->at(0) == '-') { + throw std::runtime_error(std::string("unrecognized option: ") + std::string(*argp)); + } else { + break; + } + } + return std::make_pair(opts, std::vector(argp, args.end())); +} + +void checked_read(std::ifstream& in, char* buffer, size_t size) +{ + in.read(buffer, size); + if (in.fail()) { + if (in.eof()) { + throw std::runtime_error("Unexpected EOF"); + } else { + throw std::runtime_error(std::strerror(errno)); + } + } +} + +flatbuffers::uoffset_t read_flatbuffer_size(std::ifstream& in) +{ + flatbuffers::uoffset_t fb_size; + checked_read(in, reinterpret_cast(&fb_size), sizeof(fb_size)); + return flatbuffers::EndianScalar(fb_size); +} + 
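+// A profile is a stream of size-prefixed flatbuffers: a ProfileHeader (magic
+// "spark-rapids profile", version 1) followed by ActivityRecords messages until
+// EOF, each preceded by a flatbuffers::uoffset_t length written by
+// FinishSizePrefixed. A reader therefore loops roughly as follows (sketch only;
+// the convert_to_* functions below are the real implementations):
+//
+//   verify_profile_header(in);
+//   while (in.peek(), !in.eof()) {
+//     auto fb = read_flatbuffer(in);  // length prefix + payload
+//     auto records = validate_fb<ActivityRecords>(*fb, "ActivityRecords");
+//     // ... consume records ...
+//   }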
+std::unique_ptr> read_flatbuffer(std::ifstream& in) +{ + auto size = read_flatbuffer_size(in); + // Allocate a buffer that can hold the flatbuffer along with the prefixed size. + // SizePrefixed APIs require size to be at the front of the buffer and alignment + // of fields is planned out with that size. + auto buffer = std::make_unique>(size + sizeof(flatbuffers::uoffset_t)); + auto size_ptr = reinterpret_cast(buffer->data()); + *size_ptr = size; + checked_read(in, buffer->data() + sizeof(flatbuffers::uoffset_t), size); + return buffer; +} + +std::ofstream open_output(std::filesystem::path const& path, + std::ios::openmode mode = std::ios::out) +{ + if (std::filesystem::exists(path)) { + throw std::runtime_error(path.string() + " already exists"); + } + std::ofstream out(path, mode); + out.exceptions(std::ios::badbit); + return out; +} + +template +T const* validate_fb(std::vector const& fb, std::string_view const& name) +{ + flatbuffers::Verifier::Options verifier_opts; + verifier_opts.assert = true; + flatbuffers::Verifier verifier( + reinterpret_cast(fb.data()), fb.size(), verifier_opts); + if (not verifier.VerifySizePrefixedBuffer(nullptr)) { + throw std::runtime_error(std::string("malformed ") + std::string(name) + " record"); + } + return flatbuffers::GetSizePrefixedRoot(fb.data()); +} + +void verify_profile_header(std::ifstream& in) +{ + auto fb_ptr = read_flatbuffer(in); + auto header = validate_fb(*fb_ptr, "profile header"); + auto magic = header->magic(); + if (magic == nullptr) { + throw std::runtime_error("does not appear to be a spark-rapids profile"); + } + if (magic->str() != "spark-rapids profile") { + std::ostringstream oss; + oss << "bad profile magic, expected 'spark-rapids profile' found '" << magic->str() << "'"; + throw std::runtime_error(oss.str()); + } + auto version = header->version(); + if (version != 1) { + std::ostringstream oss; + oss << "unsupported profile version: " << version; + throw std::runtime_error(oss.str()); + } +} + +void convert_to_nsys_rep(std::ifstream& in, + std::string_view const& in_filename, + program_options const& opts) +{ + event_streams events; + size_t num_dropped_records = 0; + while (!in.eof()) { + auto fb_ptr = read_flatbuffer(in); + auto records = + validate_fb(*fb_ptr, "ActivityRecords"); + auto api = records->api(); + if (api != nullptr) { + for (int i = 0; i < api->size(); ++i) { + auto a = api->Get(i); + thread_id tid{a->process_id(), a->thread_id()}; + event e{event::type_id::API, a}; + auto it = events.cpu.find(tid); + if (it == events.cpu.end()) { + events.cpu.emplace(tid, std::initializer_list{e}); + } else { + it->second.push_back(e); + } + } + } + auto device = records->device(); + if (device != nullptr) { std::cerr << "NUM DEVICES=" << device->size() << std::endl; } + auto dropped = records->dropped(); + if (dropped != nullptr) { + for (int i = 0; i < dropped->size(); ++i) { + auto d = dropped->Get(i); + num_dropped_records += d->num_dropped(); + } + } + auto kernel = records->kernel(); + if (kernel != nullptr) { std::cerr << "NUM KERNEL=" << kernel->size() << std::endl; } + auto marker = records->marker(); + if (marker != nullptr) { std::cerr << "NUM MARKERS=" << marker->size() << std::endl; } + auto marker_data = records->marker_data(); + if (marker_data != nullptr) { + std::cerr << "NUM MARKER DATA=" << marker_data->size() << std::endl; + for (int i = 0; i < marker_data->size(); ++i) { + std::cerr << "MARKER DATA " << i << std::endl; + auto md = marker_data->Get(i); + std::cerr << " FLAGS: " << md->flags(); + 
        std::cerr << " ID: " << md->id();
+        std::cerr << " COLOR: " << md->color();
+        std::cerr << " CATEGORY: " << md->category() << std::endl;
+      }
+    }
+    auto memcpy = records->memcpy();
+    if (memcpy != nullptr) { std::cerr << "NUM MEMCPY=" << memcpy->size() << std::endl; }
+    auto memset = records->memset();
+    if (memset != nullptr) { std::cerr << "NUM MEMSET=" << memset->size() << std::endl; }
+    auto overhead = records->overhead();
+    if (overhead != nullptr) { std::cerr << "NUM OVERHEADS=" << overhead->size() << std::endl; }
+
+    in.peek();
+  }
+  if (not in.eof()) { throw std::runtime_error(std::strerror(errno)); }
+  if (num_dropped_records) {
+    std::cerr << "Warning: " << num_dropped_records
+              << " records were noted as dropped in the profile" << std::endl;
+  }
+}
+
+void convert_to_json(std::ifstream& in, std::ostream& out, program_options const& opts)
+{
+  flatbuffers::Parser parser;
+  if (!parser.Parse(spark_rapids_jni::profiler::Profiler_Schema)) {
+    throw std::runtime_error("Internal error: Unable to parse profiler schema");
+  }
+  parser.opts.strict_json = true;
+  parser.opts.indent_step = opts.json_indent;
+  while (!in.eof()) {
+    auto fb_ptr = read_flatbuffer(in);
+    auto records =
+      validate_fb<spark_rapids_jni::profiler::ActivityRecords>(*fb_ptr, "ActivityRecords");
+    std::string json;
+    char const* err =
+      flatbuffers::GenText(parser, fb_ptr->data() + sizeof(flatbuffers::uoffset_t), &json);
+    if (err != nullptr) { throw std::runtime_error(std::string("Error generating JSON: ") + err); }
+    out << json;
+
+    in.peek();
+  }
+  if (not in.eof()) { throw std::runtime_error(std::strerror(errno)); }
+}
+
+char const* get_api_name(spark_rapids_jni::profiler::ApiActivity const* a)
+{
+  char const* name = nullptr;
+  switch (a->kind()) {
+    case spark_rapids_jni::profiler::ApiKind_Driver:
+      cuptiGetCallbackName(CUPTI_CB_DOMAIN_DRIVER_API, a->cbid(), &name);
+      break;
+    case spark_rapids_jni::profiler::ApiKind_Runtime:
+      cuptiGetCallbackName(CUPTI_CB_DOMAIN_RUNTIME_API, a->cbid(), &name);
+      break;
+    default: {
+      std::ostringstream oss;
+      oss << "unsupported API kind: " << a->kind();
+      throw std::runtime_error(oss.str());
+    }
+  }
+  return name;
+}
+
+std::string demangle(char const* s)
+{
+  int status = 0;
+  char* demangled = abi::__cxa_demangle(s, nullptr, nullptr, &status);
+  if (status == 0) {
+    std::string result(demangled);
+    free(demangled);
+    return result;
+  } else {
+    return s;
+  }
+}
+
+std::string memcpy_to_string(spark_rapids_jni::profiler::MemcpyActivity const* m)
+{
+  char const* kind_str;
+  char const* pinned = "";
+  switch (m->copy_kind()) {
+    case spark_rapids_jni::profiler::MemcpyKind_HtoD:
+      kind_str = "HtoD";
+      if (m->src_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { pinned = " Pinned"; }
+      break;
+    case spark_rapids_jni::profiler::MemcpyKind_DtoH:
+      kind_str = "DtoH";
+      if (m->dst_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { pinned = " Pinned"; }
+      break;
+    case spark_rapids_jni::profiler::MemcpyKind_HtoA:
+      kind_str = "HtoA";
+      if (m->dst_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { pinned = " Pinned"; }
+      break;
+    case spark_rapids_jni::profiler::MemcpyKind_AtoH:
+      kind_str = "AtoH";
+      if (m->dst_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { pinned = " Pinned"; }
+      break;
+    case spark_rapids_jni::profiler::MemcpyKind_AtoA: kind_str = "AtoA"; break;
+    case spark_rapids_jni::profiler::MemcpyKind_AtoD: kind_str = "AtoD"; break;
+    case spark_rapids_jni::profiler::MemcpyKind_DtoA: kind_str = "DtoA"; break;
+    case spark_rapids_jni::profiler::MemcpyKind_DtoD: kind_str = "DtoD";
break; + case spark_rapids_jni::profiler::MemcpyKind_HtoH: + kind_str = "HtoH"; + if (m->src_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned && + m->dst_kind() == m->src_kind()) { + pinned = " Pinned"; + } + break; + case spark_rapids_jni::profiler::MemcpyKind_PtoP: kind_str = "PtoP"; break; + case spark_rapids_jni::profiler::MemcpyKind_Unknown: kind_str = "Unknown"; break; + default: kind_str = "Unknown"; break; + } + std::ostringstream oss; + oss << kind_str << pinned; + oss << " " << m->bytes() << " bytes"; + if (m->flags() == spark_rapids_jni::profiler::MemcpyFlags_Async) { oss << " async"; } + return oss.str(); +} + +const char* memcpy_to_color(spark_rapids_jni::profiler::MemcpyActivity const* m) +{ + switch (m->copy_kind()) { + case spark_rapids_jni::profiler::MemcpyKind_HtoD: + if (m->src_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { return "MediumPurple"; } + return "Gold"; + case spark_rapids_jni::profiler::MemcpyKind_DtoH: + if (m->dst_kind() == spark_rapids_jni::profiler::MemoryKind_Pinned) { return "MediumPurple"; } + return "Gold"; + case spark_rapids_jni::profiler::MemcpyKind_HtoA: + case spark_rapids_jni::profiler::MemcpyKind_AtoH: + case spark_rapids_jni::profiler::MemcpyKind_AtoA: + case spark_rapids_jni::profiler::MemcpyKind_AtoD: + case spark_rapids_jni::profiler::MemcpyKind_DtoA: return "Gold"; + case spark_rapids_jni::profiler::MemcpyKind_DtoD: return "Gold"; + case spark_rapids_jni::profiler::MemcpyKind_HtoH: return "Ivory"; + case spark_rapids_jni::profiler::MemcpyKind_PtoP: return "LightSalmon"; + case spark_rapids_jni::profiler::MemcpyKind_Unknown: + default: return "DarkRed"; + } +} + +std::string memset_to_string(spark_rapids_jni::profiler::MemsetActivity const* m) +{ + std::ostringstream oss; + oss << "Memset " << m->bytes() << " bytes to " << m->value(); + if (m->flags() == spark_rapids_jni::profiler::MemsetFlags_Async) { oss << " async"; } + return oss.str(); +} + +char const* overhead_kind_to_string(spark_rapids_jni::profiler::OverheadKind k) +{ + switch (k) { + case spark_rapids_jni::profiler::OverheadKind_Unknown: return "Unknown"; + case spark_rapids_jni::profiler::OverheadKind_DriverCompiler: return "Driver compiler"; + case spark_rapids_jni::profiler::OverheadKind_CUptiBufferFlush: return "Buffer flush"; + case spark_rapids_jni::profiler::OverheadKind_CUptiInstrumentation: return "Instrumentation"; + case spark_rapids_jni::profiler::OverheadKind_CUptiResource: return "Resource"; + default: return "Unknown"; + } +} + +// Convert a CUPTI thread ID to an NVTXT thread ID. +uint32_t to_nvtxt_tid(uint32_t tid) +{ + // NVTXT thread IDs are limited to 24-bit. + // Take the upper 24 bits which empirically are the most unique bits returned by CUPTI. 
+ return tid >> 8; +} + +void convert_to_nvtxt(std::ifstream& in, std::ostream& out, program_options const& opts) +{ + struct marker_start { + uint64_t timestamp; + uint32_t process_id; + uint32_t thread_id; + uint32_t color; + uint32_t category; + std::string name; + }; + std::unordered_set streams_seen; + std::unordered_map marker_data_map; + std::unordered_map marker_start_map; + size_t num_dropped_records = 0; + out << "@NameProcess,ProcessId,Name" << std::endl; + out << "NameProcess,0,\"GPU\"" << std::endl; + out << "@NameOsThread,ProcessId,ThreadId,Name" << std::endl; + out << "@RangePush,Time,ProcessId,ThreadId,CategoryId,Color,Message" << std::endl; + out << "@RangePop,Time,ProcessId,ThreadId" << std::endl; + out << "TimeBase=Relative" << std::endl; + out << "Payload=0" << std::endl; + while (!in.eof()) { + auto fb_ptr = read_flatbuffer(in); + auto records = + validate_fb(*fb_ptr, "ActivityRecords"); + auto dropped = records->dropped(); + if (dropped != nullptr) { + for (int i = 0; i < dropped->size(); ++i) { + auto d = dropped->Get(i); + num_dropped_records += d->num_dropped(); + } + } + auto api = records->api(); + if (api != nullptr) { + for (int i = 0; i < api->size(); ++i) { + auto a = api->Get(i); + out << "RangePush," << a->start() << "," << a->process_id() << "," + << to_nvtxt_tid(a->thread_id()) << ",0,PaleGreen" + << "," + << "\"" << get_api_name(a) << "\"" << std::endl; + out << "RangePop," << a->end() << "," << a->process_id() << "," + << to_nvtxt_tid(a->thread_id()) << std::endl; + } + } + auto marker_data = records->marker_data(); + if (marker_data != nullptr) { + for (int i = 0; i < marker_data->size(); ++i) { + auto m = marker_data->Get(i); + auto [it, inserted] = marker_data_map.insert({m->id(), m}); + if (not inserted) { + std::ostringstream oss; + oss << "duplicate marker data for " << m->id(); + throw std::runtime_error(oss.str()); + } + } + } + auto marker = records->marker(); + if (marker != nullptr) { + for (int i = 0; i < marker->size(); ++i) { + auto m = marker->Get(i); + auto object_id = m->object_id(); + if (object_id != nullptr) { + uint32_t process_id = object_id->process_id(); + uint32_t thread_id = to_nvtxt_tid(object_id->thread_id()); + if (process_id == 0) { + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + thread_id = object_id->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { out << "NameOsThread,0,\"Stream " << thread_id << "\"" << std::endl; } + } + if (m->flags() & spark_rapids_jni::profiler::MarkerFlags_Start) { + auto it = marker_data_map.find(m->id()); + uint32_t color = 0x444444; + uint32_t category = 0; + if (it != marker_data_map.end()) { + color = it->second->color(); + category = it->second->category(); + } + marker_start ms{ + m->timestamp(), process_id, thread_id, color, category, m->name()->str()}; + auto [ignored, inserted] = marker_start_map.insert({m->id(), ms}); + if (not inserted) { + std::ostringstream oss; + oss << "duplicate marker start for ID " << m->id(); + throw std::runtime_error(oss.str()); + } + } else if (m->flags() & spark_rapids_jni::profiler::MarkerFlags_End) { + auto it = marker_start_map.find(m->id()); + if (it != marker_start_map.end()) { + auto const& ms = it->second; + out << "RangePush," << ms.timestamp << "," << ms.process_id << "," << ms.thread_id + << "," << ms.category << "," << ms.color << "," + << "\"" << ms.name << "\"" << std::endl; + out << "RangePop," << 
m->timestamp() << "," << ms.process_id << "," << ms.thread_id + << std::endl; + marker_start_map.erase(it); + } else { + std::cerr << "Ignoring marker end without start for ID " << m->id() << std::endl; + } + } else { + std::cerr << "Ignoring marker with unsupported flags: " << m->flags() << std::endl; + } + } else { + std::cerr << "Marker " << m->id() << " has no object ID" << std::endl; + } + } + } + marker_data_map.clear(); + auto kernel = records->kernel(); + if (kernel != nullptr) { + for (int i = 0; i < kernel->size(); ++i) { + auto k = kernel->Get(i); + uint32_t process_id = 0; + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + uint32_t thread_id = k->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { + out << "NameOsThread,0," << thread_id << ",\"Stream " << thread_id << "\"" << std::endl; + } + out << "RangePush," << k->start() << "," << process_id << "," << thread_id << ",0,Blue" + << "," + << "\"" << demangle(k->name()->c_str()) << "\"" << std::endl; + out << "RangePop," << k->end() << "," << process_id << "," << thread_id << std::endl; + } + } + auto memcpy = records->memcpy(); + if (memcpy != nullptr) { + for (int i = 0; i < memcpy->size(); ++i) { + auto m = memcpy->Get(i); + uint32_t process_id = 0; + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + uint32_t thread_id = m->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { + out << "NameOsThread,0," << thread_id << ",\"Stream " << thread_id << "\"" << std::endl; + } + out << "RangePush," << m->start() << "," << process_id << "," << thread_id << ",0," + << memcpy_to_color(m) << "," + << "\"" << memcpy_to_string(m) << "\"" << std::endl; + out << "RangePop," << m->end() << "," << process_id << "," << thread_id << std::endl; + } + } + auto memset = records->memset(); + if (memset != nullptr) { + for (int i = 0; i < memset->size(); ++i) { + auto m = memset->Get(i); + uint32_t process_id = 0; + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + uint32_t thread_id = m->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { + out << "NameOsThread,0," << thread_id << ",\"Stream " << thread_id << "\"" << std::endl; + } + out << "RangePush," << m->start() << "," << process_id << "," << thread_id << ",0,Olive" + << "," + << "\"" << memset_to_string(m) << "\"" << std::endl; + out << "RangePop," << m->end() << "," << process_id << "," << thread_id << std::endl; + } + } + auto overhead = records->overhead(); + if (overhead != nullptr) { + for (int i = 0; i < overhead->size(); ++i) { + auto o = overhead->Get(i); + auto object_id = o->object_id(); + if (object_id != nullptr) { + uint32_t process_id = object_id->process_id(); + uint32_t thread_id = to_nvtxt_tid(object_id->thread_id()); + if (process_id == 0) { + // abusing thread ID as stream ID since NVTXT does not support GPU activity directly + thread_id = object_id->stream_id(); + // TODO: Ignoring device ID and context here + auto [it, inserted] = streams_seen.insert(stream_id{0, 0, thread_id}); + if (inserted) { out << "NameOsThread,0,\"Stream " << thread_id << "\"" << std::endl; } + } + out << "RangePush," << o->start() << "," << process_id << "," << thread_id + << ",0,OrangeRed" + << 
"," + << "\"" << overhead_kind_to_string(o->overhead_kind()) << "\"" << std::endl; + out << "RangePop," << o->end() << "," << process_id << "," << thread_id << std::endl; + } else { + std::cerr << "Overhead activity has no object ID" << std::endl; + } + } + } + + in.peek(); + } + if (num_dropped_records) { + std::cerr << "Warning: " << num_dropped_records + << " records were noted as dropped in the profile" << std::endl; + } +} + +int main(int argc, char* argv[]) +{ + constexpr int RESULT_SUCCESS = 0; + constexpr int RESULT_FAILURE = 1; + constexpr int RESULT_USAGE = 2; + program_options opts; + std::vector files; + if (argc < 2) { + print_usage(); + return RESULT_USAGE; + } + std::vector args(argv + 1, argv + argc); + try { + auto [options, inputs] = parse_options(args); + opts = options; + files = inputs; + } catch (std::exception const& e) { + std::cerr << "spark_rapids_profile_converter: " << e.what() << std::endl; + print_usage(); + return RESULT_USAGE; + } + if (opts.help) { + print_usage(); + return RESULT_USAGE; + } + if (opts.version) { + print_version(); + return RESULT_SUCCESS; + } + if (files.size() != 1) { + std::cerr << "Missing input file." << std::endl; + print_usage(); + return RESULT_USAGE; + } + auto input_file = files.front(); + try { + std::ifstream in(std::string(input_file), std::ios::binary | std::ios::in); + in.exceptions(std::istream::badbit); + verify_profile_header(in); + if (opts.json) { + if (opts.output_path) { + std::ofstream out = open_output(opts.output_path.value()); + convert_to_json(in, out, opts); + } else { + convert_to_json(in, std::cout, opts); + } + } else if (opts.nvtxt) { + if (opts.output_path) { + std::ofstream out = open_output(opts.output_path.value()); + convert_to_nvtxt(in, out, opts); + } else { + convert_to_nvtxt(in, std::cout, opts); + } + } else { + convert_to_nsys_rep(in, input_file, opts); + } + } catch (std::system_error const& e) { + std::cerr << "Error converting " << input_file << ": " << e.code().message() << std::endl; + return RESULT_FAILURE; + } catch (std::exception const& e) { + std::cerr << "Error converting " << input_file << ": " << e.what() << std::endl; + return RESULT_FAILURE; + } + return RESULT_SUCCESS; +} diff --git a/src/main/cpp/src/ParseURIJni.cpp b/src/main/cpp/src/ParseURIJni.cpp index 354d47c424..91b898048b 100644 --- a/src/main/cpp/src/ParseURIJni.cpp +++ b/src/main/cpp/src/ParseURIJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -92,4 +92,18 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parseQueryWith } CATCH_STD(env, 0); } + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_ParseURI_parsePath(JNIEnv* env, + jclass, + jlong input_column) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_column); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::parse_uri_to_path(*input).release()); + } + CATCH_STD(env, 0); +} } diff --git a/src/main/cpp/src/RegexRewriteUtilsJni.cpp b/src/main/cpp/src/RegexRewriteUtilsJni.cpp new file mode 100644 index 0000000000..28f346582c --- /dev/null +++ b/src/main/cpp/src/RegexRewriteUtilsJni.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cudf_jni_apis.hpp" +#include "dtype_utils.hpp" +#include "jni_utils.hpp" +#include "regex_rewrite_utils.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_RegexRewriteUtils_literalRangePattern( + JNIEnv* env, jclass, jlong input, jlong target, jint d, jint start, jint end) +{ + JNI_NULL_CHECK(env, input, "input column is null", 0); + JNI_NULL_CHECK(env, target, "target is null", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::column_view* cv = reinterpret_cast(input); + cudf::strings_column_view scv(*cv); + cudf::string_scalar* ss_scalar = reinterpret_cast(target); + return cudf::jni::release_as_jlong( + spark_rapids_jni::literal_range_pattern(scv, *ss_scalar, d, start, end)); + } + CATCH_STD(env, 0); +} +} diff --git a/src/main/cpp/src/bloom_filter.cu b/src/main/cpp/src/bloom_filter.cu index d5f868c476..5dfdd582ef 100644 --- a/src/main/cpp/src/bloom_filter.cu +++ b/src/main/cpp/src/bloom_filter.cu @@ -225,7 +225,7 @@ std::pair get_bloom_filter_stride(int bloom_filter_longs) std::unique_ptr bloom_filter_create(int num_hashes, int bloom_filter_longs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto [bloom_filter_size, buf_size] = get_bloom_filter_stride(bloom_filter_longs); @@ -276,7 +276,7 @@ void bloom_filter_put(cudf::list_scalar& bloom_filter, std::unique_ptr bloom_filter_merge(cudf::column_view const& bloom_filters, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // unpack the bloom filter cudf::lists_column_view lcv(bloom_filters); @@ -339,7 +339,7 @@ std::unique_ptr bloom_filter_merge(cudf::column_view const& b std::unique_ptr bloom_filter_probe(cudf::column_view const& input, cudf::device_span bloom_filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // unpack the bloom filter auto [header, buffer, bloom_filter_bits] = unpack_bloom_filter(bloom_filter, stream); @@ -368,7 +368,7 @@ std::unique_ptr bloom_filter_probe(cudf::column_view const& input, std::unique_ptr bloom_filter_probe(cudf::column_view const& input, cudf::list_scalar& bloom_filter, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return bloom_filter_probe(input, bloom_filter.view(), stream, mr); } diff --git a/src/main/cpp/src/bloom_filter.hpp b/src/main/cpp/src/bloom_filter.hpp index e54d26f630..9bb83e0b8b 100644 --- a/src/main/cpp/src/bloom_filter.hpp +++ b/src/main/cpp/src/bloom_filter.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace spark_rapids_jni { @@ -46,8 +47,8 @@ constexpr int bloom_filter_header_size = sizeof(bloom_filter_header); std::unique_ptr bloom_filter_create( int num_hashes, int bloom_filter_longs, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Inserts input values into a bloom filter. @@ -77,8 +78,8 @@ void bloom_filter_put(cudf::list_scalar& bloom_filter, std::unique_ptr bloom_filter_probe( cudf::column_view const& input, cudf::device_span bloom_filter, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Probe a bloom filter with an input column of int64_t values. @@ -94,8 +95,8 @@ std::unique_ptr bloom_filter_probe( std::unique_ptr bloom_filter_probe( cudf::column_view const& input, cudf::list_scalar& bloom_filter, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Merge multiple bloom filters into a single output. @@ -112,7 +113,7 @@ std::unique_ptr bloom_filter_probe( */ std::unique_ptr bloom_filter_merge( cudf::column_view const& bloom_filters, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/cast_decimal_to_string.cu b/src/main/cpp/src/cast_decimal_to_string.cu index 9d0e27ed59..099ac59a04 100644 --- a/src/main/cpp/src/cast_decimal_to_string.cu +++ b/src/main/cpp/src/cast_decimal_to_string.cu @@ -52,8 +52,9 @@ namespace { template struct decimal_to_non_ansi_string_fn { column_device_view d_decimals; - size_type* d_offsets{}; - char* d_chars{}; + cudf::size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; /** * @brief Calculates the size of the string required to convert the element, in base-10 format. 
@@ -162,13 +163,13 @@ struct decimal_to_non_ansi_string_fn { __device__ void operator()(size_type idx) { if (d_decimals.is_null(idx)) { - if (d_chars == nullptr) { d_offsets[idx] = 0; } + if (d_chars == nullptr) { d_sizes[idx] = 0; } return; } if (d_chars != nullptr) { decimal_to_non_ansi_string(idx); } else { - d_offsets[idx] = compute_output_size(d_decimals.element(idx)); + d_sizes[idx] = compute_output_size(d_decimals.element(idx)); } } }; @@ -180,7 +181,7 @@ struct dispatch_decimal_to_non_ansi_string_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { using DecimalType = device_storage_type_t; // underlying value type @@ -199,7 +200,7 @@ struct dispatch_decimal_to_non_ansi_string_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const&, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const + rmm::device_async_resource_ref) const { CUDF_FAIL("Values for decimal_to_non_ansi_string function must be a decimal type."); } @@ -209,7 +210,7 @@ struct dispatch_decimal_to_non_ansi_string_fn { std::unique_ptr decimal_to_non_ansi_string(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); return type_dispatcher(input.type(), dispatch_decimal_to_non_ansi_string_fn{}, input, stream, mr); @@ -221,7 +222,7 @@ std::unique_ptr decimal_to_non_ansi_string(column_view const& input, std::unique_ptr decimal_to_non_ansi_string(column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::decimal_to_non_ansi_string(input, stream, mr); diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index 78cedbbf64..5af0d8c5ce 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,8 +34,9 @@ namespace { template struct float_to_string_fn { cudf::column_device_view d_floats; - cudf::size_type* d_offsets; + cudf::size_type* d_sizes; char* d_chars; + cudf::detail::input_offsetalator d_offsets; __device__ cudf::size_type compute_output_size(cudf::size_type idx) const { @@ -56,13 +57,13 @@ struct float_to_string_fn { __device__ void operator()(cudf::size_type idx) const { if (d_floats.is_null(idx)) { - if (d_chars == nullptr) { d_offsets[idx] = 0; } + if (d_chars == nullptr) { d_sizes[idx] = 0; } return; } if (d_chars != nullptr) { float_to_string(idx); } else { - d_offsets[idx] = compute_output_size(idx); + d_sizes[idx] = compute_output_size(idx); } } }; @@ -76,7 +77,7 @@ struct dispatch_float_to_string_fn { template )> std::unique_ptr operator()(cudf::column_view const& floats, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const strings_count = floats.size(); if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } @@ -97,7 +98,7 @@ struct dispatch_float_to_string_fn { template )> std::unique_ptr operator()(cudf::column_view const&, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref) { CUDF_FAIL("Values for float_to_string function must be a float type."); } @@ -108,7 +109,7 @@ struct dispatch_float_to_string_fn { // This will convert all float column types into a strings column. std::unique_ptr float_to_string(cudf::column_view const& floats, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr); } @@ -118,10 +119,10 @@ std::unique_ptr float_to_string(cudf::column_view const& floats, // external API std::unique_ptr float_to_string(cudf::column_view const& floats, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::float_to_string(floats, stream, mr); } -} // namespace spark_rapids_jni \ No newline at end of file +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu index a961981160..bfbbc3777d 100644 --- a/src/main/cpp/src/cast_string.cu +++ b/src/main/cpp/src/cast_string.cu @@ -38,13 +38,14 @@ namespace detail { constexpr auto NUM_THREADS{256}; /** - * @brief Identify if a character is whitespace. + * @brief Identify if a character is whitespace or C0 control code. 
* * @param chr character to test * @return true if character is a whitespace character */ constexpr bool is_whitespace(char const chr) { + if (chr >= 0x0000 && chr <= 0x001F) { return true; } switch (chr) { case ' ': case '\r': @@ -649,7 +650,7 @@ struct string_to_integer_impl { bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (string_col.size() == 0) { return std::make_unique( @@ -694,7 +695,7 @@ struct string_to_integer_impl { bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FAIL("Invalid integer column type"); } @@ -721,7 +722,7 @@ struct string_to_decimal_impl { bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { using Type = device_storage_type_t; @@ -763,7 +764,7 @@ struct string_to_decimal_impl { bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FAIL("Invalid decimal column type"); } @@ -788,7 +789,7 @@ std::unique_ptr string_to_integer(data_type dtype, bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher( dtype, detail::string_to_integer_impl{}, string_col, ansi_mode, strip, stream, mr); @@ -813,7 +814,7 @@ std::unique_ptr string_to_decimal(int32_t precision, bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { data_type dtype = [precision, scale]() { if (precision <= cuda::std::numeric_limits::digits10) diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index 43ec36e576..2850fbfae5 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include namespace spark_rapids_jni { @@ -73,7 +75,7 @@ std::unique_ptr string_to_integer( bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Convert a string column into an decimal column. @@ -95,7 +97,7 @@ std::unique_ptr string_to_decimal( bool ansi_mode, bool strip, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Convert a string column into an float column. 
@@ -113,22 +115,22 @@ std::unique_ptr string_to_float( cudf::strings_column_view const& string_col, bool ansi_mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr format_float( cudf::column_view const& input, int const digits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr float_to_string( cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr decimal_to_non_ansi_string( cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/cast_string_to_float.cu b/src/main/cpp/src/cast_string_to_float.cu index 5c3c749f02..e843d645ce 100644 --- a/src/main/cpp/src/cast_string_to_float.cu +++ b/src/main/cpp/src/cast_string_to_float.cu @@ -25,6 +25,8 @@ #include #include +#include + #include using namespace cudf; @@ -36,13 +38,14 @@ namespace detail { __device__ __inline__ bool is_digit(char c) { return c >= '0' && c <= '9'; } /** - * @brief Identify if a character is whitespace. + * @brief Identify if a character is whitespace or C0 control code. * * @param chr character to test * @return true if character is a whitespace character */ constexpr bool is_whitespace(char const chr) { + if (chr >= 0x0000 && chr <= 0x001F) { return true; } switch (chr) { case ' ': case '\r': @@ -654,7 +657,7 @@ std::unique_ptr string_to_float(data_type dtype, strings_column_view const& string_col, bool ansi_mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(dtype == data_type{type_id::FLOAT32} || dtype == data_type{type_id::FLOAT64}, "invalid float data type"); diff --git a/src/main/cpp/src/datetime_rebase.cu b/src/main/cpp/src/datetime_rebase.cu index 9e8e791490..976c9b1530 100644 --- a/src/main/cpp/src/datetime_rebase.cu +++ b/src/main/cpp/src/datetime_rebase.cu @@ -25,6 +25,7 @@ // #include +#include // #include @@ -56,7 +57,7 @@ __device__ __inline__ auto days_from_julian(cuda::std::chrono::year_month_day co // This is to match with Apache Spark's `localRebaseGregorianToJulianDays` function. std::unique_ptr gregorian_to_julian_days(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::TIMESTAMP_DAYS, "The input column type must be microsecond timestamp.", @@ -127,7 +128,7 @@ __device__ __inline__ cuda::std::chrono::year_month_day julian_from_days(int32_t // `localRebaseJulianToGregorianDays` function. 
std::unique_ptr julian_to_gregorian_days(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::TIMESTAMP_DAYS, "The input column type must be microsecond timestamp.", @@ -227,7 +228,7 @@ __device__ __inline__ time_components get_time_components(int64_t micros) // fixed to UTC. std::unique_ptr gregorian_to_julian_micros(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::TIMESTAMP_MICROSECONDS, "The input column type must be microsecond timestamp.", @@ -290,7 +291,7 @@ std::unique_ptr gregorian_to_julian_micros(cudf::column_view const // fixed to UTC. std::unique_ptr julian_to_gregorian_micros(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::TIMESTAMP_MICROSECONDS, "The input column type must be microsecond timestamp.", diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index 1d537595d7..8d316d6cbf 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -35,8 +35,9 @@ template struct format_float_fn { cudf::column_device_view d_floats; int digits; - cudf::size_type* d_offsets; + cudf::size_type* d_sizes; char* d_chars; + cudf::detail::input_offsetalator d_offsets; __device__ cudf::size_type compute_output_size(FloatType const value) const { @@ -56,13 +57,13 @@ struct format_float_fn { __device__ void operator()(cudf::size_type const idx) const { if (d_floats.is_null(idx)) { - if (d_chars == nullptr) { d_offsets[idx] = 0; } + if (d_chars == nullptr) { d_sizes[idx] = 0; } return; } if (d_chars != nullptr) { format_float(idx); } else { - d_offsets[idx] = compute_output_size(d_floats.element(idx)); + d_sizes[idx] = compute_output_size(d_floats.element(idx)); } } }; @@ -77,7 +78,7 @@ struct dispatch_format_float_fn { std::unique_ptr operator()(cudf::column_view const& floats, int const digits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { auto const strings_count = floats.size(); if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } @@ -99,7 +100,7 @@ struct dispatch_format_float_fn { std::unique_ptr operator()(cudf::column_view const&, int const, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const + rmm::device_async_resource_ref) const { CUDF_FAIL("Values for format_float function must be a float type."); } @@ -111,7 +112,7 @@ struct dispatch_format_float_fn { std::unique_ptr format_float(cudf::column_view const& floats, int const digits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr); } @@ -122,10 +123,10 @@ std::unique_ptr format_float(cudf::column_view const& floats, std::unique_ptr format_float(cudf::column_view const& floats, int const digits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::format_float(floats, digits, stream, mr); } -} // namespace spark_rapids_jni \ No newline at end of file +} // namespace spark_rapids_jni diff --git 
a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index f2e8ce0006..bffa528e18 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -807,7 +807,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha // Values in the interval [1E-3, 1E7) are special. if (scientificNotation) { // Print in the format x.xxxxxE-yy. - for (int i = 0; i < olength - 1; ++i) { + for (auto i = 0; i < olength - 1; ++i) { int const c = output % 10; output /= 10; result[index + olength - i] = (char)('0' + c); @@ -836,23 +836,23 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha // Decimal dot is before any of the digits. result[index++] = '0'; result[index++] = '.'; - for (int i = -1; i > exp; i--) { + for (auto i = -1; i > exp; i--) { result[index++] = '0'; } int current = index; - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { result[current + olength - i - 1] = (char)('0' + output % 10); output /= 10; index++; } } else if (exp + 1 >= static_cast(olength)) { // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { result[index + olength - i - 1] = (char)('0' + output % 10); output /= 10; } index += olength; - for (int i = olength; i < exp + 1; i++) { + for (auto i = olength; i < exp + 1; i++) { result[index++] = '0'; } result[index++] = '.'; @@ -860,7 +860,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha } else { // Decimal dot is somewhere between the digits. int current = index + 1; - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { if (olength - i - 1 == exp) { result[current + olength - i - 1] = '.'; current--; @@ -926,7 +926,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha if (scientificNotation) { // Print in the format x.xxxxxE-yy. - for (int i = 0; i < olength - 1; i++) { + for (auto i = 0; i < olength - 1; i++) { int c = output % 10; output /= 10; result[index + olength - i] = (char)('0' + c); @@ -950,23 +950,23 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha // Decimal dot is before any of the digits. result[index++] = '0'; result[index++] = '.'; - for (int i = -1; i > exp; i--) { + for (auto i = -1; i > exp; i--) { result[index++] = '0'; } int current = index; - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { result[current + olength - i - 1] = (char)('0' + output % 10); output /= 10; index++; } } else if (exp + 1 >= olength) { // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { result[index + olength - i - 1] = (char)('0' + output % 10); output /= 10; } index += olength; - for (int i = olength; i < exp + 1; i++) { + for (auto i = olength; i < exp + 1; i++) { result[index++] = '0'; } result[index++] = '.'; @@ -974,7 +974,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha } else { // Decimal dot is somewhere between the digits. 
int current = index + 1; - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { if (olength - i - 1 == exp) { result[current + olength - i - 1] = '.'; current--; @@ -1284,7 +1284,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const if (digits == 0) { return index; } result[index++] = '.'; int actual_round = digits; - for (int i = -1; i > exp; i--) { + for (auto i = -1; i > exp; i--) { index_for_carrier = index; result[index++] = '0'; actual_round--; @@ -1301,14 +1301,14 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const rounded_output -= POW10_TABLE[actual_olength]; } int current = index; - for (int i = 0; i < actual_olength; i++) { + for (auto i = 0; i < actual_olength; i++) { result[current + actual_olength - i - 1] = (char)('0' + rounded_output % 10); rounded_output /= 10; index++; } actual_round -= actual_olength; if (actual_round > 0) { - for (int i = 0; i < actual_round; i++) { + for (auto i = 0; i < actual_round; i++) { result[index++] = '0'; } } @@ -1317,7 +1317,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const int integer_len = index + exp + 1 + exp / 3; int sep_cnt = 0; int rev_index = 0; - for (int i = olength; i < exp + 1; i++) { + for (auto i = olength; i < exp + 1; i++) { result[integer_len - (rev_index++) - 1] = '0'; sep_cnt++; if (sep_cnt == 3) { @@ -1325,7 +1325,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const sep_cnt = 0; } } - for (int i = 0; i < olength; i++) { + for (auto i = 0; i < olength; i++) { if (sep_cnt == 3) { result[integer_len - (rev_index++) - 1] = ','; sep_cnt = 0; @@ -1337,7 +1337,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const index = integer_len; if (digits == 0) { return index; } result[index++] = '.'; - for (int i = 0; i < digits; i++) { + for (auto i = 0; i < digits; i++) { result[index++] = '0'; } } else { @@ -1356,7 +1356,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const int32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; int32_t sep_cnt = 0; int rev_index = 0; - for (int i = 0; i < integer_len; i++) { + for (auto i = 0; i < integer_len; i++) { if (sep_cnt == 3) { result[formated_integer_len - (rev_index++) - 1] = ','; sep_cnt = 0; @@ -1369,11 +1369,11 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const if (digits == 0) { return index; } result[index++] = '.'; int current = index; - for (int i = 0; i < tailing_zero; i++) { + for (auto i = 0; i < tailing_zero; i++) { result[current + digits - i - 1] = '0'; index++; } - for (int i = tailing_zero; i < digits; i++) { + for (auto i = tailing_zero; i < digits; i++) { result[current + digits - i - 1] = (char)('0' + decimal % 10); decimal /= 10; index++; @@ -1430,7 +1430,7 @@ __device__ inline int copy_format_special_str(char* const result, } else { result[sign + 1] = '.'; } - for (int i = 0; i < digits; i++) { + for (auto i = 0; i < digits; i++) { result[sign + 2 + i] = '0'; } return sign + 2 + digits; diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index c7c6c242b8..887b9887de 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -15,18 +15,15 @@ */ #include "get_json_object.hpp" +#include "json_parser.cuh" #include #include #include -#include #include #include #include #include -#include -#include -#include #include #include #include @@ 
-34,26 +31,29 @@ #include #include #include -#include #include #include #include #include -#include #include -#include #include namespace spark_rapids_jni { namespace detail { +// path max depth limitation +// There is a same constant in JSONUtil.java, keep them consistent when changing +// Note: Spark-Rapids should guarantee the path depth is less or equal to this limit, +// or GPU reports cudaErrorIllegalAddress +constexpr int max_path_depth = 16; + /** * write JSON style */ -enum class write_style { raw_style, quoted_style, flatten_style }; +enum class write_style { RAW, QUOTED, FLATTEN }; /** * path instruction @@ -276,8 +276,6 @@ class json_generator { } } - __device__ void reset() { output_len = 0; } - __device__ inline size_t get_output_len() const { return output_len; } __device__ inline char* get_output_start_position() const { return output; } __device__ inline char* get_current_output_position() const { return output + output_len; } @@ -305,353 +303,55 @@ class json_generator { */ __device__ inline bool path_is_empty(size_t path_size) { return path_size == 0; } -__device__ inline bool path_match_element(path_instruction const* path_ptr, - size_t path_size, +__device__ inline bool path_match_element(cudf::device_span path, path_instruction_type path_type0) { - if (path_size < 1) { return false; } - return path_ptr[0].type == path_type0; + if (path.size() < 1) { return false; } + return path.data()[0].type == path_type0; } -__device__ inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, +__device__ inline bool path_match_elements(cudf::device_span path, path_instruction_type path_type0, path_instruction_type path_type1) { - if (path_size < 2) { return false; } - return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1; + if (path.size() < 2) { return false; } + return path.data()[0].type == path_type0 && path.data()[1].type == path_type1; } -__device__ inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0, - path_instruction_type path_type1, - path_instruction_type path_type2, - path_instruction_type path_type3) +__device__ inline thrust::tuple path_match_index( + cudf::device_span path) { - if (path_size < 4) { return false; } - return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1 && - path_ptr[2].type == path_type2 && path_ptr[3].type == path_type3; -} - -__device__ inline thrust::tuple path_match_subscript_index( - path_instruction const* path_ptr, size_t path_size) -{ - auto match = path_match_elements( - path_ptr, path_size, path_instruction_type::SUBSCRIPT, path_instruction_type::INDEX); + auto match = path_match_element(path, path_instruction_type::INDEX); if (match) { - return thrust::make_tuple(true, path_ptr[1].index); + return thrust::make_tuple(true, path.data()[0].index); } else { return thrust::make_tuple(false, 0); } } __device__ inline thrust::tuple path_match_named( - path_instruction const* path_ptr, size_t path_size) + cudf::device_span path) { - auto match = path_match_element(path_ptr, path_size, path_instruction_type::NAMED); + auto match = path_match_element(path, path_instruction_type::NAMED); if (match) { - return thrust::make_tuple(true, path_ptr[0].name); + return thrust::make_tuple(true, path.data()[0].name); } else { return thrust::make_tuple(false, cudf::string_view()); } } -__device__ inline thrust::tuple path_match_subscript_index_subscript_wildcard( - path_instruction const* path_ptr, size_t path_size) +__device__ 
inline thrust::tuple path_match_index_wildcard( + cudf::device_span path) { - auto match = path_match_elements(path_ptr, - path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::INDEX, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD); + auto match = + path_match_elements(path, path_instruction_type::INDEX, path_instruction_type::WILDCARD); if (match) { - return thrust::make_tuple(true, path_ptr[1].index); + return thrust::make_tuple(true, path.data()[0].index); } else { return thrust::make_tuple(false, 0); } } -/** - * - * The following commented function is recursive version, - * The next function below is the rewritten version, - * Keep version here is for review purpuse, because rewritten version(iterative) - * is not human friendly. - * - */ -// __device__ bool evaluate_path(json_parser& p, -// json_generator& g, -// write_style style, -// path_instruction const* path_ptr, -// int path_size) -// { -// auto token = p.get_current_token(); - -// // case (VALUE_STRING, Nil) if style == RawStyle -// // case path 1 -// if (json_token::VALUE_STRING == token && path_is_empty(path_size) && -// style == write_style::raw_style) { -// // there is no array wildcard or slice parent, emit this string without -// // quotes write current string in parser to generator -// g.write_raw(p); -// return true; -// } -// // case (START_ARRAY, Nil) if style == FlattenStyle -// // case path 2 -// else if (json_token::START_ARRAY == token && path_is_empty(path_size) && -// style == write_style::flatten_style) { -// // flatten this array into the parent -// bool dirty = false; -// while (json_token::END_ARRAY != p.next_token()) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// dirty |= path_evaluator::evaluate_path(p, g, style, nullptr, 0); -// } -// return dirty; -// } -// // case (_, Nil) -// // case path 3 -// else if (path_is_empty(path_size)) { -// // general case: just copy the child tree verbatim -// return g.copy_current_structure(p); -// } -// // case (START_OBJECT, Key :: xs) -// // case path 4 -// else if (json_token::START_OBJECT == token && -// path_match_element(path_ptr, path_size, path_instruction_type::KEY)) { -// bool dirty = false; -// while (json_token::END_OBJECT != p.next_token()) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// if (dirty) { -// // once a match has been found we can skip other fields -// if (!p.try_skip_children()) { -// // JSON validation check -// return false; -// } -// } else { -// dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); -// } -// } -// return dirty; -// } -// // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) -// // case path 5 -// else if (json_token::START_ARRAY == token && -// path_match_elements(path_ptr, -// path_size, -// path_instruction_type::SUBSCRIPT, -// path_instruction_type::WILDCARD, -// path_instruction_type::SUBSCRIPT, -// path_instruction_type::WILDCARD)) { -// // special handling for the non-structure preserving double wildcard -// // behavior in Hive -// bool dirty = false; -// g.write_start_array(); -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// dirty |= path_evaluator::evaluate_path( -// p, g, write_style::flatten_style, path_ptr + 4, path_size - 4); -// } -// g.write_end_array(); -// return dirty; -// } -// // case 
(START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle -// // case path 6 -// else if (json_token::START_ARRAY == token && -// path_match_elements(path_ptr, -// path_size, -// path_instruction_type::SUBSCRIPT, -// path_instruction_type::WILDCARD) && -// style != write_style::quoted_style) { -// // retain Flatten, otherwise use Quoted... cannot use Raw within an array -// write_style next_style = write_style::raw_style; -// switch (style) { -// case write_style::raw_style: next_style = write_style::quoted_style; break; -// case write_style::flatten_style: next_style = write_style::flatten_style; break; -// case write_style::quoted_style: next_style = write_style::quoted_style; // never happen -// } - -// // temporarily buffer child matches, the emitted json will need to be -// // modified slightly if there is only a single element written - -// int dirty = 0; -// // create a child generator with hide outer array tokens mode. -// auto child_g = g.new_child_generator(/*hide_outer_array_tokens*/ true); - -// // Note: child generator does not actually write the outer start array -// // token into buffer it only updates internal nested state -// child_g.write_start_array(); - -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// // track the number of array elements and only emit an outer array if -// // we've written more than one element, this matches Hive's behavior -// dirty += -// (path_evaluator::evaluate_path(p, child_g, next_style, path_ptr + 2, path_size - 2) ? 1 -// : -// 0); -// } - -// // Note: child generator does not actually write the outer end array token -// // into buffer it only updates internal nested state -// child_g.write_end_array(); - -// char* child_g_start = child_g.get_output_start_position(); -// size_t child_g_len = child_g.get_output_len(); // len already excluded outer [ ] - -// if (dirty > 1) { -// // add outer array tokens -// g.write_child_raw_value(child_g_start, child_g_len, true); -// } else if (dirty == 1) { -// // remove outer array tokens -// g.write_child_raw_value(child_g_start, child_g_len, false); -// } // else do not write anything - -// return dirty > 0; -// } -// // case (START_ARRAY, Subscript :: Wildcard :: xs) -// // case path 7 -// else if (json_token::START_ARRAY == token && -// path_match_elements(path_ptr, -// path_size, -// path_instruction_type::SUBSCRIPT, -// path_instruction_type::WILDCARD)) { -// bool dirty = false; -// g.write_start_array(); -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// // wildcards can have multiple matches, continually update the dirty -// // count -// dirty |= path_evaluator::evaluate_path( -// p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); -// } -// g.write_end_array(); - -// return dirty; -// } -// /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ -// // case path 8 -// else if (json_token::START_ARRAY == token && -// thrust::get<0>(path_match_subscript_index_subscript_wildcard(path_ptr, path_size))) -// { -// int idx = thrust::get<1>(path_match_subscript_index_subscript_wildcard(path_ptr, -// path_size)); p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// int i = idx; -// while (i >= 0) { -// if (p.get_current_token() == json_token::END_ARRAY) { -// // terminate, 
nothing has been written -// return false; -// } -// if (0 == i) { -// bool dirty = path_evaluator::evaluate_path( -// p, g, write_style::quoted_style, path_ptr + 2, path_size - 2); -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// // advance the token stream to the end of the array -// if (!p.try_skip_children()) { return false; } -// } -// return dirty; -// } else { -// // i > 0 -// if (!p.try_skip_children()) { return false; } - -// p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } -// } -// --i; -// } -// // path parser guarantees idx >= 0 -// // will never reach to here -// return false; -// } -// // case (START_ARRAY, Subscript :: Index(idx) :: xs) -// // case path 9 -// else if (json_token::START_ARRAY == token && -// thrust::get<0>(path_match_subscript_index(path_ptr, path_size))) { -// int idx = thrust::get<1>(path_match_subscript_index(path_ptr, path_size)); -// p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// int i = idx; -// while (i >= 0) { -// if (p.get_current_token() == json_token::END_ARRAY) { -// // terminate, nothing has been written -// return false; -// } -// if (0 == i) { -// bool dirty = path_evaluator::evaluate_path(p, g, style, path_ptr + 2, path_size - 2); -// while (p.next_token() != json_token::END_ARRAY) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// // advance the token stream to the end of the array -// if (!p.try_skip_children()) { return false; } -// } -// return dirty; -// } else { -// // i > 0 -// if (!p.try_skip_children()) { return false; } - -// p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } -// } -// --i; -// } -// // path parser guarantees idx >= 0 -// // will never reach to here -// return false; -// } -// // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name -// // case path 10 -// else if (json_token::FIELD_NAME == token && -// thrust::get<0>(path_match_named(path_ptr, path_size)) && -// p.match_current_field_name(thrust::get<1>(path_match_named(path_ptr, path_size)))) { -// if (p.next_token() != json_token::VALUE_NULL) { -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); -// } else { -// return false; -// } -// } -// // case (FIELD_NAME, Wildcard :: xs) -// // case path 11 -// else if (json_token::FIELD_NAME == token && -// path_match_element(path_ptr, path_size, path_instruction_type::WILDCARD)) { -// p.next_token(); -// // JSON validation check -// if (json_token::ERROR == p.get_current_token()) { return false; } - -// return path_evaluator::evaluate_path(p, g, style, path_ptr + 1, path_size - 1); -// } -// // case _ => -// // case path 12 -// else { -// if (!p.try_skip_children()) { return false; } -// return false; -// } -// } - /** * * This function is rewritten from above commented recursive function. 
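// Illustrative sketch (not part of the patch): the hunks that follow keep `evaluate_path`
// iterative by replacing recursion with a fixed-size, manually maintained stack of `context`
// frames, bounded by the `max_path_depth` constant introduced above. Reduced to a
// self-contained toy (all names below are hypothetical, not the kernel's real types), the
// control flow being used is:
#include <array>
namespace sketch {
struct frame {
  int depth;          // stands in for the remaining JSONPath of the real code
  bool task_is_done;  // set once this frame has consumed its input
  int dirty;          // result propagated back to the parent frame
};
// Visits `root_depth` nested levels without recursion, mirroring the push_context loop below.
inline int evaluate_iteratively(int root_depth)
{
  constexpr int max_depth = 16;  // the real code enforces this bound via max_path_depth
  if (root_depth > max_depth) { return 0; }
  std::array<frame, max_depth + 1> stack{};
  int stack_pos      = 0;
  stack[stack_pos++] = frame{root_depth, false, 0};
  int result = 0;
  while (stack_pos > 0) {
    frame& ctx = stack[stack_pos - 1];
    if (!ctx.task_is_done) {
      if (ctx.depth > 0) {
        // Instead of recursing, push a child frame and revisit this one after it finishes.
        ctx.task_is_done   = true;
        stack[stack_pos++] = frame{ctx.depth - 1, false, 0};
      } else {
        ctx.dirty        = 1;  // leaf work happens here in the real kernel
        ctx.task_is_done = true;
      }
    } else {
      // Frame finished: fold its result into the parent (or the overall result) and pop it.
      if (stack_pos > 1) {
        stack[stack_pos - 2].dirty += ctx.dirty;
      } else {
        result = ctx.dirty;
      }
      --stack_pos;
    }
  }
  return result;
}
}  // namespace sketch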
@@ -660,8 +360,7 @@ __device__ inline thrust::tuple path_match_subscript_index_subscript_ __device__ bool evaluate_path(json_parser& p, json_generator& root_g, write_style root_style, - path_instruction const* root_path_ptr, - int root_path_size) + cudf::device_span root_path) { // manually maintained context stack in lieu of calling evaluate_path recursively. struct context { @@ -675,9 +374,8 @@ __device__ bool evaluate_path(json_parser& p, json_generator g; write_style style; - path_instruction const* path_ptr; - int path_size; + cudf::device_span path; // is this context task is done bool task_is_done; @@ -692,12 +390,6 @@ __device__ bool evaluate_path(json_parser& p, json_generator child_g; }; - // path max depth limitation - // There is a same constant in JSONUtil.java, keep them consistent when changing - // Note: Spark-Rapids should guarantee the path depth is less or equal to this limit, - // or GPU reports cudaErrorIllegalAddress - constexpr int max_path_depth = 16; - // define stack; plus 1 indicates root context task needs an extra memory context stack[max_path_depth + 1]; int stack_pos = 0; @@ -707,8 +399,7 @@ __device__ bool evaluate_path(json_parser& p, int _case_path, json_generator _g, write_style _style, - path_instruction const* _path_ptr, - int _path_size) { + cudf::device_span _path) { // no need to check stack is full // because Spark-Rapids already checked maximum length of `path_instruction` auto& ctx = stack[stack_pos]; @@ -716,8 +407,7 @@ __device__ bool evaluate_path(json_parser& p, ctx.case_path = _case_path; ctx.g = _g; ctx.style = _style; - ctx.path_ptr = _path_ptr; - ctx.path_size = _path_size; + ctx.path = _path; ctx.task_is_done = false; ctx.dirty = 0; ctx.is_first_enter = true; @@ -726,7 +416,7 @@ __device__ bool evaluate_path(json_parser& p, }; // put the first context task - push_context(p.get_current_token(), -1, root_g, root_style, root_path_ptr, root_path_size); + push_context(p.get_current_token(), -1, root_g, root_style, root_path); while (stack_pos > 0) { auto& ctx = stack[stack_pos - 1]; @@ -735,8 +425,8 @@ __device__ bool evaluate_path(json_parser& p, // case (VALUE_STRING, Nil) if style == RawStyle // case path 1 - if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path_size) && - ctx.style == write_style::raw_style) { + if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path.size()) && + ctx.style == write_style::RAW) { // there is no array wildcard or slice parent, emit this string without // quotes write current string in parser to generator ctx.g.write_raw(p); @@ -745,15 +435,15 @@ __device__ bool evaluate_path(json_parser& p, } // case (START_ARRAY, Nil) if style == FlattenStyle // case path 2 - else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path_size) && - ctx.style == write_style::flatten_style) { + else if (json_token::START_ARRAY == ctx.token && path_is_empty(ctx.path.size()) && + ctx.style == write_style::FLATTEN) { // flatten this array into the parent if (json_token::END_ARRAY != p.next_token()) { // JSON validation check if (json_token::ERROR == p.get_current_token()) { return false; } // push back task // add child task - push_context(p.get_current_token(), 2, ctx.g, ctx.style, nullptr, 0); + push_context(p.get_current_token(), 2, ctx.g, ctx.style, {nullptr, 0}); } else { // END_ARRAY ctx.task_is_done = true; @@ -761,7 +451,7 @@ __device__ bool evaluate_path(json_parser& p, } // case (_, Nil) // case path 3 - else if (path_is_empty(ctx.path_size)) { + else if 
(path_is_empty(ctx.path.size())) { // general case: just copy the child tree verbatim if (!(ctx.g.copy_current_structure(p))) { // JSON validation check @@ -770,38 +460,87 @@ __device__ bool evaluate_path(json_parser& p, ctx.dirty = 1; ctx.task_is_done = true; } - // case (START_OBJECT, Key :: xs) + // case (START_OBJECT, Named :: xs) // case path 4 else if (json_token::START_OBJECT == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::KEY)) { - if (json_token::END_OBJECT != p.next_token()) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - + thrust::get<0>(path_match_named(ctx.path))) { + if (!ctx.is_first_enter) { + // 2st enter + // skip the following children after the expect if (ctx.dirty > 0) { - // once a match has been found we can skip other fields - if (!p.try_skip_children()) { + while (json_token::END_OBJECT != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // skip FIELD_NAME token + p.next_token(); // JSON validation check - return false; + if (json_token::ERROR == p.get_current_token()) { return false; } + + // skip value of FIELD_NAME + if (!p.try_skip_children()) { + // JSON validation check + return false; + } } + ctx.task_is_done = true; } else { - // need to try more children - push_context( - p.get_current_token(), 4, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + return false; } } else { - ctx.task_is_done = true; + // below is 1st enter + ctx.is_first_enter = false; + // match first mached children with expected name + bool found_expected_child = false; + while (json_token::END_OBJECT != p.next_token()) { + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // need to try more children + auto match_named = path_match_named(ctx.path); + auto named = thrust::get<1>(match_named); + // current token is FIELD_NAME + if (p.match_current_field_name(named)) { + // skip FIELD_NAME token + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // meets null token, it's not expected, return false + if (json_token::VALUE_NULL == p.get_current_token()) { return false; } + // push sub task; sub task will update the result of path 4 + push_context(p.get_current_token(), + 4, + ctx.g, + ctx.style, + {ctx.path.data() + 1, ctx.path.size() - 1}); + found_expected_child = true; + break; + } else { + // skip FIELD_NAME token + p.next_token(); + // JSON validation check + if (json_token::ERROR == p.get_current_token()) { return false; } + + // current child is not expected, skip current child + if (!p.try_skip_children()) { + // JSON validation check + return false; + } + } + } + if (!found_expected_child) { + // did not find any expected sub child + ctx.task_is_done = true; + ctx.dirty = false; + } } } - // case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) + // case (START_ARRAY, Wildcard :: Wildcard :: xs) // case path 5 else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD)) { + path_match_elements( + ctx.path, path_instruction_type::WILDCARD, path_instruction_type::WILDCARD)) { // special handling for the non-structure preserving double wildcard // behavior in Hive if (ctx.is_first_enter) { @@ -815,28 +554,24 @@ 
__device__ bool evaluate_path(json_parser& p, push_context(p.get_current_token(), 5, ctx.g, - write_style::flatten_style, - ctx.path_ptr + 4, - ctx.path_size - 4); + write_style::FLATTEN, + {ctx.path.data() + 2, ctx.path.size() - 2}); } else { ctx.g.write_end_array(); ctx.task_is_done = true; } } - // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle + // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle // case path 6 else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD) && - ctx.style != write_style::quoted_style) { + path_match_element(ctx.path, path_instruction_type::WILDCARD) && + ctx.style != write_style::QUOTED) { // retain Flatten, otherwise use Quoted... cannot use Raw within an array - write_style next_style = write_style::raw_style; + write_style next_style = write_style::RAW; switch (ctx.style) { - case write_style::raw_style: next_style = write_style::quoted_style; break; - case write_style::flatten_style: next_style = write_style::flatten_style; break; - case write_style::quoted_style: next_style = write_style::quoted_style; // never happen + case write_style::RAW: next_style = write_style::QUOTED; break; + case write_style::FLATTEN: next_style = write_style::FLATTEN; break; + case write_style::QUOTED: next_style = write_style::QUOTED; // never happen } // temporarily buffer child matches, the emitted json will need to be @@ -858,12 +593,14 @@ __device__ bool evaluate_path(json_parser& p, if (json_token::ERROR == p.get_current_token()) { return false; } // track the number of array elements and only emit an outer array if // we've written more than one element, this matches Hive's behavior - push_context( - p.get_current_token(), 6, child_g, next_style, ctx.path_ptr + 2, ctx.path_size - 2); + push_context(p.get_current_token(), + 6, + child_g, + next_style, + {ctx.path.data() + 1, ctx.path.size() - 1}); } else { char* child_g_start = child_g.get_output_start_position(); size_t child_g_len = child_g.get_output_len(); - if (ctx.dirty > 1) { // add outer array tokens ctx.g.write_child_raw_value( @@ -877,18 +614,14 @@ __device__ bool evaluate_path(json_parser& p, } // else do not write anything } } - // case (START_ARRAY, Subscript :: Wildcard :: xs) + // case (START_ARRAY, Wildcard :: xs) // case path 7 else if (json_token::START_ARRAY == ctx.token && - path_match_elements(ctx.path_ptr, - ctx.path_size, - path_instruction_type::SUBSCRIPT, - path_instruction_type::WILDCARD)) { + path_match_element(ctx.path, path_instruction_type::WILDCARD)) { if (ctx.is_first_enter) { ctx.is_first_enter = false; ctx.g.write_start_array(); } - if (p.next_token() != json_token::END_ARRAY) { // JSON validation check if (json_token::ERROR == p.get_current_token()) { return false; } @@ -898,21 +631,18 @@ __device__ bool evaluate_path(json_parser& p, push_context(p.get_current_token(), 7, ctx.g, - write_style::quoted_style, - ctx.path_ptr + 2, - ctx.path_size - 2); + write_style::QUOTED, + {ctx.path.data() + 1, ctx.path.size() - 1}); } else { ctx.g.write_end_array(); ctx.task_is_done = true; } } - /* case (START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ + /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ // case path 8 else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>( - path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>( - 
path_match_subscript_index_subscript_wildcard(ctx.path_ptr, ctx.path_size)); + thrust::get<0>(path_match_index_wildcard(ctx.path))) { + int idx = thrust::get<1>(path_match_index_wildcard(ctx.path)); p.next_token(); // JSON validation check @@ -939,15 +669,13 @@ __device__ bool evaluate_path(json_parser& p, push_context(p.get_current_token(), 8, ctx.g, - write_style::quoted_style, - ctx.path_ptr + 2, - ctx.path_size - 2); + write_style::QUOTED, + {ctx.path.data() + 1, ctx.path.size() - 1}); } - // case (START_ARRAY, Subscript :: Index(idx) :: xs) + // case (START_ARRAY, Index(idx) :: xs) // case path 9 - else if (json_token::START_ARRAY == ctx.token && - thrust::get<0>(path_match_subscript_index(ctx.path_ptr, ctx.path_size))) { - int idx = thrust::get<1>(path_match_subscript_index(ctx.path_ptr, ctx.path_size)); + else if (json_token::START_ARRAY == ctx.token && thrust::get<0>(path_match_index(ctx.path))) { + int idx = thrust::get<1>(path_match_index(ctx.path)); p.next_token(); // JSON validation check @@ -971,32 +699,7 @@ __device__ bool evaluate_path(json_parser& p, // i == 0 push_context( - p.get_current_token(), 9, ctx.g, ctx.style, ctx.path_ptr + 2, ctx.path_size - 2); - } - // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // case path 10 - else if (json_token::FIELD_NAME == ctx.token && - thrust::get<0>(path_match_named(ctx.path_ptr, ctx.path_size)) && - p.match_current_field_name( - thrust::get<1>(path_match_named(ctx.path_ptr, ctx.path_size)))) { - if (p.next_token() != json_token::VALUE_NULL) { - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_context( - p.get_current_token(), 10, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); - } else { - return false; - } - } - // case (FIELD_NAME, Wildcard :: xs) - // case path 11 - else if (json_token::FIELD_NAME == ctx.token && - path_match_element(ctx.path_ptr, ctx.path_size, path_instruction_type::WILDCARD)) { - p.next_token(); - // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_context( - p.get_current_token(), 11, ctx.g, ctx.style, ctx.path_ptr + 1, ctx.path_size - 1); + p.get_current_token(), 9, ctx.g, ctx.style, {ctx.path.data() + 1, ctx.path.size() - 1}); } // case _ => // case path 12 @@ -1024,22 +727,22 @@ __device__ bool evaluate_path(json_parser& p, // never happen } // path 2: case (START_ARRAY, Nil) if style == FlattenStyle - // path 5: case (START_ARRAY, Subscript :: Wildcard :: Subscript :: Wildcard :: xs) - // path 7: case (START_ARRAY, Subscript :: Wildcard :: xs) + // path 5: case (START_ARRAY, Wildcard :: Wildcard :: xs) + // path 7: case (START_ARRAY, Wildcard :: xs) else if (2 == ctx.case_path || 5 == ctx.case_path || 7 == ctx.case_path) { // collect result from child task p_ctx.dirty += ctx.dirty; // copy generator states to parent task; p_ctx.g = ctx.g; } - // case (START_OBJECT, Key :: xs) + // case (START_OBJECT, Named :: xs) // case path 4 else if (4 == ctx.case_path) { - if (p_ctx.dirty < 1 && ctx.dirty > 0) { p_ctx.dirty = ctx.dirty; } + p_ctx.dirty = ctx.dirty; // copy generator states to parent task; p_ctx.g = ctx.g; } - // case (START_ARRAY, Subscript :: Wildcard :: xs) if style != QuotedStyle + // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle // case path 6 else if (6 == ctx.case_path) { // collect result from child task @@ -1047,9 +750,9 @@ __device__ bool evaluate_path(json_parser& p, // update child generator for parent task p_ctx.child_g = ctx.g; } - /* case 
(START_ARRAY, Subscript :: Index(idx) :: (xs@Subscript :: Wildcard :: _)) */ + /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ // case path 8 - // case (START_ARRAY, Subscript :: Index(idx) :: xs) + // case (START_ARRAY, Index(idx) :: xs) // case path 9 else if (8 == ctx.case_path || 9 == ctx.case_path) { // collect result from child task @@ -1067,26 +770,6 @@ __device__ bool evaluate_path(json_parser& p, // copy generator states to parent task; p_ctx.g = ctx.g; } - // case (FIELD_NAME, Named(name) :: xs) if p.getCurrentName == name - // case path 10 - else if (10 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - } - // case (FIELD_NAME, Wildcard :: xs) - // case path 11 - else if (11 == ctx.case_path) { - // collect result from child task - p_ctx.dirty += ctx.dirty; - // task is done - p_ctx.task_is_done = true; - // copy generator states to parent task; - p_ctx.g = ctx.g; - } // case path 3: case (_, Nil) // case path 12: case _ => // others @@ -1108,7 +791,7 @@ rmm::device_uvector construct_path_commands( std::vector> const& instructions, cudf::string_scalar const& all_names_scalar, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { int name_pos = 0; @@ -1117,18 +800,9 @@ rmm::device_uvector construct_path_commands( for (auto const& inst : instructions) { auto const& [type, name, index] = inst; switch (type) { - case path_instruction_type::SUBSCRIPT: - path_commands.emplace_back(path_instruction{path_instruction_type::SUBSCRIPT}); - break; case path_instruction_type::WILDCARD: path_commands.emplace_back(path_instruction{path_instruction_type::WILDCARD}); break; - case path_instruction_type::KEY: - path_commands.emplace_back(path_instruction{path_instruction_type::KEY}); - path_commands.back().name = - cudf::string_view(all_names_scalar.data() + name_pos, name.size()); - name_pos += name.size(); - break; case path_instruction_type::INDEX: path_commands.emplace_back(path_instruction{path_instruction_type::INDEX}); path_commands.back().index = index; @@ -1160,14 +834,12 @@ rmm::device_uvector construct_path_commands( * @returns A pair containing the result code and the output buffer. */ __device__ thrust::pair get_json_object_single( - char const* input, - cudf::size_type input_len, - path_instruction const* path_commands_ptr, - int path_commands_size, + char_range input, + cudf::device_span path_commands, char* out_buf, size_t out_buf_size) { - json_parser j_parser(input, input_len); + json_parser j_parser(input); j_parser.next_token(); // JSON validation check if (json_token::ERROR == j_parser.get_current_token()) { return {false, 0}; } @@ -1176,12 +848,12 @@ __device__ thrust::pair get_json_object_single( // Second pass: writes output. // The generator automatically determines which pass based on `out_buf`. // If `out_buf_size` is zero, pass in `nullptr` to avoid generator writing trash output. - json_generator generator((out_buf == nullptr || out_buf_size == 0) ? nullptr : out_buf); + json_generator generator((out_buf_size == 0) ? 
nullptr : out_buf); bool const success = evaluate_path( - j_parser, generator, write_style::raw_style, path_commands_ptr, path_commands_size); + j_parser, generator, write_style::RAW, {path_commands.data(), path_commands.size()}); - if (nullptr == out_buf && !success) { + if (!success) { // generator may contain trash output, e.g.: generator writes some output, // then JSON format is invalid, the previous output becomes trash. // set output as zero to tell second step @@ -1199,19 +871,26 @@ __device__ thrust::pair get_json_object_single( * (chars and validity). * * @param col Device view of the incoming string - * @param commands JSONPath command buffer + * @param path_commands JSONPath command buffer + * @param d_sizes a buffer used to write the output sizes in the first pass, + * and is read back in on the second pass to compute offsets. * @param output_offsets Buffer used to store the string offsets for the results * of the query * @param out_buf Buffer used to store the results of the query * @param out_validity Output validity buffer * @param out_valid_count Output count of # of valid bits - * @param options Options controlling behavior */ template -__launch_bounds__(block_size) CUDF_KERNEL +// We have 1 for the minBlocksPerMultiprocessor in the launch bounds to avoid spilling from +// the kernel itself. By default NVCC uses a heuristic to find a balance between the +// maximum number of registers used by a kernel and the parallelism of the kernel. +// If lots of registers are used the parallelism may suffer. But in our case +// NVCC gets this wrong and we want to avoid spilling all the time or else +// the performance is really bad. This essentially tells NVCC to prefer using lots +// of registers over spilling. +__launch_bounds__(block_size, 1) CUDF_KERNEL void get_json_object_kernel(cudf::column_device_view col, - path_instruction const* path_commands_ptr, - int path_commands_size, + cudf::device_span path_commands, cudf::size_type* d_sizes, cudf::detail::input_offsetalator output_offsets, char* out_buf, @@ -1233,8 +912,8 @@ __launch_bounds__(block_size) CUDF_KERNEL out_buf != nullptr ? output_offsets[tid + 1] - output_offsets[tid] : 0; // process one single row - auto [result, output_size] = get_json_object_single( - str.data(), str.size_bytes(), path_commands_ptr, path_commands_size, dst, dst_size); + auto [result, output_size] = + get_json_object_single(str, {path_commands.data(), path_commands.size()}, dst, dst_size); if (result) { is_valid = true; } // filled in only during the precompute step. 
during the compute step, the @@ -1272,10 +951,12 @@ std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input.is_empty()) return cudf::make_empty_column(cudf::type_id::STRING); + if (instructions.size() > max_path_depth) { CUDF_FAIL("JSONPath query exceeds maximum depth"); } + // get a string buffer to store all the names and convert to device std::string all_names; for (auto const& inst : instructions) { @@ -1296,14 +977,8 @@ std::unique_ptr get_json_object( auto d_input_ptr = cudf::column_device_view::create(input.parent(), stream); // preprocess sizes (returned in the offsets buffer) get_json_object_kernel - <<>>(*d_input_ptr, - path_commands.data(), - path_commands.size(), - sizes.data(), - d_offsets, - nullptr, - nullptr, - nullptr); + <<>>( + *d_input_ptr, path_commands, sizes.data(), d_offsets, nullptr, nullptr, nullptr); // convert sizes to offsets auto [offsets, output_size] = @@ -1324,8 +999,7 @@ std::unique_ptr get_json_object( get_json_object_kernel <<>>( *d_input_ptr, - path_commands.data(), - path_commands.size(), + path_commands, sizes.data(), d_offsets, chars.data(), @@ -1345,7 +1019,7 @@ std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return detail::get_json_object(input, instructions, stream, mr); } diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index cf1f0c3470..bb3294b424 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -16,11 +16,11 @@ #pragma once -#include "json_parser.cuh" - #include #include +#include + #include #include #include @@ -35,7 +35,7 @@ namespace spark_rapids_jni { /** * path instruction type */ -enum class path_instruction_type { SUBSCRIPT, WILDCARD, KEY, INDEX, NAMED }; +enum class path_instruction_type { WILDCARD, INDEX, NAMED }; /** * Extracts json object from a json string based on json path specified, and @@ -45,7 +45,7 @@ enum class path_instruction_type { SUBSCRIPT, WILDCARD, KEY, INDEX, NAMED }; std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hash.cuh b/src/main/cpp/src/hash.cuh index 1c6333523c..8cf489a7e7 100644 --- a/src/main/cpp/src/hash.cuh +++ b/src/main/cpp/src/hash.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include #include @@ -113,9 +114,9 @@ __device__ __inline__ std::pair<__int128_t, cudf::size_type> to_java_bigdecimal( */ std::unique_ptr murmur_hash3_32( cudf::table_view const& input, - uint32_t seed = 0, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + uint32_t seed = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Computes the xxhash64 hash value of each row in the input set of columns. @@ -129,8 +130,8 @@ std::unique_ptr murmur_hash3_32( */ std::unique_ptr xxhash64( cudf::table_view const& input, - int64_t seed = DEFAULT_XXHASH64_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + int64_t seed = DEFAULT_XXHASH64_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/histogram.cu b/src/main/cpp/src/histogram.cu index 3d606e9f0a..b78c5ae1e0 100644 --- a/src/main/cpp/src/histogram.cu +++ b/src/main/cpp/src/histogram.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ #include "histogram.hpp" -// #include #include #include @@ -33,7 +32,6 @@ #include #include -// #include #include #include @@ -42,7 +40,6 @@ #include #include -// #include namespace spark_rapids_jni { @@ -69,7 +66,7 @@ struct fill_percentile_fn { auto const has_all_nulls = start >= end; auto const percentage_idx = idx % percentages.size(); - if (out_validity && percentage_idx == 0) { + if (percentage_idx == 0) { // If the histogram only contains null elements, the output percentile will be null. out_validity[histogram_idx] = has_all_nulls ? 0 : 1; } @@ -170,7 +167,7 @@ struct percentile_dispatcher { bool has_null, cudf::size_type num_histograms, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::device_async_resource_ref mr) const { // Returns all nulls for totally empty input. if (data.size() == 0 || percentages.size() == 0) { @@ -191,7 +188,13 @@ struct percentile_dispatcher { stream, mr); - auto const fill_percentile = [&](auto const sorted_validity_it, auto const out_validity) { + // We may always have nulls in the output due to either: + // - Having nulls in the input, and/or, + // - Having empty histograms. 
+ auto out_validities = + rmm::device_uvector(num_histograms, stream, rmm::mr::get_current_device_resource()); + + auto const fill_percentile = [&](auto const sorted_validity_it) { auto const sorted_input_it = thrust::make_permutation_iterator(data.begin(), ordered_indices); thrust::for_each_n(rmm::exec_policy(stream), @@ -203,23 +206,21 @@ struct percentile_dispatcher { accumulated_counts, percentages, percentiles->mutable_view().begin(), - out_validity}); + out_validities.begin()}); }; if (!has_null) { - fill_percentile(thrust::make_constant_iterator(true), nullptr); + fill_percentile(thrust::make_constant_iterator(true)); } else { auto const sorted_validity_it = thrust::make_permutation_iterator( cudf::detail::make_validity_iterator(data), ordered_indices); - auto out_validities = - rmm::device_uvector(num_histograms, stream, rmm::mr::get_current_device_resource()); - fill_percentile(sorted_validity_it, out_validities.begin()); - - auto [null_mask, null_count] = cudf::detail::valid_if( - out_validities.begin(), out_validities.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { return {std::move(percentiles), std::move(null_mask), null_count}; } + fill_percentile(sorted_validity_it); } + auto [null_mask, null_count] = cudf::detail::valid_if( + out_validities.begin(), out_validities.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { return {std::move(percentiles), std::move(null_mask), null_count}; } + return {std::move(percentiles), rmm::device_buffer{}, 0}; } }; @@ -256,7 +257,7 @@ std::unique_ptr wrap_in_list(std::unique_ptr&& input cudf::size_type num_histograms, cudf::size_type num_percentages, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input->size() == 0) { return cudf::lists::detail::make_empty_lists_column(input->type(), stream, mr); @@ -283,7 +284,7 @@ std::unique_ptr create_histogram_if_valid(cudf::column_view const& cudf::column_view const& frequencies, bool output_as_lists, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS( !frequencies.has_nulls(), "The input frequencies must not have nulls.", std::invalid_argument); @@ -429,7 +430,7 @@ std::unique_ptr percentile_from_histogram(cudf::column_view const& std::vector const& percentages, bool output_as_list, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { check_input(input, percentages); diff --git a/src/main/cpp/src/histogram.hpp b/src/main/cpp/src/histogram.hpp index 43058d9522..23318bdfac 100644 --- a/src/main/cpp/src/histogram.hpp +++ b/src/main/cpp/src/histogram.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ // #include +#include namespace spark_rapids_jni { @@ -50,8 +51,8 @@ std::unique_ptr create_histogram_if_valid( cudf::column_view const& values, cudf::column_view const& frequencies, bool output_as_lists, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Compute percentiles from the given histograms and percentage values. 
@@ -70,7 +71,7 @@ std::unique_ptr percentile_from_histogram( cudf::column_view const& input, std::vector const& percentage, bool output_as_lists, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index ec0790aa6b..217ec0047b 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -16,7 +16,6 @@ #pragma once #include "ftos_converter.cuh" -#include "string_to_float_cudf.cuh" #include #include @@ -30,29 +29,15 @@ namespace spark_rapids_jni { /** * write style when writing out JSON string */ -enum class write_style { +enum class escape_style { // e.g.: '\\r' is a string with 2 chars '\' 'r', writes 1 char '\r' - unescaped, + UNESCAPED, - // * e.g.: '"' is a string with 1 char '"', writes out 4 chars '"' '\' '\"' + // e.g.: '"' is a string with 1 char '"', writes out 4 chars '"' '\' '\"' // '"' - escaped + ESCAPED }; -// allow single quotes to represent strings in JSON -// e.g.: {'k': 'v'} is valid when it's true -constexpr bool allow_single_quotes = true; - -// Whether allow unescaped control characters in JSON Strings. -// Unescaped control characters are ASCII characters with value less than 32, -// including tab and line feed characters. ASCII values range is [0, 32) -// e.g.: ["\n"] is valid, here \n is one char -// If true, JSON is not conventional format. -// e.g., how to represent carriage return and newline characters: -// if true, allow "\n\r" two control characters without escape directly -// if false, "\n\r" are not allowed, should use escape characters: "\\n\\r" -constexpr bool allow_unescaped_control_chars = true; - /** * @brief Maximum JSON nesting depth * JSON with a greater depth is invalid @@ -60,12 +45,6 @@ constexpr bool allow_unescaped_control_chars = true; */ constexpr int max_json_nesting_depth = 64; -// Define the maximum JSON String length, counts utf8 bytes. -// By default, maximum JSON String length is negative one, means no -// limitation. e.g.: The length of String "\\n" is 1, JSON parser does not -// count escape characters. -constexpr int max_string_utf8_bytes = 20000000; - // /** * Define the maximum JSON number length. Negative or zero means no @@ -79,15 +58,6 @@ constexpr int max_string_utf8_bytes = 20000000; */ constexpr int max_num_len = 1000; -/** - * whether allow tailing useless sub-string in JSON. - * - * If true, e.g., the following invalid JSON is allowed, because prefix {'k' : - * 'v'} is valid. - * {'k' : 'v'}_extra_tail_sub_string - */ -constexpr bool allow_tailing_sub_string = true; - /** * JSON token enum */ @@ -136,6 +106,84 @@ enum class json_token { }; +/** + * This is similar to cudf::string_view, but cudf::string_view enforces + * UTF-8 encoding, which adds overhead that is not needed for this process. + */ +class char_range { + public: + __device__ inline char_range(char const* const start, cudf::size_type const len) + : _data(start), _len(len) + { + } + + __device__ inline char_range(cudf::string_view const& input) + : _data(input.data()), _len(input.size_bytes()) + { + } + + // Warning it looks like there is some kind of a bug in CUDA where you don't want to initialize + // a member variable with a static method like this. 
+ __device__ inline static char_range null() { return char_range(nullptr, 0); } + + __device__ inline char_range(char_range const&) = default; + __device__ inline char_range(char_range&&) = default; + __device__ inline char_range& operator=(char_range const&) = default; + __device__ inline char_range& operator=(char_range&&) = default; + __device__ inline ~char_range() = default; + + __device__ inline cudf::size_type size() const { return _len; } + __device__ inline char const* data() const { return _data; } + __device__ inline char const* start() const { return _data; } + __device__ inline char const* end() const { return _data + _len; } + + __device__ inline bool eof(cudf::size_type pos) const { return pos >= _len; } + __device__ inline bool is_null() const { return _data == nullptr; } + __device__ inline bool is_empty() const { return _len == 0; } + + __device__ inline char operator[](cudf::size_type pos) const { return _data[pos]; } + + __device__ inline cudf::string_view slice_sv(cudf::size_type pos, cudf::size_type len) const + { + return cudf::string_view(_data + pos, len); + } + + __device__ inline char_range slice(cudf::size_type pos, cudf::size_type len) const + { + return char_range(_data + pos, len); + } + + private: + char const* _data; + cudf::size_type _len; +}; + +/** + * A char_range that keeps track of where in the data it currently is. + */ +class char_range_reader { + public: + __device__ inline explicit char_range_reader(char_range range) : _range(range), _pos(0) {} + + __device__ inline char_range_reader(char_range range, cudf::size_type start) + : _range(range), _pos(start) + { + } + + __device__ inline bool eof() const { return _range.eof(_pos); } + __device__ inline bool is_null() const { return _range.is_null(); } + + __device__ inline void next() { _pos++; } + + __device__ inline char current_char() const { return _range[_pos]; } + + __device__ inline cudf::size_type pos() const { return _pos; } + + private: + char_range _range; + cudf::size_type _pos; +}; + /** * JSON parser, provides token by token parsing. * Follow Jackson JSON format by default. @@ -144,8 +192,9 @@ enum class json_token { * For JSON format: * Refer to https://www.json.org/json-en.html. * - * Note: when setting `allow_single_quotes` or `allow_unescaped_control_chars`, - * then JSON format is not conventional. + * Note: This is not conventional as it allows + * single quotes and unescaped control characters + * to match what SPARK does for get_json_object * * White space can only be 4 chars: ' ', '\n', '\r', '\t', * Jackson does not allow other control chars as white spaces. @@ -160,31 +209,18 @@ enum class json_token { * infinity, +infinity, -infinity * 1e, 1e+, 1e-, -1., 1. 
* - * When `allow_single_quotes` is true: - * Valid string examples: + * Valid string examples: * "\'" , "\"" , '\'' , '\"' , '"' , "'" * - * When `allow_single_quotes` is false: - * Invalid string examples: - * "\'" - * - * When `allow_unescaped_control_chars` is true: - * Valid string: "asscii_control_chars" - * here `asscii_control_chars` represents control chars which in Ascii code - * range: [0, 32) - * - * When `allow_unescaped_control_chars` is false: - * Invalid string: "asscii_control_chars" - * here `asscii_control_chars` represents control chars which in Ascii code + * Valid string: "ascii_control_chars" + * here `ascii_control_chars` represents control chars which in Ascii code * range: [0, 32) * */ class json_parser { public: - __device__ inline json_parser(char const* const _json_start_pos, cudf::size_type const _json_len) - : json_start_pos(_json_start_pos), - json_end_pos(_json_start_pos + _json_len), - curr_pos(_json_start_pos) + __device__ inline explicit json_parser(char_range _chars) + : chars(_chars), curr_pos(0), current_token(json_token::INIT) { } @@ -223,12 +259,13 @@ class json_parser { /** * is current position EOF */ - __device__ inline bool eof(char const* pos) { return pos >= json_end_pos; } + __device__ inline bool eof(cudf::size_type pos) const { return pos >= chars.size(); } + __device__ inline bool eof() const { return curr_pos >= chars.size(); } /** * is hex digits: 0-9, A-F, a-f */ - __device__ inline bool is_hex_digit(char c) + __device__ inline bool is_hex_digit(char c) const { return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); } @@ -236,12 +273,12 @@ class json_parser { /** * is 0 to 9 digit */ - __device__ inline bool is_digit(char c) { return (c >= '0' && c <= '9'); } + __device__ inline bool is_digit(char c) const { return (c >= '0' && c <= '9'); } /** * is white spaces: ' ', '\t', '\n' '\r' */ - __device__ inline bool is_whitespace(char c) + __device__ inline bool is_whitespace(char c) const { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } @@ -249,19 +286,28 @@ class json_parser { /** * skips 4 characters: ' ', '\t', '\n' '\r' */ - __device__ inline void skip_whitespaces(char const*& pos) + __device__ inline void skip_whitespaces() { - while (!eof(pos) && is_whitespace(*pos)) { - pos++; + while (!eof() && is_whitespace(chars[curr_pos])) { + curr_pos++; } } /** * check current char, if it's expected, then plus the position */ - __device__ inline bool try_skip(char const*& pos, char expected) + __device__ inline bool try_skip(char_range_reader& reader, char expected) { - if (!eof(pos) && *pos == expected) { + if (!reader.eof() && reader.current_char() == expected) { + reader.next(); + return true; + } + return false; + } + + __device__ inline bool try_skip(cudf::size_type& pos, char expected) + { + if (!eof(pos) && chars[pos] == expected) { pos++; return true; } @@ -312,6 +358,8 @@ class json_parser { */ __device__ inline bool is_context_stack_empty() { return stack_size == 0; } + __device__ inline void set_current_error() { current_token = json_token::ERROR; } + /** * parse the first value token from current position * e.g., after finished this function: @@ -320,182 +368,66 @@ class json_parser { * current token is string/num/true/false/null if current value is terminal * current token is ERROR if parse failed */ - __device__ inline void parse_first_token_in_value() + __device__ inline void parse_first_token_in_value_and_set_current() { + current_token_start_pos = curr_pos; // already checked eof - char c = 
*curr_pos; + char c = chars[curr_pos]; switch (c) { case '{': if (!try_push_context(json_token::START_OBJECT)) { - curr_token = json_token::ERROR; - return; + set_current_error(); + } else { + curr_pos++; + current_token = json_token::START_OBJECT; } - curr_pos++; - curr_token = json_token::START_OBJECT; break; - case '[': if (!try_push_context(json_token::START_ARRAY)) { - curr_token = json_token::ERROR; - return; - } - curr_pos++; - curr_token = json_token::START_ARRAY; - break; - - case '"': parse_double_quoted_string(); break; - - case '\'': - if (allow_single_quotes) { - parse_single_quoted_string(); + set_current_error(); } else { - curr_token = json_token::ERROR; + curr_pos++; + current_token = json_token::START_ARRAY; } break; - + case '"': + // fall through + case '\'': parse_string_and_set_current(); break; case 't': curr_pos++; - parse_true(); + parse_true_and_set_current(); break; - case 'f': curr_pos++; - parse_false(); + parse_false_and_set_current(); break; - case 'n': curr_pos++; - parse_null(); + parse_null_and_set_current(); break; - - default: parse_number(); + default: parse_number_and_set_current(); break; } } // =========== Parse string begin =========== /** - * parse ' quoted string + * parse quoted string and set current token */ - __device__ inline void parse_single_quoted_string() + __device__ inline void parse_string_and_set_current() { - auto [success, end_char_pos] = - try_parse_single_quoted_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); + // TODO eventually chars should be a reader so we can just pass it in... + char_range_reader reader(chars, curr_pos); + auto [success, end_char_pos] = try_parse_string(reader); if (success) { - curr_pos = end_char_pos; - curr_token = json_token::VALUE_STRING; + // TODO remove end_char_pos, and just get it from the reader... 
+ curr_pos = end_char_pos; + current_token = json_token::VALUE_STRING; } else { - curr_token = json_token::ERROR; + set_current_error(); } } - /** - * parse " quoted string - */ - __device__ inline void parse_double_quoted_string() - { - auto [success, end_char_pos] = - try_parse_double_quoted_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); - if (success) { - curr_pos = end_char_pos; - curr_token = json_token::VALUE_STRING; - } else { - curr_token = json_token::ERROR; - } - } - - /* - * try parse ' or " quoted string - * - * when allow single quote, first try single quote - * @param str_pos str start position for parsing, should be a position in JSON - * string - * @param to_match_str_pos expected match str position, nullptr means do not - * match - * @param to_match_str_end expected match str end - * @param copy_destination copy unescaped str to destination, nullptr means do - * not copy - * @return whether passed successfully and the end position of parsed str - * - */ - __device__ inline std::pair try_parse_string( - char const* str_pos, - char const* to_match_str_pos, - char const* const to_match_str_end, - char* copy_destination, - write_style w_style) - { - if (!eof(str_pos)) { - if (allow_single_quotes && *str_pos == '\'') { - return try_parse_single_quoted_string( - str_pos, to_match_str_pos, to_match_str_end, copy_destination, w_style); - } else { - return try_parse_double_quoted_string( - str_pos, to_match_str_pos, to_match_str_end, copy_destination, w_style); - } - } else { - return std::make_pair(false, nullptr); - } - } - - /** - * try parse ' quoted string - * - * when allow single quote, first try single quote - * @param str_pos str start position for parsing, should be a position in JSON - * string - * @param to_match_str_pos expected match str position, nullptr means do not - * match - * @param to_match_str_end expected match str end - * @param copy_destination copy unescaped str to destination, nullptr means do - * not copy - * - */ - __device__ inline std::pair try_parse_single_quoted_string( - char const* str_pos, - char const* to_match_str_pos, - char const* const to_match_str_end, - char* copy_destination, - write_style w_style) - { - return try_parse_quoted_string(str_pos, - '\'', - to_match_str_pos, // match str pos, nullptr means do not match - to_match_str_end, // match str end - copy_destination, // copy destination while parsing, nullptr - // means do not copy - w_style); - } - - /** - * try parse " quoted string. 
- * - * when allow single quote, first try single quote - * @param str_pos str start position for parsing, should be a position in JSON - * string - * @param to_match_str_pos expected match str position, nullptr means do not - * match - * @param to_match_str_end expected match str end - * @param copy_destination copy unescaped str to destination, nullptr means do - * not copy - * - */ - __device__ inline std::pair try_parse_double_quoted_string( - char const* str_pos, - char const* to_match_str_pos, - char const* const to_match_str_end, - char* copy_destination, - write_style w_style) - { - return try_parse_quoted_string(str_pos, - '\"', - to_match_str_pos, // match str pos, nullptr means do not match - to_match_str_end, // match str end - copy_destination, // copy destination while parsing, nullptr - // means do not copy - w_style); - } - /** * transform int value from [0, 15] to hex char */ @@ -567,6 +499,81 @@ class json_parser { } } + __device__ inline int write_string(char_range_reader& str, + char* copy_destination, + escape_style w_style) + { + if (str.eof()) { return 0; } + char const quote_char = str.current_char(); + int output_size_bytes = 0; + + // write the first " if write style is escaped + if (escape_style::ESCAPED == w_style) { + output_size_bytes++; + if (nullptr != copy_destination) { *copy_destination++ = '"'; } + } + + // skip left quote char + // No need to check because we just read it in. + str.next(); + + // scan string content + while (!str.eof()) { + char const c = str.current_char(); + int const v = static_cast(c); + if (c == quote_char) { + // path 1: match closing quote char + str.next(); + + // write the end " if write style is escaped + if (escape_style::ESCAPED == w_style) { + output_size_bytes++; + if (nullptr != copy_destination) { *copy_destination++ = '"'; } + } + + return output_size_bytes; + } else if (v >= 0 && v < 32) { + // path 2: unescaped control char + + // copy if enabled, unescape mode, write 1 char + if (escape_style::UNESCAPED == w_style) { + output_size_bytes++; + if (copy_destination != nullptr) { *copy_destination++ = str.current_char(); } + } else { + // escape_style::ESCAPED + int const escape_chars = escape_char(str.current_char(), copy_destination); + if (copy_destination != nullptr) { copy_destination += escape_chars; } + output_size_bytes += escape_chars; + } + + str.next(); + } else if ('\\' == c) { + // path 3: escape path + str.next(); + char_range_reader to_match(char_range::null()); + if (!try_skip_escape_part(str, to_match, copy_destination, w_style, output_size_bytes)) { + return output_size_bytes; + } + } else { + // path 4: safe code point + + // handle single unescaped " char; happens when string is quoted by char ' + // e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ") + if ('\"' == c && escape_style::ESCAPED == w_style) { + if (copy_destination != nullptr) { *copy_destination++ = '\\'; } + output_size_bytes++; + } + + if (copy_destination != nullptr) { *copy_destination++ = c; } + str.next(); + output_size_bytes++; + } + } + + // technically this is an error state, but we will do our best from here... + return output_size_bytes; + } + /** * utility for parsing string, this function does not update the parser * internal try parse quoted string using passed `quote_char` `quote_char` can @@ -580,7 +587,7 @@ class json_parser { * } , :), string quote char(" ') and Escape char \ are all Ascii(The leading * bit is 0), so it's safe that do not convert byte array to UTF-8 char. 
* - * When quote is " and allow_unescaped_control_chars is false, grammar is: + * When quote is " grammar is: * * STRING * : '"' (ESC | SAFECODEPOINT)* '"' @@ -599,130 +606,94 @@ class json_parser { * ; * * fragment SAFECODEPOINT - * // 1 not " or ' depending to allow_single_quotes + * // 1 not " or ' * // 2 not \ * // 3 non control character: Ascii value not in [0, 32) * : ~ ["\\\u0000-\u001F] * ; * - * When allow_unescaped_control_chars is true: - * Allow [0-32) control Ascii chars directly without escape - * When allow_single_quotes is true: - * These strings are allowed: '\'' , '\"' , '"' , "\"" , "\'" , "'" - * @param str_pos str start position for parsing, should be a position in JSON - * string - * @param quote_char expected quote char - * @param to_match_str_pos expected match str position, nullptr means do not - * match - * @param to_match_str_end expected match str end - * @param copy_destination copy unescaped str to destination, nullptr means do - * not copy + * @param str string to parse + * @param to_match expected match str + * @param w_style the escape style for writing. + * @return a pair of success and length, where success is true if the string + * is valid and length is the number of bytes needed to encode the string + * in the given style. */ - __device__ inline std::pair try_parse_quoted_string( - char const* str_pos, - char const quote_char, - char const* to_match_str_pos, - char const* const to_match_str_end, - char* copy_destination, - write_style w_style) + __device__ inline std::pair try_parse_string( + char_range_reader& str, + char_range_reader to_match = char_range_reader(char_range::null()), + escape_style w_style = escape_style::UNESCAPED) { - // update state - string_token_utf8_bytes = 0; - bytes_diff_for_escape_writing = 0; + if (str.eof()) { return std::make_pair(false, 0); } + char const quote_char = str.current_char(); + int output_size_bytes = 0; // write the first " if write style is escaped - if (write_style::escaped == w_style) { - bytes_diff_for_escape_writing++; - if (nullptr != copy_destination) { *copy_destination++ = '"'; } - } + if (escape_style::ESCAPED == w_style) { output_size_bytes++; } // skip left quote char - if (!try_skip(str_pos, quote_char)) { return std::make_pair(false, nullptr); } + // We don't need to actually verify what it is, because we just read it. 
+ str.next(); // scan string content - while (!eof(str_pos)) { - char c = *str_pos; + while (!str.eof()) { + char c = str.current_char(); int v = static_cast(c); if (c == quote_char) { // path 1: match closing quote char - str_pos++; - - // check max str len - if (!check_string_max_utf8_bytes()) { return std::make_pair(false, nullptr); } + str.next(); // match check, the last char in match_str is quote_char - if (nullptr != to_match_str_pos) { - // match check, the last char in match_str is quote_char - if (to_match_str_pos != to_match_str_end) { return std::make_pair(false, nullptr); } - } + if (!to_match.is_null() && !to_match.eof()) { return std::make_pair(false, 0); } // write the end " if write style is escaped - if (write_style::escaped == w_style) { - bytes_diff_for_escape_writing++; - if (nullptr != copy_destination) { *copy_destination++ = '"'; } - } + if (escape_style::ESCAPED == w_style) { output_size_bytes++; } - return std::make_pair(true, str_pos); - } else if (v >= 0 && v < 32 && allow_unescaped_control_chars) { + return std::make_pair(true, str.pos()); + } else if (v >= 0 && v < 32) { // path 2: unescaped control char - // copy if enabled, unescape mode, write 1 char - if (copy_destination != nullptr && write_style::unescaped == w_style) { - *copy_destination++ = *str_pos; - } - // copy if enabled, escape mode, write more chars - if (write_style::escaped == w_style) { - int escape_chars = escape_char(*str_pos, copy_destination); - if (copy_destination != nullptr) copy_destination += escape_chars; - bytes_diff_for_escape_writing += (escape_chars - 1); + if (escape_style::ESCAPED == w_style) { + int escape_chars = escape_char(str.current_char(), nullptr); + output_size_bytes += (escape_chars - 1); } // check match if enabled - if (!try_match_char(to_match_str_pos, to_match_str_end, *str_pos)) { - return std::make_pair(false, nullptr); - } + if (!try_match_char(to_match, str.current_char())) { return std::make_pair(false, 0); } - str_pos++; - string_token_utf8_bytes++; + str.next(); + output_size_bytes++; continue; } else if ('\\' == c) { // path 3: escape path - str_pos++; - if (!try_skip_escape_part( - str_pos, to_match_str_pos, to_match_str_end, copy_destination, w_style)) { - return std::make_pair(false, nullptr); + str.next(); + char* copy_dest_nullptr = nullptr; + if (!try_skip_escape_part(str, to_match, copy_dest_nullptr, w_style, output_size_bytes)) { + return std::make_pair(false, 0); } } else { // path 4: safe code point // handle single unescaped " char; happens when string is quoted by char ' // e.g.: 'A"' string, escape to "A\\"" (5 chars: " A \ " ") - if ('\"' == c && write_style::escaped == w_style) { - if (copy_destination != nullptr) { *copy_destination++ = '\\'; } - bytes_diff_for_escape_writing++; - } + if ('\"' == c && escape_style::ESCAPED == w_style) { output_size_bytes++; } - if (!try_skip_safe_code_point(str_pos, c)) { return std::make_pair(false, nullptr); } - if (copy_destination != nullptr) { *copy_destination++ = c; } + if (!try_skip_safe_code_point(str, c)) { return std::make_pair(false, 0); } // check match if enabled - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { - return std::make_pair(false, nullptr); - } - string_token_utf8_bytes++; + if (!try_match_char(to_match, c)) { return std::make_pair(false, 0); } + output_size_bytes++; } } - return std::make_pair(false, nullptr); + return std::make_pair(false, 0); } - __device__ inline bool try_match_char(char const*& char_pos, - char const* const char_end_pos, - char c) + __device__ 
inline bool try_match_char(char_range_reader& reader, char c) { - if (nullptr != char_pos) { - if (char_pos < char_end_pos && *char_pos == c) { - char_pos++; + if (!reader.is_null()) { + if (!reader.eof() && reader.current_char() == c) { + reader.next(); return true; } else { return false; @@ -737,137 +708,132 @@ class json_parser { * skip the HEX chars in \u HEX HEX HEX HEX. * @return positive escaped ASCII value if success, -1 otherwise */ - __device__ inline bool try_skip_escape_part(char const*& str_pos, - char const*& to_match_str_pos, - char const* const to_match_str_end, + __device__ inline bool try_skip_escape_part(char_range_reader& str, + char_range_reader& to_match, char*& copy_dest, - write_style w_style) + escape_style w_style, + int& output_size_bytes) { // already skipped the first '\' // try skip second part - if (!eof(str_pos)) { - char c = *str_pos; - switch (*str_pos) { + if (!str.eof()) { + char const c = str.current_char(); + switch (c) { // path 1: \", \', \\, \/, \b, \f, \n, \r, \t case '\"': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = c; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = c; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = '"'; } - bytes_diff_for_escape_writing++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - string_token_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, c)) { return false; } + output_size_bytes++; + str.next(); return true; case '\'': - // only allow escape ' when `allow_single_quotes` - if (allow_single_quotes) { - // for both unescaped/escaped writes a single char ' - if (nullptr != copy_dest) { *copy_dest++ = c; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - - string_token_utf8_bytes++; - str_pos++; - return true; - } else { - return false; - } + // for both unescaped/escaped writes a single char ' + if (nullptr != copy_dest) { *copy_dest++ = c; } + if (!try_match_char(to_match, c)) { return false; } + + output_size_bytes++; + str.next(); + return true; case '\\': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = c; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = c; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = '\\'; } - bytes_diff_for_escape_writing++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - string_token_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, c)) { return false; } + output_size_bytes++; + str.next(); return true; case '/': // for both unescaped/escaped writes a single char / if (nullptr != copy_dest) { *copy_dest++ = c; } - if (!try_match_char(to_match_str_pos, to_match_str_end, c)) { return false; } - string_token_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, c)) { return false; } + output_size_bytes++; + str.next(); return true; case 'b': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\b'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\b'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 'b'; } - 
bytes_diff_for_escape_writing++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\b')) { return false; } - string_token_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\b')) { return false; } + output_size_bytes++; + str.next(); return true; case 'f': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\f'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\f'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 'f'; } - bytes_diff_for_escape_writing++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\f')) { return false; } - string_token_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\f')) { return false; } + output_size_bytes++; + str.next(); return true; case 'n': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\n'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\n'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 'n'; } - bytes_diff_for_escape_writing++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\n')) { return false; } - string_token_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\n')) { return false; } + output_size_bytes++; + str.next(); return true; case 'r': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\r'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\r'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 'r'; } - bytes_diff_for_escape_writing++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\r')) { return false; } - string_token_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\r')) { return false; } + output_size_bytes++; + str.next(); return true; case 't': - if (nullptr != copy_dest && write_style::unescaped == w_style) { *copy_dest++ = '\t'; } - if (write_style::escaped == w_style) { + if (nullptr != copy_dest && escape_style::UNESCAPED == w_style) { *copy_dest++ = '\t'; } + if (escape_style::ESCAPED == w_style) { if (copy_dest != nullptr) { *copy_dest++ = '\\'; *copy_dest++ = 't'; } - bytes_diff_for_escape_writing++; + output_size_bytes++; } - if (!try_match_char(to_match_str_pos, to_match_str_end, '\t')) { return false; } - string_token_utf8_bytes++; - str_pos++; + if (!try_match_char(to_match, '\t')) { return false; } + output_size_bytes++; + str.next(); return true; // path 1 done: \", \', \\, \/, \b, \f, \n, \r, \t case 'u': // path 2: \u HEX HEX HEX HEX - str_pos++; + str.next(); // for both unescaped/escaped writes corresponding utf8 bytes, no need // to pass in write style - return try_skip_unicode(str_pos, to_match_str_pos, to_match_str_end, copy_dest); + return try_skip_unicode(str, to_match, copy_dest, output_size_bytes); default: // path 3: invalid return false; @@ -881,13 +847,13 @@ class json_parser { /** * parse: * fragment SAFECODEPOINT - * // 1 not " or ' depending to allow_single_quotes + * // 1 not " or ' * // 2 not \ * // 3 non control character: Ascii value not in [0, 32) * : ~ ["\\\u0000-\u001F] * ; */ - __device__ inline bool 
try_skip_safe_code_point(char const*& str_pos, char c) + __device__ inline bool try_skip_safe_code_point(char_range_reader& str, char c) { // 1 the char is not quoted(' or ") char, here satisfy, do not need to check // again @@ -897,7 +863,7 @@ class json_parser { // 3. chars not in [0, 32) int v = static_cast(c); if (!(v >= 0 && v < 32)) { - str_pos++; + str.next(); return true; } else { return false; @@ -915,18 +881,6 @@ class json_parser { return 0; } - /** - * parse four HEX chars to unsigned int - */ - __device__ inline cudf::char_utf8 parse_code_point(char const* p) - { - cudf::char_utf8 v = 0; - for (size_t i = 0; i < 4; i++) { - v = v * 16 + hex_value(p[i]); - } - return v; - } - /** * @brief Returns the number of bytes in the specified character. * @@ -995,56 +949,45 @@ class json_parser { * try skip 4 HEX chars * in pattern: '\\' 'u' HEX HEX HEX HEX, it's a code point of unicode */ - __device__ bool try_skip_unicode(char const*& str_pos, - char const*& to_match_str_pos, - char const* const to_match_str_end, - char*& copy_dest) + __device__ bool try_skip_unicode(char_range_reader& str, + char_range_reader& to_match, + char*& copy_dest, + int& output_size_bytes) { - // already parsed u - bool is_success = try_skip_hex(str_pos) && try_skip_hex(str_pos) && try_skip_hex(str_pos) && - try_skip_hex(str_pos); - if (is_success) { - // parse 4 HEX chars to uint32_t value - auto code_point = parse_code_point(str_pos - 4); - auto utf_char = codepoint_to_utf8(code_point); - // write utf8 bytes. - // In UTF-8, the maximum number of bytes used to encode a single character - // is 4 - char buff[4]; - cudf::size_type bytes = from_char_utf8(utf_char, buff); - string_token_utf8_bytes += bytes; - - if (nullptr != copy_dest) { - for (cudf::size_type i = 0; i < bytes; i++) { - *copy_dest++ = buff[i]; - } + // already parsed \u + // now we expect 4 hex chars. + cudf::char_utf8 code_point = 0; + for (size_t i = 0; i < 4; i++) { + if (str.eof()) { return false; } + char const c = str.current_char(); + str.next(); + if (!is_hex_digit(c)) { return false; } + code_point = (code_point * 16) + hex_value(c); + } + auto utf_char = codepoint_to_utf8(code_point); + // write utf8 bytes. + // In UTF-8, the maximum number of bytes used to encode a single character + // is 4 + char buff[4]; + cudf::size_type const bytes = from_char_utf8(utf_char, buff); + output_size_bytes += bytes; + + // TODO I think if we do an escape sequence for \n/etc it will return + // the wrong thing.... 
+ if (nullptr != copy_dest) { + for (cudf::size_type i = 0; i < bytes; i++) { + *copy_dest++ = buff[i]; } + } - if (nullptr != to_match_str_pos) { - for (cudf::size_type i = 0; i < bytes; i++) { - if (!(to_match_str_pos < to_match_str_end && *to_match_str_pos == buff[i])) { - return false; - } - to_match_str_pos++; - } + if (!to_match.is_null()) { + for (cudf::size_type i = 0; i < bytes; i++) { + if (!(!to_match.eof() && to_match.current_char() == buff[i])) { return false; } + to_match.next(); } - - return true; - } else { - return false; } - } - /** - * try skip HEX - */ - __device__ inline bool try_skip_hex(char const*& str_pos) - { - if (!eof(str_pos) && is_hex_digit(*str_pos)) { - str_pos++; - return true; - } - return false; + return true; } // =========== Parse string end =========== @@ -1072,7 +1015,7 @@ * * Note: Leading zeroes are not allowed, keep consistent with Spark, e.g.: 00, -01 are invalid */ - __device__ inline void parse_number() + __device__ inline void parse_number_and_set_current() { // parse sign try_skip(curr_pos, '-'); @@ -1084,14 +1027,14 @@ int number_digits_length = 0; if (try_unsigned_number(is_float, number_digits_length)) { if (check_max_num_len(number_digits_length)) { - curr_token = (is_float ? json_token::VALUE_NUMBER_FLOAT : json_token::VALUE_NUMBER_INT); + current_token = (is_float ? json_token::VALUE_NUMBER_FLOAT : json_token::VALUE_NUMBER_INT); // success parsed a number, update the token length number_token_len = curr_pos - current_token_start_pos; } else { - curr_token = json_token::ERROR; + set_current_error(); } } else { - curr_token = json_token::ERROR; + set_current_error(); } } @@ -1108,18 +1051,6 @@ (max_num_len > 0 && number_digits_length <= max_num_len); } - /** - * verify max string length if enabled - */ - __device__ inline bool check_string_max_utf8_bytes() - { - return - // disabled str len check - max_string_utf8_bytes <= 0 || - // enabled str len check - (max_string_utf8_bytes > 0 && string_token_utf8_bytes <= max_string_utf8_bytes); - } - /** * parse: INT ('.' [0-9]+)? EXP? 
* and verify leading zeroes @@ -1128,8 +1059,8 @@ class json_parser { */ __device__ inline bool try_unsigned_number(bool& is_float, int& number_digits_length) { - if (!eof(curr_pos)) { - char c = *curr_pos; + if (!eof()) { + char const c = chars[curr_pos]; if (c >= '1' && c <= '9') { curr_pos++; number_digits_length++; @@ -1142,8 +1073,8 @@ class json_parser { number_digits_length++; // check leading zeros - if (!eof(curr_pos)) { - char next_char_after_zero = *curr_pos; + if (!eof()) { + char const next_char_after_zero = chars[curr_pos]; if (next_char_after_zero >= '0' && next_char_after_zero <= '9') { // e.g.: 01 is invalid return false; @@ -1178,7 +1109,7 @@ class json_parser { } // parse exp - if (!eof(curr_pos) && (*curr_pos == 'e' || *curr_pos == 'E')) { + if (!eof() && (chars[curr_pos] == 'e' || chars[curr_pos] == 'E')) { curr_pos++; is_float = true; return try_parse_exp(number_digits_length); @@ -1194,8 +1125,8 @@ class json_parser { __device__ inline int skip_zero_or_more_digits() { int digits = 0; - while (!eof(curr_pos)) { - if (is_digit(*curr_pos)) { + while (!eof()) { + if (is_digit(chars[curr_pos])) { digits++; curr_pos++; } else { @@ -1213,7 +1144,7 @@ class json_parser { */ __device__ inline bool try_skip_one_or_more_digits(int& number_digits_length) { - if (!eof(curr_pos) && is_digit(*curr_pos)) { + if (!eof() && is_digit(chars[curr_pos])) { curr_pos++; number_digits_length++; number_digits_length += skip_zero_or_more_digits(); @@ -1232,7 +1163,7 @@ class json_parser { // already parsed [eE] // parse [+-]? - if (!eof(curr_pos) && (*curr_pos == '+' || *curr_pos == '-')) { curr_pos++; } + if (!eof() && (chars[curr_pos] == '+' || chars[curr_pos] == '-')) { curr_pos++; } // parse [0-9]+ return try_skip_one_or_more_digits(number_digits_length); @@ -1243,55 +1174,58 @@ class json_parser { /** * parse true */ - __device__ inline void parse_true() + __device__ inline void parse_true_and_set_current() { // already parsed 't' if (try_skip(curr_pos, 'r') && try_skip(curr_pos, 'u') && try_skip(curr_pos, 'e')) { - curr_token = json_token::VALUE_TRUE; + current_token = json_token::VALUE_TRUE; } else { - curr_token = json_token::ERROR; + set_current_error(); } } /** * parse false */ - __device__ inline void parse_false() + __device__ inline void parse_false_and_set_current() { // already parsed 'f' if (try_skip(curr_pos, 'a') && try_skip(curr_pos, 'l') && try_skip(curr_pos, 's') && try_skip(curr_pos, 'e')) { - curr_token = json_token::VALUE_FALSE; + current_token = json_token::VALUE_FALSE; } else { - curr_token = json_token::ERROR; + set_current_error(); } } /** * parse null */ - __device__ inline void parse_null() + __device__ inline void parse_null_and_set_current() { // already parsed 'n' if (try_skip(curr_pos, 'u') && try_skip(curr_pos, 'l') && try_skip(curr_pos, 'l')) { - curr_token = json_token::VALUE_NULL; + current_token = json_token::VALUE_NULL; } else { - curr_token = json_token::ERROR; + set_current_error(); } } /** * parse the key string in key:value pair */ - __device__ inline void parse_field_name() + __device__ inline void parse_field_name_and_set_current() { - auto [success, end_char_pos] = - try_parse_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); + // TODO eventually chars should be a reader so we can just pass it in... 
+ char_range_reader reader(chars, curr_pos); + current_token_start_pos = curr_pos; + auto [success, end_char_pos] = try_parse_string(reader); if (success) { - curr_pos = end_char_pos; - curr_token = json_token::FIELD_NAME; + // TODO remove end_char_pos, and just get it from the reader... + curr_pos = end_char_pos; + current_token = json_token::FIELD_NAME; } else { - curr_token = json_token::ERROR; + set_current_error(); } } @@ -1301,58 +1235,50 @@ class json_parser { * @param[out] has_comma_before_token has comma before next token * @param[out] has_colon_before_token has colon before next token */ - __device__ inline json_token parse_next_token(bool& has_comma_before_token, - bool& has_colon_before_token) + __device__ inline void parse_next_token_and_set_current(bool& has_comma_before_token, + bool& has_colon_before_token) { - skip_whitespaces(curr_pos); - if (!eof(curr_pos)) { - char c = *curr_pos; + skip_whitespaces(); + if (!eof()) { + char const c = chars[curr_pos]; if (is_context_stack_empty()) { // stack is empty - if (curr_token == json_token::INIT) { + if (current_token == json_token::INIT) { // main root entry point - current_token_start_pos = curr_pos; - parse_first_token_in_value(); + parse_first_token_in_value_and_set_current(); } else { - if (allow_tailing_sub_string) { - // previous token is not INIT, means already get a token; stack is - // empty; Successfully parsed. Note: ignore the tailing sub-string - curr_token = json_token::SUCCESS; - } else { - // not eof, has extra useless tailing characters. - curr_token = json_token::ERROR; - } + // previous token is not INIT, means already get a token; stack is + // empty; Successfully parsed. Note: ignore the tailing sub-string + current_token = json_token::SUCCESS; } } else { // stack is non-empty if (is_object_context()) { // in JSON object context - if (curr_token == json_token::START_OBJECT) { + if (current_token == json_token::START_OBJECT) { // previous token is '{' if (c == '}') { // empty object // close curr object context current_token_start_pos = curr_pos; curr_pos++; - curr_token = json_token::END_OBJECT; pop_curr_context(); + current_token = json_token::END_OBJECT; } else { // parse key in key:value pair - current_token_start_pos = curr_pos; - parse_field_name(); + parse_field_name_and_set_current(); } - } else if (curr_token == json_token::FIELD_NAME) { + } else if (current_token == json_token::FIELD_NAME) { if (c == ':') { has_colon_before_token = true; // skip ':' and parse value in key:value pair curr_pos++; - skip_whitespaces(curr_pos); - current_token_start_pos = curr_pos; - parse_first_token_in_value(); + skip_whitespaces(); + parse_first_token_in_value_and_set_current(); } else { - curr_token = json_token::ERROR; + set_current_error(); } } else { // expect next key:value pair or '}' @@ -1360,67 +1286,63 @@ class json_parser { // end of object current_token_start_pos = curr_pos; curr_pos++; - curr_token = json_token::END_OBJECT; pop_curr_context(); + current_token = json_token::END_OBJECT; } else if (c == ',') { has_comma_before_token = true; // parse next key:value pair curr_pos++; - skip_whitespaces(curr_pos); - current_token_start_pos = curr_pos; - parse_field_name(); + skip_whitespaces(); + parse_field_name_and_set_current(); } else { - curr_token = json_token::ERROR; + set_current_error(); } } } else { // in Json array context - if (curr_token == json_token::START_ARRAY) { + if (current_token == json_token::START_ARRAY) { // previous token is '[' if (c == ']') { // curr: ']', empty array 
current_token_start_pos = curr_pos; curr_pos++; - curr_token = json_token::END_ARRAY; pop_curr_context(); + current_token = json_token::END_ARRAY; } else { // non-empty array, parse the first value in the array - current_token_start_pos = curr_pos; - parse_first_token_in_value(); + parse_first_token_in_value_and_set_current(); } } else { if (c == ',') { has_comma_before_token = true; // skip ',' and parse the next value curr_pos++; - skip_whitespaces(curr_pos); - current_token_start_pos = curr_pos; - parse_first_token_in_value(); + skip_whitespaces(); + parse_first_token_in_value_and_set_current(); } else if (c == ']') { // end of array current_token_start_pos = curr_pos; curr_pos++; - curr_token = json_token::END_ARRAY; pop_curr_context(); + current_token = json_token::END_ARRAY; } else { - curr_token = json_token::ERROR; + set_current_error(); } } } } } else { // eof - if (is_context_stack_empty() && curr_token != json_token::INIT) { + if (is_context_stack_empty() && current_token != json_token::INIT) { // reach eof; stack is empty; previous token is not INIT - curr_token = json_token::SUCCESS; + current_token = json_token::SUCCESS; } else { // eof, and meet the following cases: // - has unclosed JSON array/object; // - the whole JSON is empty - curr_token = json_token::ERROR; + set_current_error(); } } - return curr_token; } public: @@ -1433,23 +1355,19 @@ class json_parser { // parse next token bool has_comma_before_token; // no-initialization because of do not care here bool has_colon_before_token; // no-initialization because of do not care here - return parse_next_token(has_comma_before_token, has_colon_before_token); + parse_next_token_and_set_current(has_comma_before_token, has_colon_before_token); + return current_token; } /** * get current token */ - __device__ json_token get_current_token() { return curr_token; } + __device__ json_token get_current_token() { return current_token; } - /** - * is valid JSON by parsing through all tokens - */ - __device__ bool is_valid() + // TODO make this go away!!!! 
+ __device__ inline char_range current_range() { - while (curr_token != json_token::ERROR && curr_token != json_token::SUCCESS) { - next_token(); - } - return curr_token == json_token::SUCCESS; + return chars.slice(current_token_start_pos, curr_pos - current_token_start_pos); } /** @@ -1459,12 +1377,12 @@ class json_parser { */ __device__ bool try_skip_children() { - if (curr_token == json_token::ERROR || curr_token == json_token::INIT || - curr_token == json_token::SUCCESS) { + if (current_token == json_token::ERROR || current_token == json_token::INIT || + current_token == json_token::SUCCESS) { return false; } - if (curr_token != json_token::START_OBJECT && curr_token != json_token::START_ARRAY) { + if (current_token != json_token::START_OBJECT && current_token != json_token::START_ARRAY) { return true; } @@ -1492,22 +1410,22 @@ class json_parser { */ __device__ cudf::size_type write_unescaped_text(char* destination) { - switch (curr_token) { - case json_token::VALUE_STRING: + switch (current_token) { + case json_token::VALUE_STRING: { // can not copy from JSON directly due to escaped chars // rewind the pos; parse again with copy - try_parse_string( - current_token_start_pos, nullptr, nullptr, destination, write_style::unescaped); - return string_token_utf8_bytes; + char_range_reader reader(current_range()); + return write_string(reader, destination, escape_style::UNESCAPED); + } case json_token::VALUE_NUMBER_INT: - if (number_token_len == 2 && current_token_start_pos[0] == '-' && - current_token_start_pos[1] == '0') { + if (number_token_len == 2 && chars[current_token_start_pos] == '-' && + chars[current_token_start_pos + 1] == '0') { if (nullptr != destination) *destination++ = '0'; return 1; } if (nullptr != destination) { for (cudf::size_type i = 0; i < number_token_len; ++i) { - *destination++ = *(current_token_start_pos + i); + *destination++ = chars[current_token_start_pos + i]; } } return number_token_len; @@ -1517,8 +1435,8 @@ class json_parser { // 12345678900000000000.0 => 1.23456789E19, 1E308 => 1.0E308 // 0.0000000000003 => 3.0E-13; 0.003 => 0.003; 0.0003 => 3.0E-4 // 1.0E309 => "Infinity", -1E309 => "-Infinity" - double d_value = spark_rapids_jni::detail::stod( - cudf::string_view(current_token_start_pos, number_token_len)); + double d_value = + cudf::strings::detail::stod(chars.slice_sv(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } case json_token::VALUE_TRUE: @@ -1546,12 +1464,12 @@ class json_parser { *destination++ = 'l'; } return 4; - case json_token::FIELD_NAME: + case json_token::FIELD_NAME: { // can not copy from JSON directly due to escaped chars // rewind the pos; parse again with copy - try_parse_string( - current_token_start_pos, nullptr, nullptr, destination, write_style::unescaped); - return string_token_utf8_bytes; + char_range_reader reader(current_range()); + return write_string(reader, destination, escape_style::UNESCAPED); + } case json_token::START_ARRAY: if (nullptr != destination) { *destination++ = '['; } return 1; @@ -1582,29 +1500,29 @@ class json_parser { */ __device__ cudf::size_type write_escaped_text(char* destination) { - switch (curr_token) { - case json_token::VALUE_STRING: + switch (current_token) { + case json_token::VALUE_STRING: { // can not copy from JSON directly due to escaped chars - // rewind the pos; parse again with copy - try_parse_string( - current_token_start_pos, nullptr, nullptr, destination, write_style::escaped); - return 
string_token_utf8_bytes + bytes_diff_for_escape_writing; - case json_token::VALUE_NUMBER_INT: - if (number_token_len == 2 && current_token_start_pos[0] == '-' && - current_token_start_pos[1] == '0') { + char_range_reader reader(current_range()); + return write_string(reader, destination, escape_style::ESCAPED); + } + case json_token::VALUE_NUMBER_INT: { + if (number_token_len == 2 && chars[current_token_start_pos] == '-' && + chars[current_token_start_pos + 1] == '0') { if (nullptr != destination) *destination++ = '0'; return 1; } if (nullptr != destination) { for (cudf::size_type i = 0; i < number_token_len; ++i) { - *destination++ = *(current_token_start_pos + i); + *destination++ = chars[current_token_start_pos + i]; } } return number_token_len; + } case json_token::VALUE_NUMBER_FLOAT: { // number normalization: - double d_value = spark_rapids_jni::detail::stod( - cudf::string_view(current_token_start_pos, number_token_len)); + double d_value = + cudf::strings::detail::stod(chars.slice_sv(current_token_start_pos, number_token_len)); return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); } case json_token::VALUE_TRUE: @@ -1632,12 +1550,11 @@ class json_parser { *destination++ = 'l'; } return 4; - case json_token::FIELD_NAME: + case json_token::FIELD_NAME: { // can not copy from JSON directly due to escaped chars - // rewind the pos; parse again with copy - try_parse_string( - current_token_start_pos, nullptr, nullptr, destination, write_style::escaped); - return string_token_utf8_bytes + bytes_diff_for_escape_writing; + char_range_reader reader(current_range()); + return write_string(reader, destination, escape_style::ESCAPED); + } case json_token::START_ARRAY: if (nullptr != destination) { *destination++ = '['; } return 1; @@ -1658,38 +1575,25 @@ class json_parser { return 0; } - /** - * reset the parser - */ - __device__ void reset() - { - curr_pos = json_start_pos; - curr_token = json_token::INIT; - stack_size = 0; - } - /** * match field name string when current token is FIELD_NAME, * return true if current token is FIELD_NAME and match successfully. 
* return false otherwise, - * Note: to_match_str_ptr should not be nullptr */ __device__ bool match_current_field_name(cudf::string_view name) { - return match_current_field_name(name.data(), name.size_bytes()); + return match_current_field_name(char_range(name)); } /** * match current field name */ - __device__ bool match_current_field_name(char const* to_match_str_ptr, cudf::size_type len) + __device__ bool match_current_field_name(char_range name) { - if (json_token::FIELD_NAME == curr_token) { - auto [b, end_pos] = try_parse_string(current_token_start_pos, - to_match_str_ptr, - to_match_str_ptr + len, - nullptr, - write_style::unescaped); + if (json_token::FIELD_NAME == current_token) { + char_range_reader reader(current_range()); + char_range_reader to_match(name); + auto [b, end_pos] = try_parse_string(reader, to_match, escape_style::UNESCAPED); return b; } else { return false; @@ -1704,7 +1608,7 @@ class json_parser { */ __device__ thrust::pair copy_current_structure(char* copy_to) { - switch (curr_token) { + switch (current_token) { case json_token::INIT: case json_token::ERROR: case json_token::SUCCESS: @@ -1746,10 +1650,10 @@ class json_parser { bool has_colon_before_token = false; // parse and get has_comma_before_token, has_colon_before_token - parse_next_token(has_comma_before_token, has_colon_before_token); + parse_next_token_and_set_current(has_comma_before_token, has_colon_before_token); // check the JSON format - if (curr_token == json_token::ERROR) { return thrust::make_pair(false, 0); } + if (current_token == json_token::ERROR) { return thrust::make_pair(false, 0); } // write out the token if (nullptr != copy_to) { @@ -1783,10 +1687,9 @@ class json_parser { } private: - char const* const json_start_pos; - char const* const json_end_pos; - char const* curr_pos; - json_token curr_token{json_token::INIT}; + char_range const chars; + cudf::size_type curr_pos; + json_token current_token; // 64 bits long saves the nested object/array contexts // true(bit value 1) is JSON object context @@ -1795,21 +1698,12 @@ class json_parser { int64_t context_stack; int stack_size = 0; + // TODO remove if possible // save current token start pos, used by coping current token text - char const* current_token_start_pos; + cudf::size_type current_token_start_pos; + // TODO remove if possible // used to store number token length cudf::size_type number_token_len; - - // Records string/field name token utf8 bytes size after unescaped - // e.g.: For JSON 4 chars string "\\n", after unescaped, get 1 char '\n' - // used by checking the max string length - int string_token_utf8_bytes; - - // Records bytes diff between escape writing and unescape writing - // e.g.: 4 chars string "\\n", string_token_utf8_bytes is 1, - // when `write_escaped_text`, will write out 4 chars: " \ n ", - // then this diff will be 4 - 1 = 3 - int bytes_diff_for_escape_writing; }; } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu index 6ae54f4fe9..ebb12eee93 100644 --- a/src/main/cpp/src/map_utils.cu +++ b/src/main/cpp/src/map_utils.cu @@ -16,10 +16,6 @@ #include "map_utils_debug.cuh" -// -#include - -// #include #include #include @@ -31,11 +27,11 @@ #include #include -// #include #include -// +#include +#include #include #include #include @@ -51,9 +47,7 @@ #include #include -// -#include -#include +#include namespace spark_rapids_jni { @@ -520,8 +514,9 @@ struct substring_fn { cudf::device_span const d_string; cudf::device_span const> const d_ranges; - cudf::size_type* 
d_offsets{}; - char* d_chars{}; + cudf::size_type* d_sizes; + char* d_chars; + cudf::detail::input_offsetalator d_offsets; __device__ void operator()(cudf::size_type const idx) { @@ -530,7 +525,7 @@ struct substring_fn { if (d_chars) { memcpy(d_chars + d_offsets[idx], d_string.data() + range.first, size); } else { - d_offsets[idx] = size; + d_sizes[idx] = size; } } }; @@ -543,7 +538,7 @@ std::unique_ptr extract_keys_or_values( rmm::device_uvector const& key_or_value, rmm::device_uvector const& unified_json_buff, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const is_key = cuda::proclaim_return_type( [key_or_value = key_or_value.begin()] __device__(auto const node_id) { @@ -584,7 +579,7 @@ rmm::device_uvector compute_list_offsets( rmm::device_uvector const& parent_node_ids, rmm::device_uvector const& key_or_value, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // Count the number of children nodes for the json object nodes. // These object nodes are given as one row of the input json strings column. @@ -648,7 +643,7 @@ rmm::device_uvector compute_list_offsets( std::unique_ptr from_json(cudf::column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.type().id() == cudf::type_id::STRING, "Invalid input format"); diff --git a/src/main/cpp/src/map_utils.hpp b/src/main/cpp/src/map_utils.hpp index c620b6fb95..96ba6f7e9b 100644 --- a/src/main/cpp/src/map_utils.hpp +++ b/src/main/cpp/src/map_utils.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -27,7 +28,7 @@ namespace spark_rapids_jni { std::unique_ptr from_json( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/murmur_hash.cu b/src/main/cpp/src/murmur_hash.cu index d94ca2d5bc..91e8fb97e0 100644 --- a/src/main/cpp/src/murmur_hash.cu +++ b/src/main/cpp/src/murmur_hash.cu @@ -187,7 +187,7 @@ void check_hash_compatibility(cudf::table_view const& input) std::unique_ptr murmur_hash3_32(cudf::table_view const& input, uint32_t seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto output = cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 0e6ea2690d..0e57366358 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -582,7 +582,9 @@ uri_parts __device__ validate_uri(const char* str, } // if the first ':' is after the other tokens, this doesn't have a scheme or it is invalid - if (col != -1 && (slash == -1 || col < slash) && (hash == -1 || col < hash)) { + bool const has_scheme = + (col != -1) && ((slash == -1) || (col < slash)) && ((hash == -1) || (col < hash)); + if (has_scheme) { // we have a scheme up to the : ret.scheme = {str, col}; if (!validate_scheme(ret.scheme)) { @@ -600,9 +602,12 @@ uri_parts __device__ validate_uri(const char* str, slash -= skip; } - // no more string to parse is an error + // no more string to parse is generally an error, unless we had no scheme if (len <= 0) { - ret.valid = 0; + // If we had 
a scheme then this is entirely invalid. + // If no scheme then URI is entirely empty or we only had a fragment + // This is equivalent to having a path that is present but empty, so mark it ok + ret.valid = (static_cast(!has_scheme) << static_cast(URI_chunks::PATH)); return ret; } @@ -655,13 +660,6 @@ uri_parts __device__ validate_uri(const char* str, next_slash == -1 ? question < 0 ? len - 2 : question - 2 : next_slash - 2}; if (next_slash > 0) { ret.path = {str + next_slash, path_len - next_slash}; } - if (next_slash == -1 && ret.authority.size_bytes() == 0 && ret.query.size_bytes() == 0 && - ret.fragment.size_bytes() == 0) { - // invalid! - but spark like to return things as long as you don't have illegal characters - // ret.valid = 0; - return ret; - } - if (ret.authority.size_bytes() > 0) { auto ipv6_address = ret.authority.size_bytes() > 2 && *ret.authority.begin() == '['; if (!validate_authority(ret.authority, ipv6_address)) { @@ -729,6 +727,7 @@ uri_parts __device__ validate_uri(const char* str, // path with no authority ret.path = {str, path_len}; } + if (!validate_path(ret.path)) { ret.valid = 0; return ret; @@ -878,7 +877,7 @@ std::unique_ptr parse_uri(strings_column_view const& input, URI_chunks chunk, std::optional query_match, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type strings_count = input.size(); if (strings_count == 0) { return make_empty_column(type_id::STRING); } @@ -956,7 +955,7 @@ std::unique_ptr parse_uri(strings_column_view const& input, std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::parse_uri(input, detail::URI_chunks::PROTOCOL, std::nullopt, stream, mr); @@ -964,7 +963,7 @@ std::unique_ptr parse_uri_to_protocol(strings_column_view const& input, std::unique_ptr parse_uri_to_host(strings_column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::parse_uri(input, detail::URI_chunks::HOST, std::nullopt, stream, mr); @@ -972,7 +971,7 @@ std::unique_ptr parse_uri_to_host(strings_column_view const& input, std::unique_ptr parse_uri_to_query(strings_column_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::parse_uri(input, detail::URI_chunks::QUERY, std::nullopt, stream, mr); @@ -981,7 +980,7 @@ std::unique_ptr parse_uri_to_query(strings_column_view const& input, std::unique_ptr parse_uri_to_query(cudf::strings_column_view const& input, std::string const& query_match, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -995,7 +994,7 @@ std::unique_ptr parse_uri_to_query(cudf::strings_column_view const std::unique_ptr parse_uri_to_query(cudf::strings_column_view const& input, cudf::strings_column_view const& query_match, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input.size() == query_match.size(), "Query column must be the same size as input!"); @@ -1003,4 +1002,11 @@ std::unique_ptr parse_uri_to_query(cudf::strings_column_view const return detail::parse_uri(input, detail::URI_chunks::QUERY, query_match, stream, mr); } +std::unique_ptr 
parse_uri_to_path(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::parse_uri(input, detail::URI_chunks::PATH, std::nullopt, stream, mr); +} } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/parse_uri.hpp b/src/main/cpp/src/parse_uri.hpp index 004d800ddb..2afc879cba 100644 --- a/src/main/cpp/src/parse_uri.hpp +++ b/src/main/cpp/src/parse_uri.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,13 +21,14 @@ #include #include +#include #include namespace spark_rapids_jni { /** - * @brief Parse protocol and copy from the input string column to the output char buffer. + * @brief Parse protocol and copy from the input string column to the output string column. * * @param input Input string column of URIs to parse * @param stream Stream on which to operate. @@ -36,11 +37,11 @@ namespace spark_rapids_jni { */ std::unique_ptr parse_uri_to_protocol( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** - * @brief Parse host and copy from the input string column to the output char buffer. + * @brief Parse host and copy from the input string column to the output string column. * * @param input Input string column of URIs to parse * @param stream Stream on which to operate. @@ -49,11 +50,11 @@ std::unique_ptr parse_uri_to_protocol( */ std::unique_ptr parse_uri_to_host( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** - * @brief Parse query and copy from the input string column to the output char buffer. + * @brief Parse query and copy from the input string column to the output string column. * * @param input Input string column of URIs to parse * @param stream Stream on which to operate. @@ -62,8 +63,8 @@ std::unique_ptr parse_uri_to_host( */ std::unique_ptr parse_uri_to_query( cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Parse query and copy from the input string column to the output string column. @@ -77,8 +78,8 @@ std::unique_ptr parse_uri_to_query( std::unique_ptr parse_uri_to_query( cudf::strings_column_view const& input, std::string const& query_match, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Parse query and copy from the input string column to the output string column. 
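For reference, here is a minimal usage sketch (not part of the patch) for the new `parse_uri_to_path` entry point defined above; the demo function name and the input rows are invented, and `cudf::test::strings_column_wrapper` is used only as a convenient way to build a strings column:

```cpp
// Hypothetical sketch, not part of this patch.
#include "parse_uri.hpp"

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

std::unique_ptr<cudf::column> demo_parse_uri_path()
{
  // Invented inputs: these rows should yield the paths "/s/uri" and "/path/to/file".
  cudf::test::strings_column_wrapper uris(
    {"https://www.nvidia.com/s/uri?param=1", "https://www.nvidia.com/path/to/file"});
  // stream and mr fall back to the defaults declared in parse_uri.hpp.
  return spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{uris});
}
```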
@@ -92,7 +93,20 @@ std::unique_ptr parse_uri_to_query( std::unique_ptr parse_uri_to_query( cudf::strings_column_view const& input, cudf::strings_column_view const& query_match, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Parse path and copy from the input string column to the output string column. + * + * @param input Input string column of URIs to parse + * @param stream Stream on which to operate. + * @param mr Memory resource for returned column + * @return std::unique_ptr String column of paths parsed. + */ +std::unique_ptr parse_uri_to_path( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/regex_rewrite_utils.cu b/src/main/cpp/src/regex_rewrite_utils.cu new file mode 100644 index 0000000000..2735b134f9 --- /dev/null +++ b/src/main/cpp/src/regex_rewrite_utils.cu @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace spark_rapids_jni { + +namespace { + +struct literal_range_pattern_fn { + __device__ bool operator()( + cudf::string_view d_string, cudf::string_view d_prefix, int range_len, int start, int end) const + { + int const n = d_string.length(), m = d_prefix.length(); + for (int i = 0; i <= n - m - range_len; i++) { + bool match = true; + for (int j = 0; j < m; j++) { + if (d_string[i + j] != d_prefix[j]) { + match = false; + break; + } + } + if (match) { + for (int j = 0; j < range_len; j++) { + auto code_point = cudf::strings::detail::utf8_to_codepoint(d_string[i + m + j]); + if (code_point < start || code_point > end) { + match = false; + break; + } + } + if (match) { return true; } + } + } + return false; + } +}; + +std::unique_ptr find_literal_range_pattern(cudf::strings_column_view const& strings, + cudf::string_scalar const& prefix, + int const range_len, + int const start, + int const end, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const strings_count = strings.size(); + if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::BOOL8); } + + CUDF_EXPECTS(prefix.is_valid(stream), "Parameter prefix must be valid."); + + auto const d_prefix = cudf::string_view(prefix.data(), prefix.size()); + auto const strings_column = cudf::column_device_view::create(strings.parent(), stream); + auto const d_strings = *strings_column; + + auto results = make_numeric_column(cudf::data_type{cudf::type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto const d_results = results->mutable_view().data(); + // set the bool values by evaluating the passed function + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_results, + [d_strings, d_prefix, range_len, start, end, check_fn = literal_range_pattern_fn{}] __device__( + cudf::size_type idx) { + if (!d_strings.is_null(idx)) { + return check_fn(d_strings.element(idx), d_prefix, range_len, start, end); + } + return false; + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace + +/** + * @brief Check if input string contains regex pattern `literal[start-end]{len,}`, which means + * a literal string followed by a range of characters in the range of start to end, with at least + * len characters. + * + * @param strings Column of strings to check for literal. + * @param literal UTF-8 encoded string to check in strings column. + * @param len Minimum number of characters to check after the literal. + * @param start Minimum UTF-8 codepoint value to check for in the range. + * @param end Maximum UTF-8 codepoint value to check for in the range. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +std::unique_ptr literal_range_pattern(cudf::strings_column_view const& input, + cudf::string_scalar const& prefix, + int const range_len, + int const start, + int const end, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return find_literal_range_pattern(input, prefix, range_len, start, end, stream, mr); +} + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/regex_rewrite_utils.hpp b/src/main/cpp/src/regex_rewrite_utils.hpp new file mode 100644 index 0000000000..e5e500b180 --- /dev/null +++ b/src/main/cpp/src/regex_rewrite_utils.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace spark_rapids_jni { +/** + * @brief Check if input string contains regex pattern `literal[start-end]{len,}`, which means + * a literal string followed by a range of characters in the range of start to end, with at least + * len characters. + * + * @param strings Column of strings to check for literal. + * @param literal UTF-8 encoded string to check in strings column. + * @param len Minimum number of characters to check after the literal. + * @param start Minimum UTF-8 codepoint value to check for in the range. + * @param end Maximum UTF-8 codepoint value to check for in the range. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
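As an aside (not part of the patch), a minimal sketch of how this wrapper might be invoked to emulate the rewritten regex `abc[0-9]{3,}`; the demo function name and the input rows are invented:

```cpp
// Hypothetical sketch, not part of this patch.
#include "regex_rewrite_utils.hpp"

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

std::unique_ptr<cudf::column> demo_literal_range_pattern()
{
  // Literal "abc" followed by at least 3 characters, each in the codepoint range ['0', '9'].
  // Invented rows; per the device functor above, the expected BOOL8 output is [true, false, false].
  cudf::test::strings_column_wrapper input({"xxabc123", "abc12", "abd999"});
  cudf::string_scalar prefix("abc");
  return spark_rapids_jni::literal_range_pattern(
    cudf::strings_column_view{input}, prefix, /*range_len=*/3, /*start=*/'0', /*end=*/'9');
}
```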
+ */ +std::unique_ptr literal_range_pattern( + cudf::strings_column_view const& input, + cudf::string_scalar const& literal, + int const len, + int const start, + int const end, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/row_conversion.cu b/src/main/cpp/src/row_conversion.cu index 3d6e767042..4ad3927a41 100644 --- a/src/main/cpp/src/row_conversion.cu +++ b/src/main/cpp/src/row_conversion.cu @@ -1213,7 +1213,7 @@ static std::unique_ptr fixed_width_convert_to_rows( const scalar& zero, const scalar& scalar_size_per_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { int64_t const total_allocation = size_per_row * num_rows; // We made a mistake in the split somehow @@ -1459,7 +1459,7 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_width, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const total_size = thrust::reduce(rmm::exec_policy(stream), row_sizes, row_sizes + num_rows); auto const num_batches = static_cast( @@ -1758,7 +1758,7 @@ std::vector> convert_to_rows( column_info_s const& column_info, std::optional> variable_width_offsets, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { int device_id; CUDF_CUDA_TRY(cudaGetDevice(&device_id)); @@ -1989,7 +1989,7 @@ std::vector> convert_to_rows( */ std::vector> convert_to_rows(table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const num_columns = tbl.num_columns(); auto const num_rows = tbl.num_rows(); @@ -2051,7 +2051,7 @@ std::vector> convert_to_rows(table_view const& tbl, } std::vector> convert_to_rows_fixed_width_optimized( - table_view const& tbl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + table_view const& tbl, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { auto const num_columns = tbl.num_columns(); @@ -2145,7 +2145,7 @@ void fixup_null_counts(std::vector>& output_columns, std::unique_ptr convert_from_rows(lists_column_view const& input, std::vector const& schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // verify that the types are what we expect column_view child = input.child(); @@ -2208,7 +2208,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const& input, size_type num_rows, bool include_nm, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::device_async_resource_ref mr) { auto column = make_fixed_width_column(type, num_rows, @@ -2444,7 +2444,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const& input, std::unique_ptr
convert_from_rows_fixed_width_optimized(lists_column_view const& input, std::vector const& schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // verify that the types are what we expect column_view child = input.child(); diff --git a/src/main/cpp/src/row_conversion.hpp b/src/main/cpp/src/row_conversion.hpp index a5abd5b1bd..0aa7593516 100644 --- a/src/main/cpp/src/row_conversion.hpp +++ b/src/main/cpp/src/row_conversion.hpp @@ -21,6 +21,7 @@ #include #include +#include #include @@ -29,25 +30,25 @@ namespace spark_rapids_jni { std::vector> convert_to_rows_fixed_width_optimized( cudf::table_view const& tbl, // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::vector> convert_to_rows( cudf::table_view const& tbl, // TODO need something for validity - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows_fixed_width_optimized( cudf::lists_column_view const& input, std::vector const& schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr convert_from_rows( cudf::lists_column_view const& input, std::vector const& schema, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/spark_rapids_jni_version.cpp.in b/src/main/cpp/src/spark_rapids_jni_version.cpp.in new file mode 100644 index 0000000000..fdc2aa3007 --- /dev/null +++ b/src/main/cpp/src/spark_rapids_jni_version.cpp.in @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "spark_rapids_jni_version.h" + +namespace spark_rapids_jni { + +char const Version[] = "@CMAKE_PROJECT_VERSION@ @SPARK_RAPIDS_JNI_COMMIT_DETAILS@"; + +} diff --git a/src/main/cpp/src/spark_rapids_jni_version.h b/src/main/cpp/src/spark_rapids_jni_version.h new file mode 100644 index 0000000000..c77a8ec5a9 --- /dev/null +++ b/src/main/cpp/src/spark_rapids_jni_version.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace spark_rapids_jni { + +extern char const Version[]; + +} diff --git a/src/main/cpp/src/string_to_float_cudf.cuh b/src/main/cpp/src/string_to_float_cudf.cuh deleted file mode 100644 index 5a7824d495..0000000000 --- a/src/main/cpp/src/string_to_float_cudf.cuh +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include - -namespace spark_rapids_jni { -namespace detail { - -/** - * @brief This function converts the given string into a - * floating point double value. - * - * This will also map strings containing "NaN", "Inf", etc. - * to the appropriate float values. - * - * This function will also handle scientific notation format. - * - * This function is a copy of cudf::strings::detail::stod with - * the namespace changed to spark_rapids_jni::detail and fixed - * an overflow bug of `exp_ten`. It is a short-term solution to - * resolve a bug in get_json_object. We should remove this file - * once the bug is fixed in cudf in long term. - * This diff is `if (exp_ten >= 1e8) break;` - */ -__device__ inline double stod(cudf::string_view const& d_str) -{ - char const* in_ptr = d_str.data(); - char const* end = in_ptr + d_str.size_bytes(); - if (end == in_ptr) return 0.0; - double sign{1.0}; - if (*in_ptr == '-' || *in_ptr == '+') { - sign = (*in_ptr == '-' ? 
-1 : 1); - ++in_ptr; - } - - constexpr double infinity = std::numeric_limits::infinity(); - constexpr uint64_t max_holding = (std::numeric_limits::max() - 9L) / 10L; - - // special strings: NaN, Inf - if ((in_ptr < end) && *in_ptr > '9') { - auto const inf_nan = cudf::string_view(in_ptr, static_cast(end - in_ptr)); - if (cudf::strings::detail::is_nan_str(inf_nan)) { return nan(""); } - if (cudf::strings::detail::is_inf_str(inf_nan)) { return sign * infinity; } - } - - // Parse and store the mantissa as much as we can, - // until we are about to exceed the limit of uint64_t - uint64_t digits = 0; - int exp_off = 0; - bool decimal = false; - while (in_ptr < end) { - char ch = *in_ptr; - if (ch == '.') { - decimal = true; - ++in_ptr; - continue; - } - if (ch < '0' || ch > '9') break; - if (digits > max_holding) - exp_off += (int)!decimal; - else { - digits = (digits * 10L) + static_cast(ch - '0'); - if (digits > max_holding) { - digits = digits / 10L; - exp_off += (int)!decimal; - } else - exp_off -= (int)decimal; - } - ++in_ptr; - } - if (digits == 0) { return sign * static_cast(0); } - - // check for exponent char - int exp_ten = 0; - int exp_sign = 1; - if (in_ptr < end) { - char ch = *in_ptr++; - if (ch == 'e' || ch == 'E') { - if (in_ptr < end) { - ch = *in_ptr; - if (ch == '-' || ch == '+') { - exp_sign = (ch == '-' ? -1 : 1); - ++in_ptr; - } - while (in_ptr < end) { - ch = *in_ptr++; - if (ch < '0' || ch > '9') break; - exp_ten = (exp_ten * 10) + (int)(ch - '0'); - if (exp_ten >= 1e8) break; - } - } - } - } - - int const num_digits = static_cast(log10(static_cast(digits))) + 1; - exp_ten *= exp_sign; - exp_ten += exp_off; - exp_ten += num_digits - 1; - if (exp_ten > std::numeric_limits::max_exponent10) { - return sign > 0 ? infinity : -infinity; - } - - double base = sign * static_cast(digits); - - exp_ten += 1 - num_digits; - // If 10^exp_ten would result in a subnormal value, the base and - // exponent should be adjusted so that 10^exp_ten is a normal value - auto const subnormal_shift = std::numeric_limits::min_exponent10 - exp_ten; - if (subnormal_shift > 0) { - // Handle subnormal values. Ensure that both base and exponent are - // normal values before computing their product. - base = base / exp10(static_cast(num_digits - 1 + subnormal_shift)); - exp_ten += num_digits - 1; // adjust exponent - auto const exponent = exp10(static_cast(exp_ten + subnormal_shift)); - return base * exponent; - } - - double const exponent = exp10(static_cast(std::abs(exp_ten))); - return exp_ten < 0 ? 
base / exponent : base * exponent; -} - -} // namespace detail -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/timezones.cu b/src/main/cpp/src/timezones.cu index 30f19d9df0..12278d181d 100644 --- a/src/main/cpp/src/timezones.cu +++ b/src/main/cpp/src/timezones.cu @@ -95,7 +95,7 @@ auto convert_timestamp_tz(column_view const& input, size_type tz_index, bool to_utc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { // get the fixed transitions auto const ft_cdv_ptr = column_device_view::create(transitions.column(0), stream); @@ -127,7 +127,7 @@ std::unique_ptr convert_timestamp(column_view const& input, size_type tz_index, bool to_utc, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const type = input.type().id(); @@ -149,7 +149,7 @@ std::unique_ptr convert_timestamp_to_utc(column_view const& input, table_view const& transitions, size_type tz_index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return convert_timestamp(input, transitions, tz_index, true, stream, mr); } @@ -158,7 +158,7 @@ std::unique_ptr convert_utc_timestamp_to_timezone(column_view const& inp table_view const& transitions, size_type tz_index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return convert_timestamp(input, transitions, tz_index, false, stream, mr); } diff --git a/src/main/cpp/src/timezones.hpp b/src/main/cpp/src/timezones.hpp index c7ab3c0cc8..00173075b6 100644 --- a/src/main/cpp/src/timezones.hpp +++ b/src/main/cpp/src/timezones.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include @@ -42,8 +43,8 @@ std::unique_ptr convert_timestamp_to_utc( cudf::column_view const& input, cudf::table_view const& transitions, cudf::size_type tz_index, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Convert input column timestamps in UTC to specified timezone @@ -63,7 +64,7 @@ std::unique_ptr convert_utc_timestamp_to_timezone( cudf::column_view const& input, cudf::table_view const& transitions, cudf::size_type tz_index, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/utilities.cu b/src/main/cpp/src/utilities.cu index 7c202a1bec..0b44a2a994 100644 --- a/src/main/cpp/src/utilities.cu +++ b/src/main/cpp/src/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,7 +23,6 @@ #include #include #include -#include #include @@ -32,7 +31,7 @@ namespace spark_rapids_jni { std::unique_ptr bitmask_bitwise_or( std::vector> const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input.size() > 0, "Empty input"); auto const mask_size = (*input.begin()).size(); diff --git a/src/main/cpp/src/utilities.hpp b/src/main/cpp/src/utilities.hpp index 261e75befc..ad0eae7dc6 100644 --- a/src/main/cpp/src/utilities.hpp +++ b/src/main/cpp/src/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include namespace spark_rapids_jni { @@ -35,7 +36,7 @@ namespace spark_rapids_jni { */ std::unique_ptr bitmask_bitwise_or( std::vector> const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/xxhash64.cu b/src/main/cpp/src/xxhash64.cu index 78cc4651bd..daed7590c3 100644 --- a/src/main/cpp/src/xxhash64.cu +++ b/src/main/cpp/src/xxhash64.cu @@ -330,7 +330,7 @@ class device_row_hasher { std::unique_ptr xxhash64(cudf::table_view const& input, int64_t _seed, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { hash_value_type seed = static_cast(_seed); diff --git a/src/main/cpp/src/zorder.cu b/src/main/cpp/src/zorder.cu index 405b046528..37089c7736 100644 --- a/src/main/cpp/src/zorder.cu +++ b/src/main/cpp/src/zorder.cu @@ -137,7 +137,7 @@ namespace spark_rapids_jni { std::unique_ptr interleave_bits(cudf::table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto num_columns = tbl.num_columns(); CUDF_EXPECTS(num_columns > 0, "The input table must have at least one column."); @@ -224,7 +224,7 @@ std::unique_ptr interleave_bits(cudf::table_view const& tbl, std::unique_ptr hilbert_index(int32_t const num_bits_per_entry, cudf::table_view const& tbl, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { auto const num_rows = tbl.num_rows(); auto const num_columns = tbl.num_columns(); diff --git a/src/main/cpp/src/zorder.hpp b/src/main/cpp/src/zorder.hpp index c3fffc9b48..1e084a09de 100644 --- a/src/main/cpp/src/zorder.hpp +++ b/src/main/cpp/src/zorder.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -27,13 +28,13 @@ namespace spark_rapids_jni { std::unique_ptr interleave_bits( cudf::table_view const& tbl, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); std::unique_ptr hilbert_index( int32_t const num_bits, cudf::table_view const& tbl, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + 
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/hash.cpp b/src/main/cpp/tests/hash.cpp index 265603a9af..9ce57ad018 100644 --- a/src/main/cpp/tests/hash.cpp +++ b/src/main/cpp/tests/hash.cpp @@ -30,41 +30,6 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve class HashTest : public cudf::test::BaseFixture {}; -TEST_F(HashTest, MultiValue) -{ - cudf::test::strings_column_wrapper const strings_col( - {"", - "The quick brown fox", - "jumps over the lazy dog.", - "All work and no play makes Jack a dull boy", - R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); - - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max()}); - - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); - - using ts = cudf::timestamp_s; - cudf::test::fixed_width_column_wrapper const secs_col( - {ts::duration::zero(), - static_cast(100), - static_cast(-100), - ts::duration::min(), - ts::duration::max()}); - - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1, secs_col}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2, secs_col}); - - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); - - EXPECT_EQ(input1.num_rows(), output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); -} - TEST_F(HashTest, MultiValueNulls) { // Nulls with different values should be equal @@ -115,14 +80,6 @@ TEST_F(HashTest, MultiValueNulls) auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1, secs_col1}); auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2, secs_col2}); - { - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); - - EXPECT_EQ(input1.num_rows(), output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); - } - { auto const output1 = spark_rapids_jni::murmur_hash3_32(input1, 0); auto const output2 = spark_rapids_jni::murmur_hash3_32(input2); @@ -141,221 +98,6 @@ TEST_F(HashTest, MultiValueNulls) } } -TEST_F(HashTest, BasicList) -{ - using LCW = cudf::test::lists_column_wrapper; - using ICW = cudf::test::fixed_width_column_wrapper; - - auto const col = LCW{{}, {}, {1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; - auto const input = cudf::table_view({col}); - auto const expect = ICW{1607593296, - 1607593296, - -636010097, - -132459357, - -636010097, - -2008850957, - -1023787369, - 761197503, - 761197503, - 1340177511, - -1023787369, - -1023787369}; - - auto const output = cudf::hash(input); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); - - auto const expect_seeded = ICW{1607594268u, - 1607594268u, - 1576790066u, - 1203671017u, - 1576790066u, - 2107478077u, - 1756855002u, - 2228938758u, - 2228938758u, - 3491134126u, - 1756855002u, - 1756855002u}; - - auto const seeded_output = cudf::hash(input, cudf::hash_id::HASH_MURMUR3, 15); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity); -} - -TEST_F(HashTest, NullableList) -{ - using LCW = cudf::test::lists_column_wrapper; - using ICW = cudf::test::fixed_width_column_wrapper; - - auto const valids = std::vector{1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0}; - auto const col = - 
LCW{{{}, {}, {1}, {1}, {2, 2}, {2}, {2}, {}, {2, 2}, {2, 2}, {}}, valids.begin()}; - auto expect = ICW{-2023148619, - -2023148619, - -31671896, - -31671896, - -1205248335, - 1865773848, - 1865773848, - -2023148682, - -1205248335, - -1205248335, - -2023148682}; - - auto const output = cudf::hash(cudf::table_view({col})); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); - - auto const expect_seeded = ICW{2271820643u, - 2271820643u, - 1038318696u, - 1038318696u, - 595138041u, - 3027840870u, - 3027840870u, - 2271820578u, - 595138041u, - 595138041u, - 2271820578u}; - - auto const seeded_output = cudf::hash(cudf::table_view({col}), cudf::hash_id::HASH_MURMUR3, 31); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity); -} - -TEST_F(HashTest, ListOfStruct) -{ - auto col1 = cudf::test::fixed_width_column_wrapper{ - {-1, -1, 0, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, 1, 2}, - {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}}; - auto col2 = cudf::test::strings_column_wrapper{ - {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"}, - {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1}}; - auto struct_col = cudf::test::structs_column_wrapper{ - {col1, col2}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}; - - auto offsets = cudf::test::fixed_width_column_wrapper{ - 0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; - - auto list_nullmask = std::vector{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - auto [null_mask, null_count] = - cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end()); - auto list_column = cudf::make_lists_column( - 17, offsets.release(), struct_col.release(), null_count, std::move(null_mask)); - - auto expect = cudf::test::fixed_width_column_wrapper{83451479, - 83451479, - 83455332, - 83455332, - -759684425, - -959632766, - -959632766, - -959632766, - -959636527, - -656998704, - 613652814, - 1902080426, - 1902080426, - 2061025592, - 2061025592, - -319840811, - -319840811}; - - auto const output = cudf::hash(cudf::table_view({*list_column})); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); - - auto expect_seeded = cudf::test::fixed_width_column_wrapper{81710442u, - 81710442u, - 81729816u, - 81729816u, - 3532787573u, - 3642097855u, - 3642097855u, - 3642097855u, - 3642110391u, - 3889855760u, - 1494406307u, - 103934081u, - 103934081u, - 3462063680u, - 3462063680u, - 1696730835u, - 1696730835u}; - - auto const seeded_output = - cudf::hash(cudf::table_view({*list_column}), cudf::hash_id::HASH_MURMUR3, 619); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect_seeded, seeded_output->view(), verbosity); -} - -TEST_F(HashTest, ListOfEmptyStruct) -{ - // [] - // [] - // Null - // Null - // [Null, Null] - // [Null, Null] - // [Null, Null] - // [Null] - // [Null] - // [{}] - // [{}] - // [{}, {}] - // [{}, {}] - - auto struct_validity = std::vector{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}; - auto [null_mask, null_count] = - cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end()); - auto struct_col = cudf::make_structs_column(14, {}, null_count, std::move(null_mask)); - - auto offsets = cudf::test::fixed_width_column_wrapper{ - 0, 0, 0, 0, 0, 2, 4, 6, 7, 8, 9, 10, 12, 14}; - auto list_nullmask = std::vector{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - std::tie(null_mask, null_count) = - cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end()); - auto list_column = cudf::make_lists_column( - 13, 
offsets.release(), std::move(struct_col), null_count, std::move(null_mask)); - - auto expect = cudf::test::fixed_width_column_wrapper{2271818677u, - 2271818677u, - 2271818614u, - 2271818614u, - 3954409013u, - 3954409013u, - 3954409013u, - 2295666275u, - 2295666275u, - 2295666276u, - 2295666276u, - 3954409052u, - 3954409052u}; - - auto output = cudf::hash(cudf::table_view({*list_column})); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); -} - -TEST_F(HashTest, EmptyDeepList) -{ - // List>, where all lists are empty - // [] - // [] - // Null - // Null - - // Internal empty list - auto list1 = cudf::test::lists_column_wrapper{}; - - auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}; - auto list_nullmask = std::vector{1, 1, 0, 0}; - auto [null_mask, null_count] = - cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end()); - auto list_column = cudf::make_lists_column( - 4, offsets.release(), list1.release(), null_count, std::move(null_mask)); - - auto expect = cudf::test::fixed_width_column_wrapper{ - 2271818677u, 2271818677u, 2271818614u, 2271818614u}; - - auto output = cudf::hash(cudf::table_view({*list_column})); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); -} - template class HashTestTyped : public cudf::test::BaseFixture {}; @@ -367,15 +109,6 @@ TYPED_TEST(HashTestTyped, Equality) auto const input = cudf::table_view({col}); // Hash of same input should be equal - - { - auto const output1 = cudf::hash(input); - auto const output2 = cudf::hash(input); - - EXPECT_EQ(input.num_rows(), output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); - } - { auto const output1 = spark_rapids_jni::murmur_hash3_32(input, 0); auto const output2 = spark_rapids_jni::murmur_hash3_32(input); @@ -404,14 +137,6 @@ TYPED_TEST(HashTestTyped, EqualityNulls) auto const input1 = cudf::table_view({col1}); auto const input2 = cudf::table_view({col2}); - { - auto const output1 = cudf::hash(input1); - auto const output2 = cudf::hash(input2); - - EXPECT_EQ(input1.num_rows(), output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); - } - { auto const output1 = spark_rapids_jni::murmur_hash3_32(input1, 0); auto const output2 = spark_rapids_jni::murmur_hash3_32(input2); @@ -454,13 +179,6 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes) auto const table_col_neg_zero = cudf::table_view({col_neg_zero}); auto const table_col_neg_nan = cudf::table_view({col_neg_nan}); - auto const hash_col = cudf::hash(table_col); - auto const hash_col_neg_zero = cudf::hash(table_col_neg_zero); - auto const hash_col_neg_nan = cudf::hash(table_col_neg_nan); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_zero, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, verbosity); - // Spark hash is sensitive to 0 and -0 { auto const spark_col = spark_rapids_jni::murmur_hash3_32(table_col, 0); diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index a042ff46e9..4c7cf6446a 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -25,6 +25,7 @@ struct ParseURIProtocolTests : public cudf::test::BaseFixture {}; struct ParseURIHostTests : public cudf::test::BaseFixture {}; struct ParseURIQueryTests : public cudf::test::BaseFixture {}; +struct ParseURIPathTests : public cudf::test::BaseFixture {}; enum class test_types { SIMPLE, @@ -79,8 +80,7 @@ cudf::test::strings_column_wrapper get_test_data(test_types t) 
"http://[fe80::7:8%eth0]", "http://[fe80::7:8%1]", "http://foo.bar/abc/\\\\\\http://foo.bar/abc.gif\\\\\\", - "www.nvidia.com:8100/servlet/" - "impc.DisplayCredits?primekey_in=2000041100:05:14115240636", + "www.nvidia.com:8100/servlet/impc.DisplayCredits?primekey_in=2000041100:05:14115240636", "https://nvidia.com/2Ru15Ss ", "http://www.nvidia.com/plugins//##", "www.nvidia.com:81/Free.fr/L7D9qw9X4S-aC0&D4X0/Panels&solutionId=0X54a/" @@ -407,3 +407,109 @@ TEST_F(ParseURIQueryTests, Queries) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); } } + +TEST_F(ParseURIPathTests, Simple) +{ + auto const col = get_test_data(test_types::SIMPLE); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"/s/uri", + "", + "/to/a/cool/file", + "/path/to/file", + "/www.nvidia.com", + "", + "/network/path/to/file", + "nvidia.com", + "www.nvidia.com/s/uri"}, + {1, 1, 1, 1, 1, 0, 1, 1, 1}); + + cudf::test::print(result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIPathTests, SparkEdges) +{ + auto const col = get_test_data(test_types::SPARK_EDGES); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"/https&", + "//www.nvidia.com", + "", + "", + "", + "/absolute/path", + "", + "", + "", + "", + "/absolute/path", + "", + "", + "/q/This%20is%20a%20query", + "", + "", + "", + "", + "", + "/file;param", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "/", + "/", + "/"}, + {1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}); + + cudf::test::print(result->view()); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIPathTests, IP6) +{ + auto const col = get_test_data(test_types::IPv6); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected( + {"", "", "", "", "", "", "", "/path/to/file", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); + + cudf::test::print(result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIPathTests, IP4) +{ + auto const col = get_test_data(test_types::IPv4); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"/", "/", "/", "/", "/", "/path/to/file"}); + + cudf::test::print(result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(ParseURIPathTests, UTF8) +{ + auto const col = get_test_data(test_types::UTF8); + auto const result = spark_rapids_jni::parse_uri_to_path(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper const expected({"/%4EV%49%44%49%41", "", "/123", ""}, + {1, 1, 1, 0}); + + cudf::test::print(result->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} diff --git a/src/main/fbs/profiler.fbs b/src/main/fbs/profiler.fbs new file mode 100644 index 0000000000..0770be33cf --- /dev/null +++ b/src/main/fbs/profiler.fbs @@ -0,0 +1,287 @@ +// Copyright (c) 2024, NVIDIA CORPORATION. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Flatbuffer schema for the profiler +// NOTE: The schema needs to be in a single file because the build embeds it +// into the converter tool to be able to emit profile records as JSON. + +// Profiling data is written as a series of size-prefixed flatbuffers. +// The first flatbuffer is always ProfileHeader followed by zero or more ActivityRecords. + +namespace spark_rapids_jni.profiler; + +table ActivityObjectId { + process_id:uint32; // present if object kind is Process or Thread + thread_id:uint32; // present if object kind is Thread + device_id:uint32; // present if object kind is Device or Context or Stream + context_id:uint32; // present if object kind is Context or Stream + stream_id:uint32; // present if object kind is Stream +} + +enum ApiKind:byte { + Driver = 0, + Runtime = 1 +} + +enum ChannelType:uint8 { + Invalid = 0, + Compute = 1, + AsyncMemcpy = 2 +} + +table CommandBufferFullData { + command_buffer_length:uint32; + channel_id:uint32; + channel_type:uint32; +} + +enum LaunchType:uint8 { + Regular = 0, + CooperativeSingleDevice = 1, + CooperativeMultiDevice = 2 +} + +enum MarkerFlags:uint8 (bit_flags) { + Instantaneous = 0, + Start = 1, + End = 2, + SyncAcquire = 3, + SyncAcquireSuccess = 4, + SyncAcquireFailed = 5, + SyncRelease = 6 +} + +enum MemcpyFlags:uint8 (bit_flags) { + Async = 0 +} + +enum MemcpyKind:uint8 { + Unknown = 0, + HtoD = 1, + DtoH = 2, + HtoA = 3, + AtoH = 4, + AtoA = 5, + AtoD = 6, + DtoA = 7, + DtoD = 8, + HtoH = 9, + PtoP = 10 +} + +enum MemoryKind:uint8 { + Unknown = 0, + Pageable = 1, + Pinned = 2, + Device = 3, + Array = 4, + Managed = 5, + DeviceStatic = 6, + ManagedStatic = 7 +} + +enum MemsetFlags:uint8 (bit_flags) { + Async = 0 +} + +enum OverheadKind:uint8 { + Unknown = 0, + DriverCompiler = 1, + CUptiBufferFlush = 2, + CUptiInstrumentation = 3, + CUptiResource = 4 +} + +enum PartitionedGlobalCacheConfig:uint8 { + Unknown = 0, + NotSupported = 1, + Off = 2, + On = 3 +} + +enum ShmemLimitConfig:uint8 { + Default = 0, + Optin = 1 +} + +table ProfileHeader { + magic:string; + version:uint32; + writer_version:string; +} + +table ActivityRecords { + api:[ApiActivity]; + device:[DeviceActivity]; + dropped:[DroppedRecords]; + kernel:[KernelActivity]; + marker:[MarkerActivity]; + marker_data:[MarkerData]; + memcpy:[MemcpyActivity]; + memset:[MemsetActivity]; + overhead:[OverheadActivity]; +} + +table ApiActivity { + kind:ApiKind = Runtime; + cbid:uint32; + start:uint64; + end:uint64; + process_id:uint32; + thread_id:uint32; + correlation_id:uint32; + return_value:uint32 = 0; +} + +table DeviceActivity { + global_memory_bandwidth:uint64; + global_memory_size:uint64; + constant_memory_size:uint32; + l2_cache_size:uint32; + num_threads_per_warp:uint32; + core_clock_rate:uint32; + num_memcpy_engines:uint32; + num_multiprocessors:uint32; + max_ipc:uint32; + max_warps_per_multiprocessor:uint32; + max_blocks_per_multiprocessor:uint32; + max_shared_memory_per_multiprocessor:uint32; + max_registers_per_multiprocessor:uint32; + max_registers_per_block:uint32; + max_shared_memory_per_block:uint32; + max_threads_per_block:uint32; + 
max_block_dim_x:uint32; + max_block_dim_y:uint32; + max_block_dim_z:uint32; + max_grid_dim_x:uint32; + max_grid_dim_y:uint32; + max_grid_dim_z:uint32; + compute_capability_major:uint32; + compute_capability_minor:uint32; + id:uint32; + ecc_enabled:uint32; + name:string; +} + +table DroppedRecords { + num_dropped:uint64; +} + +table KernelActivity { + requested:uint8; + executed:uint8; + shared_memory_config:uint8; + registers_per_thread:uint16; + partitioned_global_cache_requested:PartitionedGlobalCacheConfig; + partitioned_global_cache_executed:PartitionedGlobalCacheConfig; + start:uint64; + end:uint64; + completed:uint64 = 0; + device_id:uint32; + context_id:uint32; + stream_id:uint32; + grid_x:int32; + grid_y:int32; + grid_z:int32; + block_x:int32; + block_y:int32; + block_z:int32; + static_shared_memory:int32; + dynamic_shared_memory:int32; + local_memory_per_thread:uint32; + local_memory_total:uint32; + correlation_id:uint32; + grid_id:int64; + name:string; + queued:uint64 = 0; + submitted:uint64 = 0; + launch_type:LaunchType = Regular; + is_shared_memory_carveout_requested:uint8; + shared_memory_carveout_requested:uint8; + shared_memory_executed:uint32; + graph_node_id:uint64 = 0; + shmem_limit_config:ShmemLimitConfig = Default; + graph_id:uint32 = 0; + //access_policy_window:???; + channel_id:uint32; + channel_type:ChannelType; + cluster_x:uint32; + cluster_y:uint32; + cluster_z:uint32; + cluster_scheduling_policy:uint32; + local_memory_total_v2:uint64; +} + +table MarkerActivity { + flags:MarkerFlags = Start; + timestamp:uint64; + id:int32; + object_id:ActivityObjectId; + name:string; + domain:string; +} + +table MarkerData { + flags:MarkerFlags = Start; + id:int32; + //payload_kind:MetricValueKind; + //payload:MetricValue; + color:uint32; + category:uint32; +} + +table MemcpyActivity { + copy_kind:MemcpyKind; + src_kind:MemoryKind; + dst_kind:MemoryKind; + flags:MemcpyFlags; + bytes:uint64; + start:uint64; + end:uint64; + device_id:uint32; + context_id:uint32; + stream_id:uint32; + correlation_id:uint32; + runtime_correlation_id:uint32; + graph_node_id:uint64 = 0; + graph_id:uint32 = 0; + channel_id:uint32; + channel_type:ChannelType; +} + +table MemsetActivity { + value:uint32; + bytes:uint64; + start:uint64; + end:uint64; + device_id:uint32; + context_id:uint32; + stream_id:uint32; + correlation_id:uint32; + flags:MemsetFlags; + memory_kind:MemoryKind; + graph_node_id:uint64 = 0; + graph_id:uint32 = 0; + channel_id:uint32; + channel_type:ChannelType; +} + +table OverheadActivity { + overhead_kind:OverheadKind; + object_id:ActivityObjectId; + start:uint64; + end:uint64; +} + +root_type ActivityRecords; diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index 4ff9c91a3f..bee6f1df74 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -27,9 +27,7 @@ public class JSONUtils { public static final int MAX_PATH_DEPTH = 16; public enum PathInstructionType { - SUBSCRIPT, WILDCARD, - KEY, INDEX, NAMED } diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java index 6de84ea519..6b71416dcb 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -85,9 +85,21 @@ public static ColumnVector parseURIQueryWithColumn(ColumnView uriColumn, ColumnV
     return new ColumnVector(parseQueryWithColumn(uriColumn.getNativeView(), queryColumn.getNativeView()));
   }
 
+  /**
+   * Parse the path for each URI in the incoming column.
+   *
+   * @param uriColumn The input strings column in which each row contains a URI.
+   * @return A string column with the URI path extracted.
+   */
+  public static ColumnVector parseURIPath(ColumnView uriColumn) {
+    assert uriColumn.getType().equals(DType.STRING) : "Input type must be String";
+    return new ColumnVector(parsePath(uriColumn.getNativeView()));
+  }
+
   private static native long parseProtocol(long inputColumnHandle);
   private static native long parseHost(long inputColumnHandle);
   private static native long parseQuery(long inputColumnHandle);
   private static native long parseQueryWithLiteral(long inputColumnHandle, String query);
   private static native long parseQueryWithColumn(long inputColumnHandle, long queryColumnHandle);
+  private static native long parsePath(long inputColumnHandle);
 }
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/Profiler.java b/src/main/java/com/nvidia/spark/rapids/jni/Profiler.java
new file mode 100644
index 0000000000..86d5b0edde
--- /dev/null
+++ b/src/main/java/com/nvidia/spark/rapids/jni/Profiler.java
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.NativeDepsLoader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/** Profiler that collects CUDA and NVTX events for the current process. */
+public class Profiler {
+  private static final long DEFAULT_WRITE_BUFFER_SIZE = 1024 * 1024;
+  private static final int DEFAULT_FLUSH_PERIOD_MILLIS = 0;
+  private static DataWriter writer = null;
+
+  /**
+   * Initialize the profiler in a standby state. The start method must be called after this
+   * to start collecting profiling data.
+   * @param w data writer for writing profiling data
+   */
+  public static void init(DataWriter w) {
+    init(w, DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_FLUSH_PERIOD_MILLIS);
+  }
+
+  /**
+   * Initialize the profiler in a standby state. The start method must be called after this
+   * to start collecting profiling data.
+   * @param w data writer for writing profiling data
+   * @param writeBufferSize size of host memory buffer to use for collecting profiling data.
+   *                        Recommended to be between 1 and 8 MB in size to balance callback
+   *                        overhead with latency.
+   * @param flushPeriodMillis time period in milliseconds to explicitly flush collected
+   *                          profiling data to the writer. A value <= 0 will disable explicit
+   *                          flushing.
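+   *
+   * <p>Illustrative usage sketch only: the file-backed writer below is just one possible
+   * {@code DataWriter} implementation, and the file name, buffer size, and flush period are
+   * arbitrary example values (error handling for the file open is omitted).
+   * <pre>{@code
+   *   java.io.FileOutputStream out = new java.io.FileOutputStream("profile.bin");
+   *   Profiler.DataWriter writer = new Profiler.DataWriter() {
+   *     public void write(java.nio.ByteBuffer data) {
+   *       try {
+   *         out.getChannel().write(data);  // each block holds size-prefixed flatbuffers
+   *       } catch (java.io.IOException e) {
+   *         throw new RuntimeException(e);
+   *       }
+   *     }
+   *     public void close() throws Exception {
+   *       out.close();
+   *     }
+   *   };
+   *   Profiler.init(writer, 4 * 1024 * 1024, 1000);  // 4 MB buffer, flush every second
+   *   Profiler.start();
+   *   // ... run the GPU work to be profiled ...
+   *   Profiler.stop();
+   *   Profiler.shutdown();  // flushes remaining data and closes the writer
+   * }</pre>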
+   */
+  public static void init(DataWriter w, long writeBufferSize, int flushPeriodMillis) {
+    if (writer == null) {
+      File libPath;
+      try {
+        libPath = NativeDepsLoader.loadNativeDep("profilerjni", true);
+      } catch (IOException e) {
+        throw new RuntimeException("Error loading profiler library", e);
+      }
+      nativeInit(libPath.getAbsolutePath(), w, writeBufferSize, flushPeriodMillis);
+      writer = w;
+    } else {
+      throw new IllegalStateException("Already initialized");
+    }
+  }
+
+  /**
+   * Shutdown the profiling session. Flushes collected profiling data to the writer and
+   * closes the writer.
+   */
+  public static void shutdown() {
+    if (writer != null) {
+      nativeShutdown();
+      try {
+        writer.close();
+      } catch (Exception e) {
+        throw new RuntimeException("Error closing writer", e);
+      } finally {
+        writer = null;
+      }
+    }
+  }
+
+  /**
+   * Start collecting profiling data. Safe to call if profiling data is already being collected.
+   */
+  public static void start() {
+    if (writer != null) {
+      nativeStart();
+    } else {
+      throw new IllegalStateException("Profiler not initialized");
+    }
+  }
+
+  /**
+   * Stop collecting profiling data. Safe to call if the profiler is initialized but not
+   * actively collecting data.
+   */
+  public static void stop() {
+    if (writer != null) {
+      nativeStop();
+    } else {
+      throw new IllegalStateException("Profiler not initialized");
+    }
+  }
+
+  private static native void nativeInit(String libPath, DataWriter writer,
+                                        long writeBufferSize, int flushPeriodMillis);
+
+  private static native void nativeStart();
+
+  private static native void nativeStop();
+
+  private static native void nativeShutdown();
+
+  /** Interface for profiler data writers */
+  public interface DataWriter extends AutoCloseable {
+    /**
+     * Called by the profiler to write a block of profiling data. Profiling data is written
+     * in a size-prefixed flatbuffer format. See profiler.fbs for the schema.
+     * @param data profiling data to be written
+     */
+    void write(ByteBuffer data);
+  }
+}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java
new file mode 100644
index 0000000000..9277c3e0f9
--- /dev/null
+++ b/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.*;
+
+public class RegexRewriteUtils {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+/**
+ * @brief Check if the input strings contain the pattern `literal[start-end]{len,}`, i.e. a
+ * literal string followed by at least len characters whose code points fall in the range
+ * start to end.
+ *
+ * @param input Column of strings to check for the pattern.
+ * @param literal UTF-8 encoded string to search for in each row of the input column.
+ * @param len Minimum number of characters to check after the literal.
+ * @param start Minimum UTF-8 codepoint value to check for in the range. + * @param end Maximum UTF-8 codepoint value to check for in the range. + * @return ColumnVector of booleans where true indicates the string contains the pattern. + */ + public static ColumnVector literalRangePattern(ColumnVector input, Scalar literal, int len, int start, int end) { + assert(input.getType().equals(DType.STRING)) : "column must be a String"; + return new ColumnVector(literalRangePattern(input.getNativeView(), CudfAccessor.getScalarHandle(literal), len, start, end)); + } + + private static native long literalRangePattern(long input, long literal, int len, int start, int end); +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java index c39766454a..a9b1cfaa4d 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/CastStringsTest.java @@ -33,14 +33,14 @@ public class CastStringsTest { @Test void castToIntegerTest() { Table.TestBuilder tb = new Table.TestBuilder(); - tb.column(3l, 9l, 4l, 2l, 20l, null, null); - tb.column(5, 1, 0, 2, 7, null, null); - tb.column(new Byte[]{2, 3, 4, 5, 9, null, null}); + tb.column(3l, 9l, 4l, 2l, 20l, null, null, 1l); + tb.column(5, 1, 0, 2, 7, null, null, 1); + tb.column(new Byte[]{2, 3, 4, 5, 9, null, null, 1}); try (Table expected = tb.build()) { Table.TestBuilder tb2 = new Table.TestBuilder(); - tb2.column(" 3", "9", "4", "2", "20.5", null, "7.6asd"); - tb2.column("5", "1 ", "0", "2", "7.1", null, "asdf"); - tb2.column("2", "3", " 4 ", "5", " 9.2 ", null, "7.8.3"); + tb2.column(" 3", "9", "4", "2", "20.5", null, "7.6asd", "\u0000 \u001f1\u0014"); + tb2.column("5", "1 ", "0", "2", "7.1", null, "asdf", "\u0000 \u001f1\u0014"); + tb2.column("2", "3", " 4 ", "5", " 9.2 ", null, "7.8.3", "\u0000 \u001f1\u0014"); List result = new ArrayList<>(); try (Table origTable = tb2.build()) { @@ -129,20 +129,49 @@ void castToIntegerAnsiTest() { } } + @Test + void castToFloatsTrimTest() { + Table.TestBuilder tb = new Table.TestBuilder(); + tb.column(1.1f, 1.2f, 1.3f, 1.4f, 1.5f, null, null); + tb.column(1.1d, 1.2d, 1.3d, 1.4d, 1.5d, null, null); + try (Table expected = tb.build()) { + Table.TestBuilder tb2 = new Table.TestBuilder(); + tb2.column("1.1\u0000", "1.2\u0014", "1.3\u001f", + "\u0000\u00001.4\u0000", "1.5\u0000\u0020\u0000", "1.6\u009f", "1.7\u0021"); + tb2.column("1.1\u0000", "1.2\u0014", "1.3\u001f", + "\u0000\u00001.4\u0000", "1.5\u0000\u0020\u0000", "1.6\u009f", "1.7\u0021"); + + List result = new ArrayList<>(); + try (Table origTable = tb2.build()) { + for (int i = 0; i < origTable.getNumberOfColumns(); i++) { + ColumnVector string_col = origTable.getColumn(i); + result.add(CastStrings.toFloat(string_col, false, + expected.getColumn(i).getType())); + } + try (Table result_tbl = new Table( + result.toArray(new ColumnVector[result.size()]))) { + AssertUtils.assertTablesAreEqual(expected, result_tbl); + } + } finally { + result.forEach(ColumnVector::close); + } + } + } + @Test void castToDecimalTest() { Table.TestBuilder tb = new Table.TestBuilder(); - tb.decimal32Column(0,3, 9, 4, 2, 21, null, null); - tb.decimal64Column(0, 5l, 1l, 0l, 2l, 7l, null, null); - tb.decimal32Column(-1, 20, 30, 40, 51, 92, null, null); + tb.decimal32Column(0,3, 9, 4, 2, 21, null, null, 1); + tb.decimal64Column(0, 5l, 1l, 0l, 2l, 7l, null, null, 1l); + tb.decimal32Column(-1, 20, 30, 40, 51, 92, null, null, 10); try (Table expected = 
tb.build()) { int[] desiredPrecision = new int[]{2, 10, 3}; int[] desiredScale = new int[]{0, 0, -1}; Table.TestBuilder tb2 = new Table.TestBuilder(); - tb2.column(" 3", "9", "4", "2", "20.5", null, "7.6asd"); - tb2.column("5", "1 ", "0", "2", "7.1", null, "asdf"); - tb2.column("2", "3", " 4 ", "5.07", "9.23", null, "7.8.3"); + tb2.column(" 3", "9", "4", "2", "20.5", null, "7.6asd", "\u0000 \u001f1\u0014"); + tb2.column("5", "1 ", "0", "2", "7.1", null, "asdf", "\u0000 \u001f1\u0014"); + tb2.column("2", "3", " 4 ", "5.07", "9.23", null, "7.8.3", "\u0000 \u001f1\u0014"); List result = new ArrayList<>(); try (Table origTable = tb2.build()) { diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index ea23c4c9ba..bba6650d0f 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -28,7 +28,7 @@ public class GetJsonObjectTest { @Test void getJsonObjectTest() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k") }; + namedPath("k") }; try (ColumnVector jsonCv = ColumnVector.fromStrings( "{\"k\": \"v\"}"); ColumnVector expected = ColumnVector.fromStrings( @@ -44,7 +44,7 @@ void getJsonObjectTest() { @Test void getJsonObjectTest2() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), + namedPath("k1_111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111") }; @@ -69,7 +69,7 @@ void getJsonObjectTest2() { @Test void getJsonObjectTest3() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k1"), keyPath(), namedPath("k2") + namedPath("k1"), namedPath("k2") }; String JSON = "{\"k1\":{\"k2\":\"v2\"}}"; String expectedStr = "v2"; @@ -89,14 +89,14 @@ void getJsonObjectTest3() { @Test void getJsonObjectTest4() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k1"), - keyPath(), namedPath("k2"), - keyPath(), namedPath("k3"), - keyPath(), namedPath("k4"), - keyPath(), namedPath("k5"), - keyPath(), namedPath("k6"), - keyPath(), namedPath("k7"), - keyPath(), namedPath("k8") + namedPath("k1"), + namedPath("k2"), + namedPath("k3"), + namedPath("k4"), + namedPath("k5"), + namedPath("k6"), + namedPath("k7"), + namedPath("k8") }; String JSON = "{\"k1\":{\"k2\":{\"k3\":{\"k4\":{\"k5\":{\"k6\":{\"k7\":{\"k8\":\"v8\"}}}}}}}}"; @@ -117,7 +117,7 @@ void getJsonObjectTest4() { @Test void getJsonObjectTest_Baidu_unescape_backslash() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("URdeosurl") + namedPath("URdeosurl") }; String JSON = 
"{\"brand\":\"ssssss\",\"duratRon\":15,\"eqTosuresurl\":\"\",\"RsZxarthrl\":false,\"xonRtorsurl\":\"\",\"xonRtorsurlstOTe\":0,\"TRctures\":[{\"RxaGe\":\"VttTs:\\/\\/feed-RxaGe.baRdu.cox\\/0\\/TRc\\/-196588744s840172444s-773690137.zTG\"}],\"Toster\":\"VttTs:\\/\\/feed-RxaGe.baRdu.cox\\/0\\/TRc\\/-196588744s840172444s-773690137.zTG\",\"reserUed\":{\"bRtLate\":391.79,\"xooUZRke\":26876,\"nahrlIeneratRonNOTe\":0,\"useJublRc\":6,\"URdeoRd\":821284086},\"tRtle\":\"ssssssssssmMsssssssssssssssssss\",\"url\":\"s{storehrl}\",\"usersTortraRt\":\"VttTs:\\/\\/feed-RxaGe.baRdu.cox\\/0\\/TRc\\/-6971178959s-664926866s-6096674871.zTG\",\"URdeosurl\":\"http:\\/\\/nadURdeo2.baRdu.cox\\/5fa3893aed7fc0f8231dab7be23efc75s820s6240.xT3\",\"URdeoRd\":821284086}"; @@ -138,7 +138,7 @@ void getJsonObjectTest_Baidu_unescape_backslash() { @Test void getJsonObjectTest_Baidu_get_unexist_field_name() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("Vgdezsurl") + namedPath("Vgdezsurl") }; String JSON = "{\"brand\":\"ssssss\",\"duratgzn\":17,\"eSyzsuresurl\":\"\",\"gswUartWrl\":false,\"Uzngtzrsurl\":\"\",\"UzngtzrsurlstJye\":0,\"ygctures\":[{\"gUaqe\":\"Ittys:\\/\\/feed-gUaqe.bagdu.czU\\/0\\/ygc\\/63025364s-376461312s7528698939.Qyq\"}],\"yzster\":\"Ittys:\\/\\/feed-gUaqe.bagdu.czU\\,\"url\":\"s{stHreqrl}\",\"usersPHrtraIt\":\"LttPs:\\/\\/feed-IUaxe.baIdu.cHU\\/0\\/PIc\\/-1043913002s489796992s-1505641721.Pnx\",\"kIdeHsurl\":\"LttP:\\/\\/nadkIdeH9.baIdu.cHU\\/4d7d308bd7c04e63069fd343adfa792as1790s1080.UP3\",\"kIdeHId\":852890923}"; @@ -169,12 +169,14 @@ void getJsonObjectTest_Escape() { String JSON4 = "['a','b','\"C\"']"; // \\u4e2d\\u56FD is 中国 String JSON5 = "'\\u4e2d\\u56FD\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\b'"; + String JSON6 = "['\\u4e2d\\u56FD\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\b']"; String expectedStr1 = "{\"a\":\"A\"}"; String expectedStr2 = "{\"a\":\"A\\\"\"}"; String expectedStr3 = "{\"a\":\"B'\"}"; String expectedStr4 = "[\"a\",\"b\",\"\\\"C\\\"\"]"; String expectedStr5 = "中国\"'\\/\b\f\n\r\t\b"; + String expectedStr6 = "中国\\\"'\\\\/\\b\\f\\n\\r\\t\\b"; try ( ColumnVector jsonCv = ColumnVector.fromStrings( @@ -251,7 +253,7 @@ void getJsonObjectTest_Test_leading_zeros() { @Test void getJsonObjectTest_Test_index() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), indexPath(1) + indexPath(1) }; String JSON1 = "[ [0, 1, 2] , [10, [11], [121, 122, 123], 13] , [20, 21, 22]]"; @@ -271,7 +273,7 @@ void getJsonObjectTest_Test_index() { @Test void getJsonObjectTest_Test_index_index() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), indexPath(1), subscriptPath(), indexPath(2) + indexPath(1), indexPath(2) }; String JSON1 = "[ [0, 1, 2] , [10, [11], [121, 122, 123], 13] , [20, 21, 22]]"; @@ -309,13 +311,13 @@ void getJsonObjectTest_Test_case_path1() { * case path 5: case (START_ARRAY, Subscript :: Wildcard :: Subscript :: * Wildcard :: xs), set flatten style * case path 2: case (START_ARRAY, Nil) if style == FlattenStyle - * + * * First use path5 [*][*] to enable flatten style. 
*/ @Test void getJsonObjectTest_Test_case_path2() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath(), subscriptPath(), wildcardPath() + wildcardPath(), wildcardPath() }; String JSON1 = "[ [11, 12], [21, [221, [2221, [22221, 22222]]]], [31, 32] ]"; @@ -355,7 +357,7 @@ void getJsonObjectTest_Test_case_path3() { @Test void getJsonObjectTest_Test_case_path4() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k") + namedPath("k") }; String JSON1 = "{ 'k' : 'v' }"; @@ -378,8 +380,8 @@ void getJsonObjectTest_Test_case_path4() { @Test void getJsonObjectTest_Test_case_path5() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath(), subscriptPath(), wildcardPath(), // $[*][*] - keyPath(), namedPath("k") + wildcardPath(), wildcardPath(), // $[*][*] + namedPath("k") }; // flatten the arrays, then query named path "k" @@ -402,7 +404,7 @@ void getJsonObjectTest_Test_case_path5() { @Test void getJsonObjectTest_Test_case_path6() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath() + wildcardPath() }; String JSON1 = "[1, [21, 22], 3]"; String expectedStr1 = "[1,[21,22],3]"; @@ -430,13 +432,13 @@ void getJsonObjectTest_Test_case_path6() { */ @Test void getJsonObjectTest_Test_case_path7() { - // subscriptPath(), wildcardPath() subscriptPath(), wildcardPath() will go to + // wildcardPath() wildcardPath() will go to // path5 - // so insert keyPath(), namedPath("k") + // so insert namedPath("k") JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), wildcardPath(), // path 6 - keyPath(), namedPath("k"), // path 4, path 10 - subscriptPath(), wildcardPath() // path 7 + wildcardPath(), // path 6 + namedPath("k"), // path 4, path 10 + wildcardPath() // path 7 }; String JSON1 = "[ {'k': [0, 1, 2]}, {'k': [10, 11, 12]}, {'k': [20, 21, 22]} ]"; @@ -459,7 +461,7 @@ void getJsonObjectTest_Test_case_path7() { @Test void getJsonObjectTest_Test_case_path8() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), indexPath(1), subscriptPath(), wildcardPath() + indexPath(1), wildcardPath() }; String JSON1 = "[ [0], [10, 11, 12], [2] ]"; String expectedStr1 = "[10,11,12]"; @@ -479,7 +481,7 @@ void getJsonObjectTest_Test_case_path8() { @Test void getJsonObjectTest_Test_case_path9() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - subscriptPath(), indexPath(1), subscriptPath(), indexPath(1), subscriptPath(), wildcardPath() + indexPath(1), indexPath(1), wildcardPath() }; String JSON1 = "[[0, 1, 2], [10, [111, 112, 113], 12], [20, 21, 22]]"; String expectedStr1 = "[111,112,113]"; @@ -501,7 +503,7 @@ void getJsonObjectTest_Test_case_path9() { @Test void getJsonObjectTest_Test_case_path10() { JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] { - keyPath(), namedPath("k"), subscriptPath(), indexPath(1) + namedPath("k"), indexPath(1) }; String JSON1 = "{'k' : [0,1,2]}"; String expectedStr1 = "1"; @@ -517,26 +519,24 @@ void getJsonObjectTest_Test_case_path10() { /** * Test case paths: - * case path 11: case (FIELD_NAME, Wildcard :: xs) + * case path 11: case (FIELD_NAME, Key :: Wildcard :: xs) * Refer to Spark code: * https://github.com/apache/spark/blob/v3.5.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala#L218 - * path sequence 
key, wildcard can test path 11, but parser can not produce this
-   * sequence.
-   * Note: Here use manually created key, wildcard sequence to test.
+   * The parser can not produce the (key, wildcard) path sequence;
+   * e.g. for the path string $.* Spark produces the (wildcard) path instead of (key, wildcard).
+   * So this test exercises $.* through a (wildcard) path.
    */
   @Test
   void getJsonObjectTest_Test_case_path11() {
     JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] {
-        keyPath(), wildcardPath()
+        wildcardPath()
     };
     String JSON1 = "{'k' : [0,1,2]}";
-    String expectedStr1 = "[0,1,2]";
     String JSON2 = "{'k' : null}";
-    String expectedStr2 = "null";
     try (
         ColumnVector jsonCv = ColumnVector.fromStrings(JSON1, JSON2);
-        ColumnVector expected = ColumnVector.fromStrings(expectedStr1, expectedStr2);
+        ColumnVector expected = ColumnVector.fromStrings(null, null);
         ColumnVector actual = JSONUtils.getJsonObject(jsonCv, query)) {
       assertColumnsAreEqual(expected, actual);
     }
@@ -568,7 +568,7 @@ void getJsonObjectTest_Test_case_path12() {
   @Test
   void getJsonObjectTest_Test_insert_comma_insert_outer_array() {
     JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] {
-        subscriptPath(), wildcardPath(), subscriptPath(), wildcardPath(), subscriptPath(), wildcardPath()
+        wildcardPath(), wildcardPath(), wildcardPath()
     };
     String JSON1 = "[ [11, 12], [21, 22]]";
     String expectedStr1 = "[[11,12],[21,22]]";
@@ -582,13 +582,24 @@
     }
   }
 
-  private JSONUtils.PathInstructionJni keyPath() {
-    return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.KEY, "", -1);
+  /**
+   * Query: $.a (the second row contains invalid JSON and returns null)
+   */
+  @Test
+  void getJsonObjectTest_15() {
+    JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] {
+        namedPath("a")
+    };
+    String JSON1 = "{'a':'v1'}";
+    String JSON2 = "{'a':\"b\"c\"}";
+    try (
+        ColumnVector jsonCv = ColumnVector.fromStrings(JSON1, JSON2);
+        ColumnVector expected = ColumnVector.fromStrings("v1", null);
+        ColumnVector actual = JSONUtils.getJsonObject(jsonCv, query)) {
+      assertColumnsAreEqual(expected, actual);
+    }
   }
 
-  private JSONUtils.PathInstructionJni subscriptPath() {
-    return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.SUBSCRIPT, "", -1);
-  }
 
   private JSONUtils.PathInstructionJni wildcardPath() {
     return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.WILDCARD, "", -1);
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HistogramTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HistogramTest.java
new file mode 100644
index 0000000000..9a1812f660
--- /dev/null
+++ b/src/test/java/com/nvidia/spark/rapids/jni/HistogramTest.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.AssertUtils;
+import ai.rapids.cudf.ColumnVector;
+
+import org.junit.jupiter.api.Test;
+
+public class HistogramTest {
+  @Test
+  void testZeroFrequency() {
+    try (ColumnVector values = ColumnVector.fromInts(5, 10, 30);
+         ColumnVector freqs = ColumnVector.fromLongs(1, 0, 1);
+         ColumnVector histogram = Histogram.createHistogramIfValid(values, freqs, true);
+         ColumnVector percentiles = Histogram.percentileFromHistogram(histogram, new double[]{1},
+             false);
+         ColumnVector expected = ColumnVector.fromBoxedDoubles(5.0, null, 30.0)) {
+      AssertUtils.assertColumnsAreEqual(percentiles, expected);
+    }
+  }
+
+  @Test
+  void testAllNulls() {
+    try (ColumnVector values = ColumnVector.fromBoxedInts(null, null, null);
+         ColumnVector freqs = ColumnVector.fromLongs(1, 2, 3);
+         ColumnVector histogram = Histogram.createHistogramIfValid(values, freqs, true);
+         ColumnVector percentiles = Histogram.percentileFromHistogram(histogram, new double[]{0.5},
+             false);
+         ColumnVector expected = ColumnVector.fromBoxedDoubles(null, null, null)) {
+      AssertUtils.assertColumnsAreEqual(percentiles, expected);
+    }
+  }
+}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
index ffe7e9e946..1ddf588b02 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -159,6 +159,27 @@ void testQuery(String[] testData, String[] params) {
     }
   }
 
+  void testPath(String[] testData) {
+    String[] expectedPathStrings = new String[testData.length];
+    for (int i=0; i