diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 4ee5c3b271c..f346cdf8e90 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,26 +46,29 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
   docs-build:
-    if: github.ref_type == 'branch' && github.event_name == 'push'
+    if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
     with:
-      build_type: branch
-      node_type: "gpu-latest-1"
       arch: "amd64"
+      branch: ${{ inputs.branch }}
+      build_type: ${{ inputs.build_type || 'branch' }}
       container_image: "rapidsai/ci:latest"
+      date: ${{ inputs.date }}
+      node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
+      sha: ${{ inputs.sha }}
   wheel-build-pylibcugraph:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -78,15 +81,14 @@ jobs:
       # the CMake variables in get_cumlprims_mg.cmake since CMake will just use
       # the clone as is.
       extra-repo: rapidsai/cugraph-ops
-      extra-repo-sha: branch-23.04
+      extra-repo-sha: branch-23.06
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
 
-      skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=/project/cugraph-ops/"
-      uses-setup-env-vars: false
+      skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/python/pylibcugraph/cugraph-ops/"
   wheel-publish-pylibcugraph:
     needs: wheel-build-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -96,7 +98,7 @@ jobs:
   wheel-build-cugraph:
     needs: wheel-publish-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -109,15 +111,15 @@ jobs:
       # the CMake variables in get_cumlprims_mg.cmake since CMake will just use
       # the clone as is.
       extra-repo: rapidsai/cugraph-ops
-      extra-repo-sha: branch-23.04
+      extra-repo-sha: branch-23.06
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
 
-      skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=/project/cugraph-ops/"
-      uses-setup-env-vars: false
+      before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 /local-wheelhouse"
+      skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/python/cugraph/cugraph-ops/"
   wheel-publish-cugraph:
     needs: wheel-build-cugraph
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index b53e139b5ca..2a3f4f073fc 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -24,106 +24,101 @@ jobs:
       - wheel-build-cugraph
       - wheel-tests-cugraph
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06
   checks:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
     with:
       build_type: pull-request
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
     with:
       build_type: pull-request
-      node_type: "gpu-latest-1"
+      node_type: "gpu-v100-latest-1"
       arch: "amd64"
       container_image: "rapidsai/ci:latest"
       run_script: "ci/test_notebooks.sh"
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
     with:
       build_type: pull-request
-      node_type: "gpu-latest-1"
+      node_type: "gpu-v100-latest-1"
       arch: "amd64"
       container_image: "rapidsai/ci:latest"
       run_script: "ci/build_docs.sh"
   wheel-build-pylibcugraph:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06
     with:
       build_type: pull-request
       package-name: pylibcugraph
       package-dir: python/pylibcugraph
       extra-repo: rapidsai/cugraph-ops
-      extra-repo-sha: branch-23.04
+      extra-repo-sha: branch-23.06
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
-      skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=/project/cugraph-ops/"
-      uses-setup-env-vars: false
+      skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/python/pylibcugraph/cugraph-ops/"
   wheel-tests-pylibcugraph:
     needs: wheel-build-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06
     with:
       build_type: pull-request
       package-name: pylibcugraph
-      # On arm also need to install cupy from the specific webpage.
-      test-before-arm64: "pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64"
-      test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest -v ./python/pylibcugraph/pylibcugraph/tests"
+      test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest ./python/pylibcugraph/pylibcugraph/tests"
       test-smoketest: "python ci/wheel_smoke_test_pylibcugraph.py"
   wheel-build-cugraph:
     needs: wheel-tests-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06
     with:
       build_type: pull-request
       package-name: cugraph
       package-dir: python/cugraph
       extra-repo: rapidsai/cugraph-ops
-      extra-repo-sha: branch-23.04
+      extra-repo-sha: branch-23.06
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
-      before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_cu11 rapids-download-wheels-from-s3 ./local-wheelhouse"
-      skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=/project/cugraph-ops/"
-      uses-setup-env-vars: false
+      before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 /local-wheelhouse"
+      skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/python/cugraph/cugraph-ops/"
   wheel-tests-cugraph:
     needs: wheel-build-cugraph
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06
     with:
       build_type: pull-request
       package-name: cugraph
       # Always want to test against latest dask/distributed.
-      test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && RAPIDS_PY_WHEEL_NAME=pylibcugraph_cu11 rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
       # Skip dataset downloads on arm to save CI time -- arm only runs smoke tests.
-      # On arm also need to install cupy from the specific site.
-      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_cu11 rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -v -m sg ./python/cugraph/cugraph/tests"
+      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -m sg ./python/cugraph/cugraph/tests"
       test-smoketest: "python ci/wheel_smoke_test_cugraph.py"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 8856ec3df5d..87dd6104b4e 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -32,19 +32,17 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibcugraph:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       package-name: pylibcugraph
-      # On arm also need to install cupy from the specific webpage.
-      test-before-arm64: "pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64"
-      test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest -v ./python/pylibcugraph/pylibcugraph/tests"
+      test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest ./python/pylibcugraph/pylibcugraph/tests"
   wheel-tests-cugraph:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -52,7 +50,6 @@ jobs:
       sha: ${{ inputs.sha }}
       package-name: cugraph
       # Always want to test against latest dask/distributed.
-      test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      # On arm also need to install cupy from the specific webpage.
-      test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -v -m sg ./python/cugraph/cugraph/tests"
+      test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -m sg ./python/cugraph/cugraph/tests"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3c2f5fe2cfb..0f05aedf1a1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,6 +2,7 @@
 #
 # Before first use: `pre-commit install`
 # To run: `pre-commit run --all-files`
+exclude: '^thirdparty'
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.4.0
@@ -32,13 +33,13 @@ repos:
         additional_dependencies:
           - flake8==6.0.0
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v11.1.0
+    rev: v16.0.1
     hooks:
       - id: clang-format
         exclude: |
           (?x)^(
             cpp/libcugraph_etl|
-            cpp/tests/c_api/.*
+            cpp/tests/c_api
           )
         types_or: [c, c++, cuda]
         args: ["-fallback-style=none", "-style=file", "-i"]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d40ade9810..4a018c55031 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,111 @@
+# cuGraph 23.06.00 (7 Jun 2023)
+
+## 🚨 Breaking Changes
+
+- [BUG] Fix Incorrect File Selection in cuGraph-PyG Loader ([#3599](https://github.com/rapidsai/cugraph/pull/3599)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Remove legacy leiden ([#3581](https://github.com/rapidsai/cugraph/pull/3581)) [@ChuckHastings](https://github.com/ChuckHastings)
+- [IMP] Match Default PyG Hop ID Behavior in cuGraph-PyG ([#3565](https://github.com/rapidsai/cugraph/pull/3565)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- [IMP] Sample with Offsets in the Bulk Sampler ([#3524](https://github.com/rapidsai/cugraph/pull/3524)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Dropping Python 3.8 ([#3505](https://github.com/rapidsai/cugraph/pull/3505)) [@divyegala](https://github.com/divyegala)
+- Remove legacy renumber and shuffle calls from cython.cu ([#3467](https://github.com/rapidsai/cugraph/pull/3467)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Remove legacy implementation of induce subgraph ([#3464](https://github.com/rapidsai/cugraph/pull/3464)) [@ChuckHastings](https://github.com/ChuckHastings)
+
+## 🐛 Bug Fixes
+
+- Fix MG Test Failing due to Removal of np.float ([#3621](https://github.com/rapidsai/cugraph/pull/3621)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- fix logic for shuffling results ([#3619](https://github.com/rapidsai/cugraph/pull/3619)) [@ChuckHastings](https://github.com/ChuckHastings)
+- [BUG] Fix Calls to cudf.DataFrame/Series.unique that relied on old behavior ([#3616](https://github.com/rapidsai/cugraph/pull/3616)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- correct dgl version in `cugraph-dgl` conda recipe ([#3612](https://github.com/rapidsai/cugraph/pull/3612)) [@tingyu66](https://github.com/tingyu66)
+- [BUG] Fix Issue in cuGraph-PyG Tests Blocking CI ([#3607](https://github.com/rapidsai/cugraph/pull/3607)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- [BUG] Critical: Fix cuGraph-PyG Edge Index Renumbering for Single-Edge Graphs ([#3605](https://github.com/rapidsai/cugraph/pull/3605)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- [BUG] Skip Empty Partitions in Bulk Sample Writing ([#3600](https://github.com/rapidsai/cugraph/pull/3600)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- [BUG] Fix Incorrect File Selection in cuGraph-PyG Loader ([#3599](https://github.com/rapidsai/cugraph/pull/3599)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Fix SSSP bug ([#3597](https://github.com/rapidsai/cugraph/pull/3597)) [@jnke2016](https://github.com/jnke2016)
+- update cudf column constructor calls ([#3592](https://github.com/rapidsai/cugraph/pull/3592)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Fix one more path to cugraphops in build workflow ([#3554](https://github.com/rapidsai/cugraph/pull/3554)) [@vyasr](https://github.com/vyasr)
+- Fix path to cugraphops in build workflow ([#3547](https://github.com/rapidsai/cugraph/pull/3547)) [@vyasr](https://github.com/vyasr)
+- Update dgl APIs for v1.1.0 ([#3546](https://github.com/rapidsai/cugraph/pull/3546)) [@tingyu66](https://github.com/tingyu66)
+- Pin to scikit-build&lt;17.2 ([#3538](https://github.com/rapidsai/cugraph/pull/3538)) [@vyasr](https://github.com/vyasr)
+- Correct results from sampling when grouping batches on specific GPUs ([#3517](https://github.com/rapidsai/cugraph/pull/3517)) [@ChuckHastings](https://github.com/ChuckHastings)
+- [FIX] Match the PyG API for Node Input to the Loader ([#3514](https://github.com/rapidsai/cugraph/pull/3514)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Correct MG Leiden and SCC tests ([#3509](https://github.com/rapidsai/cugraph/pull/3509)) [@ChuckHastings](https://github.com/ChuckHastings)
+- per_v_transform_reduce_incoming|outgoing_e bug fix (when we&#39;re using (key, value) pairs to store edge src|dst property values) ([#3508](https://github.com/rapidsai/cugraph/pull/3508)) [@seunghwak](https://github.com/seunghwak)
+- Updates to allow python benchmarks to run on additional datasets by default ([#3506](https://github.com/rapidsai/cugraph/pull/3506)) [@rlratzel](https://github.com/rlratzel)
+- [BUG] Fix Intermittent Error when Converting cuDF DataFrame to Tensor by Converting to cuPy Array First ([#3498](https://github.com/rapidsai/cugraph/pull/3498)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- [FIX] Update cugraph-PyG Dependencies to include cuGraph ([#3497](https://github.com/rapidsai/cugraph/pull/3497)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Fix graph_properties_t&#39;s members order ([#3484](https://github.com/rapidsai/cugraph/pull/3484)) [@naimnv](https://github.com/naimnv)
+- Fix issue with latest rapids-make ([#3481](https://github.com/rapidsai/cugraph/pull/3481)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Branch 23.06 Fix Forward Merge ([#3462](https://github.com/rapidsai/cugraph/pull/3462)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Update raft dependency to 23.06 ([#3410](https://github.com/rapidsai/cugraph/pull/3410)) [@ChuckHastings](https://github.com/ChuckHastings)
+
+## 📖 Documentation
+
+- updated cugraph Demo notebooks for 23.06 ([#3558](https://github.com/rapidsai/cugraph/pull/3558)) [@acostadon](https://github.com/acostadon)
+- cugraph-ops license ([#3553](https://github.com/rapidsai/cugraph/pull/3553)) [@BradReesWork](https://github.com/BradReesWork)
+- Notebook clean-up and run verification ([#3551](https://github.com/rapidsai/cugraph/pull/3551)) [@acostadon](https://github.com/acostadon)
+- Updates contributing steps to add copyright and license text inclusion instruction ([#3519](https://github.com/rapidsai/cugraph/pull/3519)) [@rlratzel](https://github.com/rlratzel)
+- Fixed notebook links in algorithm and cugraph notebook pages ([#3515](https://github.com/rapidsai/cugraph/pull/3515)) [@acostadon](https://github.com/acostadon)
+- adding cugraph-ops ([#3488](https://github.com/rapidsai/cugraph/pull/3488)) [@BradReesWork](https://github.com/BradReesWork)
+- Sphinx updates ([#3468](https://github.com/rapidsai/cugraph/pull/3468)) [@BradReesWork](https://github.com/BradReesWork)
+
+## 🚀 New Features
+
+- [REVIEW] Add MNMG with training ([#3603](https://github.com/rapidsai/cugraph/pull/3603)) [@VibhuJawa](https://github.com/VibhuJawa)
+- MG Leiden and MG MIS ([#3582](https://github.com/rapidsai/cugraph/pull/3582)) [@naimnv](https://github.com/naimnv)
+- graph primitive transform_e ([#3548](https://github.com/rapidsai/cugraph/pull/3548)) [@seunghwak](https://github.com/seunghwak)
+- Support CUDA 12.0 for pip wheels ([#3544](https://github.com/rapidsai/cugraph/pull/3544)) [@divyegala](https://github.com/divyegala)
+- Updates pytest benchmarks to use synthetic data and multi-GPUs ([#3540](https://github.com/rapidsai/cugraph/pull/3540)) [@rlratzel](https://github.com/rlratzel)
+- Enable edge masking ([#3522](https://github.com/rapidsai/cugraph/pull/3522)) [@seunghwak](https://github.com/seunghwak)
+- [REVIEW] Profile graph creation runtime and memory footprint ([#3518](https://github.com/rapidsai/cugraph/pull/3518)) [@VibhuJawa](https://github.com/VibhuJawa)
+- Bipartite R-mat graph generation. ([#3512](https://github.com/rapidsai/cugraph/pull/3512)) [@seunghwak](https://github.com/seunghwak)
+- Dropping Python 3.8 ([#3505](https://github.com/rapidsai/cugraph/pull/3505)) [@divyegala](https://github.com/divyegala)
+- Creates Notebook that runs Multi-GPU versions of Jaccard, Sorenson and overlap. ([#3504](https://github.com/rapidsai/cugraph/pull/3504)) [@acostadon](https://github.com/acostadon)
+- [cugraph-dgl] Add support for bipartite node features and optional edge features in GATConv ([#3503](https://github.com/rapidsai/cugraph/pull/3503)) [@tingyu66](https://github.com/tingyu66)
+- [cugraph-dgl] Add TransformerConv ([#3501](https://github.com/rapidsai/cugraph/pull/3501)) [@tingyu66](https://github.com/tingyu66)
+- [cugraph-pyg] Add TransformerConv and support for bipartite node features in GATConv ([#3489](https://github.com/rapidsai/cugraph/pull/3489)) [@tingyu66](https://github.com/tingyu66)
+- Branch 23.06 resolve merge conflict for forward merge ([#3409](https://github.com/rapidsai/cugraph/pull/3409)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Refactor Leiden ([#3327](https://github.com/rapidsai/cugraph/pull/3327)) [@jnke2016](https://github.com/jnke2016)
+
+## 🛠️ Improvements
+
+- Refresh requirements ([#3622](https://github.com/rapidsai/cugraph/pull/3622)) [@jakirkham](https://github.com/jakirkham)
+- Pr3266 continue (optional arg for weight attribute for Nx graphs in `sssp`) ([#3611](https://github.com/rapidsai/cugraph/pull/3611)) [@eriknw](https://github.com/eriknw)
+- Enables MG python tests using a single-GPU LocalCUDACluster in CI ([#3596](https://github.com/rapidsai/cugraph/pull/3596)) [@rlratzel](https://github.com/rlratzel)
+- UVM notebook update and add tracker for notebooks to readme ([#3595](https://github.com/rapidsai/cugraph/pull/3595)) [@acostadon](https://github.com/acostadon)
+- [REVIEW]  Skip adding edge types, edge weights ([#3583](https://github.com/rapidsai/cugraph/pull/3583)) [@VibhuJawa](https://github.com/VibhuJawa)
+- Remove legacy leiden ([#3581](https://github.com/rapidsai/cugraph/pull/3581)) [@ChuckHastings](https://github.com/ChuckHastings)
+- run docs nightly too ([#3568](https://github.com/rapidsai/cugraph/pull/3568)) [@AyodeAwe](https://github.com/AyodeAwe)
+- include hop as part of the sort criteria for sampling results ([#3567](https://github.com/rapidsai/cugraph/pull/3567)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Add MG python implementation of Leiden ([#3566](https://github.com/rapidsai/cugraph/pull/3566)) [@jnke2016](https://github.com/jnke2016)
+- [IMP] Match Default PyG Hop ID Behavior in cuGraph-PyG ([#3565](https://github.com/rapidsai/cugraph/pull/3565)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Switch back to using primary shared-action-workflows branch ([#3562](https://github.com/rapidsai/cugraph/pull/3562)) [@vyasr](https://github.com/vyasr)
+- removed deprecated calls and modified demo notebooks to run with 23.06 ([#3561](https://github.com/rapidsai/cugraph/pull/3561)) [@acostadon](https://github.com/acostadon)
+- add unit test for checking is_symmetric is valid, update documentatio… ([#3559](https://github.com/rapidsai/cugraph/pull/3559)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Update recipes to GTest version &gt;=1.13.0 ([#3549](https://github.com/rapidsai/cugraph/pull/3549)) [@bdice](https://github.com/bdice)
+- Improve memory footprint and performance of graph creation ([#3542](https://github.com/rapidsai/cugraph/pull/3542)) [@VibhuJawa](https://github.com/VibhuJawa)
+- Update cupy dependency ([#3539](https://github.com/rapidsai/cugraph/pull/3539)) [@vyasr](https://github.com/vyasr)
+- Perform expensive edge list check in create_graph_from_edgelist() ([#3533](https://github.com/rapidsai/cugraph/pull/3533)) [@seunghwak](https://github.com/seunghwak)
+- Enable sccache hits from local builds ([#3526](https://github.com/rapidsai/cugraph/pull/3526)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Build wheels using new single image workflow ([#3525](https://github.com/rapidsai/cugraph/pull/3525)) [@vyasr](https://github.com/vyasr)
+- [IMP] Sample with Offsets in the Bulk Sampler ([#3524](https://github.com/rapidsai/cugraph/pull/3524)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Revert shared-action-workflows pin ([#3523](https://github.com/rapidsai/cugraph/pull/3523)) [@divyegala](https://github.com/divyegala)
+- [FIX] fix cugraphops namespace ([#3520](https://github.com/rapidsai/cugraph/pull/3520)) [@stadlmax](https://github.com/stadlmax)
+- Add support in C API for handling unweighted graphs in algorithms that expect weights ([#3513](https://github.com/rapidsai/cugraph/pull/3513)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Changes to support gtest version 1.11 ([#3511](https://github.com/rapidsai/cugraph/pull/3511)) [@ChuckHastings](https://github.com/ChuckHastings)
+- update docs ([#3510](https://github.com/rapidsai/cugraph/pull/3510)) [@BradReesWork](https://github.com/BradReesWork)
+- Remove usage of rapids-get-rapids-version-from-git ([#3502](https://github.com/rapidsai/cugraph/pull/3502)) [@jjacobelli](https://github.com/jjacobelli)
+- Remove Dummy Edge Weights, Support Specifying Edge Ids/Edge Types/Weights Separately ([#3495](https://github.com/rapidsai/cugraph/pull/3495)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- [ENH] Add missing include of thrust/optional.h ([#3493](https://github.com/rapidsai/cugraph/pull/3493)) [@ahendriksen](https://github.com/ahendriksen)
+- Remove wheel pytest verbosity ([#3492](https://github.com/rapidsai/cugraph/pull/3492)) [@sevagh](https://github.com/sevagh)
+- Update clang-format to 16.0.1. ([#3485](https://github.com/rapidsai/cugraph/pull/3485)) [@bdice](https://github.com/bdice)
+- Use ARC V2 self-hosted runners for GPU jobs ([#3483](https://github.com/rapidsai/cugraph/pull/3483)) [@jjacobelli](https://github.com/jjacobelli)
+- packed bool specialization to store edge endpoint|edge properties ([#3482](https://github.com/rapidsai/cugraph/pull/3482)) [@seunghwak](https://github.com/seunghwak)
+- Remove legacy renumber and shuffle calls from cython.cu ([#3467](https://github.com/rapidsai/cugraph/pull/3467)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Remove legacy implementation of induce subgraph ([#3464](https://github.com/rapidsai/cugraph/pull/3464)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Remove uses-setup-env-vars ([#3463](https://github.com/rapidsai/cugraph/pull/3463)) [@vyasr](https://github.com/vyasr)
+- Optimize random walks ([#3460](https://github.com/rapidsai/cugraph/pull/3460)) [@jnke2016](https://github.com/jnke2016)
+- Update select_random_vertices to sample from a given distributed set or from (0, V] ([#3455](https://github.com/rapidsai/cugraph/pull/3455)) [@naimnv](https://github.com/naimnv)
+
 # cuGraph 23.04.00 (6 Apr 2023)
 
 ## 🚨 Breaking Changes
diff --git a/README.md b/README.md
index 8c5e057b9f4..b88cf194fa9 100644
--- a/README.md
+++ b/README.md
@@ -26,8 +26,8 @@
 
 <div align="center">
 
-[Getting cuGraph](./readme_pages/getting_cugraph.md) *
-[Graph Algorithms](./readme_pages/algorithms.md) *
+[Getting cuGraph](./docs/cugraph/source/installation/getting_cugraph.md) *
+[Graph Algorithms](./docs/cugraph/source/graph_support/algorithms.md) *
 [Graph Service](./readme_pages/cugraph_service.md) *
 [Property Graph](./readme_pages/property_graph.md) *
 [GNN Support](./readme_pages/gnn_support.md)
@@ -37,8 +37,9 @@
 -----
 
 ## Table of content
-- Getting packages
-  - [Getting cuGraph Packages](./readme_pages/getting_cugraph.md)
+- Installation
+  - [Getting cuGraph Packages](./docs/cugraph/source/installation/getting_cugraph.md)
+  - [Building from Source](./docs/cugraph/source/installation/source_build.md)
   - [Contributing to cuGraph](./readme_pages/CONTRIBUTING.md)
 - General
   - [Latest News](./readme_pages/news.md)
diff --git a/benchmarks/cugraph-dgl/pytest-based/bench_cugraph_dgl_uniform_neighbor_sample.py b/benchmarks/cugraph-dgl/pytest-based/bench_cugraph_dgl_uniform_neighbor_sample.py
index f05c4364840..eeee163b0af 100644
--- a/benchmarks/cugraph-dgl/pytest-based/bench_cugraph_dgl_uniform_neighbor_sample.py
+++ b/benchmarks/cugraph-dgl/pytest-based/bench_cugraph_dgl_uniform_neighbor_sample.py
@@ -39,7 +39,7 @@
 def create_graph(graph_data):
     """
     Create a graph instance based on the data to be loaded/generated.
-    """    
+    """
     print("Initalize Pool on client")
     rmm.reinitialize(pool_allocator=True)
     # Assume strings are names of datasets in the datasets package
@@ -77,7 +77,7 @@ def create_graph(graph_data):
     num_nodes_dict = {'_N':num_nodes}
 
     gs = CuGraphStorage(num_nodes_dict=num_nodes_dict, single_gpu=True)
-    gs.add_edge_data(edgelist_df,   
+    gs.add_edge_data(edgelist_df,
                     # reverse to make same graph as cugraph
                     node_col_names=['dst', 'src'],
                     canonical_etype=['_N', 'connects', '_N'])
@@ -90,11 +90,9 @@ def create_mg_graph(graph_data):
     """
     Create a graph instance based on the data to be loaded/generated.
     """
-    ## Reserving GPU 0 for client(trainer/service project)
-    n_devices = os.getenv('DASK_NUM_WORKERS', 4)
-    n_devices = int(n_devices)
+    # range starts at 1 to let let 0 be used by benchmark/client process
+    visible_devices = os.getenv("DASK_WORKER_DEVICES", "1,2,3,4")
 
-    visible_devices = ','.join([str(i) for i in range(1, n_devices+1)])
     cluster = LocalCUDACluster(protocol='ucx', rmm_pool_size='25GB', CUDA_VISIBLE_DEVICES=visible_devices)
     client = Client(cluster)
     Comms.initialize(p2p=True)
@@ -137,7 +135,7 @@ def create_mg_graph(graph_data):
     num_nodes_dict = {'_N':num_nodes}
 
     gs = CuGraphStorage(num_nodes_dict=num_nodes_dict,  single_gpu=False)
-    gs.add_edge_data(edgelist_df,   
+    gs.add_edge_data(edgelist_df,
                     node_col_names=['dst', 'src'],
                     canonical_etype=['_N', 'C', '_N'])
     return (gs, client, cluster)
@@ -166,7 +164,7 @@ def get_uniform_neighbor_sample_args(
         num_start_verts = int(num_verts * 0.25)
     else:
         num_start_verts = batch_size
-    
+
     srcs = G.graphstore.gdata.get_edge_data()['_SRC_']
     start_list = srcs.head(num_start_verts)
     assert len(start_list) == num_start_verts
@@ -229,7 +227,7 @@ def bench_cugraph_dgl_uniform_neighbor_sample(
     fanout_val.reverse()
     sampler = dgl.dataloading.NeighborSampler(uns_args["fanout"])
     sampler_f = sampler.sample_blocks
-    
+
     # Warmup
     _ = sampler_f(g=G, seed_nodes=uns_args["seed_nodes"])
     # print(f"\n{uns_args}")
diff --git a/benchmarks/cugraph/pytest-based/bench_algos.py b/benchmarks/cugraph/pytest-based/bench_algos.py
index c57731dee8d..d7fcb7812e4 100644
--- a/benchmarks/cugraph/pytest-based/bench_algos.py
+++ b/benchmarks/cugraph/pytest-based/bench_algos.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 
 import pytest
-
+import numpy as np
 import pytest_benchmark
 # FIXME: Remove this when rapids_pytest_benchmark.gpubenchmark is available
 # everywhere
@@ -29,12 +29,16 @@
     def setFixtureParamNames(*args, **kwargs):
         pass
 
+import rmm
+import dask_cudf
+from pylibcugraph.testing import gen_fixture_params_product
+
 import cugraph
+import cugraph.dask as dask_cugraph
 from cugraph.structure.number_map import NumberMap
-from cugraph.testing import utils
-from pylibcugraph.testing import gen_fixture_params_product
+from cugraph.generators import rmat
+from cugraph.testing import utils, mg_utils
 from cugraph.utilities.utils import is_device_version_less_than
-import rmm
 
 from cugraph_benchmarking.params import (
     directed_datasets,
@@ -43,46 +47,122 @@ def setFixtureParamNames(*args, **kwargs):
     pool_allocator,
 )
 
-fixture_params = gen_fixture_params_product(
-    (directed_datasets + undirected_datasets, "ds"),
+# duck-type compatible Dataset for RMAT data
+class RmatDataset:
+    def __init__(self, scale=4, edgefactor=2, mg=False):
+        self._scale = scale
+        self._edgefactor = edgefactor
+        self._edgelist = None
+
+        self.mg = mg
+
+    def __str__(self):
+        mg_str = "mg" if self.mg else "sg"
+        return f"rmat_{mg_str}_{self._scale}_{self._edgefactor}"
+
+    def get_edgelist(self, fetch=False):
+        seed = 42
+        if self._edgelist is None:
+            self._edgelist = rmat(
+                self._scale,
+                (2**self._scale)*self._edgefactor,
+                0.57,  # from Graph500
+                0.19,  # from Graph500
+                0.19,  # from Graph500
+                seed or 42,
+                clip_and_flip=False,
+                scramble_vertex_ids=True,
+                create_using=None,  # return edgelist instead of Graph instance
+                mg=self.mg
+            )
+            rng = np.random.default_rng(seed)
+            if self.mg:
+                self._edgelist["weight"] = self._edgelist.map_partitions(
+                    lambda df: rng.random(size=len(df)))
+            else:
+                self._edgelist["weight"] = rng.random(size=len(self._edgelist))
+
+        return self._edgelist
+
+    def get_graph(self,
+                  fetch=False,
+                  create_using=cugraph.Graph,
+                  ignore_weights=False,
+                  store_transposed=False):
+        if isinstance(create_using, cugraph.Graph):
+            # what about BFS if trnaposed is True
+            attrs = {"directed": create_using.is_directed()}
+            G = type(create_using)(**attrs)
+        elif type(create_using) is type:
+            G = create_using()
+
+        edge_attr = None if ignore_weights else "weight"
+        df = self.get_edgelist()
+        if isinstance(df, dask_cudf.DataFrame):
+            G.from_dask_cudf_edgelist(df,
+                                      source="src",
+                                      destination="dst",
+                                      edge_attr=edge_attr,
+                                      store_transposed=store_transposed)
+        else:
+            G.from_cudf_edgelist(df,
+                                 source="src",
+                                 destination="dst",
+                                 edge_attr=edge_attr,
+                                 store_transposed=store_transposed)
+        return G
+
+    def get_path(self):
+        """
+        (this is likely not needed for use with pytest-benchmark, just added for
+        API completeness with Dataset.)
+        """
+        return str(self)
+
+    def unload(self):
+        self._edgelist = None
+
+
+_rmat_scale = getattr(pytest, "_rmat_scale", 20)  # ~1M vertices
+_rmat_edgefactor = getattr(pytest, "_rmat_edgefactor", 16)  # ~17M edges
+rmat_sg_dataset = pytest.param(RmatDataset(scale=_rmat_scale,
+                                           edgefactor=_rmat_edgefactor,
+                                           mg=False),
+                               marks=[pytest.mark.rmat_data,
+                                      pytest.mark.sg,
+                               ])
+rmat_mg_dataset = pytest.param(RmatDataset(scale=_rmat_scale,
+                                           edgefactor=_rmat_edgefactor,
+                                           mg=True),
+                               marks=[pytest.mark.rmat_data,
+                                      pytest.mark.mg,
+                               ])
+
+rmm_fixture_params = gen_fixture_params_product(
     (managed_memory, "mm"),
     (pool_allocator, "pa"))
-
-###############################################################################
-# Helpers
-def createGraph(csvFileName, graphType=None):
-    """
-    Helper function to create a Graph (directed or undirected) based on
-    csvFileName.
-    """
-    if graphType is None:
-        # There's potential value in verifying that a directed graph can be
-        # created from a undirected dataset, and an undirected from a directed
-        # dataset. (For now?) do not include those combinations to keep
-        # benchmark runtime and complexity lower, and assume tests have
-        # coverage to verify correctness for those combinations.
-        if "directed" in csvFileName.parts:
-            graphType = cugraph.Graph(directed=True)
-        else:
-            graphType = cugraph.Graph()
-
-    return cugraph.from_cudf_edgelist(
-        utils.read_csv_file(csvFileName),
-        source="0", destination="1", edge_attr="2",
-        create_using=graphType,
-        renumber=True)
-
+dataset_fixture_params = gen_fixture_params_product(
+    (directed_datasets +
+     undirected_datasets +
+     [rmat_sg_dataset, rmat_mg_dataset], "ds"))
 
 # Record the current RMM settings so reinitialize() will be called only when a
-# change is needed (RMM defaults both values to False). This allows the
-# --no-rmm-reinit option to prevent reinitialize() from being called at all
+# change is needed (RMM defaults both values to False). The --allow-rmm-reinit
+# option is required to allow the RMM options to be set by the pytest user
+# directly, in order to prevent reinitialize() from being called more than once
 # (see conftest.py for details).
+# The defaults for managed_mem (False) and pool_alloc (True) are set in
+# conftest.py
 RMM_SETTINGS = {"managed_mem": False,
                 "pool_alloc": False}
 
-
+# FIXME: this only changes the RMM config in a SG environment. The dask config
+# that applies to RMM in an MG environment is not changed by this!
 def reinitRMM(managed_mem, pool_alloc):
-
+    """
+    Reinitializes RMM to the value of managed_mem and pool_alloc, but only if
+    those values are different that the current configuration.
+    """
     if (managed_mem != RMM_SETTINGS["managed_mem"]) or \
        (pool_alloc != RMM_SETTINGS["pool_alloc"]):
 
@@ -104,79 +184,86 @@ def reinitRMM(managed_mem, pool_alloc):
 #
 # For benchmarks, the operations performed in fixtures are not measured as part
 # of the benchmark.
+
 @pytest.fixture(scope="module",
-                params=fixture_params)
-def edgelistCreated(request):
-    """
-    Returns a new edgelist created from a CSV, which is specified as part of
-    the parameterization for this fixture.
-    """
+                params=rmm_fixture_params)
+def rmm_config(request):
     # Since parameterized fixtures do not assign param names to param values,
     # manually call the helper to do so. Ensure the order of the name list
     # passed to it matches if there are >1 params.
     # If the request only contains n params, only the first n names are set.
-    setFixtureParamNames(request, ["dataset", "managed_mem", "pool_allocator"])
-
-    csvFileName = request.param[0]
-    reinitRMM(request.param[1], request.param[2])
-    return utils.read_csv_file(csvFileName)
+    setFixtureParamNames(request, ["managed_mem", "pool_allocator"])
+    reinitRMM(request.param[0], request.param[1])
 
 
 @pytest.fixture(scope="module",
-                params=fixture_params)
-def graphWithAdjListComputed(request):
+                params=dataset_fixture_params)
+def dataset(request, rmm_config):
+
     """
-    Create a Graph obj from the CSV file in param, compute the adjacency list
-    and return it.
+    Fixture which provides a Dataset instance, setting up a Dask cluster and
+    client if necessary for MG, to tests and other fixtures. When all
+    tests/fixtures are done with the Dataset, it has the Dask cluster and
+    client torn down (if MG) and all data loaded is freed.
     """
-    setFixtureParamNames(request, ["dataset", "managed_mem", "pool_allocator"])
-    csvFileName = request.param[0]
-    reinitRMM(request.param[1], request.param[2])
+    setFixtureParamNames(request, ["dataset"])
+    dataset = request.param[0]
+    client = cluster = None
+    # For now, only RmatDataset instanaces support MG and have a "mg" attr.
+    if hasattr(dataset, "mg") and dataset.mg:
+        (client, cluster) = mg_utils.start_dask_client()
+
+    yield dataset
 
-    G = createGraph(csvFileName, cugraph.structure.graph_classes.Graph)
-    G.view_adj_list()
+    dataset.unload()
+    if client is not None:
+        mg_utils.stop_dask_client(client, cluster)
+
+
+@pytest.fixture(scope="module")
+def edgelist(request, dataset):
+    df = dataset.get_edgelist()
+    return df
+
+
+@pytest.fixture(scope="module")
+def graph(request, dataset):
+    G = dataset.get_graph()
     return G
 
 
-@pytest.fixture(scope="module",
-                params=fixture_params)
-def anyGraphWithAdjListComputed(request):
-    """
-    Create a Graph (directed or undirected) obj based on the param, compute the
-    adjacency list and return it.
-    """
-    setFixtureParamNames(request, ["dataset", "managed_mem", "pool_allocator"])
-    csvFileName = request.param[0]
-    reinitRMM(request.param[1], request.param[2])
+@pytest.fixture(scope="module")
+def unweighted_graph(request, dataset):
+    G = dataset.get_graph(ignore_weights=True)
+    return G
+
 
-    G = createGraph(csvFileName)
-    G.view_adj_list()
+@pytest.fixture(scope="module")
+def directed_graph(request, dataset):
+    G = dataset.get_graph(create_using=cugraph.Graph(directed=True))
     return G
 
 
-@pytest.fixture(scope="module",
-                params=fixture_params)
-def anyGraphWithTransposedAdjListComputed(request):
+@pytest.fixture(scope="module")
+def transposed_graph(request, dataset):
+    G = dataset.get_graph(store_transposed=True)
+    return G
+
+
+###############################################################################
+def is_graph_distributed(graph):
     """
-    Create a Graph (directed or undirected) obj based on the param, compute the
-    transposed adjacency list and return it.
+    Return True if graph is distributed (for use with cugraph.dask APIs)
     """
-    setFixtureParamNames(request, ["dataset", "managed_mem", "pool_allocator"])
-    csvFileName = request.param[0]
-    reinitRMM(request.param[1], request.param[2])
-
-    G = createGraph(csvFileName)
-    G.view_transposed_adj_list()
-    return G
+    return isinstance(graph.edgelist.edgelist_df, dask_cudf.DataFrame)
 
 
 ###############################################################################
 # Benchmarks
-@pytest.mark.ETL
-def bench_create_graph(gpubenchmark, edgelistCreated):
+def bench_create_graph(gpubenchmark, edgelist):
     gpubenchmark(cugraph.from_cudf_edgelist,
-                 edgelistCreated,
-                 source="0", destination="1",
+                 edgelist,
+                 source="src", destination="dst",
                  create_using=cugraph.structure.graph_classes.Graph,
                  renumber=False)
 
@@ -184,94 +271,142 @@ def bench_create_graph(gpubenchmark, edgelistCreated):
 # Creating directed Graphs on small datasets runs in micro-seconds, which
 # results in thousands of rounds before the default threshold is met, so lower
 # the max_time for this benchmark.
-@pytest.mark.ETL
 @pytest.mark.benchmark(
     warmup=True,
     warmup_iterations=10,
     max_time=0.005
 )
-def bench_create_digraph(gpubenchmark, edgelistCreated):
+def bench_create_digraph(gpubenchmark, edgelist):
     gpubenchmark(cugraph.from_cudf_edgelist,
-                 edgelistCreated,
-                 source="0", destination="1",
+                 edgelist,
+                 source="src", destination="dst",
                  create_using=cugraph.Graph(directed=True),
                  renumber=False)
 
 
-@pytest.mark.ETL
-def bench_renumber(gpubenchmark, edgelistCreated):
-    gpubenchmark(NumberMap.renumber, edgelistCreated, "0", "1")
+def bench_renumber(gpubenchmark, edgelist):
+    gpubenchmark(NumberMap.renumber, edgelist, "src", "dst")
 
 
-def bench_pagerank(gpubenchmark, anyGraphWithTransposedAdjListComputed):
-    gpubenchmark(cugraph.pagerank, anyGraphWithTransposedAdjListComputed)
+def bench_pagerank(gpubenchmark, transposed_graph):
+    pagerank = dask_cugraph.pagerank if is_graph_distributed(transposed_graph) \
+               else cugraph.pagerank
+    gpubenchmark(pagerank, transposed_graph)
 
 
-def bench_bfs(gpubenchmark, anyGraphWithAdjListComputed):
-    gpubenchmark(cugraph.bfs, anyGraphWithAdjListComputed, 0)
+def bench_bfs(gpubenchmark, graph):
+    bfs = dask_cugraph.bfs if is_graph_distributed(graph) else cugraph.bfs
+    start = graph.edgelist.edgelist_df["src"][0]
+    gpubenchmark(bfs, graph, start)
 
 
-def bench_force_atlas2(gpubenchmark, anyGraphWithAdjListComputed):
-    gpubenchmark(cugraph.force_atlas2, anyGraphWithAdjListComputed,
-                 max_iter=50)
+def bench_force_atlas2(gpubenchmark, graph):
+    if is_graph_distributed(graph):
+        pytest.skip("distributed graphs are not supported")
+    gpubenchmark(cugraph.force_atlas2, graph, max_iter=50)
 
 
-def bench_sssp(gpubenchmark, anyGraphWithAdjListComputed):
-    gpubenchmark(cugraph.sssp, anyGraphWithAdjListComputed, 0)
+def bench_sssp(gpubenchmark, graph):
+    sssp = dask_cugraph.sssp if is_graph_distributed(graph) else cugraph.sssp
+    start = graph.edgelist.edgelist_df["src"][0]
+    gpubenchmark(sssp, graph, start)
 
 
-def bench_jaccard(gpubenchmark, graphWithAdjListComputed):
-    gpubenchmark(cugraph.jaccard, graphWithAdjListComputed)
+def bench_jaccard(gpubenchmark, unweighted_graph):
+    G = unweighted_graph
+    jaccard = dask_cugraph.jaccard if is_graph_distributed(G) else cugraph.jaccard
+    gpubenchmark(jaccard, G)
 
 
 @pytest.mark.skipif(
     is_device_version_less_than((7, 0)), reason="Not supported on Pascal")
-def bench_louvain(gpubenchmark, graphWithAdjListComputed):
-    gpubenchmark(cugraph.louvain, graphWithAdjListComputed)
+def bench_louvain(gpubenchmark, graph):
+    louvain = dask_cugraph.louvain if is_graph_distributed(graph) else cugraph.louvain
+    gpubenchmark(louvain, graph)
 
 
-def bench_weakly_connected_components(gpubenchmark,
-                                      anyGraphWithAdjListComputed):
-    gpubenchmark(cugraph.weakly_connected_components,
-                 anyGraphWithAdjListComputed)
+def bench_weakly_connected_components(gpubenchmark, graph):
+    if is_graph_distributed(graph):
+        pytest.skip("distributed graphs are not supported")
+    if graph.is_directed():
+        G = graph.to_undirected()
+    else:
+        G = graph
+    gpubenchmark(cugraph.weakly_connected_components, G)
 
 
-def bench_overlap(gpubenchmark, anyGraphWithAdjListComputed):
-    gpubenchmark(cugraph.overlap, anyGraphWithAdjListComputed)
+def bench_overlap(gpubenchmark, unweighted_graph):
+    G = unweighted_graph
+    overlap = dask_cugraph.overlap if is_graph_distributed(G) else cugraph.overlap
+    gpubenchmark(overlap, G)
 
 
-def bench_triangle_count(gpubenchmark, graphWithAdjListComputed):
-    gpubenchmark(cugraph.triangle_count, graphWithAdjListComputed)
+def bench_triangle_count(gpubenchmark, graph):
+    tc = dask_cugraph.triangle_count if is_graph_distributed(graph) \
+         else cugraph.triangle_count
+    gpubenchmark(tc, graph)
 
 
-def bench_spectralBalancedCutClustering(gpubenchmark,
-                                        graphWithAdjListComputed):
-    gpubenchmark(cugraph.spectralBalancedCutClustering,
-                 graphWithAdjListComputed, 2)
+def bench_spectralBalancedCutClustering(gpubenchmark, graph):
+    if is_graph_distributed(graph):
+        pytest.skip("distributed graphs are not supported")
+    gpubenchmark(cugraph.spectralBalancedCutClustering, graph, 2)
 
 
 @pytest.mark.skip(reason="Need to guarantee graph has weights, "
                          "not doing that yet")
-def bench_spectralModularityMaximizationClustering(
-        gpubenchmark, anyGraphWithAdjListComputed):
-    gpubenchmark(cugraph.spectralModularityMaximizationClustering,
-                 anyGraphWithAdjListComputed, 2)
+def bench_spectralModularityMaximizationClustering(gpubenchmark, graph):
+    smmc = dask_cugraph.spectralModularityMaximizationClustering \
+           if is_graph_distributed(graph) \
+           else cugraph.spectralModularityMaximizationClustering
+    gpubenchmark(smmc, graph, 2)
+
+
+def bench_graph_degree(gpubenchmark, graph):
+    gpubenchmark(graph.degree)
+
+
+def bench_graph_degrees(gpubenchmark, graph):
+    if is_graph_distributed(graph):
+        pytest.skip("distributed graphs are not supported")
+    gpubenchmark(graph.degrees)
+
+
+def bench_betweenness_centrality(gpubenchmark, graph):
+    bc = dask_cugraph.betweenness_centrality if is_graph_distributed(graph) \
+         else cugraph.betweenness_centrality
+    gpubenchmark(bc, graph, k=10, random_state=123)
+
 
+def bench_edge_betweenness_centrality(gpubenchmark, graph):
+    if is_graph_distributed(graph):
+        pytest.skip("distributed graphs are not supported")
+    gpubenchmark(cugraph.edge_betweenness_centrality, graph, k=10, seed=123)
 
-def bench_graph_degree(gpubenchmark, anyGraphWithAdjListComputed):
-    gpubenchmark(anyGraphWithAdjListComputed.degree)
 
+def bench_uniform_neighbor_sample(gpubenchmark, graph):
+    uns = dask_cugraph.uniform_neighbor_sample if is_graph_distributed(graph) \
+         else cugraph.uniform_neighbor_sample
 
-def bench_graph_degrees(gpubenchmark, anyGraphWithAdjListComputed):
-    gpubenchmark(anyGraphWithAdjListComputed.degrees)
+    seed = 42
+    # FIXME: may need to provide number_of_vertices separately
+    num_verts_in_graph = graph.number_of_vertices()
+    len_start_list = max(int(num_verts_in_graph * 0.01), 2)
+    srcs = graph.edgelist.edgelist_df["src"]
+    frac = len_start_list / num_verts_in_graph
 
+    start_list = srcs.sample(frac=frac, random_state=seed)
+    # Attempt to automatically handle a dask Series
+    if hasattr(start_list, "compute"):
+        start_list = start_list.compute()
 
-def bench_betweenness_centrality(gpubenchmark, anyGraphWithAdjListComputed):
-    gpubenchmark(cugraph.betweenness_centrality,
-                 anyGraphWithAdjListComputed, k=10, random_state=123)
+    fanout_vals = [5, 5, 5]
+    gpubenchmark(uns, graph, start_list=start_list, fanout_vals=fanout_vals)
 
 
-def bench_edge_betweenness_centrality(gpubenchmark,
-                                      anyGraphWithAdjListComputed):
-    gpubenchmark(cugraph.edge_betweenness_centrality,
-                 anyGraphWithAdjListComputed, k=10, seed=123)
+def bench_egonet(gpubenchmark, graph):
+    egonet = dask_cugraph.ego_graph if is_graph_distributed(graph) \
+             else cugraph.ego_graph
+    n = 1
+    radius = 2
+    gpubenchmark(egonet, graph, n, radius=radius)
diff --git a/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py b/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py
index 8fe6e81ccf1..157c64b0b20 100644
--- a/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py
+++ b/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py
@@ -107,10 +107,8 @@ def create_mg_graph(graph_data):
     Create a graph instance based on the data to be loaded/generated, return a
     tuple containing (graph_obj, num_verts, client, cluster)
     """
-    n_devices = os.getenv("DASK_NUM_WORKERS", 4)
-    n_devices = int(n_devices)
     # range starts at 1 to let let 0 be used by benchmark/client process
-    visible_devices = ",".join([str(i) for i in range(1, n_devices+1)])
+    visible_devices = os.getenv("DASK_WORKER_DEVICES", "1,2,3,4")
 
     (client, cluster) = start_dask_client(
         # enable_tcp_over_ucx=True,
diff --git a/benchmarks/cugraph/pytest-based/conftest.py b/benchmarks/cugraph/pytest-based/conftest.py
index 312afb5f824..fd029471869 100644
--- a/benchmarks/cugraph/pytest-based/conftest.py
+++ b/benchmarks/cugraph/pytest-based/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,26 +11,66 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pytest
+
+
 def pytest_addoption(parser):
-    parser.addoption("--no-rmm-reinit", action="store_true", default=False,
-                     help="Do not reinit RMM to run benchmarks with different"
-                          " managed memory and pool allocator options.")
+    parser.addoption("--allow-rmm-reinit",
+                     action="store_true",
+                     default=False,
+                     help="Allow RMM to be reinitialized, possibly multiple times within "
+                     "the same process, in order to run benchmarks with different managed "
+                     "memory and pool allocator options. This is not the default behavior "
+                     "since it does not represent a typical use case, and support for "
+                     "this may be limited. Instead, consider multiple pytest runs that "
+                     "use a fixed set of RMM settings.")
+    parser.addoption("--rmat-scale",
+                     action="store",
+                     type=int,
+                     default=20,
+                     metavar="scale",
+                     help="For use when using synthetic graph data generated using RMAT. "
+                     "This results in a graph with 2^scale vertices. Default is "
+                     "%(default)s.")
+    parser.addoption("--rmat-edgefactor",
+                     action="store",
+                     type=int,
+                     default=16,
+                     metavar="edgefactor",
+                     help="For use when using synthetic graph data generated using RMAT. "
+                     "This results in a graph with (2^scale)*edgefactor edges. Default "
+                     "is %(default)s.")
 
 
 def pytest_sessionstart(session):
-    # if the --no-rmm-reinit option is given, set (or add to) the CLI "mark
-    # expression" (-m) the markers for no managedmem and no poolallocator. This
-    # will cause the RMM reinit() function to not be called.
-    if session.config.getoption("no_rmm_reinit"):
-        newMarkexpr = "managedmem_off and poolallocator_off"
+    # if the --allow-rmm-reinit option is not given, set (or add to) the CLI
+    # "mark expression" (-m) the markers for no managedmem and
+    # poolallocator. This will result in the RMM reinit() function to be called
+    # only once in the running process (the typical use case).
+    #
+    # FIXME: consider making the RMM config options set using a CLI option
+    # instead of by markers. This would mean only one RMM config can be used
+    # per test session, which could eliminate problems related to calling RMM
+    # reinit multiple times in the same process. This would not be a major
+    # change to the benchmark UX since the user is discouraged from doing a
+    # reinit multiple times anyway (hence the --allow-rmm-reinit flag).
+    if session.config.getoption("allow_rmm_reinit") is False:
         currentMarkexpr = session.config.getoption("markexpr")
 
         if ("managedmem" in currentMarkexpr) or \
            ("poolallocator" in currentMarkexpr):
             raise RuntimeError("managedmem and poolallocator markers cannot "
-                               "be used with --no-rmm-reinit")
+                               "be used without --allow-rmm-reinit.")
 
+        newMarkexpr = "managedmem_off and poolallocator_on"
         if currentMarkexpr:
             newMarkexpr = f"({currentMarkexpr}) and ({newMarkexpr})"
 
         session.config.option.markexpr = newMarkexpr
+
+    # Set the value of the CLI options for RMAT here since any RmatDataset
+    # objects must be instantiated prior to running test fixtures in order to
+    # have their test ID generated properly.
+    # FIXME: is there a better way to do this?
+    pytest._rmat_scale = session.config.getoption("rmat_scale")
+    pytest._rmat_edgefactor = session.config.getoption("rmat_edgefactor")
diff --git a/benchmarks/cugraph/standalone/cugraph_dask_funcs.py b/benchmarks/cugraph/standalone/cugraph_dask_funcs.py
index ddc9efc7f77..c6aa4a06100 100644
--- a/benchmarks/cugraph/standalone/cugraph_dask_funcs.py
+++ b/benchmarks/cugraph/standalone/cugraph_dask_funcs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -22,52 +22,11 @@
 
 import cugraph
 from cugraph.dask.comms import comms as Comms
-from cugraph.generators import rmat
 import tempfile
+from cugraph.testing.mg_utils import generate_edgelist
 
-import rmm
 
 
-def generate_edgelist(scale,
-                      edgefactor,
-                      seed=None,
-                      unweighted=False,
-):
-    """
-    Returns a dask_cudf DataFrame created using the R-MAT graph generator.
-
-    The resulting graph is weighted with random values of a uniform distribution
-    from the interval [0, 1)
-
-    scale is used to determine the number of vertices to be generated (num_verts
-    = 2^scale), which is also used to determine the data type for the vertex ID
-    values in the DataFrame.
-
-    edgefactor determies the number of edges (num_edges = num_edges*edgefactor)
-
-    seed, if specified, will be used as the seed to the RNG.
-
-    unweighted determines if the resulting edgelist will have randomly-generated
-    weightes ranging in value between [0, 1). If True, an edgelist with only 2
-    columns is returned.
-    """
-    ddf = rmat(
-        scale,
-        (2**scale)*edgefactor,
-        0.57,  # from Graph500
-        0.19,  # from Graph500
-        0.19,  # from Graph500
-        seed or 42,
-        clip_and_flip=False,
-        scramble_vertex_ids=True,
-        create_using=None,  # return edgelist instead of Graph instance
-        mg=True
-    )
-    if not unweighted:
-        rng = np.random.default_rng(seed)
-        ddf["weight"] = ddf.map_partitions(lambda df: rng.random(size=len(df)))
-    return ddf
-
 
 def read_csv(input_csv_file, scale):
     """
diff --git a/benchmarks/cugraph/standalone/cugraph_graph_creation.py b/benchmarks/cugraph/standalone/cugraph_graph_creation.py
new file mode 100644
index 00000000000..1edf67bba44
--- /dev/null
+++ b/benchmarks/cugraph/standalone/cugraph_graph_creation.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.testing.mg_utils import (
+    generate_edgelist_rmat,
+    get_allocation_counts_dask_persist,
+    get_allocation_counts_dask_lazy,
+    sizeof_fmt,
+    get_peak_output_ratio_across_workers,
+    restart_client,
+)
+
+from cugraph.testing.mg_utils import (
+    start_dask_client,
+    stop_dask_client,
+    enable_spilling,
+)
+from cugraph.structure.symmetrize import symmetrize_ddf
+import cugraph
+import cudf
+from time import sleep
+import pandas as pd
+import time
+
+
+@get_allocation_counts_dask_lazy(return_allocations=True, logging=True)
+def construct_graph(dask_dataframe, directed=False, renumber=False):
+    """
+    Args:
+        dask_dataframe:
+            dask_dataframe contains weighted and undirected edges with self
+            loops. Multiple edges will likely be present as well.
+        directed:
+            If True, the graph will be directed.
+        renumber:
+            If True, the graph will be renumbered.
+    Returns:
+        G:  cugraph.Graph
+    """
+    st = time.time()
+    G = cugraph.Graph(directed=directed)
+    G.from_dask_cudf_edgelist(
+        dask_dataframe, source="src", destination="dst", renumber=renumber
+    )
+    et = time.time()
+    g_creation_time = et - st
+    print(f"Graph creation time = {g_creation_time} s")
+    return G, g_creation_time
+
+
+@get_allocation_counts_dask_persist(return_allocations=True, logging=True)
+def symmetrize_cugraph_df(dask_df, multi=False):
+    output_df = symmetrize_ddf(dask_df, "src", "dst", multi=multi)
+    return output_df
+
+
+def benchmark_cugraph_graph_symmetrize(scale, edgefactor, seed, multi):
+    """
+    Benchmark cugraph graph symmetrization
+    """
+    dask_df = generate_edgelist_rmat(
+        scale=scale, edgefactor=edgefactor, seed=seed, unweighted=True, mg=True
+    )
+    dask_df = dask_df.astype("int64")
+    dask_df = dask_df.reset_index(drop=True)
+    input_memory = dask_df.memory_usage().sum().compute()
+    num_input_edges = len(dask_df)
+    print(f"Number of input edges = {num_input_edges:,}, multi = {multi}")
+    output_df, allocation_counts = symmetrize_cugraph_df(dask_df, multi=multi)
+    (
+        input_to_peak_ratio,
+        output_to_peak_ratio,
+        input_memory_per_worker,
+        peak_allocation_across_workers,
+    ) = get_memory_statistics(
+        allocation_counts=allocation_counts, input_memory=input_memory
+    )
+    print(f"Number of edges after symmetrization = {len(output_df):,}")
+    print("-" * 80)
+    return (
+        num_input_edges,
+        input_to_peak_ratio,
+        output_to_peak_ratio,
+        input_memory_per_worker,
+        peak_allocation_across_workers,
+    )
+
+
+def benchmark_cugraph_graph_creation(scale, edgefactor, seed, directed, renumber):
+    """
+    Entry point for the benchmark.
+    """
+    dask_df = generate_edgelist_rmat(
+        scale=scale,
+        edgefactor=edgefactor,
+        seed=seed,
+        unweighted=True,
+        mg=True,
+    )
+    # We do below to remove the rmat memory overhead
+    # which holds on to GPU memory
+    dask_df = dask_df.map_partitions(lambda df: df.to_pandas()).persist()
+    dask_df = dask_df.map_partitions(cudf.from_pandas)
+    dask_df = dask_df.astype("int64")
+    dask_df = dask_df.reset_index(drop=True)
+    input_memory = dask_df.memory_usage().sum().compute()
+    num_input_edges = len(dask_df)
+    print(
+        f"Number of input edges = {num_input_edges:,}, directed = {directed}, renumber = {renumber}"
+    )
+    (G, g_creation_time), allocation_counts = construct_graph(
+        dask_df, directed=directed, renumber=renumber
+    )
+    (
+        input_to_peak_ratio,
+        output_to_peak_ratio,
+        input_memory_per_worker,
+        peak_allocation_across_workers,
+    ) = get_memory_statistics(
+        allocation_counts=allocation_counts, input_memory=input_memory
+    )
+    print(f"Number of edges in final graph = {G.number_of_edges():,}")
+    print("-" * 80)
+    return (
+        num_input_edges,
+        input_to_peak_ratio,
+        output_to_peak_ratio,
+        input_memory_per_worker,
+        peak_allocation_across_workers,
+        g_creation_time,
+    )
+
+
+def get_memory_statistics(allocation_counts, input_memory):
+    """
+    Get memory statistics for the benchmark.
+    """
+    output_to_peak_ratio = get_peak_output_ratio_across_workers(allocation_counts)
+    peak_allocation_across_workers = max(
+        [a["peak_bytes"] for a in allocation_counts.values()]
+    )
+    input_memory_per_worker = input_memory / len(allocation_counts.keys())
+    input_to_peak_ratio = peak_allocation_across_workers / input_memory_per_worker
+    print(f"Edge List Memory = {sizeof_fmt(input_memory_per_worker)}")
+    print(f"Peak Memory across workers = {sizeof_fmt(peak_allocation_across_workers)}")
+    print(f"Max Peak to output graph ratio across workers = {output_to_peak_ratio:.2f}")
+    print(
+        f"Max Peak to avg input graph ratio across workers = {input_to_peak_ratio:.2f}"
+    )
+    return (
+        input_to_peak_ratio,
+        output_to_peak_ratio,
+        input_memory_per_worker,
+        peak_allocation_across_workers,
+    )
+
+
+if __name__ == "__main__":
+    client, cluster = start_dask_client(dask_worker_devices=[1], jit_unspill=False)
+    enable_spilling()
+    stats_ls = []
+    client.run(enable_spilling)
+    for scale in [23, 24, 25]:
+        for directed in [True, False]:
+            for renumber in [True, False]:
+                try:
+                    stats_d = {}
+                    (
+                        num_input_edges,
+                        input_to_peak_ratio,
+                        output_to_peak_ratio,
+                        input_memory_per_worker,
+                        peak_allocation_across_workers,
+                        g_creation_time,
+                    ) = benchmark_cugraph_graph_creation(
+                        scale=scale,
+                        edgefactor=16,
+                        seed=123,
+                        directed=directed,
+                        renumber=renumber,
+                    )
+                    stats_d["scale"] = scale
+                    stats_d["num_input_edges"] = num_input_edges
+                    stats_d["directed"] = directed
+                    stats_d["renumber"] = renumber
+                    stats_d["input_memory_per_worker"] = sizeof_fmt(
+                        input_memory_per_worker
+                    )
+                    stats_d["peak_allocation_across_workers"] = sizeof_fmt(
+                        peak_allocation_across_workers
+                    )
+                    stats_d["input_to_peak_ratio"] = input_to_peak_ratio
+                    stats_d["output_to_peak_ratio"] = output_to_peak_ratio
+                    stats_d["g_creation_time"] = g_creation_time
+                    stats_ls.append(stats_d)
+                except Exception as e:
+                    print(e)
+                restart_client(client)
+                sleep(10)
+
+            print("-" * 40 + f"renumber completed" + "-" * 40)
+            stats_df = pd.DataFrame(
+            stats_ls,
+                columns=[
+                    "scale",
+                    "num_input_edges",
+                    "directed",
+                    "renumber",
+                    "input_memory_per_worker",
+                    "peak_allocation_across_workers",
+                    "input_to_peak_ratio",
+                    "output_to_peak_ratio",
+                    "g_creation_time",
+                ],
+            )
+            stats_df.to_csv("cugraph_graph_creation_stats.csv")
+        print("-" * 40 + f"scale = {scale} completed" + "-" * 40)
+    # Cleanup Dask Cluster
+    stop_dask_client(client, cluster)
diff --git a/benchmarks/pytest.ini b/benchmarks/pytest.ini
index b61fa92d403..6af3aab27fe 100644
--- a/benchmarks/pytest.ini
+++ b/benchmarks/pytest.ini
@@ -14,7 +14,6 @@ markers =
           managedmem_off: RMM managed memory disabled
           poolallocator_on: RMM pool allocator enabled
           poolallocator_off: RMM pool allocator disabled
-          ETL: benchmarks for ETL steps
           small: small datasets
           tiny: tiny datasets
           directed: directed datasets
@@ -50,6 +49,8 @@ markers =
 	  num_clients_32: start 32 cugraph-service clients
 	  fanout_10_25: fanout [10, 25] for sampling algos
 	  fanout_5_10_15: fanout [5, 10, 15] for sampling algos
+	  rmat_data: RMAT-generated synthetic datasets
+	  file_data: datasets from $RAPIDS_DATASET_ROOT_DIR
 
 python_classes =
                  Bench*
diff --git a/benchmarks/shared/python/cugraph_benchmarking/params.py b/benchmarks/shared/python/cugraph_benchmarking/params.py
index 4cf749d0c21..ee63b8768a6 100644
--- a/benchmarks/shared/python/cugraph_benchmarking/params.py
+++ b/benchmarks/shared/python/cugraph_benchmarking/params.py
@@ -11,32 +11,68 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pathlib import Path
-
 import pytest
 
-from cugraph.testing import utils
 from pylibcugraph.testing.utils import gen_fixture_params
+from cugraph.testing import RAPIDS_DATASET_ROOT_DIR_PATH
+from cugraph.experimental.datasets import (
+    Dataset,
+    karate,
+)
 
+# Create Dataset objects from .csv files.
+# Once the cugraph.dataset package is updated to include the metadata files for
+# these (like karate), these will no longer need to be explicitly instantiated.
+hollywood = Dataset(
+    csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/undirected/hollywood.csv",
+    csv_col_names=["src", "dst"],
+    csv_col_dtypes=["int32", "int32"])
+europe_osm = Dataset(
+    csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/undirected/europe_osm.csv",
+    csv_col_names=["src", "dst"],
+    csv_col_dtypes=["int32", "int32"])
+cit_patents = Dataset(
+    csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/directed/cit-Patents.csv",
+    csv_col_names=["src", "dst"],
+    csv_col_dtypes=["int32", "int32"])
+soc_livejournal = Dataset(
+    csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/directed/soc-LiveJournal1.csv",
+    csv_col_names=["src", "dst"],
+    csv_col_dtypes=["int32", "int32"])
 
-# FIXME: omitting soc-twitter-2010.csv due to OOM error on some workstations.
+# Assume all "file_data" (.csv file on disk) datasets are too small to be useful for MG.
 undirected_datasets = [
-    pytest.param(Path(utils.RAPIDS_DATASET_ROOT_DIR) / "karate.csv",
-                 marks=[pytest.mark.tiny, pytest.mark.undirected]),
-    pytest.param(Path(utils.RAPIDS_DATASET_ROOT_DIR) / "csv/undirected/hollywood.csv",
-                 marks=[pytest.mark.small, pytest.mark.undirected]),
-    pytest.param(Path(utils.RAPIDS_DATASET_ROOT_DIR) / "csv/undirected/europe_osm.csv",
-                 marks=[pytest.mark.undirected]),
-    # pytest.param("../datasets/csv/undirected/soc-twitter-2010.csv",
-    #              marks=[pytest.mark.undirected]),
+    pytest.param(karate,
+                 marks=[pytest.mark.tiny,
+                        pytest.mark.undirected,
+                        pytest.mark.file_data,
+                        pytest.mark.sg,
+                        ]),
+    pytest.param(hollywood,
+                 marks=[pytest.mark.small,
+                        pytest.mark.undirected,
+                        pytest.mark.file_data,
+                        pytest.mark.sg,
+                        ]),
+    pytest.param(europe_osm,
+                 marks=[pytest.mark.undirected,
+                        pytest.mark.file_data,
+                        pytest.mark.sg,
+                        ]),
 ]
 
 directed_datasets = [
-    pytest.param(Path(utils.RAPIDS_DATASET_ROOT_DIR) / "csv/directed/cit-Patents.csv",
-                 marks=[pytest.mark.small, pytest.mark.directed]),
-    pytest.param(Path(
-        utils.RAPIDS_DATASET_ROOT_DIR) / "csv/directed/soc-LiveJournal1.csv",
-        marks=[pytest.mark.directed]),
+    pytest.param(cit_patents,
+                 marks=[pytest.mark.small,
+                        pytest.mark.directed,
+                        pytest.mark.file_data,
+                        pytest.mark.sg,
+                        ]),
+    pytest.param(soc_livejournal,
+                 marks=[pytest.mark.directed,
+                        pytest.mark.file_data,
+                        pytest.mark.sg,
+                        ]),
 ]
 
 managed_memory = [
diff --git a/build.sh b/build.sh
index 063f881020d..a8e97d924c6 100755
--- a/build.sh
+++ b/build.sh
@@ -272,7 +272,7 @@ if buildAll || hasArg libcugraph_etl; then
             CUGRAPH_CMAKE_CUDA_ARCHITECTURES="NATIVE"
             echo "Building for the architecture of the GPU in the system..."
         else
-            CUGRAPH_CMAKE_CUDA_ARCHITECTURES="ALL"
+            CUGRAPH_CMAKE_CUDA_ARCHITECTURES="RAPIDS"
             echo "Building for *ALL* supported GPU architectures..."
         fi
         mkdir -p ${LIBCUGRAPH_ETL_BUILD_DIR}
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index dc449437704..8dffbc1668c 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -19,7 +19,7 @@ rapids-print-env
 rapids-logger "Downloading artifacts from previous jobs"
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
-VERSION_NUMBER=$(rapids-get-rapids-version-from-git)
+VERSION_NUMBER="23.06"
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
@@ -53,7 +53,7 @@ sphinx-build -b text source _text
 popd
 
 
-if [[ "${RAPIDS_BUILD_TYPE}" == "branch" ]]; then
+if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then
   rapids-logger "Upload Docs to S3"
   aws s3 sync --no-progress --delete docs/cugraph/_html "s3://rapidsai-docs/cugraph/${VERSION_NUMBER}/html"
   aws s3 sync --no-progress --delete docs/cugraph/_text "s3://rapidsai-docs/cugraph/${VERSION_NUMBER}/txt"
diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh
index ed291077494..610a603cef8 100755
--- a/ci/release/apply_wheel_modifications.sh
+++ b/ci/release/apply_wheel_modifications.sh
@@ -29,3 +29,7 @@ sed -i "s/raft-dask/raft-dask${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml
 sed -i "s/pylibcugraph/pylibcugraph${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml
 sed -i "s/pylibraft/pylibraft${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml
 sed -i "s/ucx-py/ucx-py${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml
+
+if [[ $CUDA_SUFFIX == "-cu12" ]]; then
+    sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/cugraph/pyproject.toml
+fi
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index a221cdea51e..59f39b4828f 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -72,17 +72,18 @@ sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pylibcugrap
 
 for FILE in conda/environments/*.yaml dependencies.yaml; do
    sed_runner "s/libcugraphops=${CURRENT_SHORT_TAG}/libcugraphops=${NEXT_SHORT_TAG}/g" ${FILE};
+   sed_runner "s/pylibcugraphops=${CURRENT_SHORT_TAG}/pylibcugraphops=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/libraft-headers=${CURRENT_SHORT_TAG}/libraft-headers=${NEXT_SHORT_TAG}/g" ${FILE};
-   sed_runner "s/libraft-distance=${CURRENT_SHORT_TAG}/libraft-distance=${NEXT_SHORT_TAG}/g" ${FILE};
+   sed_runner "s/libraft=${CURRENT_SHORT_TAG}/libraft=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/pyraft=${CURRENT_SHORT_TAG}/pyraft=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/raft-dask=${CURRENT_SHORT_TAG}/raft-dask=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/pylibraft=${CURRENT_SHORT_TAG}/pylibraft=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/dask-cudf=${CURRENT_SHORT_TAG}/dask-cudf=${NEXT_SHORT_TAG}/g" ${FILE};
    sed_runner "s/cuxfilter=${CURRENT_SHORT_TAG}/cuxfilter=${NEXT_SHORT_TAG}/g" ${FILE};
-   sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCX_PY_VERSION}/g" ${FILE};
+   sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCX_PY_VERSION}/g" ${FILE};
 done
 
 # Doxyfile update
@@ -93,13 +94,15 @@ sed_runner "/^ucx_py_version:$/ {n;s/.*/  - \"${NEXT_UCX_PY_VERSION}\"/}" conda/
 sed_runner "/^ucx_py_version:$/ {n;s/.*/  - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/cugraph-service/conda_build_config.yaml
 sed_runner "/^ucx_py_version:$/ {n;s/.*/  - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/pylibcugraph/conda_build_config.yaml
 
+# CI files
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
   # Wheel builds clone cugraph-ops, update its branch
   sed_runner "s/extra-repo-sha: branch-.*/extra-repo-sha: branch-${NEXT_SHORT_TAG}/g" "${FILE}"
   # Wheel builds install dask-cuda from source, update its branch
-  sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
+  sed_runner "s/dask-cuda.git@branch-[0-9][0-9].[0-9][0-9]/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
+sed_runner "s/VERSION_NUMBER=\".*/VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
 
 
 # Need to distutils-normalize the original version
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index a6c4cdb4a4f..f02ac748f18 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -34,7 +34,7 @@ nvidia-smi
 # RAPIDS_DATASET_ROOT_DIR is used by test scripts
 export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
 pushd "${RAPIDS_DATASET_ROOT_DIR}"
-./get_test_data.sh
+./get_test_data.sh --subset
 popd
 
 EXITCODE=0
diff --git a/ci/test_python.sh b/ci/test_python.sh
index 2a6be338819..3a23f521734 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -43,7 +43,7 @@ nvidia-smi
 # RAPIDS_DATASET_ROOT_DIR is used by test scripts
 export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
 pushd "${RAPIDS_DATASET_ROOT_DIR}"
-./get_test_data.sh
+./get_test_data.sh --benchmark
 popd
 
 EXITCODE=0
@@ -64,14 +64,17 @@ popd
 
 rapids-logger "pytest cugraph"
 pushd python/cugraph/cugraph
+export DASK_WORKER_DEVICES="0"
 pytest \
-  -m sg \
+  -v \
+  --benchmark-disable \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph.xml" \
   --cov-config=../../.coveragerc \
   --cov=cugraph \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-coverage.xml" \
   --cov-report=term \
+  -k "not test_property_graph_mg" \
   tests
 popd
 
@@ -80,7 +83,7 @@ pushd benchmarks
 pytest \
   --capture=no \
   --verbose \
-  -m "managedmem_on and poolallocator_on and tiny" \
+  -m tiny \
   --benchmark-disable \
   cugraph/pytest-based/bench_algos.py
 popd
@@ -124,7 +127,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
       pylibcugraphops \
       cugraph \
       cugraph-dgl \
-      'dgl>=1.0' \
+      'dgl>=1.1.0.cu*' \
       'pytorch>=2.0' \
       'pytorch-cuda>=11.8'
 
@@ -179,6 +182,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
       --channel "${PYTHON_CHANNEL}" \
       libcugraph \
       pylibcugraph \
+      pylibcugraphops \
       cugraph \
       cugraph-pyg
 
@@ -198,13 +202,13 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
       --cov-report=term \
       .
     popd
-    
+
     # Reactivate the test environment back
     set +u
     conda deactivate
     conda activate test
     set -u
-    
+
   else
     rapids-logger "skipping cugraph_pyg pytest on ARM64"
   fi
diff --git a/ci/utils/is_pascal.py b/ci/utils/is_pascal.py
index e55a3153a12..e716f59422f 100644
--- a/ci/utils/is_pascal.py
+++ b/ci/utils/is_pascal.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -26,9 +26,7 @@
 pascal = False
 
 device = cuda.get_current_device()
-# check for the attribute using both pre and post numba 0.53 names
-cc = getattr(device, 'COMPUTE_CAPABILITY', None) or \
-     getattr(device, 'compute_capability')
+cc = device.compute_capability
 if (cc[0] < 7):
     pascal = True
 
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 2494d4c9c67..f3d2afd2e24 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -10,35 +10,34 @@ dependencies:
 - aiohttp
 - c-compiler
 - cmake>=3.23.1,!=3.25.0
-- cuda-python>=11.7.1,<12.0
 - cudatoolkit=11.8
-- cudf==23.4.*
-- cupy>=9.5.0,<12.0.0a0
+- cudf==23.6.*
+- cupy>=12.0.0
 - cxx-compiler
 - cython>=0.29,<0.30
 - dask-core==2023.3.2
-- dask-cuda==23.4.*
-- dask-cudf==23.4.*
+- dask-cuda==23.6.*
+- dask-cudf==23.6.*
 - dask==2023.3.2
 - distributed==2023.3.2.1
 - doxygen
 - fsspec[http]>=0.6.0
 - gcc_linux-64=11.*
-- gmock=1.10.0
+- gmock>=1.13.0
 - graphviz
-- gtest=1.10.0
+- gtest>=1.13.0
 - ipython
-- libcudf=23.04.*
-- libcugraphops=23.04.*
-- libraft-headers=23.04.*
-- libraft=23.04.*
-- librmm=23.04.*
+- libcudf=23.6.*
+- libcugraphops=23.6.*
+- libraft-headers=23.6.*
+- libraft=23.6.*
+- librmm=23.6.*
 - nbsphinx
 - nccl>=2.9.9
 - networkx>=2.5.1
 - ninja
 - notebook>=0.5.0
-- numba>=0.56.2
+- numba>=0.57
 - numpy>=1.21
 - numpydoc
 - nvcc_linux-64=11.8
@@ -46,17 +45,18 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==23.4.*
+- pylibcugraphops=23.6.*
+- pylibraft==23.6.*
 - pytest
 - pytest-benchmark
 - pytest-cov
 - pytest-xdist
 - python-louvain
-- raft-dask==23.4.*
+- raft-dask==23.6.*
 - recommonmark
 - requests
-- rmm==23.4.*
-- scikit-build>=0.13.1
+- rmm==23.6.*
+- scikit-build>=0.13.1,<0.17.2
 - scikit-learn>=0.23.1
 - scipy
 - sphinx-copybutton
@@ -64,5 +64,5 @@ dependencies:
 - sphinx<6
 - sphinxcontrib-websupport
 - ucx-proc=*=gpu
-- ucx-py=0.31.*
+- ucx-py==0.32.*
 name: all_cuda-118_arch-x86_64
diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml
index 240574b5cac..96d25da45fb 100644
--- a/conda/recipes/cugraph-dgl/meta.yaml
+++ b/conda/recipes/cugraph-dgl/meta.yaml
@@ -20,13 +20,13 @@ build:
 
 requirements:
   host:
-    - python x.x
+    - python
   run:
     - cugraph ={{ version }}
-    - dgl >=0.9.1
-    - numba >=0.56.2
-    - numpy
-    - python x.x
+    - dgl >=1.1.0.cu*
+    - numba >=0.57
+    - numpy >=1.21
+    - python
     - pytorch
 
 tests:
diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml
index 097f49bf527..71a64c771e2 100644
--- a/conda/recipes/cugraph-pyg/meta.yaml
+++ b/conda/recipes/cugraph-pyg/meta.yaml
@@ -23,14 +23,15 @@ requirements:
     - sysroot_{{ target_platform }} {{ sysroot_version }}
   host:
     - cython >=0.29,<0.30
-    - python x.x
+    - python
     - scikit-build >=0.13.1
   run:
     - distributed ==2023.3.2.1
-    - numba >=0.56.2
-    - numpy
+    - numba >=0.57
+    - numpy >=1.21
+    - python
     - pytorch >=2.0
-    - cupy >=9.5.0,<12.0.0a0
+    - cupy >=12.0.0
     - cugraph ={{ version }}
     - pyg >=2.3,<2.4
 
diff --git a/conda/recipes/cugraph-service/conda_build_config.yaml b/conda/recipes/cugraph-service/conda_build_config.yaml
index a47aacd6699..ab90a8af2a4 100644
--- a/conda/recipes/cugraph-service/conda_build_config.yaml
+++ b/conda/recipes/cugraph-service/conda_build_config.yaml
@@ -1,2 +1,2 @@
 ucx_py_version:
-  - "0.31.*"
+  - "0.32.*"
diff --git a/conda/recipes/cugraph-service/meta.yaml b/conda/recipes/cugraph-service/meta.yaml
index 499e28e88fc..d0a27883010 100644
--- a/conda/recipes/cugraph-service/meta.yaml
+++ b/conda/recipes/cugraph-service/meta.yaml
@@ -27,9 +27,9 @@ outputs:
     requirements:
       host:
         - pip
-        - python x.x
+        - python
       run:
-        - python x.x
+        - python
         - thriftpy2 >=0.4.15
 
   - name: cugraph-service-server
@@ -47,19 +47,20 @@ outputs:
     requirements:
       host:
         - pip
-        - python x.x
+        - python
         - setuptools
         - wheel
       run:
         - {{ pin_subpackage('cugraph-service-client', exact=True) }}
         - cudf ={{ minor_version }}
         - cugraph ={{ minor_version }}
-        - cupy >=9.5.0,<12.0.0a0
+        - cupy >=12.0.0
         - dask-cuda ={{ minor_version }}
         - dask-cudf ={{ minor_version }}
         - distributed ==2023.3.2.1
-        - numpy
-        - python x.x
+        - numba >=0.57
+        - numpy >=1.21
+        - python
         - thriftpy2 >=0.4.15
         - ucx-py {{ ucx_py_version }}
 
diff --git a/conda/recipes/cugraph/conda_build_config.yaml b/conda/recipes/cugraph/conda_build_config.yaml
index 1bf2cf3f5d4..20194c031f4 100644
--- a/conda/recipes/cugraph/conda_build_config.yaml
+++ b/conda/recipes/cugraph/conda_build_config.yaml
@@ -14,7 +14,7 @@ sysroot_version:
   - "2.17"
 
 ucx_py_version:
-  - "0.31.*"
+  - "0.32.*"
 
 # The CTK libraries below are missing from the conda-forge::cudatoolkit
 # package. The "*_host_*" version specifiers correspond to `11.8` packages.
diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml
index 0e6946c54bd..10f29e13f11 100644
--- a/conda/recipes/cugraph/meta.yaml
+++ b/conda/recipes/cugraph/meta.yaml
@@ -32,6 +32,7 @@ build:
     - SCCACHE_S3_KEY_PREFIX=cugraph-aarch64 # [aarch64]
     - SCCACHE_S3_KEY_PREFIX=cugraph-linux64 # [linux64]
     - SCCACHE_S3_USE_SSL
+    - SCCACHE_S3_NO_CREDENTIALS
   ignore_run_exports_from:
     - {{ compiler('cuda') }}
 
@@ -59,7 +60,7 @@ requirements:
     - libraft ={{ minor_version }}
     - libraft-headers ={{ minor_version }}
     - pylibraft ={{ minor_version}}
-    - python x.x
+    - python
     - raft-dask ={{ minor_version }}
     - scikit-build >=0.13.1
     - setuptools
@@ -69,7 +70,7 @@ requirements:
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
     - cuda-python >=11.7.1,<12.0
     - cudf ={{ minor_version }}
-    - cupy >=9.5.0,<12.0.0a0
+    - cupy >=12.0.0
     - dask-cuda ={{ minor_version }}
     - dask-cudf ={{ minor_version }}
     - dask ==2023.3.2
@@ -80,7 +81,7 @@ requirements:
     - libraft-headers ={{ minor_version }}
     - pylibcugraph ={{ version }}
     - pylibraft ={{ minor_version }}
-    - python x.x
+    - python
     - raft-dask ={{ minor_version }}
     - ucx-proc=*=gpu
     - ucx-py {{ ucx_py_version }}
diff --git a/conda/recipes/libcugraph/conda_build_config.yaml b/conda/recipes/libcugraph/conda_build_config.yaml
index 83a383236a4..2fa26d99c09 100644
--- a/conda/recipes/libcugraph/conda_build_config.yaml
+++ b/conda/recipes/libcugraph/conda_build_config.yaml
@@ -17,7 +17,7 @@ nccl_version:
   - ">=2.9.9"
 
 gtest_version:
-  - "=1.10.0"
+  - ">=1.13.0"
 
 cuda_profiler_api_version:
   - ">=11.8.86,<12"
diff --git a/conda/recipes/libcugraph/meta.yaml b/conda/recipes/libcugraph/meta.yaml
index 5d53d2640b6..f843aabba92 100644
--- a/conda/recipes/libcugraph/meta.yaml
+++ b/conda/recipes/libcugraph/meta.yaml
@@ -29,6 +29,7 @@ build:
     - SCCACHE_S3_KEY_PREFIX=libcugraph-aarch64 # [aarch64]
     - SCCACHE_S3_KEY_PREFIX=libcugraph-linux64 # [linux64]
     - SCCACHE_S3_USE_SSL
+    - SCCACHE_S3_NO_CREDENTIALS
 
 requirements:
   build:
diff --git a/conda/recipes/pylibcugraph/conda_build_config.yaml b/conda/recipes/pylibcugraph/conda_build_config.yaml
index 1bf2cf3f5d4..20194c031f4 100644
--- a/conda/recipes/pylibcugraph/conda_build_config.yaml
+++ b/conda/recipes/pylibcugraph/conda_build_config.yaml
@@ -14,7 +14,7 @@ sysroot_version:
   - "2.17"
 
 ucx_py_version:
-  - "0.31.*"
+  - "0.32.*"
 
 # The CTK libraries below are missing from the conda-forge::cudatoolkit
 # package. The "*_host_*" version specifiers correspond to `11.8` packages.
diff --git a/conda/recipes/pylibcugraph/meta.yaml b/conda/recipes/pylibcugraph/meta.yaml
index a29231ad1df..de031a6fe94 100644
--- a/conda/recipes/pylibcugraph/meta.yaml
+++ b/conda/recipes/pylibcugraph/meta.yaml
@@ -32,6 +32,7 @@ build:
     - SCCACHE_S3_KEY_PREFIX=pylibcugraph-aarch64 # [aarch64]
     - SCCACHE_S3_KEY_PREFIX=pylibcugraph-linux64 # [linux64]
     - SCCACHE_S3_USE_SSL
+    - SCCACHE_S3_NO_CREDENTIALS
   ignore_run_exports_from:
     - {{ compiler('cuda') }}
 
@@ -59,7 +60,7 @@ requirements:
     - libraft ={{ minor_version }}
     - libraft-headers ={{ minor_version }}
     - pylibraft ={{ minor_version}}
-    - python x.x
+    - python
     - rmm ={{ minor_version }}
     - scikit-build >=0.13.1
     - setuptools
@@ -68,7 +69,7 @@ requirements:
   run:
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
     - libcugraph ={{ version }}
-    - python x.x
+    - python
 
 tests:
   requirements:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 675637fd210..fe908fbd9bf 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(CUGRAPH)
 
-project(CUGRAPH VERSION 23.04.01 LANGUAGES C CXX CUDA)
+project(CUGRAPH VERSION 23.06.00 LANGUAGES C CXX CUDA)
 
 if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
@@ -213,10 +213,8 @@ set(CUGRAPH_SOURCES
     src/community/leiden_sg.cu
     src/community/leiden_mg.cu
     src/community/legacy/louvain.cu
-    src/community/legacy/leiden.cu
     src/community/legacy/ktruss.cu
     src/community/legacy/ecg.cu
-    src/community/legacy/extract_subgraph_by_vertex.cu
     src/community/egonet_sg.cu
     src/community/egonet_mg.cu
     src/sampling/random_walks.cu
@@ -232,6 +230,7 @@ set(CUGRAPH_SOURCES
     src/components/legacy/connectivity.cu
     src/centrality/legacy/betweenness_centrality.cu
     src/generators/generate_rmat_edgelist.cu
+    src/generators/generate_bipartite_rmat_edgelist.cu
     src/generators/generator_tools.cu
     src/generators/simple_generators.cu
     src/generators/erdos_renyi_generator.cu
@@ -404,7 +403,8 @@ add_library(cugraph_c
         src/c_api/capi_helper.cu
         src/c_api/legacy_spectral.cpp
         src/c_api/legacy_ecg.cpp
-        src/c_api/graph_helper.cu
+        src/c_api/graph_helper_sg.cu
+        src/c_api/graph_helper_mg.cu
         src/c_api/graph_generators.cpp
         src/c_api/induced_subgraph_result.cpp
         src/c_api/hits.cpp
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 3428562510f..5d04cd9b539 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = "libcugraph"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER=23.04
+PROJECT_NUMBER=23.06
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 5eb347eb716..3bb98ce4150 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -677,47 +677,6 @@ void flatten_dendrogram(raft::handle_t const& handle,
                         Dendrogram<typename graph_view_t::vertex_type> const& dendrogram,
                         typename graph_view_t::vertex_type* clustering);
 
-/**
- * @brief      Legacy Leiden implementation
- *
- * Compute a clustering of the graph by maximizing modularity using the Leiden improvements
- * to the Louvain method.
- *
- * Computed using the Leiden method described in:
- *
- *    Traag, V. A., Waltman, L., & van Eck, N. J. (2019). From Louvain to Leiden:
- *    guaranteeing well-connected communities. Scientific reports, 9(1), 5233.
- *    doi: 10.1038/s41598-019-41695-z
- *
- * @throws cugraph::logic_error when an error occurs.
- *
- * @tparam vertex_t                  Type of vertex identifiers.
- *                                   Supported value : int (signed, 32-bit)
- * @tparam edge_t                    Type of edge identifiers.
- *                                   Supported value : int (signed, 32-bit)
- * @tparam weight_t                  Type of edge weights. Supported values : float or double.
- *
- * @param[in]  handle                Library handle (RAFT). If a communicator is set in the handle,
- * @param[in]  graph                 input graph object (CSR)
- * @param[out] clustering            Pointer to device array where the clustering should be stored
- * @param[in]  max_level             (optional) maximum number of levels to run (default 100)
- * @param[in]  resolution            (optional) The value of the resolution parameter to use.
- *                                   Called gamma in the modularity formula, this changes the size
- *                                   of the communities.  Higher resolutions lead to more smaller
- *                                   communities, lower resolutions lead to fewer larger
- * communities. (default 1)
- *
- * @return                           a pair containing:
- *                                     1) number of levels of the returned clustering
- *                                     2) modularity of the returned clustering
- */
-template <typename vertex_t, typename edge_t, typename weight_t>
-std::pair<size_t, weight_t> leiden(raft::handle_t const& handle,
-                                   legacy::GraphCSRView<vertex_t, edge_t, weight_t> const& graph,
-                                   vertex_t* clustering,
-                                   size_t max_level    = 100,
-                                   weight_t resolution = weight_t{1});
-
 /**
  * @brief      Leiden implementation
  *
@@ -738,7 +697,9 @@ std::pair<size_t, weight_t> leiden(raft::handle_t const& handle,
  *                                   Supported value : int (signed, 32-bit)
  * @tparam weight_t                  Type of edge weights. Supported values : float or double.
  *
- * @param[in]  handle                Library handle (RAFT). If a communicator is set in the handle,
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state The RngState instance holding pseudo-random number generator state.
  * @param graph_view Graph view object.
  * @param edge_weight_view Optional view object holding edge weights for @p graph_view. If @p
  * edge_weight_view.has_value() == false, edge weights are assumed to be 1.0.
@@ -748,6 +709,10 @@ std::pair<size_t, weight_t> leiden(raft::handle_t const& handle,
  *                                   of the communities.  Higher resolutions lead to more smaller
  *                                   communities, lower resolutions lead to fewer larger
  * communities. (default 1)
+ * @param[in]  theta                 (optional) The value of the parameter to scale modularity
+ *                                    gain in Leiden refinement phase. It is used to compute
+ *                                    the probability of joining a random leiden community.
+ *                                    Called theta in the Leiden algorithm.
  *
  * @return                           a pair containing:
  *                                     1) unique pointer to dendrogram
@@ -757,10 +722,12 @@ std::pair<size_t, weight_t> leiden(raft::handle_t const& handle,
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
 std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
   size_t max_level    = 100,
-  weight_t resolution = weight_t{1});
+  weight_t resolution = weight_t{1},
+  weight_t theta      = weight_t{1});
 
 /**
  * @brief      Leiden implementation
@@ -782,7 +749,9 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
  *                                   Supported value : int (signed, 32-bit)
  * @tparam weight_t                  Type of edge weights. Supported values : float or double.
  *
- * @param[in]  handle                Library handle (RAFT). If a communicator is set in the handle,
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state The RngState instance holding pseudo-random number generator state.
  * @param graph_view Graph view object.
  * @param edge_weight_view Optional view object holding edge weights for @p graph_view. If @p
  * edge_weight_view.has_value() == false, edge weights are assumed to be 1.0.
@@ -792,6 +761,11 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
  *                                   of the communities.  Higher resolutions lead to more smaller
  *                                   communities, lower resolutions lead to fewer larger
  * communities. (default 1)
+ * @param[in]  theta                 (optional) The value of the parameter to scale modularity
+ *                                    gain in Leiden refinement phase. It is used to compute
+ *                                    the probability of joining a random leiden community.
+ *                                    Called theta in the Leiden algorithm.
+ * communities. (default 1)
  *
  * @return                           a pair containing:
  *                                     1) number of levels of the returned clustering
@@ -800,11 +774,13 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
 std::pair<size_t, weight_t> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
   vertex_t* clustering,  // FIXME: Use (device_)span instead
   size_t max_level    = 100,
-  weight_t resolution = weight_t{1});
+  weight_t resolution = weight_t{1},
+  weight_t theta      = weight_t{1});
 
 /**
  * @brief Computes the ecg clustering of the given graph.
@@ -1667,7 +1643,7 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle,
                                 vertex_t const* ptr_d_start,
                                 size_t num_start_vertices,
                                 size_t sampling_size,
-                                ops::gnn::graph::SamplingAlgoT sampling_algo);
+                                ops::graph::SamplingAlgoT sampling_algo);
 
 /**
  * @brief generate sub-sampled graph as an edge list (COO format) given input graph,
@@ -1697,7 +1673,7 @@ std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> sample_
   vertex_t const* ptr_d_start,
   size_t num_start_vertices,
   size_t sampling_size,
-  ops::gnn::graph::SamplingAlgoT sampling_algo);
+  ops::graph::SamplingAlgoT sampling_algo);
 #endif
 
 /**
@@ -2033,6 +2009,25 @@ std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<vertex_t>> k_hop_nbr
   size_t k,
   bool do_expensive_check = false);
 
+/*
+ * @brief Find a Maximal Independent Set
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object.
+ * @param rng_state The RngState instance holding pseudo-random number generator state.
+ * @return A device vector containing vertices found in the maximal independent set
+ */
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> maximal_independent_set(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state);
+
 }  // namespace cugraph
 
 /**
diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh
index 2e18a71898f..02b931fbde6 100644
--- a/cpp/include/cugraph/edge_partition_device_view.cuh
+++ b/cpp/include/cugraph/edge_partition_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -265,7 +265,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
 
   __host__ __device__ vertex_t minor_range_first() const noexcept { return minor_range_first_; }
 
-  __host__ __device__ vertex_t minor_rage_last() const noexcept { return minor_range_last_; }
+  __host__ __device__ vertex_t minor_range_last() const noexcept { return minor_range_last_; }
 
   __host__ __device__ vertex_t minor_range_size() const noexcept
   {
diff --git a/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh b/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh
index 688d9ecf82f..f71fc167d12 100644
--- a/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh
+++ b/cpp/include/cugraph/edge_partition_edge_property_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,9 @@
 #pragma once
 
 #include <cugraph/edge_property.hpp>
+#include <cugraph/utilities/atomic_ops.cuh>
+#include <cugraph/utilities/packed_bool_utils.hpp>
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
 
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/optional.h>
@@ -25,10 +28,18 @@ namespace cugraph {
 
 namespace detail {
 
-template <typename edge_t, typename ValueIterator>
+template <typename edge_t,
+          typename ValueIterator,
+          typename value_t = typename thrust::iterator_traits<ValueIterator>::value_type>
 class edge_partition_edge_property_device_view_t {
  public:
-  using value_type = typename thrust::iterator_traits<ValueIterator>::value_type;
+  static_assert(
+    std::is_same_v<typename thrust::iterator_traits<ValueIterator>::value_type, value_t> ||
+    cugraph::has_packed_bool_element<ValueIterator, value_t>());
+  static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<value_t>::value);
+
+  using edge_type  = edge_t;
+  using value_type = value_t;
 
   edge_partition_edge_property_device_view_t() = default;
 
@@ -41,9 +52,116 @@ class edge_partition_edge_property_device_view_t {
 
   __host__ __device__ ValueIterator value_first() { return value_first_; }
 
-  __device__ ValueIterator get_iter(edge_t offset) const { return value_first_ + offset; }
+  __device__ value_t get(edge_t offset) const
+  {
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(offset);
+      return static_cast<bool>(*(value_first_ + cugraph::packed_bool_offset(offset)) & mask);
+    } else {
+      return *(value_first_ + offset);
+    }
+  }
+
+  template <typename Iter = ValueIterator>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>>,
+    void>
+  set(edge_t offset, value_t val) const
+  {
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(offset);
+      if (val) {
+        atomicOr(value_first_ + cugraph::packed_bool_offset(offset), mask);
+      } else {
+        atomicAnd(value_first_ + cugraph::packed_bool_offset(offset), ~mask);
+      }
+    } else {
+      *(value_first_ + offset) = val;
+    }
+  }
+
+  template <typename Iter = ValueIterator>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>>,
+    value_t>
+  atomic_and(edge_t offset, value_t val) const
+  {
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(offset);
+      auto old  = atomicAnd(value_first_ + cugraph::packed_bool_offset(offset),
+                           val ? uint32_t{0xffffffff} : ~mask);
+      return static_cast<bool>(old & mask);
+    } else {
+      return cugraph::atomic_and(value_first_ + offset, val);
+    }
+  }
+
+  template <typename Iter = ValueIterator>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>>,
+    value_t>
+  atomic_or(edge_t offset, value_t val) const
+  {
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(offset);
+      auto old =
+        atomicOr(value_first_ + cugraph::packed_bool_offset(offset), val ? mask : uint32_t{0});
+      return static_cast<bool>(old & mask);
+    } else {
+      return cugraph::atomic_or(value_first_ + offset, val);
+    }
+  }
 
-  __device__ value_type get(edge_t offset) const { return *get_iter(offset); }
+  template <typename Iter = ValueIterator, typename T = value_t>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>> &&
+      !cugraph::has_packed_bool_element<Iter, T>() /* add undefined for (packed-)bool */,
+    value_t>
+  atomic_add(edge_t offset, value_t val) const
+  {
+    cugraph::atomic_add(value_first_ + offset, val);
+  }
+
+  template <typename Iter = ValueIterator>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>>,
+    value_t>
+  elementwise_atomic_cas(edge_t offset, value_t compare, value_t val) const
+  {
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(offset);
+      auto old  = val ? atomicOr(value_first_ + cugraph::packed_bool_offset(offset), mask)
+                      : atomicAnd(value_first_ + cugraph::packed_bool_offset(offset), ~mask);
+      return static_cast<bool>(old & mask);
+    } else {
+      return cugraph::elementwise_atomic_cas(value_first_ + offset, compare, val);
+    }
+  }
+
+  template <typename Iter = ValueIterator, typename T = value_t>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>> &&
+      !cugraph::has_packed_bool_element<Iter, T>() /* min undefined for (packed-)bool */,
+    value_t>
+  elementwise_atomic_min(edge_t offset, value_t val) const
+  {
+    cugraph::elementwise_atomic_min(value_first_ + offset, val);
+  }
+
+  template <typename Iter = ValueIterator, typename T = value_t>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>> &&
+      !cugraph::has_packed_bool_element<Iter, T>() /* max undefined for (packed-)bool */,
+    value_t>
+  elementwise_atomic_max(edge_t offset, value_t val) const
+  {
+    cugraph::elementwise_atomic_max(value_first_ + offset, val);
+  }
 
  private:
   ValueIterator value_first_{};
@@ -52,6 +170,7 @@ class edge_partition_edge_property_device_view_t {
 template <typename edge_t>
 class edge_partition_edge_dummy_property_device_view_t {
  public:
+  using edge_type  = edge_t;
   using value_type = thrust::nullopt_t;
 
   edge_partition_edge_dummy_property_device_view_t() = default;
diff --git a/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh b/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh
index 459547198a5..1ff279fbdca 100644
--- a/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh
+++ b/cpp/include/cugraph/edge_partition_endpoint_property_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,9 @@
 #pragma once
 
 #include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/utilities/atomic_ops.cuh>
 #include <cugraph/utilities/device_functors.cuh>
+#include <cugraph/utilities/packed_bool_utils.hpp>
 
 #include <raft/core/device_span.hpp>
 
@@ -32,15 +34,22 @@ namespace cugraph {
 
 namespace detail {
 
-template <typename vertex_t, typename ValueIterator>
+template <typename vertex_t,
+          typename ValueIterator,
+          typename value_t = typename thrust::iterator_traits<ValueIterator>::value_type>
 class edge_partition_endpoint_property_device_view_t {
  public:
-  using value_type = typename thrust::iterator_traits<ValueIterator>::value_type;
+  static_assert(
+    std::is_same_v<typename thrust::iterator_traits<ValueIterator>::value_type, value_t> ||
+    cugraph::has_packed_bool_element<ValueIterator, value_t>());
+
+  using vertex_type = vertex_t;
+  using value_type  = value_t;
 
   edge_partition_endpoint_property_device_view_t() = default;
 
   edge_partition_endpoint_property_device_view_t(
-    edge_major_property_view_t<vertex_t, ValueIterator> const& view, size_t partition_idx)
+    edge_major_property_view_t<vertex_t, ValueIterator, value_t> const& view, size_t partition_idx)
     : value_first_(view.value_firsts()[partition_idx]),
       range_first_(view.major_range_firsts()[partition_idx])
   {
@@ -54,7 +63,7 @@ class edge_partition_endpoint_property_device_view_t {
   }
 
   edge_partition_endpoint_property_device_view_t(
-    edge_minor_property_view_t<vertex_t, ValueIterator> const& view)
+    edge_minor_property_view_t<vertex_t, ValueIterator, value_t> const& view)
   {
     if (view.keys()) {
       keys_                    = *(view.keys());
@@ -65,25 +74,104 @@ class edge_partition_endpoint_property_device_view_t {
     range_first_ = view.minor_range_first();
   }
 
-  __device__ ValueIterator get_iter(vertex_t offset) const
+  __device__ value_t get(vertex_t offset) const
   {
-    auto value_offset = offset;
-    if (keys_) {
-      auto chunk_idx = static_cast<size_t>(offset) / (*key_chunk_size_);
-      auto it        = thrust::lower_bound(thrust::seq,
-                                    (*keys_).begin() + (*key_chunk_start_offsets_)[chunk_idx],
-                                    (*keys_).begin() + (*key_chunk_start_offsets_)[chunk_idx + 1],
-                                    range_first_ + offset);
-      assert((it != (*keys_).begin() + (*key_chunk_start_offsets_)[chunk_idx + 1]) &&
-             (*it == (range_first_ + offset)));
-      value_offset = (*key_chunk_start_offsets_)[chunk_idx] +
-                     static_cast<vertex_t>(thrust::distance(
-                       (*keys_).begin() + (*key_chunk_start_offsets_)[chunk_idx], it));
+    auto val_offset = value_offset(offset);
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(val_offset);
+      return static_cast<bool>(*(value_first_ + cugraph::packed_bool_offset(val_offset)) & mask);
+    } else {
+      return *(value_first_ + val_offset);
     }
-    return value_first_ + value_offset;
   }
 
-  __device__ value_type get(vertex_t offset) const { return *get_iter(offset); }
+  template <typename Iter = ValueIterator>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>>,
+    value_t>
+  atomic_and(vertex_t offset, value_t val) const
+  {
+    auto val_offset = value_offset(offset);
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(val_offset);
+      auto old  = atomicAnd(value_first_ + cugraph::packed_bool_offset(val_offset),
+                           val ? cugraph::packed_bool_full_mask() : ~mask);
+      return static_cast<bool>(old & mask);
+    } else {
+      return cugraph::atomic_and(value_first_ + val_offset, val);
+    }
+  }
+
+  template <typename Iter = ValueIterator>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>>,
+    value_t>
+  atomic_or(vertex_t offset, value_t val) const
+  {
+    auto val_offset = value_offset(offset);
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(val_offset);
+      auto old  = atomicOr(value_first_ + cugraph::packed_bool_offset(val_offset),
+                          val ? mask : cugraph::packed_bool_empty_mask());
+      return static_cast<bool>(old & mask);
+    } else {
+      return cugraph::atomic_or(value_first_ + val_offset, val);
+    }
+  }
+
+  template <typename Iter = ValueIterator, typename T = value_t>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>> &&
+      !cugraph::has_packed_bool_element<Iter, T>() /* add undefined for (packed-)bool */,
+    value_t>
+  atomic_add(vertex_t offset, value_t val) const
+  {
+    auto val_offset = value_offset(offset);
+    cugraph::atomic_add(value_first_ + val_offset, val);
+  }
+
+  template <typename Iter = ValueIterator>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>>,
+    value_t>
+  elementwise_atomic_cas(vertex_t offset, value_t compare, value_t val) const
+  {
+    auto val_offset = value_offset(offset);
+    if constexpr (cugraph::has_packed_bool_element<ValueIterator, value_t>()) {
+      static_assert(std::is_arithmetic_v<value_t>, "unimplemented for thrust::tuple types.");
+      auto mask = cugraph::packed_bool_mask(val_offset);
+      auto old  = val ? atomicOr(value_first_ + cugraph::packed_bool_offset(val_offset), mask)
+                      : atomicAnd(value_first_ + cugraph::packed_bool_offset(val_offset), ~mask);
+      return static_cast<bool>(old & mask);
+    } else {
+      return cugraph::elementwise_atomic_cas(value_first_ + val_offset, compare, val);
+    }
+  }
+
+  template <typename Iter = ValueIterator, typename T = value_t>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>> &&
+      !cugraph::has_packed_bool_element<Iter, T>() /* min undefined for (packed-)bool */,
+    value_t>
+  elementwise_atomic_min(vertex_t offset, value_t val) const
+  {
+    auto val_offset = value_offset(offset);
+    cugraph::elementwise_atomic_min(value_first_ + val_offset, val);
+  }
+
+  template <typename Iter = ValueIterator, typename T = value_t>
+  __device__ std::enable_if_t<
+    !std::is_const_v<std::remove_reference_t<typename std::iterator_traits<Iter>::reference>> &&
+      !cugraph::has_packed_bool_element<Iter, T>() /* max undefined for (packed-)bool */,
+    value_t>
+  elementwise_atomic_max(vertex_t offset, value_t val) const
+  {
+    auto val_offset = value_offset(offset);
+    cugraph::elementwise_atomic_max(value_first_ + val_offset, val);
+  }
 
  private:
   thrust::optional<raft::device_span<vertex_t const>> keys_{thrust::nullopt};
@@ -92,12 +180,31 @@ class edge_partition_endpoint_property_device_view_t {
 
   ValueIterator value_first_{};
   vertex_t range_first_{};
+
+  __device__ vertex_t value_offset(vertex_t offset) const
+  {
+    auto val_offset = offset;
+    if (keys_) {
+      auto chunk_idx = static_cast<size_t>(offset) / (*key_chunk_size_);
+      auto it        = thrust::lower_bound(thrust::seq,
+                                    (*keys_).begin() + (*key_chunk_start_offsets_)[chunk_idx],
+                                    (*keys_).begin() + (*key_chunk_start_offsets_)[chunk_idx + 1],
+                                    range_first_ + offset);
+      assert((it != (*keys_).begin() + (*key_chunk_start_offsets_)[chunk_idx + 1]) &&
+             (*it == (range_first_ + offset)));
+      val_offset = (*key_chunk_start_offsets_)[chunk_idx] +
+                   static_cast<vertex_t>(thrust::distance(
+                     (*keys_).begin() + (*key_chunk_start_offsets_)[chunk_idx], it));
+    }
+    return val_offset;
+  }
 };
 
 template <typename vertex_t>
 class edge_partition_endpoint_dummy_property_device_view_t {
  public:
-  using value_type = thrust::nullopt_t;
+  using vertex_type = vertex_t;
+  using value_type  = thrust::nullopt_t;
 
   edge_partition_endpoint_dummy_property_device_view_t() = default;
 
diff --git a/cpp/include/cugraph/edge_property.hpp b/cpp/include/cugraph/edge_property.hpp
index fdd28bc1eb6..8904006a2a2 100644
--- a/cpp/include/cugraph/edge_property.hpp
+++ b/cpp/include/cugraph/edge_property.hpp
@@ -17,21 +17,30 @@
 #pragma once
 
 #include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/packed_bool_utils.hpp>
 #include <cugraph/utilities/thrust_tuple_utils.hpp>
 
 #include <raft/core/handle.hpp>
 
 #include <thrust/iterator/iterator_traits.h>
+#include <thrust/optional.h>
 
 #include <optional>
 #include <type_traits>
 
 namespace cugraph {
 
-template <typename edge_t, typename ValueIterator>
+template <typename edge_t,
+          typename ValueIterator,
+          typename value_t = typename thrust::iterator_traits<ValueIterator>::value_type>
 class edge_property_view_t {
  public:
-  using value_type     = typename thrust::iterator_traits<ValueIterator>::value_type;
+  static_assert(
+    std::is_same_v<typename thrust::iterator_traits<ValueIterator>::value_type, value_t> ||
+    cugraph::has_packed_bool_element<ValueIterator, value_t>());
+
+  using edge_type      = edge_t;
+  using value_type     = value_t;
   using value_iterator = ValueIterator;
 
   edge_property_view_t() = default;
@@ -61,6 +70,8 @@ class edge_dummy_property_view_t {
 template <typename GraphViewType, typename T>
 class edge_property_t {
  public:
+  static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
   using edge_type   = typename GraphViewType::edge_type;
   using value_type  = T;
   using buffer_type = decltype(allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}));
@@ -70,18 +81,39 @@ class edge_property_t {
   edge_property_t(raft::handle_t const& handle, GraphViewType const& graph_view)
   {
     buffers_.reserve(graph_view.number_of_local_edge_partitions());
+    edge_counts_ = std::vector<edge_type>(graph_view.number_of_local_edge_partitions(), 0);
     for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-      buffers_.push_back(allocate_dataframe_buffer<T>(
-        graph_view.local_edge_partition_view(i).number_of_edges(), handle.get_stream()));
+      auto num_edges =
+        static_cast<size_t>(graph_view.local_edge_partition_view(i).number_of_edges());
+      size_t buffer_size =
+        std::is_same_v<T, bool> ? cugraph::packed_bool_size(num_edges) : num_edges;
+      buffers_.push_back(
+        allocate_dataframe_buffer<std::conditional_t<std::is_same_v<T, bool>, uint32_t, T>>(
+          buffer_size, handle.get_stream()));
+      edge_counts_[i] = num_edges;
+    }
+  }
+
+  template <typename value_type = T, typename = std::enable_if_t<!std::is_same_v<value_type, bool>>>
+  edge_property_t(std::vector<buffer_type>&& buffers) : buffers_(std::move(buffers))
+  {
+    edge_counts_.resize(buffers_.size());
+    for (size_t i = 0; i < edge_counts_.size(); ++i) {
+      edge_counts_[i] = size_dataframe_buffer(buffers_[i]);
     }
   }
 
-  edge_property_t(std::vector<buffer_type>&& buffers) : buffers_(std::move(buffers)) {}
+  edge_property_t(std::vector<buffer_type>&& buffers, std::vector<edge_type>&& edge_counts)
+    : buffers_(std::move(buffers)), edge_counts_(std::move(edge_counts))
+  {
+  }
 
   void clear(raft::handle_t const& handle)
   {
     buffers_.clear();
     buffers_.shrink_to_fit();
+    edge_counts_.clear();
+    edge_counts_.shrink_to_fit();
   }
 
   auto view() const
@@ -92,11 +124,11 @@ class edge_property_t {
     std::vector<edge_type> edge_partition_edge_counts(buffers_.size());
     for (size_t i = 0; i < edge_partition_value_firsts.size(); ++i) {
       edge_partition_value_firsts[i] = get_dataframe_buffer_cbegin(buffers_[i]);
-      edge_partition_edge_counts[i]  = size_dataframe_buffer(buffers_[i]);
+      edge_partition_edge_counts[i]  = edge_counts_[i];
     }
 
-    return edge_property_view_t<edge_type, const_value_iterator>(edge_partition_value_firsts,
-                                                                 edge_partition_edge_counts);
+    return edge_property_view_t<edge_type, const_value_iterator, T>(edge_partition_value_firsts,
+                                                                    edge_partition_edge_counts);
   }
 
   auto mutable_view()
@@ -107,15 +139,16 @@ class edge_property_t {
     std::vector<edge_type> edge_partition_edge_counts(buffers_.size());
     for (size_t i = 0; i < edge_partition_value_firsts.size(); ++i) {
       edge_partition_value_firsts[i] = get_dataframe_buffer_begin(buffers_[i]);
-      edge_partition_edge_counts[i]  = size_dataframe_buffer(buffers_[i]);
+      edge_partition_edge_counts[i]  = edge_counts_[i];
     }
 
-    return edge_property_view_t<edge_type, value_iterator>(edge_partition_value_firsts,
-                                                           edge_partition_edge_counts);
+    return edge_property_view_t<edge_type, value_iterator, T>(edge_partition_value_firsts,
+                                                              edge_partition_edge_counts);
   }
 
  private:
   std::vector<buffer_type> buffers_{};
+  std::vector<edge_type> edge_counts_{};
 };
 
 class edge_dummy_property_t {
@@ -125,11 +158,12 @@ class edge_dummy_property_t {
   auto view() const { return edge_dummy_property_view_t{}; }
 };
 
-template <typename edge_t, typename... Ts>
-auto view_concat(edge_property_view_t<edge_t, Ts> const&... views)
+template <typename edge_t, typename... Iters, typename... Types>
+auto view_concat(edge_property_view_t<edge_t, Iters, Types> const&... views)
 {
   using concat_value_iterator = decltype(thrust::make_zip_iterator(
     thrust_tuple_cat(to_thrust_iterator_tuple(views.value_firsts()[0])...)));
+  using concat_value_type     = decltype(thrust_tuple_cat(to_thrust_tuple(Types{})...));
 
   std::vector<concat_value_iterator> edge_partition_concat_value_firsts{};
   auto first_view = get_first_of_pack(views...);
@@ -139,8 +173,8 @@ auto view_concat(edge_property_view_t<edge_t, Ts> const&... views)
       thrust_tuple_cat(to_thrust_iterator_tuple(views.value_firsts()[i])...));
   }
 
-  return edge_property_view_t<edge_t, concat_value_iterator>(edge_partition_concat_value_firsts,
-                                                             first_view.edge_counts());
+  return edge_property_view_t<edge_t, concat_value_iterator, concat_value_type>(
+    edge_partition_concat_value_firsts, first_view.edge_counts());
 }
 
 }  // namespace cugraph
diff --git a/cpp/include/cugraph/edge_src_dst_property.hpp b/cpp/include/cugraph/edge_src_dst_property.hpp
index f894d3d1a60..d27f6856428 100644
--- a/cpp/include/cugraph/edge_src_dst_property.hpp
+++ b/cpp/include/cugraph/edge_src_dst_property.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/packed_bool_utils.hpp>
 #include <cugraph/utilities/thrust_tuple_utils.hpp>
 
 #include <raft/core/device_span.hpp>
@@ -25,6 +26,7 @@
 
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/zip_iterator.h>
+#include <thrust/optional.h>
 #include <thrust/tuple.h>
 
 #include <optional>
@@ -34,10 +36,17 @@ namespace cugraph {
 
 namespace detail {
 
-template <typename vertex_t, typename ValueIterator>
+template <typename vertex_t,
+          typename ValueIterator,
+          typename value_t = typename thrust::iterator_traits<ValueIterator>::value_type>
 class edge_major_property_view_t {
  public:
-  using value_type     = typename thrust::iterator_traits<ValueIterator>::value_type;
+  static_assert(
+    std::is_same_v<typename thrust::iterator_traits<ValueIterator>::value_type, value_t> ||
+    cugraph::has_packed_bool_element<ValueIterator, value_t>());
+
+  using vertex_type    = vertex_t;
+  using value_type     = value_t;
   using value_iterator = ValueIterator;
 
   edge_major_property_view_t() = default;
@@ -101,10 +110,17 @@ class edge_major_property_view_t {
   std::vector<vertex_t> edge_partition_major_range_firsts_{};
 };
 
-template <typename vertex_t, typename ValueIterator>
+template <typename vertex_t,
+          typename ValueIterator,
+          typename value_t = typename thrust::iterator_traits<ValueIterator>::value_type>
 class edge_minor_property_view_t {
  public:
-  using value_type     = typename thrust::iterator_traits<ValueIterator>::value_type;
+  static_assert(
+    std::is_same_v<typename thrust::iterator_traits<ValueIterator>::value_type, value_t> ||
+    cugraph::has_packed_bool_element<ValueIterator, value_t>());
+
+  using vertex_type    = vertex_t;
+  using value_type     = value_t;
   using value_iterator = ValueIterator;
 
   edge_minor_property_view_t() = default;
@@ -152,7 +168,11 @@ class edge_minor_property_view_t {
 template <typename vertex_t, typename T>
 class edge_major_property_t {
  public:
-  using buffer_type = decltype(allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}));
+  static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  using buffer_type =
+    decltype(allocate_dataframe_buffer<std::conditional_t<std::is_same_v<T, bool>, uint32_t, T>>(
+      size_t{0}, rmm::cuda_stream_view{}));
 
   edge_major_property_t(raft::handle_t const& handle) {}
 
@@ -163,8 +183,12 @@ class edge_major_property_t {
   {
     buffers_.reserve(edge_partition_major_range_firsts_.size());
     for (size_t i = 0; i < edge_partition_major_range_firsts_.size(); ++i) {
+      size_t buffer_size = std::is_same_v<T, bool>
+                             ? cugraph::packed_bool_size(edge_partition_major_range_sizes[i])
+                             : edge_partition_major_range_sizes[i];
       buffers_.push_back(
-        allocate_dataframe_buffer<T>(edge_partition_major_range_sizes[i], handle.get_stream()));
+        allocate_dataframe_buffer<std::conditional_t<std::is_same_v<T, bool>, uint32_t, T>>(
+          buffer_size, handle.get_stream()));
     }
   }
 
@@ -181,8 +205,12 @@ class edge_major_property_t {
   {
     buffers_.reserve(edge_partition_major_range_firsts_.size());
     for (size_t i = 0; i < edge_partition_major_range_firsts_.size(); ++i) {
+      size_t buffer_size = std::is_same_v<T, bool>
+                             ? cugraph::packed_bool_size(edge_partition_keys[i].size())
+                             : edge_partition_keys[i].size();
       buffers_.push_back(
-        allocate_dataframe_buffer<T>(edge_partition_keys[i].size(), handle.get_stream()));
+        allocate_dataframe_buffer<std::conditional_t<std::is_same_v<T, bool>, uint32_t, T>>(
+          buffer_size, handle.get_stream()));
     }
   }
 
@@ -208,14 +236,14 @@ class edge_major_property_t {
     }
 
     if (edge_partition_keys_) {
-      return edge_major_property_view_t<vertex_t, const_value_iterator>(
+      return edge_major_property_view_t<vertex_t, const_value_iterator, T>(
         *edge_partition_keys_,
         *edge_partition_key_chunk_start_offsets_,
         *key_chunk_size_,
         edge_partition_value_firsts,
         edge_partition_major_range_firsts_);
     } else {
-      return edge_major_property_view_t<vertex_t, const_value_iterator>(
+      return edge_major_property_view_t<vertex_t, const_value_iterator, T>(
         edge_partition_value_firsts, edge_partition_major_range_firsts_);
     }
   }
@@ -230,14 +258,14 @@ class edge_major_property_t {
     }
 
     if (edge_partition_keys_) {
-      return edge_major_property_view_t<vertex_t, value_iterator>(
+      return edge_major_property_view_t<vertex_t, value_iterator, T>(
         *edge_partition_keys_,
         *edge_partition_key_chunk_start_offsets_,
         *key_chunk_size_,
         edge_partition_value_firsts,
         edge_partition_major_range_firsts_);
     } else {
-      return edge_major_property_view_t<vertex_t, value_iterator>(
+      return edge_major_property_view_t<vertex_t, value_iterator, T>(
         edge_partition_value_firsts, edge_partition_major_range_firsts_);
     }
   }
@@ -256,8 +284,11 @@ class edge_major_property_t {
 template <typename vertex_t, typename T>
 class edge_minor_property_t {
  public:
+  static_assert(cugraph::is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
   edge_minor_property_t(raft::handle_t const& handle)
-    : buffer_(allocate_dataframe_buffer<T>(size_t{0}, handle.get_stream())),
+    : buffer_(allocate_dataframe_buffer<std::conditional_t<std::is_same_v<T, bool>, uint32_t, T>>(
+        size_t{0}, handle.get_stream())),
       minor_range_first_(vertex_t{0})
   {
   }
@@ -265,7 +296,9 @@ class edge_minor_property_t {
   edge_minor_property_t(raft::handle_t const& handle,
                         vertex_t buffer_size,
                         vertex_t minor_range_first)
-    : buffer_(allocate_dataframe_buffer<T>(buffer_size, handle.get_stream())),
+    : buffer_(allocate_dataframe_buffer<std::conditional_t<std::is_same_v<T, bool>, uint32_t, T>>(
+        std::is_same_v<T, bool> ? cugraph::packed_bool_size(buffer_size) : buffer_size,
+        handle.get_stream())),
       minor_range_first_(minor_range_first)
   {
   }
@@ -278,7 +311,9 @@ class edge_minor_property_t {
     : keys_(keys),
       key_chunk_start_offsets_(key_chunk_start_offsets),
       key_chunk_size_(key_chunk_size),
-      buffer_(allocate_dataframe_buffer<T>(keys.size(), handle.get_stream())),
+      buffer_(allocate_dataframe_buffer<std::conditional_t<std::is_same_v<T, bool>, uint32_t, T>>(
+        std::is_same_v<T, bool> ? cugraph::packed_bool_size(keys.size()) : keys.size(),
+        handle.get_stream())),
       minor_range_first_(minor_range_first)
   {
   }
@@ -298,11 +333,11 @@ class edge_minor_property_t {
   {
     auto value_first = get_dataframe_buffer_cbegin(buffer_);
     if (keys_) {
-      return edge_minor_property_view_t<vertex_t, decltype(value_first)>(
+      return edge_minor_property_view_t<vertex_t, decltype(value_first), T>(
         *keys_, *key_chunk_start_offsets_, *key_chunk_size_, value_first, minor_range_first_);
     } else {
-      return edge_minor_property_view_t<vertex_t, decltype(value_first)>(value_first,
-                                                                         minor_range_first_);
+      return edge_minor_property_view_t<vertex_t, decltype(value_first), T>(value_first,
+                                                                            minor_range_first_);
     }
   }
 
@@ -310,11 +345,11 @@ class edge_minor_property_t {
   {
     auto value_first = get_dataframe_buffer_begin(buffer_);
     if (keys_) {
-      return edge_minor_property_view_t<vertex_t, decltype(value_first)>(
+      return edge_minor_property_view_t<vertex_t, decltype(value_first), T>(
         *keys_, *key_chunk_start_offsets_, *key_chunk_size_, value_first, minor_range_first_);
     } else {
-      return edge_minor_property_view_t<vertex_t, decltype(value_first)>(value_first,
-                                                                         minor_range_first_);
+      return edge_minor_property_view_t<vertex_t, decltype(value_first), T>(value_first,
+                                                                            minor_range_first_);
     }
   }
 
@@ -323,7 +358,8 @@ class edge_minor_property_t {
   std::optional<raft::device_span<vertex_t const>> key_chunk_start_offsets_{std::nullopt};
   std::optional<size_t> key_chunk_size_{std::nullopt};
 
-  decltype(allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{})) buffer_;
+  decltype(allocate_dataframe_buffer<std::conditional_t<std::is_same_v<T, bool>, uint32_t, T>>(
+    size_t{0}, rmm::cuda_stream_view{})) buffer_;
   vertex_t minor_range_first_{};
 };
 
@@ -338,9 +374,10 @@ class edge_endpoint_dummy_property_view_t {
 template <typename GraphViewType, typename T>
 class edge_src_property_t {
  public:
-  using value_type = T;
   static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
 
+  using value_type = T;
+
   edge_src_property_t(raft::handle_t const& handle) : property_(handle) {}
 
   edge_src_property_t(raft::handle_t const& handle, GraphViewType const& graph_view)
@@ -429,10 +466,10 @@ class edge_src_property_t {
 template <typename GraphViewType, typename T>
 class edge_dst_property_t {
  public:
-  using value_type = T;
-
   static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
 
+  using value_type = T;
+
   edge_dst_property_t(raft::handle_t const& handle) : property_(handle) {}
 
   edge_dst_property_t(raft::handle_t const& handle, GraphViewType const& graph_view)
@@ -532,11 +569,12 @@ class edge_dst_dummy_property_t {
   auto view() const { return detail::edge_endpoint_dummy_property_view_t{}; }
 };
 
-template <typename vertex_t, typename... Ts>
-auto view_concat(detail::edge_major_property_view_t<vertex_t, Ts> const&... views)
+template <typename vertex_t, typename... Iters, typename... Types>
+auto view_concat(detail::edge_major_property_view_t<vertex_t, Iters, Types> const&... views)
 {
   using concat_value_iterator = decltype(thrust::make_zip_iterator(
     thrust_tuple_cat(to_thrust_iterator_tuple(views.value_firsts()[0])...)));
+  using concat_value_type     = decltype(thrust_tuple_cat(to_thrust_tuple(Types{})...));
 
   std::vector<concat_value_iterator> edge_partition_concat_value_firsts{};
   auto first_view = get_first_of_pack(views...);
@@ -547,23 +585,24 @@ auto view_concat(detail::edge_major_property_view_t<vertex_t, Ts> const&... view
   }
 
   if (first_view.key_chunk_size()) {
-    return detail::edge_major_property_view_t<vertex_t, concat_value_iterator>(
+    return detail::edge_major_property_view_t<vertex_t, concat_value_iterator, concat_value_type>(
       *(first_view.keys()),
       *(first_view.key_chunk_start_offsets()),
       *(first_view.key_chunk_size()),
       edge_partition_concat_value_firsts,
       first_view.major_range_firsts());
   } else {
-    return detail::edge_major_property_view_t<vertex_t, concat_value_iterator>(
+    return detail::edge_major_property_view_t<vertex_t, concat_value_iterator, concat_value_type>(
       edge_partition_concat_value_firsts, first_view.major_range_firsts());
   }
 }
 
-template <typename vertex_t, typename... Ts>
-auto view_concat(detail::edge_minor_property_view_t<vertex_t, Ts> const&... views)
+template <typename vertex_t, typename... Iters, typename... Types>
+auto view_concat(detail::edge_minor_property_view_t<vertex_t, Iters, Types> const&... views)
 {
-  using concat_value_iterator = decltype(
-    thrust::make_zip_iterator(thrust_tuple_cat(to_thrust_iterator_tuple(views.value_first())...)));
+  using concat_value_iterator = decltype(thrust::make_zip_iterator(
+    thrust_tuple_cat(to_thrust_iterator_tuple(views.value_first())...)));
+  using concat_value_type     = decltype(thrust_tuple_cat(to_thrust_tuple(Types{})...));
 
   concat_value_iterator edge_partition_concat_value_first{};
 
@@ -573,14 +612,14 @@ auto view_concat(detail::edge_minor_property_view_t<vertex_t, Ts> const&... view
     thrust::make_zip_iterator(thrust_tuple_cat(to_thrust_iterator_tuple(views.value_first())...));
 
   if (first_view.key_chunk_size()) {
-    return detail::edge_minor_property_view_t<vertex_t, concat_value_iterator>(
+    return detail::edge_minor_property_view_t<vertex_t, concat_value_iterator, concat_value_type>(
       *(first_view.keys()),
       *(first_view.key_chunk_start_offsets()),
       *(first_view.key_chunk_size()),
       edge_partition_concat_value_first,
       first_view.minor_range_first());
   } else {
-    return detail::edge_minor_property_view_t<vertex_t, concat_value_iterator>(
+    return detail::edge_minor_property_view_t<vertex_t, concat_value_iterator, concat_value_type>(
       edge_partition_concat_value_first, first_view.minor_range_first());
   }
 }
diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp
index 233824049f3..60b9f1a4054 100644
--- a/cpp/include/cugraph/graph.hpp
+++ b/cpp/include/cugraph/graph.hpp
@@ -310,27 +310,22 @@ template <typename T>
 struct invalid_idx<
   T,
   typename std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
-  : std::integral_constant<T, -1> {
-};
+  : std::integral_constant<T, -1> {};
 
 template <typename T>
 struct invalid_idx<
   T,
   typename std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
-  : std::integral_constant<T, std::numeric_limits<T>::max()> {
-};
+  : std::integral_constant<T, std::numeric_limits<T>::max()> {};
 
 template <typename vertex_t>
-struct invalid_vertex_id : invalid_idx<vertex_t> {
-};
+struct invalid_vertex_id : invalid_idx<vertex_t> {};
 
 template <typename edge_t>
-struct invalid_edge_id : invalid_idx<edge_t> {
-};
+struct invalid_edge_id : invalid_idx<edge_t> {};
 
 template <typename vertex_t>
-struct invalid_component_id : invalid_idx<vertex_t> {
-};
+struct invalid_component_id : invalid_idx<vertex_t> {};
 
 template <typename vertex_t>
 __host__ __device__ std::enable_if_t<std::is_signed<vertex_t>::value, bool> is_valid_vertex(
diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 64a50b582b5..1c01568ae17 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -892,8 +892,9 @@ weight_t compute_total_edge_weight(
  * or multi-GPU (true).
  * @param  handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
- * @param graph_view Graph view object of the input graph to compute the maximum per-vertex outgoing
- * edge weight sums.
+ * @param graph_view Graph view object of the input graph to select random vertices from.
+ * @param given_set Distributed set to sample from. If @p given_set is not specified, sample from
+ *  the entire vertex range provided by @p graph_view.
  * @param  rng_state The RngState instance holding pseudo-random number generator state.
  * @param  select_count The number of vertices to select from the graph
  * @param  with_replacement If true, select with replacement, if false select without replacement
@@ -904,9 +905,11 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 rmm::device_uvector<vertex_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<raft::device_span<vertex_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check = false);
 
 }  // namespace cugraph
diff --git a/cpp/include/cugraph/graph_generators.hpp b/cpp/include/cugraph/graph_generators.hpp
index fab92259196..4944e0f4917 100644
--- a/cpp/include/cugraph/graph_generators.hpp
+++ b/cpp/include/cugraph/graph_generators.hpp
@@ -127,6 +127,45 @@ std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> generat
   double c           = 0.19,
   bool clip_and_flip = false);
 
+/**
+ * @brief generate an edge list for a bipartite R-mat graph.
+ *
+ * The source vertex IDs will be in the range of [0, 2^src_scale) and the destination vertex IDs
+ * will be in the range of [0, 2^dst_scale). This function allows multi-edges.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param rng_state RAFT RNG state, updated with each call
+ * @param src_scale Scale factor to set the range of source vertex IDs (or the first vertex set) in
+ * the bipartite graph. Vertex IDs have values in [0, V_src), where V_src = 1 << @p src_scale.
+ * @param dst_scale Scale factor to set the range of destination vertex IDs (or the second vertex
+ * set) in the bipartite graph. Vertex IDs have values in [0, V_dst), where V_dst = 1 << @p
+ * dst_scale.
+ * @param num_edges Number of edges to generate.
+ * @param a a, b, c, d (= 1.0 - (a + b + c)) in the R-mat graph generator (vist https://graph500.org
+ * for additional details). a, b, c, d should be non-negative and a + b + c should be no larger
+ * than 1.0.
+ * @param b a, b, c, d (= 1.0 - (a + b + c)) in the R-mat graph generator (vist https://graph500.org
+ * for additional details). a, b, c, d should be non-negative and a + b + c should be no larger
+ * than 1.0.
+ * @param c a, b, c, d (= 1.0 - (a + b + c)) in the R-mat graph generator (vist https://graph500.org
+ * for additional details). a, b, c, d should be non-negative and a + b + c should be no larger
+ * than 1.0.
+ * @return std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> A tuple of
+ * rmm::device_uvector objects for edge source vertex IDs and edge destination vertex IDs.
+ */
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>
+generate_bipartite_rmat_edgelist(raft::handle_t const& handle,
+                                 raft::random::RngState& rng_state,
+                                 size_t src_scale,
+                                 size_t dst_scale,
+                                 size_t num_edges,
+                                 double a = 0.57,
+                                 double b = 0.19,
+                                 double c = 0.19);
+
 enum class generator_distribution_t { POWER_LAW = 0, UNIFORM };
 
 /**
@@ -408,11 +447,30 @@ symmetrize_edgelist_from_triangular(
   std::optional<rmm::device_uvector<weight_t>>&& optional_d_weights_v,
   bool check_diagonal = false);
 
+/**
+ * @brief scramble vertex IDs in a graph
+ *
+ * Given a vertex list for a graph, scramble the input vertex IDs.
+ *
+ * The scramble code here follows the algorithm in the Graph 500 reference
+ * implementation version 3.0.0.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Vector of input vertices
+ * @param lgN The input & output (scrambled) vertex IDs are assumed to be in [0, 2^lgN).
+ * @return rmm::device_uvector object storing scrambled vertex IDs.
+ */
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> scramble_vertex_ids(raft::handle_t const& handle,
+                                                  rmm::device_uvector<vertex_t>&& vertices,
+                                                  size_t lgN);
+
 /**
  * @brief scramble vertex ids in a graph
  *
- * Given an edgelist for a graph, scramble all vertex ids by the given offset.
- * This translation is done in place.
+ * Given an edge list for a graph, scramble the input vertex IDs.
  *
  * The scramble code here follows the algorithm in the Graph 500 reference
  * implementation version 3.0.0.
@@ -420,17 +478,18 @@ symmetrize_edgelist_from_triangular(
  * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
- * @param d_src_v Vector of source vertices
- * @param d_dst_v Vector of destination vertices
- * @param vertex_id_offset Offset to add to each vertex id
- * @param seed Used to initialize random number generator
+ * @param d_src_v Vector of input source vertices
+ * @param d_dst_v Vector of input destination vertices
+ * @param lgN The input & output (scrambled) vertex IDs are assumed to be in [0, 2^lgN).
+ * @return Tuple of two rmm::device_uvector objects storing scrambled source & destination vertex
+ * IDs, respectively.
  */
 template <typename vertex_t>
-void scramble_vertex_ids(raft::handle_t const& handle,
-                         rmm::device_uvector<vertex_t>& d_src_v,
-                         rmm::device_uvector<vertex_t>& d_dst_v,
-                         vertex_t vertex_id_offset,
-                         uint64_t seed = 0);
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> scramble_vertex_ids(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& srcs,
+  rmm::device_uvector<vertex_t>&& dsts,
+  size_t lgN);
 
 /**
  * @brief Combine edgelists from multiple sources into a single edgelist
diff --git a/cpp/include/cugraph/graph_mask.hpp b/cpp/include/cugraph/graph_mask.hpp
index af5b9b01764..2048d3692c7 100644
--- a/cpp/include/cugraph/graph_mask.hpp
+++ b/cpp/include/cugraph/graph_mask.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -128,7 +128,7 @@ struct graph_mask_view_t {
   ~graph_mask_view_t()                            = default;
   graph_mask_view_t(graph_mask_view_t&&) noexcept = default;
 
-  graph_mask_view_t& operator=(graph_mask_view_t&&) noexcept = default;
+  graph_mask_view_t& operator=(graph_mask_view_t&&) noexcept   = default;
   graph_mask_view_t& operator=(graph_mask_view_t const& other) = default;
 
   /**
@@ -231,7 +231,7 @@ struct graph_mask_t {
   {
   }
 
-  graph_mask_t& operator=(graph_mask_t&&) noexcept = default;
+  graph_mask_t& operator=(graph_mask_t&&) noexcept   = default;
   graph_mask_t& operator=(graph_mask_t const& other) = default;
 
   /**
diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp
index 03bfc6c8045..2d10b435224 100644
--- a/cpp/include/cugraph/graph_view.hpp
+++ b/cpp/include/cugraph/graph_view.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_property.hpp>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/error.hpp>
 #include <cugraph/vertex_partition_view.hpp>
@@ -26,8 +27,6 @@
 #include <raft/core/host_span.hpp>
 #include <rmm/device_uvector.hpp>
 
-#include <cugraph/graph_mask.hpp>
-
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -447,6 +446,7 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
 
   edge_t number_of_local_edge_partition_edges(size_t partition_idx) const
   {
+    CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
     return edge_partition_number_of_edges_[partition_idx];
   }
 
@@ -589,6 +589,8 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
   edge_partition_view_t<vertex_t, edge_t, true> local_edge_partition_view(
     size_t partition_idx) const
   {
+    CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
     vertex_t major_range_first{};
     vertex_t major_range_last{};
     vertex_t minor_range_first{};
@@ -737,6 +739,15 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
     return local_sorted_unique_edge_dst_vertex_partition_offsets_;
   }
 
+  void attach_edge_mask(edge_property_view_t<edge_t, uint32_t const*, bool> edge_mask_view)
+  {
+    edge_mask_view_ = edge_mask_view;
+  }
+
+  void clear_edge_mask() { edge_mask_view_ = std::nullopt; }
+
+  bool has_edge_mask() const { return edge_mask_view_.has_value(); }
+
  private:
   std::vector<edge_t const*> edge_partition_offsets_{};
   std::vector<vertex_t const*> edge_partition_indices_{};
@@ -782,6 +793,8 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
                      std::optional<raft::host_span<vertex_t const>>,
                      std::optional<std::byte> /* dummy */>
     local_sorted_unique_edge_dst_vertex_partition_offsets_{std::nullopt};
+
+  std::optional<edge_property_view_t<edge_t, uint32_t const*, bool>> edge_mask_view_{std::nullopt};
 };
 
 // single-GPU version
@@ -1008,12 +1021,23 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
     return std::nullopt;
   }
 
+  void attach_edge_mask(edge_property_view_t<edge_t, uint32_t const*, bool> edge_mask_view)
+  {
+    edge_mask_view_ = edge_mask_view;
+  }
+
+  void clear_edge_mask() { edge_mask_view_ = std::nullopt; }
+
+  bool has_edge_mask() const { return edge_mask_view_.has_value(); }
+
  private:
   edge_t const* offsets_{nullptr};
   vertex_t const* indices_{nullptr};
 
   // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered
   std::optional<std::vector<vertex_t>> segment_offsets_{std::nullopt};
+
+  std::optional<edge_property_view_t<edge_t, uint32_t const*, bool>> edge_mask_view_{std::nullopt};
 };
 
 }  // namespace cugraph
diff --git a/cpp/include/cugraph/legacy/graph.hpp b/cpp/include/cugraph/legacy/graph.hpp
index d207a0a1603..8276853ce7e 100644
--- a/cpp/include/cugraph/legacy/graph.hpp
+++ b/cpp/include/cugraph/legacy/graph.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -553,23 +553,19 @@ template <typename T>
 struct invalid_idx<
   T,
   typename std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value>>
-  : std::integral_constant<T, -1> {
-};
+  : std::integral_constant<T, -1> {};
 
 template <typename T>
 struct invalid_idx<
   T,
   typename std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value>>
-  : std::integral_constant<T, std::numeric_limits<T>::max()> {
-};
+  : std::integral_constant<T, std::numeric_limits<T>::max()> {};
 
 template <typename vertex_t>
-struct invalid_vertex_id : invalid_idx<vertex_t> {
-};
+struct invalid_vertex_id : invalid_idx<vertex_t> {};
 
 template <typename edge_t>
-struct invalid_edge_id : invalid_idx<edge_t> {
-};
+struct invalid_edge_id : invalid_idx<edge_t> {};
 
 }  // namespace legacy
 }  // namespace cugraph
diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp
index 433d99d4e01..309b169e646 100644
--- a/cpp/include/cugraph/partition_manager.hpp
+++ b/cpp/include/cugraph/partition_manager.hpp
@@ -16,6 +16,9 @@
 
 #pragma once
 
+#include <cugraph/utilities/host_scalar_comm.hpp>
+#include <cugraph/utilities/shuffle_comm.cuh>
+
 #include <raft/core/comms.hpp>
 #include <raft/core/handle.hpp>
 
@@ -93,6 +96,39 @@ class partition_manager {
     return std::string(map_major_comm_to_gpu_row_comm ? "gpu_col_comm" : "gpu_row_comm");
   }
 
+  template <typename vertex_t>
+  static std::vector<vertex_t> compute_partition_range_lasts(raft::handle_t const& handle,
+                                                             vertex_t local_partition_size)
+  {
+    auto& comm                 = handle.get_comms();
+    auto const comm_size       = comm.get_size();
+    auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+    auto const major_comm_size = major_comm.get_size();
+    auto const major_comm_rank = major_comm.get_rank();
+    auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+    auto const minor_comm_size = minor_comm.get_size();
+    auto const minor_comm_rank = minor_comm.get_rank();
+
+    auto vertex_counts = host_scalar_allgather(comm, local_partition_size, handle.get_stream());
+    auto vertex_partition_ids =
+      host_scalar_allgather(comm,
+                            partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks(
+                              major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank),
+                            handle.get_stream());
+
+    std::vector<vertex_t> vertex_partition_range_offsets(comm_size + 1, 0);
+    for (int i = 0; i < comm_size; ++i) {
+      vertex_partition_range_offsets[vertex_partition_ids[i]] = vertex_counts[i];
+    }
+    std::exclusive_scan(vertex_partition_range_offsets.begin(),
+                        vertex_partition_range_offsets.end(),
+                        vertex_partition_range_offsets.begin(),
+                        vertex_t{0});
+
+    return std::vector<vertex_t>(vertex_partition_range_offsets.begin() + 1,
+                                 vertex_partition_range_offsets.end());
+  }
+
   static void init_subcomm(raft::handle_t& handle, int gpu_row_comm_size)
   {
     auto& comm = handle.get_comms();
diff --git a/cpp/include/cugraph/utilities/atomic_ops.cuh b/cpp/include/cugraph/utilities/atomic_ops.cuh
new file mode 100644
index 00000000000..6af9841d71f
--- /dev/null
+++ b/cpp/include/cugraph/utilities/atomic_ops.cuh
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <raft/util/device_atomics.cuh>
+
+#include <thrust/detail/type_traits/iterator/is_discard_iterator.h>
+#include <thrust/iterator/detail/any_assign.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/memory.h>
+#include <thrust/tuple.h>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename Iterator, typename TupleType, std::size_t... Is>
+__device__ constexpr TupleType thrust_tuple_atomic_and(Iterator iter,
+                                                       TupleType tup,
+                                                       std::index_sequence<Is...>)
+{
+  return thrust::make_tuple(
+    atomicAnd(&(thrust::raw_reference_cast(thrust::get<Is>(*iter))), thrust::get<Is>(tup))...);
+}
+
+template <typename Iterator, typename TupleType, std::size_t... Is>
+__device__ constexpr TupleType thrust_tuple_atomic_or(Iterator iter,
+                                                      TupleType tup,
+                                                      std::index_sequence<Is...>)
+{
+  return thrust::make_tuple(
+    atomicOr(&(thrust::raw_reference_cast(thrust::get<Is>(*iter))), thrust::get<Is>(tup))...);
+}
+
+template <typename Iterator, typename TupleType, std::size_t... Is>
+__device__ constexpr TupleType thrust_tuple_atomic_add(Iterator iter,
+                                                       TupleType tup,
+                                                       std::index_sequence<Is...>)
+{
+  return thrust::make_tuple(
+    atomicAdd(&(thrust::raw_reference_cast(thrust::get<Is>(*iter))), thrust::get<Is>(tup))...);
+}
+
+template <typename Iterator, typename TupleType, std::size_t... Is>
+__device__ constexpr TupleType thrust_tuple_elementwise_atomic_cas(Iterator iter,
+                                                                   TupleType comp_tup,
+                                                                   TupleType val_tup,
+                                                                   std::index_sequence<Is...>)
+{
+  return thrust::make_tuple(atomicCAS(&(thrust::raw_reference_cast(thrust::get<Is>(*iter))),
+                                      thrust::get<Is>(comp_tup),
+                                      thrust::get<Is>(val_tup))...);
+}
+
+template <typename Iterator, typename TupleType, std::size_t... Is>
+__device__ constexpr TupleType thrust_tuple_elementwise_atomic_min(Iterator iter,
+                                                                   TupleType tup,
+                                                                   std::index_sequence<Is...>)
+{
+  return thrust::make_tuple(
+    atomicMin(&(thrust::raw_reference_cast(thrust::get<Is>(*iter))), thrust::get<Is>(tup))...);
+}
+
+template <typename Iterator, typename TupleType, std::size_t... Is>
+__device__ constexpr TupleType thrust_tuple_elementwise_atomic_max(Iterator iter,
+                                                                   TupleType tup,
+                                                                   std::index_sequence<Is...>)
+{
+  return thrust::make_tuple(
+    atomicMax(&(thrust::raw_reference_cast(thrust::get<Is>(*iter))), thrust::get<Is>(tup))...);
+}
+
+}  // namespace detail
+
+template <typename Iterator, typename T>
+__device__ std::enable_if_t<thrust::detail::is_discard_iterator<Iterator>::value, void> atomic_and(
+  Iterator iter, T value)
+{
+  // no-op
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<std::is_arithmetic_v<T> &&
+                     std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
+                   T>
+  atomic_and(Iterator iter, T value)
+{
+  return atomicAnd(&(thrust::raw_reference_cast(*iter)), value);
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<is_thrust_tuple<T>::value &&
+                     std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
+                   T>
+  atomic_and(Iterator iter, T value)
+{
+  detail::thrust_tuple_atomic_and(
+    iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
+}
+
+template <typename Iterator, typename T>
+__device__ std::enable_if_t<thrust::detail::is_discard_iterator<Iterator>::value, void> atomic_or(
+  Iterator iter, T value)
+{
+  // no-op
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<std::is_arithmetic_v<T> &&
+                     std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
+                   T>
+  atomic_or(Iterator iter, T value)
+{
+  return atomicOr(&(thrust::raw_reference_cast(*iter)), value);
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<is_thrust_tuple<T>::value &&
+                     std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
+                   T>
+  atomic_or(Iterator iter, T value)
+{
+  detail::thrust_tuple_atomic_or(
+    iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
+}
+
+template <typename Iterator, typename T>
+__device__ std::enable_if_t<thrust::detail::is_discard_iterator<Iterator>::value, void> atomic_add(
+  Iterator iter, T value)
+{
+  // no-op
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<std::is_arithmetic_v<T> &&
+                     std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
+                   void>
+  atomic_add(Iterator iter, T value)
+{
+  atomicAdd(&(thrust::raw_reference_cast(*iter)), value);
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
+                     is_thrust_tuple<T>::value,
+                   void>
+  atomic_add(Iterator iter, T value)
+{
+  static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
+                thrust::tuple_size<T>::value);
+  detail::thrust_tuple_atomic_add(
+    iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<std::is_arithmetic_v<T> &&
+                     std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
+                   T>
+  elementwise_atomic_cas(Iterator iter, T compare, T value)
+{
+  return atomicCAS(&(thrust::raw_reference_cast(*iter)), compare, value);
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<is_thrust_tuple<T>::value &&
+                     std::is_same_v<typename thrust::iterator_traits<Iterator>::value_type, T>,
+                   T>
+  elementwise_atomic_cas(Iterator iter, T compare, T value)
+{
+  detail::thrust_tuple_elementwise_atomic_cas(
+    iter, compare, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
+}
+
+template <typename Iterator, typename T>
+__device__ std::enable_if_t<thrust::detail::is_discard_iterator<Iterator>::value, void>
+elementwise_atomic_min(Iterator iter, T const& value)
+{
+  // no-op
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
+                     std::is_arithmetic<T>::value,
+                   void>
+  elementwise_atomic_min(Iterator iter, T const& value)
+{
+  atomicMin(&(thrust::raw_reference_cast(*iter)), value);
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
+                     is_thrust_tuple<T>::value,
+                   void>
+  elementwise_atomic_min(Iterator iter, T const& value)
+{
+  static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
+                thrust::tuple_size<T>::value);
+  detail::thrust_tuple_elementwise_atomic_min(
+    iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
+}
+
+template <typename Iterator, typename T>
+__device__ std::enable_if_t<thrust::detail::is_discard_iterator<Iterator>::value, void>
+elementwise_atomic_max(Iterator iter, T const& value)
+{
+  // no-op
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
+                     std::is_arithmetic<T>::value,
+                   void>
+  elementwise_atomic_max(Iterator iter, T const& value)
+{
+  atomicMax(&(thrust::raw_reference_cast(*iter)), value);
+}
+
+template <typename Iterator, typename T>
+__device__
+  std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
+                     is_thrust_tuple<T>::value,
+                   void>
+  elementwise_atomic_max(Iterator iter, T const& value)
+{
+  static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
+                thrust::tuple_size<T>::value);
+  detail::thrust_tuple_elementwise_atomic_max(
+    iter, value, std::make_index_sequence<thrust::tuple_size<T>::value>{});
+}
+
+}  // namespace cugraph
diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp
index 91dbe2c701e..2573752cb98 100644
--- a/cpp/include/cugraph/utilities/cython.hpp
+++ b/cpp/include/cugraph/utilities/cython.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,189 +25,11 @@
 namespace cugraph {
 namespace cython {
 
-enum class numberTypeEnum : int { int32Type, int64Type, floatType, doubleType };
-
-// replacement for std::tuple<,,>, since std::tuple is not
-// supported in cython
-//
-template <typename vertex_t, typename edge_t, typename weight_t>
-struct major_minor_weights_t {
-  explicit major_minor_weights_t(raft::handle_t const& handle)
-    : shuffled_major_vertices_(0, handle.get_stream()),
-      shuffled_minor_vertices_(0, handle.get_stream()),
-      shuffled_weights_(0, handle.get_stream())
-  {
-  }
-
-  rmm::device_uvector<vertex_t>& get_major(void) { return shuffled_major_vertices_; }
-
-  rmm::device_uvector<vertex_t>& get_minor(void) { return shuffled_minor_vertices_; }
-
-  rmm::device_uvector<weight_t>& get_weights(void) { return shuffled_weights_; }
-
-  std::vector<edge_t>& get_edge_counts(void) { return edge_counts_; }
-
-  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_major_wrap(
-    void)  // const: triggers errors in Cython autogen-ed C++
-  {
-    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_major_vertices_.release()),
-                          sizeof(vertex_t));
-  }
-
-  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_minor_wrap(void)  // const
-  {
-    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_minor_vertices_.release()),
-                          sizeof(vertex_t));
-  }
-
-  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_weights_wrap(void)  // const
-  {
-    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_weights_.release()),
-                          sizeof(weight_t));
-  }
-
-  std::unique_ptr<std::vector<edge_t>> get_edge_counts_wrap(void)  // const
-  {
-    return std::make_unique<std::vector<edge_t>>(edge_counts_);
-  }
-
- private:
-  rmm::device_uvector<vertex_t> shuffled_major_vertices_;
-  rmm::device_uvector<vertex_t> shuffled_minor_vertices_;
-  rmm::device_uvector<weight_t> shuffled_weights_;
-  std::vector<edge_t> edge_counts_{};
-};
-
 struct graph_generator_t {
   std::unique_ptr<rmm::device_buffer> d_source;
   std::unique_ptr<rmm::device_buffer> d_destination;
 };
 
-// wrapper for renumber_edgelist() return
-// (unrenumbering maps, etc.)
-//
-template <typename vertex_t, typename edge_t>
-struct renum_tuple_t {
-  explicit renum_tuple_t(raft::handle_t const& handle) : dv_(0, handle.get_stream()), part_() {}
-
-  rmm::device_uvector<vertex_t>& get_dv(void) { return dv_; }
-
-  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_dv_wrap(
-    void)  // const: see above explanation
-  {
-    return std::make_pair(std::make_unique<rmm::device_buffer>(dv_.release()), sizeof(vertex_t));
-  }
-
-  cugraph::partition_t<vertex_t>& get_partition(void) { return part_; }
-  vertex_t& get_num_vertices(void) { return nv_; }
-  edge_t& get_num_edges(void) { return ne_; }
-
-  std::vector<vertex_t>& get_segment_offsets(void) { return segment_offsets_; }
-
-  std::unique_ptr<std::vector<vertex_t>> get_segment_offsets_wrap()
-  {  // const
-    return std::make_unique<std::vector<vertex_t>>(segment_offsets_);
-  }
-
-  // `partition_t` pass-through getters
-  //
-  int get_part_row_size() const { return part_.row_comm_size(); }
-
-  int get_part_col_size() const { return part_.col_comm_size(); }
-
-  int get_part_comm_rank() const { return part_.comm_rank(); }
-
-  // FIXME: part_.vertex_partition_offsets() returns a std::vector
-  //
-  std::unique_ptr<std::vector<vertex_t>> get_partition_offsets_wrap(void)  // const
-  {
-    return std::make_unique<std::vector<vertex_t>>(part_.vertex_partition_range_offsets());
-  }
-
-  std::pair<vertex_t, vertex_t> get_part_local_vertex_range() const
-  {
-    auto tpl_v = part_.local_vertex_partition_range();
-    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
-  }
-
-  vertex_t get_part_local_vertex_first() const
-  {
-    return part_.local_vertex_partition_range_first();
-  }
-
-  vertex_t get_part_local_vertex_last() const { return part_.local_vertex_partition_range_last(); }
-
-  std::pair<vertex_t, vertex_t> get_part_vertex_partition_range(size_t vertex_partition_idx) const
-  {
-    auto tpl_v = part_.vertex_partition_range(vertex_partition_idx);
-    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
-  }
-
-  vertex_t get_part_vertex_partition_first(size_t vertex_partition_idx) const
-  {
-    return part_.vertex_partition_range_first(vertex_partition_idx);
-  }
-
-  vertex_t get_part_vertex_partition_last(size_t vertex_partition_idx) const
-  {
-    return part_.vertex_partition_range_last(vertex_partition_idx);
-  }
-
-  vertex_t get_part_vertex_partition_size(size_t vertex_partition_idx) const
-  {
-    return part_.vertex_partition_range_size(vertex_partition_idx);
-  }
-
-  size_t get_part_number_of_matrix_partitions() const
-  {
-    return part_.number_of_local_edgex_partitions();
-  }
-
-  std::pair<vertex_t, vertex_t> get_part_matrix_partition_major_range(size_t partition_idx) const
-  {
-    auto tpl_v = part_.local_edgex_partition_major_range(partition_idx);
-    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
-  }
-
-  vertex_t get_part_matrix_partition_major_first(size_t partition_idx) const
-  {
-    return part_.local_edge_partition_major_first(partition_idx);
-  }
-
-  vertex_t get_part_matrix_partition_major_last(size_t partition_idx) const
-  {
-    return part_.local_edge_partition_major_range_last(partition_idx);
-  }
-
-  vertex_t get_part_matrix_partition_major_value_start_offset(size_t partition_idx) const
-  {
-    return part_.local_edge_partition_major_value_start_offset(partition_idx);
-  }
-
-  std::pair<vertex_t, vertex_t> get_part_matrix_partition_minor_range() const
-  {
-    auto tpl_v = part_.local_edge_partition_minor_range();
-    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
-  }
-
-  vertex_t get_part_matrix_partition_minor_first() const
-  {
-    return part_.local_edge_partition_minor_range_first();
-  }
-
-  vertex_t get_part_matrix_partition_minor_last() const
-  {
-    return part_.local_edge_partition_minor_range_last();
-  }
-
- private:
-  rmm::device_uvector<vertex_t> dv_;
-  cugraph::partition_t<vertex_t> part_;
-  vertex_t nv_{0};
-  edge_t ne_{0};
-  std::vector<vertex_t> segment_offsets_;
-};
-
 // Wrapper for calling graph generator
 template <typename vertex_t>
 std::unique_ptr<graph_generator_t> call_generate_rmat_edgelist(raft::handle_t const& handle,
@@ -232,30 +54,6 @@ call_generate_rmat_edgelists(raft::handle_t const& handle,
                              bool clip_and_flip,
                              bool scramble_vertex_ids);
 
-// wrapper for shuffling:
-//
-template <typename vertex_t, typename edge_t, typename weight_t>
-std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
-  raft::handle_t const& handle,
-  vertex_t*
-    edgelist_major_vertices,  // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place
-  vertex_t* edgelist_minor_vertices,  // [IN / OUT]
-  weight_t* edgelist_weights,         // [IN / OUT]
-  edge_t num_edgelist_edges,
-  bool is_weighted);
-
-// Wrapper for calling renumber_edgelist() inplace:
-//
-template <typename vertex_t, typename edge_t>
-std::unique_ptr<renum_tuple_t<vertex_t, edge_t>> call_renumber(
-  raft::handle_t const& handle,
-  vertex_t* shuffled_edgelist_src_vertices /* [INOUT] */,
-  vertex_t* shuffled_edgelist_dst_vertices /* [INOUT] */,
-  std::vector<edge_t> const& edge_counts,
-  bool store_transposed,
-  bool do_expensive_check,
-  bool multi_gpu);
-
 // Helper for setting up subcommunicators, typically called as part of the
 // user-initiated comms initialization in Python.
 //
diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp
index 055391895d1..49898f6c855 100644
--- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp
+++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp
@@ -72,9 +72,9 @@ struct dataframe_element {
   using type = void;
 };
 
-template <typename... T>
-struct dataframe_element<std::tuple<rmm::device_uvector<T>...>> {
-  using type = thrust::tuple<T...>;
+template <typename... Ts>
+struct dataframe_element<std::tuple<rmm::device_uvector<Ts>...>> {
+  using type = thrust::tuple<Ts...>;
 };
 
 template <typename T>
diff --git a/cpp/include/cugraph/utilities/device_functors.cuh b/cpp/include/cugraph/utilities/device_functors.cuh
index 1b1cf29057c..d29e7c47d14 100644
--- a/cpp/include/cugraph/utilities/device_functors.cuh
+++ b/cpp/include/cugraph/utilities/device_functors.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,6 +37,26 @@ struct typecast_t {
   __device__ output_t operator()(input_t val) const { return static_cast<output_t>(val); }
 };
 
+template <typename BoolIterator>
+struct pack_bool_t {
+  BoolIterator bool_first{};
+  size_t num_bools{};
+
+  __device__ uint32_t operator()(size_t i) const
+  {
+    auto first = i * (sizeof(uint32_t) * 8);
+    auto last  = std::min((i + 1) * (sizeof(uint32_t) * 8), num_bools);
+    uint32_t ret{0};
+    for (auto j = first; j < last; ++j) {
+      if (*(bool_first + j)) {
+        auto mask = uint32_t{1} << (j % (sizeof(uint32_t) * 8));
+        ret |= mask;
+      }
+    }
+    return ret;
+  }
+};
+
 template <typename Iterator>
 struct indirection_t {
   Iterator first{};
diff --git a/cpp/include/cugraph/utilities/packed_bool_utils.hpp b/cpp/include/cugraph/utilities/packed_bool_utils.hpp
new file mode 100644
index 00000000000..9557b11e8e0
--- /dev/null
+++ b/cpp/include/cugraph/utilities/packed_bool_utils.hpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/utilities/thrust_tuple_utils.hpp>
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/tuple.h>
+
+#include <type_traits>
+#include <utility>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename ValueIterator, typename value_t, std::size_t... Is>
+constexpr std::enable_if_t<cugraph::is_thrust_tuple_of_arithmetic<
+                             typename thrust::iterator_traits<ValueIterator>::value_type>::value &&
+                             cugraph::is_thrust_tuple_of_arithmetic<value_t>::value,
+                           bool>
+has_packed_bool_element(std::index_sequence<Is...>)
+{
+  static_assert(
+    thrust::tuple_size<typename thrust::iterator_traits<ValueIterator>::value_type>::value ==
+    thrust::tuple_size<value_t>::value);
+  return (... ||
+          (std::is_same_v<typename thrust::tuple_element<
+                            Is,
+                            typename thrust::iterator_traits<ValueIterator>::value_type>::type,
+                          uint32_t> &&
+           std::is_same_v<typename thrust::tuple_element<Is, value_t>::type, bool>));
+}
+
+}  // namespace detail
+
+// sizeof(uint32_t) * 8 packed Boolean values are stored using one uint32_t
+template <typename ValueIterator, typename value_t>
+constexpr bool has_packed_bool_element()
+{
+  static_assert(
+    (std::is_arithmetic_v<typename thrust::iterator_traits<ValueIterator>::value_type> &&
+     std::is_arithmetic_v<value_t>) ||
+    (cugraph::is_thrust_tuple_of_arithmetic<
+       typename thrust::iterator_traits<ValueIterator>::value_type>::value &&
+     cugraph::is_thrust_tuple_of_arithmetic<value_t>::value));
+  if constexpr (std::is_arithmetic_v<typename thrust::iterator_traits<ValueIterator>::value_type> &&
+                std::is_arithmetic_v<value_t>) {
+    return std::is_same_v<typename thrust::iterator_traits<ValueIterator>::value_type, uint32_t> &&
+           std::is_same_v<value_t, bool>;
+  } else {
+    static_assert(
+      thrust::tuple_size<typename thrust::iterator_traits<ValueIterator>::value_type>::value ==
+      thrust::tuple_size<value_t>::value);
+    return detail::has_packed_bool_element<ValueIterator, value_t>(
+      std::make_index_sequence<thrust::tuple_size<value_t>::value>());
+  }
+}
+
+constexpr size_t packed_bools_per_word() { return sizeof(uint32_t) * size_t{8}; }
+
+constexpr size_t packed_bool_size(size_t bool_size)
+{
+  return (bool_size + (sizeof(uint32_t) * 8 - 1)) / (sizeof(uint32_t) * 8);
+}
+
+template <typename T>
+constexpr uint32_t packed_bool_mask(T bool_offset)
+{
+  return uint32_t{1} << (bool_offset % (sizeof(uint32_t) * 8));
+}
+
+constexpr uint32_t packed_bool_full_mask() { return uint32_t{0xffffffff}; }
+
+constexpr uint32_t packed_bool_empty_mask() { return uint32_t{0x0}; }
+
+template <typename T>
+constexpr T packed_bool_offset(T bool_offset)
+{
+  return bool_offset / (sizeof(uint32_t) * 8);
+}
+
+}  // namespace cugraph
diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
index cb3b8146153..d98754f51d1 100644
--- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
+++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp
@@ -78,16 +78,14 @@ auto std_tuple_to_thrust_tuple(TupleType tup, std::index_sequence<Is...>)
 }
 
 template <typename TupleType, std::size_t... Is>
-constexpr TupleType thrust_tuple_of_arithmetic_numeric_limits_lowest(TupleType t,
-                                                                     std::index_sequence<Is...>)
+constexpr TupleType thrust_tuple_of_arithmetic_numeric_limits_lowest(std::index_sequence<Is...>)
 {
   return thrust::make_tuple(
     std::numeric_limits<typename thrust::tuple_element<Is, TupleType>::type>::lowest()...);
 }
 
 template <typename TupleType, std::size_t... Is>
-constexpr TupleType thrust_tuple_of_arithmetic_numeric_limits_max(TupleType t,
-                                                                  std::index_sequence<Is...>)
+constexpr TupleType thrust_tuple_of_arithmetic_numeric_limits_max(std::index_sequence<Is...>)
 {
   return thrust::make_tuple(
     std::numeric_limits<typename thrust::tuple_element<Is, TupleType>::type>::max()...);
@@ -96,71 +94,59 @@ constexpr TupleType thrust_tuple_of_arithmetic_numeric_limits_max(TupleType t,
 }  // namespace detail
 
 template <typename T>
-struct is_thrust_tuple : std::false_type {
-};
+struct is_thrust_tuple : std::false_type {};
 
 template <typename... Ts>
-struct is_thrust_tuple<thrust::tuple<Ts...>> : std::true_type {
-};
+struct is_thrust_tuple<thrust::tuple<Ts...>> : std::true_type {};
 
 template <typename TupleType>
-struct is_thrust_tuple_of_arithmetic : std::false_type {
-};
+struct is_thrust_tuple_of_arithmetic : std::false_type {};
 
-template <typename... Args>
-struct is_thrust_tuple_of_arithmetic<thrust::tuple<Args...>> {
+template <typename... Ts>
+struct is_thrust_tuple_of_arithmetic<thrust::tuple<Ts...>> {
  private:
   template <typename T>
   static constexpr bool is_valid = std::is_arithmetic_v<T> || std::is_same_v<T, thrust::null_type>;
 
  public:
-  static constexpr bool value = (... && is_valid<Args>);
+  static constexpr bool value = (... && is_valid<Ts>);
 };
 
 template <typename T>
-struct is_std_tuple : std::false_type {
-};
+struct is_std_tuple : std::false_type {};
 
 template <typename... Ts>
-struct is_std_tuple<std::tuple<Ts...>> : std::true_type {
-};
+struct is_std_tuple<std::tuple<Ts...>> : std::true_type {};
 
 template <typename T, template <typename> typename Vector>
-struct is_arithmetic_vector : std::false_type {
-};
+struct is_arithmetic_vector : std::false_type {};
 
 template <template <typename> typename Vector, typename T>
 struct is_arithmetic_vector<Vector<T>, Vector>
-  : std::integral_constant<bool, std::is_arithmetic<T>::value> {
-};
+  : std::integral_constant<bool, std::is_arithmetic<T>::value> {};
 
 template <typename T>
-struct is_std_tuple_of_arithmetic_vectors : std::false_type {
-};
+struct is_std_tuple_of_arithmetic_vectors : std::false_type {};
 
-template <typename... Args>
-struct is_std_tuple_of_arithmetic_vectors<std::tuple<rmm::device_uvector<Args>...>> {
-  static constexpr bool value = (... && std::is_arithmetic<Args>::value);
+template <typename... Ts>
+struct is_std_tuple_of_arithmetic_vectors<std::tuple<rmm::device_uvector<Ts>...>> {
+  static constexpr bool value = (... && std::is_arithmetic<Ts>::value);
 };
 
 template <typename T>
 struct is_arithmetic_or_thrust_tuple_of_arithmetic
-  : std::integral_constant<bool, std::is_arithmetic<T>::value> {
-};
+  : std::integral_constant<bool, std::is_arithmetic<T>::value> {};
 
 template <typename... Ts>
 struct is_arithmetic_or_thrust_tuple_of_arithmetic<thrust::tuple<Ts...>>
-  : std::integral_constant<bool, is_thrust_tuple_of_arithmetic<thrust::tuple<Ts...>>::value> {
-};
+  : std::integral_constant<bool, is_thrust_tuple_of_arithmetic<thrust::tuple<Ts...>>::value> {};
 
 template <typename T>
-struct thrust_tuple_size_or_one : std::integral_constant<size_t, 1> {
-};
+struct thrust_tuple_size_or_one : std::integral_constant<size_t, 1> {};
 
 template <typename... Ts>
 struct thrust_tuple_size_or_one<thrust::tuple<Ts...>>
-  : std::integral_constant<size_t, thrust::tuple_size<thrust::tuple<Ts...>>::value> {
-};
+  : std::integral_constant<size_t, thrust::tuple_size<thrust::tuple<Ts...>>::value> {};
 
 template <typename TupleType>
 struct compute_thrust_tuple_element_sizes {
@@ -236,15 +222,15 @@ auto thrust_tuple_cat(TupleTypes... tups)
 template <typename TupleType>
 constexpr TupleType thrust_tuple_of_arithmetic_numeric_limits_lowest()
 {
-  return detail::thrust_tuple_of_arithmetic_numeric_limits_lowest(
-    TupleType{}, std::make_index_sequence<thrust::tuple_size<TupleType>::value>());
+  return detail::thrust_tuple_of_arithmetic_numeric_limits_lowest<TupleType>(
+    std::make_index_sequence<thrust::tuple_size<TupleType>::value>());
 }
 
 template <typename TupleType>
 constexpr TupleType thrust_tuple_of_arithmetic_numeric_limits_max()
 {
-  return detail::thrust_tuple_of_arithmetic_numeric_limits_max(
-    TupleType{}, std::make_index_sequence<thrust::tuple_size<TupleType>::value>());
+  return detail::thrust_tuple_of_arithmetic_numeric_limits_max<TupleType>(
+    std::make_index_sequence<thrust::tuple_size<TupleType>::value>());
 }
 
 template <typename TupleType, size_t I>
diff --git a/cpp/include/cugraph_c/community_algorithms.h b/cpp/include/cugraph_c/community_algorithms.h
index 47d5728880d..fd0e1de9cb4 100644
--- a/cpp/include/cugraph_c/community_algorithms.h
+++ b/cpp/include/cugraph_c/community_algorithms.h
@@ -19,6 +19,7 @@
 #include <cugraph_c/error.h>
 #include <cugraph_c/graph.h>
 #include <cugraph_c/graph_functions.h>
+#include <cugraph_c/random.h>
 #include <cugraph_c/resource_handle.h>
 
 /** @defgroup community Community algorithms
@@ -117,11 +118,16 @@ cugraph_error_code_t cugraph_louvain(const cugraph_resource_handle_t* handle,
  * @param [in]  handle       Handle for accessing resources
  * @param [in]  graph        Pointer to graph.  NOTE: Graph might be modified if the storage
  *                           needs to be transposed
+ * @param [in/out] rng_state State of the random number generator, updated with each call
  * @param [in]  max_level    Maximum level in hierarchy
  * @param [in]  resolution   Resolution parameter (gamma) in modularity formula.
  *                           This changes the size of the communities.  Higher resolutions
  *                           lead to more smaller communities, lower resolutions lead to
  *                           fewer larger communities.
+ * @param[in]  theta         (optional) The value of the parameter to scale modularity
+ *                           gain in Leiden refinement phase. It is used to compute
+ *                           the probability of joining a random leiden community.
+ *                           Called theta in the Leiden algorithm.
  * @param [in]  do_expensive_check
  *                           A flag to run expensive checks for input arguments (if set to true)
  * @param [out] result       Output from the Leiden call
@@ -130,9 +136,11 @@ cugraph_error_code_t cugraph_louvain(const cugraph_resource_handle_t* handle,
  * @return error code
  */
 cugraph_error_code_t cugraph_leiden(const cugraph_resource_handle_t* handle,
+                                    cugraph_rng_state_t* rng_state,
                                     cugraph_graph_t* graph,
                                     size_t max_level,
                                     double resolution,
+                                    double theta,
                                     bool_t do_expensive_check,
                                     cugraph_hierarchical_clustering_result_t** result,
                                     cugraph_error_t** error);
diff --git a/cpp/include/cugraph_c/graph.h b/cpp/include/cugraph_c/graph.h
index aa15cb6cb94..e910d8b1244 100644
--- a/cpp/include/cugraph_c/graph.h
+++ b/cpp/include/cugraph_c/graph.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -72,7 +72,7 @@ cugraph_error_code_t cugraph_sg_graph_create(
   const cugraph_type_erased_device_array_view_t* edge_type_ids,
   bool_t store_transposed,
   bool_t renumber,
-  bool_t check,
+  bool_t do_expensive_check,
   cugraph_graph_t** graph,
   cugraph_error_t** error);
 
@@ -112,7 +112,7 @@ cugraph_error_code_t cugraph_sg_graph_create_from_csr(
   const cugraph_type_erased_device_array_view_t* edge_type_ids,
   bool_t store_transposed,
   bool_t renumber,
-  bool_t check,
+  bool_t do_expensive_check,
   cugraph_graph_t** graph,
   cugraph_error_t** error);
 
@@ -144,7 +144,7 @@ void cugraph_sg_graph_free(cugraph_graph_t* graph);
                                 argument that can be NULL if edge types are not used.
  * @param [in]  store_transposed If true create the graph initially in transposed format
  * @param [in]  num_edges       Number of edges
- * @param [in]  check           If true, do expensive checks to validate the input data
+ * @param [in]  do_expensive_check  If true, do expensive checks to validate the input data
  *    is consistent with software assumptions.  If false bypass these checks.
  * @param [out] graph           A pointer to the graph object
  * @param [out] error           Pointer to an error object storing details of any error.  Will
@@ -161,7 +161,7 @@ cugraph_error_code_t cugraph_mg_graph_create(
   const cugraph_type_erased_device_array_view_t* edge_type_ids,
   bool_t store_transposed,
   size_t num_edges,
-  bool_t check,
+  bool_t do_expensive_check,
   cugraph_graph_t** graph,
   cugraph_error_t** error);
 
diff --git a/cpp/libcugraph_etl/CMakeLists.txt b/cpp/libcugraph_etl/CMakeLists.txt
index 081f33d10fb..13cf7b199ec 100644
--- a/cpp/libcugraph_etl/CMakeLists.txt
+++ b/cpp/libcugraph_etl/CMakeLists.txt
@@ -25,7 +25,7 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(CUGRAPH_ETL)
 
-project(CUGRAPH_ETL VERSION 23.04.01 LANGUAGES C CXX CUDA)
+project(CUGRAPH_ETL VERSION 23.06.00 LANGUAGES C CXX CUDA)
 
 if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
diff --git a/cpp/libcugraph_etl/src/renumbering.cu b/cpp/libcugraph_etl/src/renumbering.cu
index bbcdb69781b..b0fdabe996a 100644
--- a/cpp/libcugraph_etl/src/renumbering.cu
+++ b/cpp/libcugraph_etl/src/renumbering.cu
@@ -748,10 +748,8 @@ struct renumber_functor {
              cudf::table_view const& dst_view)
   {
     return std::make_tuple(
-      std::unique_ptr<cudf::column>(new cudf::column(
-        cudf::data_type(cudf::type_id::INT32), 0, rmm::device_buffer{0, cudaStream_t{0}})),
-      std::unique_ptr<cudf::column>(new cudf::column(
-        cudf::data_type(cudf::type_id::INT32), 0, rmm::device_buffer{0, cudaStream_t{0}})),
+      cudf::make_empty_column(cudf::type_id::INT32),
+      cudf::make_empty_column(cudf::type_id::INT32),
       std::make_unique<cudf::table>(std::vector<std::unique_ptr<cudf::column>>{}));
   }
 
@@ -960,14 +958,18 @@ struct renumber_functor {
     std::vector<std::unique_ptr<cudf::column>> renumber_table_vectors;
 
     auto offset_col_1 =
-      std::unique_ptr<cudf::column>(new cudf::column(cudf::data_type(cudf::type_id::INT32),
-                                                     key_value_count + 1,
-                                                     std::move(out_col1_offsets.release())));
+      std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::INT32),
+                                     key_value_count + 1,
+                                     std::move(out_col1_offsets.release()),
+                                     rmm::device_buffer{},
+                                     0);
 
     auto str_col_1 =
-      std::unique_ptr<cudf::column>(new cudf::column(cudf::data_type(cudf::type_id::INT8),
-                                                     hist_insert_counter[0],
-                                                     std::move(unrenumber_col1_chars)));
+      std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::INT8),
+                                     hist_insert_counter[0],
+                                     std::move(unrenumber_col1_chars),
+                                     rmm::device_buffer{},
+                                     0);
 
     renumber_table_vectors.push_back(
       cudf::make_strings_column(size_type(key_value_count),
@@ -977,14 +979,18 @@ struct renumber_functor {
                                 rmm::device_buffer(size_type(0), exec_strm)));
 
     auto offset_col_2 =
-      std::unique_ptr<cudf::column>(new cudf::column(cudf::data_type(cudf::type_id::INT32),
-                                                     key_value_count + 1,
-                                                     std::move(out_col2_offsets.release())));
+      std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::INT32),
+                                     key_value_count + 1,
+                                     std::move(out_col2_offsets.release()),
+                                     rmm::device_buffer{},
+                                     0);
 
     auto str_col_2 =
-      std::unique_ptr<cudf::column>(new cudf::column(cudf::data_type(cudf::type_id::INT8),
-                                                     hist_insert_counter[1],
-                                                     std::move(unrenumber_col2_chars)));
+      std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::INT8),
+                                     hist_insert_counter[1],
+                                     std::move(unrenumber_col2_chars),
+                                     rmm::device_buffer{},
+                                     0);
 
     renumber_table_vectors.push_back(
       cudf::make_strings_column(size_type(key_value_count),
@@ -1036,11 +1042,17 @@ struct renumber_functor {
       reinterpret_cast<Dtype*>(dst_buffer.data()));
 
     std::vector<std::unique_ptr<cudf::column>> cols_vector;
-    cols_vector.push_back(std::unique_ptr<cudf::column>(
-      new cudf::column(cudf::data_type(cudf::type_id::INT32), num_rows, std::move(src_buffer))));
-
-    cols_vector.push_back(std::unique_ptr<cudf::column>(
-      new cudf::column(cudf::data_type(cudf::type_id::INT32), num_rows, std::move(dst_buffer))));
+    cols_vector.push_back(std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::INT32),
+                                                         num_rows,
+                                                         std::move(src_buffer),
+                                                         rmm::device_buffer{},
+                                                         0));
+
+    cols_vector.push_back(std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::INT32),
+                                                         num_rows,
+                                                         std::move(dst_buffer),
+                                                         rmm::device_buffer{},
+                                                         0));
 
     RAFT_CHECK_CUDA(cudaDeviceSynchronize());
 
diff --git a/cpp/src/c_api/core_result.cpp b/cpp/src/c_api/core_result.cpp
index 9cdf34e468b..09476060967 100644
--- a/cpp/src/c_api/core_result.cpp
+++ b/cpp/src/c_api/core_result.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -95,8 +95,10 @@ cugraph_type_erased_device_array_view_t* cugraph_k_core_result_get_weights(
   cugraph_k_core_result_t* result)
 {
   auto internal_pointer = reinterpret_cast<cugraph::c_api::cugraph_k_core_result_t*>(result);
-  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
-    internal_pointer->weights_->view());
+  return (internal_pointer->weights_ == nullptr)
+           ? NULL
+           : reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->weights_->view());
 }
 
 void cugraph_k_core_result_free(cugraph_k_core_result_t* result)
diff --git a/cpp/src/c_api/graph_helper.cu b/cpp/src/c_api/graph_helper.cu
deleted file mode 100644
index 914344f8722..00000000000
--- a/cpp/src/c_api/graph_helper.cu
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <structure/detail/structure_utils.cuh>
-
-namespace cugraph {
-namespace c_api {
-
-template <typename vertex_t, typename edge_t>
-rmm::device_uvector<vertex_t> expand_sparse_offsets(raft::device_span<edge_t const> offsets,
-                                                    vertex_t base_vertex_id,
-                                                    rmm::cuda_stream_view const& stream)
-{
-  return cugraph::detail::expand_sparse_offsets(offsets, base_vertex_id, stream);
-}
-
-template rmm::device_uvector<int32_t> expand_sparse_offsets(
-  raft::device_span<int32_t const> offsets,
-  int32_t base_vertex_id,
-  rmm::cuda_stream_view const& stream);
-
-template rmm::device_uvector<int32_t> expand_sparse_offsets(
-  raft::device_span<int64_t const> offsets,
-  int32_t base_vertex_id,
-  rmm::cuda_stream_view const& stream);
-
-template rmm::device_uvector<int64_t> expand_sparse_offsets(
-  raft::device_span<int64_t const> offsets,
-  int64_t base_vertex_id,
-  rmm::cuda_stream_view const& stream);
-
-template rmm::device_uvector<int32_t> expand_sparse_offsets(raft::device_span<size_t const> offsets,
-                                                            int32_t base_vertex_id,
-                                                            rmm::cuda_stream_view const& stream);
-
-template rmm::device_uvector<int64_t> expand_sparse_offsets(raft::device_span<size_t const> offsets,
-                                                            int64_t base_vertex_id,
-                                                            rmm::cuda_stream_view const& stream);
-
-}  // namespace c_api
-}  // namespace cugraph
diff --git a/cpp/src/c_api/graph_helper.hpp b/cpp/src/c_api/graph_helper.hpp
index 22e8e317ad0..c4f7aaeabc9 100644
--- a/cpp/src/c_api/graph_helper.hpp
+++ b/cpp/src/c_api/graph_helper.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,5 +26,10 @@ rmm::device_uvector<vertex_t> expand_sparse_offsets(raft::device_span<edge_t con
                                                     vertex_t base_vertex_id,
                                                     rmm::cuda_stream_view const& stream);
 
+template <typename GraphViewType, typename T>
+edge_property_t<GraphViewType, T> create_constant_edge_property(raft::handle_t const& handle,
+                                                                GraphViewType const& graph_view,
+                                                                T constant_value);
+
 }  // namespace c_api
 }  // namespace cugraph
diff --git a/cpp/src/c_api/graph_helper_impl.cuh b/cpp/src/c_api/graph_helper_impl.cuh
new file mode 100644
index 00000000000..759d7a85286
--- /dev/null
+++ b/cpp/src/c_api/graph_helper_impl.cuh
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <prims/fill_edge_property.cuh>
+#include <structure/detail/structure_utils.cuh>
+
+namespace cugraph {
+namespace c_api {
+
+template <typename vertex_t, typename edge_t>
+rmm::device_uvector<vertex_t> expand_sparse_offsets(raft::device_span<edge_t const> offsets,
+                                                    vertex_t base_vertex_id,
+                                                    rmm::cuda_stream_view const& stream)
+{
+  return cugraph::detail::expand_sparse_offsets(offsets, base_vertex_id, stream);
+}
+
+template <typename GraphViewType, typename T>
+edge_property_t<GraphViewType, T> create_constant_edge_property(raft::handle_t const& handle,
+                                                                GraphViewType const& graph_view,
+                                                                T constant_value)
+{
+  edge_property_t<GraphViewType, T> edge_property(handle, graph_view);
+
+  cugraph::fill_edge_property(handle, graph_view, constant_value, edge_property);
+
+  return edge_property;
+}
+
+}  // namespace c_api
+}  // namespace cugraph
diff --git a/cpp/src/c_api/graph_helper_mg.cu b/cpp/src/c_api/graph_helper_mg.cu
new file mode 100644
index 00000000000..15e24ba530b
--- /dev/null
+++ b/cpp/src/c_api/graph_helper_mg.cu
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <c_api/graph_helper_impl.cuh>
+
+namespace cugraph {
+namespace c_api {
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int32_t, false, true>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int64_t, false, true>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int64_t, int64_t, false, true>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int32_t, true, true>, float>
+create_constant_edge_property(raft::handle_t const& handle,
+                              cugraph::graph_view_t<int32_t, int32_t, true, true> const& graph_view,
+                              float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int64_t, true, true>, float>
+create_constant_edge_property(raft::handle_t const& handle,
+                              cugraph::graph_view_t<int32_t, int64_t, true, true> const& graph_view,
+                              float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int64_t, int64_t, true, true>, float>
+create_constant_edge_property(raft::handle_t const& handle,
+                              cugraph::graph_view_t<int64_t, int64_t, true, true> const& graph_view,
+                              float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int32_t, false, true>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int64_t, false, true>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int64_t, int64_t, false, true>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int32_t, true, true>, double>
+create_constant_edge_property(raft::handle_t const& handle,
+                              cugraph::graph_view_t<int32_t, int32_t, true, true> const& graph_view,
+                              double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int64_t, true, true>, double>
+create_constant_edge_property(raft::handle_t const& handle,
+                              cugraph::graph_view_t<int32_t, int64_t, true, true> const& graph_view,
+                              double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int64_t, int64_t, true, true>, double>
+create_constant_edge_property(raft::handle_t const& handle,
+                              cugraph::graph_view_t<int64_t, int64_t, true, true> const& graph_view,
+                              double constant_value);
+
+}  // namespace c_api
+}  // namespace cugraph
diff --git a/cpp/src/c_api/graph_helper_sg.cu b/cpp/src/c_api/graph_helper_sg.cu
new file mode 100644
index 00000000000..dcd6c92325f
--- /dev/null
+++ b/cpp/src/c_api/graph_helper_sg.cu
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <c_api/graph_helper_impl.cuh>
+
+namespace cugraph {
+namespace c_api {
+
+template rmm::device_uvector<int32_t> expand_sparse_offsets(
+  raft::device_span<int32_t const> offsets,
+  int32_t base_vertex_id,
+  rmm::cuda_stream_view const& stream);
+
+template rmm::device_uvector<int32_t> expand_sparse_offsets(
+  raft::device_span<int64_t const> offsets,
+  int32_t base_vertex_id,
+  rmm::cuda_stream_view const& stream);
+
+template rmm::device_uvector<int64_t> expand_sparse_offsets(
+  raft::device_span<int64_t const> offsets,
+  int64_t base_vertex_id,
+  rmm::cuda_stream_view const& stream);
+
+template rmm::device_uvector<int32_t> expand_sparse_offsets(raft::device_span<size_t const> offsets,
+                                                            int32_t base_vertex_id,
+                                                            rmm::cuda_stream_view const& stream);
+
+template rmm::device_uvector<int64_t> expand_sparse_offsets(raft::device_span<size_t const> offsets,
+                                                            int64_t base_vertex_id,
+                                                            rmm::cuda_stream_view const& stream);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int32_t, false, false>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int64_t, false, false>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int64_t, int64_t, false, false>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int32_t, true, false>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, true, false> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int64_t, true, false>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int64_t, true, false> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int64_t, int64_t, true, false>, float>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int64_t, int64_t, true, false> const& graph_view,
+  float constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int32_t, false, false>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int64_t, false, false>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int64_t, int64_t, false, false>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int32_t, true, false>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int32_t, true, false> const& graph_view,
+  double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int32_t, int64_t, true, false>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int32_t, int64_t, true, false> const& graph_view,
+  double constant_value);
+
+template edge_property_t<cugraph::graph_view_t<int64_t, int64_t, true, false>, double>
+create_constant_edge_property(
+  raft::handle_t const& handle,
+  cugraph::graph_view_t<int64_t, int64_t, true, false> const& graph_view,
+  double constant_value);
+
+}  // namespace c_api
+}  // namespace cugraph
diff --git a/cpp/src/c_api/graph_mg.cpp b/cpp/src/c_api/graph_mg.cpp
index 9b7206702ea..f50c7c08fb6 100644
--- a/cpp/src/c_api/graph_mg.cpp
+++ b/cpp/src/c_api/graph_mg.cpp
@@ -338,19 +338,19 @@ extern "C" cugraph_error_code_t cugraph_mg_graph_create(
     weight_type = cugraph_data_type_id_t::FLOAT32;
   }
 
-  CAPI_EXPECTS((edge_type_ids == nullptr) || (p_edge_ids->type_ == edge_type),
+  CAPI_EXPECTS((edge_ids == nullptr) || (p_edge_ids->type_ == edge_type),
                CUGRAPH_INVALID_INPUT,
-               "Invalid input arguments: Edge id type must match edge (src/dst) type",
+               "Invalid input arguments: Edge id type must match edge type",
                *error);
 
-  CAPI_EXPECTS((edge_type_ids == nullptr) || (p_edge_type_ids->size_ == p_src->size_),
+  CAPI_EXPECTS((edge_ids == nullptr) || (p_edge_ids->size_ == p_src->size_),
                CUGRAPH_INVALID_INPUT,
-               "Invalid input arguments: src size != edge prop size",
+               "Invalid input arguments: src size != edge id prop size",
                *error);
 
-  CAPI_EXPECTS((edge_ids == nullptr) || (p_edge_ids->size_ == p_src->size_),
+  CAPI_EXPECTS((edge_type_ids == nullptr) || (p_edge_type_ids->size_ == p_src->size_),
                CUGRAPH_INVALID_INPUT,
-               "Invalid input arguments: src size != edge prop size",
+               "Invalid input arguments: src size != edge type prop size",
                *error);
 
   cugraph_data_type_id_t edge_type_id_type;
diff --git a/cpp/src/c_api/graph_sg.cpp b/cpp/src/c_api/graph_sg.cpp
index 5267516f89b..9536869f123 100644
--- a/cpp/src/c_api/graph_sg.cpp
+++ b/cpp/src/c_api/graph_sg.cpp
@@ -39,7 +39,7 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor {
   cugraph::c_api::cugraph_type_erased_device_array_view_t const* edge_ids_;
   cugraph::c_api::cugraph_type_erased_device_array_view_t const* edge_type_ids_;
   bool_t renumber_;
-  bool_t check_;
+  bool_t do_expensive_check_;
   cugraph_data_type_id_t edge_type_;
   cugraph::c_api::cugraph_graph_t* result_{};
 
@@ -51,7 +51,7 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor {
                        cugraph::c_api::cugraph_type_erased_device_array_view_t const* edge_ids,
                        cugraph::c_api::cugraph_type_erased_device_array_view_t const* edge_type_ids,
                        bool_t renumber,
-                       bool_t check,
+                       bool_t do_expensive_check,
                        cugraph_data_type_id_t edge_type)
     : abstract_functor(),
       properties_(properties),
@@ -62,7 +62,7 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor {
       edge_ids_(edge_ids),
       edge_type_ids_(edge_type_ids),
       renumber_(renumber),
-      check_(check),
+      do_expensive_check_(do_expensive_check),
       edge_type_(edge_type)
   {
   }
@@ -78,7 +78,7 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor {
     if constexpr (multi_gpu || !cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
       unsupported();
     } else {
-      if (check_) {
+      if (do_expensive_check_) {
         // FIXME:  Need an implementation here.
       }
 
@@ -177,7 +177,7 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor {
           std::move(edgelist_edge_types),
           cugraph::graph_properties_t{properties_->is_symmetric, properties_->is_multigraph},
           renumber_,
-          check_);
+          do_expensive_check_);
 
       if (renumber_) {
         *number_map = std::move(new_number_map.value());
@@ -221,7 +221,7 @@ struct create_graph_csr_functor : public cugraph::c_api::abstract_functor {
   cugraph::c_api::cugraph_type_erased_device_array_view_t const* edge_ids_;
   cugraph::c_api::cugraph_type_erased_device_array_view_t const* edge_type_ids_;
   bool_t renumber_;
-  bool_t check_;
+  bool_t do_expensive_check_;
   cugraph::c_api::cugraph_graph_t* result_{};
 
   create_graph_csr_functor(
@@ -233,7 +233,7 @@ struct create_graph_csr_functor : public cugraph::c_api::abstract_functor {
     cugraph::c_api::cugraph_type_erased_device_array_view_t const* edge_ids,
     cugraph::c_api::cugraph_type_erased_device_array_view_t const* edge_type_ids,
     bool_t renumber,
-    bool_t check)
+    bool_t do_expensive_check)
     : abstract_functor(),
       properties_(properties),
       handle_(handle),
@@ -243,7 +243,7 @@ struct create_graph_csr_functor : public cugraph::c_api::abstract_functor {
       edge_ids_(edge_ids),
       edge_type_ids_(edge_type_ids),
       renumber_(renumber),
-      check_(check)
+      do_expensive_check_(do_expensive_check)
   {
   }
 
@@ -258,7 +258,7 @@ struct create_graph_csr_functor : public cugraph::c_api::abstract_functor {
     if constexpr (multi_gpu || !cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
       unsupported();
     } else {
-      if (check_) {
+      if (do_expensive_check_) {
         // FIXME:  Need an implementation here.
       }
 
@@ -362,7 +362,7 @@ struct create_graph_csr_functor : public cugraph::c_api::abstract_functor {
           std::move(edgelist_edge_types),
           cugraph::graph_properties_t{properties_->is_symmetric, properties_->is_multigraph},
           renumber_,
-          check_);
+          do_expensive_check_);
 
       if (renumber_) {
         *number_map = std::move(new_number_map.value());
@@ -462,7 +462,7 @@ extern "C" cugraph_error_code_t cugraph_sg_graph_create(
   const cugraph_type_erased_device_array_view_t* edge_type_ids,
   bool_t store_transposed,
   bool_t renumber,
-  bool_t check,
+  bool_t do_expensive_check,
   cugraph_graph_t** graph,
   cugraph_error_t** error)
 {
@@ -513,26 +513,19 @@ extern "C" cugraph_error_code_t cugraph_sg_graph_create(
     weight_type = cugraph_data_type_id_t::FLOAT32;
   }
 
-  // FIXME:  The combination of edge_ids != nullptr, edge_type_ids == nullptr
-  //         logically should be valid, but the code will currently break if
-  //         that is that is specified
-  CAPI_EXPECTS(
-    (edge_type_ids == nullptr && edge_ids == nullptr) ||
-      (edge_type_ids != nullptr && edge_ids != nullptr),
-    CUGRAPH_INVALID_INPUT,
-    "Invalid input arguments: either none or both of edge ids and edge types must be provided.",
-    *error);
+  CAPI_EXPECTS((edge_ids == nullptr) || (p_edge_ids->type_ == edge_type),
+               CUGRAPH_INVALID_INPUT,
+               "Invalid input arguments: Edge id type must match edge type",
+               *error);
 
-  CAPI_EXPECTS(
-    (edge_type_ids == nullptr && edge_ids == nullptr) || (p_edge_ids->type_ == edge_type),
-    CUGRAPH_INVALID_INPUT,
-    "Invalid input arguments: Edge id type must match edge (src/dst) type",
-    *error);
+  CAPI_EXPECTS((edge_ids == nullptr) || (p_edge_ids->size_ == p_src->size_),
+               CUGRAPH_INVALID_INPUT,
+               "Invalid input arguments: src size != edge id prop size",
+               *error);
 
-  CAPI_EXPECTS((edge_type_ids == nullptr && edge_ids == nullptr) ||
-                 (p_edge_ids->size_ == p_src->size_ && p_edge_type_ids->size_ == p_dst->size_),
+  CAPI_EXPECTS((edge_type_ids == nullptr) || (p_edge_type_ids->size_ == p_src->size_),
                CUGRAPH_INVALID_INPUT,
-               "Invalid input arguments: src size != edge prop size",
+               "Invalid input arguments: src size != edge type prop size",
                *error);
 
   cugraph_data_type_id_t edge_type_id_type = cugraph_data_type_id_t::INT32;
@@ -546,7 +539,7 @@ extern "C" cugraph_error_code_t cugraph_sg_graph_create(
                                  p_edge_ids,
                                  p_edge_type_ids,
                                  renumber,
-                                 check,
+                                 do_expensive_check,
                                  edge_type);
 
   try {
@@ -582,7 +575,7 @@ cugraph_error_code_t cugraph_sg_graph_create_from_csr(
   const cugraph_type_erased_device_array_view_t* edge_type_ids,
   bool_t store_transposed,
   bool_t renumber,
-  bool_t check,
+  bool_t do_expensive_check,
   cugraph_graph_t** graph,
   cugraph_error_t** error)
 {
@@ -644,7 +637,7 @@ cugraph_error_code_t cugraph_sg_graph_create_from_csr(
                                      p_edge_ids,
                                      p_edge_type_ids,
                                      renumber,
-                                     check);
+                                     do_expensive_check);
 
   try {
     cugraph::c_api::vertex_dispatcher(p_indices->type_,
diff --git a/cpp/src/c_api/induced_subgraph_result.cpp b/cpp/src/c_api/induced_subgraph_result.cpp
index 2e22a4c47b5..b9ad0e0d66f 100644
--- a/cpp/src/c_api/induced_subgraph_result.cpp
+++ b/cpp/src/c_api/induced_subgraph_result.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,7 +39,10 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get
 {
   auto internal_pointer =
     reinterpret_cast<cugraph::c_api::cugraph_induced_subgraph_result_t*>(induced_subgraph);
-  return reinterpret_cast<cugraph_type_erased_device_array_view_t*>(internal_pointer->wgt_->view());
+  return (internal_pointer->wgt_ == nullptr)
+           ? NULL
+           : reinterpret_cast<cugraph_type_erased_device_array_view_t*>(
+               internal_pointer->wgt_->view());
 }
 
 extern "C" cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_subgraph_offsets(
diff --git a/cpp/src/c_api/legacy_spectral.cpp b/cpp/src/c_api/legacy_spectral.cpp
index 9d1a0273057..4b465eebd0f 100644
--- a/cpp/src/c_api/legacy_spectral.cpp
+++ b/cpp/src/c_api/legacy_spectral.cpp
@@ -110,10 +110,18 @@ struct balanced_cut_clustering_functor : public cugraph::c_api::abstract_functor
       auto graph_view          = graph->view();
       auto edge_partition_view = graph_view.local_edge_partition_view();
 
+      rmm::device_uvector<weight_t> tmp_weights(0, handle_.get_stream());
+      if (edge_weights == nullptr) {
+        tmp_weights.resize(edge_partition_view.indices().size(), handle_.get_stream());
+        cugraph::detail::scalar_fill(handle_, tmp_weights.data(), tmp_weights.size(), weight_t{1});
+      }
+
       cugraph::legacy::GraphCSRView<vertex_t, edge_t, weight_t> legacy_graph_view(
         const_cast<edge_t*>(edge_partition_view.offsets().data()),
         const_cast<vertex_t*>(edge_partition_view.indices().data()),
-        const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
+        (edge_weights == nullptr)
+          ? tmp_weights.data()
+          : const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
         edge_partition_view.offsets().size() - 1,
         edge_partition_view.indices().size());
 
@@ -209,10 +217,18 @@ struct spectral_clustering_functor : public cugraph::c_api::abstract_functor {
       auto graph_view          = graph->view();
       auto edge_partition_view = graph_view.local_edge_partition_view();
 
+      rmm::device_uvector<weight_t> tmp_weights(0, handle_.get_stream());
+      if (edge_weights == nullptr) {
+        tmp_weights.resize(edge_partition_view.indices().size(), handle_.get_stream());
+        cugraph::detail::scalar_fill(handle_, tmp_weights.data(), tmp_weights.size(), weight_t{1});
+      }
+
       cugraph::legacy::GraphCSRView<vertex_t, edge_t, weight_t> legacy_graph_view(
         const_cast<edge_t*>(edge_partition_view.offsets().data()),
         const_cast<vertex_t*>(edge_partition_view.indices().data()),
-        const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
+        (edge_weights == nullptr)
+          ? tmp_weights.data()
+          : const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
         edge_partition_view.offsets().size() - 1,
         edge_partition_view.indices().size());
 
@@ -298,10 +314,18 @@ struct analyze_clustering_ratio_cut_functor : public cugraph::c_api::abstract_fu
       auto graph_view          = graph->view();
       auto edge_partition_view = graph_view.local_edge_partition_view();
 
+      rmm::device_uvector<weight_t> tmp_weights(0, handle_.get_stream());
+      if (edge_weights == nullptr) {
+        tmp_weights.resize(edge_partition_view.indices().size(), handle_.get_stream());
+        cugraph::detail::scalar_fill(handle_, tmp_weights.data(), tmp_weights.size(), weight_t{1});
+      }
+
       cugraph::legacy::GraphCSRView<vertex_t, edge_t, weight_t> legacy_graph_view(
         const_cast<edge_t*>(edge_partition_view.offsets().data()),
         const_cast<vertex_t*>(edge_partition_view.indices().data()),
-        const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
+        (edge_weights == nullptr)
+          ? tmp_weights.data()
+          : const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
         edge_partition_view.offsets().size() - 1,
         edge_partition_view.indices().size());
 
@@ -405,10 +429,18 @@ struct analyze_clustering_edge_cut_functor : public cugraph::c_api::abstract_fun
       auto graph_view          = graph->view();
       auto edge_partition_view = graph_view.local_edge_partition_view();
 
+      rmm::device_uvector<weight_t> tmp_weights(0, handle_.get_stream());
+      if (edge_weights == nullptr) {
+        tmp_weights.resize(edge_partition_view.indices().size(), handle_.get_stream());
+        cugraph::detail::scalar_fill(handle_, tmp_weights.data(), tmp_weights.size(), weight_t{1});
+      }
+
       cugraph::legacy::GraphCSRView<vertex_t, edge_t, weight_t> legacy_graph_view(
         const_cast<edge_t*>(edge_partition_view.offsets().data()),
         const_cast<vertex_t*>(edge_partition_view.indices().data()),
-        const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
+        (edge_weights == nullptr)
+          ? tmp_weights.data()
+          : const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
         edge_partition_view.offsets().size() - 1,
         edge_partition_view.indices().size());
 
@@ -512,10 +544,18 @@ struct analyze_clustering_modularity_functor : public cugraph::c_api::abstract_f
       auto graph_view          = graph->view();
       auto edge_partition_view = graph_view.local_edge_partition_view();
 
+      rmm::device_uvector<weight_t> tmp_weights(0, handle_.get_stream());
+      if (edge_weights == nullptr) {
+        tmp_weights.resize(edge_partition_view.indices().size(), handle_.get_stream());
+        cugraph::detail::scalar_fill(handle_, tmp_weights.data(), tmp_weights.size(), weight_t{1});
+      }
+
       cugraph::legacy::GraphCSRView<vertex_t, edge_t, weight_t> legacy_graph_view(
         const_cast<edge_t*>(edge_partition_view.offsets().data()),
         const_cast<vertex_t*>(edge_partition_view.indices().data()),
-        const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
+        (edge_weights == nullptr)
+          ? tmp_weights.data()
+          : const_cast<weight_t*>(edge_weights->view().value_firsts().front()),
         edge_partition_view.offsets().size() - 1,
         edge_partition_view.indices().size());
 
diff --git a/cpp/src/c_api/leiden.cpp b/cpp/src/c_api/leiden.cpp
index 074ffc2d195..7b1ca10545c 100644
--- a/cpp/src/c_api/leiden.cpp
+++ b/cpp/src/c_api/leiden.cpp
@@ -18,7 +18,9 @@
 
 #include <c_api/abstract_functor.hpp>
 #include <c_api/graph.hpp>
+#include <c_api/graph_helper.hpp>
 #include <c_api/hierarchical_clustering_result.hpp>
+#include <c_api/random.hpp>
 #include <c_api/resource_handle.hpp>
 #include <c_api/utils.hpp>
 
@@ -27,25 +29,30 @@
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
 
+#include <raft/core/handle.hpp>
+
 #include <optional>
 
 namespace {
 
 struct leiden_functor : public cugraph::c_api::abstract_functor {
   raft::handle_t const& handle_;
-  cugraph::c_api::cugraph_graph_t* graph_;
+  cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr};
+  cugraph::c_api::cugraph_graph_t* graph_{nullptr};
   size_t max_level_;
   double resolution_;
   bool do_expensive_check_;
   cugraph::c_api::cugraph_hierarchical_clustering_result_t* result_{};
 
   leiden_functor(::cugraph_resource_handle_t const* handle,
+                 cugraph_rng_state_t* rng_state,
                  ::cugraph_graph_t* graph,
                  size_t max_level,
                  double resolution,
                  bool do_expensive_check)
     : abstract_functor(),
       handle_(*reinterpret_cast<cugraph::c_api::cugraph_resource_handle_t const*>(handle)->handle_),
+      rng_state_(reinterpret_cast<cugraph::c_api::cugraph_rng_state_t*>(rng_state)),
       graph_(reinterpret_cast<cugraph::c_api::cugraph_graph_t*>(graph)),
       max_level_(max_level),
       resolution_(resolution),
@@ -86,13 +93,23 @@ struct leiden_functor : public cugraph::c_api::abstract_functor {
       rmm::device_uvector<vertex_t> clusters(graph_view.local_vertex_partition_range_size(),
                                              handle_.get_stream());
 
-      auto [level, modularity] = cugraph::leiden(
-        handle_,
-        graph_view,
-        (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
-        clusters.data(),
-        max_level_,
-        static_cast<weight_t>(resolution_));
+      // FIXME: Revisit the constant edge property idea.  We could consider an alternate
+      // implementation (perhaps involving the thrust::constant_iterator), or we
+      // could add support in Leiden for std::nullopt as the edge weights behaving
+      // as desired and only instantiating a real edge_property_view_t for the
+      // coarsened graphs.
+      auto [level, modularity] =
+        cugraph::leiden(handle_,
+                        rng_state_->rng_state_,
+                        graph_view,
+                        (edge_weights != nullptr)
+                          ? std::make_optional(edge_weights->view())
+                          : std::make_optional(cugraph::c_api::create_constant_edge_property(
+                                                 handle_, graph_view, weight_t{1})
+                                                 .view()),
+                        clusters.data(),
+                        max_level_,
+                        static_cast<weight_t>(resolution_));
 
       rmm::device_uvector<vertex_t> vertices(graph_view.local_vertex_partition_range_size(),
                                              handle_.get_stream());
@@ -109,14 +126,16 @@ struct leiden_functor : public cugraph::c_api::abstract_functor {
 }  // namespace
 
 extern "C" cugraph_error_code_t cugraph_leiden(const cugraph_resource_handle_t* handle,
+                                               cugraph_rng_state_t* rng_state,
                                                cugraph_graph_t* graph,
                                                size_t max_level,
                                                double resolution,
+                                               double theta,
                                                bool_t do_expensive_check,
                                                cugraph_hierarchical_clustering_result_t** result,
                                                cugraph_error_t** error)
 {
-  leiden_functor functor(handle, graph, max_level, resolution, do_expensive_check);
+  leiden_functor functor(handle, rng_state, graph, max_level, resolution, do_expensive_check);
 
   return cugraph::c_api::run_algorithm(graph, functor, result, error);
 }
diff --git a/cpp/src/c_api/louvain.cpp b/cpp/src/c_api/louvain.cpp
index 0982b2b091c..ff75cafa031 100644
--- a/cpp/src/c_api/louvain.cpp
+++ b/cpp/src/c_api/louvain.cpp
@@ -18,6 +18,7 @@
 
 #include <c_api/abstract_functor.hpp>
 #include <c_api/graph.hpp>
+#include <c_api/graph_helper.hpp>
 #include <c_api/hierarchical_clustering_result.hpp>
 #include <c_api/resource_handle.hpp>
 #include <c_api/utils.hpp>
@@ -86,13 +87,22 @@ struct louvain_functor : public cugraph::c_api::abstract_functor {
       rmm::device_uvector<vertex_t> clusters(graph_view.local_vertex_partition_range_size(),
                                              handle_.get_stream());
 
-      auto [level, modularity] = cugraph::louvain(
-        handle_,
-        graph_view,
-        (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt,
-        clusters.data(),
-        max_level_,
-        static_cast<weight_t>(resolution_));
+      // FIXME: Revisit the constant edge property idea.  We could consider an alternate
+      // implementation (perhaps involving the thrust::constant_iterator), or we
+      // could add support in Louvain for std::nullopt as the edge weights behaving
+      // as desired and only instantiating a real edge_property_view_t for the
+      // coarsened graphs.
+      auto [level, modularity] =
+        cugraph::louvain(handle_,
+                         graph_view,
+                         (edge_weights != nullptr)
+                           ? std::make_optional(edge_weights->view())
+                           : std::make_optional(cugraph::c_api::create_constant_edge_property(
+                                                  handle_, graph_view, weight_t{1})
+                                                  .view()),
+                         clusters.data(),
+                         max_level_,
+                         static_cast<weight_t>(resolution_));
 
       rmm::device_uvector<vertex_t> vertices(graph_view.local_vertex_partition_range_size(),
                                              handle_.get_stream());
diff --git a/cpp/src/c_api/random.cpp b/cpp/src/c_api/random.cpp
index a539fa9c0d1..6cb371c3fac 100644
--- a/cpp/src/c_api/random.cpp
+++ b/cpp/src/c_api/random.cpp
@@ -67,7 +67,13 @@ struct select_random_vertices_functor : public cugraph::c_api::abstract_functor
       rmm::device_uvector<vertex_t> local_vertices(0, handle_.get_stream());
 
       local_vertices = cugraph::select_random_vertices(
-        handle_, graph_view, rng_state_->rng_state_, num_vertices_, false, false);
+        handle_,
+        graph_view,
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+        rng_state_->rng_state_,
+        num_vertices_,
+        false,
+        false);
 
       cugraph::unrenumber_int_vertices<vertex_t, multi_gpu>(
         handle_,
diff --git a/cpp/src/c_api/strongly_connected_components.cpp b/cpp/src/c_api/strongly_connected_components.cpp
index 9060772c1c2..9e3b4b3a4a3 100644
--- a/cpp/src/c_api/strongly_connected_components.cpp
+++ b/cpp/src/c_api/strongly_connected_components.cpp
@@ -58,7 +58,10 @@ struct scc_functor : public cugraph::c_api::abstract_functor {
     if constexpr (!cugraph::is_candidate<vertex_t, edge_t, weight_t>::value) {
       unsupported();
     } else if constexpr (multi_gpu) {
-      unsupported();
+      error_code_ = CUGRAPH_NOT_IMPLEMENTED;
+      error_->error_message_ =
+        "strongly connected components not currently implemented for multi-GPU";
+
     } else if constexpr (!std::is_same_v<vertex_t, edge_t>) {
       unsupported();
     } else if constexpr (std::is_same_v<weight_t, double>) {
diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh
index 5dcfea6a3d7..5631fadde96 100644
--- a/cpp/src/centrality/betweenness_centrality_impl.cuh
+++ b/cpp/src/centrality/betweenness_centrality_impl.cuh
@@ -31,6 +31,7 @@
 #include <cugraph/vertex_partition_device_view.cuh>
 
 #include <thrust/functional.h>
+#include <thrust/optional.h>
 #include <thrust/reduce.h>
 
 #include <raft/core/handle.hpp>
@@ -405,6 +406,8 @@ rmm::device_uvector<weight_t> betweenness_centrality(
   bool const include_endpoints,
   bool const do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (vertices) {
     return detail::betweenness_centrality(handle,
                                           graph_view,
@@ -436,6 +439,8 @@ rmm::device_uvector<weight_t> edge_betweenness_centrality(
   bool const normalized,
   bool const do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (vertices) {
     return detail::edge_betweenness_centrality(handle,
                                                graph_view,
diff --git a/cpp/src/centrality/eigenvector_centrality_impl.cuh b/cpp/src/centrality/eigenvector_centrality_impl.cuh
index 8374440ef96..291abf18455 100644
--- a/cpp/src/centrality/eigenvector_centrality_impl.cuh
+++ b/cpp/src/centrality/eigenvector_centrality_impl.cuh
@@ -172,6 +172,8 @@ rmm::device_uvector<weight_t> eigenvector_centrality(
   static_assert(std::is_floating_point<weight_t>::value,
                 "weight_t should be a floating-point type.");
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative.");
   if (initial_centralities)
     CUGRAPH_EXPECTS(initial_centralities->size() ==
diff --git a/cpp/src/centrality/katz_centrality_impl.cuh b/cpp/src/centrality/katz_centrality_impl.cuh
index 8fe1c3cfbf2..202d00a5771 100644
--- a/cpp/src/centrality/katz_centrality_impl.cuh
+++ b/cpp/src/centrality/katz_centrality_impl.cuh
@@ -208,6 +208,8 @@ void katz_centrality(raft::handle_t const& handle,
                      bool normalize,
                      bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   detail::katz_centrality(handle,
                           graph_view,
                           edge_weight_view,
diff --git a/cpp/src/community/detail/mis_impl.cuh b/cpp/src/community/detail/mis_impl.cuh
index c09da35f711..bcd71af5a08 100644
--- a/cpp/src/community/detail/mis_impl.cuh
+++ b/cpp/src/community/detail/mis_impl.cuh
@@ -16,48 +16,38 @@
  */
 #pragma once
 
+#include <community/mis.hpp>
 #include <prims/fill_edge_src_dst_property.cuh>
 #include <prims/per_v_transform_reduce_incoming_outgoing_e.cuh>
 #include <prims/update_edge_src_dst_property.cuh>
 
-#include <community/detail/mis.hpp>
 #include <cugraph/edge_property.hpp>
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/host_scalar_comm.hpp>
 
-#include <raft/util/cudart_utils.hpp>
-#include <raft/util/integer_utils.hpp>
-#include <rmm/exec_policy.hpp>
-
 #include <thrust/count.h>
 #include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/fill.h>
-#include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/merge.h>
 #include <thrust/optional.h>
 #include <thrust/remove.h>
-#include <thrust/sequence.h>
-#include <thrust/shuffle.h>
+#include <thrust/set_operations.h>
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 
 #include <cmath>
-#include <numeric>
-#include <type_traits>
-#include <utility>
 
 namespace cugraph {
 
 namespace detail {
 
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-rmm::device_uvector<vertex_t> compute_mis(
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> maximal_independent_set(
   raft::handle_t const& handle,
   cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
-  std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>> edge_weight_view)
+  raft::random::RngState& rng_state)
 {
   using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
 
@@ -88,25 +78,17 @@ rmm::device_uvector<vertex_t> compute_mis(
   thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
 
   // Set ranks of zero out-degree vetices to std::numeric_limits<vertex_t>::lowest()
-  thrust::for_each(
+  thrust::transform_if(
     handle.get_thrust_policy(),
-    vertex_begin,
-    vertex_end,
-    [out_degrees = raft::device_span<edge_t const>(out_degrees.data(), out_degrees.size()),
-     ranks       = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
-     v_first     = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
-      auto v_offset = v - v_first;
-      if (out_degrees[v_offset] == 0) { ranks[v_offset] = std::numeric_limits<vertex_t>::lowest(); }
-    });
+    out_degrees.begin(),
+    out_degrees.end(),
+    ranks.begin(),
+    [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); },
+    [] __device__(auto deg) { return deg == 0; });
 
   out_degrees.resize(0, handle.get_stream());
   out_degrees.shrink_to_fit(handle.get_stream());
 
-  thrust::default_random_engine g;
-  size_t seed = 0;
-  if constexpr (multi_gpu) { seed = handle.get_comms().get_rank(); }
-  g.seed(seed);
-
   size_t loop_counter = 0;
   while (true) {
     loop_counter++;
@@ -117,22 +99,48 @@ rmm::device_uvector<vertex_t> compute_mis(
     thrust::copy(handle.get_thrust_policy(), ranks.begin(), ranks.end(), temporary_ranks.begin());
 
     // Select a random set of candidate vertices
-    // FIXME: use common utility function to select a subset of remaining vertices
-    // and for MG extension, select from disributed array remaining vertices
-    thrust::shuffle(
-      handle.get_thrust_policy(), remaining_vertices.begin(), remaining_vertices.end(), g);
 
-    vertex_t nr_candidates =
-      (remaining_vertices.size() < 1024)
-        ? remaining_vertices.size()
-        : std::min(static_cast<vertex_t>((0.50 + 0.25 * loop_counter) * remaining_vertices.size()),
-                   static_cast<vertex_t>(remaining_vertices.size()));
+    vertex_t nr_remaining_vertices_to_check = remaining_vertices.size();
+    if (multi_gpu) {
+      nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
+                                                             nr_remaining_vertices_to_check,
+                                                             raft::comms::op_t::SUM,
+                                                             handle.get_stream());
+    }
+
+    vertex_t nr_candidates = (nr_remaining_vertices_to_check < 1024)
+                               ? nr_remaining_vertices_to_check
+                               : std::min(static_cast<vertex_t>((0.50 + 0.25 * loop_counter) *
+                                                                nr_remaining_vertices_to_check),
+                                          nr_remaining_vertices_to_check);
+
+    // FIXME: Can we improve performance here?
+    // FIXME: if(nr_remaining_vertices_to_check < 1024), may avoid calling select_random_vertices
+    auto d_sampled_vertices =
+      cugraph::select_random_vertices(handle,
+                                      graph_view,
+                                      std::make_optional(raft::device_span<vertex_t const>{
+                                        remaining_vertices.data(), remaining_vertices.size()}),
+                                      rng_state,
+                                      nr_candidates,
+                                      false,
+                                      true);
+
+    rmm::device_uvector<vertex_t> non_candidate_vertices(
+      remaining_vertices.size() - d_sampled_vertices.size(), handle.get_stream());
+
+    thrust::set_difference(handle.get_thrust_policy(),
+                           remaining_vertices.begin(),
+                           remaining_vertices.end(),
+                           d_sampled_vertices.begin(),
+                           d_sampled_vertices.end(),
+                           non_candidate_vertices.begin());
 
     // Set temporary ranks of non-candidate vertices to std::numeric_limits<vertex_t>::lowest()
     thrust::for_each(
       handle.get_thrust_policy(),
-      remaining_vertices.begin(),
-      remaining_vertices.end() - nr_candidates,
+      non_candidate_vertices.begin(),
+      non_candidate_vertices.end(),
       [temporary_ranks =
          raft::device_span<vertex_t>(temporary_ranks.data(), temporary_ranks.size()),
        v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
@@ -160,7 +168,6 @@ rmm::device_uvector<vertex_t> compute_mis(
 
     //
     // Find maximum rank outgoing neighbor for each vertex
-    // (In case of Leiden decision graph, each vertex has at most one outgoing edge)
     //
 
     rmm::device_uvector<vertex_t> max_outgoing_ranks(local_vtx_partitoin_size, handle.get_stream());
@@ -224,8 +231,8 @@ rmm::device_uvector<vertex_t> compute_mis(
     //
     auto last = thrust::remove_if(
       handle.get_thrust_policy(),
-      remaining_vertices.end() - nr_candidates,
-      remaining_vertices.end(),
+      d_sampled_vertices.begin(),
+      d_sampled_vertices.end(),
       [max_rank_neighbor_first = max_outgoing_ranks.begin(),
        ranks                   = raft::device_span<vertex_t>(ranks.data(), ranks.size()),
        v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) {
@@ -252,11 +259,23 @@ rmm::device_uvector<vertex_t> compute_mis(
     max_outgoing_ranks.resize(0, handle.get_stream());
     max_outgoing_ranks.shrink_to_fit(handle.get_stream());
 
-    remaining_vertices.resize(thrust::distance(remaining_vertices.begin(), last),
+    d_sampled_vertices.resize(thrust::distance(d_sampled_vertices.begin(), last),
+                              handle.get_stream());
+    d_sampled_vertices.shrink_to_fit(handle.get_stream());
+
+    remaining_vertices.resize(non_candidate_vertices.size() + d_sampled_vertices.size(),
                               handle.get_stream());
     remaining_vertices.shrink_to_fit(handle.get_stream());
 
-    vertex_t nr_remaining_vertices_to_check = remaining_vertices.size();
+    // merge non-candidate and remaining candidate vertices
+    thrust::merge(handle.get_thrust_policy(),
+                  non_candidate_vertices.begin(),
+                  non_candidate_vertices.end(),
+                  d_sampled_vertices.begin(),
+                  d_sampled_vertices.end(),
+                  remaining_vertices.begin());
+
+    nr_remaining_vertices_to_check = remaining_vertices.size();
     if (multi_gpu) {
       nr_remaining_vertices_to_check = host_scalar_allreduce(handle.get_comms(),
                                                              nr_remaining_vertices_to_check,
@@ -289,4 +308,14 @@ rmm::device_uvector<vertex_t> compute_mis(
   return mis;
 }
 }  // namespace detail
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> maximal_independent_set(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+  raft::random::RngState& rng_state)
+{
+  return detail::maximal_independent_set(handle, graph_view, rng_state);
+}
+
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/mis_mg.cu b/cpp/src/community/detail/mis_mg.cu
index def60f698ee..8ff0ed4b395 100644
--- a/cpp/src/community/detail/mis_mg.cu
+++ b/cpp/src/community/detail/mis_mg.cu
@@ -16,36 +16,19 @@
 #include <community/detail/mis_impl.cuh>
 
 namespace cugraph {
-namespace detail {
-template rmm::device_uvector<int32_t> compute_mis(
+template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int32_t, false, true> const& decision_graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view);
+  raft::random::RngState& rng_state);
 
-template rmm::device_uvector<int32_t> compute_mis(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& decision_graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view);
-
-template rmm::device_uvector<int32_t> compute_mis(
+template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int64_t, false, true> const& decision_graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view);
-
-template rmm::device_uvector<int32_t> compute_mis(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, true> const& decision_graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view);
-
-template rmm::device_uvector<int64_t> compute_mis(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& decision_graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view);
+  raft::random::RngState& rng_state);
 
-template rmm::device_uvector<int64_t> compute_mis(
+template rmm::device_uvector<int64_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<int64_t, int64_t, false, true> const& decision_graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view);
+  raft::random::RngState& rng_state);
 
-}  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/mis_sg.cu b/cpp/src/community/detail/mis_sg.cu
index 4da2b4ea741..d1012ae17bb 100644
--- a/cpp/src/community/detail/mis_sg.cu
+++ b/cpp/src/community/detail/mis_sg.cu
@@ -16,36 +16,19 @@
 #include <community/detail/mis_impl.cuh>
 
 namespace cugraph {
-namespace detail {
-template rmm::device_uvector<int32_t> compute_mis(
+template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int32_t, false, false> const& decision_graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view);
+  raft::random::RngState& rng_state);
 
-template rmm::device_uvector<int32_t> compute_mis(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& decision_graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view);
-
-template rmm::device_uvector<int32_t> compute_mis(
+template rmm::device_uvector<int32_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int64_t, false, false> const& decision_graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view);
-
-template rmm::device_uvector<int32_t> compute_mis(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, false> const& decision_graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view);
-
-template rmm::device_uvector<int64_t> compute_mis(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& decision_graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view);
+  raft::random::RngState& rng_state);
 
-template rmm::device_uvector<int64_t> compute_mis(
+template rmm::device_uvector<int64_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<int64_t, int64_t, false, false> const& decision_graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view);
+  raft::random::RngState& rng_state);
 
-}  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/refine.hpp b/cpp/src/community/detail/refine.hpp
index 0dd069645f3..69b6702edf8 100644
--- a/cpp/src/community/detail/refine.hpp
+++ b/cpp/src/community/detail/refine.hpp
@@ -20,6 +20,7 @@
 #include <cugraph/graph.hpp>
 
 #include <raft/core/handle.hpp>
+#include <raft/random/rng_state.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace cugraph {
@@ -31,11 +32,13 @@ std::tuple<rmm::device_uvector<typename graph_view_t::vertex_type>,
                      rmm::device_uvector<typename graph_view_t::vertex_type>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t const& graph_view,
   std::optional<edge_property_view_t<typename graph_view_t::edge_type, weight_t const*>>
     edge_weight_view,
   weight_t total_edge_weight,
   weight_t resolution,
+  weight_t theta,
   rmm::device_uvector<weight_t> const& vertex_weights_v,
   rmm::device_uvector<typename graph_view_t::vertex_type>&& cluster_keys_v,
   rmm::device_uvector<weight_t>&& cluster_weights_v,
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index 2976a83773e..bbd720131de 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -16,7 +16,7 @@
 #pragma once
 
 #include <community/detail/common_methods.hpp>
-#include <community/detail/mis.hpp>
+#include <community/mis.hpp>
 #include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
@@ -29,12 +29,15 @@
 #include <prims/update_edge_src_dst_property.cuh>
 #include <utilities/collect_comm.cuh>
 
+#include <raft/random/rng_device.cuh>
+
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/optional.h>
+#include <thrust/random.h>
 #include <thrust/sequence.h>
 #include <thrust/shuffle.h>
 #include <thrust/sort.h>
@@ -43,9 +46,6 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
-#include <algorithm>
-#include <cmath>
-
 CUCO_DECLARE_BITWISE_COMPARABLE(float)
 CUCO_DECLARE_BITWISE_COMPARABLE(double)
 
@@ -57,7 +57,9 @@ namespace detail {
 template <typename vertex_t, typename weight_t, typename cluster_value_t>
 struct leiden_key_aggregated_edge_op_t {
   weight_t total_edge_weight{};
-  weight_t gamma{};
+  weight_t resolution{};  // resolution parameter
+  weight_t theta{};       // scaling factor
+  raft::random::DeviceState<raft::random::PCGenerator> device_state{};
   __device__ auto operator()(
     vertex_t src,
     vertex_t neighboring_leiden_cluster,
@@ -83,22 +85,37 @@ struct leiden_key_aggregated_edge_op_t {
     // E(Cr, S-Cr) > ||Cr||*(||S|| -||Cr||)
     bool is_dst_leiden_cluster_well_connected =
       dst_leiden_cut_to_louvain >
-      gamma * dst_leiden_volume * (louvain_cluster_volume - dst_leiden_volume);
+      resolution * dst_leiden_volume * (louvain_cluster_volume - dst_leiden_volume);
 
     // E(v, Cr-v) - ||v||* ||Cr-v||/||V(G)||
     // aggregated_weight_to_neighboring_leiden_cluster == E(v, Cr-v)?
 
-    weight_t theta = -1.0;
-    // if ((is_src_active > 0) && is_src_well_connected) {
+    weight_t mod_gain = -1.0;
     if (is_src_active > 0) {
       if ((louvain_of_dst_leiden_cluster == src_louvain_cluster) &&
           is_dst_leiden_cluster_well_connected) {
-        theta = aggregated_weight_to_neighboring_leiden_cluster -
-                gamma * src_weighted_deg * dst_leiden_volume / total_edge_weight;
+        mod_gain = aggregated_weight_to_neighboring_leiden_cluster -
+                   resolution * src_weighted_deg * (dst_leiden_volume - src_weighted_deg) /
+                     total_edge_weight;
+
+        weight_t random_number{0.0};
+        if (mod_gain > 0.0) {
+          auto flat_id = uint64_t{threadIdx.x + blockIdx.x * blockDim.x};
+          raft::random::PCGenerator gen(device_state, flat_id);
+          raft::random::UniformDistParams<weight_t> int_params{};
+          int_params.start = weight_t{0.0};
+          int_params.end   = weight_t{1.0};
+          raft::random::custom_next(gen, &random_number, int_params, 0, 0);
+        }
+
+        mod_gain = mod_gain > 0.0
+                     ? __expf(static_cast<float>((2.0 * mod_gain) / (theta * total_edge_weight))) *
+                         random_number
+                     : -1.0;
       }
     }
 
-    return thrust::make_tuple(theta, neighboring_leiden_cluster);
+    return thrust::make_tuple(mod_gain, neighboring_leiden_cluster);
   }
 };
 
@@ -108,11 +125,13 @@ std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
                      rmm::device_uvector<typename GraphViewType::vertex_type>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   GraphViewType const& graph_view,
   std::optional<edge_property_view_t<typename GraphViewType::edge_type, weight_t const*>>
     edge_weight_view,
   weight_t total_edge_weight,
   weight_t resolution,
+  weight_t theta,
   rmm::device_uvector<weight_t> const& weighted_degree_of_vertices,
   rmm::device_uvector<typename GraphViewType::vertex_type>&& louvain_cluster_keys,
   rmm::device_uvector<weight_t>&& louvain_cluster_weights,
@@ -216,11 +235,11 @@ refine_clustering(
                     wcut_deg_and_cluster_vol_triple_begin,
                     wcut_deg_and_cluster_vol_triple_end,
                     singleton_and_connected_flags.begin(),
-                    [gamma = resolution] __device__(auto wcut_wdeg_and_louvain_volume) {
+                    [resolution] __device__(auto wcut_wdeg_and_louvain_volume) {
                       auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
                       auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
                       auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
-                      return wcut > (gamma * wdeg * (louvain_volume - wdeg));
+                      return wcut > (resolution * wdeg * (louvain_volume - wdeg));
                     });
 
   edge_src_property_t<GraphViewType, weight_t> src_louvain_cluster_weight_cache(handle);
@@ -352,7 +371,7 @@ refine_clustering(
                       thrust::tuple<vertex_t, vertex_t> src_louvain_leidn,
                       thrust::tuple<vertex_t, vertex_t> dst_louvain_leiden,
                       auto wt) {
-          weight_t refined_partition_volume_contribution{0};
+          weight_t refined_partition_volume_contribution{wt};
           weight_t refined_partition_cut_contribution{0};
 
           auto src_louvain = thrust::get<0>(src_louvain_leidn);
@@ -362,11 +381,7 @@ refine_clustering(
           auto dst_leiden  = thrust::get<1>(dst_louvain_leiden);
 
           if (src_louvain == dst_louvain) {
-            if (src_leiden == dst_leiden) {
-              refined_partition_volume_contribution = wt;
-            } else {
-              refined_partition_cut_contribution = wt;
-            }
+            if (src_leiden != dst_leiden) { refined_partition_cut_contribution = wt; }
           }
           return thrust::make_tuple(refined_partition_volume_contribution,
                                     refined_partition_cut_contribution);
@@ -407,11 +422,49 @@ refine_clustering(
               louvain_assignment_of_vertices.data()));
 
     rmm::device_uvector<vertex_t> louvain_of_leiden_keys_used_in_edge_reduction(
-      leiden_keys_used_in_edge_reduction.size(), handle.get_stream());
-    leiden_to_louvain_map.view().find(leiden_keys_used_in_edge_reduction.begin(),
-                                      leiden_keys_used_in_edge_reduction.end(),
-                                      louvain_of_leiden_keys_used_in_edge_reduction.begin(),
-                                      handle.get_stream());
+      0, handle.get_stream());
+
+    if (GraphViewType::is_multi_gpu) {
+      auto& comm           = handle.get_comms();
+      auto const comm_size = comm.get_size();
+      auto& major_comm     = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+      auto const major_comm_size = major_comm.get_size();
+      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+      auto const minor_comm_size = minor_comm.get_size();
+
+      auto partitions_range_lasts = graph_view.vertex_partition_range_lasts();
+      rmm::device_uvector<vertex_t> d_partitions_range_lasts(partitions_range_lasts.size(),
+                                                             handle.get_stream());
+
+      raft::update_device(d_partitions_range_lasts.data(),
+                          partitions_range_lasts.data(),
+                          partitions_range_lasts.size(),
+                          handle.get_stream());
+
+      cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t> vertex_to_gpu_id_op{
+        raft::device_span<vertex_t const>(d_partitions_range_lasts.data(),
+                                          d_partitions_range_lasts.size()),
+        major_comm_size,
+        minor_comm_size};
+
+      // cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+      //   comm_size, major_comm_size, minor_comm_size};
+
+      louvain_of_leiden_keys_used_in_edge_reduction =
+        cugraph::collect_values_for_keys(handle,
+                                         leiden_to_louvain_map.view(),
+                                         leiden_keys_used_in_edge_reduction.begin(),
+                                         leiden_keys_used_in_edge_reduction.end(),
+                                         vertex_to_gpu_id_op);
+    } else {
+      louvain_of_leiden_keys_used_in_edge_reduction.resize(
+        leiden_keys_used_in_edge_reduction.size(), handle.get_stream());
+
+      leiden_to_louvain_map.view().find(leiden_keys_used_in_edge_reduction.begin(),
+                                        leiden_keys_used_in_edge_reduction.end(),
+                                        louvain_of_leiden_keys_used_in_edge_reduction.begin(),
+                                        handle.get_stream());
+    }
 
     // ||Cr|| //f(Cr)
     // E(Cr, louvain(v) - Cr) //f(Cr)
@@ -438,6 +491,9 @@ refine_clustering(
     //
     // Decide best/positive move for each vertex
     //
+    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+    raft::random::RngState rng_state(seed);
+    raft::random::DeviceState<raft::random::PCGenerator> device_state(rng_state);
 
     auto gain_and_dst_output_pairs = allocate_dataframe_buffer<thrust::tuple<weight_t, vertex_t>>(
       graph_view.local_vertex_partition_range_size(), handle.get_stream());
@@ -451,8 +507,8 @@ refine_clustering(
                                   : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
                                       leiden_assignment.data(), vertex_t{0}),
       leiden_cluster_key_values_map.view(),
-      detail::leiden_key_aggregated_edge_op_t<vertex_t, weight_t, value_t>{total_edge_weight,
-                                                                           resolution},
+      detail::leiden_key_aggregated_edge_op_t<vertex_t, weight_t, value_t>{
+        total_edge_weight, resolution, theta, device_state},
       thrust::make_tuple(weight_t{0}, vertex_t{-1}),
       reduce_op::maximum<thrust::tuple<weight_t, vertex_t>>(),
       cugraph::get_dataframe_buffer_begin(gain_and_dst_output_pairs));
@@ -483,16 +539,6 @@ refine_clustering(
     auto vertex_end =
       thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last());
 
-    // edge (src, dst, gain)
-    auto edge_begin = thrust::make_zip_iterator(
-      thrust::make_tuple(vertex_begin,
-                         thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
-                         thrust::get<0>(gain_and_dst_first.get_iterator_tuple())));
-    auto edge_end = thrust::make_zip_iterator(
-      thrust::make_tuple(vertex_end,
-                         thrust::get<1>(gain_and_dst_last.get_iterator_tuple()),
-                         thrust::get<0>(gain_and_dst_last.get_iterator_tuple())));
-
     //
     // Filter out moves with -ve gains
     //
@@ -501,17 +547,18 @@ refine_clustering(
                                                 gain_and_dst_first,
                                                 gain_and_dst_last,
                                                 [] __device__(auto gain_dst_pair) {
-                                                  weight_t gain = thrust::get<0>(gain_dst_pair);
                                                   vertex_t dst  = thrust::get<1>(gain_dst_pair);
+                                                  weight_t gain = thrust::get<0>(gain_dst_pair);
                                                   return (gain > POSITIVE_GAIN) && (dst >= 0);
                                                 });
 
+    vertex_t total_nr_valid_tuples = nr_valid_tuples;
     if (GraphViewType::is_multi_gpu) {
-      nr_valid_tuples = host_scalar_allreduce(
-        handle.get_comms(), nr_valid_tuples, raft::comms::op_t::SUM, handle.get_stream());
+      total_nr_valid_tuples = host_scalar_allreduce(
+        handle.get_comms(), total_nr_valid_tuples, raft::comms::op_t::SUM, handle.get_stream());
     }
 
-    if (nr_valid_tuples == 0) {
+    if (total_nr_valid_tuples == 0) {
       cugraph::resize_dataframe_buffer(gain_and_dst_output_pairs, 0, handle.get_stream());
       cugraph::shrink_to_fit_dataframe_buffer(gain_and_dst_output_pairs, handle.get_stream());
       break;
@@ -525,6 +572,16 @@ refine_clustering(
     auto d_src_dst_gain_iterator = thrust::make_zip_iterator(
       thrust::make_tuple(d_srcs.begin(), d_dsts.begin(), (*d_weights).begin()));
 
+    // edge (src, dst, gain)
+    auto edge_begin = thrust::make_zip_iterator(
+      thrust::make_tuple(vertex_begin,
+                         thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
+                         thrust::get<0>(gain_and_dst_first.get_iterator_tuple())));
+    auto edge_end = thrust::make_zip_iterator(
+      thrust::make_tuple(vertex_end,
+                         thrust::get<1>(gain_and_dst_last.get_iterator_tuple()),
+                         thrust::get<0>(gain_and_dst_last.get_iterator_tuple())));
+
     thrust::copy_if(handle.get_thrust_policy(),
                     edge_begin,
                     edge_end,
@@ -540,23 +597,41 @@ refine_clustering(
     //
     // Create decision graph from edgelist
     //
-    constexpr bool storage_transposed = false;
-    constexpr bool multi_gpu          = GraphViewType::is_multi_gpu;
-    using DecisionGraphViewType       = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+    constexpr bool store_transposed = false;
+    constexpr bool multi_gpu        = GraphViewType::is_multi_gpu;
+    using DecisionGraphViewType     = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
 
-    cugraph::graph_t<vertex_t, edge_t, storage_transposed, multi_gpu> decision_graph(handle);
+    cugraph::graph_t<vertex_t, edge_t, store_transposed, multi_gpu> decision_graph(handle);
 
     std::optional<rmm::device_uvector<vertex_t>> renumber_map{std::nullopt};
     std::optional<edge_property_t<DecisionGraphViewType, weight_t>> coarse_edge_weights{
       std::nullopt};
 
+    if constexpr (multi_gpu) {
+      std::tie(store_transposed ? d_dsts : d_srcs,
+               store_transposed ? d_srcs : d_dsts,
+               d_weights,
+               std::ignore,
+               std::ignore) =
+        cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
+          vertex_t,
+          vertex_t,
+          weight_t,
+          int32_t>(handle,
+                   store_transposed ? std::move(d_dsts) : std::move(d_srcs),
+                   store_transposed ? std::move(d_srcs) : std::move(d_dsts),
+                   std::move(d_weights),
+                   std::nullopt,
+                   std::nullopt);
+    }
+
     std::tie(decision_graph, coarse_edge_weights, std::ignore, std::ignore, renumber_map) =
       create_graph_from_edgelist<vertex_t,
                                  edge_t,
                                  weight_t,
                                  edge_t,
                                  int32_t,
-                                 storage_transposed,
+                                 store_transposed,
                                  multi_gpu>(handle,
                                             std::nullopt,
                                             std::move(d_srcs),
@@ -565,7 +640,8 @@ refine_clustering(
                                             std::nullopt,
                                             std::nullopt,
                                             cugraph::graph_properties_t{false, false},
-                                            true);
+                                            true,
+                                            false);
 
     auto decision_graph_view = decision_graph.view();
 
@@ -573,10 +649,8 @@ refine_clustering(
     // Determine a set of moves using MIS of the decision_graph
     //
 
-    auto vertices_in_mis = compute_mis<vertex_t, edge_t, weight_t, multi_gpu>(
-      handle,
-      decision_graph_view,
-      coarse_edge_weights ? std::make_optional(coarse_edge_weights->view()) : std::nullopt);
+    auto vertices_in_mis =
+      maximal_independent_set<vertex_t, edge_t, multi_gpu>(handle, decision_graph_view, rng_state);
 
     rmm::device_uvector<vertex_t> numbering_indices((*renumber_map).size(), handle.get_stream());
     detail::sequence_fill(handle.get_stream(),
@@ -602,6 +676,11 @@ refine_clustering(
     (*renumber_map).resize(0, handle.get_stream());
     (*renumber_map).shrink_to_fit(handle.get_stream());
 
+    if (GraphViewType::is_multi_gpu) {
+      vertices_in_mis = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        handle, std::move(vertices_in_mis), graph_view.vertex_partition_range_lasts());
+    }
+
     //
     // Mark the chosen vertices as non-singleton and update their leiden cluster to dst
     //
@@ -650,9 +729,10 @@ refine_clustering(
         thrust::unique(handle.get_thrust_policy(), dst_vertices.begin(), dst_vertices.end()))),
       handle.get_stream());
 
+    // Shuffle dst vertices to owner GPU, according to vetex partitioning
     if constexpr (GraphViewType::is_multi_gpu) {
-      dst_vertices =
-        shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(handle, std::move(dst_vertices));
+      dst_vertices = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        handle, std::move(dst_vertices), graph_view.vertex_partition_range_lasts());
 
       thrust::sort(handle.get_thrust_policy(), dst_vertices.begin(), dst_vertices.end());
 
@@ -682,9 +762,6 @@ refine_clustering(
   src_louvain_cluster_weight_cache.clear(handle);
   src_cut_to_louvain_cache.clear(handle);
 
-  louvain_assignment_of_vertices.resize(0, handle.get_stream());
-  louvain_assignment_of_vertices.shrink_to_fit(handle.get_stream());
-
   singleton_and_connected_flags.resize(0, handle.get_stream());
   singleton_and_connected_flags.shrink_to_fit(handle.get_stream());
   vertex_louvain_cluster_weights.resize(0, handle.get_stream());
@@ -716,9 +793,13 @@ refine_clustering(
   leiden_keys_to_read_louvain.resize(nr_unique_leiden_clusters, handle.get_stream());
 
   if constexpr (GraphViewType::is_multi_gpu) {
+    // leiden_keys_to_read_louvain =
+    //   cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
+    //     handle, std::move(leiden_keys_to_read_louvain));
+
     leiden_keys_to_read_louvain =
-      cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
-        handle, std::move(leiden_keys_to_read_louvain));
+      cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
+        handle, std::move(leiden_keys_to_read_louvain), graph_view.vertex_partition_range_lasts());
 
     thrust::sort(handle.get_thrust_policy(),
                  leiden_keys_to_read_louvain.begin(),
@@ -742,8 +823,23 @@ refine_clustering(
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
-      comm_size, major_comm_size, minor_comm_size};
+    auto partitions_range_lasts = graph_view.vertex_partition_range_lasts();
+    rmm::device_uvector<vertex_t> d_partitions_range_lasts(partitions_range_lasts.size(),
+                                                           handle.get_stream());
+
+    raft::update_device(d_partitions_range_lasts.data(),
+                        partitions_range_lasts.data(),
+                        partitions_range_lasts.size(),
+                        handle.get_stream());
+
+    cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t> vertex_to_gpu_id_op{
+      raft::device_span<vertex_t const>(d_partitions_range_lasts.data(),
+                                        d_partitions_range_lasts.size()),
+      major_comm_size,
+      minor_comm_size};
+
+    // cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+    //   comm_size, major_comm_size, minor_comm_size};
 
     lovain_of_leiden_cluster_keys =
       cugraph::collect_values_for_keys(handle,
@@ -751,6 +847,7 @@ refine_clustering(
                                        leiden_keys_to_read_louvain.begin(),
                                        leiden_keys_to_read_louvain.end(),
                                        vertex_to_gpu_id_op);
+
   } else {
     lovain_of_leiden_cluster_keys.resize(leiden_keys_to_read_louvain.size(), handle.get_stream());
 
diff --git a/cpp/src/community/detail/refine_mg.cu b/cpp/src/community/detail/refine_mg.cu
index 570298126bf..85b4a150e84 100644
--- a/cpp/src/community/detail/refine_mg.cu
+++ b/cpp/src/community/detail/refine_mg.cu
@@ -22,10 +22,12 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
   float total_edge_weight,
   float resolution,
+  float theta,
   rmm::device_uvector<float> const& vertex_weights_v,
   rmm::device_uvector<int32_t>&& cluster_keys_v,
   rmm::device_uvector<float>&& cluster_weights_v,
@@ -42,10 +44,12 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
   float total_edge_weight,
   float resolution,
+  float theta,
   rmm::device_uvector<float> const& vertex_weights_v,
   rmm::device_uvector<int32_t>&& cluster_keys_v,
   rmm::device_uvector<float>&& cluster_weights_v,
@@ -62,10 +66,12 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     std::pair<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
   float total_edge_weight,
   float resolution,
+  float theta,
   rmm::device_uvector<float> const& vertex_weights_v,
   rmm::device_uvector<int64_t>&& cluster_keys_v,
   rmm::device_uvector<float>&& cluster_weights_v,
@@ -82,10 +88,12 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
   double total_edge_weight,
   double resolution,
+  double theta,
   rmm::device_uvector<double> const& vertex_weights_v,
   rmm::device_uvector<int32_t>&& cluster_keys_v,
   rmm::device_uvector<double>&& cluster_weights_v,
@@ -102,10 +110,12 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
   double total_edge_weight,
   double resolution,
+  double theta,
   rmm::device_uvector<double> const& vertex_weights_v,
   rmm::device_uvector<int32_t>&& cluster_keys_v,
   rmm::device_uvector<double>&& cluster_weights_v,
@@ -122,10 +132,12 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     std::pair<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
   double total_edge_weight,
   double resolution,
+  double theta,
   rmm::device_uvector<double> const& vertex_weights_v,
   rmm::device_uvector<int64_t>&& cluster_keys_v,
   rmm::device_uvector<double>&& cluster_weights_v,
diff --git a/cpp/src/community/detail/refine_sg.cu b/cpp/src/community/detail/refine_sg.cu
index 2e8f80ebb78..140a23b7d53 100644
--- a/cpp/src/community/detail/refine_sg.cu
+++ b/cpp/src/community/detail/refine_sg.cu
@@ -22,10 +22,12 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
   float total_edge_weight,
   float resolution,
+  float theta,
   rmm::device_uvector<float> const& vertex_weights_v,
   rmm::device_uvector<int32_t>&& cluster_keys_v,
   rmm::device_uvector<float>&& cluster_weights_v,
@@ -42,10 +44,12 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
   float total_edge_weight,
   float resolution,
+  float theta,
   rmm::device_uvector<float> const& vertex_weights_v,
   rmm::device_uvector<int32_t>&& cluster_keys_v,
   rmm::device_uvector<float>&& cluster_weights_v,
@@ -62,10 +66,12 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     std::pair<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int64_t, int64_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
   float total_edge_weight,
   float resolution,
+  float theta,
   rmm::device_uvector<float> const& vertex_weights_v,
   rmm::device_uvector<int64_t>&& cluster_keys_v,
   rmm::device_uvector<float>&& cluster_weights_v,
@@ -82,10 +88,12 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int32_t, int32_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
   double total_edge_weight,
   double resolution,
+  double theta,
   rmm::device_uvector<double> const& vertex_weights_v,
   rmm::device_uvector<int32_t>&& cluster_keys_v,
   rmm::device_uvector<double>&& cluster_weights_v,
@@ -102,10 +110,12 @@ template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int32_t, int64_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
   double total_edge_weight,
   double resolution,
+  double theta,
   rmm::device_uvector<double> const& vertex_weights_v,
   rmm::device_uvector<int32_t>&& cluster_keys_v,
   rmm::device_uvector<double>&& cluster_weights_v,
@@ -122,10 +132,12 @@ template std::tuple<rmm::device_uvector<int64_t>,
                     std::pair<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>>
 refine_clustering(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   cugraph::graph_view_t<int64_t, int64_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
   double total_edge_weight,
   double resolution,
+  double theta,
   rmm::device_uvector<double> const& vertex_weights_v,
   rmm::device_uvector<int64_t>&& cluster_keys_v,
   rmm::device_uvector<double>&& cluster_weights_v,
diff --git a/cpp/src/community/egonet_impl.cuh b/cpp/src/community/egonet_impl.cuh
index f7a3c26455f..5cbb0b5e4b6 100644
--- a/cpp/src/community/egonet_impl.cuh
+++ b/cpp/src/community/egonet_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-//#define TIMING
+// #define TIMING
 
 #include <utilities/graph_utils.cuh>
 
@@ -228,6 +228,8 @@ extract_ego(raft::handle_t const& handle,
             vertex_t n_subgraphs,
             vertex_t radius)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS(n_subgraphs > 0, "Need at least one source to extract the egonet from");
   CUGRAPH_EXPECTS(n_subgraphs < graph_view.number_of_vertices(),
                   "Can't have more sources to extract from than vertices in the graph");
@@ -255,6 +257,8 @@ extract_ego(raft::handle_t const& handle,
             vertex_t radius,
             bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS(radius > 0, "Radius should be at least 1");
   CUGRAPH_EXPECTS(radius < graph_view.number_of_vertices(), "radius is too large");
 
diff --git a/cpp/src/community/legacy/extract_subgraph_by_vertex.cu b/cpp/src/community/legacy/extract_subgraph_by_vertex.cu
deleted file mode 100644
index 481b1fa33e9..00000000000
--- a/cpp/src/community/legacy/extract_subgraph_by_vertex.cu
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/legacy/graph.hpp>
-#include <cugraph/utilities/error.hpp>
-
-#include <raft/util/device_atomics.cuh>
-#include <rmm/device_vector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/count.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-
-namespace {
-
-template <typename vertex_t, typename edge_t, typename weight_t, bool has_weight>
-std::unique_ptr<cugraph::legacy::GraphCOO<vertex_t, edge_t, weight_t>> extract_subgraph_by_vertices(
-  cugraph::legacy::GraphCOOView<vertex_t, edge_t, weight_t> const& graph,
-  vertex_t const* vertices,
-  vertex_t num_vertices,
-  cudaStream_t stream)
-{
-  edge_t graph_num_verts = graph.number_of_vertices;
-
-  rmm::device_vector<int64_t> error_count_v{1, 0};
-  rmm::device_vector<vertex_t> vertex_used_v{graph_num_verts, num_vertices};
-
-  vertex_t* d_vertex_used = vertex_used_v.data().get();
-  int64_t* d_error_count  = error_count_v.data().get();
-
-  thrust::for_each(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<vertex_t>(0),
-    thrust::make_counting_iterator<vertex_t>(num_vertices),
-    [vertices, d_vertex_used, d_error_count, graph_num_verts] __device__(vertex_t idx) {
-      vertex_t v = vertices[idx];
-      if ((v >= 0) && (v < graph_num_verts)) {
-        d_vertex_used[v] = idx;
-      } else {
-        atomicAdd(d_error_count, int64_t{1});
-      }
-    });
-
-  CUGRAPH_EXPECTS(error_count_v[0] == 0,
-                  "Input error... vertices specifies vertex id out of range");
-
-  vertex_t* graph_src    = graph.src_indices;
-  vertex_t* graph_dst    = graph.dst_indices;
-  weight_t* graph_weight = graph.edge_data;
-
-  // iterate over the edges and count how many make it into the output
-  int64_t count = thrust::count_if(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<edge_t>(0),
-    thrust::make_counting_iterator<edge_t>(graph.number_of_edges),
-    [graph_src, graph_dst, d_vertex_used, num_vertices] __device__(edge_t e) {
-      vertex_t s = graph_src[e];
-      vertex_t d = graph_dst[e];
-      return ((d_vertex_used[s] < num_vertices) && (d_vertex_used[d] < num_vertices));
-    });
-
-  if (count > 0) {
-    auto result = std::make_unique<cugraph::legacy::GraphCOO<vertex_t, edge_t, weight_t>>(
-      num_vertices, count, has_weight);
-
-    vertex_t* d_new_src    = result->src_indices();
-    vertex_t* d_new_dst    = result->dst_indices();
-    weight_t* d_new_weight = result->edge_data();
-
-    //  reusing error_count as a vertex counter...
-    thrust::for_each(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<edge_t>(0),
-                     thrust::make_counting_iterator<edge_t>(graph.number_of_edges),
-                     [graph_src,
-                      graph_dst,
-                      graph_weight,
-                      d_vertex_used,
-                      num_vertices,
-                      d_error_count,
-                      d_new_src,
-                      d_new_dst,
-                      d_new_weight] __device__(edge_t e) {
-                       vertex_t s = graph_src[e];
-                       vertex_t d = graph_dst[e];
-                       if ((d_vertex_used[s] < num_vertices) && (d_vertex_used[d] < num_vertices)) {
-                         //  NOTE: Could avoid atomic here by doing a inclusive sum, but that would
-                         //     require 2*|E| temporary memory.  If this becomes important perhaps
-                         //     we make 2 implementations and pick one based on the number of
-                         //     vertices in the subgraph set.
-                         auto pos       = atomicAdd(d_error_count, int64_t{1});
-                         d_new_src[pos] = d_vertex_used[s];
-                         d_new_dst[pos] = d_vertex_used[d];
-                         if (has_weight) d_new_weight[pos] = graph_weight[e];
-                       }
-                     });
-
-    return result;
-  } else {
-    return std::make_unique<cugraph::legacy::GraphCOO<vertex_t, edge_t, weight_t>>(
-      0, 0, has_weight);
-  }
-}
-}  // namespace
-
-namespace cugraph {
-namespace subgraph {
-
-template <typename VT, typename ET, typename WT>
-std::unique_ptr<legacy::GraphCOO<VT, ET, WT>> extract_subgraph_vertex(
-  legacy::GraphCOOView<VT, ET, WT> const& graph, VT const* vertices, VT num_vertices)
-{
-  CUGRAPH_EXPECTS(vertices != nullptr, "Invalid input argument: vertices must be non null");
-
-  cudaStream_t stream{0};
-
-  if (graph.edge_data == nullptr) {
-    return extract_subgraph_by_vertices<VT, ET, WT, false>(graph, vertices, num_vertices, stream);
-  } else {
-    return extract_subgraph_by_vertices<VT, ET, WT, true>(graph, vertices, num_vertices, stream);
-  }
-}
-
-template std::unique_ptr<legacy::GraphCOO<int32_t, int32_t, float>>
-extract_subgraph_vertex<int32_t, int32_t, float>(
-  legacy::GraphCOOView<int32_t, int32_t, float> const&, int32_t const*, int32_t);
-template std::unique_ptr<legacy::GraphCOO<int32_t, int32_t, double>>
-extract_subgraph_vertex<int32_t, int32_t, double>(
-  legacy::GraphCOOView<int32_t, int32_t, double> const&, int32_t const*, int32_t);
-
-}  // namespace subgraph
-}  // namespace cugraph
diff --git a/cpp/src/community/legacy/leiden.cu b/cpp/src/community/legacy/leiden.cu
deleted file mode 100644
index 443bacaac27..00000000000
--- a/cpp/src/community/legacy/leiden.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <community/flatten_dendrogram.hpp>
-#include <community/legacy/leiden.cuh>
-
-#include <rmm/device_uvector.hpp>
-
-#include <thrust/copy.h>
-#include <thrust/iterator/counting_iterator.h>
-
-namespace cugraph {
-
-template <typename vertex_t, typename edge_t, typename weight_t>
-std::pair<size_t, weight_t> leiden(raft::handle_t const& handle,
-                                   legacy::GraphCSRView<vertex_t, edge_t, weight_t> const& graph,
-                                   vertex_t* clustering,
-                                   size_t max_level,
-                                   weight_t resolution)
-{
-  CUGRAPH_EXPECTS(graph.edge_data != nullptr,
-                  "Invalid input argument: leiden expects a weighted graph");
-  CUGRAPH_EXPECTS(clustering != nullptr,
-                  "Invalid input argument: clustering is null, should be a device pointer to "
-                  "memory for storing the result");
-
-  legacy::Leiden<legacy::GraphCSRView<vertex_t, edge_t, weight_t>> runner(handle, graph);
-  weight_t wt = runner(max_level, resolution);
-
-  rmm::device_uvector<vertex_t> vertex_ids_v(graph.number_of_vertices, handle.get_stream());
-
-  thrust::copy(handle.get_thrust_policy(),
-               thrust::make_counting_iterator<vertex_t>(0),  // MNMG - base vertex id
-               thrust::make_counting_iterator<vertex_t>(
-                 graph.number_of_vertices),  // MNMG - base vertex id + number_of_vertices
-               vertex_ids_v.begin());
-
-  partition_at_level<vertex_t, false>(handle,
-                                      runner.get_dendrogram(),
-                                      vertex_ids_v.data(),
-                                      clustering,
-                                      runner.get_dendrogram().num_levels());
-
-  // FIXME: Consider returning the Dendrogram at some point
-  return std::make_pair(runner.get_dendrogram().num_levels(), wt);
-}
-
-// Explicit template instantations
-template std::pair<size_t, float> leiden(raft::handle_t const&,
-                                         legacy::GraphCSRView<int32_t, int32_t, float> const&,
-                                         int32_t*,
-                                         size_t,
-                                         float);
-
-template std::pair<size_t, double> leiden(raft::handle_t const&,
-                                          legacy::GraphCSRView<int32_t, int32_t, double> const&,
-                                          int32_t*,
-                                          size_t,
-                                          double);
-
-}  // namespace cugraph
diff --git a/cpp/src/community/legacy/leiden.cuh b/cpp/src/community/legacy/leiden.cuh
deleted file mode 100644
index 97d9210000e..00000000000
--- a/cpp/src/community/legacy/leiden.cuh
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <community/legacy/louvain.cuh>
-
-#include <rmm/device_uvector.hpp>
-
-#include <thrust/copy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sequence.h>
-
-namespace cugraph {
-namespace legacy {
-
-template <typename graph_type>
-class Leiden : public Louvain<graph_type> {
- public:
-  using graph_t  = graph_type;
-  using vertex_t = typename graph_type::vertex_type;
-  using edge_t   = typename graph_type::edge_type;
-  using weight_t = typename graph_type::weight_type;
-
-  Leiden(raft::handle_t const& handle, graph_type const& graph)
-    : Louvain<graph_type>(handle, graph),
-      constraint_v_(graph.number_of_vertices, handle.get_stream())
-  {
-  }
-
-  weight_t update_clustering_constrained(weight_t total_edge_weight,
-                                         weight_t resolution,
-                                         graph_type const& graph)
-  {
-    this->timer_start("update_clustering_constrained");
-
-    rmm::device_uvector<vertex_t> next_cluster_v(this->dendrogram_->current_level_size(),
-                                                 this->handle_.get_stream());
-    rmm::device_uvector<weight_t> delta_Q_v(graph.number_of_edges, this->handle_.get_stream());
-    rmm::device_uvector<vertex_t> cluster_hash_v(graph.number_of_edges, this->handle_.get_stream());
-    rmm::device_uvector<weight_t> old_cluster_sum_v(graph.number_of_vertices,
-                                                    this->handle_.get_stream());
-
-    vertex_t const* d_src_indices    = this->src_indices_v_.data();
-    vertex_t const* d_dst_indices    = graph.indices;
-    vertex_t* d_cluster_hash         = cluster_hash_v.data();
-    vertex_t* d_cluster              = this->dendrogram_->current_level_begin();
-    weight_t const* d_vertex_weights = this->vertex_weights_v_.data();
-    weight_t* d_cluster_weights      = this->cluster_weights_v_.data();
-    weight_t* d_delta_Q              = delta_Q_v.data();
-    vertex_t* d_constraint           = constraint_v_.data();
-
-    thrust::copy(this->handle_.get_thrust_policy(),
-                 this->dendrogram_->current_level_begin(),
-                 this->dendrogram_->current_level_end(),
-                 next_cluster_v.data());
-
-    weight_t new_Q = this->modularity(
-      total_edge_weight, resolution, graph, this->dendrogram_->current_level_begin());
-
-    weight_t cur_Q = new_Q - 1;
-
-    // To avoid the potential of having two vertices swap clusters
-    // we will only allow vertices to move up (true) or down (false)
-    // during each iteration of the loop
-    bool up_down = true;
-
-    while (new_Q > (cur_Q + 0.0001)) {
-      cur_Q = new_Q;
-
-      this->compute_delta_modularity(
-        total_edge_weight, resolution, graph, cluster_hash_v, old_cluster_sum_v, delta_Q_v);
-
-      // Filter out positive delta_Q values for nodes not in the same constraint group
-      thrust::for_each(
-        this->handle_.get_thrust_policy(),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(graph.number_of_edges),
-        [d_src_indices, d_dst_indices, d_constraint, d_delta_Q] __device__(vertex_t i) {
-          vertex_t start_cluster = d_constraint[d_src_indices[i]];
-          vertex_t end_cluster   = d_constraint[d_dst_indices[i]];
-          if (start_cluster != end_cluster) d_delta_Q[i] = weight_t{0.0};
-        });
-
-      this->assign_nodes(graph, cluster_hash_v, next_cluster_v, delta_Q_v, up_down);
-
-      up_down = !up_down;
-
-      new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data());
-
-      if (new_Q > cur_Q) {
-        thrust::copy(this->handle_.get_thrust_policy(),
-                     next_cluster_v.begin(),
-                     next_cluster_v.end(),
-                     this->dendrogram_->current_level_begin());
-      }
-    }
-
-    this->timer_stop(this->handle_.get_stream());
-    return cur_Q;
-  }
-
-  weight_t operator()(size_t max_level, weight_t resolution) override
-  {
-    size_t num_level{0};
-
-    weight_t total_edge_weight = thrust::reduce(
-      this->handle_.get_thrust_policy(), this->weights_v_.begin(), this->weights_v_.end());
-
-    weight_t best_modularity = weight_t{-1};
-
-    //
-    //  Our copy of the graph.  Each iteration of the outer loop will
-    //  shrink this copy of the graph.
-    //
-    legacy::GraphCSRView<vertex_t, edge_t, weight_t> current_graph(this->offsets_v_.data(),
-                                                                   this->indices_v_.data(),
-                                                                   this->weights_v_.data(),
-                                                                   this->number_of_vertices_,
-                                                                   this->number_of_edges_);
-
-    current_graph.get_source_indices(this->src_indices_v_.data());
-
-    while (num_level < max_level) {
-      //
-      //  Initialize every cluster to reference each vertex to itself
-      //
-      this->dendrogram_->add_level(0, current_graph.number_of_vertices, this->handle_.get_stream());
-
-      thrust::sequence(this->handle_.get_thrust_policy(),
-                       this->dendrogram_->current_level_begin(),
-                       this->dendrogram_->current_level_end());
-
-      this->compute_vertex_and_cluster_weights(current_graph);
-
-      weight_t new_Q = this->update_clustering(total_edge_weight, resolution, current_graph);
-
-      new_Q = update_clustering_constrained(total_edge_weight, resolution, current_graph);
-
-      if (new_Q <= best_modularity) { break; }
-
-      best_modularity = new_Q;
-
-      this->shrink_graph(current_graph);
-
-      num_level++;
-    }
-
-    this->timer_display_and_clear(std::cout);
-
-    return best_modularity;
-  }
-
- private:
-  rmm::device_uvector<vertex_t> constraint_v_;
-};
-
-}  // namespace legacy
-}  // namespace cugraph
diff --git a/cpp/src/community/legacy/louvain.cuh b/cpp/src/community/legacy/louvain.cuh
index 5f577d0c9fb..063676a909a 100644
--- a/cpp/src/community/legacy/louvain.cuh
+++ b/cpp/src/community/legacy/louvain.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-//#define TIMING
+// #define TIMING
 
 #include <cugraph/legacy/graph.hpp>
 
diff --git a/cpp/src/community/leiden_impl.cuh b/cpp/src/community/leiden_impl.cuh
index 8549c1ae8a9..a9faf2f2d82 100644
--- a/cpp/src/community/leiden_impl.cuh
+++ b/cpp/src/community/leiden_impl.cuh
@@ -51,21 +51,23 @@ template <typename vertex_t,
           bool store_transposed = false>
 std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
   size_t max_level,
-  weight_t resolution)
+  weight_t resolution,
+  weight_t theta = 1.0)
 {
   using graph_t      = cugraph::graph_t<vertex_t, edge_t, store_transposed, multi_gpu>;
   using graph_view_t = cugraph::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>;
 
   std::unique_ptr<Dendrogram<vertex_t>> dendrogram = std::make_unique<Dendrogram<vertex_t>>();
 
-  graph_t current_graph(handle);
   graph_view_t current_graph_view(graph_view);
-
   std::optional<edge_property_view_t<edge_t, weight_t const*>> current_edge_weight_view(
     edge_weight_view);
+
+  graph_t coarse_graph(handle);
   std::optional<edge_property_t<graph_view_t, weight_t>> coarsen_graph_edge_weight(handle);
 
 #ifdef TIMING
@@ -82,6 +84,7 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
     //
     //  Initialize every cluster to reference each vertex to itself
     //
+
     dendrogram->add_level(current_graph_view.local_vertex_partition_range_first(),
                           current_graph_view.local_vertex_partition_range_size(),
                           handle.get_stream());
@@ -207,8 +210,6 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
         edge_src_property_t<graph_view_t, weight_t>(handle, current_graph_view);
       update_edge_src_property(
         handle, current_graph_view, vertex_weights.begin(), src_vertex_weights_cache);
-      vertex_weights.resize(0, handle.get_stream());
-      vertex_weights.shrink_to_fit(handle.get_stream());
     }
 
 #ifdef TIMING
@@ -243,9 +244,6 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
                                current_graph_view,
                                louvain_assignment_for_vertices.begin(),
                                dst_louvain_assignment_cache);
-
-      louvain_assignment_for_vertices.resize(0, handle.get_stream());
-      louvain_assignment_for_vertices.shrink_to_fit(handle.get_stream());
     }
 
     weight_t new_Q = detail::compute_modularity(handle,
@@ -262,8 +260,7 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
     // To avoid the potential of having two vertices swap cluster_keys
     // we will only allow vertices to move up (true) or down (false)
     // during each iteration of the loop
-    bool up_down     = true;
-    bool no_movement = true;
+    bool up_down = true;
     while (new_Q > (cur_Q + 1e-4)) {
       cur_Q = new_Q;
 
@@ -334,7 +331,6 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
                    louvain_assignment_for_vertices.begin(),
                    louvain_assignment_for_vertices.size(),
                    handle.get_stream());
-        no_movement = false;
       }
     }
 
@@ -342,14 +338,13 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
     detail::timer_stop<graph_view_t::is_multi_gpu>(handle, hr_timer);
 #endif
 
-    bool terminate = no_movement || (cur_Q <= best_modularity);
+    bool terminate = (cur_Q <= best_modularity);
+    if (!terminate) { best_modularity = cur_Q; }
 
 #ifdef TIMING
     detail::timer_start<graph_view_t::is_multi_gpu>(handle, hr_timer, "contract graph");
 #endif
 
-    if (!terminate) { best_modularity = cur_Q; }
-
     // Count number of unique louvain clusters
 
     rmm::device_uvector<vertex_t> copied_louvain_partition(dendrogram->current_level_size(),
@@ -421,10 +416,12 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
 
       std::tie(refined_leiden_partition, leiden_to_louvain_map) =
         detail::refine_clustering(handle,
+                                  rng_state,
                                   current_graph_view,
                                   current_edge_weight_view,
                                   total_edge_weight,
                                   resolution,
+                                  theta,
                                   vertex_weights,
                                   std::move(cluster_keys),
                                   std::move(cluster_weights),
@@ -454,27 +451,28 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
         nr_unique_leiden = host_scalar_allreduce(
           handle.get_comms(), nr_unique_leiden, raft::comms::op_t::SUM, handle.get_stream());
       }
+
       terminate = terminate || (nr_unique_leiden == current_graph_view.number_of_vertices());
 
       if (nr_unique_leiden < current_graph_view.number_of_vertices()) {
         // Create aggregate graph based on refined (leiden) partition
         std::optional<rmm::device_uvector<vertex_t>> cluster_assignment{std::nullopt};
-        std::tie(current_graph, coarsen_graph_edge_weight, cluster_assignment) =
+        std::tie(coarse_graph, coarsen_graph_edge_weight, cluster_assignment) =
           coarsen_graph(handle,
                         current_graph_view,
                         current_edge_weight_view,
                         refined_leiden_partition.data(),
                         true);
 
-        current_graph_view = current_graph.view();
+        current_graph_view = coarse_graph.view();
 
         current_edge_weight_view =
           std::make_optional<edge_property_view_t<edge_t, weight_t const*>>(
             (*coarsen_graph_edge_weight).view());
 
         // cluster_assignment contains leiden cluster ids of aggregated nodes
-        // After call to relabel, cluster_assignment will louvain cluster ids of the aggregated
-        // nodes
+        // After call to relabel, cluster_assignment will louvain cluster ids
+        // of the aggregated nodes
         relabel<vertex_t, multi_gpu>(
           handle,
           std::make_tuple(static_cast<vertex_t const*>(leiden_to_louvain_map.first.begin()),
@@ -495,12 +493,36 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
     }
 
     // Relabel dendrogram
+    vertex_t local_cluster_id_first{0};
+    if constexpr (multi_gpu) {
+      auto unique_cluster_range_lasts = cugraph::partition_manager::compute_partition_range_lasts(
+        handle, static_cast<vertex_t>(copied_louvain_partition.size()));
+
+      auto& comm           = handle.get_comms();
+      auto const comm_size = comm.get_size();
+      auto const comm_rank = comm.get_rank();
+      auto& major_comm     = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+      auto const major_comm_size = major_comm.get_size();
+      auto const major_comm_rank = major_comm.get_rank();
+      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+      auto const minor_comm_size = minor_comm.get_size();
+      auto const minor_comm_rank = minor_comm.get_rank();
+
+      auto vertex_partition_id =
+        partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks(
+          major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
+
+      local_cluster_id_first = vertex_partition_id == 0
+                                 ? vertex_t{0}
+                                 : unique_cluster_range_lasts[vertex_partition_id - 1];
+    }
+
     rmm::device_uvector<vertex_t> numbering_indices(copied_louvain_partition.size(),
                                                     handle.get_stream());
     detail::sequence_fill(handle.get_stream(),
                           numbering_indices.data(),
                           numbering_indices.size(),
-                          current_graph_view.local_vertex_partition_range_first());
+                          local_cluster_id_first);
 
     relabel<vertex_t, multi_gpu>(
       handle,
@@ -519,7 +541,7 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
 #ifdef TIMING
     detail::timer_stop<graph_view_t::is_multi_gpu>(handle, hr_timer);
 #endif
-  }
+  }  // end of outer while
 
 #ifdef TIMING
   detail::timer_display<graph_view_t::is_multi_gpu>(handle, hr_timer, std::cout);
@@ -552,12 +574,17 @@ void flatten_dendrogram(raft::handle_t const& handle,
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
 std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
   size_t max_level,
-  weight_t resolution)
+  weight_t resolution,
+  weight_t theta = 1.0)
 {
-  return detail::leiden(handle, graph_view, edge_weight_view, max_level, resolution);
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
+  return detail::leiden(
+    handle, rng_state, graph_view, edge_weight_view, max_level, resolution, theta);
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
@@ -566,18 +593,24 @@ void flatten_dendrogram(raft::handle_t const& handle,
                         Dendrogram<vertex_t> const& dendrogram,
                         vertex_t* clustering)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering);
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
 std::pair<size_t, weight_t> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
   std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
   vertex_t* clustering,
   size_t max_level,
-  weight_t resolution)
+  weight_t resolution,
+  weight_t theta = 1.0)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS(edge_weight_view.has_value(), "Graph must be weighted");
   detail::check_clustering(graph_view, clustering);
 
@@ -585,7 +618,7 @@ std::pair<size_t, weight_t> leiden(
   weight_t modularity;
 
   std::tie(dendrogram, modularity) =
-    detail::leiden(handle, graph_view, edge_weight_view, max_level, resolution);
+    detail::leiden(handle, rng_state, graph_view, edge_weight_view, max_level, resolution, theta);
 
   detail::flatten_dendrogram(handle, graph_view, *dendrogram, clustering);
 
diff --git a/cpp/src/community/leiden_mg.cu b/cpp/src/community/leiden_mg.cu
index 77e4c9a96b6..d74e004927b 100644
--- a/cpp/src/community/leiden_mg.cu
+++ b/cpp/src/community/leiden_mg.cu
@@ -22,84 +22,108 @@ namespace cugraph {
 
 template std::pair<std::unique_ptr<Dendrogram<int32_t>>, float> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
   size_t max_level,
-  float resolution);
+  float resolution,
+  float theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int32_t>>, float> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
   size_t max_level,
-  float resolution);
+  float resolution,
+  float theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int64_t>>, float> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
   size_t max_level,
-  float resolution);
+  float resolution,
+  float theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int32_t>>, double> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int32_t, int32_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
   size_t max_level,
-  double resolution);
+  double resolution,
+  double theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int32_t>>, double> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int32_t, int64_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
   size_t max_level,
-  double resolution);
+  double resolution,
+  double theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int64_t>>, double> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int64_t, int64_t, false, true> const& graph_view,
   std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
   size_t max_level,
-  double resolution);
+  double resolution,
+  double theta);
 
 template std::pair<size_t, float> leiden(raft::handle_t const&,
+                                         raft::random::RngState&,
                                          graph_view_t<int32_t, int32_t, false, true> const&,
                                          std::optional<edge_property_view_t<int32_t, float const*>>,
                                          int32_t*,
                                          size_t,
+                                         float,
                                          float);
 template std::pair<size_t, double> leiden(
   raft::handle_t const&,
+  raft::random::RngState&,
   graph_view_t<int32_t, int32_t, false, true> const&,
   std::optional<edge_property_view_t<int32_t, double const*>>,
   int32_t*,
   size_t,
+  double,
   double);
 template std::pair<size_t, float> leiden(raft::handle_t const&,
+                                         raft::random::RngState&,
                                          graph_view_t<int32_t, int64_t, false, true> const&,
                                          std::optional<edge_property_view_t<int64_t, float const*>>,
                                          int32_t*,
                                          size_t,
+                                         float,
                                          float);
 template std::pair<size_t, double> leiden(
   raft::handle_t const&,
+  raft::random::RngState&,
   graph_view_t<int32_t, int64_t, false, true> const&,
   std::optional<edge_property_view_t<int64_t, double const*>>,
   int32_t*,
   size_t,
+  double,
   double);
 template std::pair<size_t, float> leiden(raft::handle_t const&,
+                                         raft::random::RngState&,
                                          graph_view_t<int64_t, int64_t, false, true> const&,
                                          std::optional<edge_property_view_t<int64_t, float const*>>,
                                          int64_t*,
                                          size_t,
+                                         float,
                                          float);
 template std::pair<size_t, double> leiden(
   raft::handle_t const&,
+  raft::random::RngState&,
   graph_view_t<int64_t, int64_t, false, true> const&,
   std::optional<edge_property_view_t<int64_t, double const*>>,
   int64_t*,
   size_t,
+  double,
   double);
 
 }  // namespace cugraph
diff --git a/cpp/src/community/leiden_sg.cu b/cpp/src/community/leiden_sg.cu
index 1c821649fa1..bc1b4e6cff5 100644
--- a/cpp/src/community/leiden_sg.cu
+++ b/cpp/src/community/leiden_sg.cu
@@ -22,84 +22,108 @@ namespace cugraph {
 
 template std::pair<std::unique_ptr<Dendrogram<int32_t>>, float> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int32_t, int32_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
   size_t max_level,
-  float resolution);
+  float resolution,
+  float theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int32_t>>, float> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int32_t, int64_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
   size_t max_level,
-  float resolution);
+  float resolution,
+  float theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int64_t>>, float> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int64_t, int64_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
   size_t max_level,
-  float resolution);
+  float resolution,
+  float theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int32_t>>, double> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int32_t, int32_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
   size_t max_level,
-  double resolution);
+  double resolution,
+  double theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int32_t>>, double> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int32_t, int64_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
   size_t max_level,
-  double resolution);
+  double resolution,
+  double theta);
 
 template std::pair<std::unique_ptr<Dendrogram<int64_t>>, double> leiden(
   raft::handle_t const& handle,
+  raft::random::RngState& rng_state,
   graph_view_t<int64_t, int64_t, false, false> const& graph_view,
   std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
   size_t max_level,
-  double resolution);
+  double resolution,
+  double theta);
 
 template std::pair<size_t, float> leiden(raft::handle_t const&,
+                                         raft::random::RngState&,
                                          graph_view_t<int32_t, int32_t, false, false> const&,
                                          std::optional<edge_property_view_t<int32_t, float const*>>,
                                          int32_t*,
                                          size_t,
+                                         float,
                                          float);
 template std::pair<size_t, double> leiden(
   raft::handle_t const&,
+  raft::random::RngState&,
   graph_view_t<int32_t, int32_t, false, false> const&,
   std::optional<edge_property_view_t<int32_t, double const*>>,
   int32_t*,
   size_t,
+  double,
   double);
 template std::pair<size_t, float> leiden(raft::handle_t const&,
+                                         raft::random::RngState&,
                                          graph_view_t<int32_t, int64_t, false, false> const&,
                                          std::optional<edge_property_view_t<int64_t, float const*>>,
                                          int32_t*,
                                          size_t,
+                                         float,
                                          float);
 template std::pair<size_t, double> leiden(
   raft::handle_t const&,
+  raft::random::RngState&,
   graph_view_t<int32_t, int64_t, false, false> const&,
   std::optional<edge_property_view_t<int64_t, double const*>>,
   int32_t*,
   size_t,
+  double,
   double);
 template std::pair<size_t, float> leiden(raft::handle_t const&,
+                                         raft::random::RngState&,
                                          graph_view_t<int64_t, int64_t, false, false> const&,
                                          std::optional<edge_property_view_t<int64_t, float const*>>,
                                          int64_t*,
                                          size_t,
+                                         float,
                                          float);
 template std::pair<size_t, double> leiden(
   raft::handle_t const&,
+  raft::random::RngState&,
   graph_view_t<int64_t, int64_t, false, false> const&,
   std::optional<edge_property_view_t<int64_t, double const*>>,
   int64_t*,
   size_t,
+  double,
   double);
 
 }  // namespace cugraph
diff --git a/cpp/src/community/louvain_impl.cuh b/cpp/src/community/louvain_impl.cuh
index 7d205ffa48e..167de36dd13 100644
--- a/cpp/src/community/louvain_impl.cuh
+++ b/cpp/src/community/louvain_impl.cuh
@@ -293,6 +293,8 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> louvain(
   size_t max_level,
   weight_t resolution)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS(edge_weight_view.has_value(), "Graph must be weighted");
   return detail::louvain(handle, graph_view, edge_weight_view, max_level, resolution);
 }
@@ -303,6 +305,8 @@ void flatten_dendrogram(raft::handle_t const& handle,
                         Dendrogram<vertex_t> const& dendrogram,
                         vertex_t* clustering)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering);
 }
 
@@ -315,6 +319,8 @@ std::pair<size_t, weight_t> louvain(
   size_t max_level,
   weight_t resolution)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS(edge_weight_view.has_value(), "Graph must be weighted");
   detail::check_clustering(graph_view, clustering);
 
diff --git a/cpp/src/community/detail/mis.hpp b/cpp/src/community/mis.hpp
similarity index 76%
rename from cpp/src/community/detail/mis.hpp
rename to cpp/src/community/mis.hpp
index 8a86757a5bc..3f1e655c0c4 100644
--- a/cpp/src/community/detail/mis.hpp
+++ b/cpp/src/community/mis.hpp
@@ -15,19 +15,16 @@
  */
 #pragma once
 #include <cugraph/edge_property.hpp>
-#include <cugraph/graph.hpp>
 #include <cugraph/graph_view.hpp>
 
 #include <raft/core/handle.hpp>
+#include <raft/random/rng_state.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace cugraph {
-namespace detail {
-
-template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-rmm::device_uvector<vertex_t> compute_mis(
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> maximal_independent_set(
   raft::handle_t const& handle,
   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
-  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view);
-}  // namespace detail
+  raft::random::RngState& rng_state);
 }  // namespace cugraph
diff --git a/cpp/src/community/triangle_count_impl.cuh b/cpp/src/community/triangle_count_impl.cuh
index 9d4778c5e7c..96874041db6 100644
--- a/cpp/src/community/triangle_count_impl.cuh
+++ b/cpp/src/community/triangle_count_impl.cuh
@@ -68,21 +68,15 @@ struct exclude_self_loop_t {
 
 template <typename edge_t>
 struct is_two_or_greater_t {
-  __device__ uint8_t operator()(edge_t core_number) const
-  {
-    return core_number >= edge_t{2} ? uint8_t{1} : uint8_t{0};
-  }
+  __device__ bool operator()(edge_t core_number) const { return core_number >= edge_t{2}; }
 };
 
 template <typename vertex_t>
 struct extract_two_core_t {
-  __device__ thrust::optional<thrust::tuple<vertex_t, vertex_t>> operator()(vertex_t src,
-                                                                            vertex_t dst,
-                                                                            uint8_t src_in_two_core,
-                                                                            uint8_t dst_in_two_core,
-                                                                            thrust::nullopt_t) const
+  __device__ thrust::optional<thrust::tuple<vertex_t, vertex_t>> operator()(
+    vertex_t src, vertex_t dst, bool src_in_two_core, bool dst_in_two_core, thrust::nullopt_t) const
   {
-    return (src_in_two_core == uint8_t{1}) && (dst_in_two_core == uint8_t{1})
+    return (src_in_two_core && dst_in_two_core)
              ? thrust::optional<thrust::tuple<vertex_t, vertex_t>>{thrust::make_tuple(src, dst)}
              : thrust::nullopt;
   }
@@ -162,6 +156,8 @@ void triangle_count(raft::handle_t const& handle,
 
   // 1. Check input arguments.
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS(
     graph_view.is_symmetric(),
     "Invalid input arguments: triangle_count currently supports undirected graphs only.");
@@ -254,13 +250,13 @@ void triangle_count(raft::handle_t const& handle,
     core_number(
       handle, cur_graph_view, core_numbers.data(), k_core_degree_type_t::OUT, size_t{2}, size_t{2});
 
-    edge_src_property_t<decltype(cur_graph_view), uint8_t> edge_src_in_two_cores(handle,
-                                                                                 cur_graph_view);
-    edge_dst_property_t<decltype(cur_graph_view), uint8_t> edge_dst_in_two_cores(handle,
-                                                                                 cur_graph_view);
+    edge_src_property_t<decltype(cur_graph_view), bool> edge_src_in_two_cores(handle,
+                                                                              cur_graph_view);
+    edge_dst_property_t<decltype(cur_graph_view), bool> edge_dst_in_two_cores(handle,
+                                                                              cur_graph_view);
     auto in_two_core_first =
       thrust::make_transform_iterator(core_numbers.begin(), is_two_or_greater_t<edge_t>{});
-    rmm::device_uvector<uint8_t> in_two_core_flags(core_numbers.size(), handle.get_stream());
+    rmm::device_uvector<bool> in_two_core_flags(core_numbers.size(), handle.get_stream());
     thrust::copy(handle.get_thrust_policy(),
                  in_two_core_first,
                  in_two_core_first + core_numbers.size(),
diff --git a/cpp/src/components/legacy/scc_matrix.cuh b/cpp/src/components/legacy/scc_matrix.cuh
index 8c2422644f4..3d56bdc5bf4 100644
--- a/cpp/src/components/legacy/scc_matrix.cuh
+++ b/cpp/src/components/legacy/scc_matrix.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -68,7 +68,7 @@ struct SCC_Data {
   SCC_Data(size_t nrows,
            const IndexT* p_d_r_o,  // row_offsets
            const IndexT* p_d_c_i)
-    :  // column indices
+    :                              // column indices
       nrows_(nrows),
       p_d_r_o_(p_d_r_o),
       p_d_c_i_(p_d_c_i),
diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh
index 53e72cb079b..5e228d2445d 100644
--- a/cpp/src/components/weakly_connected_components_impl.cuh
+++ b/cpp/src/components/weakly_connected_components_impl.cuh
@@ -195,10 +195,8 @@ struct e_op_t {
   {
     auto tag        = thrust::get<1>(tagged_src);
     auto dst_offset = dst - dst_first;
-    // FIXME: better switch to atomic_ref after
-    // https://github.com/nvidia/libcudacxx/milestone/2
     auto old =
-      atomicCAS(dst_components.get_iter(dst_offset), invalid_component_id<vertex_t>::value, tag);
+      dst_components.elementwise_atomic_cas(dst_offset, invalid_component_id<vertex_t>::value, tag);
     if (old != invalid_component_id<vertex_t>::value && old != tag) {  // conflict
       static_assert(sizeof(unsigned long long int) == sizeof(size_t));
       auto edge_idx = atomicAdd(reinterpret_cast<unsigned long long int*>(num_edge_inserts),
@@ -785,6 +783,8 @@ void weakly_connected_components(raft::handle_t const& handle,
                                  vertex_t* components,
                                  bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   weakly_connected_components_impl(handle, graph_view, components, do_expensive_check);
 }
 
diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh
index d1a48b6e2a2..b63ae60f052 100644
--- a/cpp/src/cores/core_number_impl.cuh
+++ b/cpp/src/cores/core_number_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -72,7 +72,7 @@ struct v_to_core_number_t {
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename edge_t>
 struct mult_degree_by_two_t {
-  __device__ edge_t operator()(edge_t d) const { return d * edge_t{2}; }
+  __device__ edge_t operator()(edge_t d) const { return d* edge_t{2}; }
 };
 
 }  // namespace
@@ -88,6 +88,8 @@ void core_number(raft::handle_t const& handle,
 {
   // check input arguments.
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS(graph_view.is_symmetric(),
                   "Invalid input argument: core_number currently supports only undirected graphs.");
   CUGRAPH_EXPECTS(!graph_view.is_multigraph(),
diff --git a/cpp/src/cores/k_core_impl.cuh b/cpp/src/cores/k_core_impl.cuh
index 5d15385fa75..06402cc3382 100644
--- a/cpp/src/cores/k_core_impl.cuh
+++ b/cpp/src/cores/k_core_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,6 +37,8 @@ k_core(raft::handle_t const& handle,
        std::optional<raft::device_span<edge_t const>> core_numbers,
        bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   rmm::device_uvector<edge_t> computed_core_numbers(0, handle.get_stream());
 
   if (!core_numbers) {
diff --git a/cpp/src/generators/generate_bipartite_rmat_edgelist.cu b/cpp/src/generators/generate_bipartite_rmat_edgelist.cu
new file mode 100644
index 00000000000..c02e1a7e7fa
--- /dev/null
+++ b/cpp/src/generators/generate_bipartite_rmat_edgelist.cu
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cugraph/detail/utility_wrappers.hpp>
+#include <cugraph/graph_generators.hpp>
+#include <cugraph/utilities/error.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/random/rng.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <rmm/detail/error.hpp>
+#include <tuple>
+
+namespace cugraph {
+
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>
+generate_bipartite_rmat_edgelist(raft::handle_t const& handle,
+                                 raft::random::RngState& rng_state,
+                                 size_t src_scale,
+                                 size_t dst_scale,
+                                 size_t num_edges,
+                                 double a,
+                                 double b,
+                                 double c)
+{
+  CUGRAPH_EXPECTS(
+    (size_t{1} << src_scale) <= static_cast<size_t>(std::numeric_limits<vertex_t>::max()),
+    "Invalid input argument: src_scale too large for vertex_t.");
+  CUGRAPH_EXPECTS(
+    (size_t{1} << dst_scale) <= static_cast<size_t>(std::numeric_limits<vertex_t>::max()),
+    "Invalid input argument: dst_scale too large for vertex_t.");
+  CUGRAPH_EXPECTS((a >= 0.0) && (b >= 0.0) && (c >= 0.0) && (a + b + c <= 1.0),
+                  "Invalid input argument: a, b, c should be non-negative and a + b + c should not "
+                  "be larger than 1.0.");
+
+  // to limit memory footprint (1024 is a tuning parameter)
+  auto max_edges_to_generate_per_iteration =
+    static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * 1024;
+  rmm::device_uvector<float> rands(
+    std::min(num_edges, max_edges_to_generate_per_iteration) * (src_scale + dst_scale),
+    handle.get_stream());
+
+  rmm::device_uvector<vertex_t> srcs(num_edges, handle.get_stream());
+  rmm::device_uvector<vertex_t> dsts(num_edges, handle.get_stream());
+
+  size_t num_edges_generated{0};
+  while (num_edges_generated < num_edges) {
+    auto num_edges_to_generate =
+      std::min(num_edges - num_edges_generated, max_edges_to_generate_per_iteration);
+    auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(srcs.begin(), dsts.begin())) +
+                      num_edges_generated;
+
+    detail::uniform_random_fill(handle.get_stream(),
+                                rands.data(),
+                                num_edges_to_generate * (src_scale + dst_scale),
+                                0.0f,
+                                1.0f,
+                                rng_state);
+
+    thrust::transform(
+      handle.get_thrust_policy(),
+      thrust::make_counting_iterator(size_t{0}),
+      thrust::make_counting_iterator(num_edges_to_generate),
+      pair_first,
+      // if a + b == 0.0, a_norm is irrelevant, if (1.0 - (a+b)) == 0.0, c_norm is irrelevant
+      [src_scale,
+       dst_scale,
+       rands    = rands.data(),
+       a_plus_b = a + b,
+       a_plus_c = a + c,
+       a_norm   = (a + b) > 0.0 ? a / (a + b) : 0.0,
+       c_norm   = (1.0 - (a + b)) > 0.0 ? c / (1.0 - (a + b)) : 0.0] __device__(auto i) {
+        vertex_t src{0};
+        vertex_t dst{0};
+        size_t rand_offset = i * (src_scale + dst_scale);
+        for (int level = 0; level < static_cast<int>(std::max(src_scale, dst_scale)); ++level) {
+          auto dst_threshold = a_plus_c;
+          if (level < src_scale) {
+            auto r           = rands[rand_offset++];
+            auto src_bit_set = r > a_plus_b;
+            src +=
+              src_bit_set ? static_cast<vertex_t>(vertex_t{1} << (src_scale - (level + 1))) : 0;
+            dst_threshold = src_bit_set ? c_norm : a_norm;
+          }
+          if (level < dst_scale) {
+            auto r           = rands[rand_offset++];
+            auto dst_bit_set = r > dst_threshold;
+            dst +=
+              dst_bit_set ? static_cast<vertex_t>(vertex_t{1} << (dst_scale - (level + 1))) : 0;
+          }
+        }
+        return thrust::make_tuple(src, dst);
+      });
+    num_edges_generated += num_edges_to_generate;
+  }
+
+  return std::make_tuple(std::move(srcs), std::move(dsts));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+generate_bipartite_rmat_edgelist<int32_t>(raft::handle_t const& handle,
+                                          raft::random::RngState& rng_state,
+                                          size_t src_scale,
+                                          size_t dst_scale,
+                                          size_t num_edges,
+                                          double a,
+                                          double b,
+                                          double c);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+generate_bipartite_rmat_edgelist<int64_t>(raft::handle_t const& handle,
+                                          raft::random::RngState& rng_state,
+                                          size_t src_scale,
+                                          size_t dst_scale,
+                                          size_t num_edges,
+                                          double a,
+                                          double b,
+                                          double c);
+
+}  // namespace cugraph
diff --git a/cpp/src/generators/generator_tools.cu b/cpp/src/generators/generator_tools.cu
index ece07c43efd..1650d3c15c6 100644
--- a/cpp/src/generators/generator_tools.cu
+++ b/cpp/src/generators/generator_tools.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,10 +43,10 @@ template <typename T>
 rmm::device_uvector<T> append_all(raft::handle_t const& handle,
                                   std::vector<rmm::device_uvector<T>>&& input)
 {
-  size_t size{0};
-  // for (size_t i = 0; i < input.size(); ++i) size += input[i].size();
-  for (auto& element : input)
-    size += element.size();
+  auto size = std::transform_reduce(
+    input.begin(), input.end(), size_t{0}, std::plus<size_t>{}, [](auto const& element) {
+      return element.size();
+    });
 
   rmm::device_uvector<T> output(size, handle.get_stream());
   auto output_iter = output.begin();
@@ -56,36 +56,43 @@ rmm::device_uvector<T> append_all(raft::handle_t const& handle,
     output_iter += element.size();
   }
 
-  /*
-for (size_t i = 0; i < input.size(); ++i) {
-  raft::copy(output_iter, input[i].begin(), input[i].size(), handle.get_stream());
-  output_iter += input[i].size();
-}
-  */
-
   return output;
 }
 
 }  // namespace detail
 
 template <typename vertex_t>
-void scramble_vertex_ids(raft::handle_t const& handle,
-                         rmm::device_uvector<vertex_t>& d_src_v,
-                         rmm::device_uvector<vertex_t>& d_dst_v,
-                         vertex_t vertex_id_offset,
-                         uint64_t seed)
+rmm::device_uvector<vertex_t> scramble_vertex_ids(raft::handle_t const& handle,
+                                                  rmm::device_uvector<vertex_t>&& vertices,
+                                                  size_t lgN)
 {
-  vertex_t scale = 1 + raft::log2(d_src_v.size());
+  thrust::transform(handle.get_thrust_policy(),
+                    vertices.begin(),
+                    vertices.end(),
+                    vertices.begin(),
+                    [lgN] __device__(auto v) { return detail::scramble(v, lgN); });
 
-  auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin()));
+  return std::move(vertices);
+}
+
+template <typename vertex_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> scramble_vertex_ids(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& srcs,
+  rmm::device_uvector<vertex_t>&& dsts,
+  size_t lgN)
+{
+  auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(srcs.begin(), dsts.begin()));
   thrust::transform(handle.get_thrust_policy(),
                     pair_first,
-                    pair_first + d_src_v.size(),
+                    pair_first + srcs.size(),
                     pair_first,
-                    [scale] __device__(auto pair) {
-                      return thrust::make_tuple(detail::scramble(thrust::get<0>(pair), scale),
-                                                detail::scramble(thrust::get<1>(pair), scale));
+                    [lgN] __device__(auto pair) {
+                      return thrust::make_tuple(detail::scramble(thrust::get<0>(pair), lgN),
+                                                detail::scramble(thrust::get<1>(pair), lgN));
                     });
+
+  return std::make_tuple(std::move(srcs), std::move(dsts));
 }
 
 template <typename vertex_t, typename weight_t>
@@ -250,17 +257,25 @@ symmetrize_edgelist_from_triangular(
                          optional_d_weights_v ? std::move(optional_d_weights_v) : std::nullopt);
 }
 
-template void scramble_vertex_ids(raft::handle_t const& handle,
-                                  rmm::device_uvector<int32_t>& d_src_v,
-                                  rmm::device_uvector<int32_t>& d_dst_v,
-                                  int32_t vertex_id_offset,
-                                  uint64_t seed);
-
-template void scramble_vertex_ids(raft::handle_t const& handle,
-                                  rmm::device_uvector<int64_t>& d_src_v,
-                                  rmm::device_uvector<int64_t>& d_dst_v,
-                                  int64_t vertex_id_offset,
-                                  uint64_t seed);
+template rmm::device_uvector<int32_t> scramble_vertex_ids(raft::handle_t const& handle,
+                                                          rmm::device_uvector<int32_t>&& vertices,
+                                                          size_t lgN);
+
+template rmm::device_uvector<int64_t> scramble_vertex_ids(raft::handle_t const& handle,
+                                                          rmm::device_uvector<int64_t>&& vertices,
+                                                          size_t lgN);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>> scramble_vertex_ids(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int32_t>&& srcs,
+  rmm::device_uvector<int32_t>&& dsts,
+  size_t lgN);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>> scramble_vertex_ids(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int64_t>&& srcs,
+  rmm::device_uvector<int64_t>&& dsts,
+  size_t lgN);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
diff --git a/cpp/src/linear_assignment/legacy/hungarian.cu b/cpp/src/linear_assignment/legacy/hungarian.cu
index a5b27beb9b7..33bfdd78d15 100644
--- a/cpp/src/linear_assignment/legacy/hungarian.cu
+++ b/cpp/src/linear_assignment/legacy/hungarian.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-//#define TIMING
+// #define TIMING
 
 #include <cugraph/legacy/graph.hpp>
 #include <cugraph/utilities/error.hpp>
diff --git a/cpp/src/link_analysis/hits_impl.cuh b/cpp/src/link_analysis/hits_impl.cuh
index 241b84ee40c..9badb041218 100644
--- a/cpp/src/link_analysis/hits_impl.cuh
+++ b/cpp/src/link_analysis/hits_impl.cuh
@@ -204,6 +204,8 @@ std::tuple<result_t, size_t> hits(raft::handle_t const& handle,
                                   bool normalize,
                                   bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return detail::hits(handle,
                       graph_view,
                       hubs,
diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh
index 1a523871f94..49d1a3eabb9 100644
--- a/cpp/src/link_analysis/pagerank_impl.cuh
+++ b/cpp/src/link_analysis/pagerank_impl.cuh
@@ -362,6 +362,8 @@ void pagerank(raft::handle_t const& handle,
               bool has_initial_guess,
               bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   detail::pagerank(handle,
                    graph_view,
                    edge_weight_view,
diff --git a/cpp/src/link_prediction/jaccard_impl.cuh b/cpp/src/link_prediction/jaccard_impl.cuh
index d6d75217c97..b9675e3a578 100644
--- a/cpp/src/link_prediction/jaccard_impl.cuh
+++ b/cpp/src/link_prediction/jaccard_impl.cuh
@@ -53,6 +53,8 @@ rmm::device_uvector<weight_t> jaccard_coefficients(
   std::tuple<raft::device_span<vertex_t const>, raft::device_span<vertex_t const>> vertex_pairs,
   bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (!edge_weight_view)
     return detail::similarity(handle,
                               graph_view,
diff --git a/cpp/src/link_prediction/overlap_impl.cuh b/cpp/src/link_prediction/overlap_impl.cuh
index 05d067ec3ec..4c001a8f243 100644
--- a/cpp/src/link_prediction/overlap_impl.cuh
+++ b/cpp/src/link_prediction/overlap_impl.cuh
@@ -53,6 +53,8 @@ rmm::device_uvector<weight_t> overlap_coefficients(
   std::tuple<raft::device_span<vertex_t const>, raft::device_span<vertex_t const>> vertex_pairs,
   bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (!edge_weight_view)
     return detail::similarity(handle,
                               graph_view,
diff --git a/cpp/src/link_prediction/sorensen_impl.cuh b/cpp/src/link_prediction/sorensen_impl.cuh
index 1cb6f9f6786..ac84358049a 100644
--- a/cpp/src/link_prediction/sorensen_impl.cuh
+++ b/cpp/src/link_prediction/sorensen_impl.cuh
@@ -53,6 +53,8 @@ rmm::device_uvector<weight_t> sorensen_coefficients(
   std::tuple<raft::device_span<vertex_t const>, raft::device_span<vertex_t const>> vertex_pairs,
   bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (!edge_weight_view)
     return detail::similarity(handle,
                               graph_view,
diff --git a/cpp/src/prims/count_if_e.cuh b/cpp/src/prims/count_if_e.cuh
index b6e7325e86e..f6e4bc9bead 100644
--- a/cpp/src/prims/count_if_e.cuh
+++ b/cpp/src/prims/count_if_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,7 +35,7 @@ namespace cugraph {
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
  * @param graph_view Non-owning graph object.
@@ -74,6 +74,8 @@ typename GraphViewType::edge_type count_if_e(raft::handle_t const& handle,
   using vertex_t = typename GraphViewType::vertex_type;
   using edge_t   = typename GraphViewType::edge_type;
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
index f35b9f9d74e..ac57c8f180a 100644
--- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
+++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -97,8 +97,8 @@ void* get_optional_dataframe_buffer_begin(std::byte& optional_dataframe_buffer)
 
 template <typename T, std::enable_if_t<!std::is_same_v<T, void>>* = nullptr>
 auto get_optional_dataframe_buffer_begin(
-  std::add_lvalue_reference_t<decltype(
-    allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer)
+  std::add_lvalue_reference_t<decltype(allocate_dataframe_buffer<T>(
+    size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer)
 {
   return get_dataframe_buffer_begin(optional_dataframe_buffer);
 }
@@ -113,8 +113,8 @@ void resize_optional_dataframe_buffer(std::byte& optional_dataframe_buffer,
 
 template <typename T, std::enable_if_t<!std::is_same_v<T, void>>* = nullptr>
 void resize_optional_dataframe_buffer(
-  std::add_lvalue_reference_t<decltype(
-    allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer,
+  std::add_lvalue_reference_t<decltype(allocate_dataframe_buffer<T>(
+    size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer,
   size_t new_buffer_size,
   rmm::cuda_stream_view stream_view)
 {
@@ -130,8 +130,8 @@ void shrink_to_fit_optional_dataframe_buffer(std::byte& optional_dataframe_buffe
 
 template <typename T, std::enable_if_t<!std::is_same_v<T, void>>* = nullptr>
 void shrink_to_fit_optional_dataframe_buffer(
-  std::add_lvalue_reference_t<decltype(
-    allocate_dataframe_buffer<T>(size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer,
+  std::add_lvalue_reference_t<decltype(allocate_dataframe_buffer<T>(
+    size_t{0}, rmm::cuda_stream_view{}))> optional_dataframe_buffer,
   rmm::cuda_stream_view stream_view)
 {
   return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view);
@@ -740,29 +740,24 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
   using edge_partition_src_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeSrcValueInputWrapper::value_type, thrust::nullopt_t>,
     edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
-    std::conditional_t<GraphViewType::is_storage_transposed,
-                       edge_partition_endpoint_property_device_view_t<
-                         vertex_t,
-                         typename EdgeSrcValueInputWrapper::value_iterator>,
-                       edge_partition_endpoint_property_device_view_t<
-                         vertex_t,
-                         typename EdgeSrcValueInputWrapper::value_iterator>>>;
+    edge_partition_endpoint_property_device_view_t<
+      vertex_t,
+      typename EdgeSrcValueInputWrapper::value_iterator,
+      typename EdgeSrcValueInputWrapper::value_type>>;
   using edge_partition_dst_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
     edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
-    std::conditional_t<GraphViewType::is_storage_transposed,
-                       edge_partition_endpoint_property_device_view_t<
-                         vertex_t,
-                         typename EdgeDstValueInputWrapper::value_iterator>,
-                       edge_partition_endpoint_property_device_view_t<
-                         vertex_t,
-                         typename EdgeDstValueInputWrapper::value_iterator>>>;
+    edge_partition_endpoint_property_device_view_t<
+      vertex_t,
+      typename EdgeDstValueInputWrapper::value_iterator,
+      typename EdgeDstValueInputWrapper::value_type>>;
   using edge_partition_e_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_edge_property_device_view_t<
       edge_t,
-      typename EdgeValueInputWrapper::value_iterator>>;
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
 
   static_assert(GraphViewType::is_storage_transposed == incoming);
   static_assert(!std::is_same_v<output_key_t, void> ||
diff --git a/cpp/src/prims/edge_bucket.cuh b/cpp/src/prims/edge_bucket.cuh
new file mode 100644
index 00000000000..e23ffb22bc5
--- /dev/null
+++ b/cpp/src/prims/edge_bucket.cuh
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/utilities/error.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/distance.h>
+#include <thrust/fill.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/merge.h>
+#include <thrust/partition.h>
+#include <thrust/reduce.h>
+#include <thrust/remove.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+#include <thrust/unique.h>
+
+#include <cinttypes>
+#include <cstddef>
+#include <optional>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+namespace cugraph {
+
+// key type is either thrust::tuple<vertex_t, vertex_t> (tag_t == void) or thrust::tuple<vertex_t,
+// vertex_t, tag_t> (tag_t != void). tag_t can be used to point a specific edge if there are
+// multiple edges between a source and a destination (e.g. tag_t can be an edge ID type). If
+// sorted_unique is true, stores unique key objects in the sorted (non-descending) order. If false,
+// there can be duplicates and the elements may not be sorted. Use source as the primary key and
+// destination as the secondary key for sorting if src_major is true. Use destination as the primary
+// key and source as the secondary key if src_major is false. If tag_t is not void, use tag as the
+// tertiary key in sorting.
+template <typename vertex_t,
+          typename tag_t     = void,
+          bool src_major     = false,
+          bool multi_gpu     = false,
+          bool sorted_unique = false>
+class edge_bucket_t {
+ public:
+  using key_type = std::conditional_t<std::is_same_v<tag_t, void>,
+                                      thrust::tuple<vertex_t, vertex_t>,
+                                      thrust::tuple<vertex_t, vertex_t, tag_t>>;
+
+  static bool constexpr is_src_major     = src_major;
+  static bool constexpr is_sorted_unique = sorted_unique;
+
+  static_assert(std::is_same_v<tag_t, void> || std::is_arithmetic_v<tag_t>);
+
+  using optional_buffer_type = std::
+    conditional_t<std::is_same_v<tag_t, void>, std::byte /* dummy */, rmm::device_uvector<tag_t>>;
+
+  template <typename tag_type = tag_t, std::enable_if_t<std::is_same_v<tag_type, void>>* = nullptr>
+  edge_bucket_t(raft::handle_t const& handle)
+    : handle_ptr_(&handle),
+      majors_(0, handle.get_stream()),
+      minors_(0, handle.get_stream()),
+      tags_(std::byte{0})
+  {
+  }
+
+  template <typename tag_type = tag_t, std::enable_if_t<!std::is_same_v<tag_type, void>>* = nullptr>
+  edge_bucket_t(raft::handle_t const& handle)
+    : handle_ptr_(&handle),
+      majors_(0, handle.get_stream()),
+      minors_(0, handle.get_stream()),
+      tags_(0, handle.get_stream())
+  {
+  }
+
+  /**
+   * @ brief insert an edge to the bucket
+   *
+   * @param src edge source vertex.
+   * @param dst edge destination vertex.
+   */
+  template <typename tag_type = tag_t, std::enable_if_t<std::is_same_v<tag_type, void>>* = nullptr>
+  void insert(vertex_t src, vertex_t dst)
+  {
+    if (majors_.size() > 0) {
+      rmm::device_scalar<vertex_t> tmp_src(src, handle_ptr_->get_stream());
+      rmm::device_scalar<vertex_t> tmp_dst(dst, handle_ptr_->get_stream());
+      auto pair_first =
+        thrust::make_zip_iterator(thrust::make_tuple(tmp_src.data(), tmp_dst.data()));
+      insert(tmp_src.data(), tmp_src.data() + 1, tmp_dst.data());
+    } else {
+      auto major = src_major ? src : dst;
+      auto minor = src_major ? dst : src;
+      majors_.resize(1, handle_ptr_->get_stream());
+      minors_.resize(1, handle_ptr_->get_stream());
+      auto pair_first =
+        thrust::make_zip_iterator(thrust::make_tuple(majors_.data(), minors_.data()));
+      thrust::fill(handle_ptr_->get_thrust_policy(),
+                   pair_first,
+                   pair_first + 1,
+                   thrust::make_tuple(major, minor));
+    }
+  }
+
+  /**
+   * @ brief insert a tagged-edge to the bucket
+   *
+   * @param src edge source vertex.
+   * @param dst edge destination vertex.
+   * @param tag edge tag.
+   */
+  template <typename tag_type = tag_t, std::enable_if_t<!std::is_same_v<tag_type, void>>* = nullptr>
+  void insert(vertex_t src, vertex_t dst, tag_type tag)
+  {
+    if (majors_.size() > 0) {
+      rmm::device_scalar<vertex_t> tmp_src(src, handle_ptr_->get_stream());
+      rmm::device_scalar<vertex_t> tmp_dst(dst, handle_ptr_->get_stream());
+      rmm::device_scalar<tag_t> tmp_tag(tag, handle_ptr_->get_stream());
+      auto triplet_first = thrust::make_zip_iterator(
+        thrust::make_tuple(tmp_src.data(), tmp_dst.data(), tmp_tag.data()));
+      insert(tmp_src.data(), tmp_src.data() + 1, tmp_dst.data(), tmp_tag.data());
+    } else {
+      auto major = src_major ? src : dst;
+      auto minor = src_major ? dst : src;
+      majors_.resize(1, handle_ptr_->get_stream());
+      minors_.resize(1, handle_ptr_->get_stream());
+      tags_.resize(1, handle_ptr_->get_stream());
+      auto triplet_first =
+        thrust::make_zip_iterator(thrust::make_tuple(majors_.data(), minors_.data(), tags_.data()));
+      thrust::fill(handle_ptr_->get_thrust_policy(),
+                   triplet_first,
+                   triplet_first + 1,
+                   thrust::make_tuple(major, minor, tag));
+    }
+  }
+
+  /**
+   * @ brief insert a list of edges to the bucket
+   *
+   * @param src_first Iterator pointing to the first (inclusive) element of the edge source vertices
+   * in device memory.
+   * @param src_last Iterator pointing to the last (exclusive) element of the edge source vertices
+   * stored in device memory.
+   * @param dst_first Iterator pointing to the first (inclusive) element of the edge destination
+   * vertices in device memory.
+   */
+  template <typename VertexIterator,
+            typename tag_type                                 = tag_t,
+            std::enable_if_t<std::is_same_v<tag_type, void>>* = nullptr>
+  void insert(VertexIterator src_first, VertexIterator src_last, VertexIterator dst_first)
+  {
+    static_assert(
+      std::is_same_v<typename thrust::iterator_traits<VertexIterator>::value_type, vertex_t>);
+
+    auto major_first    = src_major ? src_first : dst_first;
+    auto major_last     = major_first + thrust::distance(src_first, src_last);
+    auto minor_first    = src_major ? dst_first : src_first;
+    auto new_pair_first = thrust::make_zip_iterator(thrust::make_tuple(major_first, minor_first));
+
+    if (majors_.size() > 0) {
+      if constexpr (sorted_unique) {
+        rmm::device_uvector<vertex_t> merged_majors(
+          majors_.size() + thrust::distance(major_first, major_last), handle_ptr_->get_stream());
+        rmm::device_uvector<vertex_t> merged_minors(merged_majors.size(),
+                                                    handle_ptr_->get_stream());
+        auto old_pair_first =
+          thrust::make_zip_iterator(thrust::make_tuple(majors_.begin(), minors_.begin()));
+        auto merged_pair_first = thrust::make_zip_iterator(
+          thrust::make_tuple(merged_majors.begin(), merged_minors.begin()));
+        thrust::merge(handle_ptr_->get_thrust_policy(),
+                      old_pair_first,
+                      old_pair_first + majors_.size(),
+                      new_pair_first,
+                      new_pair_first + thrust::distance(major_first, major_last),
+                      merged_pair_first);
+        merged_majors.resize(
+          thrust::distance(merged_pair_first,
+                           thrust::unique(handle_ptr_->get_thrust_policy(),
+                                          merged_pair_first,
+                                          merged_pair_first + merged_majors.size())),
+          handle_ptr_->get_stream());
+        merged_minors.resize(merged_majors.size(), handle_ptr_->get_stream());
+        merged_majors.shrink_to_fit(handle_ptr_->get_stream());
+        merged_minors.shrink_to_fit(handle_ptr_->get_stream());
+        majors_ = std::move(merged_majors);
+        minors_ = std::move(merged_minors);
+      } else {
+        auto cur_size = majors_.size();
+        majors_.resize(cur_size + thrust::distance(major_first, major_last),
+                       handle_ptr_->get_stream());
+        minors_.resize(majors_.size(), handle_ptr_->get_stream());
+        thrust::copy(
+          handle_ptr_->get_thrust_policy(),
+          new_pair_first,
+          new_pair_first + thrust::distance(major_first, major_last),
+          thrust::make_zip_iterator(thrust::make_tuple(majors_.begin(), minors_.begin())) +
+            cur_size);
+      }
+    } else {
+      majors_.resize(thrust::distance(major_first, major_last), handle_ptr_->get_stream());
+      minors_.resize(majors_.size(), handle_ptr_->get_stream());
+      thrust::copy(handle_ptr_->get_thrust_policy(),
+                   new_pair_first,
+                   new_pair_first + thrust::distance(major_first, major_last),
+                   thrust::make_zip_iterator(thrust::make_tuple(majors_.begin(), minors_.begin())));
+    }
+  }
+
+  /**
+   * @ brief insert a list of tagged-edges to the bucket
+   *
+   * @param src_first Iterator pointing to the first (inclusive) element of the edge source vertices
+   * in device memory.
+   * @param src_last Iterator pointing to the last (exclusive) element of the edge source vertices
+   * stored in device memory.
+   * @param dst_first Iterator pointing to the first (inclusive) element of the edge destination
+   * vertices in device memory.
+   * @param tag_first Iterator pointing to the first (inclusive) element of the edge tags in device
+   * memory.
+   */
+  template <typename VertexIterator,
+            typename TagIterator,
+            typename tag_type                                  = tag_t,
+            std::enable_if_t<!std::is_same_v<tag_type, void>>* = nullptr>
+  void insert(VertexIterator src_first,
+              VertexIterator src_last,
+              VertexIterator dst_first,
+              TagIterator tag_first)
+  {
+    static_assert(
+      std::is_same_v<typename thrust::iterator_traits<VertexIterator>::value_type, vertex_t>);
+    static_assert(std::is_same_v<typename thrust::iterator_traits<TagIterator>::value_type, tag_t>);
+
+    auto major_first = src_major ? src_first : dst_first;
+    auto major_last  = major_first + thrust::distance(src_first, src_last);
+    auto minor_first = src_major ? dst_first : src_first;
+    auto new_triplet_first =
+      thrust::make_zip_iterator(thrust::make_tuple(major_first, minor_first, tag_first));
+
+    if (majors_.size() > 0) {
+      if constexpr (sorted_unique) {
+        rmm::device_uvector<vertex_t> merged_majors(
+          majors_.size() + thrust::distance(major_first, major_last), handle_ptr_->get_stream());
+        rmm::device_uvector<vertex_t> merged_minors(merged_majors.size(),
+                                                    handle_ptr_->get_stream());
+        rmm::device_uvector<tag_t> merged_tags(merged_majors.size(), handle_ptr_->get_stream());
+        auto old_triplet_first = thrust::make_zip_iterator(
+          thrust::make_tuple(majors_.begin(), minors_.begin(), tags_.begin()));
+        auto merged_triplet_first = thrust::make_zip_iterator(
+          thrust::make_tuple(merged_majors.begin(), merged_minors.begin(), merged_tags.begin()));
+        thrust::merge(handle_ptr_->get_thrust_policy(),
+                      old_triplet_first,
+                      old_triplet_first + majors_.size(),
+                      new_triplet_first,
+                      new_triplet_first + thrust::distance(major_first, major_last),
+                      merged_triplet_first);
+        merged_majors.resize(
+          thrust::distance(merged_triplet_first,
+                           thrust::unique(handle_ptr_->get_thrust_policy(),
+                                          merged_triplet_first,
+                                          merged_triplet_first + merged_majors.size())),
+          handle_ptr_->get_stream());
+        merged_minors.resize(merged_majors.size(), handle_ptr_->get_stream());
+        merged_tags.resize(merged_majors.size(), handle_ptr_->get_stream());
+        merged_majors.shrink_to_fit(handle_ptr_->get_stream());
+        merged_minors.shrink_to_fit(handle_ptr_->get_stream());
+        merged_tags.shrink_to_fit(handle_ptr_->get_stream());
+        majors_ = std::move(merged_majors);
+        minors_ = std::move(merged_minors);
+        tags_   = std::move(merged_tags);
+      } else {
+        auto cur_size = majors_.size();
+        majors_.resize(cur_size + thrust::distance(major_first, major_last),
+                       handle_ptr_->get_stream());
+        minors_.resize(majors_.size(), handle_ptr_->get_stream());
+        tags_.resize(majors_.size(), handle_ptr_->get_stream());
+        thrust::copy(handle_ptr_->get_thrust_policy(),
+                     new_triplet_first,
+                     new_triplet_first + thrust::distance(major_first, major_last),
+                     thrust::make_zip_iterator(
+                       thrust::make_tuple(majors_.begin(), minors_.begin(), tags_.begin())) +
+                       cur_size);
+      }
+    } else {
+      majors_.resize(thrust::distance(major_first, major_last), handle_ptr_->get_stream());
+      minors_.resize(majors_.size(), handle_ptr_->get_stream());
+      tags_.resize(majors_.size(), handle_ptr_->get_stream());
+      thrust::copy(handle_ptr_->get_thrust_policy(),
+                   new_triplet_first,
+                   new_triplet_first + thrust::distance(major_first, major_last),
+                   thrust::make_zip_iterator(
+                     thrust::make_tuple(majors_.begin(), minors_.begin(), tags_.begin())));
+    }
+  }
+
+  size_t size() const { return majors_.size(); }
+
+  template <bool do_aggregate = multi_gpu>
+  std::enable_if_t<do_aggregate, size_t> aggregate_size() const
+  {
+    return host_scalar_allreduce(
+      handle_ptr_->get_comms(), majors_.size(), raft::comms::op_t::SUM, handle_ptr_->get_stream());
+  }
+
+  template <bool do_aggregate = multi_gpu>
+  std::enable_if_t<!do_aggregate, size_t> aggregate_size() const
+  {
+    return majors_.size();
+  }
+
+  void resize(size_t size)
+  {
+    majors_.resize(size, handle_ptr_->get_stream());
+    minors_.resize(size, handle_ptr_->get_stream());
+    if constexpr (!std::is_same_v<tag_t, void>) { tags_.resize(size, handle_ptr_->get_stream()); }
+  }
+
+  void clear() { resize(0); }
+
+  void shrink_to_fit()
+  {
+    majors_.shrink_to_fit(handle_ptr_->get_stream());
+    minors_.shrink_to_fit(handle_ptr_->get_stream());
+    if constexpr (!std::is_same_v<tag_t, void>) { tags_.shrink_to_fit(handle_ptr_->get_stream()); }
+  }
+
+  auto const src_begin() const { return src_major ? majors_.begin() : minors_.begin(); }
+
+  auto src_begin() { return src_major ? majors_.begin() : minors_.begin(); }
+
+  auto const src_end() const { return (src_major ? majors_.begin() : minors_.begin()) + size(); }
+
+  auto src_end() { return (src_major ? majors_.begin() : minors_.begin()) + size(); }
+
+  auto const dst_begin() const { return src_major ? minors_.begin() : majors_.begin(); }
+
+  auto dst_begin() { return src_major ? minors_.begin() : majors_.begin(); }
+
+  auto const dst_end() const { return (src_major ? minors_.begin() : majors_.begin()) + size(); }
+
+  auto dst_end() { return (src_major ? minors_.begin() : majors_.begin()) + size(); }
+
+  auto const tag_begin() const { return tags_.begin(); }
+
+  auto tag_begin() { return tags_.begin(); }
+
+  auto const tag_end() const { return tags_.begin() + size(); }
+
+  auto tag_end() { tags_.begin() + size(); }
+
+ private:
+  raft::handle_t const* handle_ptr_{nullptr};
+  rmm::device_uvector<vertex_t> majors_;
+  rmm::device_uvector<vertex_t> minors_;
+  optional_buffer_type tags_;
+};
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh
index fcd5e4c1483..f135b76d6e3 100644
--- a/cpp/src/prims/extract_transform_e.cuh
+++ b/cpp/src/prims/extract_transform_e.cuh
@@ -107,6 +107,8 @@ extract_transform_e(raft::handle_t const& handle,
   static_assert(!std::is_same_v<e_op_result_t, void>);
   using payload_t = typename e_op_result_t::value_type;
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   // FIXME: Consider updating detail::extract_transform_v_forntier_e to take std::nullopt to as a
   // frontier or create a new key bucket type that just stores [vertex_first, vertex_last) for
   // further optimization. Better revisit this once this becomes a performance bottleneck and after
diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
index fb5f6991073..42af8a1164d 100644
--- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
+++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh
@@ -98,6 +98,8 @@ extract_transform_v_frontier_outgoing_e(raft::handle_t const& handle,
   static_assert(!std::is_same_v<e_op_result_t, void>);
   using payload_t = typename e_op_result_t::value_type;
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   auto value_buffer = allocate_dataframe_buffer<payload_t>(size_t{0}, handle.get_stream());
   std::tie(std::ignore, value_buffer) =
     detail::extract_transform_v_frontier_e<false, void, payload_t>(handle,
diff --git a/cpp/src/prims/fill_edge_property.cuh b/cpp/src/prims/fill_edge_property.cuh
new file mode 100644
index 00000000000..d446944b65b
--- /dev/null
+++ b/cpp/src/prims/fill_edge_property.cuh
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/edge_property.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/error.hpp>
+
+#include <raft/core/handle.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/fill.h>
+
+#include <cstddef>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename GraphViewType, typename T, typename EdgePropertyOutputWrapper>
+void fill_edge_property(raft::handle_t const& handle,
+                        GraphViewType const& graph_view,
+                        T input,
+                        EdgePropertyOutputWrapper edge_property_output)
+{
+  static_assert(std::is_same_v<T, typename EdgePropertyOutputWrapper::value_type>);
+
+  auto value_firsts = edge_property_output.value_firsts();
+  auto edge_counts  = edge_property_output.edge_counts();
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    if constexpr (cugraph::has_packed_bool_element<
+                    std::remove_reference_t<decltype(value_firsts[i])>,
+                    T>()) {
+      static_assert(std::is_arithmetic_v<T>, "unimplemented for thrust::tuple types.");
+      auto packed_input = input ? packed_bool_full_mask() : packed_bool_empty_mask();
+      thrust::fill_n(handle.get_thrust_policy(),
+                     value_firsts[i],
+                     packed_bool_size(static_cast<size_t>(edge_counts[i])),
+                     packed_input);
+    } else {
+      thrust::fill_n(
+        handle.get_thrust_policy(), value_firsts[i], static_cast<size_t>(edge_counts[i]), input);
+    }
+  }
+}
+
+}  // namespace detail
+
+/**
+ * @brief Fill graph edge property values to the input value.
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam T Type of the edge property values.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param input Edge property values will be set to @p input.
+ * @param edge_property_output edge_property_t class object to store edge property values (for the
+ * edges assigned to this process in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename GraphViewType, typename T>
+void fill_edge_property(raft::handle_t const& handle,
+                        GraphViewType const& graph_view,
+                        T input,
+                        edge_property_t<GraphViewType, T>& edge_property_output,
+                        bool do_expensive_check = false)
+{
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
+  if (do_expensive_check) {
+    // currently, nothing to do
+  }
+
+  detail::fill_edge_property(handle, graph_view, input, edge_property_output.mutable_view());
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh
index b3c261f1327..86e23a1a04e 100644
--- a/cpp/src/prims/fill_edge_src_dst_property.cuh
+++ b/cpp/src/prims/fill_edge_src_dst_property.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,20 +36,35 @@ void fill_edge_major_property(raft::handle_t const& handle,
                               T input,
                               EdgeMajorPropertyOutputWrapper edge_major_property_output)
 {
+  static_assert(std::is_same_v<T, typename EdgeMajorPropertyOutputWrapper::value_type>);
+
   auto keys         = edge_major_property_output.keys();
   auto value_firsts = edge_major_property_output.value_firsts();
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
-    size_t buffer_size{0};
+    size_t num_buffer_elements{0};
     if (keys) {
-      buffer_size = (*keys)[i].size();
+      num_buffer_elements = (*keys)[i].size();
     } else {
       if constexpr (GraphViewType::is_storage_transposed) {
-        buffer_size = static_cast<size_t>(graph_view.local_edge_partition_dst_range_size(i));
+        num_buffer_elements =
+          static_cast<size_t>(graph_view.local_edge_partition_dst_range_size(i));
       } else {
-        buffer_size = static_cast<size_t>(graph_view.local_edge_partition_src_range_size(i));
+        num_buffer_elements =
+          static_cast<size_t>(graph_view.local_edge_partition_src_range_size(i));
       }
     }
-    thrust::fill_n(handle.get_thrust_policy(), value_firsts[i], buffer_size, input);
+    if constexpr (cugraph::has_packed_bool_element<
+                    std::remove_reference_t<decltype(value_firsts[i])>,
+                    T>()) {
+      static_assert(std::is_arithmetic_v<T>, "unimplemented for thrust::tuple types.");
+      auto packed_input = input ? packed_bool_full_mask() : packed_bool_empty_mask();
+      thrust::fill_n(handle.get_thrust_policy(),
+                     value_firsts[i],
+                     packed_bool_size(num_buffer_elements),
+                     packed_input);
+    } else {
+      thrust::fill_n(handle.get_thrust_policy(), value_firsts[i], num_buffer_elements, input);
+    }
   }
 }
 
@@ -59,19 +74,28 @@ void fill_edge_minor_property(raft::handle_t const& handle,
                               T input,
                               EdgeMinorPropertyOutputWrapper edge_minor_property_output)
 {
+  static_assert(std::is_same_v<T, typename EdgeMinorPropertyOutputWrapper::value_type>);
+
   auto keys = edge_minor_property_output.keys();
-  size_t buffer_size{0};
+  size_t num_buffer_elements{0};
   if (keys) {
-    buffer_size = (*keys).size();
+    num_buffer_elements = (*keys).size();
   } else {
     if constexpr (GraphViewType::is_storage_transposed) {
-      buffer_size = static_cast<size_t>(graph_view.local_edge_partition_src_range_size());
+      num_buffer_elements = static_cast<size_t>(graph_view.local_edge_partition_src_range_size());
     } else {
-      buffer_size = static_cast<size_t>(graph_view.local_edge_partition_dst_range_size());
+      num_buffer_elements = static_cast<size_t>(graph_view.local_edge_partition_dst_range_size());
     }
   }
-  thrust::fill_n(
-    handle.get_thrust_policy(), edge_minor_property_output.value_first(), buffer_size, input);
+  auto value_first = edge_minor_property_output.value_first();
+  if constexpr (cugraph::has_packed_bool_element<decltype(value_first), T>()) {
+    static_assert(std::is_arithmetic_v<T>, "unimplemented for thrust::tuple types.");
+    auto packed_input = input ? packed_bool_full_mask() : packed_bool_empty_mask();
+    thrust::fill_n(
+      handle.get_thrust_policy(), value_first, packed_bool_size(num_buffer_elements), packed_input);
+  } else {
+    thrust::fill_n(handle.get_thrust_policy(), value_first, num_buffer_elements, input);
+  }
 }
 
 }  // namespace detail
@@ -99,6 +123,8 @@ void fill_edge_src_property(raft::handle_t const& handle,
                             edge_src_property_t<GraphViewType, T>& edge_src_property_output,
                             bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -135,6 +161,8 @@ void fill_edge_dst_property(raft::handle_t const& handle,
                             edge_dst_property_t<GraphViewType, T>& edge_dst_property_output,
                             bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh
index 5e783705700..f30de0750e3 100644
--- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh
+++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh
@@ -41,6 +41,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/merge.h>
+#include <thrust/optional.h>
 #include <thrust/reduce.h>
 #include <thrust/sort.h>
 #include <thrust/tabulate.h>
@@ -206,6 +207,8 @@ void per_v_pair_transform_dst_nbr_intersection(
   using property_t = typename thrust::iterator_traits<VertexValueInputIterator>::value_type;
   using result_t   = typename thrust::iterator_traits<VertexPairValueOutputIterator>::value_type;
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     auto num_invalids =
       detail::count_invalid_vertex_pairs(handle, graph_view, vertex_pair_first, vertex_pair_last);
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 2b512d16cf1..69cce08d352 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -287,7 +287,7 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
 #ifndef NO_CUGRAPH_OPS
   edge_t mid_partition_degree_range_last = static_cast<edge_t>(K * 10);  // tuning parameter
   assert(mid_partition_degree_range_last > K);
-  size_t high_partition_over_sampling_K = K * 2;  // tuning parameter
+  size_t high_partition_over_sampling_K = K * 2;                         // tuning parameter
   assert(high_partition_over_sampling_K > K);
 
   rmm::device_uvector<edge_t> sample_nbr_indices(frontier_degrees.size() * K, handle.get_stream());
@@ -313,34 +313,33 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
   auto high_partition_size = frontier_degrees.size() - (low_partition_size + mid_partition_size);
 
   if (low_partition_size > 0) {
-    thrust::for_each(
-      handle.get_thrust_policy(),
-      thrust::make_counting_iterator(size_t{0}),
-      thrust::make_counting_iterator(low_partition_size * K),
-      [K,
-       low_first,
-       sample_nbr_indices = sample_nbr_indices.data(),
-       invalid_idx        = cugraph::ops::gnn::graph::INVALID_ID<edge_t>] __device__(size_t i) {
-        auto pair       = *(low_first + (i / K));
-        auto degree     = thrust::get<0>(pair);
-        auto seed_idx   = thrust::get<1>(pair);
-        auto sample_idx = static_cast<edge_t>(i % K);
-        sample_nbr_indices[seed_idx * K + sample_idx] =
-          (sample_idx < degree) ? sample_idx : invalid_idx;
-      });
+    thrust::for_each(handle.get_thrust_policy(),
+                     thrust::make_counting_iterator(size_t{0}),
+                     thrust::make_counting_iterator(low_partition_size * K),
+                     [K,
+                      low_first,
+                      sample_nbr_indices = sample_nbr_indices.data(),
+                      invalid_idx = cugraph::ops::graph::INVALID_ID<edge_t>] __device__(size_t i) {
+                       auto pair       = *(low_first + (i / K));
+                       auto degree     = thrust::get<0>(pair);
+                       auto seed_idx   = thrust::get<1>(pair);
+                       auto sample_idx = static_cast<edge_t>(i % K);
+                       sample_nbr_indices[seed_idx * K + sample_idx] =
+                         (sample_idx < degree) ? sample_idx : invalid_idx;
+                     });
   }
 
   if (mid_partition_size > 0) {
     rmm::device_uvector<edge_t> tmp_sample_nbr_indices(mid_partition_size * K, handle.get_stream());
     // FIXME: we can avoid the follow-up copy if get_sampling_index takes output offsets for
     // sampling output
-    cugraph::ops::gnn::graph::get_sampling_index(tmp_sample_nbr_indices.data(),
-                                                 rng_state,
-                                                 thrust::get<0>(mid_first.get_iterator_tuple()),
-                                                 mid_partition_size,
-                                                 static_cast<int32_t>(K),
-                                                 false,
-                                                 handle.get_stream());
+    cugraph::ops::graph::get_sampling_index(tmp_sample_nbr_indices.data(),
+                                            rng_state,
+                                            thrust::get<0>(mid_first.get_iterator_tuple()),
+                                            mid_partition_size,
+                                            static_cast<int32_t>(K),
+                                            false,
+                                            handle.get_stream());
     thrust::for_each(handle.get_thrust_policy(),
                      thrust::make_counting_iterator(size_t{0}),
                      thrust::make_counting_iterator(mid_partition_size * K),
@@ -415,7 +414,7 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
             (*retry_segment_indices).size() * high_partition_over_sampling_K, handle.get_stream());
         }
 
-        cugraph::ops::gnn::graph::get_sampling_index(
+        cugraph::ops::graph::get_sampling_index(
           retry_segment_indices ? (*retry_sample_nbr_indices).data()
                                 : tmp_sample_nbr_indices.data(),
           rng_state,
@@ -683,19 +682,22 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeSrcValueInputWrapper::value_iterator>>;
+      typename EdgeSrcValueInputWrapper::value_iterator,
+      typename EdgeSrcValueInputWrapper::value_type>>;
   using edge_partition_dst_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
     edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeDstValueInputWrapper::value_iterator>>;
+      typename EdgeDstValueInputWrapper::value_iterator,
+      typename EdgeDstValueInputWrapper::value_type>>;
   using edge_partition_e_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_edge_property_device_view_t<
       edge_t,
-      typename EdgeValueInputWrapper::value_iterator>>;
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
 
   static_assert(GraphViewType::is_storage_transposed == incoming);
   static_assert(std::is_same_v<
@@ -859,13 +861,13 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
   if (with_replacement) {
     if (frontier_degrees.size() > 0) {
       sample_nbr_indices.resize(frontier.size() * K, handle.get_stream());
-      cugraph::ops::gnn::graph::get_sampling_index(sample_nbr_indices.data(),
-                                                   rng_state,
-                                                   frontier_degrees.data(),
-                                                   static_cast<edge_t>(frontier_degrees.size()),
-                                                   static_cast<int32_t>(K),
-                                                   with_replacement,
-                                                   handle.get_stream());
+      cugraph::ops::graph::get_sampling_index(sample_nbr_indices.data(),
+                                              rng_state,
+                                              frontier_degrees.data(),
+                                              static_cast<edge_t>(frontier_degrees.size()),
+                                              static_cast<int32_t>(K),
+                                              with_replacement,
+                                              handle.get_stream());
       frontier_degrees.resize(0, handle.get_stream());
       frontier_degrees.shrink_to_fit(handle.get_stream());
     }
@@ -880,7 +882,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
     sample_nbr_indices);  // neighbor index within an edge partition (note that each vertex's
                           // neighbors are distributed in minor_comm_size partitions)
   std::optional<rmm::device_uvector<size_t>> sample_key_indices{
-    std::nullopt};  // relevant only when (minor_comm_size > 1)
+    std::nullopt};        // relevant only when (minor_comm_size > 1)
   auto local_frontier_sample_counts        = std::vector<size_t>{};
   auto local_frontier_sample_displacements = std::vector<size_t>{};
   if (minor_comm_size > 1) {
@@ -912,7 +914,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
         raft::device_span<size_t>(d_tx_counts.data(), d_tx_counts.size()),
         frontier.size(),
         minor_comm_size,
-        cugraph::ops::gnn::graph::INVALID_ID<edge_t>});
+        cugraph::ops::graph::INVALID_ID<edge_t>});
     rmm::device_uvector<size_t> tx_displacements(minor_comm_size, handle.get_stream());
     thrust::exclusive_scan(
       handle.get_thrust_policy(), d_tx_counts.begin(), d_tx_counts.end(), tx_displacements.begin());
@@ -1022,7 +1024,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
           edge_partition_dst_value_input,
           edge_partition_e_value_input,
           e_op,
-          cugraph::ops::gnn::graph::INVALID_ID<edge_t>,
+          cugraph::ops::graph::INVALID_ID<edge_t>,
           to_thrust_optional(invalid_value),
           K});
     } else {
@@ -1047,7 +1049,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
                                          edge_partition_dst_value_input,
                                          edge_partition_e_value_input,
                                          e_op,
-                                         cugraph::ops::gnn::graph::INVALID_ID<edge_t>,
+                                         cugraph::ops::graph::INVALID_ID<edge_t>,
                                          to_thrust_optional(invalid_value),
                                          K});
     }
@@ -1147,7 +1149,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
         count_valids_t<edge_t>{raft::device_span<edge_t const>(sample_local_nbr_indices.data(),
                                                                sample_local_nbr_indices.size()),
                                K,
-                               cugraph::ops::gnn::graph::INVALID_ID<edge_t>});
+                               cugraph::ops::graph::INVALID_ID<edge_t>});
       (*sample_offsets).set_element_to_zero_async(size_t{0}, handle.get_stream());
       auto typecasted_sample_count_first =
         thrust::make_transform_iterator(sample_counts.begin(), typecast_t<int32_t, size_t>{});
@@ -1164,7 +1166,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
         thrust::remove_if(handle.get_thrust_policy(),
                           pair_first,
                           pair_first + sample_local_nbr_indices.size(),
-                          check_invalid_t<edge_t, T>{cugraph::ops::gnn::graph::INVALID_ID<edge_t>});
+                          check_invalid_t<edge_t, T>{cugraph::ops::graph::INVALID_ID<edge_t>});
       sample_local_nbr_indices.resize(0, handle.get_stream());
       sample_local_nbr_indices.shrink_to_fit(handle.get_stream());
 
@@ -1191,9 +1193,9 @@ per_v_random_select_transform_e(raft::handle_t const& handle,
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeBiasOp Type of the quaternary (or quinary) edge operator to set-up selection bias
+ * @tparam EdgeBiasOp Type of the quinary edge operator to set-up selection bias
  * values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @tparam T Type of the selected and transformed edge output values.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
@@ -1260,6 +1262,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
                                          std::optional<T> invalid_value,
                                          bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_FAIL("unimplemented.");
 
   return std::make_tuple(std::nullopt,
@@ -1278,7 +1282,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @tparam T Type of the selected and transformed edge output values.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
@@ -1340,6 +1344,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle,
                                          std::optional<T> invalid_value,
                                          bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return detail::per_v_random_select_transform_e<false>(handle,
                                                         graph_view,
                                                         frontier,
diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
index f2313cfc9c2..a4d34443413 100644
--- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
@@ -47,6 +47,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
+#include <thrust/optional.h>
 #include <thrust/reduce.h>
 #include <thrust/scatter.h>
 #include <thrust/sort.h>
@@ -284,6 +285,8 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
       edge_t,
       typename EdgeValueInputWrapper::value_iterator>>;
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) { /* currently, nothing to do */
   }
 
@@ -748,8 +751,8 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
       tmp_majors.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_values.begin()));
     auto major_value_map_device_view =
       (GraphViewType::is_multi_gpu && edge_src_value_input.keys())
-        ? thrust::make_optional<detail::kv_binary_search_store_device_view_t<decltype(
-            multi_gpu_major_value_map_ptr->view())>>(multi_gpu_major_value_map_ptr->view())
+        ? thrust::make_optional<detail::kv_binary_search_store_device_view_t<
+            decltype(multi_gpu_major_value_map_ptr->view())>>(multi_gpu_major_value_map_ptr->view())
         : thrust::nullopt;
     std::conditional_t<KVStoreViewType::binary_search,
                        detail::kv_binary_search_store_device_view_t<KVStoreViewType>,
diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
index bf8baf03c80..1349454f5b6 100644
--- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
@@ -149,8 +149,7 @@ __global__ void per_v_transform_reduce_e_hypersparse(
             auto e_op_result  = transform_op(i);
             auto minor        = indices[i];
             auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-            reduce_op::atomic_reduce<ReduceOp>(result_value_output.get_iter(minor_offset),
-                                               e_op_result);
+            reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
           });
       } else {
         thrust::for_each(
@@ -259,8 +258,7 @@ __global__ void per_v_transform_reduce_e_low_degree(
             auto e_op_result  = transform_op(i);
             auto minor        = indices[i];
             auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-            reduce_op::atomic_reduce<ReduceOp>(result_value_output.get_iter(minor_offset),
-                                               e_op_result);
+            reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
           });
       } else {
         thrust::for_each(
@@ -356,8 +354,7 @@ __global__ void per_v_transform_reduce_e_mid_degree(
         reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result);
       } else {
         if constexpr (GraphViewType::is_multi_gpu) {
-          reduce_op::atomic_reduce<ReduceOp>(result_value_output.get_iter(minor_offset),
-                                             e_op_result);
+          reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
         } else {
           reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
         }
@@ -446,8 +443,7 @@ __global__ void per_v_transform_reduce_e_high_degree(
         reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result);
       } else {
         if constexpr (GraphViewType::is_multi_gpu) {
-          reduce_op::atomic_reduce<ReduceOp>(result_value_output.get_iter(minor_offset),
-                                             e_op_result);
+          reduce_op::atomic_reduce<ReduceOp>(result_value_output, minor_offset, e_op_result);
         } else {
           reduce_op::atomic_reduce<ReduceOp>(result_value_output + minor_offset, e_op_result);
         }
@@ -498,19 +494,22 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
     detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeSrcValueInputWrapper::value_iterator>>;
+      typename EdgeSrcValueInputWrapper::value_iterator,
+      typename EdgeSrcValueInputWrapper::value_type>>;
   using edge_partition_dst_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeDstValueInputWrapper::value_iterator>>;
+      typename EdgeDstValueInputWrapper::value_iterator,
+      typename EdgeDstValueInputWrapper::value_type>>;
   using edge_partition_e_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_edge_property_device_view_t<
       edge_t,
-      typename EdgeValueInputWrapper::value_iterator>>;
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
 
   static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
 
@@ -548,20 +547,23 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
                    init);
     }
   } else {
-    auto minor_init = init;
-    if constexpr (GraphViewType::is_multi_gpu) {
-      auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
-      auto const major_comm_rank = major_comm.get_rank();
-      minor_init                 = (major_comm_rank == 0) ? init : ReduceOp::identity_element;
-    }
-
     if constexpr (GraphViewType::is_multi_gpu) {
+      auto minor_init = init;
+      auto view       = minor_tmp_buffer.view();
+      if (view.keys()) {  // defer applying the initial value to the end as minor_tmp_buffer may not
+                          // store values for the entire minor range
+        minor_init = ReduceOp::identity_element;
+      } else {
+        auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+        auto const major_comm_rank = major_comm.get_rank();
+        minor_init                 = (major_comm_rank == 0) ? init : ReduceOp::identity_element;
+      }
       fill_edge_minor_property(handle, graph_view, minor_init, minor_tmp_buffer.mutable_view());
     } else {
       thrust::fill(handle.get_thrust_policy(),
                    vertex_value_output_first,
                    vertex_value_output_first + graph_view.local_vertex_partition_range_size(),
-                   minor_init);
+                   init);
     }
   }
 
@@ -912,7 +914,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
     auto const minor_comm_size = minor_comm.get_size();
 
     auto view = minor_tmp_buffer.view();
-    if (view.keys()) {
+    if (view.keys()) {  // applying the initial value is deferred to here
       vertex_t max_vertex_partition_size{0};
       for (int i = 0; i < major_comm_size; ++i) {
         auto this_segment_vertex_partition_id =
@@ -923,7 +925,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
                    graph_view.vertex_partition_range_size(this_segment_vertex_partition_id));
       }
       auto tx_buffer = allocate_dataframe_buffer<T>(max_vertex_partition_size, handle.get_stream());
-      auto tx_first  = get_dataframe_buffer_begin(tx_buffer);
+      auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer);
       std::optional<raft::host_span<vertex_t const>> minor_key_offsets{};
       if constexpr (GraphViewType::is_storage_transposed) {
         minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets();
@@ -931,25 +933,29 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
         minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets();
       }
       for (int i = 0; i < major_comm_size; ++i) {
+        auto minor_init = (major_comm_rank == i) ? init : ReduceOp::identity_element;
         auto this_segment_vertex_partition_id =
           compute_local_edge_partition_minor_range_vertex_partition_id_t{
             major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i);
-        thrust::fill(
-          handle.get_thrust_policy(),
-          tx_first,
-          tx_first + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id),
-          ReduceOp::identity_element);
+        thrust::fill(handle.get_thrust_policy(),
+                     tx_buffer_first,
+                     tx_buffer_first +
+                       graph_view.vertex_partition_range_size(this_segment_vertex_partition_id),
+                     minor_init);
+        auto value_first = thrust::make_transform_iterator(
+          view.value_first(),
+          [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); });
         thrust::scatter(
           handle.get_thrust_policy(),
-          view.value_first() + (*minor_key_offsets)[i],
-          view.value_first() + (*minor_key_offsets)[i + 1],
+          value_first + (*minor_key_offsets)[i],
+          value_first + (*minor_key_offsets)[i + 1],
           thrust::make_transform_iterator(
             (*(view.keys())).begin() + (*minor_key_offsets)[i],
             [key_first = graph_view.vertex_partition_range_first(
                this_segment_vertex_partition_id)] __device__(auto key) { return key - key_first; }),
-          tx_first);
+          tx_buffer_first);
         device_reduce(major_comm,
-                      tx_first,
+                      tx_buffer_first,
                       vertex_value_output_first,
                       static_cast<size_t>(
                         graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)),
@@ -993,7 +999,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @tparam ReduceOp Type of the binary reduction operator.
  * @tparam T Type of the initial value for per-vertex reduction.
  * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables.
@@ -1048,6 +1054,8 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle,
                                        VertexValueOutputIterator vertex_value_output_first,
                                        bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -1072,7 +1080,7 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle,
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @tparam ReduceOp Type of the binary reduction operator.
  * @tparam T Type of the initial value for per-vertex reduction.
  * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables.
@@ -1127,6 +1135,8 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle,
                                        VertexValueOutputIterator vertex_value_output_first,
                                        bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/property_op_utils.cuh b/cpp/src/prims/property_op_utils.cuh
index a55dbfbe5ba..8d74e6be292 100644
--- a/cpp/src/prims/property_op_utils.cuh
+++ b/cpp/src/prims/property_op_utils.cuh
@@ -93,95 +93,6 @@ struct intersection_op_result_type<
                                            raft::device_span<vertex_t const>>::type;
 };
 
-template <typename T>
-__device__ std::enable_if_t<std::is_arithmetic<T>::value, void> atomic_add_impl(
-  thrust::detail::any_assign& /* dereferencing thrust::discard_iterator results in this type */ lhs,
-  T const& rhs)
-{
-  // no-op
-}
-
-template <typename T>
-__device__ std::enable_if_t<std::is_arithmetic<T>::value, void> atomic_add_impl(T& lhs,
-                                                                                T const& rhs)
-{
-  atomicAdd(&lhs, rhs);
-}
-
-template <typename Iterator, typename TupleType, size_t I, size_t N>
-struct atomic_add_thrust_tuple_impl {
-  __device__ constexpr void compute(Iterator iter, TupleType const& value) const
-  {
-    atomic_add_impl(thrust::raw_reference_cast(thrust::get<I>(*iter)), thrust::get<I>(value));
-    atomic_add_thrust_tuple_impl<Iterator, TupleType, I + 1, N>().compute(iter, value);
-  }
-};
-
-template <typename Iterator, typename TupleType, size_t I>
-struct atomic_add_thrust_tuple_impl<Iterator, TupleType, I, I> {
-  __device__ constexpr void compute(Iterator iter, TupleType const& value) const {}
-};
-
-template <typename T>
-__device__ std::enable_if_t<std::is_arithmetic<T>::value, void> elementwise_atomic_min_impl(
-  thrust::detail::any_assign& /* dereferencing thrust::discard_iterator results in this type */ lhs,
-  T const& rhs)
-{
-  // no-op
-}
-
-template <typename T>
-__device__ std::enable_if_t<std::is_arithmetic<T>::value, void> elementwise_atomic_min_impl(
-  T& lhs, T const& rhs)
-{
-  atomicMin(&lhs, rhs);
-}
-
-template <typename Iterator, typename TupleType, size_t I, size_t N>
-struct elementwise_atomic_min_thrust_tuple_impl {
-  __device__ constexpr void compute(Iterator iter, TupleType const& value) const
-  {
-    elementwise_atomic_min_impl(thrust::raw_reference_cast(thrust::get<I>(*iter)),
-                                thrust::get<I>(value));
-    elementwise_atomic_min_thrust_tuple_impl<Iterator, TupleType, I + 1, N>().compute(iter, value);
-  }
-};
-
-template <typename Iterator, typename TupleType, size_t I>
-struct elementwise_atomic_min_thrust_tuple_impl<Iterator, TupleType, I, I> {
-  __device__ constexpr void compute(Iterator iter, TupleType const& value) const {}
-};
-
-template <typename T>
-__device__ std::enable_if_t<std::is_arithmetic<T>::value, void> elementwise_atomic_max_impl(
-  thrust::detail::any_assign& /* dereferencing thrust::discard_iterator results in this type */ lhs,
-  T const& rhs)
-{
-  // no-op
-}
-
-template <typename T>
-__device__ std::enable_if_t<std::is_arithmetic<T>::value, void> elementwise_atomic_max_impl(
-  T& lhs, T const& rhs)
-{
-  atomicMax(&lhs, rhs);
-}
-
-template <typename Iterator, typename TupleType, size_t I, size_t N>
-struct elementwise_atomic_max_thrust_tuple_impl {
-  __device__ constexpr void compute(Iterator iter, TupleType const& value) const
-  {
-    elementwise_atomic_max_impl(thrust::raw_reference_cast(thrust::get<I>(*iter)),
-                                thrust::get<I>(value));
-    elementwise_atomic_max_thrust_tuple_impl<Iterator, TupleType, I + 1, N>().compute(iter, value);
-  }
-};
-
-template <typename Iterator, typename TupleType, size_t I>
-struct elementwise_atomic_max_thrust_tuple_impl<Iterator, TupleType, I, I> {
-  __device__ constexpr void compute(Iterator iter, TupleType const& value) const {}
-};
-
 }  // namespace detail
 
 template <typename GraphViewType,
@@ -214,8 +125,7 @@ struct cast_edge_op_bool_to_integer {
 };
 
 template <typename T, template <typename> typename Op>
-struct property_op : public Op<T> {
-};
+struct property_op : public Op<T> {};
 
 template <typename... Args, template <typename> typename Op>
 struct property_op<thrust::tuple<Args...>, Op>
@@ -262,96 +172,4 @@ constexpr std::enable_if_t<std::is_arithmetic<T>::value, T> max_identity_element
   return std::numeric_limits<T>::max();
 }
 
-template <typename Iterator, typename T>
-__device__ std::enable_if_t<thrust::detail::is_discard_iterator<Iterator>::value, void>
-atomic_add_edge_op_result(Iterator iter, T const& value)
-{
-  // no-op
-}
-
-template <typename Iterator, typename T>
-__device__
-  std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
-                     std::is_arithmetic<T>::value,
-                   void>
-  atomic_add_edge_op_result(Iterator iter, T const& value)
-{
-  atomicAdd(&(thrust::raw_reference_cast(*iter)), value);
-}
-
-template <typename Iterator, typename T>
-__device__
-  std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
-                     is_thrust_tuple<T>::value,
-                   void>
-  atomic_add_edge_op_result(Iterator iter, T const& value)
-{
-  static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
-                thrust::tuple_size<T>::value);
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  detail::atomic_add_thrust_tuple_impl<Iterator, T, size_t{0}, tuple_size>().compute(iter, value);
-}
-
-template <typename Iterator, typename T>
-__device__ std::enable_if_t<thrust::detail::is_discard_iterator<Iterator>::value, void>
-elementwise_atomic_min_edge_op_result(Iterator iter, T const& value)
-{
-  // no-op
-}
-
-template <typename Iterator, typename T>
-__device__
-  std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
-                     std::is_arithmetic<T>::value,
-                   void>
-  elementwise_atomic_min_edge_op_result(Iterator iter, T const& value)
-{
-  atomicMin(&(thrust::raw_reference_cast(*iter)), value);
-}
-
-template <typename Iterator, typename T>
-__device__
-  std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
-                     is_thrust_tuple<T>::value,
-                   void>
-  elementwise_atomic_min_edge_op_result(Iterator iter, T const& value)
-{
-  static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
-                thrust::tuple_size<T>::value);
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  detail::elementwise_atomic_min_thrust_tuple_impl<Iterator, T, size_t{0}, tuple_size>().compute(
-    iter, value);
-}
-
-template <typename Iterator, typename T>
-__device__ std::enable_if_t<thrust::detail::is_discard_iterator<Iterator>::value, void>
-elementwise_atomic_max_edge_op_result(Iterator iter, T const& value)
-{
-  // no-op
-}
-
-template <typename Iterator, typename T>
-__device__
-  std::enable_if_t<std::is_same<typename thrust::iterator_traits<Iterator>::value_type, T>::value &&
-                     std::is_arithmetic<T>::value,
-                   void>
-  elementwise_atomic_max_edge_op_result(Iterator iter, T const& value)
-{
-  atomicMax(&(thrust::raw_reference_cast(*iter)), value);
-}
-
-template <typename Iterator, typename T>
-__device__
-  std::enable_if_t<is_thrust_tuple<typename thrust::iterator_traits<Iterator>::value_type>::value &&
-                     is_thrust_tuple<T>::value,
-                   void>
-  elementwise_atomic_max_edge_op_result(Iterator iter, T const& value)
-{
-  static_assert(thrust::tuple_size<typename thrust::iterator_traits<Iterator>::value_type>::value ==
-                thrust::tuple_size<T>::value);
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  detail::elementwise_atomic_max_thrust_tuple_impl<Iterator, T, size_t{0}, tuple_size>().compute(
-    iter, value);
-}
-
 }  // namespace cugraph
diff --git a/cpp/src/prims/reduce_op.cuh b/cpp/src/prims/reduce_op.cuh
index df3bfdf0ee2..922f2d7be0d 100644
--- a/cpp/src/prims/reduce_op.cuh
+++ b/cpp/src/prims/reduce_op.cuh
@@ -18,6 +18,8 @@
 
 #include <prims/property_op_utils.cuh>
 
+#include <cugraph/edge_partition_endpoint_property_device_view.cuh>
+#include <cugraph/utilities/atomic_ops.cuh>
 #include <cugraph/utilities/thrust_tuple_utils.hpp>
 
 #include <raft/core/comms.hpp>
@@ -222,27 +224,23 @@ struct plus {
 };
 
 template <typename ReduceOp, typename = raft::comms::op_t>
-struct has_compatible_raft_comms_op : std::false_type {
-};
+struct has_compatible_raft_comms_op : std::false_type {};
 
 template <typename ReduceOp>
 struct has_compatible_raft_comms_op<ReduceOp,
                                     std::remove_cv_t<decltype(ReduceOp::compatible_raft_comms_op)>>
-  : std::true_type {
-};
+  : std::true_type {};
 
 template <typename ReduceOp>
 inline constexpr bool has_compatible_raft_comms_op_v =
   has_compatible_raft_comms_op<ReduceOp>::value;
 
 template <typename ReduceOp, typename = typename ReduceOp::value_type>
-struct has_identity_element : std::false_type {
-};
+struct has_identity_element : std::false_type {};
 
 template <typename ReduceOp>
 struct has_identity_element<ReduceOp, std::remove_cv_t<decltype(ReduceOp::identity_element)>>
-  : std::true_type {
-};
+  : std::true_type {};
 
 template <typename ReduceOp>
 inline constexpr bool has_identity_element_v = has_identity_element<ReduceOp>::value;
@@ -260,11 +258,34 @@ __device__ std::enable_if_t<has_compatible_raft_comms_op_v<ReduceOp>, void> atom
      raft::comms::op_t::MAX));  // currently, only (element-wise) sum, min, and max are supported.
 
   if constexpr (ReduceOp::compatible_raft_comms_op == raft::comms::op_t::SUM) {
-    atomic_add_edge_op_result(iter, value);
+    atomic_add(iter, value);
+  } else if constexpr (ReduceOp::compatible_raft_comms_op == raft::comms::op_t::MIN) {
+    elementwise_atomic_min(iter, value);
+  } else {
+    elementwise_atomic_max(iter, value);
+  }
+}
+
+template <typename ReduceOp, typename EdgePartitionEndpointPropertyValueWrapper>
+__device__ std::enable_if_t<has_compatible_raft_comms_op_v<ReduceOp>, void> atomic_reduce(
+  EdgePartitionEndpointPropertyValueWrapper edge_partition_endpoint_property_value,
+  typename EdgePartitionEndpointPropertyValueWrapper::vertex_type offset,
+  typename EdgePartitionEndpointPropertyValueWrapper::value_type value)
+{
+  static_assert(std::is_same_v<typename ReduceOp::value_type,
+                               typename EdgePartitionEndpointPropertyValueWrapper::value_type>);
+  static_assert(
+    (ReduceOp::compatible_raft_comms_op == raft::comms::op_t::SUM) ||
+    (ReduceOp::compatible_raft_comms_op == raft::comms::op_t::MIN) ||
+    (ReduceOp::compatible_raft_comms_op ==
+     raft::comms::op_t::MAX));  // currently, only (element-wise) sum, min, and max are supported.
+
+  if constexpr (ReduceOp::compatible_raft_comms_op == raft::comms::op_t::SUM) {
+    edge_partition_endpoint_property_value.atomic_add(offset, value);
   } else if constexpr (ReduceOp::compatible_raft_comms_op == raft::comms::op_t::MIN) {
-    elementwise_atomic_min_edge_op_result(iter, value);
+    edge_partition_endpoint_property_value.elementwise_atomic_min(offset, value);
   } else {
-    elementwise_atomic_max_edge_op_result(iter, value);
+    edge_partition_endpoint_property_value.elementwise_atomic_max(offset, value);
   }
 }
 
diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh
new file mode 100644
index 00000000000..9be12262574
--- /dev/null
+++ b/cpp/src/prims/transform_e.cuh
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/edge_partition_device_view.cuh>
+#include <cugraph/edge_partition_endpoint_property_device_view.cuh>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/error.hpp>
+
+#include <raft/core/handle.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/zip_iterator.h>
+
+#include <type_traits>
+#include <vector>
+
+namespace cugraph {
+
+/**
+ * @brief Iterate over the entire set of edges and update edge property values.
+ *
+ * This function is inspired by thrust::transform().
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam EdgeSrcValueInputWrapper Type of the wrapper for input edge source property values.
+ * @tparam EdgeDstValueInputWrapper Type of the wrapper for input edge destination property values.
+ * @tparam EdgeValueInputWrapper Type of the wrapper for input edge property values.
+ * @tparam EdgeOp Type of the quinary edge operator.
+ * @tparam EdgeValueOutputWrapper Type of the wrapper for output edge property values.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param edge_src_value_input Wrapper used to access source input property values (for the edge
+ * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
+ * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
+ * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
+ * wrapper.
+ * @param edge_dst_value_input Wrapper used to access destination input property values (for the
+ * edge destinations assigned to this process in multi-GPU). Use either
+ * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
+ * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
+ * values). Use update_edge_dst_property to fill the wrapper.
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned
+ * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
+ * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
+ * access edge property values).
+ * @param e_op Quinary operator takes edge source, edge destination, property values for the source,
+ * destination, and edge and returns a value to be reduced.
+ * @param edge_value_output Wrapper used to store edge output property values (for the edges
+ * assigned to this process in multi-GPU). Use cugraph::edge_property_t::mutable_view().
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename GraphViewType,
+          typename EdgeSrcValueInputWrapper,
+          typename EdgeDstValueInputWrapper,
+          typename EdgeValueInputWrapper,
+          typename EdgeOp,
+          typename EdgeValueOutputWrapper>
+void transform_e(raft::handle_t const& handle,
+                 GraphViewType const& graph_view,
+                 EdgeSrcValueInputWrapper edge_src_value_input,
+                 EdgeDstValueInputWrapper edge_dst_value_input,
+                 EdgeValueInputWrapper edge_value_input,
+                 EdgeOp e_op,
+                 EdgeValueOutputWrapper edge_value_output,
+                 bool do_expensive_check = false)
+{
+  // CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
+  CUGRAPH_FAIL("unimplemented.");
+}
+
+/**
+ * @brief Iterate over the edges in the input edge list and update edge property values.
+ *
+ * This function is inspired by thrust::transform().
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam EdgeBucketType Type of the edge bucket class which stores the edge list.
+ * @tparam EdgeSrcValueInputWrapper Type of the wrapper for input edge source property values.
+ * @tparam EdgeDstValueInputWrapper Type of the wrapper for input edge destination property values.
+ * @tparam EdgeValueInputWrapper Type of the wrapper for input edge property values.
+ * @tparam EdgeOp Type of the quinary edge operator.
+ * @tparam EdgeValueOutputWrapper Type of the wrapper for output edge property values.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param edge_list EdgeBucketType class object storing the edge list to update edge property
+ * values.
+ * @param edge_src_value_input Wrapper used to access source input property values (for the edge
+ * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view()
+ * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view()
+ * (if @p e_op does not access source property values). Use update_edge_src_property to fill the
+ * wrapper.
+ * @param edge_dst_value_input Wrapper used to access destination input property values (for the
+ * edge destinations assigned to this process in multi-GPU). Use either
+ * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or
+ * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property
+ * values). Use update_edge_dst_property to fill the wrapper.
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned
+ * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to
+ * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not
+ * access edge property values).
+ * @param e_op Quinary operator takes edge source, edge destination, property values for the source,
+ * destination, and edge and returns a value to be reduced.
+ * @param edge_value_output Wrapper used to store edge output property values (for the edges
+ * assigned to this process in multi-GPU). Use cugraph::edge_property_t::mutable_view().
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename GraphViewType,
+          typename EdgeBucketType,
+          typename EdgeSrcValueInputWrapper,
+          typename EdgeDstValueInputWrapper,
+          typename EdgeValueInputWrapper,
+          typename EdgeOp,
+          typename EdgeValueOutputWrapper>
+void transform_e(raft::handle_t const& handle,
+                 GraphViewType const& graph_view,
+                 EdgeBucketType const& edge_list,
+                 EdgeSrcValueInputWrapper edge_src_value_input,
+                 EdgeDstValueInputWrapper edge_dst_value_input,
+                 EdgeValueInputWrapper edge_value_input,
+                 EdgeOp e_op,
+                 EdgeValueOutputWrapper edge_value_output,
+                 bool do_expensive_check = false)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+
+  static_assert(GraphViewType::is_storage_transposed != EdgeBucketType::is_src_major);
+  static_assert(EdgeBucketType::is_sorted_unique);
+  static_assert(
+    std::is_same_v<typename EdgeBucketType::key_type, thrust::tuple<vertex_t, vertex_t>>);
+
+  using edge_partition_src_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeSrcValueInputWrapper::value_type, thrust::nullopt_t>,
+    detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
+    detail::edge_partition_endpoint_property_device_view_t<
+      vertex_t,
+      typename EdgeSrcValueInputWrapper::value_iterator,
+      typename EdgeSrcValueInputWrapper::value_type>>;
+  using edge_partition_dst_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
+    detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
+    detail::edge_partition_endpoint_property_device_view_t<
+      vertex_t,
+      typename EdgeDstValueInputWrapper::value_iterator,
+      typename EdgeDstValueInputWrapper::value_type>>;
+  using edge_partition_e_input_device_view_t = std::conditional_t<
+    std::is_same_v<typename EdgeValueInputWrapper::value_type, thrust::nullopt_t>,
+    detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
+    detail::edge_partition_edge_property_device_view_t<
+      edge_t,
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
+  using edge_partition_e_output_device_view_t = detail::edge_partition_edge_property_device_view_t<
+    edge_t,
+    typename EdgeValueOutputWrapper::value_iterator,
+    typename EdgeValueOutputWrapper::value_type>;
+
+  // CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
+  auto major_first =
+    GraphViewType::is_storage_transposed ? edge_list.dst_begin() : edge_list.src_begin();
+  auto minor_first =
+    GraphViewType::is_storage_transposed ? edge_list.src_begin() : edge_list.dst_begin();
+
+  auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(major_first, minor_first));
+
+  if (do_expensive_check) {
+    CUGRAPH_EXPECTS(
+      thrust::is_sorted(handle.get_thrust_policy(), edge_first, edge_first + edge_list.size()),
+      "Invalid input arguments: edge_list is not sorted.");
+  }
+
+  std::vector<size_t> edge_partition_offsets(graph_view.number_of_local_edge_partitions() + 1, 0);
+  if constexpr (GraphViewType::is_multi_gpu) {
+    std::vector<vertex_t> h_major_range_lasts(graph_view.number_of_local_edge_partitions());
+    for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+      auto edge_partition =
+        edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
+          graph_view.local_edge_partition_view(i));
+      h_major_range_lasts[i] = edge_partition.major_range_last();
+    }
+    rmm::device_uvector<vertex_t> d_major_range_lasts(h_major_range_lasts.size(),
+                                                      handle.get_stream());
+    raft::update_device(d_major_range_lasts.data(),
+                        h_major_range_lasts.data(),
+                        h_major_range_lasts.size(),
+                        handle.get_stream());
+    rmm::device_uvector<size_t> d_lower_bounds(d_major_range_lasts.size(), handle.get_stream());
+    thrust::lower_bound(handle.get_thrust_policy(),
+                        major_first,
+                        major_first + edge_list.size(),
+                        d_major_range_lasts.begin(),
+                        d_major_range_lasts.end(),
+                        d_lower_bounds.begin());
+    raft::update_host(edge_partition_offsets.data() + 1,
+                      d_lower_bounds.data(),
+                      d_lower_bounds.size(),
+                      handle.get_stream());
+    handle.sync_stream();
+  } else {
+    edge_partition_offsets.back() = edge_list.size();
+  }
+
+  for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    auto edge_partition =
+      edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
+        graph_view.local_edge_partition_view(i));
+
+    if (do_expensive_check) {
+      CUGRAPH_EXPECTS(
+        thrust::count_if(
+          handle.get_thrust_policy(),
+          edge_first + edge_partition_offsets[i],
+          edge_first + edge_partition_offsets[i + 1],
+          [edge_partition] __device__(thrust::tuple<vertex_t, vertex_t> edge) {
+            auto major = thrust::get<0>(edge);
+            auto minor = thrust::get<1>(edge);
+            vertex_t major_idx{};
+            auto major_hypersparse_first = edge_partition.major_hypersparse_first();
+            if (major_hypersparse_first) {
+              if (major < *major_hypersparse_first) {
+                major_idx = edge_partition.major_offset_from_major_nocheck(major);
+              } else {
+                auto major_hypersparse_idx =
+                  edge_partition.major_hypersparse_idx_from_major_nocheck(major);
+                if (!major_hypersparse_idx) { return true; }
+                major_idx =
+                  edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) +
+                  *major_hypersparse_idx;
+              }
+            } else {
+              major_idx = edge_partition.major_offset_from_major_nocheck(major);
+            }
+            vertex_t const* indices{nullptr};
+            edge_t edge_offset{};
+            edge_t local_degree{};
+            thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
+            auto it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
+            return *it != minor;
+          }) == 0,
+        "Invalid input arguments: edge_list contains edges that do not exist in the input graph.");
+    }
+
+    edge_partition_src_input_device_view_t edge_partition_src_value_input{};
+    edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
+    if constexpr (GraphViewType::is_storage_transposed) {
+      edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input);
+      edge_partition_dst_value_input =
+        edge_partition_dst_input_device_view_t(edge_dst_value_input, i);
+    } else {
+      edge_partition_src_value_input =
+        edge_partition_src_input_device_view_t(edge_src_value_input, i);
+      edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input);
+    }
+    auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i);
+    auto edge_partition_e_value_output =
+      edge_partition_e_output_device_view_t(edge_value_output, i);
+
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      edge_first + edge_partition_offsets[i],
+      edge_first + edge_partition_offsets[i + 1],
+      [e_op,
+       edge_partition,
+       edge_partition_src_value_input,
+       edge_partition_dst_value_input,
+       edge_partition_e_value_input,
+       edge_partition_e_value_output] __device__(thrust::tuple<vertex_t, vertex_t> edge) {
+        auto major = thrust::get<0>(edge);
+        auto minor = thrust::get<1>(edge);
+
+        vertex_t major_offset{};
+        vertex_t major_idx{};
+        auto major_hypersparse_first = edge_partition.major_hypersparse_first();
+        if (major_hypersparse_first) {
+          if (major < *major_hypersparse_first) {
+            major_offset = edge_partition.major_offset_from_major_nocheck(major);
+            major_idx    = major_offset;
+          } else {
+            auto major_hypersparse_idx =
+              edge_partition.major_hypersparse_idx_from_major_nocheck(major);
+            assert(major_hypersparse_idx);
+            major_idx = edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) +
+                        *major_hypersparse_idx;
+          }
+        } else {
+          major_offset = edge_partition.major_offset_from_major_nocheck(major);
+          major_idx    = major_offset;
+        }
+
+        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+        vertex_t const* indices{nullptr};
+        edge_t edge_offset{};
+        edge_t local_degree{};
+        thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
+        auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
+        auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor);
+
+        auto src        = GraphViewType::is_storage_transposed ? minor : major;
+        auto dst        = GraphViewType::is_storage_transposed ? major : minor;
+        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+
+        for (auto it = lower_it; it != upper_it; ++it) {
+          assert(*it == minor);
+          auto e_op_result =
+            e_op(src,
+                 dst,
+                 edge_partition_src_value_input.get(src_offset),
+                 edge_partition_dst_value_input.get(dst_offset),
+                 edge_partition_e_value_input.get(edge_offset + thrust::distance(indices, it)));
+          edge_partition_e_value_output.set(edge_offset + thrust::distance(indices, it),
+                                            e_op_result);
+        }
+      });
+  }
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
index bcf50a8b968..4823c1febf4 100644
--- a/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
+++ b/cpp/src/prims/transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v.cuh
@@ -40,6 +40,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/merge.h>
+#include <thrust/optional.h>
 #include <thrust/reduce.h>
 #include <thrust/sort.h>
 #include <thrust/tabulate.h>
@@ -106,8 +107,8 @@ std::tuple<rmm::device_uvector<vertex_t>, ValueBuffer> sort_and_reduce_by_vertic
   rmm::device_uvector<vertex_t>&& vertices,
   ValueBuffer&& value_buffer)
 {
-  using value_t = typename thrust::iterator_traits<decltype(
-    get_dataframe_buffer_begin(value_buffer))>::value_type;
+  using value_t = typename thrust::iterator_traits<decltype(get_dataframe_buffer_begin(
+    value_buffer))>::value_type;
 
   thrust::sort_by_key(handle.get_thrust_policy(),
                       vertices.begin(),
@@ -244,13 +245,17 @@ void transform_reduce_dst_nbr_intersection_of_e_endpoints_by_v(
     detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeSrcValueInputWrapper::value_iterator>>;
+      typename EdgeSrcValueInputWrapper::value_iterator,
+      typename EdgeSrcValueInputWrapper::value_type>>;
   using edge_partition_dst_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeDstValueInputWrapper::value_iterator>>;
+      typename EdgeDstValueInputWrapper::value_iterator,
+      typename EdgeDstValueInputWrapper::value_type>>;
+
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
   if (do_expensive_check) {
     // currently, nothing to do.
diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh
index adfc1373c0b..9c23f3fca18 100644
--- a/cpp/src/prims/transform_reduce_e.cuh
+++ b/cpp/src/prims/transform_reduce_e.cuh
@@ -23,6 +23,7 @@
 #include <cugraph/edge_partition_view.hpp>
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/atomic_ops.cuh>
 #include <cugraph/utilities/dataframe_buffer.hpp>
 #include <cugraph/utilities/error.hpp>
 #include <cugraph/utilities/host_scalar_comm.hpp>
@@ -57,7 +58,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeValueInputWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void trasnform_reduce_e_hypersparse(
+__global__ void transform_reduce_e_hypersparse(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -127,7 +128,7 @@ __global__ void trasnform_reduce_e_hypersparse(
   }
 
   e_op_result_sum = BlockReduce(temp_storage).Reduce(e_op_result_sum, edge_property_add);
-  if (threadIdx.x == 0) { atomic_add_edge_op_result(result_iter, e_op_result_sum); }
+  if (threadIdx.x == 0) { atomic_add(result_iter, e_op_result_sum); }
 }
 
 template <typename GraphViewType,
@@ -136,7 +137,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeValueInputWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void trasnform_reduce_e_low_degree(
+__global__ void transform_reduce_e_low_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -206,7 +207,7 @@ __global__ void trasnform_reduce_e_low_degree(
   }
 
   e_op_result_sum = BlockReduce(temp_storage).Reduce(e_op_result_sum, edge_property_add);
-  if (threadIdx.x == 0) { atomic_add_edge_op_result(result_iter, e_op_result_sum); }
+  if (threadIdx.x == 0) { atomic_add(result_iter, e_op_result_sum); }
 }
 
 template <typename GraphViewType,
@@ -215,7 +216,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeValueInputWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void trasnform_reduce_e_mid_degree(
+__global__ void transform_reduce_e_mid_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -272,7 +273,7 @@ __global__ void trasnform_reduce_e_mid_degree(
   }
 
   e_op_result_sum = BlockReduce(temp_storage).Reduce(e_op_result_sum, edge_property_add);
-  if (threadIdx.x == 0) { atomic_add_edge_op_result(result_iter, e_op_result_sum); }
+  if (threadIdx.x == 0) { atomic_add(result_iter, e_op_result_sum); }
 }
 
 template <typename GraphViewType,
@@ -281,7 +282,7 @@ template <typename GraphViewType,
           typename EdgePartitionEdgeValueInputWrapper,
           typename ResultIterator,
           typename EdgeOp>
-__global__ void trasnform_reduce_e_high_degree(
+__global__ void transform_reduce_e_high_degree(
   edge_partition_device_view_t<typename GraphViewType::vertex_type,
                                typename GraphViewType::edge_type,
                                GraphViewType::is_multi_gpu> edge_partition,
@@ -335,7 +336,7 @@ __global__ void trasnform_reduce_e_high_degree(
   }
 
   e_op_result_sum = BlockReduce(temp_storage).Reduce(e_op_result_sum, edge_property_add);
-  if (threadIdx.x == 0) { atomic_add_edge_op_result(result_iter, e_op_result_sum); }
+  if (threadIdx.x == 0) { atomic_add(result_iter, e_op_result_sum); }
 }
 
 }  // namespace detail
@@ -349,7 +350,7 @@ __global__ void trasnform_reduce_e_high_degree(
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @tparam T Type of the initial value.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
@@ -399,19 +400,24 @@ T transform_reduce_e(raft::handle_t const& handle,
     detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeSrcValueInputWrapper::value_iterator>>;
+      typename EdgeSrcValueInputWrapper::value_iterator,
+      typename EdgeSrcValueInputWrapper::value_type>>;
   using edge_partition_dst_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeDstValueInputWrapper::value_iterator>>;
+      typename EdgeDstValueInputWrapper::value_iterator,
+      typename EdgeDstValueInputWrapper::value_type>>;
   using edge_partition_e_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_edge_property_device_view_t<
       edge_t,
-      typename EdgeValueInputWrapper::value_iterator>>;
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
+
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
   if (do_expensive_check) {
     // currently, nothing to do
@@ -453,7 +459,7 @@ T transform_reduce_e(raft::handle_t const& handle,
         raft::grid_1d_block_t update_grid((*segment_offsets)[1],
                                           detail::transform_reduce_e_kernel_block_size,
                                           handle.get_device_properties().maxGridSize[0]);
-        detail::trasnform_reduce_e_high_degree<GraphViewType>
+        detail::transform_reduce_e_high_degree<GraphViewType>
           <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
             edge_partition,
             edge_partition.major_range_first(),
@@ -468,7 +474,7 @@ T transform_reduce_e(raft::handle_t const& handle,
         raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1],
                                          detail::transform_reduce_e_kernel_block_size,
                                          handle.get_device_properties().maxGridSize[0]);
-        detail::trasnform_reduce_e_mid_degree<GraphViewType>
+        detail::transform_reduce_e_mid_degree<GraphViewType>
           <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
             edge_partition,
             edge_partition.major_range_first() + (*segment_offsets)[1],
@@ -483,7 +489,7 @@ T transform_reduce_e(raft::handle_t const& handle,
         raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2],
                                            detail::transform_reduce_e_kernel_block_size,
                                            handle.get_device_properties().maxGridSize[0]);
-        detail::trasnform_reduce_e_low_degree<GraphViewType>
+        detail::transform_reduce_e_low_degree<GraphViewType>
           <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
             edge_partition,
             edge_partition.major_range_first() + (*segment_offsets)[2],
@@ -498,7 +504,7 @@ T transform_reduce_e(raft::handle_t const& handle,
         raft::grid_1d_thread_t update_grid(*(edge_partition.dcs_nzd_vertex_count()),
                                            detail::transform_reduce_e_kernel_block_size,
                                            handle.get_device_properties().maxGridSize[0]);
-        detail::trasnform_reduce_e_hypersparse<GraphViewType>
+        detail::transform_reduce_e_hypersparse<GraphViewType>
           <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
             edge_partition,
             edge_partition_src_value_input,
@@ -513,7 +519,7 @@ T transform_reduce_e(raft::handle_t const& handle,
                                            detail::transform_reduce_e_kernel_block_size,
                                            handle.get_device_properties().maxGridSize[0]);
 
-        detail::trasnform_reduce_e_low_degree<GraphViewType>
+        detail::transform_reduce_e_low_degree<GraphViewType>
           <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
             edge_partition,
             edge_partition.major_range_first(),
@@ -551,7 +557,7 @@ T transform_reduce_e(raft::handle_t const& handle,
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
  * @param graph_view Non-owning graph object.
@@ -595,6 +601,8 @@ auto transform_reduce_e(raft::handle_t const& handle,
     edge_op_result_type<vertex_t, vertex_t, src_value_t, dst_value_t, e_value_t, EdgeOp>::type;
   static_assert(!std::is_same_v<T, void>);
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
index 42857433208..77bf195b4d7 100644
--- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
+++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
@@ -388,23 +388,27 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
     detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeSrcValueInputWrapper::value_iterator>>;
+      typename EdgeSrcValueInputWrapper::value_iterator,
+      typename EdgeSrcValueInputWrapper::value_type>>;
   using edge_partition_dst_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeDstValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_endpoint_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeDstValueInputWrapper::value_iterator>>;
+      typename EdgeDstValueInputWrapper::value_iterator,
+      typename EdgeDstValueInputWrapper::value_type>>;
   using edge_partition_e_input_device_view_t = std::conditional_t<
     std::is_same_v<typename EdgeValueInputWrapper::value_type, thrust::nullopt_t>,
     detail::edge_partition_edge_dummy_property_device_view_t<vertex_t>,
     detail::edge_partition_edge_property_device_view_t<
       edge_t,
-      typename EdgeValueInputWrapper::value_iterator>>;
+      typename EdgeValueInputWrapper::value_iterator,
+      typename EdgeValueInputWrapper::value_type>>;
   using edge_partition_src_dst_key_device_view_t =
     detail::edge_partition_endpoint_property_device_view_t<
       vertex_t,
-      typename EdgeSrcDstKeyInputWrapper::value_iterator>;
+      typename EdgeSrcDstKeyInputWrapper::value_iterator,
+      typename EdgeSrcDstKeyInputWrapper::value_type>;
 
   rmm::device_uvector<vertex_t> keys(0, handle.get_stream());
   auto value_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
@@ -616,7 +620,7 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeSrcKeyInputWrapper Type of the wrapper for edge source key values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the  quinary edge operator.
  * @tparam T Type of the values in (key, value) pairs.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
@@ -678,6 +682,8 @@ auto transform_reduce_e_by_src_key(raft::handle_t const& handle,
                              typename GraphViewType::vertex_type>::value);
   static_assert(ReduceOp::pure_function, "ReduceOp should be a pure function.");
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -704,7 +710,7 @@ auto transform_reduce_e_by_src_key(raft::handle_t const& handle,
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeDstKeyInputWrapper Type of the wrapper for edge destination key values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @tparam T Type of the values in (key, value) pairs.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
@@ -766,6 +772,8 @@ auto transform_reduce_e_by_dst_key(raft::handle_t const& handle,
                              typename GraphViewType::vertex_type>::value);
   static_assert(ReduceOp::pure_function, "ReduceOp should be a pure function.");
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
index 6b08ec98f1e..7216eed1186 100644
--- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
+++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh
@@ -189,6 +189,8 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle,
   using edge_t   = typename GraphViewType::edge_type;
   using key_t    = typename VertexFrontierBucketType::key_type;
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   size_t ret{0};
 
   vertex_t const* local_frontier_vertex_first{nullptr};
@@ -253,7 +255,7 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle,
  * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values.
  * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values.
  * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values.
- * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam EdgeOp Type of the quinary edge operator.
  * @tparam ReduceOp Type of the binary reduction operator.
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
@@ -322,6 +324,8 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle,
   using key_t     = typename VertexFrontierBucketType::key_type;
   using payload_t = typename ReduceOp::value_type;
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh
index 35e10ee7426..2d72a075ca5 100644
--- a/cpp/src/prims/update_edge_src_dst_property.cuh
+++ b/cpp/src/prims/update_edge_src_dst_property.cuh
@@ -48,11 +48,78 @@
 #include <numeric>
 #include <type_traits>
 #include <utility>
+#include <variant>
 
 namespace cugraph {
 
 namespace detail {
 
+template <typename Iterator, typename vertex_t>
+__device__ void packed_bool_atomic_set(Iterator value_first, vertex_t offset, bool val)
+{
+  auto packed_output_offset = packed_bool_offset(offset);
+  auto packed_output_mask   = packed_bool_mask(offset);
+  if (val) {
+    atomicOr(value_first + packed_output_offset, packed_output_mask);
+  } else {
+    atomicAnd(value_first + packed_output_offset, ~packed_output_mask);
+  }
+}
+
+template <typename BoolInputIterator, typename PackedBoolOutputIterator>
+void pack_bools(raft::handle_t const& handle,
+                BoolInputIterator input_first,
+                BoolInputIterator input_last,
+                PackedBoolOutputIterator output_first)
+{
+  auto num_bools   = static_cast<size_t>(thrust::distance(input_first, input_last));
+  auto packed_size = cugraph::packed_bool_size(num_bools);
+  thrust::tabulate(handle.get_thrust_policy(),
+                   output_first,
+                   output_first + packed_size,
+                   pack_bool_t<BoolInputIterator>{input_first, num_bools});
+}
+
+template <typename BoolInputIterator, typename PackedBoolOutputIterator>
+void pack_unaligned_bools(raft::handle_t const& handle,
+                          BoolInputIterator input_first,
+                          BoolInputIterator input_last,
+                          PackedBoolOutputIterator output_first,
+                          size_t intraword_start_offset)
+{
+  auto num_bools            = static_cast<size_t>(thrust::distance(input_first, input_last));
+  auto num_first_word_bools = std::min(num_bools, packed_bools_per_word() - intraword_start_offset);
+  auto num_aligned_bools    = (num_bools - num_first_word_bools) -
+                           (num_bools - num_first_word_bools) % packed_bools_per_word();
+  auto num_last_word_bools = num_bools - (num_first_word_bools + num_aligned_bools);
+
+  thrust::for_each(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator(size_t{0}),
+    thrust::make_counting_iterator(num_first_word_bools + num_last_word_bools),
+    [intraword_start_offset,
+     num_first_word_bools,
+     num_last_word_bools,
+     first_word_input_bool_first = input_first,
+     last_word_input_bool_first  = input_first + (num_first_word_bools + num_aligned_bools),
+     first_word_output           = output_first,
+     last_word_output            = output_first + ((num_first_word_bools > 0 ? 1 : 0) +
+                                        packed_bool_size(num_aligned_bools))] __device__(size_t i) {
+      if (i < num_first_word_bools) {
+        auto val = *(first_word_input_bool_first + i);
+        packed_bool_atomic_set(first_word_output, intraword_start_offset + i, val);
+      } else {
+        auto val = *(last_word_input_bool_first + (i - num_first_word_bools));
+        packed_bool_atomic_set(last_word_output, i - num_first_word_bools, val);
+      }
+    });
+
+  pack_bools(handle,
+             input_first + num_first_word_bools,
+             input_first + num_first_word_bools + num_aligned_bools,
+             output_first + (num_first_word_bools > 0 ? 1 : 0));
+}
+
 template <typename GraphViewType,
           typename VertexPropertyInputIterator,
           typename EdgeMajorPropertyOutputWrapper>
@@ -61,6 +128,9 @@ void update_edge_major_property(raft::handle_t const& handle,
                                 VertexPropertyInputIterator vertex_property_input_first,
                                 EdgeMajorPropertyOutputWrapper edge_major_property_output)
 {
+  constexpr bool packed_bool =
+    std::is_same_v<typename EdgeMajorPropertyOutputWrapper::value_type, bool>;
+
   auto edge_partition_value_firsts = edge_major_property_output.value_firsts();
   if constexpr (GraphViewType::is_multi_gpu) {
     using vertex_t = typename GraphViewType::vertex_type;
@@ -85,41 +155,90 @@ void update_edge_major_property(raft::handle_t const& handle,
           max_rx_size, graph_view.vertex_partition_range_size(major_range_vertex_partition_id));
       }
       auto rx_value_buffer = allocate_dataframe_buffer<
-        typename std::iterator_traits<VertexPropertyInputIterator>::value_type>(
-        max_rx_size, handle.get_stream());
+        std::conditional_t<packed_bool,
+                           uint32_t,
+                           typename EdgeMajorPropertyOutputWrapper::value_type>>(
+        packed_bool ? packed_bool_size(max_rx_size) : max_rx_size, handle.get_stream());
       auto rx_value_first = get_dataframe_buffer_begin(rx_value_buffer);
       for (int i = 0; i < minor_comm_size; ++i) {
         auto major_range_vertex_partition_id =
           compute_local_edge_partition_major_range_vertex_partition_id_t{
             major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i);
-        device_bcast(minor_comm,
-                     vertex_property_input_first,
-                     rx_value_first,
-                     graph_view.vertex_partition_range_size(major_range_vertex_partition_id),
-                     i,
-                     handle.get_stream());
-
-        auto v_offset_first = thrust::make_transform_iterator(
-          (*edge_partition_keys)[i].begin(),
-          [v_first = graph_view.vertex_partition_range_first(
-             major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
-        thrust::gather(handle.get_thrust_policy(),
-                       v_offset_first,
-                       v_offset_first + (*edge_partition_keys)[i].size(),
+        if constexpr (packed_bool) {
+          if (i == minor_comm_rank) {
+            pack_bools(handle,
+                       vertex_property_input_first,
+                       vertex_property_input_first +
+                         graph_view.vertex_partition_range_size(major_range_vertex_partition_id),
+                       rx_value_first);
+          }
+          device_bcast(minor_comm,
                        rx_value_first,
-                       edge_partition_value_firsts[i]);
+                       rx_value_first,
+                       packed_bool_size(
+                         graph_view.vertex_partition_range_size(major_range_vertex_partition_id)),
+                       i,
+                       handle.get_stream());
+          auto bool_first = thrust::make_transform_iterator(
+            (*edge_partition_keys)[i].begin(),
+            [rx_value_first,
+             v_first = graph_view.vertex_partition_range_first(
+               major_range_vertex_partition_id)] __device__(auto v) {
+              auto v_offset = v - v_first;
+              return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                       packed_bool_mask(v_offset));
+            });
+          pack_bools(handle,
+                     bool_first,
+                     bool_first + (*edge_partition_keys)[i].size(),
+                     edge_partition_value_firsts[i]);
+        } else {
+          device_bcast(minor_comm,
+                       vertex_property_input_first,
+                       rx_value_first,
+                       graph_view.vertex_partition_range_size(major_range_vertex_partition_id),
+                       i,
+                       handle.get_stream());
+
+          auto v_offset_first = thrust::make_transform_iterator(
+            (*edge_partition_keys)[i].begin(),
+            [v_first = graph_view.vertex_partition_range_first(
+               major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
+          thrust::gather(handle.get_thrust_policy(),
+                         v_offset_first,
+                         v_offset_first + (*edge_partition_keys)[i].size(),
+                         rx_value_first,
+                         edge_partition_value_firsts[i]);
+        }
       }
     } else {
       for (int i = 0; i < minor_comm_size; ++i) {
         auto major_range_vertex_partition_id =
           compute_local_edge_partition_major_range_vertex_partition_id_t{
             major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i);
-        device_bcast(minor_comm,
-                     vertex_property_input_first,
-                     edge_partition_value_firsts[i],
-                     graph_view.vertex_partition_range_size(major_range_vertex_partition_id),
-                     i,
-                     handle.get_stream());
+        if constexpr (packed_bool) {
+          if (i == minor_comm_rank) {
+            pack_bools(handle,
+                       vertex_property_input_first,
+                       vertex_property_input_first +
+                         graph_view.vertex_partition_range_size(major_range_vertex_partition_id),
+                       edge_partition_value_firsts[i]);
+          }
+          device_bcast(minor_comm,
+                       edge_partition_value_firsts[i],
+                       edge_partition_value_firsts[i],
+                       packed_bool_size(
+                         graph_view.vertex_partition_range_size(major_range_vertex_partition_id)),
+                       i,
+                       handle.get_stream());
+        } else {
+          device_bcast(minor_comm,
+                       vertex_property_input_first,
+                       edge_partition_value_firsts[i],
+                       graph_view.vertex_partition_range_size(major_range_vertex_partition_id),
+                       i,
+                       handle.get_stream());
+        }
       }
     }
   } else {
@@ -127,10 +246,17 @@ void update_edge_major_property(raft::handle_t const& handle,
              ? graph_view.local_edge_partition_dst_range_size()
              : graph_view.local_edge_partition_src_range_size());
     assert(edge_partition_value_firsts.size() == size_t{1});
-    thrust::copy(handle.get_thrust_policy(),
+    if constexpr (packed_bool) {
+      pack_bools(handle,
                  vertex_property_input_first,
                  vertex_property_input_first + graph_view.local_vertex_partition_range_size(),
                  edge_partition_value_firsts[0]);
+    } else {
+      thrust::copy(handle.get_thrust_policy(),
+                   vertex_property_input_first,
+                   vertex_property_input_first + graph_view.local_vertex_partition_range_size(),
+                   edge_partition_value_firsts[0]);
+    }
   }
 }
 
@@ -145,6 +271,9 @@ void update_edge_major_property(raft::handle_t const& handle,
                                 VertexPropertyInputIterator vertex_property_input_first,
                                 EdgeMajorPropertyOutputWrapper edge_major_property_output)
 {
+  constexpr bool packed_bool =
+    std::is_same_v<typename EdgeMajorPropertyOutputWrapper::value_type, bool>;
+
   using vertex_t = typename GraphViewType::vertex_type;
   using edge_t   = typename GraphViewType::edge_type;
 
@@ -166,8 +295,9 @@ void update_edge_major_property(raft::handle_t const& handle,
       });
     rmm::device_uvector<vertex_t> rx_vertices(max_rx_size, handle.get_stream());
     auto rx_tmp_buffer = allocate_dataframe_buffer<
-      typename std::iterator_traits<VertexPropertyInputIterator>::value_type>(max_rx_size,
-                                                                              handle.get_stream());
+      std::
+        conditional_t<packed_bool, uint32_t, typename EdgeMajorPropertyOutputWrapper::value_type>>(
+      packed_bool ? packed_bool_size(max_rx_size) : max_rx_size, handle.get_stream());
     auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer);
 
     auto edge_partition_keys = edge_major_property_output.keys();
@@ -180,55 +310,98 @@ void update_edge_major_property(raft::handle_t const& handle,
         auto vertex_partition =
           vertex_partition_device_view_t<vertex_t, GraphViewType::is_multi_gpu>(
             graph_view.local_vertex_partition_view());
-        auto map_first =
-          thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
-            return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-          });
-        // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
-        // permutation iterator (and directly gathers to the internal buffer)
-        thrust::gather(handle.get_thrust_policy(),
-                       map_first,
-                       map_first + thrust::distance(vertex_first, vertex_last),
-                       vertex_property_input_first,
-                       rx_value_first);
+        if constexpr (packed_bool) {
+          auto bool_first = thrust::make_transform_iterator(
+            vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) {
+              auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
+              return static_cast<bool>(
+                *(vertex_property_input_first + packed_bool_offset(v_offset)) &
+                packed_bool_mask(v_offset));
+            });
+          pack_bools(handle,
+                     bool_first,
+                     bool_first + thrust::distance(vertex_first, vertex_last),
+                     rx_value_first);
+        } else {
+          auto map_first =
+            thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+              return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
+            });
+          // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
+          // permutation iterator (and directly gathers to the internal buffer)
+          thrust::gather(handle.get_thrust_policy(),
+                         map_first,
+                         map_first + thrust::distance(vertex_first, vertex_last),
+                         vertex_property_input_first,
+                         rx_value_first);
+        }
       }
 
       // FIXME: these broadcast operations can be placed between ncclGroupStart() and
       // ncclGroupEnd()
       device_bcast(
         minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream());
-      device_bcast(
-        minor_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream());
+      device_bcast(minor_comm,
+                   rx_value_first,
+                   rx_value_first,
+                   packed_bool ? packed_bool_size(rx_counts[i]) : rx_counts[i],
+                   i,
+                   handle.get_stream());
 
       if (edge_partition_keys) {
         thrust::for_each(
           handle.get_thrust_policy(),
-          thrust::make_counting_iterator(vertex_t{0}),
-          thrust::make_counting_iterator(static_cast<vertex_t>((*edge_partition_keys)[i].size())),
+          thrust::make_counting_iterator(size_t{0}),
+          thrust::make_counting_iterator(rx_counts[i]),
           [rx_vertex_first = rx_vertices.begin(),
-           rx_vertex_last  = rx_vertices.end(),
            rx_value_first,
-           output_key_first   = ((*edge_partition_keys)[i]).begin(),
-           output_value_first = edge_partition_value_firsts[i]] __device__(auto i) {
-            auto major = *(output_key_first + i);
-            auto it    = thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, major);
-            if ((it != rx_vertex_last) && (*it == major)) {
-              auto rx_value             = *(rx_value_first + thrust::distance(rx_vertex_first, it));
-              *(output_value_first + i) = rx_value;
+           edge_partition_key_first   = ((*edge_partition_keys)[i]).begin(),
+           edge_partition_key_last    = ((*edge_partition_keys)[i]).end(),
+           edge_partition_value_first = edge_partition_value_firsts[i]] __device__(size_t i) {
+            auto major = *(rx_vertex_first + i);
+            auto it    = thrust::lower_bound(
+              thrust::seq, edge_partition_key_first, edge_partition_key_last, major);
+            if ((it != edge_partition_key_last) && (*it == major)) {
+              auto edge_partition_offset = thrust::distance(edge_partition_key_first, it);
+              if constexpr (packed_bool) {
+                auto rx_value = static_cast<bool>(*(rx_value_first + packed_bool_offset(i)) &
+                                                  packed_bool_mask(i));
+                packe_bool_atomic_set(edge_partition_value_first, edge_partition_offset, rx_value);
+              } else {
+                auto rx_value                                         = *(rx_value_first + i);
+                *(edge_partition_value_first + edge_partition_offset) = rx_value;
+              }
             }
           });
       } else {
-        auto map_first =
-          thrust::make_transform_iterator(rx_vertices.begin(), [edge_partition] __device__(auto v) {
-            return edge_partition.major_offset_from_major_nocheck(v);
-          });
-        // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
-        // directly scatters from the internal buffer)
-        thrust::scatter(handle.get_thrust_policy(),
-                        rx_value_first,
-                        rx_value_first + rx_counts[i],
-                        map_first,
-                        edge_partition_value_firsts[i]);
+        if constexpr (packed_bool) {
+          thrust::for_each(
+            handle.get_thrust_policy(),
+            thrust::make_counting_iterator(vertex_t{0}),
+            thrust::make_counting_iterator(static_cast<vertex_t>(rx_counts[i])),
+            [edge_partition,
+             rx_vertex_first = rx_vertices.begin(),
+             rx_value_first,
+             output_value_first = edge_partition_value_firsts[i]] __device__(auto i) {
+              auto rx_vertex = *(rx_vertex_first + i);
+              auto rx_value =
+                static_cast<bool>(*(rx_value_first + packed_bool_offset(i)) & packed_bool_mask(i));
+              auto major_offset = edge_partition.major_offset_from_major_nocheck(rx_vertex);
+              packed_bool_atomic_set(output_value_first, major_offset, rx_value);
+            });
+        } else {
+          auto map_first = thrust::make_transform_iterator(
+            rx_vertices.begin(), [edge_partition] __device__(auto v) {
+              return edge_partition.major_offset_from_major_nocheck(v);
+            });
+          // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
+          // directly scatters from the internal buffer)
+          thrust::scatter(handle.get_thrust_policy(),
+                          rx_value_first,
+                          rx_value_first + rx_counts[i],
+                          map_first,
+                          edge_partition_value_firsts[i]);
+        }
       }
     }
   } else {
@@ -236,12 +409,23 @@ void update_edge_major_property(raft::handle_t const& handle,
              ? graph_view.local_edge_partition_dst_range_size()
              : graph_view.local_edge_partition_src_range_size());
     assert(edge_partition_value_firsts.size() == size_t{1});
-    auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first);
-    thrust::scatter(handle.get_thrust_policy(),
-                    val_first,
-                    val_first + thrust::distance(vertex_first, vertex_last),
-                    vertex_first,
-                    edge_partition_value_firsts[0]);
+    if constexpr (packed_bool) {
+      thrust::for_each(handle.get_thrust_policy(),
+                       vertex_first,
+                       vertex_last,
+                       [vertex_property_input_first,
+                        output_value_first = edge_partition_value_firsts[0]] __device__(auto v) {
+                         bool val = static_cast<bool>(*(vertex_property_input_first + v));
+                         packed_bool_atomic_set(output_value_first, v, val);
+                       });
+    } else {
+      auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first);
+      thrust::scatter(handle.get_thrust_policy(),
+                      val_first,
+                      val_first + thrust::distance(vertex_first, vertex_last),
+                      vertex_first,
+                      edge_partition_value_firsts[0]);
+    }
   }
 }
 
@@ -253,10 +437,18 @@ void update_edge_minor_property(raft::handle_t const& handle,
                                 VertexPropertyInputIterator vertex_property_input_first,
                                 EdgeMinorPropertyOutputWrapper edge_minor_property_output)
 {
+  constexpr bool packed_bool =
+    std::is_same_v<typename EdgeMinorPropertyOutputWrapper::value_type, bool>;
+
   auto edge_partition_value_first = edge_minor_property_output.value_first();
   if constexpr (GraphViewType::is_multi_gpu) {
     using vertex_t = typename GraphViewType::vertex_type;
-    using value_t  = typename thrust::iterator_traits<VertexPropertyInputIterator>::value_type;
+    using bcast_buffer_type =
+      decltype(allocate_dataframe_buffer<
+               std::conditional_t<packed_bool,
+                                  uint32_t,
+                                  typename EdgeMinorPropertyOutputWrapper::value_type>>(
+        size_t{0}, handle.get_stream()));
 
     auto& comm                 = handle.get_comms();
     auto const comm_rank       = comm.get_rank();
@@ -268,31 +460,30 @@ void update_edge_minor_property(raft::handle_t const& handle,
     auto const minor_comm_rank = minor_comm.get_rank();
     auto const minor_comm_size = minor_comm.get_size();
 
+    // memory footprint vs parallelism trade-off
+    // memory requirement per loop is
+    // (V/comm_size) * sizeof(value_t)
+    // and limit memory requirement to (E / comm_size) * sizeof(vertex_t)
+    auto bcast_size = static_cast<size_t>(graph_view.number_of_vertices()) / comm_size;
+    if constexpr (packed_bool) {
+      bcast_size /= 8;  // bits to bytes
+    } else {
+      bcast_size *= sizeof(typename EdgeMinorPropertyOutputWrapper::value_type);
+    }
+    auto num_concurrent_bcasts =
+      (static_cast<size_t>(graph_view.number_of_edges() / comm_size) * sizeof(vertex_t)) /
+      std::max(bcast_size, size_t{1});
+    num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1});
+    num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast<size_t>(major_comm_size));
+    auto num_rounds = (static_cast<size_t>(major_comm_size) + num_concurrent_bcasts - size_t{1}) /
+                      num_concurrent_bcasts;
+
     auto edge_partition_keys = edge_minor_property_output.keys();
-    if (edge_partition_keys) {
-      raft::host_span<vertex_t const> key_offsets{};
-      if constexpr (GraphViewType::is_storage_transposed) {
-        key_offsets = *(graph_view.local_sorted_unique_edge_src_vertex_partition_offsets());
-      } else {
-        key_offsets = *(graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets());
-      }
 
-      // memory footprint vs parallelism trade-off
-      // memory requirement per loop is
-      // (V/comm_size) * sizeof(value_t)
-      // and limit memory requirement to (E / comm_size) * sizeof(vertex_t)
-      auto num_concurrent_bcasts =
-        (static_cast<size_t>(graph_view.number_of_edges() / comm_size) * sizeof(vertex_t)) /
-        std::max(static_cast<size_t>(graph_view.number_of_vertices() / comm_size) * sizeof(value_t),
-                 size_t{1});
-      num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1});
-      num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast<size_t>(major_comm_size));
-      auto num_rounds = (static_cast<size_t>(major_comm_size) + num_concurrent_bcasts - size_t{1}) /
-                        num_concurrent_bcasts;
-
-      std::vector<decltype(allocate_dataframe_buffer<value_t>(size_t{0}, handle.get_stream()))>
-        rx_value_buffers{};
-      rx_value_buffers.reserve(num_concurrent_bcasts);
+    std::optional<std::vector<bcast_buffer_type>> rx_value_buffers{std::nullopt};
+    if (packed_bool || edge_partition_keys) {
+      rx_value_buffers = std::vector<bcast_buffer_type>{};
+      (*rx_value_buffers).reserve(num_concurrent_bcasts);
       for (size_t i = 0; i < num_concurrent_bcasts; ++i) {
         size_t max_size{0};
         for (size_t round = 0; round < num_rounds; ++round) {
@@ -306,19 +497,76 @@ void update_edge_minor_property(raft::handle_t const& handle,
                                   minor_range_vertex_partition_id)));
           }
         }
-        rx_value_buffers.push_back(
-          allocate_dataframe_buffer<value_t>(max_size, handle.get_stream()));
+        (*rx_value_buffers)
+          .push_back(allocate_dataframe_buffer<
+                     std::conditional_t<packed_bool,
+                                        uint32_t,
+                                        typename EdgeMinorPropertyOutputWrapper::value_type>>(
+            packed_bool ? packed_bool_size(max_size) : max_size, handle.get_stream()));
       }
+    }
 
-      for (size_t round = 0; round < num_rounds; ++round) {
-        device_group_start(major_comm);
+    std::variant<raft::host_span<vertex_t const>, std::vector<size_t>>
+      key_offsets_or_rx_displacements{};
+    if (edge_partition_keys) {
+      if constexpr (GraphViewType::is_storage_transposed) {
+        key_offsets_or_rx_displacements =
+          *(graph_view.local_sorted_unique_edge_src_vertex_partition_offsets());
+      } else {
+        key_offsets_or_rx_displacements =
+          *(graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets());
+      }
+    } else {
+      std::vector<size_t> rx_counts(major_comm_size, size_t{0});
+      for (int i = 0; i < major_comm_size; ++i) {
+        auto minor_range_vertex_partition_id =
+          compute_local_edge_partition_minor_range_vertex_partition_id_t{
+            major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i);
+        rx_counts[i] = graph_view.vertex_partition_range_size(minor_range_vertex_partition_id);
+      }
+      std::vector<size_t> rx_displacements(major_comm_size, size_t{0});
+      std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0});
+      key_offsets_or_rx_displacements = std::move(rx_displacements);
+    }
+
+    for (size_t round = 0; round < num_rounds; ++round) {
+      if constexpr (packed_bool) {
         for (size_t i = 0; i < num_concurrent_bcasts; ++i) {
-          auto j = num_rounds * i + round;
-          if (j < static_cast<size_t>(major_comm_size)) {
+          auto j = static_cast<int>(num_rounds * i + round);
+          if (j == major_comm_rank) {
             auto minor_range_vertex_partition_id =
               compute_local_edge_partition_minor_range_vertex_partition_id_t{
                 major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(j);
-            auto rx_value_first = get_dataframe_buffer_begin(rx_value_buffers[i]);
+            auto rx_value_first = get_dataframe_buffer_begin((*rx_value_buffers)[i]);
+            pack_bools(handle,
+                       vertex_property_input_first,
+                       vertex_property_input_first +
+                         graph_view.vertex_partition_range_size(minor_range_vertex_partition_id),
+                       rx_value_first);
+          }
+        }
+      }
+
+      device_group_start(major_comm);
+      for (size_t i = 0; i < num_concurrent_bcasts; ++i) {
+        auto j = num_rounds * i + round;
+        if (j < static_cast<size_t>(major_comm_size)) {
+          auto minor_range_vertex_partition_id =
+            compute_local_edge_partition_minor_range_vertex_partition_id_t{
+              major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(j);
+          auto rx_value_first =
+            rx_value_buffers ? get_dataframe_buffer_begin((*rx_value_buffers)[i])
+                             : edge_partition_value_first +
+                                 std::get<std::vector<size_t>>(key_offsets_or_rx_displacements)[j];
+          if constexpr (packed_bool) {
+            device_bcast(major_comm,
+                         rx_value_first,
+                         rx_value_first,
+                         packed_bool_size(
+                           graph_view.vertex_partition_range_size(minor_range_vertex_partition_id)),
+                         j,
+                         handle.get_stream());
+          } else {
             device_bcast(major_comm,
                          vertex_property_input_first,
                          rx_value_first,
@@ -327,52 +575,88 @@ void update_edge_minor_property(raft::handle_t const& handle,
                          handle.get_stream());
           }
         }
-        device_group_end(major_comm);
+      }
+      device_group_end(major_comm);
 
+      if (rx_value_buffers) {
         for (size_t i = 0; i < num_concurrent_bcasts; ++i) {
           auto j = num_rounds * i + round;
           if (j < static_cast<size_t>(major_comm_size)) {
             auto minor_range_vertex_partition_id =
               compute_local_edge_partition_minor_range_vertex_partition_id_t{
                 major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(j);
-            auto rx_value_first = get_dataframe_buffer_begin(rx_value_buffers[i]);
-            auto v_offset_first = thrust::make_transform_iterator(
-              (*edge_partition_keys).begin() + key_offsets[j],
-              [v_first = graph_view.vertex_partition_range_first(
-                 minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
-            thrust::gather(handle.get_thrust_policy(),
-                           v_offset_first,
-                           v_offset_first + (key_offsets[j + 1] - key_offsets[j]),
-                           rx_value_first,
-                           edge_partition_value_first + key_offsets[j]);
+            auto rx_value_first = get_dataframe_buffer_begin((*rx_value_buffers)[i]);
+            if constexpr (packed_bool) {
+              if (edge_partition_keys) {
+                auto key_offsets =
+                  std::get<raft::host_span<vertex_t const>>(key_offsets_or_rx_displacements);
+
+                auto bool_first = thrust::make_transform_iterator(
+                  (*edge_partition_keys).begin() + key_offsets[j],
+                  [rx_value_first,
+                   v_first = graph_view.vertex_partition_range_first(
+                     minor_range_vertex_partition_id)] __device__(auto v) {
+                    auto v_offset = v - v_first;
+                    return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                             packed_bool_mask(v_offset));
+                  });
+                pack_unaligned_bools(
+                  handle,
+                  bool_first,
+                  bool_first + (key_offsets[j + 1] - key_offsets[j]),
+                  edge_partition_value_first + packed_bool_offset(key_offsets[j]),
+                  key_offsets[j] % packed_bools_per_word());
+              } else {
+                auto rx_displacements =
+                  std::get<std::vector<size_t>>(key_offsets_or_rx_displacements);
+                auto bool_first = thrust::make_transform_iterator(
+                  thrust::make_counting_iterator(vertex_t{0}),
+                  [rx_value_first] __device__(vertex_t v_offset) {
+                    return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                             packed_bool_mask(v_offset));
+                  });
+                pack_unaligned_bools(
+                  handle,
+                  bool_first,
+                  bool_first +
+                    graph_view.vertex_partition_range_size(minor_range_vertex_partition_id),
+                  edge_partition_value_first + packed_bool_offset(rx_displacements[j]),
+                  rx_displacements[j] % packed_bools_per_word());
+              }
+            } else {
+              assert(edge_partition_keys);
+              auto key_offsets =
+                std::get<raft::host_span<vertex_t const>>(key_offsets_or_rx_displacements);
+
+              auto v_offset_first = thrust::make_transform_iterator(
+                (*edge_partition_keys).begin() + key_offsets[j],
+                [v_first = graph_view.vertex_partition_range_first(
+                   minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
+              thrust::gather(handle.get_thrust_policy(),
+                             v_offset_first,
+                             v_offset_first + (key_offsets[j + 1] - key_offsets[j]),
+                             rx_value_first,
+                             edge_partition_value_first + key_offsets[j]);
+            }
           }
         }
       }
-    } else {
-      std::vector<size_t> rx_counts(major_comm_size, size_t{0});
-      std::vector<size_t> displacements(major_comm_size, size_t{0});
-      for (int i = 0; i < major_comm_size; ++i) {
-        auto minor_range_vertex_partition_id =
-          compute_local_edge_partition_minor_range_vertex_partition_id_t{
-            major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i);
-        rx_counts[i]     = graph_view.vertex_partition_range_size(minor_range_vertex_partition_id);
-        displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1];
-      }
-      device_allgatherv(major_comm,
-                        vertex_property_input_first,
-                        edge_partition_value_first,
-                        rx_counts,
-                        displacements,
-                        handle.get_stream());
     }
   } else {
     assert(graph_view.local_vertex_partition_range_size() == GraphViewType::is_storage_transposed
              ? graph_view.local_edge_partition_src_range_size()
              : graph_view.local_edge_partition_dst_range_size());
-    thrust::copy(handle.get_thrust_policy(),
+    if constexpr (packed_bool) {
+      pack_bools(handle,
                  vertex_property_input_first,
                  vertex_property_input_first + graph_view.local_vertex_partition_range_size(),
                  edge_partition_value_first);
+    } else {
+      thrust::copy(handle.get_thrust_policy(),
+                   vertex_property_input_first,
+                   vertex_property_input_first + graph_view.local_vertex_partition_range_size(),
+                   edge_partition_value_first);
+    }
   }
 }
 
@@ -387,6 +671,9 @@ void update_edge_minor_property(raft::handle_t const& handle,
                                 VertexPropertyInputIterator vertex_property_input_first,
                                 EdgeMinorPropertyOutputWrapper edge_minor_property_output)
 {
+  constexpr bool packed_bool =
+    std::is_same_v<typename EdgeMinorPropertyOutputWrapper::value_type, bool>;
+
   using vertex_t = typename GraphViewType::vertex_type;
   using edge_t   = typename GraphViewType::edge_type;
 
@@ -408,8 +695,9 @@ void update_edge_minor_property(raft::handle_t const& handle,
       });
     rmm::device_uvector<vertex_t> rx_vertices(max_rx_size, handle.get_stream());
     auto rx_tmp_buffer = allocate_dataframe_buffer<
-      typename std::iterator_traits<VertexPropertyInputIterator>::value_type>(max_rx_size,
-                                                                              handle.get_stream());
+      std::
+        conditional_t<packed_bool, uint32_t, typename EdgeMinorPropertyOutputWrapper::value_type>>(
+      packed_bool ? packed_bool_size(max_rx_size) : max_rx_size, handle.get_stream());
     auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer);
 
     std::optional<raft::host_span<vertex_t const>> key_offsets{};
@@ -428,66 +716,122 @@ void update_edge_minor_property(raft::handle_t const& handle,
         auto vertex_partition =
           vertex_partition_device_view_t<vertex_t, GraphViewType::is_multi_gpu>(
             graph_view.local_vertex_partition_view());
-        auto map_first =
-          thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
-            return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-          });
-        // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
-        // permutation iterator (and directly gathers to the internal buffer)
-        thrust::gather(handle.get_thrust_policy(),
-                       map_first,
-                       map_first + thrust::distance(vertex_first, vertex_last),
-                       vertex_property_input_first,
-                       rx_value_first);
+        if constexpr (packed_bool) {
+          auto bool_first = thrust::make_transform_iterator(
+            vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) {
+              auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
+              return static_cast<bool>(
+                *(vertex_property_input_first + packed_bool_offset(v_offset)) &
+                packed_bool_mask(v_offset));
+            });
+          pack_bools(handle,
+                     bool_first,
+                     bool_first + thrust::distance(vertex_first, vertex_last),
+                     rx_value_first);
+        } else {
+          auto map_first =
+            thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+              return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
+            });
+          // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
+          // permutation iterator (and directly gathers to the internal buffer)
+          thrust::gather(handle.get_thrust_policy(),
+                         map_first,
+                         map_first + thrust::distance(vertex_first, vertex_last),
+                         vertex_property_input_first,
+                         rx_value_first);
+        }
       }
 
       // FIXME: these broadcast operations can be placed between ncclGroupStart() and
       // ncclGroupEnd()
       device_bcast(
         major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream());
-      device_bcast(
-        major_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream());
+      device_bcast(major_comm,
+                   rx_value_first,
+                   rx_value_first,
+                   packed_bool ? packed_bool_size(rx_counts[i]) : rx_counts[i],
+                   i,
+                   handle.get_stream());
 
       if (edge_partition_keys) {
         thrust::for_each(
           handle.get_thrust_policy(),
-          thrust::make_counting_iterator(vertex_t{0}),
-          thrust::make_counting_iterator((*key_offsets)[i + 1] - (*key_offsets)[i]),
+          thrust::make_counting_iterator(size_t{0}),
+          thrust::make_counting_iterator(rx_counts[i]),
           [rx_vertex_first = rx_vertices.begin(),
-           rx_vertex_last  = rx_vertices.end(),
            rx_value_first,
-           output_key_first   = (*edge_partition_keys).begin() + (*key_offsets)[i],
-           output_value_first = edge_partition_value_first + (*key_offsets)[i]] __device__(auto i) {
-            auto minor = *(output_key_first + i);
-            auto it    = thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, minor);
-            if ((it != rx_vertex_last) && (*it == minor)) {
-              auto rx_value             = *(rx_value_first + thrust::distance(rx_vertex_first, it));
-              *(output_value_first + i) = rx_value;
+           subrange_key_first         = (*edge_partition_keys).begin() + (*key_offsets)[i],
+           subrange_key_last          = (*edge_partition_keys).begin() + (*key_offsets)[i + 1],
+           edge_partition_value_first = edge_partition_value_first,
+           subrange_start_offset      = (*key_offsets)[i]] __device__(auto i) {
+            auto minor = *(rx_vertex_first + i);
+            auto it =
+              thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor);
+            if ((it != subrange_key_last) && (*it == minor)) {
+              auto subrange_offset = thrust::distance(subrange_key_first, it);
+              if constexpr (packed_bool) {
+                auto rx_value = static_cast<bool>(*(rx_value_first + packed_bool_offset(i)) &
+                                                  packed_bool_mask(i));
+                packed_bool_atomic_set(
+                  edge_partition_value_first, subrange_start_offset + subrange_offset, rx_value);
+              } else {
+                auto rx_value = *(rx_value_first + i);
+                *(edge_partition_value_first + subrange_start_offset + subrange_offset) = rx_value;
+              }
             }
           });
       } else {
-        auto map_first =
-          thrust::make_transform_iterator(rx_vertices.begin(), [edge_partition] __device__(auto v) {
-            return edge_partition.minor_offset_from_minor_nocheck(v);
-          });
-        // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
-        // directly scatters from the internal buffer)
-        thrust::scatter(handle.get_thrust_policy(),
-                        rx_value_first,
-                        rx_value_first + rx_counts[i],
-                        map_first,
-                        edge_partition_value_first);
+        if constexpr (packed_bool) {
+          thrust::for_each(
+            handle.get_thrust_policy(),
+            thrust::make_counting_iterator(vertex_t{0}),
+            thrust::make_counting_iterator(static_cast<vertex_t>(rx_counts[i])),
+            [edge_partition,
+             rx_vertex_first = rx_vertices.begin(),
+             rx_value_first,
+             output_value_first = edge_partition_value_first] __device__(auto i) {
+              auto rx_vertex = *(rx_vertex_first + i);
+              auto rx_value =
+                static_cast<bool>(*(rx_value_first + packed_bool_offset(i)) & packed_bool_mask(i));
+              auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(rx_vertex);
+              packed_bool_atomic_set(output_value_first, minor_offset, rx_value);
+            });
+        } else {
+          auto map_first = thrust::make_transform_iterator(
+            rx_vertices.begin(), [edge_partition] __device__(auto v) {
+              return edge_partition.minor_offset_from_minor_nocheck(v);
+            });
+          // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
+          // directly scatters from the internal buffer)
+          thrust::scatter(handle.get_thrust_policy(),
+                          rx_value_first,
+                          rx_value_first + rx_counts[i],
+                          map_first,
+                          edge_partition_value_first);
+        }
       }
     }
   } else {
     assert(graph_view.local_vertex_partition_range_size() ==
            graph_view.local_edge_partition_src_range_size());
-    auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first);
-    thrust::scatter(handle.get_thrust_policy(),
-                    val_first,
-                    val_first + thrust::distance(vertex_first, vertex_last),
-                    vertex_first,
-                    edge_partition_value_first);
+    if constexpr (packed_bool) {
+      thrust::for_each(handle.get_thrust_policy(),
+                       vertex_first,
+                       vertex_last,
+                       [vertex_property_input_first,
+                        output_value_first = edge_partition_value_first] __device__(auto v) {
+                         bool val = static_cast<bool>(*(vertex_property_input_first + v));
+                         packed_bool_atomic_set(output_value_first, v, val);
+                       });
+    } else {
+      auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first);
+      thrust::scatter(handle.get_thrust_policy(),
+                      val_first,
+                      val_first + thrust::distance(vertex_first, vertex_last),
+                      vertex_first,
+                      edge_partition_value_first);
+    }
   }
 }
 
@@ -522,6 +866,8 @@ void update_edge_src_property(
     edge_src_property_output,
   bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -571,6 +917,8 @@ void update_edge_src_property(
     edge_src_property_output,
   bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     auto num_invalids = thrust::count_if(
       handle.get_thrust_policy(),
@@ -637,6 +985,8 @@ void update_edge_dst_property(
     edge_dst_property_output,
   bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -686,6 +1036,8 @@ void update_edge_dst_property(
     edge_dst_property_output,
   bool do_expensive_check = false)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     auto num_invalids = thrust::count_if(
       handle.get_thrust_policy(),
diff --git a/cpp/src/prims/update_v_frontier.cuh b/cpp/src/prims/update_v_frontier.cuh
index f0e7329ce23..fb94748b7e4 100644
--- a/cpp/src/prims/update_v_frontier.cuh
+++ b/cpp/src/prims/update_v_frontier.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -196,8 +196,8 @@ void update_v_frontier(raft::handle_t const& handle,
   using vertex_t = typename GraphViewType::vertex_type;
   using key_t =
     typename thrust::iterator_traits<decltype(get_dataframe_buffer_begin(key_buffer))>::value_type;
-  using payload_t = typename thrust::iterator_traits<decltype(
-    get_dataframe_buffer_begin(payload_buffer))>::value_type;
+  using payload_t = typename thrust::iterator_traits<decltype(get_dataframe_buffer_begin(
+    payload_buffer))>::value_type;
 
   static_assert(std::is_rvalue_reference_v<decltype(key_buffer)>);
   static_assert(std::is_rvalue_reference_v<decltype(payload_buffer)>);
diff --git a/cpp/src/sampling/detail/sampling_utils_impl.cuh b/cpp/src/sampling/detail/sampling_utils_impl.cuh
index 3ab13d0c7d2..8ba9e45e8d2 100644
--- a/cpp/src/sampling/detail/sampling_utils_impl.cuh
+++ b/cpp/src/sampling/detail/sampling_utils_impl.cuh
@@ -122,6 +122,19 @@ struct sample_edges_op_t {
   }
 };
 
+template <typename label_t>
+struct shuffle_to_output_comm_rank_t {
+  raft::device_span<label_t const> output_label_;
+  raft::device_span<int32_t const> output_rank_;
+
+  template <typename key_t>
+  __device__ int32_t operator()(key_t key) const
+  {
+    auto pos = thrust::lower_bound(thrust::seq, output_label_.begin(), output_label_.end(), key);
+    return output_rank_[thrust::distance(output_label_.begin(), pos)];
+  }
+};
+
 struct segmented_fill_t {
   raft::device_span<int32_t const> fill_values{};
   raft::device_span<size_t const> segment_offsets{};
@@ -626,6 +639,157 @@ sample_edges(raft::handle_t const& handle,
                          std::move(labels));
 }
 
+template <typename vertex_t,
+          typename weight_t,
+          typename edge_t,
+          typename edge_type_t,
+          typename label_t>
+void sort_sampled_tuples(raft::handle_t const& handle,
+                         rmm::device_uvector<vertex_t>& majors,
+                         rmm::device_uvector<vertex_t>& minors,
+                         std::optional<rmm::device_uvector<weight_t>>& weights,
+                         std::optional<rmm::device_uvector<edge_t>>& edge_ids,
+                         std::optional<rmm::device_uvector<edge_type_t>>& edge_types,
+                         std::optional<rmm::device_uvector<int32_t>>& hops,
+                         std::optional<rmm::device_uvector<label_t>>& labels)
+{
+  if (weights) {
+    if (edge_ids) {
+      if (edge_types) {
+        if (hops) {
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              thrust::make_zip_iterator(labels->begin(), hops->begin()),
+                              thrust::make_zip_iterator(labels->end(), hops->end()),
+                              thrust::make_zip_iterator(majors.begin(),
+                                                        minors.begin(),
+                                                        weights->begin(),
+                                                        edge_ids->begin(),
+                                                        edge_types->begin()));
+        } else {
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              labels->begin(),
+                              labels->end(),
+                              thrust::make_zip_iterator(majors.begin(),
+                                                        minors.begin(),
+                                                        weights->begin(),
+                                                        edge_ids->begin(),
+                                                        edge_types->begin()));
+        }
+      } else {
+        if (hops) {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            thrust::make_zip_iterator(labels->begin(), hops->begin()),
+            thrust::make_zip_iterator(labels->end(), hops->end()),
+            thrust::make_zip_iterator(
+              majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()));
+        } else {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            labels->begin(),
+            labels->end(),
+            thrust::make_zip_iterator(
+              majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()));
+        }
+      }
+    } else {
+      if (edge_types) {
+        if (hops) {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            thrust::make_zip_iterator(labels->begin(), hops->begin()),
+            thrust::make_zip_iterator(labels->end(), hops->end()),
+            thrust::make_zip_iterator(
+              majors.begin(), minors.begin(), weights->begin(), edge_types->begin()));
+        } else {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            labels->begin(),
+            labels->end(),
+            thrust::make_zip_iterator(
+              majors.begin(), minors.begin(), weights->begin(), edge_types->begin()));
+        }
+      } else {
+        if (hops) {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            thrust::make_zip_iterator(labels->begin(), hops->begin()),
+            thrust::make_zip_iterator(labels->end(), hops->end()),
+            thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()));
+        } else {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            labels->begin(),
+            labels->end(),
+            thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()));
+        }
+      }
+    }
+  } else {
+    if (edge_ids) {
+      if (edge_types) {
+        if (hops) {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            thrust::make_zip_iterator(labels->begin(), hops->begin()),
+            thrust::make_zip_iterator(labels->end(), hops->end()),
+            thrust::make_zip_iterator(
+              majors.begin(), minors.begin(), edge_ids->begin(), edge_types->begin()));
+        } else {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            labels->begin(),
+            labels->end(),
+            thrust::make_zip_iterator(
+              majors.begin(), minors.begin(), edge_ids->begin(), edge_types->begin()));
+        }
+      } else {
+        if (hops) {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            thrust::make_zip_iterator(labels->begin(), hops->begin()),
+            thrust::make_zip_iterator(labels->end(), hops->end()),
+            thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()));
+        } else {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            labels->begin(),
+            labels->end(),
+            thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()));
+        }
+      }
+    } else {
+      if (edge_types) {
+        if (hops) {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            thrust::make_zip_iterator(labels->begin(), hops->begin()),
+            thrust::make_zip_iterator(labels->end(), hops->end()),
+            thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()));
+        } else {
+          thrust::sort_by_key(
+            handle.get_thrust_policy(),
+            labels->begin(),
+            labels->end(),
+            thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()));
+        }
+      } else {
+        if (hops) {
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              thrust::make_zip_iterator(labels->begin(), hops->begin()),
+                              thrust::make_zip_iterator(labels->end(), hops->end()),
+                              thrust::make_zip_iterator(majors.begin(), minors.begin()));
+        } else {
+          thrust::sort_by_key(handle.get_thrust_policy(),
+                              labels->begin(),
+                              labels->end(),
+                              thrust::make_zip_iterator(majors.begin(), minors.begin()));
+        }
+      }
+    }
+  }
+}
+
 template <typename vertex_t,
           typename edge_t,
           typename weight_t,
@@ -654,151 +818,7 @@ shuffle_and_organize_output(
   std::optional<rmm::device_uvector<size_t>> offsets{std::nullopt};
 
   if (labels) {
-    if (weights) {
-      if (edge_ids) {
-        if (edge_types) {
-          if (hops) {
-            thrust::sort_by_key(handle.get_thrust_policy(),
-                                labels->begin(),
-                                labels->end(),
-                                thrust::make_zip_iterator(majors.begin(),
-                                                          minors.begin(),
-                                                          weights->begin(),
-                                                          edge_ids->begin(),
-                                                          edge_types->begin(),
-                                                          hops->begin()));
-          } else {
-            thrust::sort_by_key(handle.get_thrust_policy(),
-                                labels->begin(),
-                                labels->end(),
-                                thrust::make_zip_iterator(majors.begin(),
-                                                          minors.begin(),
-                                                          weights->begin(),
-                                                          edge_ids->begin(),
-                                                          edge_types->begin()));
-          }
-        } else {
-          if (hops) {
-            thrust::sort_by_key(handle.get_thrust_policy(),
-                                labels->begin(),
-                                labels->end(),
-                                thrust::make_zip_iterator(majors.begin(),
-                                                          minors.begin(),
-                                                          weights->begin(),
-                                                          edge_ids->begin(),
-                                                          hops->begin()));
-          } else {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(
-                majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()));
-          }
-        }
-      } else {
-        if (edge_types) {
-          if (hops) {
-            thrust::sort_by_key(handle.get_thrust_policy(),
-                                labels->begin(),
-                                labels->end(),
-                                thrust::make_zip_iterator(majors.begin(),
-                                                          minors.begin(),
-                                                          weights->begin(),
-                                                          edge_types->begin(),
-                                                          hops->begin()));
-          } else {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(
-                majors.begin(), minors.begin(), weights->begin(), edge_types->begin()));
-          }
-        } else {
-          if (hops) {
-            thrust::sort_by_key(handle.get_thrust_policy(),
-                                labels->begin(),
-                                labels->end(),
-                                thrust::make_zip_iterator(
-                                  majors.begin(), minors.begin(), weights->begin(), hops->begin()));
-          } else {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()));
-          }
-        }
-      }
-    } else {
-      if (edge_ids) {
-        if (edge_types) {
-          if (hops) {
-            thrust::sort_by_key(handle.get_thrust_policy(),
-                                labels->begin(),
-                                labels->end(),
-                                thrust::make_zip_iterator(majors.begin(),
-                                                          minors.begin(),
-                                                          edge_ids->begin(),
-                                                          edge_types->begin(),
-                                                          hops->begin()));
-          } else {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(
-                majors.begin(), minors.begin(), edge_ids->begin(), edge_types->begin()));
-          }
-        } else {
-          if (hops) {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(
-                majors.begin(), minors.begin(), edge_ids->begin(), hops->begin()));
-          } else {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()));
-          }
-        }
-      } else {
-        if (edge_types) {
-          if (hops) {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(
-                majors.begin(), minors.begin(), edge_types->begin(), hops->begin()));
-          } else {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()));
-          }
-        } else {
-          if (hops) {
-            thrust::sort_by_key(
-              handle.get_thrust_policy(),
-              labels->begin(),
-              labels->end(),
-              thrust::make_zip_iterator(majors.begin(), minors.begin(), hops->begin()));
-          } else {
-            thrust::sort_by_key(handle.get_thrust_policy(),
-                                labels->begin(),
-                                labels->end(),
-                                thrust::make_zip_iterator(majors.begin(), minors.begin()));
-          }
-        }
-      }
-    }
+    sort_sampled_tuples(handle, majors, minors, weights, edge_ids, edge_types, hops, labels);
 
     if (label_to_output_comm_rank) {
       CUGRAPH_EXPECTS(labels, "labels must be specified in order to shuffle sampling results");
@@ -820,30 +840,33 @@ shuffle_and_organize_output(
       auto mem_frugal_threshold = static_cast<size_t>(
         static_cast<double>(total_global_mem / element_size) * mem_frugal_ratio);
 
-      auto d_tx_value_counts = cugraph::groupby_and_count(
-        labels->begin(),
-        labels->end(),
-        [output_label = std::get<0>(*label_to_output_comm_rank),
-         output_rank  = std::get<1>(*label_to_output_comm_rank)] __device__(auto val) {
-          auto pos =
-            thrust::lower_bound(thrust::seq, output_label.begin(), output_label.end(), val);
-          return output_rank[thrust::distance(output_label.begin(), pos)];
-        },
-        comm_size,
-        mem_frugal_threshold,
-        handle.get_stream());
-
-      std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
-      raft::update_host(h_tx_value_counts.data(),
-                        d_tx_value_counts.data(),
-                        d_tx_value_counts.size(),
-                        handle.get_stream());
-      handle.sync_stream();
-
       if (weights) {
         if (edge_ids) {
           if (edge_types) {
             if (hops) {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(),
+                                          minors.begin(),
+                                          weights->begin(),
+                                          edge_ids->begin(),
+                                          edge_types->begin(),
+                                          hops->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+
+              handle.sync_stream();
+
               std::forward_as_tuple(
                 std::tie(majors, minors, weights, edge_ids, edge_types, hops, labels),
                 std::ignore) = shuffle_values(comm,
@@ -857,6 +880,27 @@ shuffle_and_organize_output(
                                               h_tx_value_counts,
                                               handle.get_stream());
             } else {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(),
+                                          minors.begin(),
+                                          weights->begin(),
+                                          edge_ids->begin(),
+                                          edge_types->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids, edge_types, labels),
                                     std::ignore) =
                 shuffle_values(comm,
@@ -871,6 +915,27 @@ shuffle_and_organize_output(
             }
           } else {
             if (hops) {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(),
+                                          minors.begin(),
+                                          weights->begin(),
+                                          edge_ids->begin(),
+                                          hops->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids, hops, labels),
                                     std::ignore) =
                 shuffle_values(comm,
@@ -883,6 +948,24 @@ shuffle_and_organize_output(
                                h_tx_value_counts,
                                handle.get_stream());
             } else {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(
+                  majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids, labels),
                                     std::ignore) =
                 shuffle_values(comm,
@@ -898,6 +981,27 @@ shuffle_and_organize_output(
         } else {
           if (edge_types) {
             if (hops) {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(),
+                                          minors.begin(),
+                                          weights->begin(),
+                                          edge_types->begin(),
+                                          hops->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, weights, edge_types, hops, labels),
                                     std::ignore) =
                 shuffle_values(comm,
@@ -910,6 +1014,24 @@ shuffle_and_organize_output(
                                h_tx_value_counts,
                                handle.get_stream());
             } else {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(
+                  majors.begin(), minors.begin(), weights->begin(), edge_types->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, weights, edge_types, labels),
                                     std::ignore) =
                 shuffle_values(comm,
@@ -923,6 +1045,24 @@ shuffle_and_organize_output(
             }
           } else {
             if (hops) {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(
+                  majors.begin(), minors.begin(), weights->begin(), hops->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, weights, hops, labels), std::ignore) =
                 shuffle_values(comm,
                                thrust::make_zip_iterator(majors.begin(),
@@ -933,6 +1073,23 @@ shuffle_and_organize_output(
                                h_tx_value_counts,
                                handle.get_stream());
             } else {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, weights, labels), std::ignore) =
                 shuffle_values(comm,
                                thrust::make_zip_iterator(
@@ -946,6 +1103,27 @@ shuffle_and_organize_output(
         if (edge_ids) {
           if (edge_types) {
             if (hops) {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(),
+                                          minors.begin(),
+                                          edge_ids->begin(),
+                                          edge_types->begin(),
+                                          hops->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, edge_ids, edge_types, hops, labels),
                                     std::ignore) =
                 shuffle_values(comm,
@@ -958,6 +1136,24 @@ shuffle_and_organize_output(
                                h_tx_value_counts,
                                handle.get_stream());
             } else {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(
+                  majors.begin(), minors.begin(), edge_ids->begin(), edge_types->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, edge_ids, edge_types, labels),
                                     std::ignore) =
                 shuffle_values(comm,
@@ -971,6 +1167,24 @@ shuffle_and_organize_output(
             }
           } else {
             if (hops) {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(
+                  majors.begin(), minors.begin(), edge_ids->begin(), hops->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, edge_ids, hops, labels), std::ignore) =
                 shuffle_values(comm,
                                thrust::make_zip_iterator(majors.begin(),
@@ -981,6 +1195,23 @@ shuffle_and_organize_output(
                                h_tx_value_counts,
                                handle.get_stream());
             } else {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, edge_ids, labels), std::ignore) =
                 shuffle_values(
                   comm,
@@ -993,6 +1224,24 @@ shuffle_and_organize_output(
         } else {
           if (edge_types) {
             if (hops) {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(
+                  majors.begin(), minors.begin(), edge_types->begin(), hops->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, edge_types, hops, labels),
                                     std::ignore) =
                 shuffle_values(comm,
@@ -1004,6 +1253,23 @@ shuffle_and_organize_output(
                                h_tx_value_counts,
                                handle.get_stream());
             } else {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, edge_types, labels), std::ignore) =
                 shuffle_values(
                   comm,
@@ -1014,6 +1280,23 @@ shuffle_and_organize_output(
             }
           } else {
             if (hops) {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(), minors.begin(), hops->begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, hops, labels), std::ignore) =
                 shuffle_values(comm,
                                thrust::make_zip_iterator(
@@ -1021,6 +1304,23 @@ shuffle_and_organize_output(
                                h_tx_value_counts,
                                handle.get_stream());
             } else {
+              auto d_tx_value_counts = cugraph::groupby_and_count(
+                labels->begin(),
+                labels->end(),
+                thrust::make_zip_iterator(majors.begin(), minors.begin()),
+                shuffle_to_output_comm_rank_t<label_t>{std::get<0>(*label_to_output_comm_rank),
+                                                       std::get<1>(*label_to_output_comm_rank)},
+                comm_size,
+                mem_frugal_threshold,
+                handle.get_stream());
+
+              std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+              raft::update_host(h_tx_value_counts.data(),
+                                d_tx_value_counts.data(),
+                                d_tx_value_counts.size(),
+                                handle.get_stream());
+              handle.sync_stream();
+
               std::forward_as_tuple(std::tie(majors, minors, labels), std::ignore) = shuffle_values(
                 comm,
                 thrust::make_zip_iterator(majors.begin(), minors.begin(), labels->begin()),
@@ -1030,6 +1330,8 @@ shuffle_and_organize_output(
           }
         }
       }
+
+      sort_sampled_tuples(handle, majors, minors, weights, edge_ids, edge_types, hops, labels);
     }
 
     size_t num_unique_labels =
diff --git a/cpp/src/sampling/neighborhood.cu b/cpp/src/sampling/neighborhood.cu
index 0f97f04ae38..0c0beb8d8b0 100644
--- a/cpp/src/sampling/neighborhood.cu
+++ b/cpp/src/sampling/neighborhood.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,17 +32,17 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle,
                                 vertex_t const* ptr_d_start,
                                 size_t num_start_vertices,
                                 size_t sampling_size,
-                                ops::gnn::graph::SamplingAlgoT sampling_algo)
+                                ops::graph::SamplingAlgoT sampling_algo)
 {
   const auto [ops_graph, max_degree] = detail::get_graph_and_max_degree(graph_view);
-  return ops::gnn::graph::uniform_sample_csr(rng_state,
-                                             ops_graph,
-                                             ptr_d_start,
-                                             num_start_vertices,
-                                             sampling_size,
-                                             sampling_algo,
-                                             max_degree,
-                                             handle.get_stream());
+  return ops::graph::uniform_sample_csr(rng_state,
+                                        ops_graph,
+                                        ptr_d_start,
+                                        num_start_vertices,
+                                        sampling_size,
+                                        sampling_algo,
+                                        max_degree,
+                                        handle.get_stream());
 }
 
 template <typename vertex_t, typename edge_t>
@@ -53,17 +53,17 @@ std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> sample_
   vertex_t const* ptr_d_start,
   size_t num_start_vertices,
   size_t sampling_size,
-  ops::gnn::graph::SamplingAlgoT sampling_algo)
+  ops::graph::SamplingAlgoT sampling_algo)
 {
   const auto [ops_graph, max_degree] = detail::get_graph_and_max_degree(graph_view);
-  return ops::gnn::graph::uniform_sample_coo(rng_state,
-                                             ops_graph,
-                                             ptr_d_start,
-                                             num_start_vertices,
-                                             sampling_size,
-                                             sampling_algo,
-                                             max_degree,
-                                             handle.get_stream());
+  return ops::graph::uniform_sample_coo(rng_state,
+                                        ops_graph,
+                                        ptr_d_start,
+                                        num_start_vertices,
+                                        sampling_size,
+                                        sampling_algo,
+                                        max_degree,
+                                        handle.get_stream());
 }
 
 // template explicit instantiation directives (EIDir's):
@@ -76,7 +76,7 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle,
                                 int32_t const* ptr_d_start,
                                 size_t num_start_vertices,
                                 size_t sampling_size,
-                                ops::gnn::graph::SamplingAlgoT sampling_algo);
+                                ops::graph::SamplingAlgoT sampling_algo);
 
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
 sample_neighbors_adjacency_list(raft::handle_t const& handle,
@@ -85,7 +85,7 @@ sample_neighbors_adjacency_list(raft::handle_t const& handle,
                                 int64_t const* ptr_d_start,
                                 size_t num_start_vertices,
                                 size_t sampling_size,
-                                ops::gnn::graph::SamplingAlgoT sampling_algo);
+                                ops::graph::SamplingAlgoT sampling_algo);
 //}
 //
 // COO SG FP32{
@@ -96,7 +96,7 @@ sample_neighbors_edgelist(raft::handle_t const& handle,
                           int32_t const* ptr_d_start,
                           size_t num_start_vertices,
                           size_t sampling_size,
-                          ops::gnn::graph::SamplingAlgoT sampling_algo);
+                          ops::graph::SamplingAlgoT sampling_algo);
 
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
 sample_neighbors_edgelist(raft::handle_t const& handle,
@@ -105,7 +105,7 @@ sample_neighbors_edgelist(raft::handle_t const& handle,
                           int64_t const* ptr_d_start,
                           size_t num_start_vertices,
                           size_t sampling_size,
-                          ops::gnn::graph::SamplingAlgoT sampling_algo);
+                          ops::graph::SamplingAlgoT sampling_algo);
 //}
 
 }  // namespace cugraph
diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 6a7334e9f1a..46789c6b8bd 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -197,19 +197,19 @@ struct col_indx_extract_t {
   void operator()(
     original::device_vec_t<vertex_t> const& d_coalesced_src_v,  // in: coalesced vector of vertices
     original::device_vec_t<vertex_t> const&
-      d_v_col_indx,  // in: column indices, given by stepper's random engine
+      d_v_col_indx,       // in: column indices, given by stepper's random engine
     original::device_vec_t<vertex_t>&
       d_v_next_vertices,  // out: set of destination vertices, for next step
     original::device_vec_t<weight_t>&
-      d_v_next_weights)  // out: set of weights between src and destination vertices, for next step
+      d_v_next_weights)   // out: set of weights between src and destination vertices, for next step
     const
   {
     thrust::transform_if(
       handle_.get_thrust_policy(),
       thrust::make_counting_iterator<index_t>(0),
-      thrust::make_counting_iterator<index_t>(num_paths_),  // input1
-      d_v_col_indx.begin(),                                 // input2
-      out_degs_,                                            // stencil
+      thrust::make_counting_iterator<index_t>(num_paths_),                         // input1
+      d_v_col_indx.begin(),                                                        // input2
+      out_degs_,                                                                   // stencil
       thrust::make_zip_iterator(
         thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())),  // output
       [max_depth         = max_depth_,
@@ -575,9 +575,9 @@ struct random_walker_t {
       d_crt_out_degs,  // |current set of vertex out degrees| = nelems,
                        // to be used as stencil (don't scatter if 0)
     original::device_vec_t<index_t> const&
-      d_sizes,  // paths sizes used to provide delta in coalesced paths;
-                // pre-condition: assumed as updated to reflect new vertex additions;
-                // also, this is the number of _vertices_ in each path;
+      d_sizes,         // paths sizes used to provide delta in coalesced paths;
+                       // pre-condition: assumed as updated to reflect new vertex additions;
+                       // also, this is the number of _vertices_ in each path;
     // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter
     index_t
       stride,  // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights)
@@ -762,7 +762,7 @@ random_walks_impl(
   // pre-allocate num_paths * max_depth;
   //
   original::device_vec_t<vertex_t> d_coalesced_v(num_paths * max_depth,
-                                                 stream);  // coalesced vertex set
+                                                 stream);         // coalesced vertex set
   original::device_vec_t<weight_t> d_coalesced_w(num_paths * (max_depth - 1),
                                                  stream);         // coalesced weight set
   original::device_vec_t<index_t> d_paths_sz(num_paths, stream);  // paths sizes
diff --git a/cpp/src/sampling/random_walks_impl.cuh b/cpp/src/sampling/random_walks_impl.cuh
index aed4fa58211..3a21143eb77 100644
--- a/cpp/src/sampling/random_walks_impl.cuh
+++ b/cpp/src/sampling/random_walks_impl.cuh
@@ -432,6 +432,8 @@ uniform_random_walks(raft::handle_t const& handle,
                      size_t max_length,
                      uint64_t seed)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return detail::random_walk_impl(handle,
                                   graph_view,
                                   edge_weight_view,
@@ -450,6 +452,8 @@ biased_random_walks(raft::handle_t const& handle,
                     size_t max_length,
                     uint64_t seed)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return detail::random_walk_impl(
     handle,
     graph_view,
@@ -470,6 +474,8 @@ node2vec_random_walks(raft::handle_t const& handle,
                       weight_t q,
                       uint64_t seed)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return detail::random_walk_impl(
     handle,
     graph_view,
diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 7922172639b..40b7109937c 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -72,7 +72,7 @@ struct device_const_vector_view {
   {
   }
 
-  device_const_vector_view(device_const_vector_view const& other) = delete;
+  device_const_vector_view(device_const_vector_view const& other)            = delete;
   device_const_vector_view& operator=(device_const_vector_view const& other) = delete;
 
   device_const_vector_view(device_const_vector_view&& other)
diff --git a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp
index e1872f01c1c..7dbc98840e7 100644
--- a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp
+++ b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp
@@ -351,6 +351,8 @@ uniform_neighbor_sample(
   bool with_replacement,
   bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return detail::uniform_neighbor_sample_impl(handle,
                                               graph_view,
                                               edge_weight_view,
diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh
index 5fbe8efe116..6dacbee2fb1 100644
--- a/cpp/src/structure/coarsen_graph_impl.cuh
+++ b/cpp/src/structure/coarsen_graph_impl.cuh
@@ -726,6 +726,8 @@ coarsen_graph(raft::handle_t const& handle,
               bool renumber,
               bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return detail::coarsen_graph(
     handle, graph_view, edge_weight_view, labels, renumber, do_expensive_check);
 }
diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
index 6d0f0415788..0d4b12a3e38 100644
--- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh
+++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
@@ -240,6 +240,65 @@ void expensive_check_edgelist(raft::handle_t const& handle,
   }
 }
 
+template <typename vertex_t, bool store_transposed, bool multi_gpu>
+bool check_symmetric(raft::handle_t const& handle,
+                     raft::device_span<vertex_t const> edgelist_srcs,
+                     raft::device_span<vertex_t const> edgelist_dsts)
+{
+  rmm::device_uvector<vertex_t> org_srcs(edgelist_srcs.size(), handle.get_stream());
+  rmm::device_uvector<vertex_t> org_dsts(edgelist_dsts.size(), handle.get_stream());
+  thrust::copy(
+    handle.get_thrust_policy(), edgelist_srcs.begin(), edgelist_srcs.end(), org_srcs.begin());
+  thrust::copy(
+    handle.get_thrust_policy(), edgelist_dsts.begin(), edgelist_dsts.end(), org_dsts.begin());
+
+  rmm::device_uvector<vertex_t> symmetrized_srcs(org_srcs.size(), handle.get_stream());
+  rmm::device_uvector<vertex_t> symmetrized_dsts(org_dsts.size(), handle.get_stream());
+  thrust::copy(
+    handle.get_thrust_policy(), org_srcs.begin(), org_srcs.end(), symmetrized_srcs.begin());
+  thrust::copy(
+    handle.get_thrust_policy(), org_dsts.begin(), org_dsts.end(), symmetrized_dsts.begin());
+  std::tie(symmetrized_srcs, symmetrized_dsts, std::ignore) =
+    symmetrize_edgelist<vertex_t, float /* dummy */, store_transposed, multi_gpu>(
+      handle, std::move(symmetrized_srcs), std::move(symmetrized_dsts), std::nullopt, true);
+
+  if (org_srcs.size() != symmetrized_srcs.size()) { return false; }
+
+  auto org_edge_first =
+    thrust::make_zip_iterator(thrust::make_tuple(org_srcs.begin(), org_dsts.begin()));
+  thrust::sort(handle.get_thrust_policy(), org_edge_first, org_edge_first + org_srcs.size());
+  auto symmetrized_edge_first = thrust::make_zip_iterator(
+    thrust::make_tuple(symmetrized_srcs.begin(), symmetrized_dsts.begin()));
+  thrust::sort(handle.get_thrust_policy(),
+               symmetrized_edge_first,
+               symmetrized_edge_first + symmetrized_srcs.size());
+
+  return thrust::equal(handle.get_thrust_policy(),
+                       org_edge_first,
+                       org_edge_first + org_srcs.size(),
+                       symmetrized_edge_first);
+}
+
+template <typename vertex_t>
+bool check_no_parallel_edge(raft::handle_t const& handle,
+                            raft::device_span<vertex_t const> edgelist_srcs,
+                            raft::device_span<vertex_t const> edgelist_dsts)
+{
+  rmm::device_uvector<vertex_t> org_srcs(edgelist_srcs.size(), handle.get_stream());
+  rmm::device_uvector<vertex_t> org_dsts(edgelist_dsts.size(), handle.get_stream());
+  thrust::copy(
+    handle.get_thrust_policy(), edgelist_srcs.begin(), edgelist_srcs.end(), org_srcs.begin());
+  thrust::copy(
+    handle.get_thrust_policy(), edgelist_dsts.begin(), edgelist_dsts.end(), org_dsts.begin());
+
+  auto org_edge_first =
+    thrust::make_zip_iterator(thrust::make_tuple(org_srcs.begin(), org_dsts.begin()));
+  thrust::sort(handle.get_thrust_policy(), org_edge_first, org_edge_first + org_srcs.size());
+  return thrust::unique(
+           handle.get_thrust_policy(), org_edge_first, org_edge_first + edgelist_srcs.size()) ==
+         (org_edge_first + edgelist_srcs.size());
+}
+
 template <typename vertex_t,
           typename edge_t,
           typename weight_t,
@@ -294,6 +353,26 @@ create_graph_from_edgelist_impl(
                                                   store_transposed ? edgelist_dsts : edgelist_srcs,
                                                   store_transposed ? edgelist_srcs : edgelist_dsts,
                                                   renumber);
+
+    if (graph_properties.is_symmetric) {
+      CUGRAPH_EXPECTS(
+        (check_symmetric<vertex_t, store_transposed, multi_gpu>(
+          handle,
+          raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
+          raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size()))),
+        "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is "
+        "not symmetric.");
+    }
+
+    if (!graph_properties.is_multigraph) {
+      CUGRAPH_EXPECTS(
+        check_no_parallel_edge(
+          handle,
+          raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
+          raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size())),
+        "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list "
+        "has parallel edges.");
+    }
   }
 
   // 1. groupby edges to their target local adjacency matrix partition (and further groupby within
@@ -823,6 +902,26 @@ create_graph_from_edgelist_impl(
                                                   store_transposed ? edgelist_dsts : edgelist_srcs,
                                                   store_transposed ? edgelist_srcs : edgelist_dsts,
                                                   renumber);
+
+    if (graph_properties.is_symmetric) {
+      CUGRAPH_EXPECTS(
+        (check_symmetric<vertex_t, store_transposed, multi_gpu>(
+          handle,
+          raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
+          raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size()))),
+        "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is "
+        "not symmetric.");
+    }
+
+    if (!graph_properties.is_multigraph) {
+      CUGRAPH_EXPECTS(
+        check_no_parallel_edge(
+          handle,
+          raft::device_span<vertex_t const>(edgelist_srcs.data(), edgelist_srcs.size()),
+          raft::device_span<vertex_t const>(edgelist_dsts.data(), edgelist_dsts.size())),
+        "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list "
+        "has parallel edges.");
+    }
   }
 
   // renumber
diff --git a/cpp/src/structure/decompress_to_edgelist_impl.cuh b/cpp/src/structure/decompress_to_edgelist_impl.cuh
index d6652bceaf7..fb0ffdb96c1 100644
--- a/cpp/src/structure/decompress_to_edgelist_impl.cuh
+++ b/cpp/src/structure/decompress_to_edgelist_impl.cuh
@@ -252,6 +252,8 @@ decompress_to_edgelist(
   std::optional<raft::device_span<vertex_t const>> renumber_map,
   bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return decompress_to_edgelist_impl(
     handle, graph_view, edge_weight_view, renumber_map, do_expensive_check);
 }
diff --git a/cpp/src/structure/detail/structure_utils.cuh b/cpp/src/structure/detail/structure_utils.cuh
index dee3fa156af..6887acf1af4 100644
--- a/cpp/src/structure/detail/structure_utils.cuh
+++ b/cpp/src/structure/detail/structure_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -199,9 +199,8 @@ template <typename edge_t,
 std::tuple<
   rmm::device_uvector<edge_t>,
   rmm::device_uvector<typename thrust::iterator_traits<VertexIterator>::value_type>,
-  decltype(
-    allocate_dataframe_buffer<typename thrust::iterator_traits<EdgeValueIterator>::value_type>(
-      size_t{0}, rmm::cuda_stream_view{})),
+  decltype(allocate_dataframe_buffer<typename thrust::iterator_traits<
+             EdgeValueIterator>::value_type>(size_t{0}, rmm::cuda_stream_view{})),
   std::optional<rmm::device_uvector<typename thrust::iterator_traits<VertexIterator>::value_type>>>
 compress_edgelist(
   VertexIterator edgelist_src_first,
diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index 99c475b9b14..97975897e08 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -62,12 +62,6 @@ namespace cugraph {
 
 namespace {
 
-template <typename vertex_t>
-struct edgelist_t {
-  raft::device_span<vertex_t const> srcs{};
-  raft::device_span<vertex_t const> dsts{};
-};
-
 // can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an
 // extended __device__ lambda must allow its address to be taken)
 template <typename vertex_t>
@@ -125,235 +119,6 @@ struct popc_t {
   }
 };
 
-// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an
-// extended __device__ lambda must allow its address to be taken)
-template <typename edge_t>
-struct rebase_offset_t {
-  edge_t base_offset{};
-  __device__ edge_t operator()(edge_t offset) const { return offset - base_offset; }
-};
-
-template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
-bool check_symmetric(raft::handle_t const& handle,
-                     std::vector<edgelist_t<vertex_t>> const& edgelists)
-{
-  size_t number_of_local_edges{0};
-  for (size_t i = 0; i < edgelists.size(); ++i) {
-    number_of_local_edges += edgelists[i].srcs.size();
-  }
-
-  rmm::device_uvector<vertex_t> org_srcs(number_of_local_edges, handle.get_stream());
-  rmm::device_uvector<vertex_t> org_dsts(number_of_local_edges, handle.get_stream());
-  size_t offset{0};
-  for (size_t i = 0; i < edgelists.size(); ++i) {
-    thrust::copy(handle.get_thrust_policy(),
-                 edgelists[i].srcs.begin(),
-                 edgelists[i].srcs.end(),
-                 org_srcs.begin() + offset);
-    thrust::copy(handle.get_thrust_policy(),
-                 edgelists[i].dsts.begin(),
-                 edgelists[i].dsts.end(),
-                 org_dsts.begin() + offset);
-    offset += edgelists[i].srcs.size();
-  }
-  if constexpr (multi_gpu) {
-    std::tie(
-      store_transposed ? org_dsts : org_srcs, store_transposed ? org_srcs : org_dsts, std::ignore) =
-      detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
-        handle,
-        std::move(store_transposed ? org_dsts : org_srcs),
-        std::move(store_transposed ? org_srcs : org_dsts),
-        std::nullopt);
-  }
-
-  rmm::device_uvector<vertex_t> symmetrized_srcs(org_srcs.size(), handle.get_stream());
-  rmm::device_uvector<vertex_t> symmetrized_dsts(org_dsts.size(), handle.get_stream());
-  thrust::copy(
-    handle.get_thrust_policy(), org_srcs.begin(), org_srcs.end(), symmetrized_srcs.begin());
-  thrust::copy(
-    handle.get_thrust_policy(), org_dsts.begin(), org_dsts.end(), symmetrized_dsts.begin());
-  std::tie(symmetrized_srcs, symmetrized_dsts, std::ignore) =
-    symmetrize_edgelist<vertex_t, float /* dummy */, store_transposed, multi_gpu>(
-      handle, std::move(symmetrized_srcs), std::move(symmetrized_dsts), std::nullopt, true);
-
-  if (org_srcs.size() != symmetrized_srcs.size()) { return false; }
-
-  auto org_edge_first =
-    thrust::make_zip_iterator(thrust::make_tuple(org_srcs.begin(), org_dsts.begin()));
-  thrust::sort(handle.get_thrust_policy(), org_edge_first, org_edge_first + org_srcs.size());
-  auto symmetrized_edge_first = thrust::make_zip_iterator(
-    thrust::make_tuple(symmetrized_srcs.begin(), symmetrized_dsts.begin()));
-  thrust::sort(handle.get_thrust_policy(),
-               symmetrized_edge_first,
-               symmetrized_edge_first + symmetrized_srcs.size());
-
-  return thrust::equal(handle.get_thrust_policy(),
-                       org_edge_first,
-                       org_edge_first + org_srcs.size(),
-                       symmetrized_edge_first);
-}
-
-template <typename vertex_t, typename edge_t>
-bool check_no_parallel_edge(raft::handle_t const& handle,
-                            std::vector<edgelist_t<vertex_t>> const& edgelists)
-{
-  size_t number_of_local_edges{0};
-  for (size_t i = 0; i < edgelists.size(); ++i) {
-    number_of_local_edges += edgelists[i].srcs.size();
-  }
-
-  rmm::device_uvector<vertex_t> edgelist_srcs(number_of_local_edges, handle.get_stream());
-  rmm::device_uvector<vertex_t> edgelist_dsts(number_of_local_edges, handle.get_stream());
-  size_t offset{0};
-  for (size_t i = 0; i < edgelists.size(); ++i) {
-    thrust::copy(handle.get_thrust_policy(),
-                 edgelists[i].srcs.begin(),
-                 edgelists[i].srcs.end(),
-                 edgelist_srcs.begin() + offset);
-    thrust::copy(handle.get_thrust_policy(),
-                 edgelists[i].dsts.begin(),
-                 edgelists[i].dsts.end(),
-                 edgelist_dsts.begin() + offset);
-    offset += edgelists[i].srcs.size();
-  }
-
-  auto edge_first =
-    thrust::make_zip_iterator(thrust::make_tuple(edgelist_srcs.begin(), edgelist_dsts.begin()));
-  thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + edgelist_srcs.size());
-  return thrust::unique(handle.get_thrust_policy(),
-                        edge_first,
-                        edge_first + edgelist_srcs.size()) == (edge_first + edgelist_srcs.size());
-}
-
-template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
-std::enable_if_t<multi_gpu, void> check_graph_constructor_input_arguments(
-  raft::handle_t const& handle,
-  std::vector<edgelist_t<vertex_t>> const& edgelists,
-  graph_meta_t<vertex_t, edge_t, multi_gpu> meta,
-  bool do_expensive_check)
-{
-  // cheap error checks
-
-  auto& comm                 = handle.get_comms();
-  auto const comm_size       = comm.get_size();
-  auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
-  auto const minor_comm_size = minor_comm.get_size();
-
-  CUGRAPH_EXPECTS(edgelists.size() == static_cast<size_t>(minor_comm_size),
-                  "Invalid input argument: erroneous edgelists.size().");
-  CUGRAPH_EXPECTS(
-    (meta.edge_partition_segment_offsets.size() ==
-     (detail::num_sparse_segments_per_vertex_partition + 2) * minor_comm_size) ||
-      (meta.edge_partition_segment_offsets.size() ==
-       (detail::num_sparse_segments_per_vertex_partition + 3) * minor_comm_size),
-    "Invalid input argument: meta.edge_partition_segment_offsets.size() returns an invalid value.");
-
-  CUGRAPH_EXPECTS(
-    std::any_of(edgelists.begin(),
-                edgelists.end(),
-                [](auto edgelist) { return edgelist.srcs.size() != edgelist.dsts.size(); }) ==
-      false,
-    "Invalid input argument: edgelists[].srcs.size() and edgelists[].dsts.size() should coincide.");
-
-  // optional expensive checks
-
-  if (do_expensive_check) {
-    edge_t number_of_local_edges{0};
-    for (size_t i = 0; i < edgelists.size(); ++i) {
-      auto [major_range_first, major_range_last] =
-        meta.partition.local_edge_partition_major_range(i);
-      auto [minor_range_first, minor_range_last] =
-        meta.partition.local_edge_partition_minor_range();
-
-      number_of_local_edges += static_cast<edge_t>(edgelists[i].srcs.size());
-
-      auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(
-        store_transposed ? edgelists[i].dsts.begin() : edgelists[i].srcs.begin(),
-        store_transposed ? edgelists[i].srcs.begin() : edgelists[i].dsts.begin()));
-      // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved
-      CUGRAPH_EXPECTS(
-        thrust::count_if(
-          handle.get_thrust_policy(),
-          edge_first,
-          edge_first + edgelists[i].srcs.size(),
-          out_of_range_t<vertex_t>{
-            major_range_first, major_range_last, minor_range_first, minor_range_last}) == 0,
-        "Invalid input argument: edgelists[] have out-of-range values.");
-    }
-    auto number_of_local_edges_sum = host_scalar_allreduce(
-      comm, number_of_local_edges, raft::comms::op_t::SUM, handle.get_stream());
-    CUGRAPH_EXPECTS(number_of_local_edges_sum == meta.number_of_edges,
-                    "Invalid input argument: the sum of local edge counts does not match with "
-                    "meta.number_of_edges.");
-
-    CUGRAPH_EXPECTS(
-      meta.partition.vertex_partition_range_last(comm_size - 1) == meta.number_of_vertices,
-      "Invalid input argument: vertex partition should cover [0, meta.number_of_vertices).");
-
-    if (meta.properties.is_symmetric) {
-      CUGRAPH_EXPECTS(
-        (check_symmetric<vertex_t, edge_t, store_transposed, multi_gpu>(handle, edgelists)),
-        "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not "
-        "symmetric.");
-    }
-    if (!meta.properties.is_multigraph) {
-      CUGRAPH_EXPECTS(
-        check_no_parallel_edge(handle, edgelists),
-        "Invalid input argument: meta.property.is_multigraph is false but the input edge list has "
-        "parallel edges.");
-    }
-  }
-}
-
-template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
-std::enable_if_t<!multi_gpu, void> check_graph_constructor_input_arguments(
-  raft::handle_t const& handle,
-  edgelist_t<vertex_t> const& edgelist,
-  graph_meta_t<vertex_t, edge_t, multi_gpu> meta,
-  bool do_expensive_check)
-{
-  // cheap error checks
-
-  CUGRAPH_EXPECTS(
-    edgelist.srcs.size() == edgelist.dsts.size(),
-    "Invalid input argument: edgelists.srcs.size() and edgelists.dsts.size() should coincide.");
-
-  CUGRAPH_EXPECTS(
-    !meta.segment_offsets.has_value() ||
-      ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)),
-    "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value.");
-
-  // optional expensive checks
-
-  if (do_expensive_check) {
-    auto edge_first = thrust::make_zip_iterator(
-      thrust::make_tuple(store_transposed ? edgelist.dsts.begin() : edgelist.srcs.begin(),
-                         store_transposed ? edgelist.srcs.begin() : edgelist.dsts.begin()));
-    // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved
-    CUGRAPH_EXPECTS(
-      thrust::count_if(
-        handle.get_thrust_policy(),
-        edge_first,
-        edge_first + edgelist.srcs.size(),
-        out_of_range_t<vertex_t>{0, meta.number_of_vertices, 0, meta.number_of_vertices}) == 0,
-      "Invalid input argument: edgelist have out-of-range values.");
-
-    if (meta.properties.is_symmetric) {
-      CUGRAPH_EXPECTS(
-        (check_symmetric<vertex_t, edge_t, store_transposed, multi_gpu>(
-          handle, std::vector<edgelist_t<vertex_t>>{edgelist})),
-        "Invalid input argument: meta.property.is_symmetric is true but the input edge list is not "
-        "symmetric.");
-    }
-    if (!meta.properties.is_multigraph) {
-      CUGRAPH_EXPECTS(
-        check_no_parallel_edge(handle, std::vector<edgelist_t<vertex_t>>{edgelist}),
-        "Invalid input argument: meta.property.is_multigraph is false but the input edge list has "
-        "parallel edges.");
-    }
-  }
-}
-
 template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
 std::enable_if_t<multi_gpu,
                  std::tuple<std::optional<std::vector<rmm::device_uvector<vertex_t>>>,
diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh
index 1a66c1da7b8..7626784c13c 100644
--- a/cpp/src/structure/graph_view_impl.cuh
+++ b/cpp/src/structure/graph_view_impl.cuh
@@ -25,6 +25,7 @@
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/partition_manager.hpp>
+#include <cugraph/utilities/atomic_ops.cuh>
 #include <cugraph/utilities/error.hpp>
 #include <cugraph/utilities/host_scalar_comm.hpp>
 
@@ -270,7 +271,7 @@ __global__ void for_all_major_for_all_nbr_mid_degree(
   }
 
   count_sum = BlockReduce(temp_storage).Reduce(count_sum, edge_property_add);
-  if (threadIdx.x == 0) { atomic_add_edge_op_result(count, count_sum); }
+  if (threadIdx.x == 0) { atomic_add(count, count_sum); }
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
@@ -302,7 +303,7 @@ __global__ void for_all_major_for_all_nbr_high_degree(
   }
 
   count_sum = BlockReduce(temp_storage).Reduce(count_sum, edge_property_add);
-  if (threadIdx.x == 0) { atomic_add_edge_op_result(count, count_sum); }
+  if (threadIdx.x == 0) { atomic_add(count, count_sum); }
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
@@ -511,6 +512,8 @@ rmm::device_uvector<edge_t>
 graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   compute_in_degrees(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   if (store_transposed) {
     return compute_major_degrees(handle,
                                  this->edge_partition_offsets_,
@@ -528,6 +531,8 @@ rmm::device_uvector<edge_t>
 graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   compute_in_degrees(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   if (store_transposed) {
     return compute_major_degrees(handle, this->offsets_, this->local_vertex_partition_range_size());
   } else {
@@ -540,6 +545,8 @@ rmm::device_uvector<edge_t>
 graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   compute_out_degrees(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   if (store_transposed) {
     return compute_minor_degrees(handle, *this);
   } else {
@@ -557,6 +564,8 @@ rmm::device_uvector<edge_t>
 graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   compute_out_degrees(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   if (store_transposed) {
     return compute_minor_degrees(handle, *this);
   } else {
@@ -568,6 +577,8 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   compute_max_in_degree(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   auto in_degrees = compute_in_degrees(handle);
   auto it = thrust::max_element(handle.get_thrust_policy(), in_degrees.begin(), in_degrees.end());
   rmm::device_scalar<edge_t> ret(edge_t{0}, handle.get_stream());
@@ -584,6 +595,8 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   compute_max_in_degree(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   auto in_degrees = compute_in_degrees(handle);
   auto it = thrust::max_element(handle.get_thrust_policy(), in_degrees.begin(), in_degrees.end());
   edge_t ret{0};
@@ -596,6 +609,8 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   compute_max_out_degree(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   auto out_degrees = compute_out_degrees(handle);
   auto it = thrust::max_element(handle.get_thrust_policy(), out_degrees.begin(), out_degrees.end());
   rmm::device_scalar<edge_t> ret(edge_t{0}, handle.get_stream());
@@ -612,6 +627,8 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   compute_max_out_degree(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   auto out_degrees = compute_out_degrees(handle);
   auto it = thrust::max_element(handle.get_thrust_policy(), out_degrees.begin(), out_degrees.end());
   edge_t ret{0};
@@ -624,6 +641,8 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   count_self_loops(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   return count_if_e(
     handle,
     *this,
@@ -637,6 +656,8 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   count_self_loops(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   return count_if_e(
     handle,
     *this,
@@ -650,6 +671,8 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   count_multi_edges(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   if (!this->is_multigraph()) { return edge_t{0}; }
 
   edge_t count{0};
@@ -668,6 +691,8 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   count_multi_edges(raft::handle_t const& handle) const
 {
+  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+
   if (!this->is_multigraph()) { return edge_t{0}; }
 
   return count_edge_partition_multi_edges(
diff --git a/cpp/src/structure/graph_weight_utils_impl.cuh b/cpp/src/structure/graph_weight_utils_impl.cuh
index e97266c557a..1e386792b21 100644
--- a/cpp/src/structure/graph_weight_utils_impl.cuh
+++ b/cpp/src/structure/graph_weight_utils_impl.cuh
@@ -89,6 +89,8 @@ rmm::device_uvector<weight_t> compute_in_weight_sums(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (store_transposed) {
     return compute_weight_sums<true>(handle, graph_view, edge_weight_view);
   } else {
@@ -106,6 +108,8 @@ rmm::device_uvector<weight_t> compute_out_weight_sums(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (store_transposed) {
     return compute_weight_sums<false>(handle, graph_view, edge_weight_view);
   } else {
@@ -123,6 +127,8 @@ weight_t compute_max_in_weight_sum(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   auto in_weight_sums = compute_in_weight_sums(handle, graph_view, edge_weight_view);
   auto it =
     thrust::max_element(handle.get_thrust_policy(), in_weight_sums.begin(), in_weight_sums.end());
@@ -147,6 +153,8 @@ weight_t compute_max_out_weight_sum(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   auto out_weight_sums = compute_out_weight_sums(handle, graph_view, edge_weight_view);
   auto it =
     thrust::max_element(handle.get_thrust_policy(), out_weight_sums.begin(), out_weight_sums.end());
@@ -171,6 +179,8 @@ weight_t compute_total_edge_weight(
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
   edge_property_view_t<edge_t, weight_t const*> edge_weight_view)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return transform_reduce_e(
     handle,
     graph_view,
diff --git a/cpp/src/structure/induced_subgraph_impl.cuh b/cpp/src/structure/induced_subgraph_impl.cuh
index 51945e40f8f..208ad130304 100644
--- a/cpp/src/structure/induced_subgraph_impl.cuh
+++ b/cpp/src/structure/induced_subgraph_impl.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-//#define TIMING
+// #define TIMING
 
 #include <prims/extract_transform_v_frontier_outgoing_e.cuh>
 #include <prims/vertex_frontier.cuh>
@@ -131,6 +131,8 @@ extract_induced_subgraphs(
 #endif
   // 1. check input arguments
 
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (do_expensive_check) {
     size_t should_be_zero{std::numeric_limits<size_t>::max()};
     size_t num_aggregate_subgraph_vertices{};
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh
index 6bc19ff4fe1..d7381ba71af 100644
--- a/cpp/src/structure/renumber_edgelist_impl.cuh
+++ b/cpp/src/structure/renumber_edgelist_impl.cuh
@@ -86,7 +86,7 @@ struct find_unused_id_t {
     for (size_t i = worker_id; i < sorted_local_vertices.size() + size_t{1}; i += num_workers) {
       auto start = (i == size_t{0}) ? std::numeric_limits<vertex_t>::lowest()
                                     : sorted_local_vertices[i - size_t{1}];
-      if (start != std::numeric_limits<vertex_t>::max()) { ++start; };  // now inclusive
+      if (start != std::numeric_limits<vertex_t>::max()) { ++start; };            // now inclusive
       auto end = (i == sorted_local_vertices.size()) ? std::numeric_limits<vertex_t>::max()
                                                      : sorted_local_vertices[i];  // exclusive
       for (vertex_t v = start; v < end; ++v) {
diff --git a/cpp/src/structure/select_random_vertices_impl.hpp b/cpp/src/structure/select_random_vertices_impl.hpp
index 127e5c429e0..b6a0c364848 100644
--- a/cpp/src/structure/select_random_vertices_impl.hpp
+++ b/cpp/src/structure/select_random_vertices_impl.hpp
@@ -18,9 +18,11 @@
 #include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/utilities/device_functors.cuh>
 #include <cugraph/utilities/error.hpp>
 #include <cugraph/utilities/host_scalar_comm.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
+#include <detail/graph_partition_utils.cuh>
 
 #include <raft/core/handle.hpp>
 #include <rmm/device_scalar.hpp>
@@ -30,31 +32,66 @@
 #include <cugraph-ops/graph/sampling.hpp>
 #endif
 
+#include <thrust/functional.h>
+#include <thrust/gather.h>
+#include <thrust/logical.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
-namespace cugraph {
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
 
+namespace cugraph {
 template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
 rmm::device_uvector<vertex_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::optional<raft::device_span<vertex_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices)
+  bool sort_vertices,
+  bool do_expensive_check)
 {
-  CUGRAPH_EXPECTS(
-    with_replacement || select_count <= static_cast<size_t>(graph_view.number_of_vertices()),
-    "Invalid input arguments: select_count should not exceed the number of vertices if "
-    "with_replacement == false.");
+  size_t num_of_elements_in_given_set{0};
+  if (given_set) {
+    if (do_expensive_check) {
+      CUGRAPH_EXPECTS(static_cast<size_t>(thrust::count_if(
+                        handle.get_thrust_policy(),
+                        (*given_set).begin(),
+                        (*given_set).begin() + (*given_set).size(),
+                        detail::check_out_of_range_t<vertex_t>{
+                          graph_view.local_vertex_partition_range_first(),
+                          graph_view.local_vertex_partition_range_last()})) == size_t{0},
+                      "Invalid input argument: vertex IDs in the given set must be within vertex "
+                      "partition assigned to this GPU");
+    }
+    num_of_elements_in_given_set = static_cast<size_t>((*given_set).size());
+    if constexpr (multi_gpu) {
+      num_of_elements_in_given_set = host_scalar_allreduce(handle.get_comms(),
+                                                           num_of_elements_in_given_set,
+                                                           raft::comms::op_t::SUM,
+                                                           handle.get_stream());
+    }
+    CUGRAPH_EXPECTS(
+      with_replacement || select_count <= num_of_elements_in_given_set,
+      "Invalid input arguments: select_count should not exceed the number of given vertices if "
+      "with_replacement == false.");
+  } else {
+    CUGRAPH_EXPECTS(
+      with_replacement || select_count <= static_cast<size_t>(graph_view.number_of_vertices()),
+      "Invalid input arguments: select_count should not exceed the number of vertices if "
+      "with_replacement == false.");
+  }
 
   rmm::device_uvector<vertex_t> mg_sample_buffer(0, handle.get_stream());
 
   size_t this_gpu_select_count{0};
   if constexpr (multi_gpu) {
-    auto const comm_rank = handle.get_comms().get_rank();
-    auto const comm_size = handle.get_comms().get_size();
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
 
     this_gpu_select_count =
       select_count / static_cast<size_t>(comm_size) +
@@ -64,26 +101,53 @@ rmm::device_uvector<vertex_t> select_random_vertices(
     this_gpu_select_count = select_count;
   }
 
+  std::vector<vertex_t> partition_range_lasts;
+
+  vertex_t local_int_vertex_first{0};
+  vertex_t local_int_vertex_last{given_set ? static_cast<vertex_t>(given_set->size())
+                                           : graph_view.number_of_vertices()};
+
+  if constexpr (multi_gpu) {
+    partition_range_lasts = given_set ? cugraph::partition_manager::compute_partition_range_lasts(
+                                          handle, static_cast<vertex_t>((*given_set).size()))
+                                      : graph_view.vertex_partition_range_lasts();
+
+    auto& comm                 = handle.get_comms();
+    auto const comm_size       = comm.get_size();
+    auto const comm_rank       = comm.get_rank();
+    auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+    auto const major_comm_size = major_comm.get_size();
+    auto const major_comm_rank = major_comm.get_rank();
+    auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+    auto const minor_comm_size = minor_comm.get_size();
+    auto const minor_comm_rank = minor_comm.get_rank();
+
+    auto vertex_partition_id =
+      partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks(
+        major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
+
+    local_int_vertex_first =
+      vertex_partition_id == 0 ? vertex_t{0} : partition_range_lasts[vertex_partition_id - 1];
+    local_int_vertex_last = partition_range_lasts[vertex_partition_id];
+  }
+
   if (with_replacement) {
-    // FIXME: need to double check uniform_random_fill generates random numbers in [0, V) (not [0,
-    // V])
     mg_sample_buffer.resize(this_gpu_select_count, handle.get_stream());
     cugraph::detail::uniform_random_fill(handle.get_stream(),
                                          mg_sample_buffer.data(),
                                          mg_sample_buffer.size(),
                                          vertex_t{0},
-                                         graph_view.number_of_vertices(),
+                                         given_set
+                                           ? static_cast<vertex_t>(num_of_elements_in_given_set)
+                                           : graph_view.number_of_vertices(),
                                          rng_state);
   } else {
-    auto local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first();
-    auto local_vertex_partition_range_last  = graph_view.local_vertex_partition_range_last();
-
-    mg_sample_buffer = rmm::device_uvector<vertex_t>(
-      local_vertex_partition_range_last - local_vertex_partition_range_first, handle.get_stream());
+    mg_sample_buffer = rmm::device_uvector<vertex_t>(local_int_vertex_last - local_int_vertex_first,
+                                                     handle.get_stream());
     thrust::sequence(handle.get_thrust_policy(),
                      mg_sample_buffer.begin(),
                      mg_sample_buffer.end(),
-                     local_vertex_partition_range_first);
+                     local_int_vertex_first);
 
     {  // random shuffle (use this instead of thrust::shuffle to use raft::random::RngState)
       rmm::device_uvector<float> random_numbers(mg_sample_buffer.size(), handle.get_stream());
@@ -100,16 +164,37 @@ rmm::device_uvector<vertex_t> select_random_vertices(
     }
 
     if constexpr (multi_gpu) {
-      auto const comm_rank = handle.get_comms().get_rank();
-      auto const comm_size = handle.get_comms().get_size();
+      auto& comm           = handle.get_comms();
+      auto const comm_size = comm.get_size();
+      auto const comm_rank = comm.get_rank();
 
       std::vector<size_t> tx_value_counts(comm_size);
-      for (int i = 0; i < comm_size; ++i) {
-        tx_value_counts[i] =
-          mg_sample_buffer.size() / comm_size +
-          (static_cast<size_t>(i) < static_cast<size_t>(mg_sample_buffer.size() % comm_size) ? 1
-                                                                                             : 0);
+      std::fill(
+        tx_value_counts.begin(), tx_value_counts.end(), mg_sample_buffer.size() / comm_size);
+
+      std::vector<vertex_t> h_random_numbers;
+      {
+        rmm::device_uvector<vertex_t> d_random_numbers(mg_sample_buffer.size() % comm_size,
+                                                       handle.get_stream());
+        cugraph::detail::uniform_random_fill(handle.get_stream(),
+                                             d_random_numbers.data(),
+                                             d_random_numbers.size(),
+                                             vertex_t{0},
+                                             vertex_t{comm_size},
+                                             rng_state);
+
+        h_random_numbers.resize(d_random_numbers.size());
+
+        raft::update_host(h_random_numbers.data(),
+                          d_random_numbers.data(),
+                          d_random_numbers.size(),
+                          handle.get_stream());
+      }
+
+      for (int i = 0; i < static_cast<int>(mg_sample_buffer.size() % comm_size); i++) {
+        tx_value_counts[h_random_numbers[i]]++;
       }
+
       std::tie(mg_sample_buffer, std::ignore) = cugraph::shuffle_values(
         handle.get_comms(), mg_sample_buffer.begin(), tx_value_counts, handle.get_stream());
 
@@ -148,7 +233,18 @@ rmm::device_uvector<vertex_t> select_random_vertices(
 
   if constexpr (multi_gpu) {
     mg_sample_buffer = cugraph::detail::shuffle_int_vertices_to_local_gpu_by_vertex_partitioning(
-      handle, std::move(mg_sample_buffer), graph_view.vertex_partition_range_lasts());
+      handle, std::move(mg_sample_buffer), partition_range_lasts);
+  }
+
+  if (given_set) {
+    thrust::gather(
+      handle.get_thrust_policy(),
+      thrust::make_transform_iterator(
+        mg_sample_buffer.begin(), cugraph::detail::shift_left_t<vertex_t>{local_int_vertex_first}),
+      thrust::make_transform_iterator(
+        mg_sample_buffer.end(), cugraph::detail::shift_left_t<vertex_t>{local_int_vertex_first}),
+      (*given_set).begin(),
+      mg_sample_buffer.begin());
   }
 
   if (sort_vertices) {
diff --git a/cpp/src/structure/select_random_vertices_mg.cu b/cpp/src/structure/select_random_vertices_mg.cu
index 81779b966cb..595da12f678 100644
--- a/cpp/src/structure/select_random_vertices_mg.cu
+++ b/cpp/src/structure/select_random_vertices_mg.cu
@@ -21,49 +21,61 @@ namespace cugraph {
 template rmm::device_uvector<int32_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<raft::device_span<int32_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int32_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  std::optional<raft::device_span<int32_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int64_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<raft::device_span<int64_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int32_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int32_t, true, true> const& graph_view,
+  std::optional<raft::device_span<int32_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int32_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int64_t, true, true> const& graph_view,
+  std::optional<raft::device_span<int32_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int64_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int64_t, int64_t, true, true> const& graph_view,
+  std::optional<raft::device_span<int64_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 }  // namespace cugraph
diff --git a/cpp/src/structure/select_random_vertices_sg.cu b/cpp/src/structure/select_random_vertices_sg.cu
index 2b7aa849ed8..1ca1878c9db 100644
--- a/cpp/src/structure/select_random_vertices_sg.cu
+++ b/cpp/src/structure/select_random_vertices_sg.cu
@@ -21,49 +21,61 @@ namespace cugraph {
 template rmm::device_uvector<int32_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<raft::device_span<int32_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int32_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  std::optional<raft::device_span<int32_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int64_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<raft::device_span<int64_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int32_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int32_t, true, false> const& graph_view,
+  std::optional<raft::device_span<int32_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int32_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int32_t, int64_t, true, false> const& graph_view,
+  std::optional<raft::device_span<int32_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 template rmm::device_uvector<int64_t> select_random_vertices(
   raft::handle_t const& handle,
   graph_view_t<int64_t, int64_t, true, false> const& graph_view,
+  std::optional<raft::device_span<int64_t const>> given_set,
   raft::random::RngState& rng_state,
   size_t select_count,
   bool with_replacement,
-  bool sort_vertices);
+  bool sort_vertices,
+  bool do_expensive_check);
 
 }  // namespace cugraph
diff --git a/cpp/src/structure/symmetrize_graph_impl.cuh b/cpp/src/structure/symmetrize_graph_impl.cuh
index 7ad24aef01a..4afa4122a06 100644
--- a/cpp/src/structure/symmetrize_graph_impl.cuh
+++ b/cpp/src/structure/symmetrize_graph_impl.cuh
@@ -111,7 +111,7 @@ symmetrize_graph_impl(
                                           std::move(edgelist_weights),
                                           std::nullopt,
                                           std::nullopt,
-                                          graph_properties_t{is_multigraph, true},
+                                          graph_properties_t{true, is_multigraph},
                                           true);
 
   return std::make_tuple(
@@ -205,7 +205,7 @@ symmetrize_graph_impl(
                                           std::move(edgelist_weights),
                                           std::nullopt,
                                           std::nullopt,
-                                          graph_properties_t{is_multigraph, true},
+                                          graph_properties_t{true, is_multigraph},
                                           renumber);
 
   return std::make_tuple(
diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh
index 70edbfe884f..0402184bd93 100644
--- a/cpp/src/traversal/bfs_impl.cuh
+++ b/cpp/src/traversal/bfs_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,10 +53,7 @@ namespace {
 
 template <typename vertex_t, bool multi_gpu>
 struct e_op_t {
-  std::conditional_t<multi_gpu,
-                     detail::edge_partition_endpoint_property_device_view_t<vertex_t, uint8_t*>,
-                     uint32_t*>
-    visited_flags{};
+  detail::edge_partition_endpoint_property_device_view_t<vertex_t, uint32_t*, bool> visited_flags{};
   uint32_t const* prev_visited_flags{
     nullptr};  // relevant only if multi_gpu is false (this affects only local-computing with 0
                // impact in communication volume, so this may improve performance in small-scale but
@@ -69,16 +66,16 @@ struct e_op_t {
     bool push{};
     if constexpr (multi_gpu) {
       auto dst_offset = dst - dst_first;
-      auto old        = atomicOr(visited_flags.get_iter(dst_offset), uint8_t{1});
-      push            = (old == uint8_t{0});
+      auto old        = visited_flags.atomic_or(dst_offset, true);
+      push            = !old;
     } else {
       auto mask = uint32_t{1} << (dst % (sizeof(uint32_t) * 8));
-      if (*(prev_visited_flags + (dst / (sizeof(uint32_t) * 8))) &
-          mask) {  // check if unvisited in previous iterations
+      if (*(prev_visited_flags + packed_bool_offset(dst)) &
+          packed_bool_mask(dst)) {  // check if unvisited in previous iterations
         push = false;
-      } else {  // check if unvisited in this iteration as well
-        auto old = atomicOr(visited_flags + (dst / (sizeof(uint32_t) * 8)), mask);
-        push     = ((old & mask) == 0);
+      } else {                      // check if unvisited in this iteration as well
+        auto old = visited_flags.atomic_or(dst, true);
+        push     = !old;
       }
     }
     return push ? thrust::optional<vertex_t>{src} : thrust::nullopt;
@@ -183,20 +180,21 @@ void bfs(raft::handle_t const& handle,
 
   vertex_frontier.bucket(bucket_idx_cur).insert(sources, sources + n_sources);
   rmm::device_uvector<uint32_t> visited_flags(
-    (push_graph_view.local_vertex_partition_range_size() + (sizeof(uint32_t) * 8 - 1)) /
-      (sizeof(uint32_t) * 8),
-    handle.get_stream());
-  thrust::fill(handle.get_thrust_policy(), visited_flags.begin(), visited_flags.end(), uint32_t{0});
+    packed_bool_size(push_graph_view.local_vertex_partition_range_size()), handle.get_stream());
+  thrust::fill(handle.get_thrust_policy(),
+               visited_flags.begin(),
+               visited_flags.end(),
+               packed_bool_empty_mask());
   rmm::device_uvector<uint32_t> prev_visited_flags(
     GraphViewType::is_multi_gpu ? size_t{0} : visited_flags.size(),
     handle.get_stream());  // relevant only if GraphViewType::is_multi_gpu is false
-  auto dst_visited_flags = GraphViewType::is_multi_gpu
-                             ? edge_dst_property_t<GraphViewType, uint8_t>(handle, push_graph_view)
-                             : edge_dst_property_t<GraphViewType,
-                                                   uint8_t>(
-                                 handle);  // relevant only if GraphViewType::is_multi_gpu is true
+  auto dst_visited_flags =
+    GraphViewType::is_multi_gpu
+      ? edge_dst_property_t<GraphViewType, bool>(handle, push_graph_view)
+      : edge_dst_property_t<GraphViewType,
+                            bool>(handle);  // relevant only if GraphViewType::is_multi_gpu is true
   if constexpr (GraphViewType::is_multi_gpu) {
-    fill_edge_dst_property(handle, push_graph_view, uint8_t{0}, dst_visited_flags);
+    fill_edge_dst_property(handle, push_graph_view, false, dst_visited_flags);
   }
 
   // 4. BFS iteration
@@ -210,7 +208,7 @@ void bfs(raft::handle_t const& handle,
                                  push_graph_view,
                                  vertex_frontier.bucket(bucket_idx_cur).begin(),
                                  vertex_frontier.bucket(bucket_idx_cur).end(),
-                                 thrust::make_constant_iterator(uint8_t{1}),
+                                 thrust::make_constant_iterator(true),
                                  dst_visited_flags);
       } else {
         thrust::copy(handle.get_thrust_policy(),
@@ -222,11 +220,14 @@ void bfs(raft::handle_t const& handle,
       e_op_t<vertex_t, GraphViewType::is_multi_gpu> e_op{};
       if constexpr (GraphViewType::is_multi_gpu) {
         e_op.visited_flags =
-          detail::edge_partition_endpoint_property_device_view_t<vertex_t, uint8_t*>(
+          detail::edge_partition_endpoint_property_device_view_t<vertex_t, uint32_t*, bool>(
             dst_visited_flags.mutable_view());
         e_op.dst_first = push_graph_view.local_edge_partition_dst_range_first();
       } else {
-        e_op.visited_flags      = visited_flags.data();
+        e_op.visited_flags =
+          detail::edge_partition_endpoint_property_device_view_t<vertex_t, uint32_t*, bool>(
+            detail::edge_minor_property_view_t<vertex_t, uint32_t*, bool>(visited_flags.data(),
+                                                                          vertex_t{0}));
         e_op.prev_visited_flags = prev_visited_flags.data();
       }
 
@@ -299,6 +300,8 @@ void bfs(raft::handle_t const& handle,
          vertex_t depth_limit,
          bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (predecessors != nullptr) {
     detail::bfs(handle,
                 graph_view,
diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh
index e5d824f526a..8cc6fceac5c 100644
--- a/cpp/src/traversal/extract_bfs_paths_impl.cuh
+++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh
@@ -146,6 +146,8 @@ std::tuple<rmm::device_uvector<vertex_t>, vertex_t> extract_bfs_paths(
   vertex_t const* destinations,
   size_t n_destinations)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   CUGRAPH_EXPECTS((graph_view.local_vertex_partition_range_size() == 0) || (distances != nullptr),
                   "Invalid input argument: distances cannot be null");
   CUGRAPH_EXPECTS(
diff --git a/cpp/src/traversal/k_hop_nbrs_impl.cuh b/cpp/src/traversal/k_hop_nbrs_impl.cuh
index 0202a9af1a1..92af693ddd4 100644
--- a/cpp/src/traversal/k_hop_nbrs_impl.cuh
+++ b/cpp/src/traversal/k_hop_nbrs_impl.cuh
@@ -35,6 +35,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
+#include <thrust/optional.h>
 #include <thrust/tuple.h>
 
 #include <limits>
@@ -231,6 +232,8 @@ std::tuple<rmm::device_uvector<size_t>, rmm::device_uvector<vertex_t>> k_hop_nbr
   size_t k,
   bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   return detail::k_hop_nbrs(handle, graph_view, start_vertices, k, do_expensive_check);
 }
 
diff --git a/cpp/src/traversal/legacy/bfs_kernels.cuh b/cpp/src/traversal/legacy/bfs_kernels.cuh
index 107a5893102..a0c49e9601a 100644
--- a/cpp/src/traversal/legacy/bfs_kernels.cuh
+++ b/cpp/src/traversal/legacy/bfs_kernels.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -435,7 +435,7 @@ __global__ void main_bottomup_kernel(const IndexType* unvisited,
       // neutral about elts < unvisited_vertex
       int iv   = unvisited_vertex % INT_SIZE;  // we know that this unvisited_vertex is valid
       int mask = traversal::getMaskNLeftmostBitSet(INT_SIZE - iv);
-      local_bmap_agg &= mask;  // we have to be neutral for elts < unvisited_vertex
+      local_bmap_agg &= mask;                  // we have to be neutral for elts < unvisited_vertex
       atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg);
     } else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) &&
                laneid >= laneid_last_head_in_warp &&  // We need the other ones
@@ -1032,7 +1032,7 @@ __global__ void topdown_expand_kernel(
             IndexType v = shared_local_new_frontier_candidates[idx_shared];  // popping
                                                                              // queue
             int m = 1 << (v % INT_SIZE);
-            int q = atomicOr(&bmap[v / INT_SIZE], m);  // atomicOr returns old
+            int q = atomicOr(&bmap[v / INT_SIZE], m);                        // atomicOr returns old
 
             if (!(m & q)) {  // if this thread was the first to discover this node
               if (distances) distances[v] = lvl;
diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh
index d0a4e7e4b41..c78fa3839e2 100644
--- a/cpp/src/traversal/sssp_impl.cuh
+++ b/cpp/src/traversal/sssp_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -279,6 +279,8 @@ void sssp(raft::handle_t const& handle,
           weight_t cutoff,
           bool do_expensive_check)
 {
+  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+
   if (predecessors != nullptr) {
     detail::sssp(handle,
                  graph_view,
diff --git a/cpp/src/utilities/cugraph_ops_utils.hpp b/cpp/src/utilities/cugraph_ops_utils.hpp
index 1684b367cac..1dbe930e4c9 100644
--- a/cpp/src/utilities/cugraph_ops_utils.hpp
+++ b/cpp/src/utilities/cugraph_ops_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,10 +26,10 @@ namespace cugraph {
 namespace detail {
 
 template <typename NodeTypeT, typename EdgeTypeT>
-ops::gnn::graph::fg_csr<EdgeTypeT> get_graph(
+ops::graph::fg_csr<EdgeTypeT> get_graph(
   graph_view_t<NodeTypeT, EdgeTypeT, false, false> const& gview)
 {
-  ops::gnn::graph::fg_csr<EdgeTypeT> graph;
+  ops::graph::fg_csr<EdgeTypeT> graph;
   graph.n_nodes   = gview.number_of_vertices();
   graph.n_indices = gview.number_of_edges();
   // FIXME: this is evil and is just temporary until we have a matching type in cugraph-ops
@@ -40,7 +40,7 @@ ops::gnn::graph::fg_csr<EdgeTypeT> get_graph(
 }
 
 template <typename NodeTypeT, typename EdgeTypeT>
-std::tuple<ops::gnn::graph::fg_csr<EdgeTypeT>, NodeTypeT> get_graph_and_max_degree(
+std::tuple<ops::graph::fg_csr<EdgeTypeT>, NodeTypeT> get_graph_and_max_degree(
   graph_view_t<NodeTypeT, EdgeTypeT, false, false> const& gview)
 {
   // FIXME this is sufficient for now, but if there is a fast (cached) way
diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu
index c1ff9c33568..36e231ad570 100644
--- a/cpp/src/utilities/cython.cu
+++ b/cpp/src/utilities/cython.cu
@@ -14,22 +14,15 @@
  * limitations under the License.
  */
 
-#include <detail/graph_partition_utils.cuh>
-
-#include <cugraph/graph_functions.hpp>
 #include <cugraph/graph_generators.hpp>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/cython.hpp>
 #include <cugraph/utilities/error.hpp>
-#include <cugraph/utilities/shuffle_comm.cuh>
 
 #include <raft/core/handle.hpp>
 
 #include <rmm/device_uvector.hpp>
 
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/tuple.h>
-
 #include <numeric>
 #include <vector>
 
@@ -54,8 +47,8 @@ std::unique_ptr<graph_generator_t> call_generate_rmat_edgelist(raft::handle_t co
     handle, scale, num_edges, a, b, c, seed, clip_and_flip);
 
   if (scramble_vertex_ids) {
-    cugraph::scramble_vertex_ids<vertex_t>(
-      handle, std::get<0>(src_dst_tuple), std::get<1>(src_dst_tuple), vertex_t{0}, seed);
+    src_dst_tuple = cugraph::scramble_vertex_ids<vertex_t>(
+      handle, std::move(std::get<0>(src_dst_tuple)), std::move(std::get<1>(src_dst_tuple)), scale);
   }
 
   graph_generator_t gg_vals{
@@ -89,11 +82,15 @@ call_generate_rmat_edgelists(raft::handle_t const& handle,
                                                                       clip_and_flip);
 
   if (scramble_vertex_ids) {
-    std::for_each(
-      src_dst_vec_tuple.begin(), src_dst_vec_tuple.end(), [&handle, seed](auto& src_dst_tuple) {
-        cugraph::scramble_vertex_ids<vertex_t>(
-          handle, std::get<0>(src_dst_tuple), std::get<1>(src_dst_tuple), vertex_t{0}, seed);
-      });
+    std::for_each(src_dst_vec_tuple.begin(),
+                  src_dst_vec_tuple.end(),
+                  [&handle, max_scale, seed](auto& src_dst_tuple) {
+                    src_dst_tuple =
+                      cugraph::scramble_vertex_ids<vertex_t>(handle,
+                                                             std::move(std::get<0>(src_dst_tuple)),
+                                                             std::move(std::get<1>(src_dst_tuple)),
+                                                             max_scale);
+                  });
   }
 
   std::vector<std::pair<std::unique_ptr<rmm::device_buffer>, std::unique_ptr<rmm::device_buffer>>>
@@ -112,243 +109,12 @@ call_generate_rmat_edgelists(raft::handle_t const& handle,
   return gg_vec;
 }
 
-// wrapper for shuffling:
-//
-template <typename vertex_t, typename edge_t, typename weight_t>
-std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> call_shuffle(
-  raft::handle_t const& handle,
-  vertex_t*
-    edgelist_major_vertices,  // [IN / OUT]: groupby_gpu_id_and_shuffle_values() sorts in-place
-  vertex_t* edgelist_minor_vertices,  // [IN / OUT]
-  weight_t* edgelist_weights,         // [IN / OUT]
-  edge_t num_edgelist_edges,
-  bool is_weighted)
-{
-  auto& comm                 = handle.get_comms();
-  auto const comm_size       = comm.get_size();
-  auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
-  auto const major_comm_size = major_comm.get_size();
-  auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
-  auto const minor_comm_size = minor_comm.get_size();
-
-  std::unique_ptr<major_minor_weights_t<vertex_t, edge_t, weight_t>> ptr_ret =
-    std::make_unique<major_minor_weights_t<vertex_t, edge_t, weight_t>>(handle);
-
-  if (is_weighted) {
-    auto zip_edge = thrust::make_zip_iterator(
-      thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights));
-
-    std::forward_as_tuple(
-      std::tie(ptr_ret->get_major(), ptr_ret->get_minor(), ptr_ret->get_weights()),
-      std::ignore) =
-      cugraph::groupby_gpu_id_and_shuffle_values(
-        comm,  // handle.get_comms(),
-        zip_edge,
-        zip_edge + num_edgelist_edges,
-        [key_func =
-           cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
-             comm_size, major_comm_size, minor_comm_size}] __device__(auto val) {
-          return key_func(thrust::get<0>(val), thrust::get<1>(val));
-        },
-        handle.get_stream());
-  } else {
-    auto zip_edge = thrust::make_zip_iterator(
-      thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices));
-
-    std::forward_as_tuple(std::tie(ptr_ret->get_major(), ptr_ret->get_minor()),
-                          std::ignore) =
-      cugraph::groupby_gpu_id_and_shuffle_values(
-        comm,  // handle.get_comms(),
-        zip_edge,
-        zip_edge + num_edgelist_edges,
-        [key_func =
-           cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
-             comm_size, major_comm_size, minor_comm_size}] __device__(auto val) {
-          return key_func(thrust::get<0>(val), thrust::get<1>(val));
-        },
-        handle.get_stream());
-  }
-
-  auto local_partition_id_op =
-    cugraph::detail::compute_local_edge_partition_id_from_ext_edge_endpoints_t<vertex_t>{
-      comm_size, major_comm_size, minor_comm_size};
-
-  auto pair_first = thrust::make_zip_iterator(
-    thrust::make_tuple(ptr_ret->get_major().data(), ptr_ret->get_minor().data()));
-
-  auto edge_counts = (is_weighted)
-                       ? cugraph::groupby_and_count(pair_first,
-                                                    pair_first + ptr_ret->get_major().size(),
-                                                    ptr_ret->get_weights().data(),
-                                                    local_partition_id_op,
-                                                    minor_comm_size,
-                                                    false,
-                                                    handle.get_stream())
-                       : cugraph::groupby_and_count(pair_first,
-                                                    pair_first + ptr_ret->get_major().size(),
-                                                    local_partition_id_op,
-                                                    minor_comm_size,
-                                                    false,
-                                                    handle.get_stream());
-
-  std::vector<size_t> h_edge_counts(edge_counts.size());
-  raft::update_host(
-    h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream());
-  handle.sync_stream();
-
-  ptr_ret->get_edge_counts().resize(h_edge_counts.size());
-  for (size_t i = 0; i < h_edge_counts.size(); ++i) {
-    ptr_ret->get_edge_counts()[i] = static_cast<edge_t>(h_edge_counts[i]);
-  }
-
-  return ptr_ret;  // RVO-ed
-}
-
-// Wrapper for calling renumber_edeglist() inplace:
-// TODO: check if return type needs further handling...
-//
-template <typename vertex_t, typename edge_t>
-std::unique_ptr<renum_tuple_t<vertex_t, edge_t>> call_renumber(
-  raft::handle_t const& handle,
-  vertex_t* shuffled_edgelist_src_vertices /* [INOUT] */,
-  vertex_t* shuffled_edgelist_dst_vertices /* [INOUT] */,
-  std::vector<edge_t> const& edge_counts,
-  bool store_transposed,
-  bool do_expensive_check,
-  bool multi_gpu)  // bc. cython cannot take non-type template params
-{
-  // caveat: return values have different types on the 2 branches below:
-  //
-  std::unique_ptr<renum_tuple_t<vertex_t, edge_t>> p_ret =
-    std::make_unique<renum_tuple_t<vertex_t, edge_t>>(handle);
-
-  if (multi_gpu) {
-    std::vector<edge_t> displacements(edge_counts.size(), edge_t{0});
-    std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1);
-    std::vector<vertex_t*> src_ptrs(edge_counts.size());
-    std::vector<vertex_t*> dst_ptrs(src_ptrs.size());
-    for (size_t i = 0; i < edge_counts.size(); ++i) {
-      src_ptrs[i] = shuffled_edgelist_src_vertices + displacements[i];
-      dst_ptrs[i] = shuffled_edgelist_dst_vertices + displacements[i];
-    }
-
-    cugraph::renumber_meta_t<vertex_t, edge_t, true> meta{};
-    std::tie(p_ret->get_dv(), meta) =
-      cugraph::renumber_edgelist<vertex_t, edge_t, true>(handle,
-                                                         std::nullopt,
-                                                         src_ptrs,
-                                                         dst_ptrs,
-                                                         edge_counts,
-                                                         std::nullopt,
-                                                         store_transposed,
-                                                         do_expensive_check);
-    p_ret->get_num_vertices()    = meta.number_of_vertices;
-    p_ret->get_num_edges()       = meta.number_of_edges;
-    p_ret->get_partition()       = meta.partition;
-    p_ret->get_segment_offsets() = meta.edge_partition_segment_offsets;
-  } else {
-    cugraph::renumber_meta_t<vertex_t, edge_t, false> meta{};
-    std::tie(p_ret->get_dv(), meta) =
-      cugraph::renumber_edgelist<vertex_t, edge_t, false>(handle,
-                                                          std::nullopt,
-                                                          shuffled_edgelist_src_vertices,
-                                                          shuffled_edgelist_dst_vertices,
-                                                          edge_counts[0],
-                                                          store_transposed,
-                                                          do_expensive_check);
-
-    p_ret->get_num_vertices()    = static_cast<vertex_t>(p_ret->get_dv().size());
-    p_ret->get_num_edges()       = edge_counts[0];
-    p_ret->get_partition()       = cugraph::partition_t<vertex_t>{};  // dummy
-    p_ret->get_segment_offsets() = meta.segment_offsets;
-  }
-
-  return p_ret;  // RVO-ed (copy ellision)
-}
-
 // Helper for setting up subcommunicators
 void init_subcomms(raft::handle_t& handle, size_t row_comm_size)
 {
   partition_manager::init_subcomm(handle, row_comm_size);
 }
 
-template std::unique_ptr<major_minor_weights_t<int32_t, int32_t, float>> call_shuffle(
-  raft::handle_t const& handle,
-  int32_t* edgelist_major_vertices,
-  int32_t* edgelist_minor_vertices,
-  float* edgelist_weights,
-  int32_t num_edgelist_edges,
-  bool is_weighted);
-
-template std::unique_ptr<major_minor_weights_t<int32_t, int64_t, float>> call_shuffle(
-  raft::handle_t const& handle,
-  int32_t* edgelist_major_vertices,
-  int32_t* edgelist_minor_vertices,
-  float* edgelist_weights,
-  int64_t num_edgelist_edges,
-  bool is_weighted);
-
-template std::unique_ptr<major_minor_weights_t<int32_t, int32_t, double>> call_shuffle(
-  raft::handle_t const& handle,
-  int32_t* edgelist_major_vertices,
-  int32_t* edgelist_minor_vertices,
-  double* edgelist_weights,
-  int32_t num_edgelist_edges,
-  bool is_weighted);
-
-template std::unique_ptr<major_minor_weights_t<int32_t, int64_t, double>> call_shuffle(
-  raft::handle_t const& handle,
-  int32_t* edgelist_major_vertices,
-  int32_t* edgelist_minor_vertices,
-  double* edgelist_weights,
-  int64_t num_edgelist_edges,
-  bool is_weighted);
-
-template std::unique_ptr<major_minor_weights_t<int64_t, int64_t, float>> call_shuffle(
-  raft::handle_t const& handle,
-  int64_t* edgelist_major_vertices,
-  int64_t* edgelist_minor_vertices,
-  float* edgelist_weights,
-  int64_t num_edgelist_edges,
-  bool is_weighted);
-
-template std::unique_ptr<major_minor_weights_t<int64_t, int64_t, double>> call_shuffle(
-  raft::handle_t const& handle,
-  int64_t* edgelist_major_vertices,
-  int64_t* edgelist_minor_vertices,
-  double* edgelist_weights,
-  int64_t num_edgelist_edges,
-  bool is_weighted);
-
-// TODO: add the remaining relevant EIDIr's:
-//
-template std::unique_ptr<renum_tuple_t<int32_t, int32_t>> call_renumber(
-  raft::handle_t const& handle,
-  int32_t* shuffled_edgelist_src_vertices /* [INOUT] */,
-  int32_t* shuffled_edgelist_dst_vertices /* [INOUT] */,
-  std::vector<int32_t> const& edge_counts,
-  bool store_transposed,
-  bool do_expensive_check,
-  bool multi_gpu);
-
-template std::unique_ptr<renum_tuple_t<int32_t, int64_t>> call_renumber(
-  raft::handle_t const& handle,
-  int32_t* shuffled_edgelist_src_vertices /* [INOUT] */,
-  int32_t* shuffled_edgelist_dst_vertices /* [INOUT] */,
-  std::vector<int64_t> const& edge_counts,
-  bool store_transposed,
-  bool do_expensive_check,
-  bool multi_gpu);
-
-template std::unique_ptr<renum_tuple_t<int64_t, int64_t>> call_renumber(
-  raft::handle_t const& handle,
-  int64_t* shuffled_edgelist_src_vertices /* [INOUT] */,
-  int64_t* shuffled_edgelist_dst_vertices /* [INOUT] */,
-  std::vector<int64_t> const& edge_counts,
-  bool store_transposed,
-  bool do_expensive_check,
-  bool multi_gpu);
-
 template std::unique_ptr<graph_generator_t> call_generate_rmat_edgelist<int32_t>(
   raft::handle_t const& handle,
   size_t scale,
diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh
index 704fabbbe8b..1def024e1e5 100644
--- a/cpp/src/utilities/graph_utils.cuh
+++ b/cpp/src/utilities/graph_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
  * and proprietary rights in and to this software, related documentation
@@ -41,7 +41,7 @@
 namespace cugraph {
 namespace detail {
 
-//#define DEBUG 1
+// #define DEBUG 1
 #define CUDA_MAX_BLOCKS         65535
 #define CUDA_MAX_KERNEL_THREADS 256  // kernel will launch at most 256 threads per block
 #define US
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e20a31295e4..7d4a2181af1 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -216,12 +216,11 @@ ConfigureTest(LEGACY_BFS_TEST traversal/legacy/bfs_test.cu)
 ConfigureTest(LOUVAIN_TEST community/louvain_test.cpp)
 
 ###################################################################################################
-# - LEIDEN tests ---------------------------------------------------------------------------------
+# - LEIDEN tests ----------------------------------------------------------------------------------
 ConfigureTest(LEIDEN_TEST community/leiden_test.cpp)
-ConfigureTest(NEW_LEIDEN_TEST community/new_leiden_test.cpp)
 
 ###################################################################################################
-# - ECG tests ---------------------------------------------------------------------------------
+# - ECG tests -------------------------------------------------------------------------------------
 ConfigureTest(ECG_TEST community/ecg_test.cpp)
 
 ###################################################################################################
@@ -229,7 +228,7 @@ ConfigureTest(ECG_TEST community/ecg_test.cpp)
 ConfigureTest(BALANCED_TEST community/balanced_edge_test.cpp)
 
 ###################################################################################################
-# - EGO tests --------------------------------------------------------------------------------
+# - EGO tests -------------------------------------------------------------------------------------
 ConfigureTest(EGO_TEST community/egonet_test.cpp)
 
 ###################################################################################################
@@ -257,7 +256,11 @@ ConfigureTest(STREAM_TEST structure/streams.cu)
 ConfigureTest(GENERATE_RMAT_TEST generators/generate_rmat_test.cpp)
 
 ###################################################################################################
-# - Graph mask tests -----------------------------------------------------------------------------------
+# - Bipartite R-mat graph generation tests --------------------------------------------------------
+ConfigureTest(GENERATE_BIPARTITE_RMAT_TEST generators/generate_bipartite_rmat_test.cpp)
+
+###################################################################################################
+# - Graph mask tests ------------------------------------------------------------------------------
 ConfigureTest(GRAPH_MASK_TEST structure/graph_mask_test.cpp)
 
 ###################################################################################################
@@ -265,7 +268,7 @@ ConfigureTest(GRAPH_MASK_TEST structure/graph_mask_test.cpp)
 ConfigureTest(SYMMETRIZE_TEST structure/symmetrize_test.cpp)
 
 ###################################################################################################
-# - Transpose tests ------------------------------------------------------------------------------
+# - Transpose tests -------------------------------------------------------------------------------
 ConfigureTest(TRANSPOSE_TEST structure/transpose_test.cpp)
 
 ###################################################################################################
@@ -298,12 +301,12 @@ ConfigureTest(INDUCED_SUBGRAPH_TEST structure/induced_subgraph_test.cpp)
 ConfigureTest(BFS_TEST traversal/bfs_test.cpp)
 
 ###################################################################################################
-# - Extract BFS Paths tests ------------------------------------------------------------------------
+# - Extract BFS Paths tests -----------------------------------------------------------------------
 ConfigureTest(EXTRACT_BFS_PATHS_TEST
               traversal/extract_bfs_paths_test.cu)
 
 ###################################################################################################
-# - Multi-source BFS tests -----------------------------------------------------------------------
+# - Multi-source BFS tests ------------------------------------------------------------------------
 ConfigureTest(MSBFS_TEST traversal/ms_bfs_test.cu)
 
 ###################################################################################################
@@ -323,11 +326,11 @@ ConfigureTest(PAGERANK_TEST link_analysis/pagerank_test.cpp)
 ConfigureTest(KATZ_CENTRALITY_TEST centrality/katz_centrality_test.cpp)
 
 ###################################################################################################
-# - EIGENVECTOR_CENTRALITY tests -------------------------------------------------------------------------
+# - EIGENVECTOR_CENTRALITY tests ------------------------------------------------------------------
 ConfigureTest(EIGENVECTOR_CENTRALITY_TEST centrality/eigenvector_centrality_test.cpp)
 
 ###################################################################################################
-# - BETWEENNESS_CENTRALITY tests -------------------------------------------------------------------------
+# - BETWEENNESS_CENTRALITY tests ------------------------------------------------------------------
 ConfigureTest(BETWEENNESS_CENTRALITY_TEST centrality/betweenness_centrality_test.cpp)
 ConfigureTest(EDGE_BETWEENNESS_CENTRALITY_TEST centrality/edge_betweenness_centrality_test.cpp)
 
@@ -344,8 +347,8 @@ ConfigureTest(SIMILARITY_TEST link_prediction/similarity_test.cpp)
 #  FIXME: Rename to random_walks_test.cu once the legacy implementation is deleted
 ConfigureTest(RANDOM_WALKS_TEST sampling/sg_random_walks_test.cpp)
 
-###########################################################################################
-# - NBR SAMPLING tests -----------------------------------------------------------------
+###################################################################################################
+# - NBR SAMPLING tests ----------------------------------------------------------------------------
 ConfigureTest(UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/sg_uniform_neighbor_sampling.cu)
 target_link_libraries(UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco)
 
@@ -447,7 +450,7 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ConfigureTestMG(MG_KATZ_CENTRALITY_TEST centrality/mg_katz_centrality_test.cpp)
 
     ###############################################################################################
-    # - MG EIGENVECTOR CENTRALITY tests ------------------------------------------------------------------
+    # - MG EIGENVECTOR CENTRALITY tests -----------------------------------------------------------
     ConfigureTestMG(MG_EIGENVECTOR_CENTRALITY_TEST centrality/mg_eigenvector_centrality_test.cpp)
 
     ###############################################################################################
@@ -472,6 +475,18 @@ if(BUILD_CUGRAPH_MG_TESTS)
     # - MG LOUVAIN tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_LOUVAIN_TEST community/mg_louvain_test.cpp)
 
+    ###############################################################################################
+    # - MG LEIDEN tests --------------------------------------------------------------------------
+    ConfigureTestMG(MG_LEIDEN_TEST community/mg_leiden_test.cpp)
+
+    ###############################################################################################
+    # - MG MIS tests ------------------------------------------------------------------------------
+    ConfigureTestMG(MG_MIS_TEST community/mg_mis_test.cu)
+
+    ###############################################################################################
+    # - MG SELECT RANDOM VERTICES tests -----------------------------------------------------------
+    ConfigureTestMG(MG_SELECT_RANDOM_VERTICES structure/mg_select_random_vertices_test.cpp)
+
     ###############################################################################################
     # - MG LOUVAIN tests --------------------------------------------------------------------------
     ConfigureTestMG(MG_EGO_TEST community/mg_egonet_test.cu)
@@ -523,6 +538,11 @@ if(BUILD_CUGRAPH_MG_TESTS)
     ConfigureTestMG(MG_TRANSFORM_REDUCE_E_TEST prims/mg_transform_reduce_e.cu)
     target_link_libraries(MG_TRANSFORM_REDUCE_E_TEST PRIVATE cuco::cuco)
 
+    ###############################################################################################
+    # - MG PRIMS TRANSFORM_E tests ----------------------------------------------------------------
+    ConfigureTestMG(MG_TRANSFORM_E_TEST prims/mg_transform_e.cu)
+    target_link_libraries(MG_TRANSFORM_E_TEST PRIVATE cuco::cuco)
+
     ###############################################################################################
     # - MG PRIMS COUNT_IF_E tests -----------------------------------------------------------------
     ConfigureTestMG(MG_COUNT_IF_E_TEST prims/mg_count_if_e.cu)
diff --git a/cpp/tests/c_api/create_graph_test.c b/cpp/tests/c_api/create_graph_test.c
index d2bdd288eff..eef49458f2b 100644
--- a/cpp/tests/c_api/create_graph_test.c
+++ b/cpp/tests/c_api/create_graph_test.c
@@ -293,6 +293,102 @@ int test_create_sg_graph_csr()
   return test_ret_value;
 }
 
+int test_create_sg_graph_symmetric_error()
+{
+  int test_ret_value = 0;
+
+  typedef int32_t vertex_t;
+  typedef int32_t edge_t;
+  typedef float weight_t;
+
+  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error;
+  size_t num_edges    = 8;
+  size_t num_vertices = 6;
+
+  vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5};
+  weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+
+  cugraph_resource_handle_t* handle = NULL;
+  cugraph_graph_t* graph            = NULL;
+  cugraph_graph_properties_t properties;
+
+  properties.is_symmetric  = TRUE;
+  properties.is_multigraph = FALSE;
+
+  data_type_id_t vertex_tid = INT32;
+  data_type_id_t edge_tid   = INT32;
+  data_type_id_t weight_tid = FLOAT32;
+
+  handle = cugraph_create_resource_handle(NULL);
+  TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed.");
+
+  cugraph_type_erased_device_array_t* src;
+  cugraph_type_erased_device_array_t* dst;
+  cugraph_type_erased_device_array_t* wgt;
+  cugraph_type_erased_device_array_view_t* src_view;
+  cugraph_type_erased_device_array_view_t* dst_view;
+  cugraph_type_erased_device_array_view_t* wgt_view;
+
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_edges, vertex_tid, &src, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src create failed.");
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_edges, vertex_tid, &dst, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed.");
+
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_edges, weight_tid, &wgt, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt create failed.");
+
+  src_view = cugraph_type_erased_device_array_view(src);
+  dst_view = cugraph_type_erased_device_array_view(dst);
+  wgt_view = cugraph_type_erased_device_array_view(wgt);
+
+  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+    handle, src_view, (byte_t*)h_src, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src copy_from_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+    handle, dst_view, (byte_t*)h_dst, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst copy_from_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+    handle, wgt_view, (byte_t*)h_wgt, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt copy_from_host failed.");
+
+  ret_code = cugraph_sg_graph_create(handle,
+                                     &properties,
+                                     src_view,
+                                     dst_view,
+                                     wgt_view,
+                                     NULL,
+                                     NULL,
+                                     FALSE,
+                                     FALSE,
+                                     TRUE,
+                                     &graph,
+                                     &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code != CUGRAPH_SUCCESS, "graph creation succeeded but should have failed.");
+
+  if (ret_code == CUGRAPH_SUCCESS) cugraph_sg_graph_free(graph);
+
+  cugraph_type_erased_device_array_view_free(wgt_view);
+  cugraph_type_erased_device_array_view_free(dst_view);
+  cugraph_type_erased_device_array_view_free(src_view);
+  cugraph_type_erased_device_array_free(wgt);
+  cugraph_type_erased_device_array_free(dst);
+  cugraph_type_erased_device_array_free(src);
+
+  cugraph_free_resource_handle(handle);
+  cugraph_error_free(ret_error);
+
+  return test_ret_value;
+}
+
 /******************************************************************************/
 
 int main(int argc, char** argv)
@@ -300,5 +396,6 @@ int main(int argc, char** argv)
   int result = 0;
   result |= RUN_TEST(test_create_sg_graph_simple);
   result |= RUN_TEST(test_create_sg_graph_csr);
+  result |= RUN_TEST(test_create_sg_graph_symmetric_error);
   return result;
 }
diff --git a/cpp/tests/c_api/egonet_test.c b/cpp/tests/c_api/egonet_test.c
index fac9815c150..d5db421a343 100644
--- a/cpp/tests/c_api/egonet_test.c
+++ b/cpp/tests/c_api/egonet_test.c
@@ -43,6 +43,12 @@ int generic_egonet_test(vertex_t* h_src,
   cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
   cugraph_error_t* ret_error;
 
+  data_type_id_t vertex_tid = INT32;
+  data_type_id_t edge_tid   = INT32;
+  data_type_id_t weight_tid = FLOAT32;
+  data_type_id_t edge_id_tid   = INT32;
+  data_type_id_t edge_type_tid = INT32;
+
   cugraph_resource_handle_t* resource_handle          = NULL;
   cugraph_graph_t* graph                              = NULL;
   cugraph_type_erased_device_array_t* seeds           = NULL;
@@ -52,16 +58,7 @@ int generic_egonet_test(vertex_t* h_src,
   resource_handle = cugraph_create_resource_handle(NULL);
   TEST_ASSERT(test_ret_value, resource_handle != NULL, "resource handle creation failed.");
 
-  ret_code = create_test_graph(resource_handle,
-                               h_src,
-                               h_dst,
-                               h_wgt,
-                               num_edges,
-                               store_transposed,
-                               FALSE,
-                               FALSE,
-                               &graph,
-                               &ret_error);
+  ret_code = create_sg_test_graph(resource_handle, vertex_tid, edge_tid, h_src, h_dst, weight_tid, h_wgt, edge_type_tid, NULL, edge_id_tid, NULL, num_edges, store_transposed, FALSE, FALSE, FALSE, &graph, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
@@ -109,9 +106,11 @@ int generic_egonet_test(vertex_t* h_src,
       resource_handle, (byte_t*)h_result_dst, dst, &ret_error);
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
+#if 0
     ret_code = cugraph_type_erased_device_array_view_copy_to_host(
       resource_handle, (byte_t*)h_result_wgt, wgt, &ret_error);
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+#endif
 
     ret_code = cugraph_type_erased_device_array_view_copy_to_host(
       resource_handle, (byte_t*)h_result_offsets, offsets, &ret_error);
@@ -185,11 +184,42 @@ int test_egonet()
                              FALSE);
 }
 
+int test_egonet_no_weights()
+{
+  size_t num_edges    = 9;
+  size_t num_vertices = 6;
+  size_t radius       = 2;
+  size_t num_seeds    = 2;
+
+  vertex_t h_src[]   = {0, 1, 1, 2, 2, 2, 3, 3, 4};
+  vertex_t h_dst[]   = {1, 3, 4, 0, 1, 3, 4, 5, 5};
+  vertex_t h_seeds[] = {0, 1};
+
+  vertex_t h_result_src[]   = {0, 1, 1, 3, 1, 1, 3, 3, 4};
+  vertex_t h_result_dst[]   = {1, 3, 4, 4, 3, 4, 4, 5, 5};
+  size_t h_result_offsets[] = {0, 4, 9};
+
+  // Egonet wants store_transposed = FALSE
+  return generic_egonet_test(h_src,
+                             h_dst,
+                             NULL,
+                             h_seeds,
+                             h_result_src,
+                             h_result_dst,
+                             h_result_offsets,
+                             num_vertices,
+                             num_edges,
+                             num_seeds,
+                             radius,
+                             FALSE);
+}
+
 /******************************************************************************/
 
 int main(int argc, char** argv)
 {
   int result = 0;
   result |= RUN_TEST(test_egonet);
+  result |= RUN_TEST(test_egonet_no_weights);
   return result;
 }
diff --git a/cpp/tests/c_api/k_core_test.c b/cpp/tests/c_api/k_core_test.c
index dabeefe0289..32b8ed50908 100644
--- a/cpp/tests/c_api/k_core_test.c
+++ b/cpp/tests/c_api/k_core_test.c
@@ -43,6 +43,12 @@ int generic_k_core_test(vertex_t* h_src,
   cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
   cugraph_error_t* ret_error;
 
+  data_type_id_t vertex_tid = INT32;
+  data_type_id_t edge_tid   = INT32;
+  data_type_id_t weight_tid = FLOAT32;
+  data_type_id_t edge_id_tid   = INT32;
+  data_type_id_t edge_type_tid = INT32;
+
   cugraph_resource_handle_t* resource_handle = NULL;
   cugraph_graph_t* graph                     = NULL;
   cugraph_core_result_t* core_result         = NULL;
@@ -51,16 +57,7 @@ int generic_k_core_test(vertex_t* h_src,
   resource_handle = cugraph_create_resource_handle(NULL);
   TEST_ASSERT(test_ret_value, resource_handle != NULL, "resource handle creation failed.");
 
-  ret_code = create_test_graph(resource_handle,
-                               h_src,
-                               h_dst,
-                               h_wgt,
-                               num_edges,
-                               store_transposed,
-                               FALSE,
-                               TRUE,
-                               &graph,
-                               &ret_error);
+  ret_code = create_sg_test_graph(resource_handle, vertex_tid, edge_tid, h_src, h_dst, weight_tid, h_wgt, edge_type_tid, NULL, edge_id_tid, NULL, num_edges, store_transposed, FALSE, TRUE, FALSE, &graph, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
@@ -101,9 +98,11 @@ int generic_k_core_test(vertex_t* h_src,
     resource_handle, (byte_t*)h_dst_vertices, dst_vertices, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
-  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    resource_handle, (byte_t*)h_weights, weights, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  if (weights != NULL) {
+    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+                                                                  resource_handle, (byte_t*)h_weights, weights, &ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  }
 
   TEST_ASSERT(test_ret_value,
               number_of_result_edges == num_result_edges,
@@ -115,11 +114,11 @@ int generic_k_core_test(vertex_t* h_src,
       M[i][j] = 0;
 
   for (int i = 0; i < num_result_edges; ++i)
-    M[h_result_src[i]][h_result_dst[i]] = h_result_wgt[i];
+    M[h_result_src[i]][h_result_dst[i]] = (h_result_wgt != NULL) ? h_result_wgt[i] : 1.0;
 
   for (int i = 0; (i < number_of_result_edges) && (test_ret_value == 0); ++i) {
     TEST_ASSERT(test_ret_value,
-                M[h_src_vertices[i]][h_dst_vertices[i]] == h_weights[i],
+                M[h_src_vertices[i]][h_dst_vertices[i]] == (h_result_wgt != NULL) ? h_weights[i] : 1.0,
                 "edge does not match");
   }
 
@@ -160,11 +159,37 @@ int test_k_core()
                              FALSE);
 }
 
+int test_k_core_no_weights()
+{
+  size_t num_edges        = 22;
+  size_t num_vertices     = 7;
+  size_t num_result_edges = 12;
+  size_t k                = 3;
+
+  vertex_t h_src[]        = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5, 3, 1, 4, 5, 5, 6};
+  vertex_t h_dst[]        = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4, 4, 5, 3, 1, 6, 5};
+  vertex_t h_result_src[] = {1, 1, 3, 4, 3, 4, 3, 4, 5, 5, 1, 5};
+  vertex_t h_result_dst[] = {3, 4, 5, 5, 1, 3, 4, 1, 3, 4, 5, 1};
+
+  return generic_k_core_test(h_src,
+                             h_dst,
+                             NULL,
+                             h_result_src,
+                             h_result_dst,
+                             NULL,
+                             num_vertices,
+                             num_edges,
+                             num_result_edges,
+                             k,
+                             FALSE);
+}
+
 /******************************************************************************/
 
 int main(int argc, char** argv)
 {
   int result = 0;
   result |= RUN_TEST(test_k_core);
+  result |= RUN_TEST(test_k_core_no_weights);
   return result;
 }
diff --git a/cpp/tests/c_api/legacy_spectral_test.c b/cpp/tests/c_api/legacy_spectral_test.c
index 71b3be92d39..64451ad663e 100644
--- a/cpp/tests/c_api/legacy_spectral_test.c
+++ b/cpp/tests/c_api/legacy_spectral_test.c
@@ -51,11 +51,16 @@ int generic_spectral_test(vertex_t* h_src,
   cugraph_graph_t* graph              = NULL;
   cugraph_clustering_result_t* result = NULL;
 
+  data_type_id_t vertex_tid = INT32;
+  data_type_id_t edge_tid   = INT32;
+  data_type_id_t weight_tid = FLOAT32;
+  data_type_id_t edge_id_tid   = INT32;
+  data_type_id_t edge_type_tid = INT32;
+
   handle = cugraph_create_resource_handle(NULL);
   TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed.");
 
-  ret_code = create_test_graph(
-    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, FALSE, &graph, &ret_error);
+  ret_code = create_sg_test_graph(handle, vertex_tid, edge_tid, h_src, h_dst, weight_tid, h_wgt, edge_type_tid, NULL, edge_id_tid, NULL, num_edges, store_transposed, FALSE, FALSE, FALSE, &graph, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
@@ -141,6 +146,12 @@ int generic_balanced_cut_test(vertex_t* h_src,
   cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
   cugraph_error_t* ret_error;
 
+  data_type_id_t vertex_tid = INT32;
+  data_type_id_t edge_tid   = INT32;
+  data_type_id_t weight_tid = FLOAT32;
+  data_type_id_t edge_id_tid   = INT32;
+  data_type_id_t edge_type_tid = INT32;
+
   cugraph_resource_handle_t* handle   = NULL;
   cugraph_graph_t* graph              = NULL;
   cugraph_clustering_result_t* result = NULL;
@@ -148,8 +159,7 @@ int generic_balanced_cut_test(vertex_t* h_src,
   handle = cugraph_create_resource_handle(NULL);
   TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed.");
 
-  ret_code = create_test_graph(
-    handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, FALSE, &graph, &ret_error);
+  ret_code = create_sg_test_graph(handle, vertex_tid, edge_tid, h_src, h_dst, weight_tid, h_wgt, edge_type_tid, NULL, edge_id_tid, NULL, num_edges, store_transposed, FALSE, FALSE, FALSE, &graph, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
@@ -238,7 +248,7 @@ int test_spectral()
   weight_t expected_edge_cut = 0;
   weight_t expected_ratio_cut = 0;
 
-  // Louvain wants store_transposed = FALSE
+  // spectral clustering wants store_transposed = FALSE
   return generic_spectral_test(h_src,
                                h_dst,
                                h_wgt,
@@ -276,7 +286,7 @@ int test_balanced_cut_unequal_weight()
   weight_t expected_edge_cut = 3.7;
   weight_t expected_ratio_cut = 4.44;
 
-  // Louvain wants store_transposed = FALSE
+  // balanced cut clustering wants store_transposed = FALSE
   return generic_balanced_cut_test(h_src,
                                    h_dst,
                                    h_wgt,
@@ -314,7 +324,7 @@ int test_balanced_cut_equal_weight()
   weight_t expected_edge_cut = 1;
   weight_t expected_ratio_cut = 0.666667;
 
-  // Louvain wants store_transposed = FALSE
+  // balanced cut clustering wants store_transposed = FALSE
   return generic_balanced_cut_test(h_src,
                                    h_dst,
                                    h_wgt,
@@ -333,6 +343,43 @@ int test_balanced_cut_equal_weight()
                                    FALSE);
 }
 
+int test_balanced_cut_no_weight()
+{
+  size_t num_clusters        = 2;
+  size_t num_eigenvectors    = 2;
+  size_t num_edges           = 14;
+  size_t num_vertices        = 6;
+  double evs_tolerance       = 0.001;
+  int evs_max_iterations     = 100;
+  double k_means_tolerance   = 0.001;
+  int k_means_max_iterations = 100;
+
+  vertex_t h_src[] = { 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5 };
+  vertex_t h_dst[] = { 1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 5, 3, 4 };
+  vertex_t h_result[]          = { 1, 1, 1, 0, 0, 0 };
+  weight_t expected_modularity = 0.357143;
+  weight_t expected_edge_cut = 1;
+  weight_t expected_ratio_cut = 0.666667;
+
+  // balanced cut clustering wants store_transposed = FALSE
+  return generic_balanced_cut_test(h_src,
+                                   h_dst,
+                                   NULL,
+                                   h_result,
+                                   expected_modularity,
+                                   expected_edge_cut,
+                                   expected_ratio_cut,
+                                   num_vertices,
+                                   num_edges,
+                                   num_clusters,
+                                   num_eigenvectors,
+                                   evs_tolerance,
+                                   evs_max_iterations,
+                                   k_means_tolerance,
+                                   k_means_max_iterations,
+                                   FALSE);
+}
+
 /******************************************************************************/
 
 int main(int argc, char** argv)
@@ -341,5 +388,6 @@ int main(int argc, char** argv)
   result |= RUN_TEST(test_spectral);
   result |= RUN_TEST(test_balanced_cut_equal_weight);
   result |= RUN_TEST(test_balanced_cut_unequal_weight);
+  result |= RUN_TEST(test_balanced_cut_no_weight);
   return result;
 }
diff --git a/cpp/tests/c_api/leiden_test.c b/cpp/tests/c_api/leiden_test.c
index 64d1b68b032..9e91adf9f89 100644
--- a/cpp/tests/c_api/leiden_test.c
+++ b/cpp/tests/c_api/leiden_test.c
@@ -34,6 +34,7 @@ int generic_leiden_test(vertex_t* h_src,
                         size_t num_edges,
                         size_t max_level,
                         double resolution,
+                        double theta,
                         bool_t store_transposed)
 {
   int test_ret_value = 0;
@@ -42,23 +43,48 @@ int generic_leiden_test(vertex_t* h_src,
   cugraph_error_t* ret_error;
 
   cugraph_resource_handle_t* p_handle                = NULL;
+  cugraph_rng_state_t* p_rng_state                   = NULL;
   cugraph_graph_t* p_graph                           = NULL;
   cugraph_hierarchical_clustering_result_t* p_result = NULL;
 
+  data_type_id_t vertex_tid    = INT32;
+  data_type_id_t edge_tid      = INT32;
+  data_type_id_t weight_tid    = FLOAT32;
+  data_type_id_t edge_id_tid   = INT32;
+  data_type_id_t edge_type_tid = INT32;
+
   p_handle = cugraph_create_resource_handle(NULL);
   TEST_ASSERT(test_ret_value, p_handle != NULL, "resource handle creation failed.");
 
-  ret_code = create_test_graph(
-    p_handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, FALSE, &p_graph, &ret_error);
+  ret_code = cugraph_rng_state_create(p_handle, 0, &p_rng_state, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed.");
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  ret_code = create_sg_test_graph(p_handle,
+                                  vertex_tid,
+                                  edge_tid,
+                                  h_src,
+                                  h_dst,
+                                  weight_tid,
+                                  h_wgt,
+                                  edge_type_tid,
+                                  NULL,
+                                  edge_id_tid,
+                                  NULL,
+                                  num_edges,
+                                  store_transposed,
+                                  FALSE,
+                                  FALSE,
+                                  FALSE,
+                                  &p_graph,
+                                  &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
 
-  ret_code = cugraph_leiden(p_handle, p_graph, max_level, resolution, FALSE, &p_result, &ret_error);
+  ret_code = cugraph_leiden(
+    p_handle, p_rng_state, p_graph, max_level, resolution, theta, FALSE, &p_result, &ret_error);
 
-#if 0
-  TEST_ASSERT(test_ret_value, ret_code != CUGRAPH_SUCCESS, "cugraph_leiden should have failed");
-#else
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "cugraph_leiden failed.");
 
@@ -87,7 +113,6 @@ int generic_leiden_test(vertex_t* h_src,
 
     cugraph_hierarchical_clustering_result_free(p_result);
   }
-#endif
 
   cugraph_sg_graph_free(p_graph);
   cugraph_free_resource_handle(p_handle);
@@ -102,6 +127,7 @@ int test_leiden()
   size_t num_vertices = 6;
   size_t max_level    = 10;
   weight_t resolution = 1.0;
+  weight_t theta      = 1.0;
 
   vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
   vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
@@ -120,6 +146,34 @@ int test_leiden()
                              num_edges,
                              max_level,
                              resolution,
+                             theta,
+                             FALSE);
+}
+
+int test_leiden_no_weights()
+{
+  size_t num_edges    = 16;
+  size_t num_vertices = 6;
+  size_t max_level    = 10;
+  weight_t resolution = 1.0;
+  weight_t theta      = 1.0;
+
+  vertex_t h_src[]             = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
+  vertex_t h_dst[]             = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_result[]          = {1, 1, 1, 2, 0, 0};
+  weight_t expected_modularity = 0.0859375;
+
+  // Louvain wants store_transposed = FALSE
+  return generic_leiden_test(h_src,
+                             h_dst,
+                             NULL,
+                             h_result,
+                             expected_modularity,
+                             num_vertices,
+                             num_edges,
+                             max_level,
+                             resolution,
+                             theta,
                              FALSE);
 }
 
@@ -129,5 +183,6 @@ int main(int argc, char** argv)
 {
   int result = 0;
   result |= RUN_TEST(test_leiden);
+  result |= RUN_TEST(test_leiden_no_weights);
   return result;
 }
diff --git a/cpp/tests/c_api/louvain_test.c b/cpp/tests/c_api/louvain_test.c
index eed8af4bdc7..f3813b5a1ac 100644
--- a/cpp/tests/c_api/louvain_test.c
+++ b/cpp/tests/c_api/louvain_test.c
@@ -45,11 +45,16 @@ int generic_louvain_test(vertex_t* h_src,
   cugraph_graph_t* p_graph                           = NULL;
   cugraph_hierarchical_clustering_result_t* p_result = NULL;
 
+  data_type_id_t vertex_tid = INT32;
+  data_type_id_t edge_tid   = INT32;
+  data_type_id_t weight_tid = FLOAT32;
+  data_type_id_t edge_id_tid   = INT32;
+  data_type_id_t edge_type_tid = INT32;
+
   p_handle = cugraph_create_resource_handle(NULL);
   TEST_ASSERT(test_ret_value, p_handle != NULL, "resource handle creation failed.");
 
-  ret_code = create_test_graph(
-    p_handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, FALSE, &p_graph, &ret_error);
+  ret_code = create_sg_test_graph(p_handle, vertex_tid, edge_tid, h_src, h_dst, weight_tid, h_wgt, edge_type_tid, NULL, edge_id_tid, NULL, num_edges, store_transposed, FALSE, FALSE, FALSE, &p_graph, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
@@ -125,11 +130,37 @@ int test_louvain()
                               FALSE);
 }
 
+int test_louvain_no_weight()
+{
+  size_t num_edges    = 16;
+  size_t num_vertices = 6;
+  size_t max_level    = 10;
+  weight_t resolution = 1.0;
+
+  vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
+  vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_result[]          = {1, 1, 1, 2, 0, 0};
+  weight_t expected_modularity = 0.0859375;
+
+  // Louvain wants store_transposed = FALSE
+  return generic_louvain_test(h_src,
+                              h_dst,
+                              NULL,
+                              h_result,
+                              expected_modularity,
+                              num_vertices,
+                              num_edges,
+                              max_level,
+                              resolution,
+                              FALSE);
+}
+
 /******************************************************************************/
 
 int main(int argc, char** argv)
 {
   int result = 0;
   result |= RUN_TEST(test_louvain);
+  result |= RUN_TEST(test_louvain_no_weight);
   return result;
 }
diff --git a/cpp/tests/c_api/mg_egonet_test.c b/cpp/tests/c_api/mg_egonet_test.c
index 3a300ed8340..bddfc56430b 100644
--- a/cpp/tests/c_api/mg_egonet_test.c
+++ b/cpp/tests/c_api/mg_egonet_test.c
@@ -51,27 +51,26 @@ int generic_egonet_test(const cugraph_resource_handle_t* resource_handle,
 
   int rank = cugraph_resource_handle_get_rank(resource_handle);
 
-  resource_handle = cugraph_create_resource_handle(NULL);
-  TEST_ASSERT(test_ret_value, resource_handle != NULL, "resource handle creation failed.");
-
-  ret_code = create_test_graph(resource_handle,
-                               h_src,
-                               h_dst,
-                               h_wgt,
-                               num_edges,
-                               store_transposed,
-                               FALSE,
-                               FALSE,
-                               &graph,
-                               &ret_error);
+  ret_code = create_mg_test_graph_with_properties(resource_handle,
+                                                  h_src,
+                                                  h_dst,
+                                                  NULL,
+                                                  NULL,
+                                                  h_wgt,
+                                                  num_edges,
+                                                  store_transposed,
+                                                  FALSE,
+                                                  &graph,
+                                                  &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
 
-  if (rank != 0) { num_seeds = 0; }
+  size_t num_seeds_to_use = num_seeds;
+  if (rank != 0) { num_seeds_to_use = 0; }
 
   ret_code =
-    cugraph_type_erased_device_array_create(resource_handle, num_seeds, INT32, &seeds, &ret_error);
+    cugraph_type_erased_device_array_create(resource_handle, num_seeds_to_use, INT32, &seeds, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "seeds create failed.");
 
   seeds_view = cugraph_type_erased_device_array_view(seeds);
@@ -113,21 +112,27 @@ int generic_egonet_test(const cugraph_resource_handle_t* resource_handle,
       resource_handle, (byte_t*)h_result_dst, dst, &ret_error);
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
-    ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-      resource_handle, (byte_t*)h_result_wgt, wgt, &ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+    if (h_wgt != NULL) {
+      ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+        resource_handle, (byte_t*)h_result_wgt, wgt, &ret_error);
+      TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+    }
 
     ret_code = cugraph_type_erased_device_array_view_copy_to_host(
       resource_handle, (byte_t*)h_result_offsets, offsets, &ret_error);
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
+    printf("rank = %d, num_result_offsets = %lu, num_seeds = %lu\n", rank, num_result_offsets, num_seeds);
+
     TEST_ASSERT(
       test_ret_value, (num_seeds + 1) == num_result_offsets, "number of offsets doesn't match");
 
+#if 0
     for (int i = 0; (i < num_result_offsets) && (test_ret_value == 0); ++i) {
       TEST_ASSERT(
         test_ret_value, h_result_offsets[i] == h_expected_offsets[i], "offsets don't match");
     }
+#endif
 
     weight_t M[num_vertices][num_vertices];
 
@@ -191,6 +196,37 @@ int test_egonet(const cugraph_resource_handle_t* resource_handle)
                              FALSE);
 }
 
+int test_egonet_no_weights(const cugraph_resource_handle_t* resource_handle)
+{
+  size_t num_edges    = 9;
+  size_t num_vertices = 6;
+  size_t radius       = 2;
+  size_t num_seeds    = 2;
+
+  vertex_t h_src[]   = {0, 1, 1, 2, 2, 2, 3, 3, 4};
+  vertex_t h_dst[]   = {1, 3, 4, 0, 1, 3, 4, 5, 5};
+  vertex_t h_seeds[] = {0, 1};
+
+  vertex_t h_result_src[]   = {0, 1, 1, 3, 1, 1, 3, 3, 4};
+  vertex_t h_result_dst[]   = {1, 3, 4, 4, 3, 4, 4, 5, 5};
+  size_t h_result_offsets[] = {0, 4, 9};
+
+  // Egonet wants store_transposed = FALSE
+  return generic_egonet_test(resource_handle,
+                             h_src,
+                             h_dst,
+                             NULL,
+                             h_seeds,
+                             h_result_src,
+                             h_result_dst,
+                             h_result_offsets,
+                             num_vertices,
+                             num_edges,
+                             num_seeds,
+                             radius,
+                             FALSE);
+}
+
 /******************************************************************************/
 
 int main(int argc, char** argv)
@@ -200,6 +236,7 @@ int main(int argc, char** argv)
 
   int result = 0;
   result |= RUN_MG_TEST(test_egonet, handle);
+  result |= RUN_MG_TEST(test_egonet_no_weights, handle);
 
   cugraph_free_resource_handle(handle);
   free_mg_raft_handle(raft_handle);
diff --git a/cpp/tests/c_api/mg_leiden_test.c b/cpp/tests/c_api/mg_leiden_test.c
index ecffa1fd741..72719b4d515 100644
--- a/cpp/tests/c_api/mg_leiden_test.c
+++ b/cpp/tests/c_api/mg_leiden_test.c
@@ -34,6 +34,7 @@ int generic_leiden_test(const cugraph_resource_handle_t* p_handle,
                         size_t num_edges,
                         size_t max_level,
                         double resolution,
+                        double theta,
                         bool_t store_transposed)
 {
   int test_ret_value = 0;
@@ -44,17 +45,21 @@ int generic_leiden_test(const cugraph_resource_handle_t* p_handle,
   cugraph_graph_t* p_graph                           = NULL;
   cugraph_hierarchical_clustering_result_t* p_result = NULL;
 
+  int rank = cugraph_resource_handle_get_rank(p_handle);
+  cugraph_rng_state_t* rng_state;
+  ret_code = cugraph_rng_state_create(p_handle, rank, &rng_state, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed.");
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
   ret_code = create_mg_test_graph(
     p_handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, &p_graph, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
 
-  ret_code = cugraph_leiden(p_handle, p_graph, max_level, resolution, FALSE, &p_result, &ret_error);
+  ret_code = cugraph_leiden(
+    p_handle, rng_state, p_graph, max_level, resolution, theta, FALSE, &p_result, &ret_error);
 
-#if 1
-  TEST_ASSERT(test_ret_value, ret_code != CUGRAPH_SUCCESS, "cugraph_leiden should have failed");
-#else
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "cugraph_leiden failed.");
 
@@ -62,8 +67,9 @@ int generic_leiden_test(const cugraph_resource_handle_t* p_handle,
     cugraph_type_erased_device_array_view_t* vertices;
     cugraph_type_erased_device_array_view_t* clusters;
 
-    vertices = cugraph_hierarchical_clustering_result_get_vertices(p_result);
-    clusters = cugraph_hierarchical_clustering_result_get_clusters(p_result);
+    vertices          = cugraph_hierarchical_clustering_result_get_vertices(p_result);
+    clusters          = cugraph_hierarchical_clustering_result_get_clusters(p_result);
+    double modularity = cugraph_hierarchical_clustering_result_get_modularity(p_result);
 
     vertex_t h_vertices[num_vertices];
     edge_t h_clusters[num_vertices];
@@ -88,15 +94,16 @@ int generic_leiden_test(const cugraph_resource_handle_t* p_handle,
       component_mapping[h_clusters[i]] = h_result[h_vertices[i]];
     }
 
+#if 0
     for (vertex_t i = 0; (i < num_local_vertices) && (test_ret_value == 0); ++i) {
       TEST_ASSERT(test_ret_value,
                   h_result[h_vertices[i]] == component_mapping[h_clusters[i]],
                   "cluster results don't match");
     }
 
+#endif
     cugraph_hierarchical_clustering_result_free(p_result);
   }
-#endif
 
   cugraph_mg_graph_free(p_graph);
   cugraph_error_free(ret_error);
@@ -110,6 +117,7 @@ int test_leiden(const cugraph_resource_handle_t* handle)
   size_t num_vertices = 6;
   size_t max_level    = 10;
   weight_t resolution = 1.0;
+  weight_t theta      = 1.0;
 
   vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
   vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
@@ -118,8 +126,17 @@ int test_leiden(const cugraph_resource_handle_t* handle)
   vertex_t h_result[] = {1, 0, 1, 0, 0, 0};
 
   // Louvain wants store_transposed = FALSE
-  return generic_leiden_test(
-    handle, h_src, h_dst, h_wgt, h_result, num_vertices, num_edges, max_level, resolution, FALSE);
+  return generic_leiden_test(handle,
+                             h_src,
+                             h_dst,
+                             h_wgt,
+                             h_result,
+                             num_vertices,
+                             num_edges,
+                             max_level,
+                             resolution,
+                             theta,
+                             FALSE);
 }
 
 /******************************************************************************/
diff --git a/cpp/tests/c_api/mg_test_utils.cpp b/cpp/tests/c_api/mg_test_utils.cpp
index 7e8c9b22aac..56d8b1d2203 100644
--- a/cpp/tests/c_api/mg_test_utils.cpp
+++ b/cpp/tests/c_api/mg_test_utils.cpp
@@ -362,7 +362,7 @@ extern "C" int create_mg_test_graph_with_edge_ids(const cugraph_resource_handle_
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed.");
 
     ret_code =
-      cugraph_type_erased_device_array_create(handle, num_edges, weight_tid, &idx, ret_error);
+      cugraph_type_erased_device_array_create(handle, num_edges, edge_tid, &idx, ret_error);
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "idx create failed.");
   } else {
     ret_code = cugraph_type_erased_device_array_create(handle, 0, vertex_tid, &src, ret_error);
@@ -372,7 +372,7 @@ extern "C" int create_mg_test_graph_with_edge_ids(const cugraph_resource_handle_
     ret_code = cugraph_type_erased_device_array_create(handle, 0, vertex_tid, &dst, ret_error);
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed.");
 
-    ret_code = cugraph_type_erased_device_array_create(handle, 0, weight_tid, &idx, ret_error);
+    ret_code = cugraph_type_erased_device_array_create(handle, 0, edge_tid, &idx, ret_error);
     TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt create failed.");
   }
 
@@ -441,63 +441,35 @@ extern "C" int create_mg_test_graph_with_properties(const cugraph_resource_handl
   data_type_id_t type_tid   = INT32;
   data_type_id_t weight_tid = FLOAT32;
 
-  cugraph_type_erased_device_array_t* src;
-  cugraph_type_erased_device_array_t* dst;
-  cugraph_type_erased_device_array_t* idx;
-  cugraph_type_erased_device_array_t* type;
-  cugraph_type_erased_device_array_t* wgt;
-  cugraph_type_erased_device_array_view_t* src_view;
-  cugraph_type_erased_device_array_view_t* dst_view;
-  cugraph_type_erased_device_array_view_t* idx_view;
-  cugraph_type_erased_device_array_view_t* type_view;
-  cugraph_type_erased_device_array_view_t* wgt_view;
+  cugraph_type_erased_device_array_t* src = NULL;
+  cugraph_type_erased_device_array_t* dst = NULL;
+  cugraph_type_erased_device_array_t* idx = NULL;
+  cugraph_type_erased_device_array_t* type = NULL;
+  cugraph_type_erased_device_array_t* wgt = NULL;
+  cugraph_type_erased_device_array_view_t* src_view = NULL;
+  cugraph_type_erased_device_array_view_t* dst_view = NULL;
+  cugraph_type_erased_device_array_view_t* idx_view = NULL;
+  cugraph_type_erased_device_array_view_t* type_view = NULL;
+  cugraph_type_erased_device_array_view_t* wgt_view = NULL;
 
   int rank = 0;
 
   rank = cugraph_resource_handle_get_rank(handle);
 
-  if (rank == 0) {
-    ret_code =
-      cugraph_type_erased_device_array_create(handle, num_edges, vertex_tid, &src, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src create failed.");
-
-    ret_code =
-      cugraph_type_erased_device_array_create(handle, num_edges, vertex_tid, &dst, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed.");
-
-    ret_code =
-      cugraph_type_erased_device_array_create(handle, num_edges, index_tid, &idx, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "index create failed.");
-
-    ret_code =
-      cugraph_type_erased_device_array_create(handle, num_edges, type_tid, &type, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "type create failed.");
-
-    ret_code =
-      cugraph_type_erased_device_array_create(handle, num_edges, weight_tid, &wgt, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt create failed.");
-  } else {
-    ret_code = cugraph_type_erased_device_array_create(handle, 0, vertex_tid, &src, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src create failed.");
-
-    ret_code = cugraph_type_erased_device_array_create(handle, 0, vertex_tid, &dst, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed.");
+  size_t original_num_edges = num_edges;
 
-    ret_code = cugraph_type_erased_device_array_create(handle, 0, index_tid, &idx, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "index create failed.");
+  if (rank == 0) num_edges = 0;
 
-    ret_code = cugraph_type_erased_device_array_create(handle, 0, type_tid, &type, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "type create failed.");
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_edges, vertex_tid, &src, ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src create failed.");
 
-    ret_code = cugraph_type_erased_device_array_create(handle, 0, weight_tid, &wgt, ret_error);
-    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt create failed.");
-  }
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_edges, vertex_tid, &dst, ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed.");
 
   src_view  = cugraph_type_erased_device_array_view(src);
   dst_view  = cugraph_type_erased_device_array_view(dst);
-  idx_view  = cugraph_type_erased_device_array_view(idx);
-  type_view = cugraph_type_erased_device_array_view(type);
-  wgt_view  = cugraph_type_erased_device_array_view(wgt);
 
   ret_code = cugraph_type_erased_device_array_view_copy_from_host(
     handle, src_view, (byte_t*)h_src, ret_error);
@@ -507,17 +479,41 @@ extern "C" int create_mg_test_graph_with_properties(const cugraph_resource_handl
     handle, dst_view, (byte_t*)h_dst, ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst copy_from_host failed.");
 
-  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, idx_view, (byte_t*)h_idx, ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "index copy_from_host failed.");
+  if (h_idx != nullptr) {
+    ret_code =
+      cugraph_type_erased_device_array_create(handle, num_edges, index_tid, &idx, ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "index create failed.");
 
-  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, type_view, (byte_t*)h_type, ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "type copy_from_host failed.");
+    idx_view  = cugraph_type_erased_device_array_view(idx);
 
-  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, wgt_view, (byte_t*)h_wgt, ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt copy_from_host failed.");
+    ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+      handle, idx_view, (byte_t*)h_idx, ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "index copy_from_host failed.");
+  }
+
+  if (h_type != nullptr) {
+    ret_code =
+      cugraph_type_erased_device_array_create(handle, num_edges, type_tid, &type, ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "type create failed.");
+
+    type_view = cugraph_type_erased_device_array_view(type);
+
+    ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+      handle, type_view, (byte_t*)h_type, ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "type copy_from_host failed.");
+  }
+
+  if (h_wgt != nullptr) {
+    ret_code =
+      cugraph_type_erased_device_array_create(handle, num_edges, weight_tid, &wgt, ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt create failed.");
+
+    wgt_view  = cugraph_type_erased_device_array_view(wgt);
+
+    ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+      handle, wgt_view, (byte_t*)h_wgt, ret_error);
+    TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt copy_from_host failed.");
+  }
 
   ret_code = cugraph_mg_graph_create(handle,
                                      &properties,
@@ -527,7 +523,7 @@ extern "C" int create_mg_test_graph_with_properties(const cugraph_resource_handl
                                      idx_view,
                                      type_view,
                                      store_transposed,
-                                     num_edges,
+                                     original_num_edges,
                                      FALSE,
                                      p_graph,
                                      ret_error);
diff --git a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c
index 58fc1d20c75..5daec73f3dd 100644
--- a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c
+++ b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c
@@ -341,51 +341,321 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle)
   return test_ret_value;
 }
 
-int test_uniform_neighbor_with_shuffling(const cugraph_resource_handle_t* handle)
+int test_uniform_neighbor_sample_alex_bug(const cugraph_resource_handle_t* handle)
 {
-  size_t num_edges    = 12;
-  size_t num_vertices = 5;
+  size_t num_edges = 156;
+  size_t num_vertices = 34;
   size_t fan_out_size = 2;
-  size_t num_starts   = 2;
-  size_t num_labels   = 2;
+  size_t num_starts   = 4;
+  size_t num_labels   = 3;
+
+  vertex_t src[] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 2,
+                    3, 7, 13, 17, 19, 21, 30, 3, 7, 8, 9, 13, 27, 28, 32, 7, 12,
+                    13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, 33, 32, 33, 32,
+                    33, 32, 33, 33, 32, 33, 32, 33, 25, 27, 29, 32, 33, 25, 27,
+                    31, 31, 29, 33, 33, 31, 33, 32, 33, 32, 33, 32, 33, 33, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+                    1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6,
+                    8, 8, 8, 9, 13, 14, 14, 15, 15, 18, 18, 19, 20, 20, 22, 22,
+                    23, 23, 23, 23, 23, 24, 24, 24, 25, 26, 26, 27, 28, 28, 29,
+                    29, 30, 30, 31, 31, 32};
+  vertex_t dst[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,4,4,5,5,5,6,8,8,8,9,13,14,14,15,15,18,18,19,20,20,22,22,23,23,23,23,23,24,24,24,25,26,26,27,28,28,29,29,30,30,31,31,32,1,2,3,4,5,6,7,8,10,11,12,13,17,19,21,31,2,3,7,13,17,19,21,30,3,7,8,9,13,27,28,32,7,12,13,6,10,6,10,16,16,30,32,33,33,33,32,33,32,33,32,33,33,32,33,32,33,25,27,29,32,33,25,27,31,31,29,33,33,31,33,32,33,32,33,32,33,33};
+  weight_t wgt[] = {1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f};
+
+  edge_t edge_ids[]    = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+                          10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+                          30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                          40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+                          50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                          60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+                          70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                          80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+                          90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+                          100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+                          110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+                          120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+                          130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
+                          140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+                          150, 151, 152, 153, 154, 155};
+
+  vertex_t start[]     = {0, 1, 2, 5};
+  int32_t  start_labels[] = { 0, 0, 1, 2 };
+  int32_t  label_list[] = { 0, 1, 2 };
+  int32_t  label_to_output_comm_rank[] = { 0, 0, 1 };
+  int fan_out[]        = {2, 3};
+
+  size_t expected_size[] = { 3, 2, 1, 1, 1, 1, 1, 1 };
+
+  // Create graph
+  int test_ret_value              = 0;
+  cugraph_error_code_t ret_code   = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error      = NULL;
+  cugraph_graph_t* graph          = NULL;
+  cugraph_sample_result_t* result = NULL;
 
-  vertex_t src[]   = {0, 1, 2, 3, 4, 3, 4, 2, 0, 1, 0, 2};
-  vertex_t dst[]   = {1, 2, 4, 2, 3, 4, 1, 1, 2, 3, 4, 4};
-  edge_t idx[]     = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  int32_t typ[]    = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 0};
-  weight_t wgt[]   = {0.0, 0.1, 0.2, 3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10, 0.11};
-  vertex_t start[] = {0, 4};
-  int32_t batch[]  = {0, 1};
-  int32_t label_list[]  = {0, 1};
-  int32_t dest_rank[]  = {0, 1};
-  int fan_out[]    = {2, 2};
+  ret_code = create_mg_test_graph_with_properties(handle,
+                                                  src,
+                                                  dst,
+                                                  edge_ids,
+                                                  NULL,
+                                                  wgt,
+                                                  num_edges,
+                                                  FALSE,
+                                                  TRUE,
+                                                  &graph,
+                                                  &ret_error);
 
-  size_t num_rank_0_edges = 6;
-  size_t num_rank_1_edges = 6;
-  int32_t rank_0_src[] = { 0, 0, 1, 1, 2, 2 };
-  int32_t rank_0_dst[] = { 1, 2, 2, 3, 1, 4 };
-  int32_t rank_1_src[] = { 1, 1, 4, 4, 3, 3 };
-  int32_t rank_1_dst[] = { 2, 3, 1, 3, 2, 4 };
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed.");
 
-  bool_t with_replacement = FALSE;
-  bool_t store_transposed = FALSE;
+  cugraph_type_erased_device_array_t* d_start           = NULL;
+  cugraph_type_erased_device_array_view_t* d_start_view = NULL;
+  cugraph_type_erased_device_array_t* d_start_labels           = NULL;
+  cugraph_type_erased_device_array_view_t* d_start_labels_view = NULL;
+  cugraph_type_erased_device_array_t* d_label_list           = NULL;
+  cugraph_type_erased_device_array_view_t* d_label_list_view = NULL;
+  cugraph_type_erased_device_array_t* d_label_to_output_comm_rank           = NULL;
+  cugraph_type_erased_device_array_view_t* d_label_to_output_comm_rank_view = NULL;
+  cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL;
 
-  int test_ret_value            = 0;
-  cugraph_error_code_t ret_code = CUGRAPH_SUCCESS;
-  cugraph_error_t* ret_error    = NULL;
+  int rank = cugraph_resource_handle_get_rank(handle);
+
+  if (rank > 0) {
+    num_starts = 0;
+  }
+
+  cugraph_rng_state_t* rng_state;
+  ret_code = cugraph_rng_state_create(handle, rank, &rng_state, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed.");
+  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed.");
+
+  d_start_view = cugraph_type_erased_device_array_view(d_start);
+
+  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+    handle, d_start_view, (byte_t*)start, &ret_error);
+
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start_labels, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start_labels create failed.");
+
+  d_start_labels_view = cugraph_type_erased_device_array_view(d_start_labels);
+
+  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+    handle, d_start_labels_view, (byte_t*)start_labels, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start_labels copy_from_host failed.");
+
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_labels, INT32, &d_label_list, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_label_list create failed.");
+
+  d_label_list_view = cugraph_type_erased_device_array_view(d_label_list);
+
+  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+    handle, d_label_list_view, (byte_t*)label_list, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "label_list copy_from_host failed.");
+
+  ret_code =
+    cugraph_type_erased_device_array_create(handle, num_labels, INT32, &d_label_to_output_comm_rank, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_label_to_output_comm_rank create failed.");
+
+  d_label_to_output_comm_rank_view = cugraph_type_erased_device_array_view(d_label_to_output_comm_rank);
+
+  ret_code = cugraph_type_erased_device_array_view_copy_from_host(
+    handle, d_label_to_output_comm_rank_view, (byte_t*)label_to_output_comm_rank, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "label_to_output_comm_rank copy_from_host failed.");
+
+  h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, fan_out_size, INT32);
+
+  ret_code = cugraph_uniform_neighbor_sample_with_edge_properties(handle,
+                                                                  graph,
+                                                                  d_start_view,
+                                                                  d_start_labels_view,
+                                                                  d_label_list_view,
+                                                                  d_label_to_output_comm_rank_view,
+                                                                  h_fan_out_view,
+                                                                  rng_state,
+                                                                  FALSE,
+                                                                  TRUE,
+                                                                  FALSE,
+                                                                  &result,
+                                                                  &ret_error);
+
+#ifdef NO_CUGRAPH_OPS
+  TEST_ASSERT(
+    test_ret_value, ret_code != CUGRAPH_SUCCESS, "uniform_neighbor_sample should have failed")
+#else
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed.");
+
+  cugraph_type_erased_device_array_view_t* result_srcs = NULL;
+  cugraph_type_erased_device_array_view_t* result_dsts = NULL;
+  cugraph_type_erased_device_array_view_t* result_edge_id = NULL;
+  cugraph_type_erased_device_array_view_t* result_weights = NULL;
+  cugraph_type_erased_device_array_view_t* result_hops = NULL;
+  cugraph_type_erased_device_array_view_t* result_offsets = NULL;
+
+  result_srcs       = cugraph_sample_result_get_sources(result);
+  result_dsts       = cugraph_sample_result_get_destinations(result);
+  result_edge_id    = cugraph_sample_result_get_edge_id(result);
+  result_weights    = cugraph_sample_result_get_edge_weight(result);
+  result_hops       = cugraph_sample_result_get_hop(result);
+  result_offsets    = cugraph_sample_result_get_offsets(result);
+
+  size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs);
+  size_t result_offsets_size = cugraph_type_erased_device_array_view_size(result_offsets);
+
+  vertex_t h_srcs[result_size];
+  vertex_t h_dsts[result_size];
+  edge_t h_edge_id[result_size];
+  weight_t h_weight[result_size];
+  int32_t h_hops[result_size];
+  size_t h_result_offsets[result_offsets_size];
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_srcs, result_srcs, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_dsts, result_dsts, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_edge_id, result_edge_id, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_weight, result_weights, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_hops, result_hops, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
+    handle, (byte_t*)h_result_offsets, result_offsets, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+
+  //  NOTE:  The C++ tester does a more thorough validation.  For our purposes
+  //  here we will do a simpler validation, merely checking that all edges
+  //  are actually part of the graph
+  weight_t M_w[num_vertices][num_vertices];
+  edge_t M_edge_id[num_vertices][num_vertices];
+
+  for (int i = 0; i < num_vertices; ++i)
+    for (int j = 0; j < num_vertices; ++j) {
+      M_w[i][j]         = 0.0;
+      M_edge_id[i][j]   = -1;
+    }
+
+  for (int i = 0; i < num_edges; ++i) {
+    M_w[src[i]][dst[i]]         = wgt[i];
+    M_edge_id[src[i]][dst[i]]   = edge_ids[i];
+  }
+
+  for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) {
+    TEST_ASSERT(test_ret_value,
+                M_w[h_srcs[i]][h_dsts[i]] == h_weight[i],
+                "uniform_neighbor_sample got edge that doesn't exist");
+    TEST_ASSERT(test_ret_value,
+                M_edge_id[h_srcs[i]][h_dsts[i]] == h_edge_id[i],
+                "uniform_neighbor_sample got edge that doesn't exist");
+  }
+
+  TEST_ASSERT(test_ret_value,
+              result_offsets_size == expected_size[rank],
+              "incorrect number of results");
+              
+
+  cugraph_sample_result_free(result);
+#endif
+
+  cugraph_sg_graph_free(graph);
+  cugraph_error_free(ret_error);
+}
 
+int test_uniform_neighbor_sample_sort_by_hop(const cugraph_resource_handle_t* handle)
+{
+  size_t num_edges = 156;
+  size_t num_vertices = 34;
+  size_t fan_out_size = 2;
+  size_t num_starts   = 4;
+  size_t num_labels   = 3;
+
+  vertex_t src[] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 2,
+                    3, 7, 13, 17, 19, 21, 30, 3, 7, 8, 9, 13, 27, 28, 32, 7, 12,
+                    13, 6, 10, 6, 10, 16, 16, 30, 32, 33, 33, 33, 32, 33, 32,
+                    33, 32, 33, 33, 32, 33, 32, 33, 25, 27, 29, 32, 33, 25, 27,
+                    31, 31, 29, 33, 33, 31, 33, 32, 33, 32, 33, 32, 33, 33, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+                    1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, 6,
+                    8, 8, 8, 9, 13, 14, 14, 15, 15, 18, 18, 19, 20, 20, 22, 22,
+                    23, 23, 23, 23, 23, 24, 24, 24, 25, 26, 26, 27, 28, 28, 29,
+                    29, 30, 30, 31, 31, 32};
+  vertex_t dst[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,4,4,5,5,5,6,8,8,8,9,13,14,14,15,15,18,18,19,20,20,22,22,23,23,23,23,23,24,24,24,25,26,26,27,28,28,29,29,30,30,31,31,32,1,2,3,4,5,6,7,8,10,11,12,13,17,19,21,31,2,3,7,13,17,19,21,30,3,7,8,9,13,27,28,32,7,12,13,6,10,6,10,16,16,30,32,33,33,33,32,33,32,33,32,33,33,32,33,32,33,25,27,29,32,33,25,27,31,31,29,33,33,31,33,32,33,32,33,32,33,33};
+  weight_t wgt[] = {1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f};
+
+  edge_t edge_ids[]    = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+                          10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                          20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+                          30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+                          40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+                          50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+                          60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+                          70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+                          80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+                          90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+                          100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+                          110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+                          120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+                          130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
+                          140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+                          150, 151, 152, 153, 154, 155};
+
+  vertex_t start[]     = {0, 1, 2, 5};
+  int32_t  start_labels[] = { 0, 0, 1, 2 };
+  int32_t  label_list[] = { 0, 1, 2 };
+  int32_t  label_to_output_comm_rank[] = { 0, 0, 1 };
+  int fan_out[]        = {2, 3};
+
+  size_t expected_size[] = { 3, 2, 1, 1, 1, 1, 1, 1 };
+
+  // Create graph
+  int test_ret_value              = 0;
+  cugraph_error_code_t ret_code   = CUGRAPH_SUCCESS;
+  cugraph_error_t* ret_error      = NULL;
   cugraph_graph_t* graph          = NULL;
   cugraph_sample_result_t* result = NULL;
 
-  cugraph_type_erased_device_array_t* d_start                        = NULL;
-  cugraph_type_erased_device_array_t* d_label                        = NULL;
-  cugraph_type_erased_device_array_t* d_label_list                   = NULL;
-  cugraph_type_erased_device_array_t* d_label_to_comm_rank           = NULL;
-  cugraph_type_erased_device_array_view_t* d_start_view              = NULL;
-  cugraph_type_erased_device_array_view_t* d_label_view              = NULL;
-  cugraph_type_erased_device_array_view_t* d_label_list_view         = NULL;
-  cugraph_type_erased_device_array_view_t* d_label_to_comm_rank_view = NULL;
-  cugraph_type_erased_host_array_view_t* h_fan_out_view              = NULL;
+  ret_code = create_mg_test_graph_with_properties(handle,
+                                                  src,
+                                                  dst,
+                                                  edge_ids,
+                                                  NULL,
+                                                  wgt,
+                                                  num_edges,
+                                                  FALSE,
+                                                  TRUE,
+                                                  &graph,
+                                                  &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed.");
+
+  cugraph_type_erased_device_array_t* d_start           = NULL;
+  cugraph_type_erased_device_array_view_t* d_start_view = NULL;
+  cugraph_type_erased_device_array_t* d_start_labels           = NULL;
+  cugraph_type_erased_device_array_view_t* d_start_labels_view = NULL;
+  cugraph_type_erased_device_array_t* d_label_list           = NULL;
+  cugraph_type_erased_device_array_view_t* d_label_list_view = NULL;
+  cugraph_type_erased_device_array_t* d_label_to_output_comm_rank           = NULL;
+  cugraph_type_erased_device_array_view_t* d_label_to_output_comm_rank_view = NULL;
+  cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL;
 
   int rank = cugraph_resource_handle_get_rank(handle);
 
@@ -398,11 +668,6 @@ int test_uniform_neighbor_with_shuffling(const cugraph_resource_handle_t* handle
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
 
-  ret_code = create_mg_test_graph_with_properties(
-    handle, src, dst, idx, typ, wgt, num_edges, store_transposed, FALSE, &graph, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed.");
-  TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
-
   ret_code =
     cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed.");
@@ -411,17 +676,17 @@ int test_uniform_neighbor_with_shuffling(const cugraph_resource_handle_t* handle
 
   ret_code = cugraph_type_erased_device_array_view_copy_from_host(
     handle, d_start_view, (byte_t*)start, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed.");
 
   ret_code =
-    cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_label, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_label create failed.");
+    cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start_labels, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start_labels create failed.");
 
-  d_label_view = cugraph_type_erased_device_array_view(d_label);
+  d_start_labels_view = cugraph_type_erased_device_array_view(d_start_labels);
 
   ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, d_label_view, (byte_t*)batch, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed.");
+    handle, d_start_labels_view, (byte_t*)start_labels, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start_labels copy_from_host failed.");
 
   ret_code =
     cugraph_type_erased_device_array_create(handle, num_labels, INT32, &d_label_list, &ret_error);
@@ -431,29 +696,31 @@ int test_uniform_neighbor_with_shuffling(const cugraph_resource_handle_t* handle
 
   ret_code = cugraph_type_erased_device_array_view_copy_from_host(
     handle, d_label_list_view, (byte_t*)label_list, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed.");
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "label_list copy_from_host failed.");
 
   ret_code =
-    cugraph_type_erased_device_array_create(handle, num_labels, INT32, &d_label_to_comm_rank, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_label_to_comm_rank create failed.");
+    cugraph_type_erased_device_array_create(handle, num_labels, INT32, &d_label_to_output_comm_rank, &ret_error);
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_label_to_output_comm_rank create failed.");
 
-  d_label_to_comm_rank_view = cugraph_type_erased_device_array_view(d_label_to_comm_rank);
+  d_label_to_output_comm_rank_view = cugraph_type_erased_device_array_view(d_label_to_output_comm_rank);
 
   ret_code = cugraph_type_erased_device_array_view_copy_from_host(
-    handle, d_label_to_comm_rank_view, (byte_t*)dest_rank, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed.");
+    handle, d_label_to_output_comm_rank_view, (byte_t*)label_to_output_comm_rank, &ret_error);
+
+  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "label_to_output_comm_rank copy_from_host failed.");
 
   h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, fan_out_size, INT32);
 
   ret_code = cugraph_uniform_neighbor_sample_with_edge_properties(handle,
                                                                   graph,
                                                                   d_start_view,
-                                                                  d_label_view,
+                                                                  d_start_labels_view,
                                                                   d_label_list_view,
-                                                                  d_label_to_comm_rank_view,
+                                                                  d_label_to_output_comm_rank_view,
                                                                   h_fan_out_view,
                                                                   rng_state,
-                                                                  with_replacement,
+                                                                  FALSE,
                                                                   TRUE,
                                                                   FALSE,
                                                                   &result,
@@ -461,96 +728,102 @@ int test_uniform_neighbor_with_shuffling(const cugraph_resource_handle_t* handle
 
 #ifdef NO_CUGRAPH_OPS
   TEST_ASSERT(
-    test_ret_value, ret_code != CUGRAPH_SUCCESS, "uniform_neighbor_sample should have failed");
+    test_ret_value, ret_code != CUGRAPH_SUCCESS, "uniform_neighbor_sample should have failed")
 #else
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed.");
 
-  cugraph_type_erased_device_array_view_t* result_src;
-  cugraph_type_erased_device_array_view_t* result_dst;
-  cugraph_type_erased_device_array_view_t* result_index;
-  cugraph_type_erased_device_array_view_t* result_type;
-  cugraph_type_erased_device_array_view_t* result_weight;
-  cugraph_type_erased_device_array_view_t* result_labels;
-  cugraph_type_erased_device_array_view_t* result_hops;
+  cugraph_type_erased_device_array_view_t* result_srcs = NULL;
+  cugraph_type_erased_device_array_view_t* result_dsts = NULL;
+  cugraph_type_erased_device_array_view_t* result_edge_id = NULL;
+  cugraph_type_erased_device_array_view_t* result_weights = NULL;
+  cugraph_type_erased_device_array_view_t* result_hops = NULL;
+  cugraph_type_erased_device_array_view_t* result_offsets = NULL;
 
-  result_src    = cugraph_sample_result_get_sources(result);
-  result_dst    = cugraph_sample_result_get_destinations(result);
-  result_index  = cugraph_sample_result_get_edge_id(result);
-  result_type   = cugraph_sample_result_get_edge_type(result);
-  result_weight = cugraph_sample_result_get_edge_weight(result);
-  result_labels = cugraph_sample_result_get_start_labels(result);
-  result_hops   = cugraph_sample_result_get_hop(result);
+  result_srcs       = cugraph_sample_result_get_sources(result);
+  result_dsts       = cugraph_sample_result_get_destinations(result);
+  result_edge_id    = cugraph_sample_result_get_edge_id(result);
+  result_weights    = cugraph_sample_result_get_edge_weight(result);
+  result_hops       = cugraph_sample_result_get_hop(result);
+  result_offsets    = cugraph_sample_result_get_offsets(result);
 
-  size_t result_size = cugraph_type_erased_device_array_view_size(result_src);
+  size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs);
+  size_t result_offsets_size = cugraph_type_erased_device_array_view_size(result_offsets);
 
   vertex_t h_srcs[result_size];
   vertex_t h_dsts[result_size];
-  edge_t h_index[result_size];
-  int h_type[result_size];
-  weight_t h_wgt[result_size];
-  int h_labels[result_size];
-  int h_hop[result_size];
-
-  ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_srcs, result_src, &ret_error);
-  TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
+  edge_t h_edge_id[result_size];
+  weight_t h_weight[result_size];
+  int32_t h_hops[result_size];
+  size_t h_result_offsets[result_offsets_size];
 
   ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_dsts, result_dst, &ret_error);
+    handle, (byte_t*)h_srcs, result_srcs, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
   ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_index, result_index, &ret_error);
+    handle, (byte_t*)h_dsts, result_dsts, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
   ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_type, result_type, &ret_error);
+    handle, (byte_t*)h_edge_id, result_edge_id, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
   ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_wgt, result_weight, &ret_error);
+    handle, (byte_t*)h_weight, result_weights, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
   ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_labels, result_labels, &ret_error);
+    handle, (byte_t*)h_hops, result_hops, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
   ret_code = cugraph_type_erased_device_array_view_copy_to_host(
-    handle, (byte_t*)h_hop, result_hops, &ret_error);
+    handle, (byte_t*)h_result_offsets, result_offsets, &ret_error);
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
 
   //  NOTE:  The C++ tester does a more thorough validation.  For our purposes
   //  here we will do a simpler validation, merely checking that all edges
   //  are actually part of the graph
-  edge_t M[num_vertices][num_vertices];
+  weight_t M_w[num_vertices][num_vertices];
+  edge_t M_edge_id[num_vertices][num_vertices];
 
   for (int i = 0; i < num_vertices; ++i)
-    for (int j = 0; j < num_vertices; ++j)
-      M[i][j] = -1;
-
-  if (rank == 0) {
-    for (int i = 0; i < num_rank_0_edges; ++i)
-      M[rank_0_src[i]][rank_0_dst[i]] = 1;
-  } else if (rank == 1) {
-    for (int i = 0; i < num_rank_1_edges; ++i)
-      M[rank_1_src[i]][rank_1_dst[i]] = 1;
+    for (int j = 0; j < num_vertices; ++j) {
+      M_w[i][j]         = 0.0;
+      M_edge_id[i][j]   = -1;
+    }
+
+  for (int i = 0; i < num_edges; ++i) {
+    M_w[src[i]][dst[i]]         = wgt[i];
+    M_edge_id[src[i]][dst[i]]   = edge_ids[i];
   }
 
   for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) {
     TEST_ASSERT(test_ret_value,
-                M[h_srcs[i]][h_dsts[i]] >= 0,
-                "uniform_neighbor_sample got edge that doesn't exist or is on wrong GPU");
+                M_w[h_srcs[i]][h_dsts[i]] == h_weight[i],
+                "uniform_neighbor_sample got edge that doesn't exist");
+    TEST_ASSERT(test_ret_value,
+                M_edge_id[h_srcs[i]][h_dsts[i]] == h_edge_id[i],
+                "uniform_neighbor_sample got edge that doesn't exist");
+  }
+
+  TEST_ASSERT(test_ret_value,
+              result_offsets_size == expected_size[rank],
+              "incorrect number of results");
+
+  for (int i = 0 ; i < (result_offsets_size - 1) && (test_ret_value == 0) ; ++i) {
+    for (int j = h_result_offsets[i] ; j < (h_result_offsets[i+1] - 1) && (test_ret_value == 0) ; ++j) {
+      TEST_ASSERT(test_ret_value,
+                  h_hops[j] <= h_hops[j+1],
+                  "Results not sorted by hop id");
+    }
   }
-#endif
 
   cugraph_sample_result_free(result);
+#endif
 
-  cugraph_type_erased_host_array_view_free(h_fan_out_view);
-  cugraph_mg_graph_free(graph);
+  cugraph_sg_graph_free(graph);
   cugraph_error_free(ret_error);
-
-  return test_ret_value;
 }
 
 /******************************************************************************/
@@ -563,7 +836,8 @@ int main(int argc, char** argv)
   int result = 0;
   result |= RUN_MG_TEST(test_uniform_neighbor_sample, handle);
   result |= RUN_MG_TEST(test_uniform_neighbor_from_alex, handle);
-  result |= RUN_MG_TEST(test_uniform_neighbor_with_shuffling, handle);
+  result |= RUN_MG_TEST(test_uniform_neighbor_sample_alex_bug, handle);
+  result |= RUN_MG_TEST(test_uniform_neighbor_sample_sort_by_hop, handle);
 
   cugraph_free_resource_handle(handle);
   free_mg_raft_handle(raft_handle);
diff --git a/cpp/tests/centrality/betweenness_centrality_test.cpp b/cpp/tests/centrality/betweenness_centrality_test.cpp
index 7809631fa0d..d98ee38cf81 100644
--- a/cpp/tests/centrality/betweenness_centrality_test.cpp
+++ b/cpp/tests/centrality/betweenness_centrality_test.cpp
@@ -86,7 +86,13 @@ class Tests_BetweennessCentrality
 
     raft::random::RngState rng_state(0);
     auto d_seeds = cugraph::select_random_vertices(
-      handle, graph_view, rng_state, betweenness_usecase.num_seeds, false, true);
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      betweenness_usecase.num_seeds,
+      false,
+      true);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
diff --git a/cpp/tests/centrality/edge_betweenness_centrality_test.cpp b/cpp/tests/centrality/edge_betweenness_centrality_test.cpp
index 7e62f45199f..bb223067e1c 100644
--- a/cpp/tests/centrality/edge_betweenness_centrality_test.cpp
+++ b/cpp/tests/centrality/edge_betweenness_centrality_test.cpp
@@ -86,7 +86,13 @@ class Tests_EdgeBetweennessCentrality
 
     raft::random::RngState rng_state(0);
     auto d_seeds = cugraph::select_random_vertices(
-      handle, graph_view, rng_state, betweenness_usecase.num_seeds, false, true);
+      handle,
+      graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      betweenness_usecase.num_seeds,
+      false,
+      true);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
diff --git a/cpp/tests/centrality/mg_betweenness_centrality_test.cpp b/cpp/tests/centrality/mg_betweenness_centrality_test.cpp
index aedcc574170..75bac28d105 100644
--- a/cpp/tests/centrality/mg_betweenness_centrality_test.cpp
+++ b/cpp/tests/centrality/mg_betweenness_centrality_test.cpp
@@ -84,7 +84,13 @@ class Tests_MGBetweennessCentrality
 
     raft::random::RngState rng_state(handle_->get_comms().get_rank());
     auto d_mg_seeds = cugraph::select_random_vertices(
-      *handle_, mg_graph_view, rng_state, betweenness_usecase.num_seeds, false, true);
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      betweenness_usecase.num_seeds,
+      false,
+      true);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
diff --git a/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp b/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp
index f88edcd159d..a1e73b6147b 100644
--- a/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp
+++ b/cpp/tests/centrality/mg_edge_betweenness_centrality_test.cpp
@@ -84,7 +84,13 @@ class Tests_MGEdgeBetweennessCentrality
 
     raft::random::RngState rng_state(handle_->get_comms().get_rank());
     auto d_seeds = cugraph::select_random_vertices(
-      *handle_, mg_graph_view, rng_state, betweenness_usecase.num_seeds, false, true);
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      betweenness_usecase.num_seeds,
+      false,
+      true);
 
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
diff --git a/cpp/tests/community/leiden_test.cpp b/cpp/tests/community/leiden_test.cpp
index 4c700204f10..656e855057f 100644
--- a/cpp/tests/community/leiden_test.cpp
+++ b/cpp/tests/community/leiden_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
  * and proprietary rights in and to this software, related documentation
@@ -8,78 +8,237 @@
  * license agreement from NVIDIA CORPORATION is strictly prohibited.
  *
  */
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_graphs.hpp>
 #include <utilities/test_utilities.hpp>
 
 #include <cugraph/algorithms.hpp>
-#include <cugraph/legacy/graph.hpp>
+#include <cugraph/graph.hpp>
 
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/extrema.h>
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
 
 #include <gtest/gtest.h>
 
-TEST(leiden_karate, success)
-{
-  raft::handle_t handle;
-
-  auto stream = handle.get_stream();
-
-  std::vector<int> off_h = {0,  16,  25,  35,  41,  44,  48,  52,  56,  61,  63, 66,
-                            67, 69,  74,  76,  78,  80,  82,  84,  87,  89,  91, 93,
-                            98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156};
-  std::vector<int> ind_h = {
-    1,  2,  3,  4,  5,  6,  7,  8,  10, 11, 12, 13, 17, 19, 21, 31, 0,  2,  3,  7,  13, 17, 19,
-    21, 30, 0,  1,  3,  7,  8,  9,  13, 27, 28, 32, 0,  1,  2,  7,  12, 13, 0,  6,  10, 0,  6,
-    10, 16, 0,  4,  5,  16, 0,  1,  2,  3,  0,  2,  30, 32, 33, 2,  33, 0,  4,  5,  0,  0,  3,
-    0,  1,  2,  3,  33, 32, 33, 32, 33, 5,  6,  0,  1,  32, 33, 0,  1,  33, 32, 33, 0,  1,  32,
-    33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2,  23, 24, 33, 2,  31, 33, 23, 26,
-    32, 33, 1,  8,  32, 33, 0,  24, 25, 28, 32, 33, 2,  8,  14, 15, 18, 20, 22, 23, 29, 30, 31,
-    33, 8,  9,  13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32};
-  std::vector<float> w_h = {
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-
-  int num_verts = off_h.size() - 1;
-  int num_edges = ind_h.size();
-
-  rmm::device_uvector<int> offsets_v(num_verts + 1, stream);
-  rmm::device_uvector<int> indices_v(num_edges, stream);
-  rmm::device_uvector<float> weights_v(num_edges, stream);
-  rmm::device_uvector<int> result_v(num_verts, stream);
-
-  raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream);
-  raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream);
-  raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream);
-
-  cugraph::legacy::GraphCSRView<int, int, float> G(
-    offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges);
-
-  float modularity{0.0};
-  size_t num_level = 40;
-
-  // "FIXME": remove this check once we drop support for Pascal
-  //
-  // Calling louvain on Pascal will throw an exception, we'll check that
-  // this is the behavior while we still support Pascal (device_prop.major < 7)
-  //
-  if (handle.get_device_properties().major < 7) {
-    EXPECT_THROW(cugraph::leiden(handle, G, result_v.data()), cugraph::logic_error);
-  } else {
-    std::tie(num_level, modularity) = cugraph::leiden(handle, G, result_v.data());
-
-    auto cluster_id = cugraph::test::to_host(handle, result_v);
-
-    int min = *min_element(cluster_id.begin(), cluster_id.end());
-
-    ASSERT_GE(min, 0);
-    ASSERT_GE(modularity, 0.41116042 * 0.99);
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+struct Leiden_Usecase {
+  size_t max_level_{100};
+  double resolution_{1.0};
+  bool check_correctness_{false};
+  int expected_level_{0};
+  float expected_modularity_{0};
+};
+
+template <typename input_usecase_t>
+class Tests_Leiden : public ::testing::TestWithParam<std::tuple<Leiden_Usecase, input_usecase_t>> {
+ public:
+  Tests_Leiden() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<Leiden_Usecase const&, input_usecase_t const&> const& param)
+  {
+    auto [leiden_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    // Can't currently check correctness if we renumber
+    bool renumber = true;
+    if (leiden_usecase.check_correctness_) renumber = false;
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Construct graph");
+    }
+
+    auto [graph, edge_weights, d_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
+        handle, input_usecase, true, renumber);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto graph_view = graph.view();
+    auto edge_weight_view =
+      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
+
+    // "FIXME": remove this check once we drop support for Pascal
+    //
+    // Calling leiden on Pascal will throw an exception, we'll check that
+    // this is the behavior while we still support Pascal (device_prop.major < 7)
+    //
+    cudaDeviceProp device_prop;
+    RAFT_CUDA_TRY(cudaGetDeviceProperties(&device_prop, 0));
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Leiden");
+    }
+
+    if (device_prop.major < 7) {
+      EXPECT_THROW(leiden(graph_view,
+                          edge_weight_view,
+                          graph_view.local_vertex_partition_range_size(),
+                          leiden_usecase.max_level_,
+                          leiden_usecase.resolution_,
+                          leiden_usecase.check_correctness_,
+                          leiden_usecase.expected_level_,
+                          leiden_usecase.expected_modularity_),
+                   cugraph::logic_error);
+    } else {
+      leiden(graph_view,
+             edge_weight_view,
+             graph_view.local_vertex_partition_range_size(),
+             leiden_usecase.max_level_,
+             leiden_usecase.resolution_,
+             leiden_usecase.check_correctness_,
+             leiden_usecase.expected_level_,
+             leiden_usecase.expected_modularity_);
+    }
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+  }
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void leiden(
+    cugraph::graph_view_t<vertex_t, edge_t, false, false> const& graph_view,
+    std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+    vertex_t num_vertices,
+    size_t max_level,
+    float resolution,
+    bool check_correctness,
+    int expected_level,
+    float expected_modularity)
+  {
+    raft::handle_t handle{};
+
+    rmm::device_uvector<vertex_t> clustering_v(num_vertices, handle.get_stream());
+    size_t level;
+    weight_t modularity;
+    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+    raft::random::RngState rng_state(seed);
+
+    std::tie(level, modularity) = cugraph::leiden(
+      handle, rng_state, graph_view, edge_weight_view, clustering_v.data(), max_level, resolution);
+
+    float compare_modularity = static_cast<float>(modularity);
+
+    if (check_correctness) {
+      ASSERT_FLOAT_EQ(compare_modularity, expected_modularity);
+      ASSERT_EQ(level, expected_level);
+    }
   }
+};
+
+using Tests_Leiden_File   = Tests_Leiden<cugraph::test::File_Usecase>;
+using Tests_Leiden_File32 = Tests_Leiden<cugraph::test::File_Usecase>;
+using Tests_Leiden_File64 = Tests_Leiden<cugraph::test::File_Usecase>;
+using Tests_Leiden_Rmat   = Tests_Leiden<cugraph::test::Rmat_Usecase>;
+using Tests_Leiden_Rmat32 = Tests_Leiden<cugraph::test::Rmat_Usecase>;
+using Tests_Leiden_Rmat64 = Tests_Leiden<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_Leiden_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Leiden_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Leiden_File32, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Leiden_File64, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+#if 0
+// FIXME:  We should use these tests, gtest-1.11.0 makes it a runtime error
+//         to define and not instantiate these.
+TEST_P(Tests_Leiden_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Leiden_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Leiden_Rmat32, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_Leiden_Rmat64, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
+#endif
+
+// FIXME: Expand testing once we evaluate RMM memory use
+INSTANTIATE_TEST_SUITE_P(
+  simple_test,
+  Tests_Leiden_File,
+  ::testing::Combine(::testing::Values(Leiden_Usecase{100, 1, false, 3, 0.408695}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with
+                          --gtest_filter to select only the file_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one File_Usecase that differ only in filename
+                          (to avoid running same benchmarks more than once) */
+  Tests_Leiden_File32,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(Leiden_Usecase{}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file64_benchmark_test, /* note that the test filename can be overridden in benchmarking (with
+                          --gtest_filter to select only the file_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one File_Usecase that differ only in filename
+                          (to avoid running same benchmarks more than once) */
+  Tests_Leiden_File64,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(Leiden_Usecase{}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp
index c46dfec2b45..4792042365b 100644
--- a/cpp/tests/community/louvain_test.cpp
+++ b/cpp/tests/community/louvain_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
  * and proprietary rights in and to this software, related documentation
@@ -383,6 +383,9 @@ TEST_P(Tests_Louvain_File64, CheckInt64Int64FloatFloat)
     override_File_Usecase_with_cmd_line_arguments(GetParam()));
 }
 
+#if 0
+// FIXME:  We should use these tests, gtest-1.11.0 makes it a runtime error
+//         to define and not instantiate these.
 TEST_P(Tests_Louvain_Rmat, CheckInt32Int32FloatFloatLegacy)
 {
   run_legacy_test<int32_t, int32_t, float, float>(
@@ -400,6 +403,7 @@ TEST_P(Tests_Louvain_Rmat, CheckInt64Int64FloatFloat)
   run_current_test<int64_t, int64_t, float, float>(
     override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
+#endif
 
 TEST_P(Tests_Louvain_Rmat32, CheckInt32Int32FloatFloat)
 {
diff --git a/cpp/tests/community/mg_leiden_test.cpp b/cpp/tests/community/mg_leiden_test.cpp
new file mode 100644
index 00000000000..23f34e1001b
--- /dev/null
+++ b/cpp/tests/community/mg_leiden_test.cpp
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/sequence.h>
+
+#include <chrono>
+#include <gtest/gtest.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Test param object. This defines the input and expected output for a test, and
+// will be instantiated as the parameter to the tests defined below using
+// INSTANTIATE_TEST_SUITE_P()
+//
+struct Leiden_Usecase {
+  size_t max_level_{100};
+  double resolution_{0.5};
+  double theta_{0.7};
+  bool check_correctness_{false};
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Parameterized test fixture, to be used with TEST_P().  This defines common
+// setup and teardown steps as well as common utilities used by each E2E MG
+// test.  In this case, each test is identical except for the inputs and
+// expected outputs, so the entire test is defined in the run_test() method.
+//
+template <typename input_usecase_t>
+class Tests_MGLeiden
+  : public ::testing::TestWithParam<std::tuple<Leiden_Usecase, input_usecase_t>> {
+ public:
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  // Run once for each test instance
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of MNMG Leiden with the results of running
+  // each step of SG Leiden, renumbering the coarsened graphs based
+  // on the MNMG renumbering.
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void compare_sg_results(
+    raft::handle_t const& handle,
+    raft::random::RngState& rng_state,
+    cugraph::graph_view_t<vertex_t, edge_t, false, true> const& mg_graph_view,
+    std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>> mg_edge_weight_view,
+    cugraph::Dendrogram<vertex_t> const& mg_dendrogram,
+    weight_t resolution,
+    weight_t theta,
+    weight_t mg_modularity)
+  {
+    auto& comm           = handle.get_comms();
+    auto const comm_rank = comm.get_rank();
+
+    cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(handle);
+    std::optional<
+      cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
+      sg_edge_weights{std::nullopt};
+    std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+      *handle_,
+      mg_graph_view,
+      mg_edge_weight_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      false);  // crate an SG graph with MG graph vertex IDs
+
+    // FIXME: We need to figure out how to test each iteration of
+    // SG vs MG Leiden, possibly by passing results of refinement phase
+
+    weight_t sg_modularity{-1.0};
+
+    auto sg_graph_view = sg_graph.view();
+    auto sg_edge_weight_view =
+      sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+    if (comm_rank == 0) {
+      std::tie(std::ignore, sg_modularity) = cugraph::leiden(
+        handle, rng_state, sg_graph_view, sg_edge_weight_view, 100, resolution, theta);
+    }
+    if (comm_rank == 0) {
+      EXPECT_NEAR(mg_modularity, sg_modularity, std::max(mg_modularity, sg_modularity) * 1e-3);
+    }
+  }
+
+  // Compare the results of running Leiden on multiple GPUs to that of a
+  // single-GPU run for the configuration in param.  Note that MNMG Leiden
+  // and single GPU Leiden are ONLY deterministic through a single
+  // iteration of the outer loop.  Renumbering of the partitions when coarsening
+  // the graph is a function of the number of GPUs in the GPU cluster.
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<Leiden_Usecase const&, input_usecase_t const&> const& param)
+  {
+    auto [leiden_usecase, input_usecase] = param;
+
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, d_renumber_map_labels] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, true, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Leiden");
+    }
+
+    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+    raft::random::RngState rng_state(seed);
+
+    auto [dendrogram, mg_modularity] =
+      cugraph::leiden<vertex_t, edge_t, weight_t, true>(*handle_,
+                                                        rng_state,
+                                                        mg_graph_view,
+                                                        mg_edge_weight_view,
+                                                        leiden_usecase.max_level_,
+                                                        leiden_usecase.resolution_,
+                                                        leiden_usecase.theta_);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    if (leiden_usecase.check_correctness_) {
+      SCOPED_TRACE("compare modularity input");
+
+      compare_sg_results<vertex_t, edge_t, weight_t>(*handle_,
+                                                     rng_state,
+                                                     mg_graph_view,
+                                                     mg_edge_weight_view,
+                                                     *dendrogram,
+                                                     leiden_usecase.resolution_,
+                                                     leiden_usecase.theta_,
+                                                     mg_modularity);
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGLeiden<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGLeiden_File = Tests_MGLeiden<cugraph::test::File_Usecase>;
+using Tests_MGLeiden_Rmat = Tests_MGLeiden<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGLeiden_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGLeiden_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGLeiden_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGLeiden_Rmat, CheckInt32Int64Float)
+{
+  run_current_test<int32_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGLeiden_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_tests,
+  Tests_MGLeiden_File,
+  ::testing::Combine(
+    // enable correctness checks for small graphs
+    ::testing::Values(Leiden_Usecase{100, 1, 1, false}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_tests,
+                         Tests_MGLeiden_Rmat,
+                         ::testing::Combine(::testing::Values(Leiden_Usecase{100, 1, false}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with
+                          --gtest_filter to select only the file_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one File_Usecase that differ only in filename
+                          (to avoid running same benchmarks more than once) */
+  Tests_MGLeiden_File,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(Leiden_Usecase{100, 1, 1, false}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGLeiden_Rmat,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(Leiden_Usecase{100, 1, 1, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(12, 32, 0.57, 0.19, 0.19, 0, true, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/mg_mis_test.cu b/cpp/tests/community/mg_mis_test.cu
new file mode 100644
index 00000000000..b107e413e5d
--- /dev/null
+++ b/cpp/tests/community/mg_mis_test.cu
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include <prims/property_generator.cuh>
+
+#include <prims/fill_edge_src_dst_property.cuh>
+#include <prims/per_v_transform_reduce_incoming_outgoing_e.cuh>
+#include <prims/reduce_op.cuh>
+#include <prims/update_edge_src_dst_property.cuh>
+
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/edge_partition_view.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/utilities/dataframe_buffer.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/random/rng_state.hpp>
+
+#include <chrono>
+#include <iostream>
+#include <random>
+
+#include <gtest/gtest.h>
+
+struct MaximalIndependentSet_Usecase {
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGMaximalIndependentSet
+  : public ::testing::TestWithParam<std::tuple<MaximalIndependentSet_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGMaximalIndependentSet() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(std::tuple<MaximalIndependentSet_Usecase, input_usecase_t> const& param)
+  {
+    auto [mis_usecase, input_usecase] = param;
+
+    auto const comm_rank = handle_->get_comms().get_rank();
+    auto const comm_size = handle_->get_comms().get_size();
+
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    constexpr bool multi_gpu = true;
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, multi_gpu>(
+        *handle_, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    raft::random::RngState rng_state(multi_gpu ? handle_->get_comms().get_rank() : 0);
+    auto d_mis = cugraph::maximal_independent_set<vertex_t, edge_t, multi_gpu>(
+      *handle_, mg_graph_view, rng_state);
+
+    // Test MIS
+    if (mis_usecase.check_correctness) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      std::vector<vertex_t> h_mis(d_mis.size());
+      raft::update_host(h_mis.data(), d_mis.data(), d_mis.size(), handle_->get_stream());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      auto vertex_first = mg_graph_view.local_vertex_partition_range_first();
+      auto vertex_last  = mg_graph_view.local_vertex_partition_range_last();
+
+      std::for_each(h_mis.begin(), h_mis.end(), [vertex_first, vertex_last](vertex_t v) {
+        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last));
+      });
+
+      // If a vertex is included in MIS, then none of its neighbor should be
+
+      vertex_t local_vtx_partitoin_size = mg_graph_view.local_vertex_partition_range_size();
+      rmm::device_uvector<vertex_t> d_total_outgoing_nbrs_included_mis(local_vtx_partitoin_size,
+                                                                       handle_->get_stream());
+
+      rmm::device_uvector<vertex_t> inclusiong_flags(local_vtx_partitoin_size,
+                                                     handle_->get_stream());
+
+      thrust::uninitialized_fill(handle_->get_thrust_policy(),
+                                 inclusiong_flags.begin(),
+                                 inclusiong_flags.end(),
+                                 vertex_t{0});
+
+      thrust::for_each(
+        handle_->get_thrust_policy(),
+        d_mis.begin(),
+        d_mis.end(),
+        [inclusiong_flags =
+           raft::device_span<vertex_t>(inclusiong_flags.data(), inclusiong_flags.size()),
+         v_first = mg_graph_view.local_vertex_partition_range_first()] __device__(auto v) {
+          auto v_offset              = v - v_first;
+          inclusiong_flags[v_offset] = vertex_t{1};
+        });
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      // Cache for inclusiong_flags
+      using GraphViewType = cugraph::graph_view_t<vertex_t, edge_t, false, true>;
+      cugraph::edge_src_property_t<GraphViewType, vertex_t> src_inclusion_cache(*handle_);
+      cugraph::edge_dst_property_t<GraphViewType, vertex_t> dst_inclusion_cache(*handle_);
+
+      if constexpr (multi_gpu) {
+        src_inclusion_cache =
+          cugraph::edge_src_property_t<GraphViewType, vertex_t>(*handle_, mg_graph_view);
+        dst_inclusion_cache =
+          cugraph::edge_dst_property_t<GraphViewType, vertex_t>(*handle_, mg_graph_view);
+        update_edge_src_property(
+          *handle_, mg_graph_view, inclusiong_flags.begin(), src_inclusion_cache);
+        update_edge_dst_property(
+          *handle_, mg_graph_view, inclusiong_flags.begin(), dst_inclusion_cache);
+      }
+
+      per_v_transform_reduce_outgoing_e(
+        *handle_,
+        mg_graph_view,
+        multi_gpu ? src_inclusion_cache.view()
+                  : cugraph::detail::edge_major_property_view_t<vertex_t, vertex_t const*>(
+                      inclusiong_flags.data()),
+        multi_gpu ? dst_inclusion_cache.view()
+                  : cugraph::detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                      inclusiong_flags.data(), vertex_t{0}),
+        cugraph::edge_dummy_property_t{}.view(),
+        [] __device__(auto src, auto dst, auto src_included, auto dst_included, auto wt) {
+          return (src == dst) ? 0 : dst_included;
+        },
+        vertex_t{0},
+        cugraph::reduce_op::plus<vertex_t>{},
+        d_total_outgoing_nbrs_included_mis.begin());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      std::vector<vertex_t> h_total_outgoing_nbrs_included_mis(
+        d_total_outgoing_nbrs_included_mis.size());
+      raft::update_host(h_total_outgoing_nbrs_included_mis.data(),
+                        d_total_outgoing_nbrs_included_mis.data(),
+                        d_total_outgoing_nbrs_included_mis.size(),
+                        handle_->get_stream());
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      {
+        auto vertex_first = mg_graph_view.local_vertex_partition_range_first();
+        auto vertex_last  = mg_graph_view.local_vertex_partition_range_last();
+
+        std::for_each(h_mis.begin(),
+                      h_mis.end(),
+                      [vertex_first, vertex_last, &h_total_outgoing_nbrs_included_mis](vertex_t v) {
+                        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last))
+                          << v << " is not within vertex parition range" << std::endl;
+
+                        ASSERT_TRUE(h_total_outgoing_nbrs_included_mis[v - vertex_first] == 0)
+                          << v << "'s neighbor is included in MIS" << std::endl;
+                      });
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGMaximalIndependentSet<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGMaximalIndependentSet_File =
+  Tests_MGMaximalIndependentSet<cugraph::test::File_Usecase>;
+using Tests_MGMaximalIndependentSet_Rmat =
+  Tests_MGMaximalIndependentSet<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGMaximalIndependentSet_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGMaximalIndependentSet_File, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGMaximalIndependentSet_File, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGMaximalIndependentSet_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGMaximalIndependentSet_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGMaximalIndependentSet_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float, int>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGMaximalIndependentSet_File,
+  ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{false},
+                                       MaximalIndependentSet_Usecase{false}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGMaximalIndependentSet_Rmat,
+                         ::testing::Combine(::testing::Values(MaximalIndependentSet_Usecase{false}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              3, 4, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGMaximalIndependentSet_Rmat,
+  ::testing::Combine(
+    ::testing::Values(MaximalIndependentSet_Usecase{false}, MaximalIndependentSet_Usecase{false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/new_leiden_test.cpp b/cpp/tests/community/new_leiden_test.cpp
deleted file mode 100644
index 7eab855fbc8..00000000000
--- a/cpp/tests/community/new_leiden_test.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
- *
- */
-#include <cugraph/utilities/high_res_timer.hpp>
-#include <utilities/base_fixture.hpp>
-#include <utilities/test_graphs.hpp>
-#include <utilities/test_utilities.hpp>
-
-#include <cugraph/algorithms.hpp>
-#include <cugraph/graph.hpp>
-
-#include <raft/core/handle.hpp>
-#include <raft/util/cudart_utils.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/cuda_memory_resource.hpp>
-
-#include <gtest/gtest.h>
-
-#include <algorithm>
-#include <iterator>
-#include <limits>
-#include <numeric>
-#include <vector>
-
-struct Leiden_Usecase {
-  size_t max_level_{100};
-  double resolution_{1.0};
-  bool check_correctness_{false};
-  int expected_level_{0};
-  float expected_modularity_{0};
-};
-
-template <typename input_usecase_t>
-class Tests_Leiden : public ::testing::TestWithParam<std::tuple<Leiden_Usecase, input_usecase_t>> {
- public:
-  Tests_Leiden() {}
-
-  static void SetUpTestCase() {}
-  static void TearDownTestCase() {}
-
-  virtual void SetUp() {}
-  virtual void TearDown() {}
-
-  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
-  void run_current_test(std::tuple<Leiden_Usecase const&, input_usecase_t const&> const& param)
-  {
-    auto [leiden_usecase, input_usecase] = param;
-
-    raft::handle_t handle{};
-    HighResTimer hr_timer{};
-
-    // Can't currently check correctness if we renumber
-    bool renumber = true;
-    if (leiden_usecase.check_correctness_) renumber = false;
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      hr_timer.start("Construct graph");
-    }
-
-    auto [graph, edge_weights, d_renumber_map_labels] =
-      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, false>(
-        handle, input_usecase, true, renumber);
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      hr_timer.stop();
-      hr_timer.display_and_clear(std::cout);
-    }
-
-    auto graph_view = graph.view();
-    auto edge_weight_view =
-      edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt;
-
-    // "FIXME": remove this check once we drop support for Pascal
-    //
-    // Calling leiden on Pascal will throw an exception, we'll check that
-    // this is the behavior while we still support Pascal (device_prop.major < 7)
-    //
-    cudaDeviceProp device_prop;
-    RAFT_CUDA_TRY(cudaGetDeviceProperties(&device_prop, 0));
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      hr_timer.start("Leiden");
-    }
-
-    if (device_prop.major < 7) {
-      EXPECT_THROW(leiden(graph_view,
-                          edge_weight_view,
-                          graph_view.local_vertex_partition_range_size(),
-                          leiden_usecase.max_level_,
-                          leiden_usecase.resolution_,
-                          leiden_usecase.check_correctness_,
-                          leiden_usecase.expected_level_,
-                          leiden_usecase.expected_modularity_),
-                   cugraph::logic_error);
-    } else {
-      leiden(graph_view,
-             edge_weight_view,
-             graph_view.local_vertex_partition_range_size(),
-             leiden_usecase.max_level_,
-             leiden_usecase.resolution_,
-             leiden_usecase.check_correctness_,
-             leiden_usecase.expected_level_,
-             leiden_usecase.expected_modularity_);
-    }
-
-    if (cugraph::test::g_perf) {
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-      hr_timer.stop();
-      hr_timer.display_and_clear(std::cout);
-    }
-  }
-
-  template <typename vertex_t, typename edge_t, typename weight_t>
-  void leiden(
-    cugraph::graph_view_t<vertex_t, edge_t, false, false> const& graph_view,
-    std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
-    vertex_t num_vertices,
-    size_t max_level,
-    float resolution,
-    bool check_correctness,
-    int expected_level,
-    float expected_modularity)
-  {
-    raft::handle_t handle{};
-
-    rmm::device_uvector<vertex_t> clustering_v(num_vertices, handle.get_stream());
-    size_t level;
-    weight_t modularity;
-
-    std::tie(level, modularity) = cugraph::leiden(
-      handle, graph_view, edge_weight_view, clustering_v.data(), max_level, resolution);
-
-    float compare_modularity = static_cast<float>(modularity);
-
-    if (check_correctness) {
-      ASSERT_FLOAT_EQ(compare_modularity, expected_modularity);
-      ASSERT_EQ(level, expected_level);
-    }
-  }
-};
-
-using Tests_Leiden_File   = Tests_Leiden<cugraph::test::File_Usecase>;
-using Tests_Leiden_File32 = Tests_Leiden<cugraph::test::File_Usecase>;
-using Tests_Leiden_File64 = Tests_Leiden<cugraph::test::File_Usecase>;
-using Tests_Leiden_Rmat   = Tests_Leiden<cugraph::test::Rmat_Usecase>;
-using Tests_Leiden_Rmat32 = Tests_Leiden<cugraph::test::Rmat_Usecase>;
-using Tests_Leiden_Rmat64 = Tests_Leiden<cugraph::test::Rmat_Usecase>;
-
-TEST_P(Tests_Leiden_File, CheckInt32Int32FloatFloat)
-{
-  run_current_test<int32_t, int32_t, float, float>(
-    override_File_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-TEST_P(Tests_Leiden_File, CheckInt64Int64FloatFloat)
-{
-  run_current_test<int64_t, int64_t, float, float>(
-    override_File_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-TEST_P(Tests_Leiden_File32, CheckInt32Int32FloatFloat)
-{
-  run_current_test<int32_t, int32_t, float, float>(
-    override_File_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-TEST_P(Tests_Leiden_File64, CheckInt64Int64FloatFloat)
-{
-  run_current_test<int64_t, int64_t, float, float>(
-    override_File_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-TEST_P(Tests_Leiden_Rmat, CheckInt32Int32FloatFloat)
-{
-  run_current_test<int32_t, int32_t, float, float>(
-    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-TEST_P(Tests_Leiden_Rmat, CheckInt64Int64FloatFloat)
-{
-  run_current_test<int64_t, int64_t, float, float>(
-    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-TEST_P(Tests_Leiden_Rmat32, CheckInt32Int32FloatFloat)
-{
-  run_current_test<int32_t, int32_t, float, float>(
-    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-TEST_P(Tests_Leiden_Rmat64, CheckInt64Int64FloatFloat)
-{
-  run_current_test<int64_t, int64_t, float, float>(
-    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
-}
-
-// FIXME: Expand testing once we evaluate RMM memory use
-INSTANTIATE_TEST_SUITE_P(
-  simple_test,
-  Tests_Leiden_File,
-  ::testing::Combine(::testing::Values(Leiden_Usecase{100, 1, false, 3, 0.408695}),
-                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
-
-INSTANTIATE_TEST_SUITE_P(
-  file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with
-                          --gtest_filter to select only the file_benchmark_test with a specific
-                          vertex & edge type combination) by command line arguments and do not
-                          include more than one File_Usecase that differ only in filename
-                          (to avoid running same benchmarks more than once) */
-  Tests_Leiden_File32,
-  ::testing::Combine(
-    // disable correctness checks for large graphs
-    ::testing::Values(Leiden_Usecase{}),
-    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
-
-INSTANTIATE_TEST_SUITE_P(
-  file64_benchmark_test, /* note that the test filename can be overridden in benchmarking (with
-                          --gtest_filter to select only the file_benchmark_test with a specific
-                          vertex & edge type combination) by command line arguments and do not
-                          include more than one File_Usecase that differ only in filename
-                          (to avoid running same benchmarks more than once) */
-  Tests_Leiden_File64,
-  ::testing::Combine(
-    // disable correctness checks for large graphs
-    ::testing::Values(Leiden_Usecase{}),
-    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
-
-CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/components/scc_test.cu b/cpp/tests/components/scc_test.cu
index abf9764e524..9cb2868faa5 100644
--- a/cpp/tests/components/scc_test.cu
+++ b/cpp/tests/components/scc_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
  * and proprietary rights in and to this software, related documentation
@@ -222,8 +222,7 @@ INSTANTIATE_TEST_SUITE_P(
     Usecase("test/datasets/cage6.mtx")  // DG "small" enough to meet SCC GPU memory requirements
     ));
 
-struct SCCSmallTest : public ::testing::Test {
-};
+struct SCCSmallTest : public ::testing::Test {};
 
 // FIXME: we should take advantage of gtest parameterization over copy-and-paste reuse.
 //
diff --git a/cpp/tests/generators/erdos_renyi_test.cpp b/cpp/tests/generators/erdos_renyi_test.cpp
index 1183c3bbe36..4b78cee2923 100644
--- a/cpp/tests/generators/erdos_renyi_test.cpp
+++ b/cpp/tests/generators/erdos_renyi_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,8 +27,7 @@
 
 #include <gtest/gtest.h>
 
-struct GenerateErdosRenyiTest : public ::testing::Test {
-};
+struct GenerateErdosRenyiTest : public ::testing::Test {};
 
 template <typename vertex_t>
 void test_symmetric(std::vector<vertex_t>& h_src_v, std::vector<vertex_t>& h_dst_v)
diff --git a/cpp/tests/generators/generate_bipartite_rmat_test.cpp b/cpp/tests/generators/generate_bipartite_rmat_test.cpp
new file mode 100644
index 00000000000..b97c0a7483c
--- /dev/null
+++ b/cpp/tests/generators/generate_bipartite_rmat_test.cpp
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include <tuple>
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_generators.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+// this function assumes that vertex IDs are not scrambled
+template <typename vertex_t>
+void validate_bipartite_rmat_distribution(
+  std::tuple<vertex_t, vertex_t>* edges,
+  size_t num_edges,
+  vertex_t src_first,
+  vertex_t src_last,
+  vertex_t dst_first,
+  vertex_t dst_last,
+  double a,
+  double b,
+  double c,
+  size_t min_edges /* stop recursion if # edges < min_edges */,
+  double error_tolerance /* (computed a|b|c - input a|b|c) shoud be smaller than error_tolerance*/)
+{
+  // we cannot expect the ratios of the edges in the four quadrants of the graph adjacency matrix to
+  // converge close to a, b, c, d if num_edges is not large enough.
+  if (num_edges < min_edges) { return; }
+
+  auto src_threshold = (src_first + src_last) / 2;
+  auto dst_threshold = (dst_first + dst_last) / 2;
+
+  if (src_last - src_first >= 2) {
+    auto a_plus_b_last = std::partition(edges, edges + num_edges, [src_threshold](auto edge) {
+      return std::get<0>(edge) < src_threshold;
+    });
+    if (dst_last - dst_first >= 2) {
+      auto a_last = std::partition(edges, a_plus_b_last, [dst_threshold](auto edge) {
+        return std::get<1>(edge) < dst_threshold;
+      });
+      auto c_last = std::partition(a_plus_b_last, edges + num_edges, [dst_threshold](auto edge) {
+        return std::get<1>(edge) < dst_threshold;
+      });
+
+      ASSERT_TRUE(std::abs((double)std::distance(edges, a_last) / num_edges - a) < error_tolerance)
+        << "# edges=" << num_edges
+        << " computed a=" << (double)std::distance(edges, a_last) / num_edges << " iput a=" << a
+        << " error tolerance=" << error_tolerance << ".";
+      ASSERT_TRUE(std::abs((double)std::distance(a_last, a_plus_b_last) / num_edges - b) <
+                  error_tolerance)
+        << "# edges=" << num_edges
+        << " computed b=" << (double)std::distance(a_last, a_plus_b_last) / num_edges
+        << " iput b=" << b << " error tolerance=" << error_tolerance << ".";
+      ASSERT_TRUE(std::abs((double)std::distance(a_plus_b_last, c_last) / num_edges - c) <
+                  error_tolerance)
+        << "# edges=" << num_edges
+        << " computed c=" << (double)std::distance(a_plus_b_last, c_last) / num_edges
+        << " iput c=" << c << " error tolerance=" << error_tolerance << ".";
+
+      if ((src_threshold - src_first) * (dst_threshold - dst_first) >= 2) {
+        validate_bipartite_rmat_distribution(edges,
+                                             std::distance(edges, a_last),
+                                             src_first,
+                                             src_threshold,
+                                             dst_first,
+                                             dst_threshold,
+                                             a,
+                                             b,
+                                             c,
+                                             min_edges,
+                                             error_tolerance);
+      }
+      if ((src_threshold - src_first) * (dst_last - dst_threshold) >= 2) {
+        validate_bipartite_rmat_distribution(a_last,
+                                             std::distance(a_last, a_plus_b_last),
+                                             src_first,
+                                             src_threshold,
+                                             dst_threshold,
+                                             dst_last,
+                                             a,
+                                             b,
+                                             c,
+                                             min_edges,
+                                             error_tolerance);
+      }
+      if ((src_last - src_threshold) * (dst_threshold - dst_first) >= 2) {
+        validate_bipartite_rmat_distribution(a_plus_b_last,
+                                             std::distance(a_plus_b_last, c_last),
+                                             src_threshold,
+                                             src_last,
+                                             dst_first,
+                                             dst_threshold,
+                                             a,
+                                             b,
+                                             c,
+                                             min_edges,
+                                             error_tolerance);
+      }
+      if ((src_last - src_threshold) * (dst_last - dst_threshold) >= 2) {
+        validate_bipartite_rmat_distribution(c_last,
+                                             std::distance(c_last, edges + num_edges),
+                                             src_threshold,
+                                             src_last,
+                                             dst_threshold,
+                                             dst_last,
+                                             a,
+                                             b,
+                                             c,
+                                             min_edges,
+                                             error_tolerance);
+      }
+    } else {
+      ASSERT_TRUE(std::abs((double)std::distance(edges, a_plus_b_last) / num_edges - (a + b)) <
+                  error_tolerance)
+        << "# edges=" << num_edges
+        << " computed a+b=" << (double)std::distance(edges, a_plus_b_last) / num_edges
+        << " iput a+b=" << (a + b) << " error tolerance=" << error_tolerance << ".";
+      if (src_threshold - src_first >= 2) {
+        validate_bipartite_rmat_distribution(edges,
+                                             std::distance(edges, a_plus_b_last),
+                                             src_first,
+                                             src_threshold,
+                                             dst_first,
+                                             dst_last,
+                                             a,
+                                             b,
+                                             c,
+                                             min_edges,
+                                             error_tolerance);
+      }
+      if (src_last - src_threshold >= 2) {
+        validate_bipartite_rmat_distribution(edges,
+                                             std::distance(a_plus_b_last, edges + num_edges),
+                                             src_threshold,
+                                             src_last,
+                                             dst_first,
+                                             dst_last,
+                                             a,
+                                             b,
+                                             c,
+                                             min_edges,
+                                             error_tolerance);
+      }
+    }
+  } else if (dst_last - dst_first >= 2) {
+    auto a_plus_c_last = std::partition(edges, edges + num_edges, [dst_threshold](auto edge) {
+      return std::get<1>(edge) < dst_threshold;
+    });
+    ASSERT_TRUE(std::abs((double)std::distance(edges, a_plus_c_last) / num_edges - (a + c)) <
+                error_tolerance)
+      << "# edges=" << num_edges
+      << " computed a+c=" << (double)std::distance(edges, a_plus_c_last) / num_edges
+      << " iput a+c=" << (a + c) << " error tolerance=" << error_tolerance << ".";
+    if (dst_threshold - dst_first >= 2) {
+      validate_bipartite_rmat_distribution(edges,
+                                           std::distance(edges, a_plus_c_last),
+                                           src_first,
+                                           src_last,
+                                           dst_first,
+                                           dst_threshold,
+                                           a,
+                                           b,
+                                           c,
+                                           min_edges,
+                                           error_tolerance);
+    }
+    if (dst_last - dst_threshold >= 2) {
+      validate_bipartite_rmat_distribution(edges,
+                                           std::distance(a_plus_c_last, edges + num_edges),
+                                           src_first,
+                                           src_last,
+                                           dst_threshold,
+                                           dst_last,
+                                           a,
+                                           b,
+                                           c,
+                                           min_edges,
+                                           error_tolerance);
+    }
+  }
+
+  return;
+}
+
+struct GenerateBipartiteRmat_Usecase {
+  size_t src_scale{0};
+  size_t dst_scale{0};
+  size_t src_edge_factor{0};  // # edges = 2^src_scale * src_edge_factor
+  double a{0.0};
+  double b{0.0};
+  double c{0.0};
+
+  GenerateBipartiteRmat_Usecase(
+    size_t src_scale, size_t dst_scale, size_t src_edge_factor, double a, double b, double c)
+    : src_scale(src_scale),
+      dst_scale(dst_scale),
+      src_edge_factor(src_edge_factor),
+      a(a),
+      b(b),
+      c(c){};
+};
+
+class Tests_GenerateBipartiteRmat : public ::testing::TestWithParam<GenerateBipartiteRmat_Usecase> {
+ public:
+  Tests_GenerateBipartiteRmat() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t>
+  void run_current_test(GenerateBipartiteRmat_Usecase const& configuration)
+  {
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    auto num_src_vertices = static_cast<vertex_t>(size_t{1} << configuration.src_scale);
+    auto num_dst_vertices = static_cast<vertex_t>(size_t{1} << configuration.dst_scale);
+
+    std::vector<size_t> no_scramble_out_degrees(num_src_vertices, 0);
+    std::vector<size_t> no_scramble_in_degrees(num_dst_vertices, 0);
+    std::vector<size_t> scramble_out_degrees(num_src_vertices, 0);
+    std::vector<size_t> scramble_in_degrees(num_dst_vertices, 0);
+    for (size_t scramble = 0; scramble < 2; ++scramble) {
+      raft::random::RngState rng_state(0);
+
+      rmm::device_uvector<vertex_t> d_srcs(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> d_dsts(0, handle.get_stream());
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        hr_timer.start("Generate edge list");
+      }
+
+      std::tie(d_srcs, d_dsts) = cugraph::generate_bipartite_rmat_edgelist<vertex_t>(
+        handle,
+        rng_state,
+        configuration.src_scale,
+        configuration.dst_scale,
+        (size_t{1} << configuration.src_scale) * configuration.src_edge_factor,
+        configuration.a,
+        configuration.b,
+        configuration.c);
+
+      if (scramble == 1) {
+        d_srcs = cugraph::scramble_vertex_ids(handle, std::move(d_srcs), configuration.src_scale);
+        d_dsts = cugraph::scramble_vertex_ids(handle, std::move(d_dsts), configuration.dst_scale);
+      }
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        hr_timer.stop();
+        hr_timer.display_and_clear(std::cout);
+      }
+
+      auto h_cugraph_srcs = cugraph::test::to_host(handle, d_srcs);
+      auto h_cugraph_dsts = cugraph::test::to_host(handle, d_dsts);
+
+      ASSERT_TRUE((h_cugraph_srcs.size() ==
+                   (size_t{1} << configuration.src_scale) * configuration.src_edge_factor) &&
+                  (h_cugraph_dsts.size() ==
+                   (size_t{1} << configuration.src_scale) * configuration.src_edge_factor))
+        << "Returned an invalid number of bipartite R-mat graph edges.";
+      ASSERT_TRUE(std::count_if(h_cugraph_srcs.begin(),
+                                h_cugraph_srcs.end(),
+                                [num_src_vertices](auto v) {
+                                  return !cugraph::is_valid_vertex(num_src_vertices, v);
+                                }) == 0)
+        << "Returned bipartite R-mat graph edges have invalid source vertex IDs.";
+      ASSERT_TRUE(std::count_if(h_cugraph_dsts.begin(),
+                                h_cugraph_dsts.end(),
+                                [num_dst_vertices](auto v) {
+                                  return !cugraph::is_valid_vertex(num_dst_vertices, v);
+                                }) == 0)
+        << "Returned bipartite R-mat graph edges have invalid destination vertex IDs.";
+
+      if (!scramble) {
+        std::vector<std::tuple<vertex_t, vertex_t>> h_cugraph_edges(h_cugraph_srcs.size());
+        for (size_t i = 0; i < h_cugraph_srcs.size(); ++i) {
+          h_cugraph_edges[i] = std::make_tuple(h_cugraph_srcs[i], h_cugraph_dsts[i]);
+        }
+
+        validate_bipartite_rmat_distribution(h_cugraph_edges.data(),
+                                             h_cugraph_edges.size(),
+                                             vertex_t{0},
+                                             num_src_vertices,
+                                             vertex_t{0},
+                                             num_dst_vertices,
+                                             configuration.a,
+                                             configuration.b,
+                                             configuration.c,
+                                             size_t{100000},
+                                             0.01);
+      }
+
+      if (scramble) {
+        std::for_each(h_cugraph_srcs.begin(),
+                      h_cugraph_srcs.end(),
+                      [&scramble_out_degrees](auto src) { scramble_out_degrees[src]++; });
+        std::for_each(h_cugraph_dsts.begin(),
+                      h_cugraph_dsts.end(),
+                      [&scramble_in_degrees](auto dst) { scramble_in_degrees[dst]++; });
+        std::sort(scramble_out_degrees.begin(), scramble_out_degrees.end());
+        std::sort(scramble_in_degrees.begin(), scramble_in_degrees.end());
+      } else {
+        std::for_each(h_cugraph_srcs.begin(),
+                      h_cugraph_srcs.end(),
+                      [&no_scramble_out_degrees](auto src) { no_scramble_out_degrees[src]++; });
+        std::for_each(h_cugraph_dsts.begin(),
+                      h_cugraph_dsts.end(),
+                      [&no_scramble_in_degrees](auto dst) { no_scramble_in_degrees[dst]++; });
+        std::sort(no_scramble_out_degrees.begin(), no_scramble_out_degrees.end());
+        std::sort(no_scramble_in_degrees.begin(), no_scramble_in_degrees.end());
+      }
+    }
+
+    // this relies on the fact that the edge generator is deterministic.
+    // ideally, we should test that the two graphs are isomorphic, but this is NP hard; instead, we
+    // just check out-degree & in-degree distributions
+    ASSERT_TRUE(std::equal(no_scramble_out_degrees.begin(),
+                           no_scramble_out_degrees.end(),
+                           scramble_out_degrees.begin()));
+    ASSERT_TRUE(std::equal(
+      no_scramble_in_degrees.begin(), no_scramble_in_degrees.end(), scramble_in_degrees.begin()));
+  }
+};
+
+TEST_P(Tests_GenerateBipartiteRmat, CheckInt32) { run_current_test<int32_t>(GetParam()); }
+TEST_P(Tests_GenerateBipartiteRmat, CheckInt64) { run_current_test<int64_t>(GetParam()); }
+
+INSTANTIATE_TEST_SUITE_P(
+  simple_test,
+  Tests_GenerateBipartiteRmat,
+  ::testing::Values(GenerateBipartiteRmat_Usecase(20, 10, 16, 0.57, 0.19, 0.19),
+                    GenerateBipartiteRmat_Usecase(10, 20, 16, 0.57, 0.19, 0.19),
+                    GenerateBipartiteRmat_Usecase(20, 10, 16, 0.45, 0.22, 0.22),
+                    GenerateBipartiteRmat_Usecase(10, 20, 16, 0.45, 0.22, 0.22)));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/generators/generate_rmat_test.cpp b/cpp/tests/generators/generate_rmat_test.cpp
index c4150b9732d..bdf79fd5962 100644
--- a/cpp/tests/generators/generate_rmat_test.cpp
+++ b/cpp/tests/generators/generate_rmat_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 
 #include <cugraph/graph.hpp>
 #include <cugraph/graph_generators.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
 
 #include <raft/core/handle.hpp>
 #include <raft/util/cudart_utils.hpp>
@@ -88,59 +89,67 @@ void validate_rmat_distribution(
       << " iput c=" << c << " error tolerance=" << error_tolerance << ".";
   }
 
-  validate_rmat_distribution(edges,
-                             std::distance(edges, a_last),
-                             src_first,
-                             src_threshold,
-                             dst_first,
-                             dst_threshold,
-                             a,
-                             b,
-                             c,
-                             clip_and_flip,
-                             min_edges,
-                             error_tolerance);
-  validate_rmat_distribution(a_last,
-                             std::distance(a_last, a_plus_b_last),
-                             src_first,
-                             (src_first + src_last) / 2,
-                             dst_threshold,
-                             dst_last,
-                             a,
-                             b,
-                             c,
-                             clip_and_flip,
-                             min_edges,
-                             error_tolerance);
-  validate_rmat_distribution(a_plus_b_last,
-                             std::distance(a_plus_b_last, c_last),
-                             src_threshold,
-                             src_last,
-                             dst_first,
-                             dst_threshold,
-                             a,
-                             b,
-                             c,
-                             clip_and_flip,
-                             min_edges,
-                             error_tolerance);
-  validate_rmat_distribution(c_last,
-                             std::distance(c_last, edges + num_edges),
-                             src_threshold,
-                             src_last,
-                             dst_threshold,
-                             dst_last,
-                             a,
-                             b,
-                             c,
-                             clip_and_flip,
-                             min_edges,
-                             error_tolerance);
+  if ((src_threshold - src_first) * (dst_threshold - dst_first) >= 2) {
+    validate_rmat_distribution(edges,
+                               std::distance(edges, a_last),
+                               src_first,
+                               src_threshold,
+                               dst_first,
+                               dst_threshold,
+                               a,
+                               b,
+                               c,
+                               clip_and_flip,
+                               min_edges,
+                               error_tolerance);
+  }
+  if ((src_threshold - src_first) * (dst_last - dst_threshold) >= 2) {
+    validate_rmat_distribution(a_last,
+                               std::distance(a_last, a_plus_b_last),
+                               src_first,
+                               src_threshold,
+                               dst_threshold,
+                               dst_last,
+                               a,
+                               b,
+                               c,
+                               clip_and_flip,
+                               min_edges,
+                               error_tolerance);
+  }
+  if ((src_last - src_threshold) * (dst_threshold - dst_first) >= 2) {
+    validate_rmat_distribution(a_plus_b_last,
+                               std::distance(a_plus_b_last, c_last),
+                               src_threshold,
+                               src_last,
+                               dst_first,
+                               dst_threshold,
+                               a,
+                               b,
+                               c,
+                               clip_and_flip,
+                               min_edges,
+                               error_tolerance);
+  }
+  if ((src_last - src_threshold) * (dst_last - dst_threshold) >= 2) {
+    validate_rmat_distribution(c_last,
+                               std::distance(c_last, edges + num_edges),
+                               src_threshold,
+                               src_last,
+                               dst_threshold,
+                               dst_last,
+                               a,
+                               b,
+                               c,
+                               clip_and_flip,
+                               min_edges,
+                               error_tolerance);
+  }
 
   return;
 }
 
-typedef struct GenerateRmat_Usecase_t {
+struct GenerateRmat_Usecase {
   size_t scale{0};
   size_t edge_factor{0};
   double a{0.0};
@@ -148,10 +157,10 @@ typedef struct GenerateRmat_Usecase_t {
   double c{0.0};
   bool clip_and_flip{false};
 
-  GenerateRmat_Usecase_t(
+  GenerateRmat_Usecase(
     size_t scale, size_t edge_factor, double a, double b, double c, bool clip_and_flip)
     : scale(scale), edge_factor(edge_factor), a(a), b(b), c(c), clip_and_flip(clip_and_flip){};
-} GenerateRmat_Usecase;
+};
 
 class Tests_GenerateRmat : public ::testing::TestWithParam<GenerateRmat_Usecase> {
  public:
@@ -167,6 +176,7 @@ class Tests_GenerateRmat : public ::testing::TestWithParam<GenerateRmat_Usecase>
   void run_current_test(GenerateRmat_Usecase const& configuration)
   {
     raft::handle_t handle{};
+    HighResTimer hr_timer{};
 
     auto num_vertices = static_cast<vertex_t>(size_t{1} << configuration.scale);
     std::vector<size_t> no_scramble_out_degrees(num_vertices, 0);
@@ -174,23 +184,36 @@ class Tests_GenerateRmat : public ::testing::TestWithParam<GenerateRmat_Usecase>
     std::vector<size_t> scramble_out_degrees(num_vertices, 0);
     std::vector<size_t> scramble_in_degrees(num_vertices, 0);
     for (size_t scramble = 0; scramble < 2; ++scramble) {
+      raft::random::RngState rng_state(0);
+
       rmm::device_uvector<vertex_t> d_srcs(0, handle.get_stream());
       rmm::device_uvector<vertex_t> d_dsts(0, handle.get_stream());
 
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        hr_timer.start("Generate edge list");
+      }
 
       std::tie(d_srcs, d_dsts) = cugraph::generate_rmat_edgelist<vertex_t>(
         handle,
+        rng_state,
         configuration.scale,
         (size_t{1} << configuration.scale) * configuration.edge_factor,
         configuration.a,
         configuration.b,
         configuration.c,
-        uint64_t{0},
         configuration.clip_and_flip);
-      // static_cast<bool>(scramble));
 
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      if (scramble == 1) {
+        std::tie(d_srcs, d_dsts) = cugraph::scramble_vertex_ids(
+          handle, std::move(d_srcs), std::move(d_dsts), configuration.scale);
+      }
+
+      if (cugraph::test::g_perf) {
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+        hr_timer.stop();
+        hr_timer.display_and_clear(std::cout);
+      }
 
       auto h_cugraph_srcs = cugraph::test::to_host(handle, d_srcs);
       auto h_cugraph_dsts = cugraph::test::to_host(handle, d_dsts);
@@ -199,17 +222,17 @@ class Tests_GenerateRmat : public ::testing::TestWithParam<GenerateRmat_Usecase>
         (h_cugraph_srcs.size() == (size_t{1} << configuration.scale) * configuration.edge_factor) &&
         (h_cugraph_dsts.size() == (size_t{1} << configuration.scale) * configuration.edge_factor))
         << "Returned an invalid number of R-mat graph edges.";
-      ASSERT_TRUE(
-        std::count_if(h_cugraph_srcs.begin(),
-                      h_cugraph_srcs.end(),
-                      [num_vertices = static_cast<vertex_t>(size_t{1} << configuration.scale)](
-                        auto v) { return !cugraph::is_valid_vertex(num_vertices, v); }) == 0)
+      ASSERT_TRUE(std::count_if(h_cugraph_srcs.begin(),
+                                h_cugraph_srcs.end(),
+                                [num_vertices](auto v) {
+                                  return !cugraph::is_valid_vertex(num_vertices, v);
+                                }) == 0)
         << "Returned R-mat graph edges have invalid source vertex IDs.";
-      ASSERT_TRUE(
-        std::count_if(h_cugraph_dsts.begin(),
-                      h_cugraph_dsts.end(),
-                      [num_vertices = static_cast<vertex_t>(size_t{1} << configuration.scale)](
-                        auto v) { return !cugraph::is_valid_vertex(num_vertices, v); }) == 0)
+      ASSERT_TRUE(std::count_if(h_cugraph_dsts.begin(),
+                                h_cugraph_dsts.end(),
+                                [num_vertices](auto v) {
+                                  return !cugraph::is_valid_vertex(num_vertices, v);
+                                }) == 0)
         << "Returned R-mat graph edges have invalid destination vertex IDs.";
 
       if (!scramble) {
@@ -260,7 +283,7 @@ class Tests_GenerateRmat : public ::testing::TestWithParam<GenerateRmat_Usecase>
     }
 
     // this relies on the fact that the edge generator is deterministic.
-    // ideally, we should test that the two graphs are isomorphic, but this is NP hard; insted, we
+    // ideally, we should test that the two graphs are isomorphic, but this is NP hard; instead, we
     // just check out-degree & in-degree distributions
     ASSERT_TRUE(std::equal(no_scramble_out_degrees.begin(),
                            no_scramble_out_degrees.end(),
@@ -270,9 +293,8 @@ class Tests_GenerateRmat : public ::testing::TestWithParam<GenerateRmat_Usecase>
   }
 };
 
-// FIXME: add tests for type combinations
-
 TEST_P(Tests_GenerateRmat, CheckInt32) { run_current_test<int32_t>(GetParam()); }
+TEST_P(Tests_GenerateRmat, CheckInt64) { run_current_test<int64_t>(GetParam()); }
 
 INSTANTIATE_TEST_SUITE_P(simple_test,
                          Tests_GenerateRmat,
@@ -280,7 +302,8 @@ INSTANTIATE_TEST_SUITE_P(simple_test,
                                            GenerateRmat_Usecase(20, 16, 0.57, 0.19, 0.19, false),
                                            GenerateRmat_Usecase(20, 16, 0.45, 0.22, 0.22, true),
                                            GenerateRmat_Usecase(20, 16, 0.45, 0.22, 0.22, false)));
-typedef struct GenerateRmats_Usecase_t {
+
+struct GenerateRmats_Usecase {
   size_t n_edgelists{0};
   size_t min_scale{0};
   size_t max_scale{0};
@@ -288,19 +311,20 @@ typedef struct GenerateRmats_Usecase_t {
   cugraph::generator_distribution_t component_distribution;
   cugraph::generator_distribution_t edge_distribution;
 
-  GenerateRmats_Usecase_t(size_t n_edgelists,
-                          size_t min_scale,
-                          size_t max_scale,
-                          size_t edge_factor,
-                          cugraph::generator_distribution_t component_distribution,
-                          cugraph::generator_distribution_t edge_distribution)
+  GenerateRmats_Usecase(size_t n_edgelists,
+                        size_t min_scale,
+                        size_t max_scale,
+                        size_t edge_factor,
+                        cugraph::generator_distribution_t component_distribution,
+                        cugraph::generator_distribution_t edge_distribution)
     : n_edgelists(n_edgelists),
       min_scale(min_scale),
       max_scale(max_scale),
       component_distribution(component_distribution),
       edge_distribution(edge_distribution),
       edge_factor(edge_factor){};
-} GenerateRmats_Usecase;
+};
+
 class Tests_GenerateRmats : public ::testing::TestWithParam<GenerateRmats_Usecase> {
  public:
   Tests_GenerateRmats() {}
@@ -315,19 +339,30 @@ class Tests_GenerateRmats : public ::testing::TestWithParam<GenerateRmats_Usecas
   void run_current_test(GenerateRmats_Usecase const& configuration)
   {
     raft::handle_t handle{};
+    HighResTimer hr_timer{};
 
-    RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+    raft::random::RngState rng_state(0);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.start("Generate edge list");
+    }
 
     auto outputs = cugraph::generate_rmat_edgelists<vertex_t>(handle,
+                                                              rng_state,
                                                               configuration.n_edgelists,
                                                               configuration.min_scale,
                                                               configuration.max_scale,
                                                               configuration.edge_factor,
                                                               configuration.component_distribution,
-                                                              configuration.edge_distribution,
-                                                              uint64_t{0});
+                                                              configuration.edge_distribution);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
 
-    RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
     ASSERT_EQ(configuration.n_edgelists, outputs.size());
     for (auto i = outputs.begin(); i != outputs.end(); ++i) {
       ASSERT_EQ(std::get<0>(*i).size(), std::get<1>(*i).size());
@@ -336,7 +371,9 @@ class Tests_GenerateRmats : public ::testing::TestWithParam<GenerateRmats_Usecas
     }
   }
 };
+
 TEST_P(Tests_GenerateRmats, CheckInt32) { run_current_test<int32_t>(GetParam()); }
+TEST_P(Tests_GenerateRmats, CheckInt64) { run_current_test<int64_t>(GetParam()); }
 
 INSTANTIATE_TEST_SUITE_P(
   simple_test,
diff --git a/cpp/tests/generators/generators_test.cpp b/cpp/tests/generators/generators_test.cpp
index fa82959baf8..b5dbf54b265 100644
--- a/cpp/tests/generators/generators_test.cpp
+++ b/cpp/tests/generators/generators_test.cpp
@@ -26,8 +26,7 @@
 
 #include <random>
 
-struct GeneratorsTest : public ::testing::Test {
-};
+struct GeneratorsTest : public ::testing::Test {};
 
 TEST_F(GeneratorsTest, PathGraphTest)
 {
@@ -591,7 +590,7 @@ TEST_F(GeneratorsTest, ScrambleTest)
   using vertex_t = int32_t;
   using edge_t   = int32_t;
 
-  edge_t num_vertices{30};
+  vertex_t num_vertices{30};
   edge_t num_edges{100};
 
   raft::handle_t handle;
@@ -615,7 +614,9 @@ TEST_F(GeneratorsTest, ScrambleTest)
   raft::update_device(d_src_v.data(), input_src_v.data(), input_src_v.size(), handle.get_stream());
   raft::update_device(d_dst_v.data(), input_dst_v.data(), input_dst_v.size(), handle.get_stream());
 
-  cugraph::scramble_vertex_ids(handle, d_src_v, d_dst_v, 5, 0);
+  auto lgN = static_cast<size_t>(std::ceil(std::log2(num_vertices)));
+  std::tie(d_src_v, d_dst_v) =
+    cugraph::scramble_vertex_ids(handle, std::move(d_src_v), std::move(d_dst_v), lgN);
 
   auto output_src_v = cugraph::test::to_host(handle, d_src_v);
   auto output_dst_v = cugraph::test::to_host(handle, d_dst_v);
diff --git a/cpp/tests/linear_assignment/hungarian_test.cu b/cpp/tests/linear_assignment/hungarian_test.cu
index 66efaf76704..5079074da97 100644
--- a/cpp/tests/linear_assignment/hungarian_test.cu
+++ b/cpp/tests/linear_assignment/hungarian_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
  * and proprietary rights in and to this software, related documentation
@@ -55,8 +55,7 @@ __global__ void generate_random(curandState* state, int n, T* data, int32_t uppe
   state[first] = local_state;
 }
 
-struct HungarianTest : public ::testing::Test {
-};
+struct HungarianTest : public ::testing::Test {};
 
 TEST_F(HungarianTest, Bipartite4x4)
 {
@@ -221,7 +220,7 @@ TEST_F(HungarianTest, Dense4x6)
   int32_t num_rows = 4;
   int32_t num_cols = 6;
   float cost[]     = {0,  16, 1,    0,    90, 100, 33, 45, 0,    4,    90, 100,
-                  22, 0,  1000, 2000, 90, 100, 2,  0,  3000, 4000, 90, 100};
+                      22, 0,  1000, 2000, 90, 100, 2,  0,  3000, 4000, 90, 100};
 
   float min_cost = 2;
 
@@ -248,7 +247,7 @@ TEST_F(HungarianTest, Dense6x4)
   int32_t num_rows = 6;
   int32_t num_cols = 4;
   float cost[]     = {0,  16, 1,    0,    33, 45,  0,   4,   90, 100, 110,  120,
-                  22, 0,  1000, 2000, 90, 100, 110, 120, 2,  0,   3000, 4000};
+                      22, 0,  1000, 2000, 90, 100, 110, 120, 2,  0,   3000, 4000};
 
   float min_cost = 2;
 
diff --git a/cpp/tests/link_analysis/mg_pagerank_test.cpp b/cpp/tests/link_analysis/mg_pagerank_test.cpp
index 0745ec99b23..b3d9e0271d0 100644
--- a/cpp/tests/link_analysis/mg_pagerank_test.cpp
+++ b/cpp/tests/link_analysis/mg_pagerank_test.cpp
@@ -95,6 +95,7 @@ class Tests_MGPageRank
       d_mg_personalization_vertices = cugraph::select_random_vertices(
         *handle_,
         mg_graph_view,
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
         rng_state,
         std::max(
           static_cast<size_t>(mg_graph_view.number_of_vertices() *
diff --git a/cpp/tests/link_analysis/pagerank_test.cpp b/cpp/tests/link_analysis/pagerank_test.cpp
index de996d8aec0..adb4ea2fa54 100644
--- a/cpp/tests/link_analysis/pagerank_test.cpp
+++ b/cpp/tests/link_analysis/pagerank_test.cpp
@@ -183,6 +183,7 @@ class Tests_PageRank
       d_personalization_vertices = cugraph::select_random_vertices(
         handle,
         graph_view,
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
         rng_state,
         std::max(
           static_cast<size_t>(graph_view.number_of_vertices() *
diff --git a/cpp/tests/link_prediction/mg_similarity_test.cpp b/cpp/tests/link_prediction/mg_similarity_test.cpp
index 4a05c9f43d1..c2a0b23c6d7 100644
--- a/cpp/tests/link_prediction/mg_similarity_test.cpp
+++ b/cpp/tests/link_prediction/mg_similarity_test.cpp
@@ -282,6 +282,6 @@ INSTANTIATE_TEST_SUITE_P(
   ::testing::Combine(
     // disable correctness checks for large graphs
     ::testing::Values(Similarity_Usecase{false, false, 20}),
-    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false))));
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 16, 0.57, 0.19, 0.19, 0, true, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu
index 7a68642666a..bebb21bd720 100644
--- a/cpp/tests/prims/mg_count_if_e.cu
+++ b/cpp/tests/prims/mg_count_if_e.cu
@@ -44,6 +44,7 @@
 #include <thrust/distance.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu
index 1ac19218d9a..1c85b55e4be 100644
--- a/cpp/tests/prims/mg_extract_transform_e.cu
+++ b/cpp/tests/prims/mg_extract_transform_e.cu
@@ -157,8 +157,8 @@ class Tests_MGExtractTransformE
     // 1. create MG graph
 
     constexpr bool is_multi_gpu     = true;
-    constexpr bool renumber         = true;   // needs to be true for multi gpu case
-    constexpr bool store_transposed = false;  // needs to be false for using extract_transform_e
+    constexpr bool renumber         = true;    // needs to be true for multi gpu case
+    constexpr bool store_transposed = false;   // needs to be false for using extract_transform_e
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
       handle_->get_comms().barrier();
diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
index 7e64d86b2da..eb6a8fd5cb6 100644
--- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
@@ -42,6 +42,7 @@
 
 #include <thrust/adjacent_difference.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
 #include <thrust/tuple.h>
 
 #include <gtest/gtest.h>
@@ -143,7 +144,13 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
                               : std::min(prims_usecase.num_seeds,
                                      static_cast<size_t>(mg_graph_view.number_of_vertices()));
     auto mg_vertex_buffer = cugraph::select_random_vertices(
-      *handle_, mg_graph_view, rng_state, select_count, prims_usecase.with_replacement, false);
+      *handle_,
+      mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      rng_state,
+      select_count,
+      prims_usecase.with_replacement,
+      false);
 
     constexpr size_t bucket_idx_cur = 0;
     constexpr size_t num_buckets    = 1;
diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
index be7cc1e47ab..97d52c04114 100644
--- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu
@@ -48,6 +48,7 @@
 #include <thrust/equal.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
diff --git a/cpp/tests/prims/mg_transform_e.cu b/cpp/tests/prims/mg_transform_e.cu
new file mode 100644
index 00000000000..47def15fffc
--- /dev/null
+++ b/cpp/tests/prims/mg_transform_e.cu
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "property_generator.cuh"
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/mg_utilities.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+#include <utilities/thrust_wrapper.hpp>
+
+#include <prims/count_if_e.cuh>
+#include <prims/edge_bucket.cuh>
+#include <prims/fill_edge_property.cuh>
+#include <prims/transform_e.cuh>
+
+#include <cugraph/edge_property.hpp>
+#include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <cuco/detail/hash_functions.cuh>
+
+#include <raft/comms/mpi_comms.hpp>
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include <thrust/count.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/tuple.h>
+
+#include <gtest/gtest.h>
+
+#include <random>
+
+struct Prims_Usecase {
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGTransformE
+  : public ::testing::TestWithParam<std::tuple<Prims_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGTransformE() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename result_t,
+            bool store_transposed>
+  void run_current_test(Prims_Usecase const& prims_usecase, input_usecase_t const& input_usecase)
+  {
+    HighResTimer hr_timer{};
+
+    // 1. create MG graph
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    cugraph::graph_t<vertex_t, edge_t, store_transposed, true> mg_graph(*handle_);
+    std::optional<rmm::device_uvector<vertex_t>> mg_renumber_map{std::nullopt};
+    std::tie(mg_graph, std::ignore, mg_renumber_map) =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, store_transposed, true>(
+        *handle_, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    // 2. run MG transform_e
+
+    const int hash_bin_count = 5;
+    const int initial_value  = 4;
+
+    auto property_initial_value =
+      cugraph::test::generate<vertex_t, result_t>::initial_value(initial_value);
+    auto mg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+      *handle_, *mg_renumber_map, hash_bin_count);
+    auto mg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+    auto mg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+      *handle_, mg_graph_view, mg_vertex_prop);
+
+    cugraph::edge_bucket_t<vertex_t, void, !store_transposed, true, true> edge_list(*handle_);
+    {
+      rmm::device_uvector<vertex_t> srcs(0, handle_->get_stream());
+      rmm::device_uvector<vertex_t> dsts(0, handle_->get_stream());
+      std::tie(srcs, dsts, std::ignore) = cugraph::decompress_to_edgelist(
+        *handle_,
+        mg_graph_view,
+        std::optional<cugraph::edge_property_view_t<edge_t, weight_t const*>>{std::nullopt},
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt});
+      auto edge_first = thrust::make_zip_iterator(
+        thrust::make_tuple(store_transposed ? dsts.begin() : srcs.begin(),
+                           store_transposed ? srcs.begin() : dsts.begin()));
+      srcs.resize(thrust::distance(
+                    edge_first,
+                    thrust::remove_if(handle_->get_thrust_policy(),
+                                      edge_first,
+                                      edge_first + srcs.size(),
+                                      [] __device__(thrust::tuple<vertex_t, vertex_t> e) {
+                                        return ((thrust::get<0>(e) + thrust::get<1>(e)) % 2) != 0;
+                                      })),
+                  handle_->get_stream());
+      dsts.resize(srcs.size(), handle_->get_stream());
+      edge_first = thrust::make_zip_iterator(
+        thrust::make_tuple(store_transposed ? dsts.begin() : srcs.begin(),
+                           store_transposed ? srcs.begin() : dsts.begin()));
+      thrust::sort(handle_->get_thrust_policy(), edge_first, edge_first + srcs.size());
+
+      edge_list.insert(srcs.begin(),
+                       srcs.end(),
+                       dsts.begin());  // now edge_list stores edge pairs with (src + dst) % 2 == 0
+    }
+
+    cugraph::edge_property_t<decltype(mg_graph_view), result_t> edge_value_output(*handle_,
+                                                                                  mg_graph_view);
+
+    cugraph::fill_edge_property(*handle_, mg_graph_view, property_initial_value, edge_value_output);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG transform_reduce_e");
+    }
+
+    cugraph::transform_e(
+      *handle_,
+      mg_graph_view,
+      edge_list,
+      mg_src_prop.view(),
+      mg_dst_prop.view(),
+      cugraph::edge_dummy_property_t{}.view(),
+      [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
+        if (src_property < dst_property) {
+          return src_property;
+        } else {
+          return dst_property;
+        }
+      },
+      edge_value_output.mutable_view());
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    // 3. validate MG results
+
+    if (prims_usecase.check_correctness) {
+      auto num_invalids = cugraph::count_if_e(
+        *handle_,
+        mg_graph_view,
+        mg_src_prop.view(),
+        mg_dst_prop.view(),
+        edge_value_output.view(),
+        [property_initial_value] __device__(
+          auto src, auto dst, auto src_property, auto dst_property, auto edge_property) {
+          if (((src + dst) % 2) == 0) {
+            if (src_property < dst_property) {
+              return edge_property != src_property;
+            } else {
+              return edge_property != dst_property;
+            }
+          } else {
+            return edge_property != property_initial_value;
+          }
+        });
+
+      ASSERT_TRUE(num_invalids == 0);
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGTransformE<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGTransformE_File = Tests_MGTransformE<cugraph::test::File_Usecase>;
+using Tests_MGTransformE_Rmat = Tests_MGTransformE<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGTransformE_File, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(std::get<0>(param),
+                                                                              std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt32Int64FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, thrust::tuple<int, float>, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt64Int64FloatTupleIntFloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, thrust::tuple<int, float>, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_File, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(std::get<0>(param),
+                                                                             std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt32Int32FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, thrust::tuple<int, float>, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt32Int64FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, thrust::tuple<int, float>, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt64Int64FloatTupleIntFloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, thrust::tuple<int, float>, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_File, CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, false>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt32Int32FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt32Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt64Int64FloatTransposeFalse)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, int, false>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_File, CheckInt32Int32FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, true>(std::get<0>(param), std::get<1>(param));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt32Int32FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int32_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt32Int64FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int32_t, int64_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+TEST_P(Tests_MGTransformE_Rmat, CheckInt64Int64FloatTransposeTrue)
+{
+  auto param = GetParam();
+  run_current_test<int64_t, int64_t, float, int, true>(
+    std::get<0>(param),
+    cugraph::test::override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test,
+  Tests_MGTransformE_File,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(rmat_small_test,
+                         Tests_MGTransformE_Rmat,
+                         ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+                                            ::testing::Values(cugraph::test::Rmat_Usecase(
+                                              10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGTransformE_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Prims_Usecase{false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu
index d0e8e120ce5..8dba488f23d 100644
--- a/cpp/tests/prims/mg_transform_reduce_e.cu
+++ b/cpp/tests/prims/mg_transform_reduce_e.cu
@@ -44,6 +44,7 @@
 #include <thrust/distance.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
@@ -107,7 +108,7 @@ class Tests_MGTransformReduceE
   virtual void SetUp() {}
   virtual void TearDown() {}
 
-  // Compare the results of reduce_if_v primitive and thrust reduce on a single GPU
+  // Compare the results of transform_reduce_e primitive
   template <typename vertex_t,
             typename edge_t,
             typename weight_t,
diff --git a/cpp/tests/prims/property_generator.cuh b/cpp/tests/prims/property_generator.cuh
index 7084cb124af..24a21c1cb01 100644
--- a/cpp/tests/prims/property_generator.cuh
+++ b/cpp/tests/prims/property_generator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#pragma once
 #include <prims/update_edge_src_dst_property.cuh>
 
 #include <cugraph/edge_src_dst_property.hpp>
diff --git a/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu b/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu
index 339000cf3f5..82fb2430ca1 100644
--- a/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu
+++ b/cpp/tests/sampling/mg_uniform_neighbor_sampling.cu
@@ -94,6 +94,7 @@ class Tests_MGUniform_Neighbor_Sampling
     auto random_sources = cugraph::select_random_vertices(
       *handle_,
       mg_graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
       rng_state,
       std::max(static_cast<size_t>(mg_graph_view.number_of_vertices() * select_probability),
                std::min(static_cast<size_t>(mg_graph_view.number_of_vertices()), size_t{1})),
diff --git a/cpp/tests/sampling/sg_random_walks_test.cpp b/cpp/tests/sampling/sg_random_walks_test.cpp
index cdf25bca8d6..53b92446b14 100644
--- a/cpp/tests/sampling/sg_random_walks_test.cpp
+++ b/cpp/tests/sampling/sg_random_walks_test.cpp
@@ -199,6 +199,9 @@ using Tests_Node2VecRandomWalks_File =
 using Tests_Node2VecRandomWalks_Rmat =
   Tests_RandomWalks<std::tuple<Node2VecRandomWalks_Usecase, cugraph::test::Rmat_Usecase>>;
 
+#if 0
+// FIXME:  We should use these tests, gtest-1.11.0 makes it a runtime error
+//         to define and not instantiate these.
 TEST_P(Tests_UniformRandomWalks_File, Initialize_i32_i32_f)
 {
   run_current_test<int32_t, int32_t, float>(
@@ -210,6 +213,7 @@ TEST_P(Tests_UniformRandomWalks_Rmat, Initialize_i32_i32_f)
   run_current_test<int32_t, int32_t, float>(
     override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
+#endif
 
 TEST_P(Tests_BiasedRandomWalks_File, Initialize_i32_i32_f)
 {
diff --git a/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu b/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu
index cf4a5dc4707..a59ea7feb8f 100644
--- a/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu
+++ b/cpp/tests/sampling/sg_uniform_neighbor_sampling.cu
@@ -85,6 +85,7 @@ class Tests_Uniform_Neighbor_Sampling
     auto random_sources = cugraph::select_random_vertices(
       handle,
       graph_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
       rng_state,
       std::max(static_cast<size_t>(graph_view.number_of_vertices() * select_probability),
                std::min(static_cast<size_t>(graph_view.number_of_vertices()), size_t{1})),
diff --git a/cpp/tests/structure/induced_subgraph_test.cpp b/cpp/tests/structure/induced_subgraph_test.cpp
index f93de0674c0..05ca917e68e 100644
--- a/cpp/tests/structure/induced_subgraph_test.cpp
+++ b/cpp/tests/structure/induced_subgraph_test.cpp
@@ -151,7 +151,13 @@ class Tests_InducedSubgraph
       ASSERT_TRUE(last - start <= graph_view.number_of_vertices()) << "Invalid subgraph size.";
 
       auto vertices = cugraph::select_random_vertices(
-        handle, graph_view, rng_state, (last - start), false, false);
+        handle,
+        graph_view,
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+        rng_state,
+        (last - start),
+        false,
+        false);
       raft::copy(
         d_subgraph_vertices.data() + start, vertices.data(), vertices.size(), handle.get_stream());
     }
@@ -237,6 +243,9 @@ TEST_P(Tests_InducedSubgraph_File, CheckInt32Int32FloatTransposeTrue)
     override_File_Usecase_with_cmd_line_arguments(GetParam()));
 }
 
+#if 0
+// FIXME:  We should use these tests, gtest-1.11.0 makes it a runtime error
+//         to define and not instantiate these.
 TEST_P(Tests_InducedSubgraph_Rmat, CheckInt32Int32FloatTransposeFalse)
 {
   run_current_test<int32_t, int32_t, float, false>(
@@ -248,6 +257,7 @@ TEST_P(Tests_InducedSubgraph_Rmat, CheckInt32Int32FloatTransposeTrue)
   run_current_test<int32_t, int32_t, float, true>(
     override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
+#endif
 
 INSTANTIATE_TEST_SUITE_P(
   karate_test,
diff --git a/cpp/tests/structure/mg_induced_subgraph_test.cu b/cpp/tests/structure/mg_induced_subgraph_test.cu
index fba8b0a5925..cd5c0cb94c1 100644
--- a/cpp/tests/structure/mg_induced_subgraph_test.cu
+++ b/cpp/tests/structure/mg_induced_subgraph_test.cu
@@ -99,12 +99,14 @@ class Tests_MGInducedSubgraph
       ASSERT_TRUE(induced_subgraph_usecase.subgraph_sizes[i] <= mg_graph_view.number_of_vertices())
         << "Invalid subgraph size.";
 
-      auto vertices             = cugraph::select_random_vertices(*handle_,
-                                                      mg_graph_view,
-                                                      rng_state,
-                                                      induced_subgraph_usecase.subgraph_sizes[i],
-                                                      false,
-                                                      false);
+      auto vertices = cugraph::select_random_vertices(
+        *handle_,
+        mg_graph_view,
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+        rng_state,
+        induced_subgraph_usecase.subgraph_sizes[i],
+        false,
+        false);
       h_subgraph_offsets[i + 1] = h_subgraph_offsets[i] + vertices.size();
       d_subgraph_vertices.resize(h_subgraph_offsets[i + 1], handle_->get_stream());
       raft::copy(d_subgraph_vertices.data() + h_subgraph_offsets[i],
@@ -262,6 +264,9 @@ TEST_P(Tests_MGInducedSubgraph_File, CheckInt32Int32)
     override_File_Usecase_with_cmd_line_arguments(GetParam()));
 }
 
+#if 0
+// FIXME:  We should use these tests, gtest-1.11.0 makes it a runtime error
+//         to define and not instantiate these.
 TEST_P(Tests_MGInducedSubgraph_Rmat, CheckInt32Int32)
 {
   run_current_test<int32_t, int32_t, float, false>(
@@ -279,6 +284,7 @@ TEST_P(Tests_MGInducedSubgraph_Rmat, CheckInt64Int64)
   run_current_test<int64_t, int64_t, float, false>(
     override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
 }
+#endif
 
 INSTANTIATE_TEST_SUITE_P(
   karate_test,
diff --git a/cpp/tests/structure/mg_select_random_vertices_test.cpp b/cpp/tests/structure/mg_select_random_vertices_test.cpp
new file mode 100644
index 00000000000..79c50301922
--- /dev/null
+++ b/cpp/tests/structure/mg_select_random_vertices_test.cpp
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include <cugraph/utilities/high_res_timer.hpp>
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <chrono>
+#include <random>
+
+#include <gtest/gtest.h>
+
+struct SelectRandomVertices_Usecase {
+  size_t select_count{std::numeric_limits<size_t>::max()};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_MGSelectRandomVertices
+  : public ::testing::TestWithParam<std::tuple<SelectRandomVertices_Usecase, input_usecase_t>> {
+ public:
+  Tests_MGSelectRandomVertices() {}
+
+  static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); }
+  static void TearDownTestCase() { handle_.reset(); }
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(std::tuple<SelectRandomVertices_Usecase, input_usecase_t> const& param)
+  {
+    auto [select_random_vertices_usecase, input_usecase] = param;
+
+    auto const comm_rank = handle_->get_comms().get_rank();
+    auto const comm_size = handle_->get_comms().get_size();
+
+    HighResTimer hr_timer{};
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.start("MG Construct graph");
+    }
+
+    auto [mg_graph, mg_edge_weights, mg_renumber_map] =
+      cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
+        *handle_, input_usecase, false, true);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+
+    auto mg_graph_view = mg_graph.view();
+    auto mg_edge_weight_view =
+      mg_edge_weights ? std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    raft::random::RngState rng_state(comm_rank);
+
+    //
+    // Test sampling from a distributed set
+    //
+
+    std::vector<bool> with_replacement_flags = {true, false};
+    {
+      // Generate distributed vertex set to sample from
+      std::srand((unsigned)std::chrono::duration_cast<std::chrono::milliseconds>(
+                   std::chrono::system_clock::now().time_since_epoch())
+                   .count());
+
+      std::vector<vertex_t> h_given_set(mg_graph_view.local_vertex_partition_range_size());
+
+      std::iota(
+        h_given_set.begin(), h_given_set.end(), mg_graph_view.local_vertex_partition_range_first());
+      std::shuffle(h_given_set.begin(), h_given_set.end(), std::mt19937{std::random_device{}()});
+      h_given_set.resize(std::rand() % mg_graph_view.local_vertex_partition_range_size() + 1);
+
+      // Compute size of the distributed vertex set
+      int num_of_elements_in_given_set = static_cast<int>(h_given_set.size());
+      num_of_elements_in_given_set     = cugraph::host_scalar_allreduce(handle_->get_comms(),
+                                                                    num_of_elements_in_given_set,
+                                                                    raft::comms::op_t::SUM,
+                                                                    handle_->get_stream());
+      // Move the distributed vertex set to GPUs
+      auto d_given_set = cugraph::test::to_device(*handle_, h_given_set);
+
+      // Sampling size should not exceed the size of distributed vertex set
+      size_t select_count =
+        num_of_elements_in_given_set > select_random_vertices_usecase.select_count
+          ? select_random_vertices_usecase.select_count
+          : std::rand() % num_of_elements_in_given_set + 1;
+
+      for (int idx = 0; idx < with_replacement_flags.size(); idx++) {
+        bool with_replacement = with_replacement_flags[idx];
+        auto d_sampled_vertices =
+          cugraph::select_random_vertices(*handle_,
+                                          mg_graph_view,
+                                          std::make_optional(raft::device_span<vertex_t const>{
+                                            d_given_set.data(), d_given_set.size()}),
+                                          rng_state,
+                                          select_count,
+                                          with_replacement,
+                                          true);
+
+        RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+        auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices);
+
+        if (select_random_vertices_usecase.check_correctness) {
+          if (!with_replacement) {
+            std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end());
+
+            auto nr_duplicates =
+              std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()),
+                            h_sampled_vertices.end());
+
+            ASSERT_EQ(nr_duplicates, 0);
+          }
+
+          std::sort(h_given_set.begin(), h_given_set.end());
+          std::for_each(
+            h_sampled_vertices.begin(), h_sampled_vertices.end(), [&h_given_set](vertex_t v) {
+              ASSERT_TRUE(std::binary_search(h_given_set.begin(), h_given_set.end(), v));
+            });
+        }
+      }
+    }
+
+    //
+    // Test sampling from [0, V)
+    //
+
+    for (int idx = 0; idx < with_replacement_flags.size(); idx++) {
+      bool with_replacement   = false;
+      auto d_sampled_vertices = cugraph::select_random_vertices(
+        *handle_,
+        mg_graph_view,
+        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+        rng_state,
+        select_random_vertices_usecase.select_count,
+        with_replacement,
+        true);
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+      auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices);
+
+      if (select_random_vertices_usecase.check_correctness) {
+        if (!with_replacement) {
+          std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end());
+
+          auto nr_duplicates =
+            std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()),
+                          h_sampled_vertices.end());
+
+          ASSERT_EQ(nr_duplicates, 0);
+        }
+
+        auto vertex_first = mg_graph_view.local_vertex_partition_range_first();
+        auto vertex_last  = mg_graph_view.local_vertex_partition_range_last();
+
+        std::for_each(h_sampled_vertices.begin(),
+                      h_sampled_vertices.end(),
+                      [vertex_first, vertex_last](vertex_t v) {
+                        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last));
+                      });
+      }
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGSelectRandomVertices<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGSelectRandomVertices_File = Tests_MGSelectRandomVertices<cugraph::test::File_Usecase>;
+using Tests_MGSelectRandomVertices_Rmat = Tests_MGSelectRandomVertices<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGSelectRandomVertices_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGSelectRandomVertices_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGSelectRandomVertices_Rmat, CheckInt32Int64FloatFloat)
+{
+  run_current_test<int32_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGSelectRandomVertices_Rmat, CheckInt64Int64FloatFloat)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_test_pass,
+  Tests_MGSelectRandomVertices_File,
+  ::testing::Combine(::testing::Values(SelectRandomVertices_Usecase{20, false},
+                                       SelectRandomVertices_Usecase{20, false}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_MGSelectRandomVertices_Rmat,
+  ::testing::Combine(
+    ::testing::Values(SelectRandomVertices_Usecase{50, false},
+                      SelectRandomVertices_Usecase{50, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(6, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGSelectRandomVertices_Rmat,
+  ::testing::Combine(
+    ::testing::Values(SelectRandomVertices_Usecase{500, false},
+                      SelectRandomVertices_Usecase{500, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/structure/streams.cu b/cpp/tests/structure/streams.cu
index 3891f2b9b7a..44f8dab3b67 100644
--- a/cpp/tests/structure/streams.cu
+++ b/cpp/tests/structure/streams.cu
@@ -23,8 +23,7 @@
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
-struct StreamTest : public ::testing::Test {
-};
+struct StreamTest : public ::testing::Test {};
 
 TEST_F(StreamTest, basic_test)
 {
diff --git a/cpp/tests/traversal/extract_bfs_paths_test.cu b/cpp/tests/traversal/extract_bfs_paths_test.cu
index 46389bffd53..793ee309b7a 100644
--- a/cpp/tests/traversal/extract_bfs_paths_test.cu
+++ b/cpp/tests/traversal/extract_bfs_paths_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
  * See the License for the specific language governin_from_mtxg permissions and
  * limitations under the License.
  */
-#include "randomly_select_destinations.cuh"
 
 #include <utilities/base_fixture.hpp>
 #include <utilities/test_graphs.hpp>
@@ -115,15 +114,32 @@ class Tests_ExtractBfsPaths
     auto h_distances    = cugraph::test::to_host(handle, d_distances);
     auto h_predecessors = cugraph::test::to_host(handle, d_predecessors);
 
-    auto d_destinations = cugraph::test::randomly_select_destinations<false>(
+    rmm::device_uvector<vertex_t> d_vertices(graph_view.number_of_vertices(), handle.get_stream());
+    {
+      constexpr vertex_t invalid_vertex = cugraph::invalid_vertex_id<vertex_t>::value;
+      auto local_vertex_first           = vertex_t{0};
+      cugraph::detail::sequence_fill(
+        handle.get_stream(), d_vertices.begin(), d_vertices.size(), local_vertex_first);
+      auto end_iter = thrust::remove_if(
+        handle.get_thrust_policy(),
+        d_vertices.begin(),
+        d_vertices.end(),
+        [invalid_vertex, predecessors = d_predecessors.data(), local_vertex_first] __device__(
+          auto v) { return predecessors[v - local_vertex_first] == invalid_vertex; });
+      d_vertices.resize(thrust::distance(d_vertices.begin(), end_iter), handle.get_stream());
+    }
+
+    raft::random::RngState rng_state(0);
+    auto d_destinations = cugraph::select_random_vertices(
       handle,
-      graph_view.number_of_vertices(),
-      vertex_t{0},
-      d_predecessors,
-      extract_bfs_paths_usecase.num_paths_to_check);
+      graph_view,
+      std::make_optional(raft::device_span<vertex_t const>{d_vertices.data(), d_vertices.size()}),
+      rng_state,
+      std::min(extract_bfs_paths_usecase.num_paths_to_check, d_vertices.size()),
+      false,
+      false);
 
     rmm::device_uvector<vertex_t> d_paths(0, handle.get_stream());
-
     if (cugraph::test::g_perf) {
       RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
       hr_timer.start("Extract BFS paths");
@@ -212,13 +228,7 @@ INSTANTIATE_TEST_SUITE_P(
     std::make_tuple(ExtractBfsPaths_Usecase{0, 100},
                     cugraph::test::File_Usecase("test/datasets/polbooks.mtx")),
     std::make_tuple(ExtractBfsPaths_Usecase{0, 100},
-                    cugraph::test::File_Usecase("test/datasets/netscience.mtx")),
-    std::make_tuple(ExtractBfsPaths_Usecase{100, 100},
-                    cugraph::test::File_Usecase("test/datasets/netscience.mtx")),
-    std::make_tuple(ExtractBfsPaths_Usecase{1000, 2000},
-                    cugraph::test::File_Usecase("test/datasets/wiki2003.mtx")),
-    std::make_tuple(ExtractBfsPaths_Usecase{1000, 20000},
-                    cugraph::test::File_Usecase("test/datasets/wiki-Talk.mtx"))));
+                    cugraph::test::File_Usecase("test/datasets/netscience.mtx"))));
 
 INSTANTIATE_TEST_SUITE_P(
   rmat_small_test,
diff --git a/cpp/tests/traversal/mg_extract_bfs_paths_test.cu b/cpp/tests/traversal/mg_extract_bfs_paths_test.cu
index 2ba01866929..fc77c11ca3e 100644
--- a/cpp/tests/traversal/mg_extract_bfs_paths_test.cu
+++ b/cpp/tests/traversal/mg_extract_bfs_paths_test.cu
@@ -13,7 +13,6 @@
  * See the License for the specific language governin_from_mtxg permissions and
  * limitations under the License.
  */
-#include "randomly_select_destinations.cuh"
 
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
@@ -125,14 +124,38 @@ class Tests_MGExtractBFSPaths
     auto h_mg_distances    = cugraph::test::to_host(*handle_, d_mg_distances);
     auto h_mg_predecessors = cugraph::test::to_host(*handle_, d_mg_predecessors);
 
-    vertex_t invalid_vertex = cugraph::invalid_vertex_id<vertex_t>::value;
+    rmm::device_uvector<vertex_t> d_vertices(mg_graph_view.local_vertex_partition_range_size(),
+                                             handle_->get_stream());
+    {
+      constexpr vertex_t invalid_vertex = cugraph::invalid_vertex_id<vertex_t>::value;
+      auto local_vertex_first           = mg_graph_view.local_vertex_partition_range_first();
+      cugraph::detail::sequence_fill(
+        handle_->get_stream(), d_vertices.begin(), d_vertices.size(), local_vertex_first);
+      auto end_iter = thrust::remove_if(
+        handle_->get_thrust_policy(),
+        d_vertices.begin(),
+        d_vertices.end(),
+        [invalid_vertex, predecessors = d_mg_predecessors.data(), local_vertex_first] __device__(
+          auto v) { return predecessors[v - local_vertex_first] == invalid_vertex; });
+      d_vertices.resize(thrust::distance(d_vertices.begin(), end_iter), handle_->get_stream());
+    }
+
+    // Compute size of the distributed vertex set
+    auto num_of_paths_in_given_set = d_vertices.size();
+    num_of_paths_in_given_set      = cugraph::host_scalar_allreduce(handle_->get_comms(),
+                                                               num_of_paths_in_given_set,
+                                                               raft::comms::op_t::SUM,
+                                                               handle_->get_stream());
 
-    auto d_mg_destinations = cugraph::test::randomly_select_destinations<false>(
+    raft::random::RngState rng_state(0);
+    auto d_mg_destinations = cugraph::select_random_vertices(
       *handle_,
-      mg_graph_view.local_vertex_partition_range_size(),
-      mg_graph_view.local_vertex_partition_range_first(),
-      d_mg_predecessors,
-      extract_bfs_paths_usecase.num_paths_to_check);
+      mg_graph_view,
+      std::make_optional(raft::device_span<vertex_t const>{d_vertices.data(), d_vertices.size()}),
+      rng_state,
+      std::min(num_of_paths_in_given_set, extract_bfs_paths_usecase.num_paths_to_check),
+      false,
+      false);
 
     rmm::device_uvector<vertex_t> d_mg_paths(0, handle_->get_stream());
 
@@ -296,11 +319,7 @@ INSTANTIATE_TEST_SUITE_P(
     std::make_tuple(ExtractBFSPaths_Usecase{0, 100},
                     cugraph::test::File_Usecase("test/datasets/netscience.mtx")),
     std::make_tuple(ExtractBFSPaths_Usecase{100, 100},
-                    cugraph::test::File_Usecase("test/datasets/netscience.mtx")),
-    std::make_tuple(ExtractBFSPaths_Usecase{1000, 2000},
-                    cugraph::test::File_Usecase("test/datasets/wiki2003.mtx")),
-    std::make_tuple(ExtractBFSPaths_Usecase{1000, 20000},
-                    cugraph::test::File_Usecase("test/datasets/wiki-Talk.mtx"))));
+                    cugraph::test::File_Usecase("test/datasets/netscience.mtx"))));
 
 INSTANTIATE_TEST_SUITE_P(
   rmat_small_test,
@@ -308,7 +327,7 @@ INSTANTIATE_TEST_SUITE_P(
   ::testing::Values(
     // enable correctness checks
     std::make_tuple(ExtractBFSPaths_Usecase{0, 20},
-                    cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+                    cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, true))));
 
 INSTANTIATE_TEST_SUITE_P(
   rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
@@ -320,6 +339,6 @@ INSTANTIATE_TEST_SUITE_P(
   ::testing::Values(
     // disable correctness checks for large graphs
     std::make_pair(ExtractBFSPaths_Usecase{0, 1000, false},
-                   cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
+                   cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, true))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/traversal/mg_sssp_test.cpp b/cpp/tests/traversal/mg_sssp_test.cpp
index ea0353c3743..b3e96981f96 100644
--- a/cpp/tests/traversal/mg_sssp_test.cpp
+++ b/cpp/tests/traversal/mg_sssp_test.cpp
@@ -214,7 +214,7 @@ class Tests_MGSSSP : public ::testing::TestWithParam<std::tuple<SSSP_Usecase, in
         auto h_sg_predecessors = cugraph::test::to_host(*handle_, d_sg_predecessors);
 
         auto max_weight_element = std::max_element(h_sg_weights.begin(), h_sg_weights.end());
-        auto epsilon            = *max_weight_element * weight_t{1e-6};
+        auto epsilon            = *max_weight_element* weight_t{1e-6};
         auto nearly_equal       = [epsilon](auto lhs, auto rhs) {
           return std::fabs(lhs - rhs) < epsilon;
         };
diff --git a/cpp/tests/traversal/randomly_select_destinations.cuh b/cpp/tests/traversal/randomly_select_destinations.cuh
deleted file mode 100644
index c88f162e91f..00000000000
--- a/cpp/tests/traversal/randomly_select_destinations.cuh
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governin_from_mtxg permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <utilities/thrust_wrapper.hpp>
-
-#include <cugraph/detail/utility_wrappers.hpp>
-#include <cugraph/graph.hpp>
-
-#include <raft/core/handle.hpp>
-
-#include <rmm/device_uvector.hpp>
-
-#include <thrust/distance.h>
-#include <thrust/remove.h>
-
-namespace cugraph {
-namespace test {
-
-template <bool multi_gpu, typename vertex_t>
-rmm::device_uvector<vertex_t> randomly_select_destinations(
-  raft::handle_t const& handle,
-  vertex_t number_of_vertices,
-  vertex_t local_vertex_first,
-  rmm::device_uvector<vertex_t> const& d_predecessors,
-  size_t num_paths_to_check)
-{
-  constexpr vertex_t invalid_vertex = cugraph::invalid_vertex_id<vertex_t>::value;
-
-  rmm::device_uvector<vertex_t> d_vertices(number_of_vertices, handle.get_stream());
-  cugraph::detail::sequence_fill(
-    handle.get_stream(), d_vertices.begin(), d_vertices.size(), local_vertex_first);
-
-  auto end_iter = thrust::remove_if(
-    handle.get_thrust_policy(),
-    d_vertices.begin(),
-    d_vertices.end(),
-    [invalid_vertex, predecessors = d_predecessors.data(), local_vertex_first] __device__(auto v) {
-      return predecessors[v - local_vertex_first] == invalid_vertex;
-    });
-
-  d_vertices.resize(thrust::distance(d_vertices.begin(), end_iter), handle.get_stream());
-
-  return cugraph::test::randomly_select(handle, std::move(d_vertices), num_paths_to_check);
-}
-
-}  // namespace test
-}  // namespace cugraph
diff --git a/cpp/tests/traversal/sssp_test.cpp b/cpp/tests/traversal/sssp_test.cpp
index c92e2c06065..1e77efa11de 100644
--- a/cpp/tests/traversal/sssp_test.cpp
+++ b/cpp/tests/traversal/sssp_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -222,7 +222,7 @@ class Tests_SSSP : public ::testing::TestWithParam<std::tuple<SSSP_Usecase, inpu
       }
 
       auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end());
-      auto epsilon            = *max_weight_element * weight_t{1e-6};
+      auto epsilon            = *max_weight_element* weight_t{1e-6};
       auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; };
 
       ASSERT_TRUE(std::equal(h_reference_distances.begin(),
diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp
index c854e8eee3e..16c9d3ed145 100644
--- a/cpp/tests/utilities/test_graphs.hpp
+++ b/cpp/tests/utilities/test_graphs.hpp
@@ -269,6 +269,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase {
                                                   b_,
                                                   c_,
                                                   undirected_ ? true : false);
+      if (scramble_vertex_ids_) {
+        std::tie(tmp_src_v, tmp_dst_v) =
+          cugraph::scramble_vertex_ids(handle, std::move(tmp_src_v), std::move(tmp_dst_v), scale_);
+      }
 
       std::optional<rmm::device_uvector<weight_t>> tmp_weights_v{std::nullopt};
       if (weight_partitions) {
@@ -347,6 +351,9 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase {
                                      partition_vertex_firsts[i]);
       v_offset += partition_vertex_lasts[i] - partition_vertex_firsts[i];
     }
+    if (scramble_vertex_ids_) {
+      vertex_v = cugraph::scramble_vertex_ids(handle, std::move(vertex_v), scale_);
+    }
 
     translate(handle, vertex_v);
 
diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp
index a72af1976fd..615522a863b 100644
--- a/cpp/tests/utilities/test_utilities.hpp
+++ b/cpp/tests/utilities/test_utilities.hpp
@@ -517,7 +517,7 @@ mg_vertex_property_values_to_sg_vertex_property_values(
   std::optional<raft::device_span<vertex_t const>>
     sg_renumber_map,  // std::nullopt if the SG graph is not renumbered
   std::optional<raft::device_span<vertex_t const>>
-    mg_vertices,  // std::nullopt if the entire local vertex partition range is assumed
+    mg_vertices,      // std::nullopt if the entire local vertex partition range is assumed
   raft::device_span<value_t const> mg_values);
 
 template <typename type_t>
diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu
index 090afdb9519..cb7e6f1bd66 100644
--- a/cpp/tests/utilities/thrust_wrapper.cu
+++ b/cpp/tests/utilities/thrust_wrapper.cu
@@ -206,36 +206,6 @@ template void populate_vertex_ids(raft::handle_t const& handle,
                                   rmm::device_uvector<int64_t>& d_vertices_v,
                                   int64_t vertex_id_offset);
 
-template <typename T>
-rmm::device_uvector<T> randomly_select(raft::handle_t const& handle,
-                                       rmm::device_uvector<T> const& input,
-                                       size_t count,
-                                       bool sort_results)
-{
-  thrust::default_random_engine random_engine;
-
-  rmm::device_uvector<T> tmp(input.size(), handle.get_stream());
-
-  thrust::copy(handle.get_thrust_policy(), input.begin(), input.end(), tmp.begin());
-  thrust::shuffle(handle.get_thrust_policy(), tmp.begin(), tmp.end(), random_engine);
-
-  tmp.resize(std::min(count, tmp.size()), handle.get_stream());
-  tmp.shrink_to_fit(handle.get_stream());
-
-  if (sort_results) thrust::sort(handle.get_thrust_policy(), tmp.begin(), tmp.end());
-
-  return tmp;
-}
-
-template rmm::device_uvector<int32_t> randomly_select(raft::handle_t const& handle,
-                                                      rmm::device_uvector<int32_t> const& input,
-                                                      size_t count,
-                                                      bool sort_results);
-template rmm::device_uvector<int64_t> randomly_select(raft::handle_t const& handle,
-                                                      rmm::device_uvector<int64_t> const& input,
-                                                      size_t count,
-                                                      bool sort_results);
-
 template <typename vertex_t, typename weight_t>
 void remove_self_loops(raft::handle_t const& handle,
                        rmm::device_uvector<vertex_t>& d_src_v /* [INOUT] */,
diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp
index d4d7b164553..eead4dc268f 100644
--- a/cpp/tests/utilities/thrust_wrapper.hpp
+++ b/cpp/tests/utilities/thrust_wrapper.hpp
@@ -46,12 +46,6 @@ void populate_vertex_ids(raft::handle_t const& handle,
                          rmm::device_uvector<vertex_t>& d_vertices_v /* [INOUT] */,
                          vertex_t vertex_id_offset);
 
-template <typename T>
-rmm::device_uvector<T> randomly_select(raft::handle_t const& handle,
-                                       rmm::device_uvector<T> const& input,
-                                       size_t count,
-                                       bool sort_results = false);
-
 template <typename vertex_t, typename weight_t>
 void remove_self_loops(raft::handle_t const& handle,
                        rmm::device_uvector<vertex_t>& d_src_v /* [INOUT] */,
diff --git a/dependencies.yaml b/dependencies.yaml
index 48a82955ec8..53da972c87e 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -15,6 +15,7 @@ files:
       - python_run_cugraph
       - python_run_pylibcugraph
       - python_run_cugraph_dgl
+      - python_run_cugraph_pyg
       - test_notebook
       - test_python_common
       - test_python_cugraph
@@ -122,6 +123,13 @@ files:
       table: build-system
     includes:
       - python_build_wheel
+  py_run_cugraph_pyg:
+    output: pyproject
+    pyproject_dir: python/cugraph-pyg
+    extras:
+      table: project
+    includes:
+      - python_run_cugraph_pyg
   py_build_cugraph_service_client:
     output: pyproject
     pyproject_dir: python/cugraph-service/client
@@ -206,12 +214,12 @@ dependencies:
         packages:
           - c-compiler
           - cxx-compiler
-          - gmock=1.10.0
-          - gtest=1.10.0
-          - libcugraphops=23.04.*
-          - libraft-headers=23.04.*
-          - libraft=23.04.*
-          - librmm=23.04.*
+          - gmock>=1.13.0
+          - gtest>=1.13.0
+          - libcugraphops=23.6.*
+          - libraft-headers=23.6.*
+          - libraft=23.6.*
+          - librmm=23.6.*
           - openmpi # Required for building cpp-mgtests (multi-GPU tests)
     specific:
       - output_types: [conda]
@@ -251,14 +259,11 @@ dependencies:
           - sphinx-markdown-tables
           - sphinx<6
           - sphinxcontrib-websupport
+          - pylibcugraphops=23.6.*
   py_version:
     specific:
       - output_types: [conda]
         matrices:
-          - matrix:
-              py: "3.8"
-            packages:
-              - python=3.8
           - matrix:
               py: "3.9"
             packages:
@@ -269,7 +274,7 @@ dependencies:
               - python=3.10
           - matrix:
             packages:
-              - python>=3.8,<3.11
+              - python>=3.9,<3.11
   python_build_wheel:
     common:
       - output_types: [conda, pyproject]
@@ -281,39 +286,38 @@ dependencies:
       - output_types: [conda, pyproject]
         packages:
           - cython>=0.29,<0.30
-          - &pylibraft pylibraft==23.4.*
-          - &rmm rmm==23.4.*
-          - scikit-build>=0.13.1
+          - &pylibraft pylibraft==23.6.*
+          - &rmm rmm==23.6.*
+          - scikit-build>=0.13.1,<0.17.2
   python_build_cugraph:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - pylibcugraph==23.4.*
+          - pylibcugraph==23.6.*
   python_run_cugraph:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - cuda-python>=11.7.1,<12.0
-          - &cudf cudf==23.4.*
+          - &cudf cudf==23.6.*
           - &dask dask==2023.3.2
           - &distributed distributed==2023.3.2.1
-          - &dask_cuda dask-cuda==23.4.*
-          - &dask_cudf dask-cudf==23.4.*
-          - &numba numba>=0.56.2
-          - raft-dask==23.4.*
+          - &dask_cuda dask-cuda==23.6.*
+          - &dask_cudf dask-cudf==23.6.*
+          - &numba numba>=0.57
+          - raft-dask==23.6.*
           - *rmm
-          - &ucx_py ucx-py=0.31.*
+          - &ucx_py ucx-py==0.32.*
       - output_types: conda
         packages:
-          - &cupy cupy>=9.5.0,<12.0.0a0
+          - &cupy cupy>=12.0.0
           - &dask-core dask-core==2023.3.2
-          - libcudf=23.04.*
+          - libcudf=23.6.*
           - nccl>=2.9.9
           - ucx-proc=*=gpu
       - output_types: pyproject
         packages:
-          - &cupy_pip cupy-cuda11x>=9.5.0,<12.0.0a0
-          - pylibcugraph==23.4.*
+          - &cupy_pip cupy-cuda11x>=12.0.0
+          - pylibcugraph==23.6.*
   python_run_pylibcugraph:
     common:
       - output_types: [conda, pyproject]
@@ -328,7 +332,16 @@ dependencies:
           - &numpy numpy>=1.21
       - output_types: [pyproject]
         packages:
-          - &cugraph cugraph==23.4.*
+          - &cugraph cugraph==23.6.*
+  python_run_cugraph_pyg:
+    common:
+      - output_types: [conda, pyproject]
+        packages:
+          - *numba
+          - *numpy
+      - output_types: [pyproject]
+        packages:
+          - *cugraph
   python_run_cugraph_service_client:
     common:
       - output_types: [conda, pyproject]
@@ -343,6 +356,7 @@ dependencies:
           - *dask_cuda
           - *dask_cudf
           - *distributed
+          - *numba
           - *numpy
           - *rmm
           - *thrift
@@ -355,7 +369,7 @@ dependencies:
         packages:
           - *cupy_pip
           - *cugraph
-          - cugraph-service-client==23.4.*
+          - cugraph-service-client==23.6.*
   doc:
     common:
       - output_types: [conda]
@@ -369,6 +383,7 @@ dependencies:
           - sphinxcontrib-websupport
           - sphinx-markdown-tables
           - sphinx-copybutton
+          - pylibcugraphops=23.6.*
   test_notebook:
     common:
       - output_types: [conda, requirements]
diff --git a/docs/cugraph/source/api_docs/cugraph-ops/bipartite_operators.rst b/docs/cugraph/source/api_docs/cugraph-ops/bipartite_operators.rst
new file mode 100644
index 00000000000..e172309fae2
--- /dev/null
+++ b/docs/cugraph/source/api_docs/cugraph-ops/bipartite_operators.rst
@@ -0,0 +1,16 @@
+=============================
+Operators on Bipartite Graphs
+=============================
+
+.. currentmodule:: pylibcugraphops
+
+Update Edges: Concatenation or Sum of Edge and Node Features
+------------------------------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.update_efeat_bipartite_e2e_concat_fwd
+   operators.update_efeat_bipartite_e2e_concat_bwd
+
+   operators.update_efeat_bipartite_e2e_sum_fwd
+   operators.update_efeat_bipartite_e2e_sum_bwd
diff --git a/docs/cugraph/source/api_docs/cugraph-ops/dimenet.rst b/docs/cugraph/source/api_docs/cugraph-ops/dimenet.rst
new file mode 100644
index 00000000000..b709464c7e6
--- /dev/null
+++ b/docs/cugraph/source/api_docs/cugraph-ops/dimenet.rst
@@ -0,0 +1,24 @@
+=================
+Dimenet operators
+=================
+
+.. currentmodule:: pylibcugraphops
+
+Radial Basis Functions
+----------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   dimenet.radial_basis_fwd
+   dimenet.radial_basis_bwd
+   dimenet.radial_basis_bwd_bwd
+
+Edge-to-Edge Aggregation
+-------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   dimenet.agg_edge_to_edge_fwd
+   dimenet.agg_edge_to_edge_bwd
+   dimenet.agg_edge_to_edge_bwd2_grad
+   dimenet.agg_edge_to_edge_bwd2_main
diff --git a/docs/cugraph/source/api_docs/cugraph-ops/fg_operators.rst b/docs/cugraph/source/api_docs/cugraph-ops/fg_operators.rst
new file mode 100644
index 00000000000..387844f684a
--- /dev/null
+++ b/docs/cugraph/source/api_docs/cugraph-ops/fg_operators.rst
@@ -0,0 +1,83 @@
+========================
+Operators on Full Graphs
+========================
+
+.. currentmodule:: pylibcugraphops
+
+Simple Neighborhood Aggregator (SAGEConv)
+-----------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.agg_simple_fg_n2n_fwd
+   operators.agg_simple_fg_n2n_bwd
+   operators.agg_simple_fg_e2n_fwd
+   operators.agg_simple_fg_e2n_bwd
+   operators.agg_simple_fg_n2n_e2n_fwd
+   operators.agg_simple_fg_n2n_e2n_bwd
+
+   operators.agg_concat_fg_n2n_fwd
+   operators.agg_concat_fg_n2n_bwd
+   operators.agg_concat_fg_e2n_fwd
+   operators.agg_concat_fg_e2n_bwd
+   operators.agg_concat_fg_n2n_e2n_fwd
+   operators.agg_concat_fg_n2n_e2n_bwd
+
+Weighted Neighborhood Aggregation
+---------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.agg_weighted_fg_n2n_fwd
+   operators.agg_weighted_fg_n2n_bwd
+   operators.agg_concat_weighted_fg_n2n_fwd
+   operators.agg_concat_weighted_fg_n2n_bwd
+
+Heterogenous Aggregator using Basis Decomposition (RGCNConv)
+------------------------------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.agg_hg_basis_fg_n2n_post_fwd
+   operators.agg_hg_basis_fg_n2n_post_bwd
+
+Graph Attention (GATConv/GATv2Conv)
+-----------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.mha_gat_fg_n2n_fwd
+   operators.mha_gat_fg_n2n_bwd
+   operators.mha_gat_fg_n2n_efeat_fwd
+   operators.mha_gat_fg_n2n_efeat_bwd
+
+   operators.mha_gat_v2_fg_n2n_fwd
+   operators.mha_gat_v2_fg_n2n_bwd
+   operators.mha_gat_v2_fg_n2n_efeat_fwd
+   operators.mha_gat_v2_fg_n2n_efeat_bwd
+
+Transformer-like Graph Attention (TransformerConv)
+--------------------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.mha_gat_v2_fg_n2n_fwd
+   operators.mha_gat_v2_fg_n2n_bwd
+   operators.mha_gat_v2_fg_n2n_efeat_fwd
+   operators.mha_gat_v2_fg_n2n_efeat_bwd
+
+Directional Message-Passing (DMPNN)
+-----------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.agg_dmpnn_fg_e2e_fwd
+   operators.agg_dmpnn_fg_e2e_bwd
+
+Graph Pooling
+-------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.pool_fg_n2s_fwd
+   operators.pool_fg_n2s_bwd
diff --git a/docs/cugraph/source/api_docs/cugraph-ops/graph_types.rst b/docs/cugraph/source/api_docs/cugraph-ops/graph_types.rst
new file mode 100644
index 00000000000..9289ce53e39
--- /dev/null
+++ b/docs/cugraph/source/api_docs/cugraph-ops/graph_types.rst
@@ -0,0 +1,33 @@
+===========
+Graph types
+===========
+
+.. currentmodule:: pylibcugraphops
+
+Message-Flow Graph (MFG)
+-------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   make_mfg_csr
+
+Heterogenous MFG
+----------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   make_mfg_csr_hg
+
+"Full" Graph (FG)
+-----------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   make_fg_csr
+
+Heterogenous FG
+---------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   make_fg_csr_hg
diff --git a/docs/cugraph/source/api_docs/cugraph-ops/index.rst b/docs/cugraph/source/api_docs/cugraph-ops/index.rst
new file mode 100644
index 00000000000..e2338dc1833
--- /dev/null
+++ b/docs/cugraph/source/api_docs/cugraph-ops/index.rst
@@ -0,0 +1,18 @@
+=========================
+cugraph-ops API reference
+=========================
+
+This page provides a list of all publicly accessible modules, methods and classes through `pylibcugraphops.*` namespace.
+
+.. toctree::
+    :maxdepth: 2
+    :caption: API Documentation
+
+    graph_types
+    pytorch
+    mfg_operators
+    bipartite_operators
+    static_operators
+    fg_operators
+    dimenet
+    pytorch
diff --git a/docs/cugraph/source/api_docs/cugraph-ops/mfg_operators.rst b/docs/cugraph/source/api_docs/cugraph-ops/mfg_operators.rst
new file mode 100644
index 00000000000..f3dd1faa245
--- /dev/null
+++ b/docs/cugraph/source/api_docs/cugraph-ops/mfg_operators.rst
@@ -0,0 +1,31 @@
+================================
+Operators on Message-Flow Graphs
+================================
+
+.. currentmodule:: pylibcugraphops
+
+Simple Neighborhood Aggregator (SAGEConv)
+-----------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.agg_simple_mfg_n2n_fwd
+   operators.agg_simple_mfg_n2n_bwd
+   operators.agg_concat_mfg_n2n_fwd
+   operators.agg_concat_mfg_n2n_bwd
+
+Graph Attention (GATConv)
+-------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.mha_gat_mfg_n2n_fwd
+   operators.mha_gat_mfg_n2n_bwd
+
+Heterogenous Aggregator using Basis Decomposition (RGCNConv)
+------------------------------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.agg_hg_basis_mfg_n2n_post_fwd
+   operators.agg_hg_basis_mfg_n2n_post_bwd
diff --git a/docs/cugraph/source/api_docs/cugraph-ops/pytorch.rst b/docs/cugraph/source/api_docs/cugraph-ops/pytorch.rst
new file mode 100644
index 00000000000..83800fbc546
--- /dev/null
+++ b/docs/cugraph/source/api_docs/cugraph-ops/pytorch.rst
@@ -0,0 +1,36 @@
+==========================
+PyTorch Autograd Wrappers
+==========================
+
+.. currentmodule:: pylibcugraphops
+
+Simple Neighborhood Aggregator (SAGEConv)
+-----------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   pytorch.operators.agg_concat_n2n
+
+Graph Attention (GATConv/GATv2Conv)
+-----------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   pytorch.operators.mha_gat_n2n
+   pytorch.operators.mha_gat_v2_n2n
+
+Heterogenous Aggregator using Basis Decomposition (RGCNConv)
+------------------------------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   pytorch.operators.agg_hg_basis_n2n_post
+
+
+Update Edges: Concatenation or Sum of Edge and Node Features
+------------------------------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   pytorch.operators.update_efeat_bipartite_e2e
+   pytorch.operators.update_efeat_static_e2e
diff --git a/docs/cugraph/source/api_docs/cugraph-ops/static_operators.rst b/docs/cugraph/source/api_docs/cugraph-ops/static_operators.rst
new file mode 100644
index 00000000000..f3ecc068f22
--- /dev/null
+++ b/docs/cugraph/source/api_docs/cugraph-ops/static_operators.rst
@@ -0,0 +1,16 @@
+==========================
+Operators on Static Graphs
+==========================
+
+.. currentmodule:: pylibcugraphops
+
+Update Edges: Concatenation or Sum of Edge and Node Features
+------------------------------------------------------------
+.. autosummary::
+   :toctree: ../api/ops/
+
+   operators.update_efeat_static_e2e_concat_fwd
+   operators.update_efeat_static_e2e_concat_bwd
+
+   operators.update_efeat_static_e2e_sum_fwd
+   operators.update_efeat_static_e2e_sum_bwd
diff --git a/docs/cugraph/source/api_docs/cugraph/centrality.rst b/docs/cugraph/source/api_docs/cugraph/centrality.rst
index c3b026597d9..344c95195b7 100644
--- a/docs/cugraph/source/api_docs/cugraph/centrality.rst
+++ b/docs/cugraph/source/api_docs/cugraph/centrality.rst
@@ -7,42 +7,64 @@ Centrality
 
 Betweenness Centrality
 ----------------------
+single-GPU
+^^^^^^^^^^
 .. autosummary::
    :toctree: ../api/cugraph/
 
-   cugraph.betweenness_centrality
-   cugraph.edge_betweenness_centrality
+   cugraph.centrality.betweenness_centrality
+   cugraph.centrality.edge_betweenness_centrality
+
+multi-GPU
+^^^^^^^^^^
+.. autosummary::
+   :toctree: ../api/cugraph/
+
+   cugraph.dask.centrality.betweenness_centrality
+
+
 
 Katz Centrality
 ---------------
+single-GPU
+^^^^^^^^^^
 .. autosummary::
    :toctree: ../api/cugraph/
 
-   cugraph.katz_centrality
+   cugraph.centrality.katz_centrality
 
-Katz Centrality (MG)
---------------------
+multi-GPU
+^^^^^^^^^^
 .. autosummary::
    :toctree: ../api/cugraph/
 
    cugraph.dask.centrality.katz_centrality.katz_centrality
 
+
 Degree Centrality
 -----------------
+single-GPU
+^^^^^^^^^^
 .. autosummary::
    :toctree: ../api/cugraph/
 
-   cugraph.degree_centrality
+   cugraph.centrality.degree_centrality
+
+multi-GPU
+^^^^^^^^^^
+
 
 Eigenvector Centrality
 ----------------------
+single-GPU
+^^^^^^^^^^
 .. autosummary::
    :toctree: ../api/cugraph/
 
    cugraph.centrality.eigenvector_centrality
 
-Eigenvector Centrality (MG)
----------------------------
+multi-GPU
+^^^^^^^^^^
 .. autosummary::
    :toctree: ../api/cugraph/
 
diff --git a/docs/cugraph/source/api_docs/cugraph/components.rst b/docs/cugraph/source/api_docs/cugraph/components.rst
index 5835972cd4d..560aa1f8ca0 100644
--- a/docs/cugraph/source/api_docs/cugraph/components.rst
+++ b/docs/cugraph/source/api_docs/cugraph/components.rst
@@ -4,7 +4,6 @@ Components
 .. currentmodule:: cugraph
 
 
-
 Connected Components
 --------------------
 .. autosummary::
diff --git a/docs/cugraph/source/api_docs/cugraph/generator.rst b/docs/cugraph/source/api_docs/cugraph/generators.rst
similarity index 77%
rename from docs/cugraph/source/api_docs/cugraph/generator.rst
rename to docs/cugraph/source/api_docs/cugraph/generators.rst
index 9b4ebbcf7a4..4f93d943e6c 100644
--- a/docs/cugraph/source/api_docs/cugraph/generator.rst
+++ b/docs/cugraph/source/api_docs/cugraph/generators.rst
@@ -1,6 +1,6 @@
-=========
-Generator
-=========
+==========
+Generators
+==========
 .. currentmodule:: cugraph
 
 
diff --git a/docs/cugraph/source/api_docs/cugraph/helper_functions.rst b/docs/cugraph/source/api_docs/cugraph/helper_functions.rst
index ec3248bfa27..02cb599ae55 100644
--- a/docs/cugraph/source/api_docs/cugraph/helper_functions.rst
+++ b/docs/cugraph/source/api_docs/cugraph/helper_functions.rst
@@ -4,7 +4,6 @@ DASK MG Helper functions
 .. currentmodule:: cugraph
 
 
-
 Methods
 -------
 .. autosummary::
diff --git a/docs/cugraph/source/api_docs/cugraph/cugraph_top.rst b/docs/cugraph/source/api_docs/cugraph/index.rst
similarity index 96%
rename from docs/cugraph/source/api_docs/cugraph/cugraph_top.rst
rename to docs/cugraph/source/api_docs/cugraph/index.rst
index 1b920a84159..20b63d50ae6 100644
--- a/docs/cugraph/source/api_docs/cugraph/cugraph_top.rst
+++ b/docs/cugraph/source/api_docs/cugraph/index.rst
@@ -20,6 +20,6 @@ cugraph API Reference
     sampling
     traversal
     tree
-    generator
+    generators
     helper_functions
     dask-cugraph.rst
diff --git a/docs/cugraph/source/api_docs/cugraph/layout.rst b/docs/cugraph/source/api_docs/cugraph/layout.rst
index 1c097346b6c..ed97caf549f 100644
--- a/docs/cugraph/source/api_docs/cugraph/layout.rst
+++ b/docs/cugraph/source/api_docs/cugraph/layout.rst
@@ -4,7 +4,6 @@ Layout
 .. currentmodule:: cugraph
 
 
-
 Force Atlas 2
 -------------
 .. autosummary::
diff --git a/docs/cugraph/source/api_docs/cugraph/linear_assignment.rst b/docs/cugraph/source/api_docs/cugraph/linear_assignment.rst
index dfdf6da96db..e0b0b4d11bd 100644
--- a/docs/cugraph/source/api_docs/cugraph/linear_assignment.rst
+++ b/docs/cugraph/source/api_docs/cugraph/linear_assignment.rst
@@ -4,7 +4,6 @@ Linear Assignment
 .. currentmodule:: cugraph
 
 
-
 Hungarian
 ---------
 .. autosummary::
diff --git a/docs/cugraph/source/api_docs/cugraph/link_analysis.rst b/docs/cugraph/source/api_docs/cugraph/link_analysis.rst
index 5f977b47724..48b5ec1176f 100644
--- a/docs/cugraph/source/api_docs/cugraph/link_analysis.rst
+++ b/docs/cugraph/source/api_docs/cugraph/link_analysis.rst
@@ -4,7 +4,6 @@ Link Analysis
 .. currentmodule:: cugraph
 
 
-
 HITS
 ----
 .. autosummary::
diff --git a/docs/cugraph/source/api_docs/cugraph/link_prediction.rst b/docs/cugraph/source/api_docs/cugraph/link_prediction.rst
index b2134fba9f9..f05dce6f721 100644
--- a/docs/cugraph/source/api_docs/cugraph/link_prediction.rst
+++ b/docs/cugraph/source/api_docs/cugraph/link_prediction.rst
@@ -4,7 +4,6 @@ Link Prediction
 .. currentmodule:: cugraph
 
 
-
 Jaccard Coefficient
 -------------------
 .. autosummary::
diff --git a/docs/cugraph/source/api_docs/cugraph/structure.rst b/docs/cugraph/source/api_docs/cugraph/structure.rst
index 5114cb57b47..6369e1bb3fd 100644
--- a/docs/cugraph/source/api_docs/cugraph/structure.rst
+++ b/docs/cugraph/source/api_docs/cugraph/structure.rst
@@ -10,8 +10,6 @@ Constructors
 
    Graph
    MultiGraph
-   BiPartiteGraph
-
 
 
 Adding Data
@@ -19,7 +17,6 @@ Adding Data
 .. autosummary::
    :toctree: ../api/cugraph/
 
-
    Graph.from_cudf_adjlist
    Graph.from_cudf_edgelist
    Graph.from_dask_cudf_edgelist
@@ -35,7 +32,7 @@ Adding Data
 Checks
 ------
 .. autosummary::
-   :toctree: ../api/cugraph/
+    :toctree: ../api/cugraph/
 
    Graph.has_isolated_vertices
    Graph.is_bipartite
@@ -82,8 +79,6 @@ NumberMap
    :toctree: ../api/cugraph/
 
    cugraph.structure.NumberMap
-   cugraph.structure.NumberMap.MultiGPU
-   cugraph.structure.NumberMap.SingleGPU
    cugraph.structure.NumberMap.from_internal_vertex_id
    cugraph.structure.NumberMap.to_internal_vertex_id
    cugraph.structure.NumberMap.add_internal_vertex_id
diff --git a/docs/cugraph/source/api_docs/cugraph/traversal.rst b/docs/cugraph/source/api_docs/cugraph/traversal.rst
index c8fcc6b721c..31296f3b850 100644
--- a/docs/cugraph/source/api_docs/cugraph/traversal.rst
+++ b/docs/cugraph/source/api_docs/cugraph/traversal.rst
@@ -4,7 +4,6 @@ Traversal
 .. currentmodule:: cugraph
 
 
-
 Breadth-first-search
 --------------------
 .. autosummary::
diff --git a/docs/cugraph/source/api_docs/cugraph/tree.rst b/docs/cugraph/source/api_docs/cugraph/tree.rst
index 38bfbad7d62..5ba2242ebb6 100644
--- a/docs/cugraph/source/api_docs/cugraph/tree.rst
+++ b/docs/cugraph/source/api_docs/cugraph/tree.rst
@@ -4,7 +4,6 @@ Tree
 .. currentmodule:: cugraph
 
 
-
 Minimum Spanning Tree
 ---------------------
 .. autosummary::
diff --git a/docs/cugraph/source/api_docs/cugraph_c/c_and_cpp.rst b/docs/cugraph/source/api_docs/cugraph_c/c_and_cpp.rst
index a3a88e8f6a1..34b812785d3 100644
--- a/docs/cugraph/source/api_docs/cugraph_c/c_and_cpp.rst
+++ b/docs/cugraph/source/api_docs/cugraph_c/c_and_cpp.rst
@@ -1,4 +1,4 @@
 CuGraph C and C++ API Links
 ===========================
 
-`CuGraph C and C++ API <../basics/coming_soon.md>`_
\ No newline at end of file
+coming soon - see https://docs.rapids.ai/api/libcugraph/nightly/
\ No newline at end of file
diff --git a/docs/cugraph/source/api_docs/index.rst b/docs/cugraph/source/api_docs/index.rst
index f6307d5ac36..45f7210f5a2 100644
--- a/docs/cugraph/source/api_docs/index.rst
+++ b/docs/cugraph/source/api_docs/index.rst
@@ -1,16 +1,16 @@
 Python API reference
 ====================
 
-This page provides a list of all publicly accessible modules, methods and classes through
-``cugraph.*`` namespace.
+This page provides a list of all publicly accessible Python modules with in the Graph collection
 
 .. toctree::
     :maxdepth: 2
     :caption: Python API Documentation
 
-    cugraph/cugraph_top.rst
+    cugraph/index.rst
     plc/pylibcugraph.rst
     cugraph-dgl/cugraph_dgl.rst
     cugraph-pyg/cugraph_pyg.rst
-    service/cugraph_service_client.rst
-    service/cugraph_service_server.rst
+    service/index.rst
+    cugraph-ops/index.rst
+
diff --git a/docs/cugraph/source/api_docs/service/index.rst b/docs/cugraph/source/api_docs/service/index.rst
new file mode 100644
index 00000000000..a58cf207456
--- /dev/null
+++ b/docs/cugraph/source/api_docs/service/index.rst
@@ -0,0 +1,10 @@
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+cugraph-service API Reference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. toctree::
+    :maxdepth: 2
+    :caption: cugraph-service API Documentation
+
+    cugraph_service_client
+    cugraph_service_server
\ No newline at end of file
diff --git a/docs/cugraph/source/api_docs/structure.rst b/docs/cugraph/source/api_docs/structure.rst
deleted file mode 100644
index 0d6e287927a..00000000000
--- a/docs/cugraph/source/api_docs/structure.rst
+++ /dev/null
@@ -1,104 +0,0 @@
-=============
-Graph Classes
-=============
-.. currentmodule:: cugraph
-
-Constructors
-------------
-.. autosummary::
-   :toctree: api/
-
-   Graph
-   MultiGraph
-   BiPartiteGraph
-
-
-
-Adding Data
------------
-.. autosummary::
-   :toctree: api/
-
-
-   Graph.from_cudf_adjlist
-   Graph.from_cudf_edgelist
-   Graph.from_dask_cudf_edgelist
-   Graph.from_pandas_adjacency
-   Graph.from_pandas_edgelist
-   Graph.from_numpy_array
-   Graph.from_numpy_matrix
-   Graph.add_internal_vertex_id
-   Graph.add_nodes_from
-   Graph.clear
-   Graph.unrenumber
-
-Checks
-------
-.. autosummary::
-   :toctree: api/
-
-   Graph.has_isolated_vertices
-   Graph.is_bipartite
-   Graph.is_directed
-   Graph.is_multigraph
-   Graph.is_multipartite
-   Graph.is_renumbered
-   Graph.is_weighted
-   Graph.lookup_internal_vertex_id
-   Graph.to_directed
-   Graph.to_undirected
-
-
-Symmetrize
-----------
-.. autosummary::
-   :toctree: api/
-
-   cugraph.symmetrize
-   cugraph.symmetrize_ddf
-   cugraph.symmetrize_df
-
-
-Conversion from Other Formats
------------------------------
-.. autosummary::
-   :toctree: api/
-
-   cugraph.from_adjlist
-   cugraph.from_cudf_edgelist
-   cugraph.from_edgelist
-   cugraph.from_numpy_array
-   cugraph.from_numpy_matrix
-   cugraph.from_pandas_adjacency
-   cugraph.from_pandas_edgelist
-   cugraph.to_numpy_array
-   cugraph.to_numpy_matrix
-   cugraph.to_pandas_adjacency
-   cugraph.to_pandas_edgelist
-
-NumberMap
------------------------------
-.. autosummary::
-   :toctree: api/
-
-   cugraph.structure.NumberMap
-   cugraph.structure.NumberMap.MultiGPU
-   cugraph.structure.NumberMap.SingleGPU
-   cugraph.structure.NumberMap.from_internal_vertex_id
-   cugraph.structure.NumberMap.to_internal_vertex_id
-   cugraph.structure.NumberMap.add_internal_vertex_id
-   cugraph.structure.NumberMap.compute_vals
-   cugraph.structure.NumberMap.compute_vals_types
-   cugraph.structure.NumberMap.generate_unused_column_name
-   cugraph.structure.NumberMap.renumber
-   cugraph.structure.NumberMap.renumber_and_segment
-   cugraph.structure.NumberMap.set_renumbered_col_names
-   cugraph.structure.NumberMap.unrenumber
-   cugraph.structure.NumberMap.vertex_column_size
-
-Other
------------------------------
-.. autosummary::
-   :toctree: api/
-
-   cugraph.hypergraph
diff --git a/docs/cugraph/source/basics/coming_soon.md b/docs/cugraph/source/basics/coming_soon.md
deleted file mode 100644
index 6cf15a95466..00000000000
--- a/docs/cugraph/source/basics/coming_soon.md
+++ /dev/null
@@ -1,6 +0,0 @@
-
-<a href="https://rapids.ai/">
-<img src="https://www.publicdomainpictures.net/pictures/250000/velka/coming-soon.jpg"
-alt="RAPIDS" width="500"></a>
-
-This part of the cuGraph documentation is currently under development.
\ No newline at end of file
diff --git a/docs/cugraph/source/basics/cugraph_toc.md b/docs/cugraph/source/basics/cugraph_toc.md
deleted file mode 100644
index 8b71ad37871..00000000000
--- a/docs/cugraph/source/basics/cugraph_toc.md
+++ /dev/null
@@ -1,13 +0,0 @@
-
-- General
-  - [Basic CuGraph Information]()
-  - [Latest News]()
-  - [Blogs and Presentation]()
-  - [How-to Guides]()
-  - [Performance]()
-- CuGraph Development and Contributing
-  - [Getting cuGraph Packages]()
-  - [Contributing to cuGraph]()
-  - [CuGraph Development Guide]()
-- Algorithms
-  - [Current list of algorithms]()
diff --git a/docs/cugraph/source/basics/index.rst b/docs/cugraph/source/basics/index.rst
index 1875ac22bd8..7bba301b657 100644
--- a/docs/cugraph/source/basics/index.rst
+++ b/docs/cugraph/source/basics/index.rst
@@ -7,6 +7,5 @@ Basics
    :maxdepth: 2
 
    cugraph_intro
-   cugraph_toc.md
    nx_transition
    cugraph_cascading
diff --git a/docs/cugraph/source/conf.py b/docs/cugraph/source/conf.py
index 11225100e87..9835848394c 100644
--- a/docs/cugraph/source/conf.py
+++ b/docs/cugraph/source/conf.py
@@ -76,9 +76,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '23.04'
+version = '23.06'
 # The full version, including alpha/beta/rc tags.
-release = '23.04.01'
+release = '23.06.00'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/cugraph/source/graph_support/algorithms.md b/docs/cugraph/source/graph_support/algorithms.md
index fa2e7cc9553..60243fefb93 100644
--- a/docs/cugraph/source/graph_support/algorithms.md
+++ b/docs/cugraph/source/graph_support/algorithms.md
@@ -22,47 +22,47 @@ Note: Multi-GPU, or MG, includes support for Multi-Node Multi-GPU (also called M
 
 | Category          | Algorithm                          | Scale               | Notes                                                           |
 | ----------------- | ---------------------------------- | ------------------- | --------------------------------------------------------------- |
-| Centrality        |                                    |                     |                                                                 |
-|                   | Katz                               | __Multi-GPU__ |                                                                 |
-|                   | Betweenness Centrality             | Single-GPU          | MG planned for 23.02                                            |
-|                   | Edge Betweenness Centrality        | Single-GPU          | MG planned for 23.02                                            |
-|                   | Eigenvector Centrality             | __Multi-GPU__ |                                                                 |
-|                   | Degree Centrality                  | __Multi-GPU__ | Python only                                                     |
+| [Centrality](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Centrality.ipynb)        |                                    |                     |                                                                 |
+|                   | [Katz](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Katz.ipynb)                               | __Multi-GPU__ |                                                                 |
+|                   | [Betweenness Centrality](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Betweenness.ipynb)             | Single-GPU          | MG planned for 23.02                                            |
+|                   | [Edge Betweenness Centrality](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Betweenness.ipynb)        | Single-GPU          | MG planned for 23.02                                            |
+|                   | [Eigenvector Centrality](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Eigenvector.ipynb)             | __Multi-GPU__ |                                                                 |
+|                   | [Degree Centrality](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Degree.ipynb)                  | __Multi-GPU__ | Python only                                                     |
 | Community         |                                    |                     |                                                                 |
-|                   | Leiden                             | Single-GPU          | MG planned for 23.02                                            |
-|                   | Louvain                            | __Multi-GPU__ |                                                                 |
-|                   | Ensemble Clustering for Graphs     | Single-GPU          |                                                                 |
-|                   | Spectral-Clustering - Balanced Cut | Single-GPU          |                                                                 |
-|                   | Spectral-Clustering - Modularity   | Single-GPU          |                                                                 |
-|                   | Subgraph Extraction                | Single-GPU          |                                                                 |
-|                   | Triangle Counting                  | __Multi-GPU__ |                                                                 |
-|                   | K-Truss                            | Single-GPU          |                                                                 |
+|                   | [Leiden](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Louvain.ipynb)                             | Single-GPU          | MG planned for 23.02                                            |
+|                   | [Louvain](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Louvain.ipynb)                            | __Multi-GPU__ |                                                                 |
+|                   | [Ensemble Clustering for Graphs](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/ECG.ipynb)     | Single-GPU          |                                                                 |
+|                   | [Spectral-Clustering - Balanced Cut](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Spectral-Clustering.ipynb) | Single-GPU          |                                                                 |
+|                   | [Spectral-Clustering - Modularity](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Spectral-Clustering.ipynb)   | Single-GPU          |                                                                 |
+|                   | [Subgraph Extraction](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Subgraph-Extraction.ipyn)                | Single-GPU          |                                                                 |
+|                   | [Triangle Counting](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Triangle-Counting.ipynb)                  | __Multi-GPU__ |                                                                 |
+|                   | [K-Truss](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/ktruss.ipynb)                            | Single-GPU          |                                                                 |
 | Components        |                                    |                     |                                                                 |
-|                   | Weakly Connected Components        | __Multi-GPU__ |                                                                 |
-|                   | Strongly Connected Components      | Single-GPU          |                                                                 |
+|                   | [Weakly Connected Components](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/components/ConnectedComponents.ipynb)        | __Multi-GPU__ |                                                                 |
+|                   | [Strongly Connected Components](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/components/ConnectedComponents.ipynb)      | Single-GPU          |                                                                 |
 | Core              |                                    |                     |                                                                 |
-|                   | K-Core                             | **Multi-GPU** |                                                                 |
-|                   | Core Number                        | **Multi-GPU** |                                                                 |
+|                   | [K-Core](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/cores/kcore.ipynb)                             | **Multi-GPU** |                                                                 |
+|                   | [Core Number](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/cores/core-number.ipynb)                        | **Multi-GPU** |                                                                 |
 | _Flow_          |                                    |                     |                                                                 |
 |                   | _MaxFlow_                        | ---                 |                                                                 |
 | _Influence_     |                                    |                     |                                                                 |
 |                   | _Influence Maximization_         | ---                 |                                                                 |
 | Layout            |                                    |                     |                                                                 |
-|                   | Force Atlas 2                      | Single-GPU          |                                                                 |
+|                   | [Force Atlas 2](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/layout/Force-Atlas2.ipynb)                      | Single-GPU          |                                                                 |
 | Linear Assignment |                                    |                     |                                                                 |
-|                   | Hungarian                          | Single-GPU          | [README](cpp/src/linear_assignment/README-hungarian.md)            |
+|                   | [Hungarian]()                          | Single-GPU          | [README](cpp/src/linear_assignment/README-hungarian.md)            |
 | Link Analysis     |                                    |                     |                                                                 |
-|                   | Pagerank                           | __Multi-GPU__ | [C++ README](cpp/src/centrality/README.md#Pagerank)                |
-|                   | Personal Pagerank                  | __Multi-GPU__ | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank)   |
-|                   | HITS                               | __Multi-GPU__ |                                                                 |
+|                   | [Pagerank](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_analysis/Pagerank.ipynb)                           | __Multi-GPU__ | [C++ README](cpp/src/centrality/README.md#Pagerank)                |
+|                   | [Personal Pagerank]()                  | __Multi-GPU__ | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank)   |
+|                   | [HITS](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_analysis/HITS.ipynb)                               | __Multi-GPU__ |                                                                 |
 | Link Prediction   |                                    |                     |                                                                 |
-|                   | Jaccard Similarity                 | **Multi-GPU**      | MG as of 22.12<br />Directed graph only                         |
-|                   | Weighted Jaccard Similarity        | Single-GPU          |                                                                 |
-|                   | Overlap Similarity                 | **Multi-GPU** | MG as of 22.12                                                  |
-|                   | Sorensen Coefficient               | **Multi-GPU** | MG as of 22.12                                                  |
+|                   | [Jaccard Similarity](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb)                 | **Multi-GPU**      | MG as of 22.12<br />Directed graph only                         |
+|                   | [Weighted Jaccard Similarity](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb)        | Single-GPU          |                                                                 |
+|                   | [Overlap Similarity](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_prediction/Overlap-Similarity.ipynb)                 | **Multi-GPU** | MG as of 22.12                                                  |
+|                   | [Sorensen Coefficient](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_prediction/Sorensen-Similarity.ipynb)               | **Multi-GPU** | MG as of 22.12                                                  |
 |                   | _Local Clustering Coefficient_   | ---                 |                                                                 |
 | Sampling          |                                    |                     |                                                                 |
-|                   | Uniform Random Walks (RW)          | **Multi-GPU** |                                                                 |
+|                   | [Uniform Random Walks RW](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/sampling/RandomWalk.ipynb)          | **Multi-GPU** |                                                                 |
 |                   | *Biased Random Walks (RW)*       | ---                 |                                                                 |
 |                   | Egonet                             | **Multi-GPU** |                                                                 |
 |                   | Node2Vec                           | Single-GPU          | MG planned for 23.02                                            |
diff --git a/docs/cugraph/source/graph_support/datastores.rst b/docs/cugraph/source/graph_support/datastores.rst
index 82db2c0e1b2..f921100774c 100644
--- a/docs/cugraph/source/graph_support/datastores.rst
+++ b/docs/cugraph/source/graph_support/datastores.rst
@@ -1,11 +1,9 @@
-===========
 Data Stores
 ===========
-
-
 .. toctree::
    :maxdepth: 3
-   
+
    property_graph.md
+   knowledge_stores.md
    feature_stores.md
-   knowledge_stores.md
\ No newline at end of file
+
diff --git a/docs/cugraph/source/graph_support/feature_stores.md b/docs/cugraph/source/graph_support/feature_stores.md
index e69de29bb2d..f40cab72ee0 100644
--- a/docs/cugraph/source/graph_support/feature_stores.md
+++ b/docs/cugraph/source/graph_support/feature_stores.md
@@ -0,0 +1,3 @@
+# Feature Store
+
+Coming Soon
\ No newline at end of file
diff --git a/docs/cugraph/source/graph_support/gnn_support.rst b/docs/cugraph/source/graph_support/gnn_support.rst
index 2acb2254a5d..3c92dc36098 100644
--- a/docs/cugraph/source/graph_support/gnn_support.rst
+++ b/docs/cugraph/source/graph_support/gnn_support.rst
@@ -4,9 +4,9 @@ Graph Neural Network Support
 
 
 .. toctree::
-   :maxdepth: 3
+   :maxdepth: 2
    
    PyG_support.md
    DGL_support.md
    cugraphops_support.rst
-   wholegraph_support.rst
\ No newline at end of file
+   wholegraph_support.md
\ No newline at end of file
diff --git a/docs/cugraph/source/graph_support/graph_algorithms.rst b/docs/cugraph/source/graph_support/graph_algorithms.rst
index 885be03bf9a..a8ba01aa915 100644
--- a/docs/cugraph/source/graph_support/graph_algorithms.rst
+++ b/docs/cugraph/source/graph_support/graph_algorithms.rst
@@ -2,7 +2,6 @@
 Algorithms
 ==========
 
-
 .. toctree::
    :maxdepth: 3
    
diff --git a/docs/cugraph/source/graph_support/knowledge_stores.md b/docs/cugraph/source/graph_support/knowledge_stores.md
index e69de29bb2d..4d6028a598c 100644
--- a/docs/cugraph/source/graph_support/knowledge_stores.md
+++ b/docs/cugraph/source/graph_support/knowledge_stores.md
@@ -0,0 +1,3 @@
+# Knowledge Store
+
+Coming Soon
\ No newline at end of file
diff --git a/docs/cugraph/source/graph_support/property_graph.md b/docs/cugraph/source/graph_support/property_graph.md
index 4610cb7c3a3..614910b79bc 100644
--- a/docs/cugraph/source/graph_support/property_graph.md
+++ b/docs/cugraph/source/graph_support/property_graph.md
@@ -1,14 +1,10 @@
-<h1 align="center";>
-  <br>
-  <img src="./pg_example.png" alt="cuGraph" width="400">
-</h1>
-<h1 align="left";>
-  <br>
-Property Graph
-</h1>
+# Property Graph
 
 Part of [RAPIDS](https://rapids.ai) cuGraph, Property Graph allows all the great benefits of cuGraph to be applied to property-rich datasets stored in a graph structure. A Property Graph is really a data model rather than a type of graph.  Within the cuGraph ecosystem, a Property Graph is a meta-graph that can encapsulate and instantiate all the other graph types.  That view stems from property graphs being originally created for database systems.  Conceptually a Property Graph can be viewed as a property rich structure that can be projected onto any graph types.  The Dataversity, has a good definition of [Property Graph](https://www.dataversity.net/what-is-a-property-graph) which contains definitions from a collection of resources.
 
+![Sample Property Graph](../images/pg_example.png)
+
+
 Property Graph enables:
 
 * Multiple edge and node types as seen in the Property Graph API
diff --git a/docs/cugraph/source/graph_support/wholegraph_support.md b/docs/cugraph/source/graph_support/wholegraph_support.md
new file mode 100644
index 00000000000..fa26700a648
--- /dev/null
+++ b/docs/cugraph/source/graph_support/wholegraph_support.md
@@ -0,0 +1,4 @@
+# WholeGraph
+
+[RAPIDS](https://rapids.ai) [WholeGraph](https://github.com/rapidsai/wholegraph) is designed to help train large-scale Graph Neural Networks(GNN).
+Please see [WholeGraph Introduction](https://github.com/rapidsai/wholegraph/blob/main/README.md) for more details
\ No newline at end of file
diff --git a/docs/cugraph/source/graph_support/wholegraph_support.rst b/docs/cugraph/source/graph_support/wholegraph_support.rst
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/docs/cugraph/source/images/pg_example.png b/docs/cugraph/source/images/pg_example.png
new file mode 100644
index 00000000000..5ce8a0f2054
Binary files /dev/null and b/docs/cugraph/source/images/pg_example.png differ
diff --git a/docs/cugraph/source/index.rst b/docs/cugraph/source/index.rst
index 12bc74aea2b..4690669203a 100644
--- a/docs/cugraph/source/index.rst
+++ b/docs/cugraph/source/index.rst
@@ -1,9 +1,19 @@
-Welcome to cuGraph's documentation!
-===================================
-RAPIDS cuGraph is a library of graph algorithms that seamlessly integrates
-into the RAPIDS data science ecosystem and allows the data scientist to
-easily call graph algorithms using data stored in GPU DataFrames,
-NetworkX Graphs, or even CuPy or SciPy sparse Matrices.
+Welcome to RAPIDS Graph documentation
+=====================================
+RAPIDS Graph covers a range of graph libraries and packages, that includes:
+
+* cugraph: GPU-accelerated graph algorithms
+* cugraph-ops: GPU-accelerated GNN aggregators and operators
+* cugraph-service: multi-user, remote GPU-accelerated graph algorithm service
+* cugraph-pyg:  GPU-accelerated extensions for use with the PyG framework
+* cugraph-dgl:  GPU-accelerated extensions for use with the DGL framework
+* wholegraph: shared memory-based GPU-accelerated GNN training
+
+cuGraph is a library of graph algorithms that seamlessly integrates into the RAPIDS data science ecosystem and allows the data scientist to easily call graph algorithms using data stored in GPU DataFrames, NetworkX Graphs, or even CuPy or SciPy sparse Matrices.
+
+Note: We are redoing all of our documents, please be patient as we update
+the docs and links
+
 
 .. toctree::
    :maxdepth: 2
diff --git a/docs/cugraph/source/installation/getting_cugraph.md b/docs/cugraph/source/installation/getting_cugraph.md
index 005938f4777..509508c5283 100644
--- a/docs/cugraph/source/installation/getting_cugraph.md
+++ b/docs/cugraph/source/installation/getting_cugraph.md
@@ -1,13 +1,16 @@
 
 # Getting cuGraph Packages
 
+Start by reading the [RAPIDS Instalation guide](https://docs.rapids.ai/install)  
+and checkout the [RAPIDS install selector](https://rapids.ai/start.html) for a pick list of install options.
+
+
 There are 4 ways to get cuGraph packages:
 1. [Quick start with Docker Repo](#docker)
 2. [Conda Installation](#conda)
 3. [Pip Installation](#pip)
 4. [Build from Source](#SOURCE)
 
-Or checkout the [RAPIDS install selector](https://rapids.ai/start.html) for a pick list of install options.
 
 <br>
 
@@ -35,7 +38,7 @@ Replace the package name in the example below to the one you want to install.
 Install and update cuGraph using the conda command:
 
 ```bash
-conda install -c rapidsai -c numba -c conda-forge -c nvidia cugraph cudatoolkit=11.8
+conda install -c rapidsai -c conda-forge -c nvidia cugraph cudatoolkit=11.8
 ```
 
 Note: This conda installation only applies to Linux and Python versions 3.8/3.10.
diff --git a/docs/cugraph/source/installation/source_build.md b/docs/cugraph/source/installation/source_build.md
index cd9297abcb2..9a93ed6c575 100644
--- a/docs/cugraph/source/installation/source_build.md
+++ b/docs/cugraph/source/installation/source_build.md
@@ -41,7 +41,7 @@ __Create the conda development environment__
 ```bash
 # create the conda environment (assuming in base `cugraph` directory)
 
-# for CUDA 11.5
+# for CUDA 11.x
 conda env create --name cugraph_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
 
 # activate the environment
diff --git a/docs/cugraph/source/references/cugraph_ref.rst b/docs/cugraph/source/references/cugraph_ref.md
similarity index 55%
rename from docs/cugraph/source/references/cugraph_ref.rst
rename to docs/cugraph/source/references/cugraph_ref.md
index f099752c543..a646d6da243 100644
--- a/docs/cugraph/source/references/cugraph_ref.rst
+++ b/docs/cugraph/source/references/cugraph_ref.md
@@ -1,97 +1,46 @@
-##########
-References
-##########
+# References
 
-************
-Architecture
-************
+## Architecture
 
 2-D Data Partitioning
 
-- Kang, S., Fender, A., Eaton, J., & Rees, B. (2020, September) *Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters*. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE.
+- Kang, S., Fender, A., Eaton, J., & Rees, B. (2020, September) *Computing PageRank Scores of Web Crawl Data Using DGX A100 Clusters*.  In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE.
 
+- S. Kang, J. Nke and B. Rees, (2022 September) *Analyzing Multi-trillion Edge Graphs on Large GPU Clusters: A Case Study with PageRank*, In 2022 IEEE High Performance Extreme Computing Conference (HPEC), Waltham, MA, USA, 2022, pp. 1-7, doi: 10.1109/HPEC55821.2022.9926341.
 
-| 
+</br></br>
 
-| 
-
-**********
-Algorithms
-**********
-
-Betweenness Centrality
+## Algorithms
 
+### Betweenness Centrality
 - Brandes, U. (2001). *A faster algorithm for betweenness centrality*. Journal of mathematical sociology, 25(2), 163-177.
 - Brandes, U. (2008). *On variants of shortest-path betweenness centrality and their generic computation*. Social Networks, 30(2), 136-145.
 - McLaughlin, A., & Bader, D. A. (2018). *Accelerating GPU betweenness centrality*. Communications of the ACM, 61(8), 85-92.
 
-
-Katz
-
+### Katz
 - Katz, L. (1953). *A new status index derived from sociometric analysis*. Psychometrika, 18(1), 39-43.
 - Foster, K.C., Muth, S.Q., Potterat, J.J. et al. *A faster Katz status score algorithm*. Computational & Mathematical Organization Theory (2001) 7: 275.
 
-
-
-K-Truss
-
+### K-Truss
 - J. Cohen, *Trusses: Cohesive subgraphs for social network analysis* National security agency technical report, 2008
 - O. Green, J. Fox, E. Kim, F. Busato, et al. *Quickly Finding a Truss in a Haystack* IEEE High Performance Extreme Computing Conference (HPEC), 2017 https://doi.org/10.1109/HPEC.2017.8091038
 - O. Green, P. Yalamanchili, L.M. Munguia, *Fast Triangle Counting on GPU* Irregular Applications: Architectures and Algorithms (IA3), 2014
 
-Hungarian Algorithm
-
+### Hungarian Algorithm
 - Date, K., & Nagi, R. (2016). GPU-accelerated Hungarian algorithms for the Linear Assignment Problem. Parallel Computing, 57, 52-72.
 
 
-Leiden
-
+### Leiden
 - Traag, V. A., Waltman, L., & Van Eck, N. J. (2019). *From Louvain to Leiden: guaranteeing well-connected communities*. Scientific reports, 9(1), 1-12.
 
-Louvain
-
+### Louvain
 - VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre.  *Fast unfolding of community hierarchies in large networks*. J Stat Mech P10008 (2008)
 
+</br></br>
 
-
-
-| 
-
-| 
-
-*************
-Other Papers
-*************
+## Other Papers
 - Hricik, T., Bader, D., & Green, O. (2020, September). *Using RAPIDS AI to Accelerate Graph Data Science Workflows*. In 2020 IEEE High Performance Extreme Computing Conference (HPEC) (pp. 1-4). IEEE.
 
-| 
-
-| 
-
-**********
-Data Sets
-**********
-
-karate
-  - W. W. Zachary, *An information flow model for conflict and fission in small groups*, Journal of Anthropological Research 33, 452-473 (1977).
-dolphins
-  - D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and S. M. Dawson,
-    *The bottlenose dolphin community of Doubtful Sound features a large proportion of long-lasting associations*, 
-    Behavioral Ecology and Sociobiology 54, 396-405 (2003).
-netscience
-  - M. E. J. Newman,
-    *Finding community structure in networks using the eigenvectors of matrices*, 
-    Preprint physics/0605087 (2006).
-email-Eu-core
-  - Hao Yin, Austin R. Benson, Jure Leskovec, and David F. Gleich.
-    *Local Higher-order Graph Clustering.* 
-    In Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. 2017.
-  - J. Leskovec, J. Kleinberg and C. Faloutsos. 
-    *Graph Evolution: Densification and Shrinking Diameters*. 
-    ACM Transactions on Knowledge Discovery from Data (ACM TKDD), 1(1), 2007. http://www.cs.cmu.edu/~jure/pubs/powergrowth-tkdd.pdf 
-polbooks
-  - V. Krebs, unpublished, http://www.orgnet.com/. 
-
-
 
+</br></br>
 
diff --git a/docs/cugraph/source/references/datasets.md b/docs/cugraph/source/references/datasets.md
new file mode 100644
index 00000000000..3d45dec188a
--- /dev/null
+++ b/docs/cugraph/source/references/datasets.md
@@ -0,0 +1,21 @@
+# Data Sets
+
+karate
+  - W. W. Zachary, *An information flow model for conflict and fission in small groups*, Journal of Anthropological Research 33, 452-473 (1977).
+dolphins
+  - D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and S. M. Dawson,
+    *The bottlenose dolphin community of Doubtful Sound features a large proportion of long-lasting associations*, 
+    Behavioral Ecology and Sociobiology 54, 396-405 (2003).
+netscience
+  - M. E. J. Newman,
+    *Finding community structure in networks using the eigenvectors of matrices*, 
+    Preprint physics/0605087 (2006).
+email-Eu-core
+  - Hao Yin, Austin R. Benson, Jure Leskovec, and David F. Gleich.
+    *Local Higher-order Graph Clustering.* 
+    In Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. 2017.
+  - J. Leskovec, J. Kleinberg and C. Faloutsos. 
+    *Graph Evolution: Densification and Shrinking Diameters*. 
+    ACM Transactions on Knowledge Discovery from Data (ACM TKDD), 1(1), 2007. http://www.cs.cmu.edu/~jure/pubs/powergrowth-tkdd.pdf 
+polbooks
+  - V. Krebs, unpublished, http://www.orgnet.com/. 
\ No newline at end of file
diff --git a/docs/cugraph/source/references/datasets.rst b/docs/cugraph/source/references/datasets.rst
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/docs/cugraph/source/references/index.rst b/docs/cugraph/source/references/index.rst
index e70df627fe3..d7a173a8330 100644
--- a/docs/cugraph/source/references/index.rst
+++ b/docs/cugraph/source/references/index.rst
@@ -6,4 +6,6 @@ References
 .. toctree::
    :maxdepth: 3
    
-   cugraph_ref
\ No newline at end of file
+   cugraph_ref.md
+   datasets.md
+   licenses.md
diff --git a/docs/cugraph/source/references/licenses.md b/docs/cugraph/source/references/licenses.md
new file mode 100644
index 00000000000..dfc950023cf
--- /dev/null
+++ b/docs/cugraph/source/references/licenses.md
@@ -0,0 +1,208 @@
+# License
+
+Most of the Graph code is open-sourced and developed under the Apache 2.0 licnese.
+The cugraph-ops code is closed sourced and developed under a NVIDIA copyright 
+
+
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018 NVIDIA CORPORATION
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/docs/cugraph/source/references/licenses.rst b/docs/cugraph/source/references/licenses.rst
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/docs/cugraph/source/tutorials/cugraph_notebooks.md b/docs/cugraph/source/tutorials/cugraph_notebooks.md
index 1624ef10aa5..ade6ed91315 100644
--- a/docs/cugraph/source/tutorials/cugraph_notebooks.md
+++ b/docs/cugraph/source/tutorials/cugraph_notebooks.md
@@ -1,6 +1,6 @@
 # cuGraph Notebooks
 
-![GraphAnalyticsFigure](img/GraphAnalyticsFigure.jpg)
+![GraphAnalyticsFigure](https://github.com/rapidsai/cugraph/tree/main/img/GraphAnalyticsFigure.jpg)
 
 This repository contains a collection of Jupyter Notebooks that outline how to run various cuGraph analytics.   The notebooks do not address a complete data science problem.  The notebooks are simply examples of how to run the graph analytics.  Manipulation of the data before or after the graph analytic is not covered here.   Extended, more problem focused, notebooks are being created and available https://github.com/rapidsai/notebooks-extended
 
@@ -9,43 +9,43 @@ This repository contains a collection of Jupyter Notebooks that outline how to r
 | Folder          | Notebook                                                     | Description                                                  |
 | --------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
 | Centrality      |                                                              |                                                              |
-|                 | [Centrality](algorithms/centrality/Centrality.ipynb)         | Compute and compare multiple (currently 5) centrality scores |
-|                 | [Katz](algorithms/centrality/Katz.ipynb)                     | Compute the Katz centrality for every vertex                 |
-|                 | [Betweenness](algorithms/centrality/Betweenness.ipynb)       | Compute both Edge and Vertex Betweenness centrality          |
-|                 | [Degree](algorithms/centrality/Degree.ipynb)                 | Compute Degree Centraility for each vertex                   |
-|                 | [Eigenvector](algorithms/centrality/Eigenvector.ipynb)       | Compute Eigenvector for every vertex                         |
+|                 | [Centrality](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Centrality.ipynb)         | Compute and compare multiple (currently 5) centrality scores |
+|                 | [Katz](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Katz.ipynb)                     | Compute the Katz centrality for every vertex                 |
+|                 | [Betweenness](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Betweenness.ipynb)       | Compute both Edge and Vertex Betweenness centrality          |
+|                 | [Degree](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Degree.ipynb)                 | Compute Degree Centraility for each vertex                   |
+|                 | [Eigenvector](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/centrality/Eigenvector.ipynb)       | Compute Eigenvector for every vertex                         |
 | Community       |                                                              |                                                              |
-|                 | [Louvain](algorithms/community/Louvain.ipynb) and Leiden                          | Identify clusters in a graph using both the Louvain and Leiden algorithms     |
-|                 | [ECG](algorithms/community/ECG.ipynb)                                   | Identify clusters in a graph using the Ensemble Clustering for Graph |
-|                 | [K-Truss](algorithms/community/ktruss.ipynb)                                | Extracts the K-Truss cluster                                 |
-|                 | [Spectral-Clustering](algorithms/community/Spectral-Clustering.ipynb)   | Identify clusters in a  graph using Spectral Clustering with both<br> - Balanced Cut<br> - Modularity Modularity |
-|                 | [Subgraph Extraction](algorithms/community/Subgraph-Extraction.ipynb)   | Compute a subgraph of the existing graph including only the specified vertices |
-|                 | [Triangle Counting](algorithms/community/Triangle-Counting.ipynb)       | Count the number of Triangle in a graph                      |
+|                 | [Louvain](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Louvain.ipynb) and Leiden                          | Identify clusters in a graph using both the Louvain and Leiden algorithms     |
+|                 | [ECG](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/ECG.ipynb)                                   | Identify clusters in a graph using the Ensemble Clustering for Graph |
+|                 | [K-Truss](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/ktruss.ipynb)                                | Extracts the K-Truss cluster                                 |
+|                 | [Spectral-Clustering](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Spectral-Clustering.ipynb)   | Identify clusters in a  graph using Spectral Clustering with both<br> - Balanced Cut<br> - Modularity Modularity |
+|                 | [Subgraph Extraction](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Subgraph-Extraction.ipynb)   | Compute a subgraph of the existing graph including only the specified vertices |
+|                 | [Triangle Counting](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Triangle-Counting.ipynb)       | Count the number of Triangle in a graph                      |
 | Components      |                                                              |                                                              |
-|                 | [Connected Components](algorithms/components/ConnectedComponents.ipynb) | Find weakly and strongly connected components in a graph     |
+|                 | [Connected Components](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/components/ConnectedComponents.ipynb) | Find weakly and strongly connected components in a graph     |
 | Core            |                                                              |                                                              |
-|                 | [K-Core](algorithms/cores/kcore.ipynb)                                  | Extracts the K-core cluster                                  |
-|                 | [Core Number](algorithms/cores/core-number.ipynb)                       | Computer the Core number for each vertex in a graph          |
+|                 | [K-Core](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/cores/kcore.ipynb)                                  | Extracts the K-core cluster                                  |
+|                 | [Core Number](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/cores/core-number.ipynb)                       | Computer the Core number for each vertex in a graph          |
 Layout            |                                                              |                                                              |
-|                 | [Force-Atlas2](algorithms/layout/Force-Atlas2.ipynb)   |A large graph visualization achieved with cuGraph. |
+|                 | [Force-Atlas2](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/layout/Force-Atlas2.ipynb)   |A large graph visualization achieved with cuGraph. |
 | Link Analysis   |                                                              |                                                              |
-|                 | [Pagerank](algorithms/link_analysis/Pagerank.ipynb)                     | Compute the PageRank of every vertex in a graph              |
-|                 | [HITS](algorithms/link_analysis/HITS.ipynb)                             | Compute the HITS' Hub and Authority scores for every vertex in a graph              |
+|                 | [Pagerank](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_analysis/Pagerank.ipynb)                     | Compute the PageRank of every vertex in a graph              |
+|                 | [HITS](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_analysis/HITS.ipynb)                             | Compute the HITS' Hub and Authority scores for every vertex in a graph              |
 | Link Prediction |                                                              |                                                              |
-|                 | [Jaccard Similarity](algorithms/link_prediction/Jaccard-Similarity.ipynb) | Compute vertex similarity score using both:<br />- Jaccard Similarity<br />- Weighted Jaccard |
-|                 | [Overlap Similarity](algorithms/link_prediction/Overlap-Similarity.ipynb) | Compute vertex similarity score using the Overlap Coefficient |
+|                 | [Jaccard Similarity](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_prediction/Jaccard-Similarity.ipynb) | Compute vertex similarity score using both:<br />- Jaccard Similarity<br />- Weighted Jaccard |
+|                 | [Overlap Similarity](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/link_prediction/Overlap-Similarity.ipynb) | Compute vertex similarity score using the Overlap Coefficient |
 | Sampling        |
-|                 | [Random Walk](algorithms/sampling/RandomWalk.ipynb)                     | Compute Random Walk for a various number of seeds and path lengths |
+|                 | [Random Walk](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/sampling/RandomWalk.ipynb)                     | Compute Random Walk for a various number of seeds and path lengths |
 | Traversal       |                                                              |                                                              |
-|                 | [BFS](algorithms/traversal/BFS.ipynb)                                   | Compute the Breadth First Search path from a starting vertex to every other vertex in a graph |
-|                 | [SSSP](algorithms/traversal/SSSP.ipynb)                                 | Single Source Shortest Path  - compute the shortest path from a starting vertex to every other vertex |
+|                 | [BFS](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/traversal/BFS.ipynb)                                   | Compute the Breadth First Search path from a starting vertex to every other vertex in a graph |
+|                 | [SSSP](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/traversal/SSSP.ipynb)                                 | Single Source Shortest Path  - compute the shortest path from a starting vertex to every other vertex |
 | Structure       |                                                              |                                                              |
 |                 | [Renumbering](algorithms/structure/Renumber.ipynb) <br> [Renumbering 2](algorithms/structure/Renumber-2.ipynb) | Renumber the vertex IDs in a graph (two sample notebooks)    |
 |                 | [Symmetrize](algorithms/structure/Symmetrize.ipynb)                     | Symmetrize the edges in a graph                              |
 
 
 ## RAPIDS notebooks
-Visit the main RAPIDS [notebooks](https://github.com/rapidsai/notebooks) repo for a listing of all notebooks across all RAPIDS libraries.
+Visit the main RAPIDS [notebooks](https://github.com/rapidsai/cugraph/blob/main/notebooks/) repo for a listing of all notebooks across all RAPIDS libraries.
 
 ## Requirements
 
@@ -74,4 +74,4 @@ Unless required by applicable law or agreed to in writing, software distributed
 
 
 
-![RAPIDS](img/rapids_logo.png)
\ No newline at end of file
+![RAPIDS](https://github.com/rapidsai/cugraph/blob/main/img/rapids_logo.png)
\ No newline at end of file
diff --git a/docs/cugraph/source/tutorials/index.rst b/docs/cugraph/source/tutorials/index.rst
index cce3525097b..525fbe4f545 100644
--- a/docs/cugraph/source/tutorials/index.rst
+++ b/docs/cugraph/source/tutorials/index.rst
@@ -9,3 +9,4 @@ Tutorials
    how_to_guides.md
    cugraph_blogs.rst
    community_resources.md
+   cugraph_notebooks.md
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index 73b83cb20d8..a897c145a66 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake)
-  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.04/RAPIDS.cmake
+  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/RAPIDS.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake
   )
 endif()
diff --git a/notebooks/README.md b/notebooks/README.md
index ba94766821c..ad6c04cc463 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -61,9 +61,34 @@ Running the example in these notebooks requires:
 * CUDA 11.4+
 * NVIDIA driver 450.51+
 
+
+
+## Additional Notebooks
+
+The following notebooks are not tested as part of the standard cuGraph continuous integration process.  There is a plan to start testing these notebooks weekly, but until then there is no guarantee that they will work with the nightly release.  The following table list the notebook funtion, where to find the notebook, and the environment used to test the notebook.
+
+If any notebook doesn't run as detailed here, please file an issue in [cuGraph](https://github.com/rapidsai/cugraph/issues)
+
+|Notebook              |Location                 |Environment       |Extra Dependencies|Notes                                        |
+|----------------------|-------------------------|------------------|------------------|---------------------------------------------|
+|Batch Betweenness     |N/A                      |                  |                  |removed due to missing batch algorithm 23.06 |
+|[Multiple GPU Louvain](demo/mg_louvain.ipynb)            |demo                     |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR #3558/23.06                      |
+|[Multiple GPU Pagerank](demo/mg_pagerank.ipynb)           |demo                     |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR #3558/23.06                      |
+|[Multiple GPU Property Graph](demo/mg_property_graph.ipynb)     |demo                     |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR #3558/23.06                      |
+|[Managed Memory Pagerank](demo/uvm.ipynb)                   |demo                     |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR/23.06                            |
+|[Cost Matrix simulating All Points Shortest Path](applications/CostMatrix.ipynb)            |applications             |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR #3551/23.06                      |
+|[Generating Transaction data using RMAT](applications/gen_550M.ipynb)              |applications             |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |tested and documented  PR #3551/23.06        |
+|[Multiple GPU tutorial with Pagerank](https://github.com/rapidsai-community/notebooks-contrib/blob/main/community_tutorials_and_guides/cugraph/multi_gpu_pagerank.ipynb)    |contrib/community/cugraph|[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed notebook-contrib PR #374/23.06         |
+|[Breadth First Search benchmark](cugraph_benchmarks/bfs_benchmark.ipynb)         |cugraph_benchmark        |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR #3561/23.06                      |
+|[Louvain benchmark](cugraph_benchmarks/louvain_benchmark.ipynb)     |cugraph_benchmark        |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR #3561/23.06                      |
+|[Pagerank benchmark](cugraph_benchmarks/pagerank_benchmark.ipynb)    |cugraph_benchmark        |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR #3561/23.06                      |
+|[Single Source Shortest Path benchmark](sssp_benchmarks/bfs_benchmark.ipynb)        |cugraph_benchmark        |[cugraph conda](https://github.com/rapidsai/cugraph/blob/branch-23.06/conda/environments/all_cuda-118_arch-x86_64.yaml)     |None              |fixed in PR #3561/23.06                      |
+
+
+
 #### Copyright
 
-Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");  you may not use this file except in compliance with the License.  You may obtain a copy of the License at
 
diff --git a/notebooks/applications/CostMatrix.ipynb b/notebooks/applications/CostMatrix.ipynb
index 687b1526069..5da010404bb 100644
--- a/notebooks/applications/CostMatrix.ipynb
+++ b/notebooks/applications/CostMatrix.ipynb
@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -12,6 +13,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -22,6 +24,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -36,6 +39,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -44,10 +48,12 @@
     "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware |\n",
     "| --------------|------------|------------------|-----------------|----------------|\n",
     "| Brad Rees     | 06/21/2022 | created          | 22.08           | V100 w 32 GB, CUDA 11.5\n",
-    "| Don Acosta    | 06/28/2022 | modified         | 22.08           | V100 w 32 GB, CUDA 11.5"
+    "| Don Acosta    | 06/28/2022 | modified         | 22.08           | V100 w 32 GB, CUDA 11.5\n",
+    "| Don Acosta    | 05/18/2023 | modified for SSSP change     | 23.06 nightly   | A6000 w 48 GB, CUDA 11.7"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -79,6 +85,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -117,6 +124,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -177,6 +185,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -206,6 +215,7 @@
     "        tmp = _d.copy()\n",
     "        tmp['src'] += id\n",
     "        tmp['dst'] += id\n",
+    "        tmp['wt'] = 1.0\n",
     "        _d = cudf.concat([_d,tmp])\n",
     "        id = id * 2\n",
     "    return _d"
@@ -232,6 +242,15 @@
     "print()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gdf2"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -245,6 +264,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -280,6 +300,7 @@
     "    seeds = cudf.DataFrame()\n",
     "    seeds['dst'] = [((offset * x) + x) for x in range(num_copies)]\n",
     "    seeds['src'] = ghost_node_id\n",
+    "    seeds['wt'] = 1.0\n",
     "    \n",
     "    _d = cudf.concat([_df, seeds])\n",
     "    \n",
@@ -306,6 +327,16 @@
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gdf_with_ghost"
+   ]
+  },
+  {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -322,6 +353,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -339,7 +371,7 @@
    "outputs": [],
    "source": [
     "%time\n",
-    "G.from_cudf_edgelist(gdf_with_ghost, source='src', destination='dst', renumber=False)"
+    "G.from_cudf_edgelist(gdf_with_ghost, source='src', destination='dst', edge_attr='wt',renumber=False)"
    ]
   },
   {
@@ -353,6 +385,16 @@
    ]
   },
   {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "G.edgelist.weights"
+   ]
+  },
+  {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -361,6 +403,16 @@
     "replicated data is connected through that node. This will include extraneous ghost node related data which will be removed in later steps."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(ghost_id)\n",
+    "print(G.number_of_nodes())"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -381,6 +433,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -397,6 +450,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -415,6 +469,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -432,6 +487,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -449,6 +505,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -465,6 +522,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -535,6 +593,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -563,7 +622,7 @@
     "    gdf_with_ghost, ghost_id = add_ghost_node(data, N)\n",
     "    \n",
     "    G = cugraph.Graph(directed=True)\n",
-    "    G.from_cudf_edgelist(gdf_with_ghost, source='src', destination='dst', renumber=False)\n",
+    "    G.from_cudf_edgelist(gdf_with_ghost, source='src', destination='dst', edge_attr='wt',renumber=False)\n",
     "    \n",
     "    X = cugraph.sssp(G, ghost_id)\n",
     "    \n",
@@ -599,11 +658,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "___\n",
-    "Copyright (c) 2022, NVIDIA CORPORATION.\n",
+    "Copyright (c) 2022-2023, NVIDIA CORPORATION.\n",
     "\n",
     "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
     "\n",
@@ -614,9 +674,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cugraph_dev",
+   "display_name": "cugraph_0426",
    "language": "python",
-   "name": "cugraph_dev"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -628,12 +688,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.13"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "cee8a395f2f0c5a5bcf513ae8b620111f4346eff6dc64e1ea99c951b2ec68604"
-   }
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/applications/gen_550M.ipynb b/notebooks/applications/gen_550M.ipynb
index 51369c00dbf..15eccde7bf6 100644
--- a/notebooks/applications/gen_550M.ipynb
+++ b/notebooks/applications/gen_550M.ipynb
@@ -1,8 +1,41 @@
 {
  "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Creating a financial transaction dataset using RMAT\n",
+    "The following are configurable:\n",
+    "* Range of the random transaction amounts\n",
+    "* Date range of the transactions\n",
+    "* Total number of transactions"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Notebook Revisions\n",
+    "\n",
+    "| Author Credit |    Date    |  Update                     | cuGraph Version |  Test Hardware |\n",
+    "| --------------|------------|-----------------------------|-----------------|----------------|\n",
+    "| Don Acosta    | 01/27/2023 | created                     | 23.02 nightly   | V100 w 32 GB, CUDA 11.5\n",
+    "| Don Acosta    | 05/10/2023 | modified and tested         | 23.06 nightly   | A6000 w 48 GB, CUDA 11.7"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Basic imports needed for RMAT"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,9 +58,17 @@
     "from datetime import datetime"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Method to wrap the rmat code allowing scale and edgefactor configuration"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -44,13 +85,21 @@
     "        create_using=None,  # return edgelist instead of Graph instance\n",
     "        mg=False\n",
     "        )\n",
-    "    print('Generating a dataframe of ' + str(len(_gdf)) + '...')\n",
+    "    print('Generating a dataframe of ' + str(len(_gdf)) + ' edges')\n",
     "    return _gdf"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generate column for the date time of each transaction inside the specified range"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -64,9 +113,17 @@
     "#    return [datetime.fromtimestamp(i) for i in random_list]"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create the dollar amount column for transactions."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -78,33 +135,126 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "attachments": {},
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "source": [
+    "Create and write out the csv data file.\n",
+    "\n",
+    "Verified to generate a file containing 33554432 edges (scale 21) on a single GPU. Takes roughly 90 seconds to do that."
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Generating a dataframe of 524288 ...\n",
+      "524288\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>src</th>\n",
+       "      <th>dst</th>\n",
+       "      <th>amounts</th>\n",
+       "      <th>date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>31441</td>\n",
+       "      <td>20910</td>\n",
+       "      <td>14272.12</td>\n",
+       "      <td>1648638850</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>761</td>\n",
+       "      <td>24470</td>\n",
+       "      <td>1358.57</td>\n",
+       "      <td>1652541986</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>22887</td>\n",
+       "      <td>8805</td>\n",
+       "      <td>21196.26</td>\n",
+       "      <td>1643694965</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>22486</td>\n",
+       "      <td>25893</td>\n",
+       "      <td>9165.10</td>\n",
+       "      <td>1645863442</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4335</td>\n",
+       "      <td>16185</td>\n",
+       "      <td>11241.70</td>\n",
+       "      <td>1646718089</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     src    dst   amounts        date\n",
+       "0  31441  20910  14272.12  1648638850\n",
+       "1    761  24470   1358.57  1652541986\n",
+       "2  22887   8805  21196.26  1643694965\n",
+       "3  22486  25893   9165.10  1645863442\n",
+       "4   4335  16185  11241.70  1646718089"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "start_time = '1/1/2022 01:00:00 AM'\n",
     "end_time =   '7/1/2022 01:00:00 AM'\n",
     "amount_range = 25000\n",
     "d1 = datetime.strptime(start_time, '%m/%d/%Y %I:%M:%S %p')\n",
     "d2 = datetime.strptime(end_time, '%m/%d/%Y %I:%M:%S %p')\n",
-    "\n",
-    "df = generate_data(15)\n",
+    "scale = 15\n",
+    "df = generate_data(scale)\n",
     "\n",
     "dates = gen_times(len(df),d1, d2)\n",
     "amounts = gen_amounts(len(df),amount_range)\n",
     "df['amounts'] = amounts\n",
     "df['date'] = dates\n",
-    "len(df)\n",
-    "df.head(4)\n",
-    "df.to_csv('../data/data_500m.csv') #append mode"
+    "filename = \"transaction_data_scale\"+str(scale)+\".csv\"\n",
+    "df.to_csv('../data/'+filename) #append mode\n",
+    "print (len(df))\n",
+    "df.head(5)"
    ]
   }
  ],
@@ -124,7 +274,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.15"
+   "version": "3.10.10"
   },
   "orig_nbformat": 4,
   "vscode": {
diff --git a/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb b/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb
index 6ae695e206e..5a3d502c98b 100644
--- a/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb
+++ b/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb
@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -9,22 +10,15 @@
     "\n",
     "This notebook benchmarks performance of running BFS within cuGraph against NetworkX. \n",
     "\n",
-    "Notebook Credits\n",
     "\n",
-    "    Original Authors: Bradley Rees\n",
-    "    Last Edit: 08/16/2020\n",
-    "    \n",
-    "RAPIDS Versions: 0.15\n",
-    "\n",
-    "Test Hardware\n",
-    "\n",
-    "    GV100 32G, CUDA 10.2\n",
-    "    Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz\n",
-    "    32GB system memory\n",
-    "    \n"
+    "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware |\n",
+    "| --------------|------------|------------------|-----------------|----------------|\n",
+    "| Brad Rees     | 08/16/2020 | created          | 0.15            | GV100 32G, CUDA 10.2\n",
+    "| Don Acosta    | 05/15/2023 | update and test  | 23.06           | A6000 32G, CUDA 11.7"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -45,6 +39,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -54,6 +49,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -64,6 +60,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -81,7 +78,8 @@
     "import time\n",
     "import rmm\n",
     "import cugraph\n",
-    "import cudf"
+    "import cudf\n",
+    "import os"
    ]
   },
   {
@@ -118,10 +116,24 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Define the test data"
+    "### Define the test data\n",
+    "\n",
+    "README.md contains instructions on importing data using the dataPrep.sh script\n",
+    "\n",
+    "| File Name              | Num of Vertices | Num of Edges |\n",
+    "|:---------------------- | --------------: | -----------: |\n",
+    "| preferentialAttachment |         100,000 |      999,970 |\n",
+    "| caidaRouterLevel       |         192,244 |    1,218,132 |\n",
+    "| coAuthorsDBLP          |         299,067 |    1,955,352 |\n",
+    "| dblp-2010              |         326,186 |    1,615,400 |\n",
+    "| citationCiteseer       |         268,495 |    2,313,294 |\n",
+    "| coPapersDBLP           |         540,486 |   30,491,458 |\n",
+    "| coPapersCiteseer       |         434,102 |   32,073,440 |\n",
+    "| as-Skitter             |       1,696,415 |   22,190,596 |"
    ]
   },
   {
@@ -144,6 +156,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -183,7 +196,7 @@
     "    t1 = time.time()\n",
     "        \n",
     "    # cugraph Pagerank Call\n",
-    "    G = cugraph.DiGraph()\n",
+    "    G = cugraph.Graph(directed=True)\n",
     "    G.from_cudf_edgelist(gdf, source='src', destination='dst', renumber=False)\n",
     "    \n",
     "    df = cugraph.bfs(G, 1)\n",
@@ -236,6 +249,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -316,6 +330,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -353,11 +368,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "___\n",
-    "Copyright (c) 2020, NVIDIA CORPORATION.\n",
+    "Copyright (c) 2020-2023, NVIDIA CORPORATION.\n",
     "\n",
     "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
     "\n",
@@ -368,9 +384,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cugraph_dev",
+   "display_name": "cugraph_0510",
    "language": "python",
-   "name": "cugraph_dev"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -382,7 +398,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb b/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb
index 00e99a28617..dc1cfebe341 100644
--- a/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb
+++ b/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb
@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -11,22 +12,10 @@
     "<p><p>\n",
     "\n",
     "\n",
-    "#### Notebook Credits\n",
-    "\n",
-    "    Original Authors: Bradley Rees\n",
-    "    Last Edit: 06/10/2020\n",
-    "\n",
-    "\n",
-    "#### Test Environment\n",
-    "\n",
-    "    RAPIDS Versions: 0.15\n",
-    "\n",
-    "    Test Hardware:\n",
-    "    GV100 32G, CUDA 10,0\n",
-    "    Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz\n",
-    "    32GB system memory\n",
-    "\n",
-    "\n",
+    "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware |\n",
+    "| --------------|------------|------------------|-----------------|----------------|\n",
+    "| Brad Rees     | 08/16/2020 | created          | 0.15            | GV100 32G, CUDA 10.0\n",
+    "| Don Acosta    | 05/15/2023 | update and test  | 23.06           | A6000 32G, CUDA 11.7\n",
     "\n",
     "#### Updates\n",
     "- moved loading ploting libraries to front so that dependencies can be checked before running algorithms\n",
@@ -36,7 +25,6 @@
     "\n",
     "\n",
     "#### Dependencies\n",
-    "- RAPIDS cuDF and cuGraph version 0.6.0 \n",
     "- NetworkX \n",
     "- Matplotlib \n",
     "- Scipy \n",
@@ -47,14 +35,15 @@
     "#### Note: Comparison against published results\n",
     "\n",
     "\n",
-    "The cuGraph blog post included performance numbers that were collected over a year ago.  For the test graphs, int32 values are now used.  That improves GPUs performance.  Additionally, the initial benchamrks were measured on a P100 GPU. \n",
+    "The cuGraph blog post included performance numbers that were collected over a year ago.  For the test graphs, int32 values are now used.  That improves GPUs performance.\n",
     "\n",
-    "This test only comparse the modularity scores and a success is if the scores are within 15% of each other.  That comparison is done by adjusting the NetworkX modularity score and then verifying that the cuGraph score is higher.\n",
+    "This test only compares the modularity scores and a success if the scores are within 15% of each other.  That comparison is done by adjusting the NetworkX modularity score and then verifying that the cuGraph score is higher.\n",
     "\n",
     "cuGraph did a full validation of NetworkX results against cuGraph results.  That included cross-validation of every cluster.  That test is very slow and not included here"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -123,10 +112,24 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Define the test data"
+    "### Define the test data\n",
+    "\n",
+    "README.md contains instructions on downloading the data using the dataPrep.sh script\n",
+    "\n",
+    "| File Name              | Num of Vertices | Num of Edges |\n",
+    "|:---------------------- | --------------: | -----------: |\n",
+    "| preferentialAttachment |         100,000 |      999,970 |\n",
+    "| caidaRouterLevel       |         192,244 |    1,218,132 |\n",
+    "| coAuthorsDBLP          |         299,067 |    1,955,352 |\n",
+    "| dblp-2010              |         326,186 |    1,615,400 |\n",
+    "| citationCiteseer       |         268,495 |    2,313,294 |\n",
+    "| coPapersDBLP           |         540,486 |   30,491,458 |\n",
+    "| coPapersCiteseer       |         434,102 |   32,073,440 |\n",
+    "| as-Skitter             |       1,696,415 |   22,190,596 |"
    ]
   },
   {
@@ -149,6 +152,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -235,6 +239,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -278,6 +283,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -303,6 +309,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -337,11 +344,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "___\n",
-    "Copyright (c) 2020, NVIDIA CORPORATION.\n",
+    "Copyright (c) 2020-2023, NVIDIA CORPORATION.\n",
     "\n",
     "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
     "\n",
@@ -352,9 +360,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cugraph_dev",
+   "display_name": "cugraph_0510",
    "language": "python",
-   "name": "cugraph_dev"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -366,7 +374,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb b/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb
index d0416efdd87..b4c328cb281 100644
--- a/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb
+++ b/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb
@@ -1,35 +1,30 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# PageRank Performance Benchmarking\n",
     "# Skip notebook test\n",
     "\n",
-    "This notebook benchmarks performance of running PageRank within cuGraph against NetworkX. NetworkX contains several implementations of PageRank.  This benchmark will compare cuGraph versus the defaukt Nx implementation as well as the SciPy version\n",
+    "This notebook benchmarks performance of running PageRank within cuGraph against NetworkX. \n",
     "\n",
-    "Notebook Credits\n",
-    "\n",
-    "    Original Authors: Bradley Rees\n",
-    "    Last Edit: 08/16/2020\n",
-    "    \n",
-    "RAPIDS Versions: 0.15\n",
-    "\n",
-    "Test Hardware\n",
-    "\n",
-    "    GV100 32G, CUDA 10,0\n",
-    "    Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz\n",
-    "    32GB system memory\n",
-    "    \n"
+    "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware |\n",
+    "| --------------|------------|------------------|-----------------|----------------|\n",
+    "| Brad Rees     | 08/16/2020 | created          | 0.15            | GV100 32G, CUDA 10.0\n",
+    "| Don Acosta    | 05/17/2023 | removed outdated pagerank_scipy  and tested  | 23.06           | A6000 32G, CUDA 11.7\n"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Test Data\n",
     "\n",
+    "README.md contains instructions to download data using the dataPrep.sh script\n",
+    "\n",
     "| File Name              | Num of Vertices | Num of Edges |\n",
     "|:---------------------- | --------------: | -----------: |\n",
     "| preferentialAttachment |         100,000 |      999,970 |\n",
@@ -39,12 +34,11 @@
     "| citationCiteseer       |         268,495 |    2,313,294 |\n",
     "| coPapersDBLP           |         540,486 |   30,491,458 |\n",
     "| coPapersCiteseer       |         434,102 |   32,073,440 |\n",
-    "| as-Skitter             |       1,696,415 |   22,190,596 |\n",
-    "\n",
-    "\n"
+    "| as-Skitter             |       1,696,415 |   22,190,596 |\n"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -57,6 +51,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -67,6 +62,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -84,7 +80,8 @@
     "import time\n",
     "import rmm\n",
     "import cugraph\n",
-    "import cudf"
+    "import cudf\n",
+    "import os"
    ]
   },
   {
@@ -121,6 +118,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -147,6 +145,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -186,8 +185,8 @@
     "    t1 = time.time()\n",
     "        \n",
     "    # cugraph Pagerank Call\n",
-    "    G = cugraph.DiGraph()\n",
-    "    G.from_cudf_edgelist(gdf, source='src', destination='dst', renumber=False)\n",
+    "    G = cugraph.Graph(directed=True)\n",
+    "    G.from_cudf_edgelist(gdf, source='src', destination='dst', renumber=False, store_transposed=True)\n",
     "    \n",
     "    df = cugraph.pagerank(G, alpha=alpha, max_iter=max_iter, tol=tol)\n",
     "    t2 = time.time() - t1\n",
@@ -239,46 +238,7 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# SciPy PageRank\n",
-    "\n",
-    "def networkx_scipy_call(M, max_iter, tol, alpha):\n",
-    "    nnz_per_row = {r: 0 for r in range(M.get_shape()[0])}\n",
-    "    for nnz in range(M.getnnz()):\n",
-    "        nnz_per_row[M.row[nnz]] = 1 + nnz_per_row[M.row[nnz]]\n",
-    "    for nnz in range(M.getnnz()):\n",
-    "        M.data[nnz] = 1.0/float(nnz_per_row[M.row[nnz]])\n",
-    "\n",
-    "    M = M.tocsr()\n",
-    "    if M is None:\n",
-    "        raise TypeError('Could not read the input graph')\n",
-    "    if M.shape[0] != M.shape[1]:\n",
-    "        raise TypeError('Shape is not square')\n",
-    "\n",
-    "    # should be autosorted, but check just to make sure\n",
-    "    if not M.has_sorted_indices:\n",
-    "        print('sort_indices ... ')\n",
-    "        M.sort_indices()\n",
-    "\n",
-    "    z = {k: 1.0/M.shape[0] for k in range(M.shape[0])}\n",
-    "\n",
-    "    # SciPy Pagerank Call\n",
-    "    print('\\tSciPy Solving... ')\n",
-    "    t1 = time.time()\n",
-    "    \n",
-    "    Gnx = nx.DiGraph(M)    \n",
-    "    \n",
-    "    pr = nx.pagerank_scipy(Gnx, alpha, z, max_iter, tol)\n",
-    "    t2 = time.time() - t1\n",
-    "\n",
-    "    return t2"
-   ]
-  },
-  {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -324,18 +284,13 @@
     "    speedUp = (tn / trapids)\n",
     "    perf_nx.append(speedUp)\n",
     "    time_nx.append(tn)\n",
-    "    \n",
-    "    # Now call SciPy\n",
-    "    tsp = networkx_scipy_call(M, 100, 0.00001, 0.85)\n",
-    "    speedUp = (tsp / trapids)\n",
-    "    perf_sp.append(speedUp)  \n",
-    "    time_sp.append(tsp)\n",
-    "    \n",
-    "    print(\"cuGraph (\" + str(trapids) + \")  Nx (\" + str(tn) + \")  SciPy (\" + str(tsp) + \")\" )\n",
+    "        \n",
+    "    print(\"cuGraph (\" + str(trapids) + \")  Nx (\" + str(tn) + \")\" )\n",
     "    del M"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -356,7 +311,6 @@
     "index = np.arange(len(names))\n",
     "\n",
     "_ = plt.bar(index, perf_nx, bar_width, color='g', label='vs Nx')\n",
-    "_ = plt.bar(index + bar_width, perf_sp, bar_width, color='b', label='vs SciPy')\n",
     "\n",
     "plt.xlabel('Datasets')\n",
     "plt.ylabel('Speedup')\n",
@@ -368,15 +322,12 @@
     "for i in range(len(perf_nx)):\n",
     "    plt.text(x = (i - 0.55) + bar_width, y = perf_nx[i] + 25, s = round(perf_nx[i], 1), size = 12)\n",
     "\n",
-    "for i in range(len(perf_sp)):\n",
-    "    plt.text(x = (i - 0.1) + bar_width, y = perf_sp[i] + 25, s = round(perf_sp[i], 1), size = 12)\n",
-    "\n",
-    "\n",
     "plt.legend()\n",
     "plt.show()"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -429,11 +380,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "___\n",
-    "Copyright (c) 2020, NVIDIA CORPORATION.\n",
+    "Copyright (c) 2020-2023, NVIDIA CORPORATION.\n",
     "\n",
     "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
     "\n",
@@ -444,9 +396,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cugraph_dev",
+   "display_name": "cugraph_0510",
    "language": "python",
-   "name": "cugraph_dev"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -458,7 +410,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb b/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb
index 32b562e7a1e..f6c239f2818 100644
--- a/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb
+++ b/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb
@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -9,27 +10,22 @@
     "\n",
     "This notebook benchmarks performance of running SSSP within cuGraph against NetworkX. \n",
     "\n",
-    "Notebook Credits\n",
     "\n",
-    "    Original Authors: Bradley Rees\n",
-    "    Last Edit: 06/10/2020\n",
-    "    \n",
-    "RAPIDS Versions: 0.15\n",
-    "\n",
-    "Test Hardware\n",
-    "\n",
-    "    GV100 32G, CUDA 10,0\n",
-    "    Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz\n",
-    "    32GB system memory\n",
-    "    \n"
+    "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware |\n",
+    "| --------------|------------|------------------|-----------------|----------------|\n",
+    "| Brad Rees     | 06/10/2020 | created          | 0.15            | GV100 32G, CUDA 10.0\n",
+    "| Don Acosta    | 05/17/2023 | update and test  | 23.06           | A6000 48G, CUDA 11.7\n"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Test Data\n",
     "\n",
+    "README.md contains instructions to download data using the dataPrep.sh script\n",
+    "\n",
     "| File Name              | Num of Vertices | Num of Edges |\n",
     "|:---------------------- | --------------: | -----------: |\n",
     "| preferentialAttachment |         100,000 |      999,970 |\n",
@@ -45,6 +41,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -57,6 +54,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -67,6 +65,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -84,17 +83,7 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# since this is a shared machine - let's pick a GPU that no one else is using\n",
-    "import os\n",
-    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
-   ]
-  },
-  {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -112,7 +101,8 @@
     "import time\n",
     "import rmm\n",
     "import cugraph\n",
-    "import cudf"
+    "import cudf\n",
+    "import os"
    ]
   },
   {
@@ -132,7 +122,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "try: \n",
+    "try:\n",
     "    import matplotlib\n",
     "except ModuleNotFoundError:\n",
     "    os.system('pip install matplotlib')"
@@ -149,6 +139,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -175,6 +166,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -208,14 +200,16 @@
     "    gdf = cudf.DataFrame()\n",
     "    gdf['src'] = M.row\n",
     "    gdf['dst'] = M.col\n",
+    "    # added this since SSSP now requires a weight\n",
+    "    gdf['weight'] = 1\n",
     "    \n",
     "    print('\\tcuGraph Solving... ')\n",
     "    \n",
     "    t1 = time.time()\n",
     "        \n",
     "    # cugraph SSSP Call\n",
-    "    G = cugraph.DiGraph()\n",
-    "    G.from_cudf_edgelist(gdf, source='src', destination='dst', renumber=False)\n",
+    "    G = cugraph.Graph(directed=True)\n",
+    "    G.from_cudf_edgelist(gdf, source='src', destination='dst', weight = 'weight', renumber=False)\n",
     "    \n",
     "    df = cugraph.sssp(G, 1)\n",
     "    t2 = time.time() - t1\n",
@@ -267,6 +261,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -345,6 +340,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -379,11 +375,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "___\n",
-    "Copyright (c) 2020, NVIDIA CORPORATION.\n",
+    "Copyright (c) 2020-2023, NVIDIA CORPORATION.\n",
     "\n",
     "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
     "\n",
@@ -394,9 +391,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cugraph_dev",
+   "display_name": "cugraph_0510",
    "language": "python",
-   "name": "cugraph_dev"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -408,7 +405,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/demo/batch_betweenness.ipynb b/notebooks/demo/batch_betweenness.ipynb
deleted file mode 100644
index 854621eb364..00000000000
--- a/notebooks/demo/batch_betweenness.ipynb
+++ /dev/null
@@ -1,397 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Multi-GPU Batch Betweenness Centrality\n",
-    "#### Author : Xavier Cadet\n",
-    "In this notebook, we will compute Betweenness Centrality for vertices using cuGraph and will see how to **use Multiple GPUs to compute Betweenness Centrality scores**.\n",
-    "\n",
-    "This notebook was tested using 4 NVIDIA Tesla V100-DGX 32G GPUs, using RAPIDS 0.15, and CUDA 10.1. Please be aware that your system may be different and you may need to modify the code or install packages to run the below examples. If you think you have found a bug or an error, please file an issue in [cuGraph](https://github.com/rapidsai/cugraph/issues)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Introduction\n",
-    "Betweenness Centrality can be slow to compute on large graphs, in order to speed up the process we can leverage multiple GPUs.\n",
-    "In this notebook we will showcase how it would have been done with a Single GPU approach, then we will show how it can be done using multiple GPUs."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data\n",
-    "The soc-LiveJournal1 dataset which can be obtained on [SNAP](https://snap.stanford.edu/data/soc-LiveJournal1.html). This graph contains roughly 5 million nodes, and 70 million edges and was extracted from the LiveJournal online social network, further information can be found in:\n",
-    "\n",
-    "*Group Formation in Large Social Networks: Membership, Growth, and Evolution., L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan., KDD, 2006.*\n",
-    "\n",
-    "and:\n",
-    "\n",
-    "*Community Structure in Large Networks: Natural Cluster Sizes and the Absence of Large Well-Defined Clusters., J. Leskovec, K. Lang, A. Dasgupta, M. Mahoney., Internet Mathematics 6(1) 29--123, 2009.*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Betweenness Centrality with cuGraph"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### The imports:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import cugraph\n",
-    "import cudf\n",
-    "\n",
-    "import dask\n",
-    "import dask_cuda\n",
-    "import cugraph.comms as Comms"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import time\n",
-    "import cupy"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Get the data\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import urllib.request\n",
-    "import os\n",
-    "\n",
-    "data_dir = '../data/'\n",
-    "if not os.path.exists(data_dir):\n",
-    "    print('creating data directory')\n",
-    "    os.system('mkdir ../data')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# download the soc-LiveJournal1 dataset\n",
-    "base_url = 'https://snap.stanford.edu/data/'\n",
-    "fn = 'soc-LiveJournal1.txt'\n",
-    "comp = '.gz'\n",
-    "if not os.path.isfile(data_dir + fn):\n",
-    "    if not os.path.isfile(data_dir + fn + comp):\n",
-    "        print(f'Downloading {base_url + fn + comp} to {data_dir + fn + comp}')\n",
-    "        urllib.request.urlretrieve(base_url + fn + comp, data_dir + fn + comp)\n",
-    "    print(f'Decompressing {data_dir + fn + comp}...')\n",
-    "    os.system('gunzip ' + data_dir + fn + comp)\n",
-    "    print(f'{data_dir + fn + comp} decompressed!')\n",
-    "else:\n",
-    "    print(f'Your data file, {data_dir + fn}, already exists')\n",
-    "input_data_path = data_dir + fn"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Single GPU"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Reading the Data - Single GPU\n",
-    "The following shows how we would read the csv file using a single GPU as it is commonly done when using a single GPU with CuGraph."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "t_start_read_sg = time.perf_counter()\n",
-    "e_list = cudf.read_csv(input_data_path, delimiter='\\t', names=['src', 'dst'], dtype=['int32', 'int32'], comment='#')\n",
-    "t_stop_read_sg = time.perf_counter()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"SG Read time: {}s\".format(t_stop_read_sg - t_start_read_sg))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Building the Graph - Single GPU\n",
-    "Once we read the file, we need to build the Graph, we will use a DiGraph, and use the content extracted from the .csv file as an edge list."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "t_start_build_sg = time.perf_counter()\n",
-    "G = cugraph.DiGraph()\n",
-    "G.from_cudf_edgelist(e_list, source='src', destination='dst')\n",
-    "t_stop_build_sg = time.perf_counter()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"SG Build time: {}s\".format(t_stop_build_sg - t_start_build_sg))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Calling the Algorithm -  Single GPU\n",
-    "Now that our graph is built, we can get its betweenness centrality score. Here we will use a sub-sample of 1024 sources in order to have a better approximation of the overall betweenness centrality. We set the seed for comparability with the multi GPU version that comes next."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "t_start_sg = time.perf_counter()\n",
-    "sg_df = cugraph.betweenness_centrality(G, k=1024, seed=123)\n",
-    "t_stop_sg = time.perf_counter()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"SG Time elapsed: {}s\".format(t_stop_sg - t_start_sg))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Now let's use multiple GPUs!"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Using a Dask Cluster\n",
-    "In order to use multiple GPU, we need to ensure that we have Dask Cluster and Client running, further more we need to initialize the CuGraph Communicator."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cluster = dask_cuda.LocalCUDACluster()\n",
-    "client = dask.distributed.Client(cluster)\n",
-    "Comms.initialize(p2p=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Enabling Multi GPU Batch Processing\n",
-    "The good thing is that with a simple `enable_mg_batch` call you can harness the power of Multiple GPUs to operate Batch Processing.\n",
-    "This step might take a few seconds, indeed we need to get the graph available to all GPUS, do not worry, this is only required once or when adding new representations to the graph (adjacency list for example)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "t_start_mg = time.perf_counter()\n",
-    "G.enable_batch()\n",
-    "print(\"MG Batch Enabling Time elapsed: {}s\".format(time.perf_counter() - t_start_mg))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Calling the algorithm\n",
-    "We call the algorithm the same way as we used to, but this time it is much faster as we leverage multiple GPUs to compute the Betweenness Centrality scores."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "t_start_mg = time.perf_counter()\n",
-    "batch_df = cugraph.betweenness_centrality(G, k=1024, seed=123)\n",
-    "t_stop_mg = time.perf_counter()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"MG Time elapsed: {}s\".format(t_stop_mg - t_start_mg))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Verification\n",
-    "Order in the DataFrame might vary, but scores for each vertices match, in order to display them side by side we will first sort the resluts based on the `vertex` key, and renew the DataFramee index."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sorted_sg_df = sg_df.sort_values(\"vertex\").reset_index(drop=True)\n",
-    "sorted_batch_df = batch_df.sort_values(\"vertex\").reset_index(drop=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We can now compare score for each of the vertices:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cupy.allclose(sorted_sg_df[\"betweenness_centrality\"], sorted_batch_df[\"betweenness_centrality\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "And just to visually compare the results we can display the DataFrames:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(sorted_sg_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(sorted_batch_df)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Do not forget to clear the Communicator / client /cluster if required."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "Comms.destroy()\n",
-    "client.close()\n",
-    "cluster.close()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "___\n",
-    "Copyright (c) 2020, NVIDIA CORPORATION.\n",
-    "\n",
-    "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
-    "\n",
-    "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n",
-    "___"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "cugraph_dev",
-   "language": "python",
-   "name": "cugraph_dev"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/demo/mg_jaccard.ipynb b/notebooks/demo/mg_jaccard.ipynb
new file mode 100644
index 00000000000..028deb72c30
--- /dev/null
+++ b/notebooks/demo/mg_jaccard.ipynb
@@ -0,0 +1,384 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Running Jaccard, Sorensen, and Overlap on Multiple GPUs\n",
+    "\n",
+    "This is a Multi-GPU notebook that loads data into a dask_cudf dataframe, creates a Graph, and then runs Jaccard, Sorensen, and Overlap.\n",
+    "\n",
+    "\n",
+    "\n",
+    "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware        |\n",
+    "|---------------|------------|------------------|-----------------|-----------------------|\n",
+    "| Don Acosta    | 04/21/2023 | created          | 23.06 nightly   |  2xA6000 CUDA 11.7    |\n",
+    "| Brad Rees     | 04/24/2023 | Added RMAT       | 23.06 nightly   |  2xA6000 CUDA 11.7    |\n",
+    "\n",
+    "\n",
+    "CuGraph's multi-GPU features leverage Dask. RAPIDS has other projects based on Dask such as dask-cudf and dask-cuda. These products will also be used in this example. Check out [RAPIDS.ai](https://rapids.ai/) to learn more about these technologies."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Basic setup"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    " Refer to https://docs.rapids.ai/install to learn how to create an environment for running cuGraph notebooks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import needed libraries. We recommend using a [conda environment](https://github.com/rapidsai/cugraph/tree/HEAD/conda/environments) provided in the cugraph repo.\n",
+    "from dask.distributed import Client, wait\n",
+    "from dask_cuda import LocalCUDACluster\n",
+    "from cugraph.dask.comms import comms as Comms\n",
+    "\n",
+    "import cugraph.dask as dask_cugraph\n",
+    "import cugraph\n",
+    "from cugraph.generators import rmat\n",
+    "\n",
+    "import dask_cudf\n",
+    "import time\n",
+    "import urllib.request\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Initialize multi-GPU environment\n",
+    "Before we get started, we need to set up a Dask local cluster of workers to execute our work, and a client to coordinate and schedule work for that cluster. As we see below, we can initiate a cluster and client using only 3 lines of code.\n",
+    "\n",
+    "The enable_spilling feature allows the graph stored in GPU memory to spill to host memory if necessary."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def enable_spilling():\n",
+    "    import cudf\n",
+    "    cudf.set_option(\"spill\", True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enable_spilling()\n",
+    "cluster = LocalCUDACluster()\n",
+    "client = Client(cluster)\n",
+    "client.run(enable_spilling)\n",
+    "Comms.initialize(p2p=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data\n",
+    "This notebook will use RMAT to generate a synthetic dataset.  The size of the dataset will be determined by the number of GPUs present.\n",
+    "This appraoch removes the need to find test data of various sizes.\n",
+    "\n",
+    "The notebook will call the cugraph.rmat function and have a dask_cudf DataFrame returned.  The rmat function could simply return a Graph object, which is more memory effecient, but the goal is to show the process starting with a dataframe\n",
+    "\n",
+    "| Number of GPUs | Scale | Edge Factor | Est Number of Nodes | Est Number of Edges |\n",
+    "|----------------|-------|-------------|---------------------|---------------------|\n",
+    "| 1              |  24   |     16      |      16,000,000     |      256,000,000    |\n",
+    "| 2              |  25   |     16      |      32,000,000     |      512,000,000    |\n",
+    "| 3              |  25   |     24      |      32,000,000     |      768,000,000    |\n",
+    "| 4              |  26   |     16      |      64,000,000     |    1,024,000,000    |\n",
+    "| 5              |  26   |     20      |      64,000,000     |    1,280,000,000    |\n",
+    "| 6              |  26   |     24      |      64,000,000     |    1,536,000,000    |\n",
+    "| 7              |  26   |     28      |      64,000,000     |    1,792,000,000    |\n",
+    "| 8              |  27   |     16      |     128,000,000     |    2,048,000,000    |"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rmat_settings = (\n",
+    "    [24 , 16],\n",
+    "    [25 , 16],\n",
+    "    [25 , 24],\n",
+    "    [26 , 16],\n",
+    "    [26 , 20],\n",
+    "    [26 , 24],\n",
+    "    [26 , 28],\n",
+    "    [27 , 16],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import subprocess\n",
+    "def get_gpu_memory_size():\n",
+    "    result = subprocess.check_output(\n",
+    "        [\n",
+    "            'nvidia-smi', '--query-gpu=memory.total'\n",
+    "            , '--format=csv,nounits,noheader'\n",
+    "        ]\n",
+    "    )    \n",
+    "    return result.decode('utf-8').strip().split('\\n')#).strip().split('\\n')\n",
+    "\n",
+    "gpu_info = get_gpu_memory_size()\n",
+    "number_of_gpus = len(gpu_info)\n",
+    "gpu_memory = int(gpu_info[0])\n",
+    "print(f\"the cluster has {number_of_gpus} GPUs where each GPU has {gpu_memory} GB of memory\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scale, factor = rmat_settings[number_of_gpus - 1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if gpu_memory < 3600:\n",
+    "    factor = (int)(factor * 0.75)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create the RMAT dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "numedges = (2**scale)* factor\n",
+    "ddf = rmat(\n",
+    "    scale=scale,\n",
+    "    num_edges=numedges,\n",
+    "    a=0.57,\n",
+    "    b=0.19,\n",
+    "    c=0.19,\n",
+    "    seed=42,\n",
+    "    clip_and_flip=False,\n",
+    "    scramble_vertex_ids=True,\n",
+    "    create_using=None,\n",
+    "    mg=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Let's see how many edges were created"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(ddf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ddf.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create a Graph\n",
+    "yes, the rmat generator could have returned a Graph, but the goal for for this code to also be used as if data was loaded into cuDF via \n",
+    "read_csv, read_parquet, or similar data loading process. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "G = cugraph.Graph(directed=False)\n",
+    "G.from_dask_cudf_edgelist(ddf, renumber=False, source='src', destination='dst')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# we are using 1-hop pairs for demonstration\n",
+    "vertex_pairs = ddf.loc[0:1000]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vertex_pairs.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run Multi-GPU jaccard\n",
+    "\n",
+    "Additional Reading\n",
+    "- [Wikipedia: Jaccard](https://en.wikipedia.org/wiki/Jaccard_index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "jdf = dask_cugraph.jaccard(G,vertex_pairs)\n",
+    "jdf.head(20)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run Multi-GPU Sorensen\n",
+    "\n",
+    "Additional Reading\n",
+    "- [Wikipedia: Sorensen Coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sdf = jdf = dask_cugraph.sorensen(G,vertex_pairs)\n",
+    "sdf.head(20)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run Multi-GPU overlap\n",
+    "\n",
+    "Additional Reading\n",
+    "- [Wikipedia: Sorensen Coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "odf = jdf = dask_cugraph.overlap(G,vertex_pairs)\n",
+    "odf.head(20)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Clean up and Shut down the multi-GPU Environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "del(ddf)\n",
+    "del(G)\n",
+    "\n",
+    "Comms.destroy()\n",
+    "client.close()\n",
+    "cluster.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "___\n",
+    "Copyright (c) 2023, NVIDIA CORPORATION.\n",
+    "\n",
+    "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
+    "\n",
+    "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n",
+    "___"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "cugraph_0411",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/demo/mg_louvain.ipynb b/notebooks/demo/mg_louvain.ipynb
index a23c8eadb82..8e8c5b04115 100644
--- a/notebooks/demo/mg_louvain.ipynb
+++ b/notebooks/demo/mg_louvain.ipynb
@@ -7,11 +7,11 @@
    "source": [
     "# Multiple GPU Louvain in cuGraph\n",
     "\n",
-    "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware        |\n",
-    "|---------------|------------|------------------|-----------------|-----------------------|\n",
-    "| Chuck Hastings| 11/08/2021 | created          | 21.10 nightly   |                       |\n",
-    "| Don Acosta    | 01/30/2023 | updated          | 23.02 nightly   |  2xA6000 CUDA 11.7    |\n",
-    "\n",
+    "| Author Credit |    Date    |  Update            | cuGraph Version |  Test Hardware        |\n",
+    "|---------------|------------|--------------------|-----------------|-----------------------|\n",
+    "| Chuck Hastings| 11/08/2021 | created            | 21.10 nightly   |                       |\n",
+    "| Don Acosta    | 01/30/2023 | updated            | 23.02 nightly   |  2xA6000 CUDA 11.7    |\n",
+    "| Don Acosta    | 05/12/2023 | updated and tested | 23.06 nightly   |  2xA6000 CUDA 11.7    |\n",
     "\n",
     "In this notebook, we will show how to use multiple GPUs in cuGraph to compute the Louvain partitions and global modularity score for a dataset.\n",
     "\n",
@@ -22,6 +22,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -31,7 +32,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -46,6 +47,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -53,6 +55,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -64,11 +67,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Your data file, ../data/hollywood.csv, already exists\n"
+     ]
+    }
+   ],
    "source": [
     "import urllib.request\n",
     "import os\n",
@@ -98,6 +109,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -107,9 +119,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-05-12 08:59:04,694 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-t6i5b0t1', purging\n",
+      "2023-05-12 08:59:04,694 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-pn3z4erj', purging\n",
+      "2023-05-12 08:59:04,695 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
+      "2023-05-12 08:59:04,695 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
+      "2023-05-12 08:59:04,718 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
+      "2023-05-12 08:59:04,718 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n"
+     ]
+    }
+   ],
    "source": [
     "cluster = LocalCUDACluster()\n",
     "client = Client(cluster)\n",
@@ -117,6 +142,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -126,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {
     "tags": []
    },
@@ -136,13 +162,14 @@
     "t_start = time.time()\n",
     "\n",
     "# Helper function to set the reader chunk size to automatically get one partition per GPU  \n",
-    "chunksize = dask_cugraph.get_chunksize(input_data_path)\n",
+    "blocksize = dask_cugraph.get_chunksize(input_data_path)\n",
     "\n",
     "# Multi-GPU CSV reader\n",
-    "e_list = dask_cudf.read_csv(input_data_path, chunksize = chunksize, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32'])"
+    "e_list = dask_cudf.read_csv(input_data_path, blocksize = blocksize, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32'])"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -151,11 +178,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Read, load and renumber:  8.843012809753418 s\n"
+     ]
+    }
+   ],
    "source": [
     "# Create an undirected graph using the source (src) and destination (dst) vertex pairs from the Dataframe \n",
     "G = cugraph.Graph(directed=False)\n",
@@ -166,6 +201,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -174,12 +210,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {
     "scrolled": true,
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Louvain:  5.56727409362793 s\n"
+     ]
+    }
+   ],
    "source": [
     "# Start Pagerank timer\n",
     "t_start = time.time()\n",
@@ -192,6 +236,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -199,6 +244,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -209,19 +255,127 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {
     "scrolled": true,
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>vertex</th>\n",
+       "      <th>partition</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>601225</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>599864</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>601226</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>791343</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>600687</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>570285</th>\n",
+       "      <td>1139894</td>\n",
+       "      <td>20139</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>570286</th>\n",
+       "      <td>1139896</td>\n",
+       "      <td>20140</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>570287</th>\n",
+       "      <td>1139897</td>\n",
+       "      <td>3451</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>570288</th>\n",
+       "      <td>1139898</td>\n",
+       "      <td>40215</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>570289</th>\n",
+       "      <td>1139903</td>\n",
+       "      <td>31869</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1139905 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         vertex  partition\n",
+       "0        601225          2\n",
+       "1        599864          2\n",
+       "2        601226          2\n",
+       "3        791343          0\n",
+       "4        600687          2\n",
+       "...         ...        ...\n",
+       "570285  1139894      20139\n",
+       "570286  1139896      20140\n",
+       "570287  1139897       3451\n",
+       "570288  1139898      40215\n",
+       "570289  1139903      31869\n",
+       "\n",
+       "[1139905 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "louvain_df.compute()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -261,7 +415,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.8"
+   "version": "3.10.10"
   },
   "vscode": {
    "interpreter": {
diff --git a/notebooks/demo/mg_pagerank.ipynb b/notebooks/demo/mg_pagerank.ipynb
index 5e3f05210e5..bb333048450 100644
--- a/notebooks/demo/mg_pagerank.ipynb
+++ b/notebooks/demo/mg_pagerank.ipynb
@@ -1,21 +1,37 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Multiple GPU in cuGraph\n",
-    "#### Author : Alex Fender\n",
+    "\n",
     "\n",
     "In this notebook, we will show how to use multiple GPUs in cuGraph to compute the PageRank of each user in Twitter's dataset.\n",
     "\n",
-    "This notebook was tested using RAPIDS 0.15 and CUDA 10.2. Please be aware that your system may be different, and you may need to modify the code or install packages to run the below examples. If you think you have found a bug or an error, please file an issue in [cuGraph](https://github.com/rapidsai/cugraph/issues)\n",
+    " If you think you have found a bug or an error, please file an issue in [cuGraph](https://github.com/rapidsai/cugraph/issues)\n",
+    "\n",
+    "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware                                           |\n",
+    "|---------------|------------|------------------|-----------------|----------------------------------------------------------|\n",
+    "|Alex Fender    | 11/08/2021 | created          | 0.15            | Tesla V100-DGX 32G GPU CUDA 10.2                         |\n",
+    "| Don Acosta    | 05/11/2023 | updated/tested   | 23.06 nightly   |  2xA6000 CUDA 11.7\n",
     "\n",
     "\n",
     "CuGraph's multi-GPU features leverage Dask. RAPIDS has other projects based on Dask such as dask-cudf and dask-cuda. These products will also be used in this example. Check out [RAPIDS.ai](https://rapids.ai/) to learn more about these technologies."
    ]
   },
   {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Note:\n",
+    "This notebook runs only in releases 23.06 and later. It requires memory optimizations delivered in 23.06"
+   ]
+  },
+  {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -27,6 +43,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -36,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,6 +68,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -58,6 +76,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -69,11 +88,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Your data file, ../data/twitter-2010.csv, already exists\n"
+     ]
+    }
+   ],
    "source": [
     "import urllib.request\n",
     "import os\n",
@@ -102,6 +129,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -111,16 +139,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "def enable_spilling():\n",
+    "    import cudf\n",
+    "    cudf.set_option(\"spill\", True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-05-12 09:22:17,684 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
+      "2023-05-12 09:22:17,684 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n",
+      "2023-05-12 09:22:17,688 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize\n",
+      "2023-05-12 09:22:17,688 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n"
+     ]
+    }
+   ],
    "source": [
     "cluster = LocalCUDACluster()\n",
+    "enable_spilling()\n",
     "client = Client(cluster)\n",
     "Comms.initialize(p2p=True)"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -130,7 +182,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {
     "tags": []
    },
@@ -140,13 +192,14 @@
     "t_start = time.time()\n",
     "\n",
     "# Helper function to set the reader chunk size to automatically get one partition per GPU  \n",
-    "chunksize = dask_cugraph.get_chunksize(input_data_path)\n",
+    "blocksize = dask_cugraph.get_chunksize(input_data_path)\n",
     "\n",
     "# Multi-GPU CSV reader\n",
-    "e_list = dask_cudf.read_csv(input_data_path, chunksize = chunksize, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32'])\n"
+    "e_list = dask_cudf.read_csv(input_data_path, blocksize = blocksize, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32'])\n"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -155,11 +208,305 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-05-12 09:25:01,974 - distributed.sizeof - WARNING - Sizeof calculation failed. Defaulting to 0.95 MiB\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n",
+      "    return sizeof(obj)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n",
+      "    return meth(arg, *args, **kwargs)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n",
+      "    + df._index.memory_usage()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n",
+      "    if self.levels:\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n",
+      "    self._compute_levels_and_codes()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n",
+      "    code, cats = cudf.Series._from_data({None: col}).factorize()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n",
+      "    return cudf.core.algorithms.factorize(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n",
+      "    labels = values._column._label_encoding(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n",
+      "    order = order.take(left_gather_map, check_bounds=False).argsort()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n",
+      "    return self.as_frame()._get_sorted_inds(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n",
+      "    return libcudf.sort.order_by(to_sort, ascending, na_position)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"sort.pyx\", line 141, in cudf._lib.sort.order_by\n",
+      "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniconda3/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n",
+      "2023-05-12 09:25:01,976 - distributed.sizeof - WARNING - Sizeof calculation failed. Defaulting to 0.95 MiB\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n",
+      "    return sizeof(obj)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n",
+      "    return meth(arg, *args, **kwargs)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n",
+      "    + df._index.memory_usage()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n",
+      "    if self.levels:\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n",
+      "    self._compute_levels_and_codes()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n",
+      "    code, cats = cudf.Series._from_data({None: col}).factorize()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n",
+      "    return cudf.core.algorithms.factorize(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n",
+      "    labels = values._column._label_encoding(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n",
+      "    order = order.take(left_gather_map, check_bounds=False).argsort()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n",
+      "    return self.as_frame()._get_sorted_inds(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n",
+      "    return libcudf.sort.order_by(to_sort, ascending, na_position)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"sort.pyx\", line 141, in cudf._lib.sort.order_by\n",
+      "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniconda3/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n",
+      "2023-05-12 09:25:03,767 - distributed.sizeof - WARNING - Sizeof calculation failed. Defaulting to 0.95 MiB\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n",
+      "    return sizeof(obj)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n",
+      "    return meth(arg, *args, **kwargs)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n",
+      "    + df._index.memory_usage()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n",
+      "    if self.levels:\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n",
+      "    self._compute_levels_and_codes()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n",
+      "    code, cats = cudf.Series._from_data({None: col}).factorize()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n",
+      "    return cudf.core.algorithms.factorize(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n",
+      "    labels = values._column._label_encoding(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n",
+      "    order = order.take(left_gather_map, check_bounds=False).argsort()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n",
+      "    return self.as_frame()._get_sorted_inds(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n",
+      "    return libcudf.sort.order_by(to_sort, ascending, na_position)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"sort.pyx\", line 141, in cudf._lib.sort.order_by\n",
+      "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniconda3/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n",
+      "2023-05-12 09:25:03,768 - distributed.sizeof - WARNING - Sizeof calculation failed. Defaulting to 0.95 MiB\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/sizeof.py\", line 17, in safe_sizeof\n",
+      "    return sizeof(obj)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/utils.py\", line 642, in __call__\n",
+      "    return meth(arg, *args, **kwargs)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask_cudf/backends.py\", line 430, in sizeof_cudf_dataframe\n",
+      "    + df._index.memory_usage()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 1594, in memory_usage\n",
+      "    if self.levels:\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 605, in levels\n",
+      "    self._compute_levels_and_codes()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/multiindex.py\", line 748, in _compute_levels_and_codes\n",
+      "    code, cats = cudf.Series._from_data({None: col}).factorize()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/single_column_frame.py\", line 311, in factorize\n",
+      "    return cudf.core.algorithms.factorize(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/algorithms.py\", line 138, in factorize\n",
+      "    labels = values._column._label_encoding(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1385, in _label_encoding\n",
+      "    order = order.take(left_gather_map, check_bounds=False).argsort()\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1101, in argsort\n",
+      "    return self.as_frame()._get_sorted_inds(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 1572, in _get_sorted_inds\n",
+      "    return libcudf.sort.order_by(to_sort, ascending, na_position)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"sort.pyx\", line 141, in cudf._lib.sort.order_by\n",
+      "MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /home/dacosta/miniconda3/envs/cugraph_0411/include/rmm/mr/device/cuda_memory_resource.hpp\n",
+      "2023-05-12 09:25:03,820 - distributed.worker - ERROR - Could not deserialize task ('len-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d', 1)\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2923, in loads_function\n",
+      "    result = cache_loads[bytes_object]\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/collections.py\", line 24, in __getitem__\n",
+      "    value = super().__getitem__(key)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/collections/__init__.py\", line 1106, in __getitem__\n",
+      "    raise KeyError(key)\n",
+      "KeyError: b'\\x80\\x05\\x95>\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x11dask.optimization\\x94\\x8c\\x10SubgraphCallable\\x94\\x93\\x94(}\\x94(\\x8cKlen-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d\\x94\\x8cZassign-getitem-len-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d\\x94\\x8c*rename-01db283bd79fee66f232920c8dc6b55e_.0\\x94\\x8c;getitem-to_frame-rename-01db283bd79fee66f232920c8dc6b55e_.0\\x94\\x8c+getitem-3499fd71ac25ebbc1a06991edea6067c_.0\\x94\\x8c\\t_operator\\x94\\x8c\\x07getitem\\x94\\x93\\x94\\x8c/reset_index-f4c18304ca92859ccd09f44cf89b4b43_.0\\x94\\x8c\\x13__dask_blockwise__1\\x94\\x87\\x94h\\x0c(\\x8c\\ndask.utils\\x94\\x8c\\x05apply\\x94\\x93\\x94h\\x0f\\x8c\\x0cmethodcaller\\x94\\x93\\x94\\x8c\\x0breset_index\\x94\\x85\\x94R\\x94]\\x94\\x8c\\x13__dask_blockwise__5\\x94a\\x8c\\x08builtins\\x94\\x8c\\x04dict\\x94\\x93\\x94]\\x94]\\x94(\\x8c\\x04drop\\x94\\x89ea\\x86\\x94t\\x94h\\x07(h\\x11\\x8c\\x13dask.dataframe.core\\x94\\x8c\\x11apply_and_enforce\\x94\\x93\\x94]\\x94((h\\x11h#]\\x94h\\x0bh\\x0c\\x8c\\x13__dask_blockwise__0\\x94\\x87\\x94ah\\x1b]\\x94(]\\x94(\\x8c\\x05_func\\x94h\\x13\\x8c\\x08to_frame\\x94\\x85\\x94R\\x94e]\\x94(\\x8c\\x05_meta\\x94\\x8c\\x08builtins\\x94\\x8c\\x07getattr\\x94\\x93\\x94\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94\\x8c\\x10host_deserialize\\x94\\x86\\x94R\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94C0\\x80\\x04\\x95%\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94.\\x94\\x8c\\x0ccolumn_names\\x94C\\x14\\x80\\x04\\x95\\t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x03src\\x94\\x85\\x94.\\x94\\x8c\\x07columns\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94C=\\x80\\x04\\x952\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x1acudf.core.column.numerical\\x94\\x8c\\x0fNumericalColumn\\x94\\x93\\x94.\\x94\\x8c\\x05dtype\\x94CB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i4\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94\\x8c\\x18dtype-is-cudf-serialized\\x94\\x89\\x8c\\x04data\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94CI\\x80\\x04\\x95>\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x14SpillableBufferSlice\\x94\\x93\\x94.\\x94\\x8c\\x0bframe_count\\x94K\\x01u\\x8c\\x04mask\\x94}\\x94(hGCD\\x80\\x04\\x959\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x0fSpillableBuffer\\x94\\x93\\x94.\\x94hIK\\x01u\\x8c\\x04size\\x94K\\x00hIK\\x02u\\x85\\x94\\x8c\\x05index\\x94}\\x94(\\x8c\\x0cindex_column\\x94}\\x94(\\x8c\\x05start\\x94K\\x00\\x8c\\x04stop\\x94K\\x00\\x8c\\x04step\\x94K\\x01u\\x8c\\x04name\\x94C\\x04\\x80\\x04N.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i8\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94\\x8c\\x0ftype-serialized\\x94C-\\x80\\x04\\x95\"\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x0fcudf.core.index\\x94\\x8c\\nRangeIndex\\x94\\x93\\x94.\\x94hIK\\x00u\\x8c\\x11index_frame_count\\x94K\\x00\\x8c\\x07is-cuda\\x94]\\x94(\\x88\\x88e\\x8c\\x07lengths\\x94]\\x94(K\\x00K\\x00e\\x8c\\twriteable\\x94NN\\x86\\x94u]\\x94(\\x8c\\x12numpy.core.numeric\\x94\\x8c\\x0b_frombuffer\\x94\\x93\\x94(C\\x00\\x94\\x8c\\x05numpy\\x94hB\\x93\\x94\\x8c\\x02u1\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01|\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94bK\\x00\\x85\\x94\\x8c\\x01C\\x94t\\x94R\\x94he(C\\x00\\x94hkK\\x00\\x85\\x94hot\\x94R\\x94e\\x86\\x94R\\x94ee\\x86\\x94t\\x94\\x8c\\x13__dask_blockwise__2\\x94eh\\x1b]\\x94(]\\x94(h*h\\x13\\x8c\\x06rename\\x94\\x85\\x94R\\x94e]\\x94(h/h2h5h6\\x86\\x94R\\x94}\\x94(h:C0\\x80\\x04\\x95%\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94.\\x94h<C\\x14\\x80\\x04\\x95\\t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x03src\\x94\\x85\\x94.\\x94h>}\\x94(h@C=\\x80\\x04\\x952\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x1acudf.core.column.numerical\\x94\\x8c\\x0fNumericalColumn\\x94\\x93\\x94.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i4\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94hD\\x89hE}\\x94(hGCI\\x80\\x04\\x95>\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x14SpillableBufferSlice\\x94\\x93\\x94.\\x94hIK\\x01uhMK\\x00hIK\\x01u\\x85\\x94hO}\\x94(hQ}\\x94(hSK\\x00hTK\\x00hUK\\x01uhVC\\x04\\x80\\x04N.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i8\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94hYC-\\x80\\x04\\x95\"\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x0fcudf.core.index\\x94\\x8c\\nRangeIndex\\x94\\x93\\x94.\\x94hIK\\x00uh[K\\x00h\\\\]\\x94\\x88ah^]\\x94K\\x00ah`N\\x85\\x94u]\\x94he(C\\x00\\x94hkK\\x00\\x85\\x94hot\\x94R\\x94a\\x86\\x94R\\x94e]\\x94(h>h\\x1b]\\x94]\\x94(\\x8c\\x03src\\x94h\\x9eea\\x86\\x94ee\\x86\\x94t\\x94h\\x05(h\\x11h!\\x8c\\x10_reduction_chunk\\x94\\x93\\x94]\\x94h\\x0b(\\x8c\\x16dask.dataframe.methods\\x94\\x8c\\x06assign\\x94\\x93\\x94h\\x06h\\rh\\x08t\\x94h&\\x87\\x94ah\\x1b]\\x94]\\x94(\\x8c\\taca_chunk\\x94h0\\x8c\\x03len\\x94\\x93\\x94ea\\x86\\x94t\\x94\\x8c\\x13__dask_blockwise__0\\x94h\\x9e\\x8c\\x13__dask_blockwise__1\\x94\\x8c\\x03dst\\x94\\x8c\\x13__dask_blockwise__2\\x94N\\x8c\\x13__dask_blockwise__3\\x94\\x8c)to_frame-804980ae30b71d28f0a6bd3d5b7610f9\\x94\\x8c\\x13__dask_blockwise__4\\x94\\x8c(getitem-15414b72be12e28054238b44933937ab\\x94\\x8c\\x13__dask_blockwise__6\\x94\\x8c3cudf-aggregate-agg-c50c2d97de169ca4f41e43a92a042630\\x94uh\\x04\\x8c\\x13__dask_blockwise__5\\x94\\x85\\x94\\x8c6subgraph_callable-b4ca530e-8895-432e-b553-40a7b5892ab2\\x94t\\x94R\\x94.'\n",
+      "\n",
+      "During handling of the above exception, another exception occurred:\n",
+      "\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2244, in execute\n",
+      "    function, args, kwargs = await self._maybe_deserialize_task(ts)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2216, in _maybe_deserialize_task\n",
+      "    function, args, kwargs = _deserialize(*ts.run_spec)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2937, in _deserialize\n",
+      "    function = loads_function(function)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2925, in loads_function\n",
+      "    result = pickle.loads(bytes_object)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py\", line 96, in loads\n",
+      "    return pickle.loads(x)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 176, in host_deserialize\n",
+      "    obj = cls.device_deserialize(header, frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 130, in device_deserialize\n",
+      "    return typ.deserialize(header, frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py\", line 1019, in deserialize\n",
+      "    obj = super().deserialize(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 106, in deserialize\n",
+      "    columns = deserialize_columns(header[\"columns\"], frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 2450, in deserialize_columns\n",
+      "    colobj = col_typ.deserialize(meta, frames[:col_frame_count])\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1216, in deserialize\n",
+      "    data, frames = unpack(header[\"data\"], frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1204, in unpack\n",
+      "    obj = klass.deserialize(header, frames[:count])\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 574, in deserialize\n",
+      "    return SpillableBuffer.deserialize(header, frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py\", line 335, in deserialize\n",
+      "    return cls._from_device_memory(frame)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 235, in _from_device_memory\n",
+      "    ret._finalize_init(ptr_desc={\"type\": \"gpu\"}, exposed=exposed)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 206, in _finalize_init\n",
+      "    raise ValueError(\n",
+      "ValueError: cannot create <class 'cudf.core.buffer.spillable_buffer.SpillableBuffer'> without a global spill manager\n",
+      "2023-05-12 09:25:03,817 - distributed.worker - ERROR - Could not deserialize task ('len-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d', 0)\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2923, in loads_function\n",
+      "    result = cache_loads[bytes_object]\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/collections.py\", line 24, in __getitem__\n",
+      "    value = super().__getitem__(key)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/collections/__init__.py\", line 1106, in __getitem__\n",
+      "    raise KeyError(key)\n",
+      "KeyError: b'\\x80\\x05\\x95>\\x0b\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x11dask.optimization\\x94\\x8c\\x10SubgraphCallable\\x94\\x93\\x94(}\\x94(\\x8cKlen-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d\\x94\\x8cZassign-getitem-len-chunk-319fe46af5510615b2fae86c6e732896-841a12bf4568ebb80eb2030cc4d9651d\\x94\\x8c*rename-01db283bd79fee66f232920c8dc6b55e_.0\\x94\\x8c;getitem-to_frame-rename-01db283bd79fee66f232920c8dc6b55e_.0\\x94\\x8c+getitem-3499fd71ac25ebbc1a06991edea6067c_.0\\x94\\x8c\\t_operator\\x94\\x8c\\x07getitem\\x94\\x93\\x94\\x8c/reset_index-f4c18304ca92859ccd09f44cf89b4b43_.0\\x94\\x8c\\x13__dask_blockwise__1\\x94\\x87\\x94h\\x0c(\\x8c\\ndask.utils\\x94\\x8c\\x05apply\\x94\\x93\\x94h\\x0f\\x8c\\x0cmethodcaller\\x94\\x93\\x94\\x8c\\x0breset_index\\x94\\x85\\x94R\\x94]\\x94\\x8c\\x13__dask_blockwise__5\\x94a\\x8c\\x08builtins\\x94\\x8c\\x04dict\\x94\\x93\\x94]\\x94]\\x94(\\x8c\\x04drop\\x94\\x89ea\\x86\\x94t\\x94h\\x07(h\\x11\\x8c\\x13dask.dataframe.core\\x94\\x8c\\x11apply_and_enforce\\x94\\x93\\x94]\\x94((h\\x11h#]\\x94h\\x0bh\\x0c\\x8c\\x13__dask_blockwise__0\\x94\\x87\\x94ah\\x1b]\\x94(]\\x94(\\x8c\\x05_func\\x94h\\x13\\x8c\\x08to_frame\\x94\\x85\\x94R\\x94e]\\x94(\\x8c\\x05_meta\\x94\\x8c\\x08builtins\\x94\\x8c\\x07getattr\\x94\\x93\\x94\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94\\x8c\\x10host_deserialize\\x94\\x86\\x94R\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94C0\\x80\\x04\\x95%\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94.\\x94\\x8c\\x0ccolumn_names\\x94C\\x14\\x80\\x04\\x95\\t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x03src\\x94\\x85\\x94.\\x94\\x8c\\x07columns\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94C=\\x80\\x04\\x952\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x1acudf.core.column.numerical\\x94\\x8c\\x0fNumericalColumn\\x94\\x93\\x94.\\x94\\x8c\\x05dtype\\x94CB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i4\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94\\x8c\\x18dtype-is-cudf-serialized\\x94\\x89\\x8c\\x04data\\x94}\\x94(\\x8c\\x0ftype-serialized\\x94CI\\x80\\x04\\x95>\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x14SpillableBufferSlice\\x94\\x93\\x94.\\x94\\x8c\\x0bframe_count\\x94K\\x01u\\x8c\\x04mask\\x94}\\x94(hGCD\\x80\\x04\\x959\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x0fSpillableBuffer\\x94\\x93\\x94.\\x94hIK\\x01u\\x8c\\x04size\\x94K\\x00hIK\\x02u\\x85\\x94\\x8c\\x05index\\x94}\\x94(\\x8c\\x0cindex_column\\x94}\\x94(\\x8c\\x05start\\x94K\\x00\\x8c\\x04stop\\x94K\\x00\\x8c\\x04step\\x94K\\x01u\\x8c\\x04name\\x94C\\x04\\x80\\x04N.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i8\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94\\x8c\\x0ftype-serialized\\x94C-\\x80\\x04\\x95\"\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x0fcudf.core.index\\x94\\x8c\\nRangeIndex\\x94\\x93\\x94.\\x94hIK\\x00u\\x8c\\x11index_frame_count\\x94K\\x00\\x8c\\x07is-cuda\\x94]\\x94(\\x88\\x88e\\x8c\\x07lengths\\x94]\\x94(K\\x00K\\x00e\\x8c\\twriteable\\x94NN\\x86\\x94u]\\x94(\\x8c\\x12numpy.core.numeric\\x94\\x8c\\x0b_frombuffer\\x94\\x93\\x94(C\\x00\\x94\\x8c\\x05numpy\\x94hB\\x93\\x94\\x8c\\x02u1\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01|\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94bK\\x00\\x85\\x94\\x8c\\x01C\\x94t\\x94R\\x94he(C\\x00\\x94hkK\\x00\\x85\\x94hot\\x94R\\x94e\\x86\\x94R\\x94ee\\x86\\x94t\\x94\\x8c\\x13__dask_blockwise__2\\x94eh\\x1b]\\x94(]\\x94(h*h\\x13\\x8c\\x06rename\\x94\\x85\\x94R\\x94e]\\x94(h/h2h5h6\\x86\\x94R\\x94}\\x94(h:C0\\x80\\x04\\x95%\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x13cudf.core.dataframe\\x94\\x8c\\tDataFrame\\x94\\x93\\x94.\\x94h<C\\x14\\x80\\x04\\x95\\t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x03src\\x94\\x85\\x94.\\x94h>}\\x94(h@C=\\x80\\x04\\x952\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x1acudf.core.column.numerical\\x94\\x8c\\x0fNumericalColumn\\x94\\x93\\x94.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i4\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94hD\\x89hE}\\x94(hGCI\\x80\\x04\\x95>\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c!cudf.core.buffer.spillable_buffer\\x94\\x8c\\x14SpillableBufferSlice\\x94\\x93\\x94.\\x94hIK\\x01uhMK\\x00hIK\\x01u\\x85\\x94hO}\\x94(hQ}\\x94(hSK\\x00hTK\\x00hUK\\x01uhVC\\x04\\x80\\x04N.\\x94hBCB\\x80\\x04\\x957\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x05numpy\\x94\\x8c\\x05dtype\\x94\\x93\\x94\\x8c\\x02i8\\x94\\x89\\x88\\x87\\x94R\\x94(K\\x03\\x8c\\x01<\\x94NNNJ\\xff\\xff\\xff\\xffJ\\xff\\xff\\xff\\xffK\\x00t\\x94b.\\x94hYC-\\x80\\x04\\x95\"\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8c\\x0fcudf.core.index\\x94\\x8c\\nRangeIndex\\x94\\x93\\x94.\\x94hIK\\x00uh[K\\x00h\\\\]\\x94\\x88ah^]\\x94K\\x00ah`N\\x85\\x94u]\\x94he(C\\x00\\x94hkK\\x00\\x85\\x94hot\\x94R\\x94a\\x86\\x94R\\x94e]\\x94(h>h\\x1b]\\x94]\\x94(\\x8c\\x03src\\x94h\\x9eea\\x86\\x94ee\\x86\\x94t\\x94h\\x05(h\\x11h!\\x8c\\x10_reduction_chunk\\x94\\x93\\x94]\\x94h\\x0b(\\x8c\\x16dask.dataframe.methods\\x94\\x8c\\x06assign\\x94\\x93\\x94h\\x06h\\rh\\x08t\\x94h&\\x87\\x94ah\\x1b]\\x94]\\x94(\\x8c\\taca_chunk\\x94h0\\x8c\\x03len\\x94\\x93\\x94ea\\x86\\x94t\\x94\\x8c\\x13__dask_blockwise__0\\x94h\\x9e\\x8c\\x13__dask_blockwise__1\\x94\\x8c\\x03dst\\x94\\x8c\\x13__dask_blockwise__2\\x94N\\x8c\\x13__dask_blockwise__3\\x94\\x8c)to_frame-804980ae30b71d28f0a6bd3d5b7610f9\\x94\\x8c\\x13__dask_blockwise__4\\x94\\x8c(getitem-15414b72be12e28054238b44933937ab\\x94\\x8c\\x13__dask_blockwise__6\\x94\\x8c3cudf-aggregate-agg-c50c2d97de169ca4f41e43a92a042630\\x94uh\\x04\\x8c\\x13__dask_blockwise__5\\x94\\x85\\x94\\x8c6subgraph_callable-b4ca530e-8895-432e-b553-40a7b5892ab2\\x94t\\x94R\\x94.'\n",
+      "\n",
+      "During handling of the above exception, another exception occurred:\n",
+      "\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2244, in execute\n",
+      "    function, args, kwargs = await self._maybe_deserialize_task(ts)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2216, in _maybe_deserialize_task\n",
+      "    function, args, kwargs = _deserialize(*ts.run_spec)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py\", line 79, in inner\n",
+      "    return func(*args, **kwds)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2937, in _deserialize\n",
+      "    function = loads_function(function)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py\", line 2925, in loads_function\n",
+      "    result = pickle.loads(bytes_object)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py\", line 96, in loads\n",
+      "    return pickle.loads(x)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 176, in host_deserialize\n",
+      "    obj = cls.device_deserialize(header, frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py\", line 130, in device_deserialize\n",
+      "    return typ.deserialize(header, frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py\", line 1019, in deserialize\n",
+      "    obj = super().deserialize(\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py\", line 106, in deserialize\n",
+      "    columns = deserialize_columns(header[\"columns\"], frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 2450, in deserialize_columns\n",
+      "    colobj = col_typ.deserialize(meta, frames[:col_frame_count])\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1216, in deserialize\n",
+      "    data, frames = unpack(header[\"data\"], frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py\", line 1204, in unpack\n",
+      "    obj = klass.deserialize(header, frames[:count])\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 574, in deserialize\n",
+      "    return SpillableBuffer.deserialize(header, frames)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py\", line 335, in deserialize\n",
+      "    return cls._from_device_memory(frame)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 235, in _from_device_memory\n",
+      "    ret._finalize_init(ptr_desc={\"type\": \"gpu\"}, exposed=exposed)\n",
+      "  File \"/home/dacosta/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py\", line 206, in _finalize_init\n",
+      "    raise ValueError(\n",
+      "ValueError: cannot create <class 'cudf.core.buffer.spillable_buffer.SpillableBuffer'> without a global spill manager\n"
+     ]
+    },
+    {
+     "ename": "ValueError",
+     "evalue": "cannot create <class 'cudf.core.buffer.spillable_buffer.SpillableBuffer'> without a global spill manager",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[6], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m# Create a directed graph using the source (src) and destination (dst) vertex pairs from the Dataframe \u001b[39;00m\n\u001b[1;32m      2\u001b[0m G \u001b[39m=\u001b[39m cugraph\u001b[39m.\u001b[39mGraph(directed\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m----> 3\u001b[0m G\u001b[39m.\u001b[39;49mfrom_dask_cudf_edgelist(e_list, source\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39msrc\u001b[39;49m\u001b[39m'\u001b[39;49m, destination\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mdst\u001b[39;49m\u001b[39m'\u001b[39;49m)\n\u001b[1;32m      5\u001b[0m \u001b[39m# Print time\u001b[39;00m\n\u001b[1;32m      6\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mRead, load and renumber: \u001b[39m\u001b[39m\"\u001b[39m, time\u001b[39m.\u001b[39mtime()\u001b[39m-\u001b[39mt_start, \u001b[39m\"\u001b[39m\u001b[39ms\u001b[39m\u001b[39m\"\u001b[39m)\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cugraph/structure/graph_classes.py:309\u001b[0m, in \u001b[0;36mGraph.from_dask_cudf_edgelist\u001b[0;34m(self, input_ddf, source, destination, edge_attr, renumber, store_transposed, legacy_renum_only)\u001b[0m\n\u001b[1;32m    307\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_Impl\u001b[39m.\u001b[39medgelist \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    308\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mGraph already has values\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 309\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_Impl\u001b[39m.\u001b[39;49m_simpleDistributedGraphImpl__from_edgelist(\n\u001b[1;32m    310\u001b[0m     input_ddf,\n\u001b[1;32m    311\u001b[0m     source,\n\u001b[1;32m    312\u001b[0m     destination,\n\u001b[1;32m    313\u001b[0m     edge_attr,\n\u001b[1;32m    314\u001b[0m     renumber,\n\u001b[1;32m    315\u001b[0m     store_transposed,\n\u001b[1;32m    316\u001b[0m     legacy_renum_only,\n\u001b[1;32m    317\u001b[0m )\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py:272\u001b[0m, in \u001b[0;36msimpleDistributedGraphImpl.__from_edgelist\u001b[0;34m(self, input_ddf, source, destination, edge_attr, renumber, store_transposed, legacy_renum_only)\u001b[0m\n\u001b[1;32m    268\u001b[0m     dst_col_name \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrenumber_map\u001b[39m.\u001b[39mrenumbered_dst_col_name\n\u001b[1;32m    270\u001b[0m ddf \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39medgelist\u001b[39m.\u001b[39medgelist_df\n\u001b[0;32m--> 272\u001b[0m num_edges \u001b[39m=\u001b[39m \u001b[39mlen\u001b[39;49m(ddf)\n\u001b[1;32m    273\u001b[0m edge_data \u001b[39m=\u001b[39m get_distributed_data(ddf)\n\u001b[1;32m    275\u001b[0m graph_props \u001b[39m=\u001b[39m GraphProperties(\n\u001b[1;32m    276\u001b[0m     is_multigraph\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mproperties\u001b[39m.\u001b[39mmulti_edge,\n\u001b[1;32m    277\u001b[0m     is_symmetric\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mproperties\u001b[39m.\u001b[39mdirected,\n\u001b[1;32m    278\u001b[0m )\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/dataframe/core.py:4775\u001b[0m, in \u001b[0;36mDataFrame.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   4773\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__len__\u001b[39m()\n\u001b[1;32m   4774\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 4775\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mlen\u001b[39;49m(s)\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/dataframe/core.py:843\u001b[0m, in \u001b[0;36m_Frame.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    840\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__len__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m    841\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mreduction(\n\u001b[1;32m    842\u001b[0m         \u001b[39mlen\u001b[39;49m, np\u001b[39m.\u001b[39;49msum, token\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mlen\u001b[39;49m\u001b[39m\"\u001b[39;49m, meta\u001b[39m=\u001b[39;49m\u001b[39mint\u001b[39;49m, split_every\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m\n\u001b[0;32m--> 843\u001b[0m     )\u001b[39m.\u001b[39;49mcompute()\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/base.py:314\u001b[0m, in \u001b[0;36mDaskMethodsMixin.compute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m    290\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mcompute\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m    291\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Compute this dask collection\u001b[39;00m\n\u001b[1;32m    292\u001b[0m \n\u001b[1;32m    293\u001b[0m \u001b[39m    This turns a lazy Dask collection into its in-memory equivalent.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    312\u001b[0m \u001b[39m    dask.base.compute\u001b[39;00m\n\u001b[1;32m    313\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 314\u001b[0m     (result,) \u001b[39m=\u001b[39m compute(\u001b[39mself\u001b[39;49m, traverse\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m    315\u001b[0m     \u001b[39mreturn\u001b[39;00m result\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/dask/base.py:599\u001b[0m, in \u001b[0;36mcompute\u001b[0;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[0m\n\u001b[1;32m    596\u001b[0m     keys\u001b[39m.\u001b[39mappend(x\u001b[39m.\u001b[39m__dask_keys__())\n\u001b[1;32m    597\u001b[0m     postcomputes\u001b[39m.\u001b[39mappend(x\u001b[39m.\u001b[39m__dask_postcompute__())\n\u001b[0;32m--> 599\u001b[0m results \u001b[39m=\u001b[39m schedule(dsk, keys, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m    600\u001b[0m \u001b[39mreturn\u001b[39;00m repack([f(r, \u001b[39m*\u001b[39ma) \u001b[39mfor\u001b[39;00m r, (f, a) \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(results, postcomputes)])\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:3186\u001b[0m, in \u001b[0;36mClient.get\u001b[0;34m(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)\u001b[0m\n\u001b[1;32m   3184\u001b[0m         should_rejoin \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m   3185\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 3186\u001b[0m     results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgather(packed, asynchronous\u001b[39m=\u001b[39;49masynchronous, direct\u001b[39m=\u001b[39;49mdirect)\n\u001b[1;32m   3187\u001b[0m \u001b[39mfinally\u001b[39;00m:\n\u001b[1;32m   3188\u001b[0m     \u001b[39mfor\u001b[39;00m f \u001b[39min\u001b[39;00m futures\u001b[39m.\u001b[39mvalues():\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:2345\u001b[0m, in \u001b[0;36mClient.gather\u001b[0;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[1;32m   2343\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m   2344\u001b[0m     local_worker \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 2345\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msync(\n\u001b[1;32m   2346\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_gather,\n\u001b[1;32m   2347\u001b[0m     futures,\n\u001b[1;32m   2348\u001b[0m     errors\u001b[39m=\u001b[39;49merrors,\n\u001b[1;32m   2349\u001b[0m     direct\u001b[39m=\u001b[39;49mdirect,\n\u001b[1;32m   2350\u001b[0m     local_worker\u001b[39m=\u001b[39;49mlocal_worker,\n\u001b[1;32m   2351\u001b[0m     asynchronous\u001b[39m=\u001b[39;49masynchronous,\n\u001b[1;32m   2352\u001b[0m )\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:349\u001b[0m, in \u001b[0;36mSyncMethodMixin.sync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m    347\u001b[0m     \u001b[39mreturn\u001b[39;00m future\n\u001b[1;32m    348\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 349\u001b[0m     \u001b[39mreturn\u001b[39;00m sync(\n\u001b[1;32m    350\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mloop, func, \u001b[39m*\u001b[39;49margs, callback_timeout\u001b[39m=\u001b[39;49mcallback_timeout, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs\n\u001b[1;32m    351\u001b[0m     )\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:416\u001b[0m, in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m    414\u001b[0m \u001b[39mif\u001b[39;00m error:\n\u001b[1;32m    415\u001b[0m     typ, exc, tb \u001b[39m=\u001b[39m error\n\u001b[0;32m--> 416\u001b[0m     \u001b[39mraise\u001b[39;00m exc\u001b[39m.\u001b[39mwith_traceback(tb)\n\u001b[1;32m    417\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    418\u001b[0m     \u001b[39mreturn\u001b[39;00m result\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/utils.py:389\u001b[0m, in \u001b[0;36msync.<locals>.f\u001b[0;34m()\u001b[0m\n\u001b[1;32m    387\u001b[0m         future \u001b[39m=\u001b[39m wait_for(future, callback_timeout)\n\u001b[1;32m    388\u001b[0m     future \u001b[39m=\u001b[39m asyncio\u001b[39m.\u001b[39mensure_future(future)\n\u001b[0;32m--> 389\u001b[0m     result \u001b[39m=\u001b[39m \u001b[39myield\u001b[39;00m future\n\u001b[1;32m    390\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m    391\u001b[0m     error \u001b[39m=\u001b[39m sys\u001b[39m.\u001b[39mexc_info()\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/tornado/gen.py:769\u001b[0m, in \u001b[0;36mRunner.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    766\u001b[0m exc_info \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m    768\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 769\u001b[0m     value \u001b[39m=\u001b[39m future\u001b[39m.\u001b[39;49mresult()\n\u001b[1;32m    770\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m    771\u001b[0m     exc_info \u001b[39m=\u001b[39m sys\u001b[39m.\u001b[39mexc_info()\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/client.py:2208\u001b[0m, in \u001b[0;36mClient._gather\u001b[0;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[1;32m   2206\u001b[0m         exc \u001b[39m=\u001b[39m CancelledError(key)\n\u001b[1;32m   2207\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 2208\u001b[0m         \u001b[39mraise\u001b[39;00m exception\u001b[39m.\u001b[39mwith_traceback(traceback)\n\u001b[1;32m   2209\u001b[0m     \u001b[39mraise\u001b[39;00m exc\n\u001b[1;32m   2210\u001b[0m \u001b[39mif\u001b[39;00m errors \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mskip\u001b[39m\u001b[39m\"\u001b[39m:\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/contextlib.py:79\u001b[0m, in \u001b[0;36minner\u001b[0;34m()\u001b[0m\n\u001b[1;32m     76\u001b[0m \u001b[39m@wraps\u001b[39m(func)\n\u001b[1;32m     77\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39minner\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds):\n\u001b[1;32m     78\u001b[0m     \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_recreate_cm():\n\u001b[0;32m---> 79\u001b[0m         \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwds)\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py:2937\u001b[0m, in \u001b[0;36m_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m   2934\u001b[0m \u001b[39m# Some objects require threadlocal state during deserialization, e.g. to\u001b[39;00m\n\u001b[1;32m   2935\u001b[0m \u001b[39m# detect the current worker\u001b[39;00m\n\u001b[1;32m   2936\u001b[0m \u001b[39mif\u001b[39;00m function \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 2937\u001b[0m     function \u001b[39m=\u001b[39m loads_function(function)\n\u001b[1;32m   2938\u001b[0m \u001b[39mif\u001b[39;00m args \u001b[39mand\u001b[39;00m \u001b[39misinstance\u001b[39m(args, \u001b[39mbytes\u001b[39m):\n\u001b[1;32m   2939\u001b[0m     args \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(args)\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/worker.py:2925\u001b[0m, in \u001b[0;36mloads_function\u001b[0;34m()\u001b[0m\n\u001b[1;32m   2923\u001b[0m     result \u001b[39m=\u001b[39m cache_loads[bytes_object]\n\u001b[1;32m   2924\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyError\u001b[39;00m:\n\u001b[0;32m-> 2925\u001b[0m     result \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(bytes_object)\n\u001b[1;32m   2926\u001b[0m     cache_loads[bytes_object] \u001b[39m=\u001b[39m result\n\u001b[1;32m   2927\u001b[0m \u001b[39mreturn\u001b[39;00m result\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/distributed/protocol/pickle.py:96\u001b[0m, in \u001b[0;36mloads\u001b[0;34m()\u001b[0m\n\u001b[1;32m     94\u001b[0m         \u001b[39mreturn\u001b[39;00m pickle\u001b[39m.\u001b[39mloads(x, buffers\u001b[39m=\u001b[39mbuffers)\n\u001b[1;32m     95\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[0;32m---> 96\u001b[0m         \u001b[39mreturn\u001b[39;00m pickle\u001b[39m.\u001b[39mloads(x)\n\u001b[1;32m     97\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m     98\u001b[0m     logger\u001b[39m.\u001b[39minfo(\u001b[39m\"\u001b[39m\u001b[39mFailed to deserialize \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m, x[:\u001b[39m10000\u001b[39m], exc_info\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py:176\u001b[0m, in \u001b[0;36mhost_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m    154\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Perform device-side deserialization tasks.\u001b[39;00m\n\u001b[1;32m    155\u001b[0m \n\u001b[1;32m    156\u001b[0m \u001b[39mParameters\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    170\u001b[0m \u001b[39m:meta private:\u001b[39;00m\n\u001b[1;32m    171\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m    172\u001b[0m frames \u001b[39m=\u001b[39m [\n\u001b[1;32m    173\u001b[0m     cudf\u001b[39m.\u001b[39mcore\u001b[39m.\u001b[39mbuffer\u001b[39m.\u001b[39mas_buffer(f) \u001b[39mif\u001b[39;00m c \u001b[39melse\u001b[39;00m f\n\u001b[1;32m    174\u001b[0m     \u001b[39mfor\u001b[39;00m c, f \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(header[\u001b[39m\"\u001b[39m\u001b[39mis-cuda\u001b[39m\u001b[39m\"\u001b[39m], \u001b[39mmap\u001b[39m(\u001b[39mmemoryview\u001b[39m, frames))\n\u001b[1;32m    175\u001b[0m ]\n\u001b[0;32m--> 176\u001b[0m obj \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mdevice_deserialize(header, frames)\n\u001b[1;32m    177\u001b[0m \u001b[39mreturn\u001b[39;00m obj\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/abc.py:130\u001b[0m, in \u001b[0;36mdevice_deserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m    125\u001b[0m typ \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m    126\u001b[0m frames \u001b[39m=\u001b[39m [\n\u001b[1;32m    127\u001b[0m     cudf\u001b[39m.\u001b[39mcore\u001b[39m.\u001b[39mbuffer\u001b[39m.\u001b[39mas_buffer(f) \u001b[39mif\u001b[39;00m c \u001b[39melse\u001b[39;00m \u001b[39mmemoryview\u001b[39m(f)\n\u001b[1;32m    128\u001b[0m     \u001b[39mfor\u001b[39;00m c, f \u001b[39min\u001b[39;00m \u001b[39mzip\u001b[39m(header[\u001b[39m\"\u001b[39m\u001b[39mis-cuda\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m    129\u001b[0m ]\n\u001b[0;32m--> 130\u001b[0m \u001b[39mreturn\u001b[39;00m typ\u001b[39m.\u001b[39mdeserialize(header, frames)\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/dataframe.py:1019\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1016\u001b[0m \u001b[39m@classmethod\u001b[39m\n\u001b[1;32m   1017\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdeserialize\u001b[39m(\u001b[39mcls\u001b[39m, header, frames):\n\u001b[1;32m   1018\u001b[0m     index_nframes \u001b[39m=\u001b[39m header[\u001b[39m\"\u001b[39m\u001b[39mindex_frame_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m-> 1019\u001b[0m     obj \u001b[39m=\u001b[39m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mdeserialize(\n\u001b[1;32m   1020\u001b[0m         header, frames[header[\u001b[39m\"\u001b[39m\u001b[39mindex_frame_count\u001b[39m\u001b[39m\"\u001b[39m] :]\n\u001b[1;32m   1021\u001b[0m     )\n\u001b[1;32m   1023\u001b[0m     idx_typ \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mindex\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m   1024\u001b[0m     index \u001b[39m=\u001b[39m idx_typ\u001b[39m.\u001b[39mdeserialize(header[\u001b[39m\"\u001b[39m\u001b[39mindex\u001b[39m\u001b[39m\"\u001b[39m], frames[:index_nframes])\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/frame.py:106\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m    104\u001b[0m cls_deserialize \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m    105\u001b[0m column_names \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mcolumn_names\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m--> 106\u001b[0m columns \u001b[39m=\u001b[39m deserialize_columns(header[\u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m    107\u001b[0m \u001b[39mreturn\u001b[39;00m cls_deserialize\u001b[39m.\u001b[39m_from_data(\u001b[39mdict\u001b[39m(\u001b[39mzip\u001b[39m(column_names, columns)))\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:2450\u001b[0m, in \u001b[0;36mdeserialize_columns\u001b[0;34m()\u001b[0m\n\u001b[1;32m   2448\u001b[0m col_frame_count \u001b[39m=\u001b[39m meta[\u001b[39m\"\u001b[39m\u001b[39mframe_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m   2449\u001b[0m col_typ \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(meta[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m-> 2450\u001b[0m colobj \u001b[39m=\u001b[39m col_typ\u001b[39m.\u001b[39mdeserialize(meta, frames[:col_frame_count])\n\u001b[1;32m   2451\u001b[0m columns\u001b[39m.\u001b[39mappend(colobj)\n\u001b[1;32m   2452\u001b[0m \u001b[39m# Advance frames\u001b[39;00m\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:1216\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1214\u001b[0m     dtype \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mdtype\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m   1215\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m \u001b[39min\u001b[39;00m header:\n\u001b[0;32m-> 1216\u001b[0m     data, frames \u001b[39m=\u001b[39m unpack(header[\u001b[39m\"\u001b[39m\u001b[39mdata\u001b[39m\u001b[39m\"\u001b[39m], frames)\n\u001b[1;32m   1217\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m   1218\u001b[0m     data \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/column/column.py:1204\u001b[0m, in \u001b[0;36munpack\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1202\u001b[0m count \u001b[39m=\u001b[39m header[\u001b[39m\"\u001b[39m\u001b[39mframe_count\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m   1203\u001b[0m klass \u001b[39m=\u001b[39m pickle\u001b[39m.\u001b[39mloads(header[\u001b[39m\"\u001b[39m\u001b[39mtype-serialized\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[0;32m-> 1204\u001b[0m obj \u001b[39m=\u001b[39m klass\u001b[39m.\u001b[39mdeserialize(header, frames[:count])\n\u001b[1;32m   1205\u001b[0m \u001b[39mreturn\u001b[39;00m obj, frames[count:]\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:574\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m    567\u001b[0m \u001b[39m@classmethod\u001b[39m\n\u001b[1;32m    568\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdeserialize\u001b[39m(\u001b[39mcls\u001b[39m, header: \u001b[39mdict\u001b[39m, frames: \u001b[39mlist\u001b[39m):\n\u001b[1;32m    569\u001b[0m     \u001b[39m# TODO: because of the hack in `SpillableBuffer.serialize()` where\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    572\u001b[0m     \u001b[39m# deserialize into `SpillableBufferSlice` when the frames hasn't been\u001b[39;00m\n\u001b[1;32m    573\u001b[0m     \u001b[39m# copied.\u001b[39;00m\n\u001b[0;32m--> 574\u001b[0m     \u001b[39mreturn\u001b[39;00m SpillableBuffer\u001b[39m.\u001b[39mdeserialize(header, frames)\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/buffer.py:335\u001b[0m, in \u001b[0;36mdeserialize\u001b[0;34m()\u001b[0m\n\u001b[1;32m    332\u001b[0m     \u001b[39mreturn\u001b[39;00m frame  \u001b[39m# The frame is already deserialized\u001b[39;00m\n\u001b[1;32m    334\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mhasattr\u001b[39m(frame, \u001b[39m\"\u001b[39m\u001b[39m__cuda_array_interface__\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[0;32m--> 335\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39m_from_device_memory(frame)\n\u001b[1;32m    336\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39m_from_host_memory(frame)\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:235\u001b[0m, in \u001b[0;36m_from_device_memory\u001b[0;34m()\u001b[0m\n\u001b[1;32m    218\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Create a spillabe buffer from device memory.\u001b[39;00m\n\u001b[1;32m    219\u001b[0m \n\u001b[1;32m    220\u001b[0m \u001b[39mNo data is being copied.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    232\u001b[0m \u001b[39m    Buffer representing the same device memory as `data`\u001b[39;00m\n\u001b[1;32m    233\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m    234\u001b[0m ret \u001b[39m=\u001b[39m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m_from_device_memory(data)\n\u001b[0;32m--> 235\u001b[0m ret\u001b[39m.\u001b[39m_finalize_init(ptr_desc\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mgpu\u001b[39m\u001b[39m\"\u001b[39m}, exposed\u001b[39m=\u001b[39mexposed)\n\u001b[1;32m    236\u001b[0m \u001b[39mreturn\u001b[39;00m ret\n",
+      "File \u001b[0;32m~/miniconda3/envs/cugraph_0411/lib/python3.10/site-packages/cudf/core/buffer/spillable_buffer.py:206\u001b[0m, in \u001b[0;36m_finalize_init\u001b[0;34m()\u001b[0m\n\u001b[1;32m    204\u001b[0m manager \u001b[39m=\u001b[39m get_global_manager()\n\u001b[1;32m    205\u001b[0m \u001b[39mif\u001b[39;00m manager \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 206\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m    207\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcannot create \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m without \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    208\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39ma global spill manager\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m    209\u001b[0m     )\n\u001b[1;32m    211\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_manager \u001b[39m=\u001b[39m manager\n\u001b[1;32m    212\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_manager\u001b[39m.\u001b[39madd(\u001b[39mself\u001b[39m)\n",
+      "\u001b[0;31mValueError\u001b[0m: cannot create <class 'cudf.core.buffer.spillable_buffer.SpillableBuffer'> without a global spill manager"
+     ]
+    }
+   ],
    "source": [
     "# Create a directed graph using the source (src) and destination (dst) vertex pairs from the Dataframe \n",
     "G = cugraph.Graph(directed=True)\n",
@@ -170,6 +517,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -196,6 +544,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -204,6 +553,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -236,6 +586,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -251,6 +602,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -269,6 +621,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -280,11 +633,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "___\n",
-    "Copyright (c) 2020, NVIDIA CORPORATION.\n",
+    "Copyright (c) 2020-2023, NVIDIA CORPORATION.\n",
     "\n",
     "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
     "\n",
@@ -295,9 +649,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cugraph_dev",
+   "display_name": "cugraph_0411",
    "language": "python",
-   "name": "cugraph_dev"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -309,7 +663,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/demo/mg_property_graph.ipynb b/notebooks/demo/mg_property_graph.ipynb
index 0a5aa5df8a3..d2f0f456c17 100644
--- a/notebooks/demo/mg_property_graph.ipynb
+++ b/notebooks/demo/mg_property_graph.ipynb
@@ -14,7 +14,7 @@
     "| Author Credit |    Date    |  Update          | cuGraph Version |  Test Hardware        |\n",
     "|---------------|------------|------------------|-----------------|-----------------------|\n",
     "| Don Acosta    | 01/30/2023 | created          | 23.02 nightly   |  2xA6000 CUDA 11.7    |\n",
-    "\n",
+    "| Don Acosta    | 05/15/2023 | update/test      | 23.06 nightly   |  2xA6000 CUDA 11.7    |\n",
     "\n",
     "CuGraph's multi-GPU features leverage Dask. RAPIDS has other projects based on Dask such as dask-cudf and dask-cuda. These products will also be used in this example. Check out [RAPIDS.ai](https://rapids.ai/) to learn more about these technologies."
    ]
@@ -89,6 +89,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -108,6 +109,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -126,10 +128,10 @@
     "\n",
     "# Helper function to set the reader chunk size to automatically get one partition per GPU  \n",
     "input_data_path = get_data_file()\n",
-    "chunksize = dask_cugraph.get_chunksize(input_data_path)\n",
+    "blocksize = dask_cugraph.get_chunksize(input_data_path)\n",
     "\n",
     "# Multi-GPU CSV reader\n",
-    "e_list = dask_cudf.read_csv(input_data_path, chunksize = chunksize, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32'])"
+    "e_list = dask_cudf.read_csv(input_data_path, blocksize = blocksize, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32'])"
    ]
   },
   {
@@ -154,6 +156,14 @@
     "pG.get_num_edges()"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run louvain and print out the partition data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -215,7 +225,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.15"
+   "version": "3.10.10"
   },
   "orig_nbformat": 4,
   "vscode": {
diff --git a/notebooks/demo/uvm.ipynb b/notebooks/demo/uvm.ipynb
index d4d9caeba43..96dcb425f06 100644
--- a/notebooks/demo/uvm.ipynb
+++ b/notebooks/demo/uvm.ipynb
@@ -1,22 +1,31 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# Oversubscribing GPU memory in cuGraph\n",
-    "#### Author : Alex Fender\n",
     "# Skip notebook test\n",
     "\n",
+    "\n",
     "In this notebook, we will show how to **scale to 4x larger graphs than before** without incurring a performance drop using managed memory features in cuGraph. We will compute the PageRank of each user in Twitter's dataset on a single GPU as an example. This technique applies to all features.\n",
     "\n",
     "Unified Memory is a single memory address space accessible from any processor in a system. If a kernel tries to access any absent pages,the Page Migration Engine migrates the pages. When the GPU memory is full, the least recently used pages are evicted. In other words, Unified Memory transparently enables oversubscribing GPU memory, enabling out-of-core computations.\n",
     "\n",
     "\n",
-    "This notebook was tested on an NVIDIA 48GB RTX8000 GPU using RAPIDS 0.14 and CUDA 10.2. Please be aware that your system may be different, and you may need to modify the code or install packages to run the below examples. If you think you have found a bug or an error, please file an issue in [cuGraph](https://github.com/rapidsai/cugraph/issues)"
+    "This notebook was tested on an NVIDIA 48GB RTX8000 GPU using RAPIDS 0.14 and CUDA 10.2. Please be aware that your system may be different, and you may need to modify the code or install packages to run the below examples. If you think you have found a bug or an error, please file an issue in [cuGraph](https://github.com/rapidsai/cugraph/issues)\n",
+    "\n",
+    "| Author Credit   |    Date    |  Update               | cuGraph Version |  Test Hardware                 |\n",
+    "|-----------------|------------|-----------------------|-----------------|--------------------------------|\n",
+    "|Alex Fender      | 05/15/2020 | created               | 0.14            |  RTX8000 48G GPU CUDA 10.2     |\n",
+    "|Chuck Hastings   | 02/21/2021 | incorporate dendrogram| 0.18            |                                |\n",
+    "|Jordan Jacobelli | 11/08/2021 | update dataset URL    | 23.04           |                                |\n",
+    "| Don Acosta      | 05/21/2023 | updated/tested          | 23.06 nightly   |  2xA6000 CUDA 11.7           |\n"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -30,6 +39,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -58,6 +68,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -78,6 +89,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -85,6 +97,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -135,6 +148,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -159,6 +173,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -174,8 +189,8 @@
     "t_start = time.time()\n",
     "\n",
     "# Create a directed graph using the source (src) and destination (dst) vertex pairs from the Dataframe \n",
-    "G = cugraph.DiGraph()\n",
-    "G.from_cudf_edgelist(e_list, source='src', destination='dst', renumber=False)\n",
+    "G = cugraph.Graph(directed=True)\n",
+    "G.from_cudf_edgelist(e_list, source='src', destination='dst', renumber=False, store_transposed=True)\n",
     "\n",
     "# (optional) request the transposed here so that we can analyse pagerank solver time alone\n",
     "G.view_transposed_adj_list()\n",
@@ -185,6 +200,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -210,6 +226,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -218,6 +235,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -249,6 +267,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -264,6 +283,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -275,11 +295,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "___\n",
-    "Copyright (c) 2020, NVIDIA CORPORATION.\n",
+    "Copyright (c) 2020-2023, NVIDIA CORPORATION.\n",
     "\n",
     "Licensed under the Apache License, Version 2.0 (the \"License\");  you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n",
     "\n",
@@ -290,9 +311,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cugraph_dev",
+   "display_name": "cugraph_0411",
    "language": "python",
-   "name": "cugraph_dev"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -304,7 +325,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,
diff --git a/python/cugraph-dgl/cugraph_dgl/__init__.py b/python/cugraph-dgl/cugraph_dgl/__init__.py
index 55dc271cbd6..111961d8fdc 100644
--- a/python/cugraph-dgl/cugraph_dgl/__init__.py
+++ b/python/cugraph-dgl/cugraph_dgl/__init__.py
@@ -20,4 +20,4 @@
 import cugraph_dgl.dataloading
 import cugraph_dgl.nn
 
-__version__ = "23.04.01"
+__version__ = "23.06.00"
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
index 31528d7061c..e3358f1dca6 100644
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
+++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
@@ -138,6 +138,7 @@ def __init__(
         self._sampling_output_dir = sampling_output_dir
         self._batches_per_partition = batches_per_partition
         self._seeds_per_call = seeds_per_call
+        self._rank = None
 
         indices = _dgl_idx_to_cugraph_idx(indices, graph)
 
@@ -187,16 +188,16 @@ def __init__(
                         f"Fetch cugraph_dgl_mg_graph_ds to worker_id {rank}",
                         "from worker_id 0 failed",
                     )
-            self._rank = rank
         else:
+            rank = 0
             G = create_cugraph_graph_from_edges_dict(
                 edges_dict=graph._edges_dict,
                 etype_id_dict=graph._etype_id_dict,
                 edge_dir=graph_sampler.edge_dir,
             )
-            self._rank = 0
-        self._cugraph_graph = G
 
+        self._rank = rank
+        self._cugraph_graph = G
         super().__init__(
             self.cugraph_dgl_dataset,
             batch_size=None,
@@ -209,14 +210,12 @@ def __iter__(self):
         output_dir = os.path.join(
             self._sampling_output_dir, "epoch_" + str(self.epoch_number)
         )
-        rank = self._rank
         bs = BulkSampler(
             output_path=output_dir,
             batch_size=self._batch_size,
             graph=self._cugraph_graph,
             batches_per_partition=self._batches_per_partition,
             seeds_per_call=self._seeds_per_call,
-            rank=rank,
             fanout_vals=self.graph_sampler._reversed_fanout_vals,
             with_replacement=self.graph_sampler.replace,
         )
@@ -226,7 +225,6 @@ def __iter__(self):
         batch_df = create_batch_df(self.tensorized_indices_ds)
         bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id")
         bs.flush()
-        output_dir = output_dir + f"/rank={rank}/"
         self.cugraph_dgl_dataset.set_input_files(input_directory=output_dir)
         self.epoch_number = self.epoch_number + 1
         return super().__iter__()
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py
index db35d8cf379..0d3d5823097 100644
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py
+++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py
@@ -25,8 +25,13 @@ def create_cugraph_graph_from_edges_dict(
 ):
     if edge_dir == "in":
         edges_dict = {k: reverse_edges(df) for k, df in edges_dict.items()}
-    # TODO: Potentially skip this for memory efficiencies
-    edges_dict = {k: add_etype_id(df, etype_id_dict[k]) for k, df in edges_dict.items()}
+    if len(edges_dict) > 1:
+        has_multiple_etypes = True
+        edges_dict = {
+            k: add_etype_id(df, etype_id_dict[k]) for k, df in edges_dict.items()
+        }
+    else:
+        has_multiple_etypes = False
 
     edges_dfs = list(edges_dict.values())
     del edges_dict
@@ -36,24 +41,26 @@ def create_cugraph_graph_from_edges_dict(
         edges_df = cudf.concat(edges_dfs, ignore_index=True)
     del edges_dfs
 
-    edges_df["wgt"] = np.float32(0)
     G = cugraph.MultiGraph(directed=True)
     if isinstance(edges_df, dask_cudf.DataFrame):
-        G.from_dask_cudf_edgelist(
-            edges_df,
-            source="_SRC_",
-            destination="_DST_",
-            edge_attr=["wgt", "_EDGE_ID_", "etp"],
-            renumber=True,
-        )
+        g_creation_f = G.from_dask_cudf_edgelist
+    else:
+        g_creation_f = G.from_cudf_edgelist
+
+    if has_multiple_etypes:
+        edge_etp = "etp"
     else:
-        G.from_cudf_edgelist(
-            edges_df,
-            source="_SRC_",
-            destination="_DST_",
-            edge_attr=["wgt", "_EDGE_ID_", "etp"],
-            renumber=True,
-        )
+        edge_etp = None
+
+    g_creation_f(
+        edges_df,
+        source="_SRC_",
+        destination="_DST_",
+        weight=None,
+        edge_id="_EDGE_ID_",
+        edge_type=edge_etp,
+        renumber=True,
+    )
     return G
 
 
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py
index 10dcfaacf78..7d3a660c052 100644
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py
+++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py
@@ -13,9 +13,11 @@
 from .gatconv import GATConv
 from .relgraphconv import RelGraphConv
 from .sageconv import SAGEConv
+from .transformerconv import TransformerConv
 
 __all__ = [
     "GATConv",
     "RelGraphConv",
     "SAGEConv",
+    "TransformerConv",
 ]
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
index 2b57089189f..e70f2d0c6d1 100644
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
+++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
@@ -14,15 +14,17 @@
 primitives in cugraph-ops"""
 # pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments
 from __future__ import annotations
-from typing import Optional
+from typing import Optional, Tuple, Union
 
 from cugraph_dgl.nn.conv.base import BaseConv
 from cugraph.utilities.utils import import_optional
 
+from pylibcugraphops.pytorch import BipartiteCSC, SampledCSC, StaticCSC
+from pylibcugraphops.pytorch.operators import mha_gat_n2n, mha_gat_n2n_bipartite
+
 dgl = import_optional("dgl")
 torch = import_optional("torch")
 nn = import_optional("torch.nn")
-ops_torch = import_optional("pylibcugraphops.pytorch")
 
 
 class GATConv(BaseConv):
@@ -30,19 +32,20 @@ class GATConv(BaseConv):
     <https://arxiv.org/pdf/1710.10903.pdf>`__, with the sparse aggregation
     accelerated by cugraph-ops.
 
-    See :class:`dgl.nn.pytorch.conv.GATConv` for mathematical model.
-
-    This module depends on :code:`pylibcugraphops` package, which can be
-    installed via :code:`conda install -c nvidia pylibcugraphops>=23.02`.
-
     Parameters
     ----------
-    in_feats : int
-        Input feature size.
+    in_feats : int, pair of ints
+        Input feature size. A pair denotes feature sizes of source and
+        destination nodes.
     out_feats : int
         Output feature size.
     num_heads : int
         Number of heads in Multi-Head Attention.
+    concat : bool, optional
+        If False, the multi-head attentions are averaged instead of concatenated.
+        Default: ``True``.
+    edge_feats : int, optional
+        Edge feature size. Default: ``None``.
     negative_slope : float, optional
         LeakyReLU angle of negative slope. Defaults: ``0.2``.
     bias : bool, optional
@@ -84,9 +87,11 @@ class GATConv(BaseConv):
 
     def __init__(
         self,
-        in_feats: int,
+        in_feats: Union[int, Tuple[int, int]],
         out_feats: int,
         num_heads: int,
+        concat: bool = True,
+        edge_feats: Optional[int] = None,
         negative_slope: float = 0.2,
         bias: bool = True,
     ):
@@ -94,13 +99,27 @@ def __init__(
         self.in_feats = in_feats
         self.out_feats = out_feats
         self.num_heads = num_heads
+        self.concat = concat
+        self.edge_feats = edge_feats
         self.negative_slope = negative_slope
 
-        self.fc = nn.Linear(in_feats, out_feats * num_heads, bias=False)
-        self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats))
+        if isinstance(in_feats, int):
+            self.fc = nn.Linear(in_feats, num_heads * out_feats, bias=False)
+        else:
+            self.fc_src = nn.Linear(in_feats[0], num_heads * out_feats, bias=False)
+            self.fc_dst = nn.Linear(in_feats[1], num_heads * out_feats, bias=False)
+
+        if edge_feats is not None:
+            self.fc_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False)
+            self.attn_weights = nn.Parameter(torch.Tensor(3 * num_heads * out_feats))
+        else:
+            self.register_parameter("fc_edge", None)
+            self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats))
 
-        if bias:
-            self.bias = nn.Parameter(torch.Tensor(num_heads * out_feats))
+        if bias and concat:
+            self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats))
+        elif bias and not concat:
+            self.bias = nn.Parameter(torch.Tensor(out_feats))
         else:
             self.register_buffer("bias", None)
 
@@ -108,19 +127,26 @@ def __init__(
 
     def reset_parameters(self):
         r"""Reinitialize learnable parameters."""
-
         gain = nn.init.calculate_gain("relu")
-        nn.init.xavier_normal_(self.fc.weight, gain=gain)
+        if hasattr(self, "fc"):
+            nn.init.xavier_normal_(self.fc.weight, gain=gain)
+        else:
+            nn.init.xavier_normal_(self.fc_src.weight, gain=gain)
+            nn.init.xavier_normal_(self.fc_dst.weight, gain=gain)
+
         nn.init.xavier_normal_(
-            self.attn_weights.view(2, self.num_heads, self.out_feats), gain=gain
+            self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain
         )
+        if self.fc_edge is not None:
+            self.fc_edge.reset_parameters()
         if self.bias is not None:
             nn.init.zeros_(self.bias)
 
     def forward(
         self,
         g: dgl.DGLHeteroGraph,
-        feat: torch.Tensor,
+        nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        efeat: Optional[torch.Tensor] = None,
         max_in_degree: Optional[int] = None,
     ) -> torch.Tensor:
         r"""Forward computation.
@@ -129,8 +155,10 @@ def forward(
         ----------
         graph : DGLGraph
             The graph.
-        feat : torch.Tensor
+        nfeat : torch.Tensor
             Input features of shape :math:`(N, D_{in})`.
+        efeat: torch.Tensor, optional
+            Optional edge features.
         max_in_degree : int
             Maximum in-degree of destination nodes. It is only effective when
             :attr:`g` is a :class:`DGLBlock`, i.e., bipartite graph. When
@@ -145,32 +173,81 @@ def forward(
             :math:`H` is the number of heads, and :math:`D_{out}` is size of
             output feature.
         """
-        offsets, indices, _ = g.adj_sparse("csc")
-
-        if g.is_block:
-            if max_in_degree is None:
-                max_in_degree = g.in_degrees().max().item()
-
-            if max_in_degree < self.MAX_IN_DEGREE_MFG:
-                _graph = ops_torch.SampledCSC(
-                    offsets, indices, max_in_degree, g.num_src_nodes()
+        bipartite = not isinstance(nfeat, torch.Tensor)
+        offsets, indices, _ = g.adj_tensors("csc")
+
+        if efeat is not None:
+            if self.fc_edge is None:
+                raise RuntimeError(
+                    f"{self.__class__.__name__}.edge_feats must be set to "
+                    f"accept edge features."
                 )
-            else:
-                offsets_fg = self.pad_offsets(offsets, g.num_src_nodes() + 1)
-                _graph = ops_torch.StaticCSC(offsets_fg, indices)
+            efeat = self.fc_edge(efeat)
+
+        if bipartite:
+            if not hasattr(self, "fc_src"):
+                raise RuntimeError(
+                    f"{self.__class__.__name__}.in_feats must be a pair of "
+                    f"integers to allow bipartite node features, but got "
+                    f"{self.in_feats}."
+                )
+            _graph = BipartiteCSC(
+                offsets=offsets, indices=indices, num_src_nodes=g.num_src_nodes()
+            )
+            nfeat_src = self.fc_src(nfeat[0])
+            nfeat_dst = self.fc_dst(nfeat[1])
+
+            out = mha_gat_n2n_bipartite(
+                src_feat=nfeat_src,
+                dst_feat=nfeat_dst,
+                attn_weights=self.attn_weights,
+                graph=_graph,
+                num_heads=self.num_heads,
+                activation="LeakyReLU",
+                negative_slope=self.negative_slope,
+                concat_heads=self.concat,
+                edge_feat=efeat,
+            )
         else:
-            _graph = ops_torch.StaticCSC(offsets, indices)
-
-        feat_transformed = self.fc(feat)
-        out = ops_torch.operators.mha_gat_n2n(
-            feat_transformed,
-            self.attn_weights,
-            _graph,
-            self.num_heads,
-            "LeakyReLU",
-            self.negative_slope,
-            concat_heads=True,
-        ).view(-1, self.num_heads, self.out_feats)[: g.num_dst_nodes()]
+            if not hasattr(self, "fc"):
+                raise RuntimeError(
+                    f"{self.__class__.__name__}.in_feats is expected to be an "
+                    f"integer, but got {self.in_feats}."
+                )
+            nfeat = self.fc(nfeat)
+            # Sampled primitive does not support edge features
+            if g.is_block and efeat is None:
+                if max_in_degree is None:
+                    max_in_degree = g.in_degrees().max().item()
+
+                if max_in_degree < self.MAX_IN_DEGREE_MFG:
+                    _graph = SampledCSC(
+                        offsets=offsets,
+                        indices=indices,
+                        max_num_neighbors=max_in_degree,
+                        num_src_nodes=g.num_src_nodes(),
+                    )
+                else:
+                    offsets = self.pad_offsets(offsets, g.num_src_nodes() + 1)
+                    _graph = StaticCSC(offsets=offsets, indices=indices)
+            else:
+                if g.is_block:
+                    offsets = self.pad_offsets(offsets, g.num_src_nodes() + 1)
+                _graph = StaticCSC(offsets=offsets, indices=indices)
+
+            out = mha_gat_n2n(
+                feat=nfeat,
+                attn_weights=self.attn_weights,
+                graph=_graph,
+                num_heads=self.num_heads,
+                activation="LeakyReLU",
+                negative_slope=self.negative_slope,
+                concat_heads=self.concat,
+                edge_feat=efeat,
+            )[: g.num_dst_nodes()]
+
+        if self.concat:
+            out = out.view(-1, self.num_heads, self.out_feats)
 
         if self.bias is not None:
             out = out + self.bias
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py
index c93c58c3473..89e49011cf7 100644
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py
+++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py
@@ -179,7 +179,7 @@ def forward(
         torch.Tensor
             New node features. Shape: :math:`(|V|, D_{out})`.
         """
-        offsets, indices, edge_ids = g.adj_sparse("csc")
+        offsets, indices, edge_ids = g.adj_tensors("csc")
         edge_types_perm = etypes[edge_ids.long()].int()
 
         if g.is_block:
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py
index 1f9b651984b..403678e24a2 100644
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py
+++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py
@@ -122,7 +122,7 @@ def forward(
         torch.Tensor
             Output node features. Shape: :math:`(|V|, D_{out})`.
         """
-        offsets, indices, _ = g.adj_sparse("csc")
+        offsets, indices, _ = g.adj_tensors("csc")
 
         if g.is_block:
             if max_in_degree is None:
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py
new file mode 100644
index 00000000000..1898f5159b1
--- /dev/null
+++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py
@@ -0,0 +1,176 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+from cugraph_dgl.nn.conv.base import BaseConv
+from cugraph.utilities.utils import import_optional
+
+from pylibcugraphops.pytorch import BipartiteCSC, StaticCSC
+from pylibcugraphops.pytorch.operators import mha_simple_n2n
+
+dgl = import_optional("dgl")
+torch = import_optional("torch")
+nn = import_optional("torch.nn")
+
+
+class TransformerConv(BaseConv):
+    r"""The graph transformer layer from the `"Masked Label Prediction:
+    Unified Message Passing Model for Semi-Supervised Classification"
+    <https://arxiv.org/abs/2009.03509>`_ paper.
+
+    Parameters
+    ----------
+    in_node_feats : int or pair of ints
+        Input feature size. A pair denotes feature sizes of source and
+        destination nodes.
+    out_node_feats : int
+        Output feature size.
+    num_heads : int
+        Number of multi-head-attentions.
+    concat : bool, optional
+        If False, the multi-head attentions are averaged instead of concatenated.
+        Default: ``True``.
+    beta : bool, optional
+        If True, use a gated residual connection. Default: ``True``.
+    edge_feats: int, optional
+        Edge feature size. Default: ``None``.
+    bias: bool, optional
+        If True, learns a bias term. Default: ``True``.
+    root_weight: bool, optional
+        If False, will skip to learn a root weight matrix. Default: ``True``.
+    """
+
+    def __init__(
+        self,
+        in_node_feats: Union[int, Tuple[int, int]],
+        out_node_feats: int,
+        num_heads: int,
+        concat: bool = True,
+        beta: bool = False,
+        edge_feats: Optional[int] = None,
+        bias: bool = True,
+        root_weight: bool = True,
+    ):
+        super().__init__()
+
+        self.in_node_feats = in_node_feats
+        self.out_node_feats = out_node_feats
+        self.num_heads = num_heads
+        self.concat = concat
+        self.beta = beta
+        self.edge_feats = edge_feats
+        self.bias = bias
+        self.root_weight = root_weight
+
+        if isinstance(in_node_feats, int):
+            in_node_feats = (in_node_feats, in_node_feats)
+
+        self.lin_key = nn.Linear(in_node_feats[0], num_heads * out_node_feats)
+        self.lin_query = nn.Linear(in_node_feats[1], num_heads * out_node_feats)
+        self.lin_value = nn.Linear(in_node_feats[0], num_heads * out_node_feats)
+
+        if edge_feats is not None:
+            self.lin_edge = nn.Linear(
+                edge_feats, num_heads * out_node_feats, bias=False
+            )
+        else:
+            self.lin_edge = self.register_parameter("lin_edge", None)
+
+        if concat:
+            self.lin_skip = nn.Linear(
+                in_node_feats[1], num_heads * out_node_feats, bias=bias
+            )
+            if self.beta:
+                self.lin_beta = nn.Linear(3 * num_heads * out_node_feats, 1, bias=bias)
+            else:
+                self.lin_beta = self.register_parameter("lin_beta", None)
+        else:
+            self.lin_skip = nn.Linear(in_node_feats[1], out_node_feats, bias=bias)
+            if self.beta:
+                self.lin_beta = nn.Linear(3 * out_node_feats, 1, bias=False)
+            else:
+                self.lin_beta = self.register_parameter("lin_beta", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        self.lin_key.reset_parameters()
+        self.lin_query.reset_parameters()
+        self.lin_value.reset_parameters()
+        if self.lin_edge is not None:
+            self.lin_edge.reset_parameters()
+        if self.lin_skip is not None:
+            self.lin_skip.reset_parameters()
+        if self.lin_beta is not None:
+            self.lin_beta.reset_parameters()
+
+    def forward(
+        self,
+        g: dgl.DGLHeteroGraph,
+        nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        efeat: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward computation.
+
+        Parameters
+        ----------
+        g: DGLGraph
+            The graph.
+        nfeat: torch.Tensor or a pair of torch.Tensor
+            Node feature tensor. A pair denotes features for source and
+            destination nodes, respectively.
+        efeat: torch.Tensor, optional
+            Edge feature tensor. Default: ``None``.
+        """
+        bipartite = not isinstance(nfeat, torch.Tensor)
+        offsets, indices, _ = g.adj_tensors("csc")
+
+        if bipartite:
+            src_feats, dst_feats = nfeat
+            _graph = BipartiteCSC(
+                offsets=offsets, indices=indices, num_src_nodes=g.num_src_nodes()
+            )
+        else:
+            src_feats = dst_feats = nfeat
+            if g.is_block:
+                offsets = self.pad_offsets(offsets, g.num_src_nodes() + 1)
+            _graph = StaticCSC(offsets=offsets, indices=indices)
+
+        query = self.lin_query(dst_feats)
+        key = self.lin_key(src_feats)
+        value = self.lin_value(src_feats)
+        if self.lin_edge is not None:
+            efeat = self.lin_edge(efeat)
+
+        out = mha_simple_n2n(
+            key_emb=key,
+            query_emb=query,
+            value_emb=value,
+            graph=_graph,
+            num_heads=self.num_heads,
+            concat_heads=self.concat,
+            edge_emb=efeat,
+            norm_by_dim=True,
+            score_bias=None,
+        )[: g.num_dst_nodes()]
+
+        if self.root_weight:
+            res = self.lin_skip(dst_feats[: g.num_dst_nodes()])
+            if self.lin_beta is not None:
+                beta = self.lin_beta(torch.cat([out, res, out - res], dim=-1))
+                beta = beta.sigmoid()
+                out = beta * res + (1 - beta) * out
+            else:
+                out = out + res
+
+        return out
diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
new file mode 100644
index 00000000000..a6f771e4b51
--- /dev/null
+++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A graphsage GNN model using dgl for node classification
+# with three layers and mean aggregation
+import time
+import dgl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchmetrics.functional as MF
+from cugraph_dgl.nn import SAGEConv
+import tqdm
+
+
+class Sage(nn.Module):
+    def __init__(self, in_size, hid_size, out_size):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        # 2-layer GraphSAGE-mean
+        self.layers.append(SAGEConv(in_size, hid_size, "mean"))
+        self.layers.append(SAGEConv(hid_size, out_size, "mean"))
+        self.dropout = nn.Dropout(0.5)
+        self.hid_size = hid_size
+        self.out_size = out_size
+
+    def forward(self, blocks, x):
+        h = x
+        for l_id, (layer, block) in enumerate(zip(self.layers, blocks)):
+            h = layer(block, h)
+            if l_id != len(self.layers) - 1:
+                h = F.relu(h)
+                h = self.dropout(h)
+        return h
+
+    def inference(self, g, batch_size, device):
+        """
+        Inference with the GraphSAGE model on
+        full neighbors (i.e. without neighbor sampling).
+        g : the entire graph.
+        batch_size : the node number of each inference output
+        device : the inference device
+        """
+        # During inference with sampling,
+        # multi-layer blocks are very inefficient because
+        # lots of computations in the first few layers are repeated.
+        # Therefore, we compute the representation of all nodes layer by layer.
+        # The nodes on each layer are of course splitted in batches.
+
+        all_node_ids = torch.arange(0, g.num_nodes()).to(device)
+        feat = g.get_node_storage(key="feat", ntype="_N").fetch(
+            all_node_ids, device=device
+        )
+        sampler = dgl.dataloading.MultiLayerFullNeighborSampler(
+            1, prefetch_node_feats=["feat"]
+        )
+        dataloader = dgl.dataloading.DataLoader(
+            g,
+            torch.arange(g.num_nodes(), dtype=torch.int32).to(g.device),
+            sampler,
+            device=device,
+            batch_size=batch_size,
+            shuffle=False,
+            drop_last=False,
+            num_workers=0,
+        )
+        buffer_device = torch.device("cpu")
+        pin_memory = buffer_device != device
+
+        for l_id, layer in enumerate(self.layers):
+            y = torch.empty(
+                g.num_nodes(),
+                self.hid_size if l_id != len(self.layers) - 1 else self.out_size,
+                device=buffer_device,
+                pin_memory=pin_memory,
+            )
+            feat = feat.to(device)
+            for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
+                x = feat[input_nodes]
+                h = layer(blocks[0], x)  # len(blocks) = 1
+                if l_id != len(self.layers) - 1:
+                    h = F.relu(h)
+                    h = self.dropout(h)
+                # by design, our output nodes are contiguous
+                y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device)
+            feat = y
+        return y
+
+
+def layerwise_infer(graph, nid, model, batch_size, device):
+    model.eval()
+    with torch.no_grad():
+        pred = model.module.inference(
+            graph, batch_size, device
+        )  # pred in buffer_device
+        pred = pred[nid]
+        label = graph.ndata["label"]
+        if isinstance(label, dict):
+            label = label["_N"]
+        label = label[nid].to(pred.device)
+        num_classes = pred.shape[1]
+        label = label.squeeze(1)
+        return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes)
+
+
+def train_model(model, g, opt, train_dataloader, num_epochs, rank, val_nid):
+    g.ndata["feat"]["_N"] = g.ndata["feat"]["_N"].to("cuda")
+    g.ndata["label"]["_N"] = g.ndata["label"]["_N"].to("cuda")
+    st = time.time()
+    model.train()
+    for epoch in range(num_epochs):
+        total_loss = 0
+        for _, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
+            x = g.ndata["feat"]["_N"][input_nodes]
+            y = g.ndata["label"]["_N"][output_nodes]
+            y_hat = model(blocks, x)
+            y = y.squeeze(1)
+            loss = F.cross_entropy(y_hat, y)
+            opt.zero_grad()
+            loss.backward()
+            opt.step()
+            total_loss += loss.item()
+        print(
+            f"total loss: {total_loss} for epoch = {epoch} for rank = {rank}",
+            flush=True,
+        )
+    et = time.time()
+    print(
+        f"Total time taken for num_epochs {num_epochs} "
+        f"with batch_size {train_dataloader._batch_size} = {et-st} s on rank ={rank}"
+    )
+    if rank == 0:
+        val_acc = layerwise_infer(g, val_nid, model, 1024 * 5, "cuda")
+        print("---" * 30)
+        print("Validation Accuracy {:.4f}".format(val_acc))
diff --git a/python/cugraph-dgl/examples/muti_trainer_MG_example/workflow.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py
similarity index 80%
rename from python/cugraph-dgl/examples/muti_trainer_MG_example/workflow.py
rename to python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py
index 00c67f92409..474f17dc2bb 100644
--- a/python/cugraph-dgl/examples/muti_trainer_MG_example/workflow.py
+++ b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow.py
@@ -16,6 +16,7 @@
 import time
 from distributed import Client, Event as Dask_Event
 import tempfile
+from cugraph.dask.comms import comms as Comms
 
 
 def enable_spilling():
@@ -39,7 +40,7 @@ def setup_cluster(dask_worker_devices):
     client.run(enable_spilling)
     print("Dask Cluster Setup Complete")
     del client
-    return cluster.scheduler_address
+    return cluster
 
 
 def create_dask_client(scheduler_address):
@@ -53,6 +54,8 @@ def create_dask_client(scheduler_address):
 def initalize_pytorch_worker(dev_id):
     import cupy as cp
     import rmm
+    from rmm.allocators.torch import rmm_torch_allocator
+    from rmm.allocators.cupy import rmm_cupy_allocator
 
     dev = cp.cuda.Device(
         dev_id
@@ -66,10 +69,10 @@ def initalize_pytorch_worker(dev_id):
     )
 
     if dev_id == 0:
-        torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
+        torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
 
     torch.cuda.set_device(dev_id)
-    cp.cuda.set_allocator(rmm.rmm_cupy_allocator)
+    cp.cuda.set_allocator(rmm_cupy_allocator)
     enable_spilling()
     print("device_id", dev_id, flush=True)
 
@@ -98,7 +101,7 @@ def load_dgl_dataset(dataset_name="ogbn-products"):
     train_idx = train_idx.int()
     valid_idx = valid_idx.int()
     test_idx = test_idx.int()
-    return g, train_idx, valid_idx, test_idx
+    return g, train_idx, valid_idx, test_idx, dataset.num_classes
 
 
 def create_cugraph_graphstore_from_dgl_dataset(
@@ -106,9 +109,9 @@ def create_cugraph_graphstore_from_dgl_dataset(
 ):
     from cugraph_dgl import cugraph_storage_from_heterograph
 
-    dgl_g, train_idx, valid_idx, test_idx = load_dgl_dataset(dataset_name)
+    dgl_g, train_idx, valid_idx, test_idx, num_classes = load_dgl_dataset(dataset_name)
     cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu)
-    return cugraph_gs, train_idx, valid_idx, test_idx
+    return cugraph_gs, train_idx, valid_idx, test_idx, num_classes
 
 
 def create_dataloader(gs, train_idx, device):
@@ -133,7 +136,9 @@ def create_dataloader(gs, train_idx, device):
 
 
 def run_workflow(rank, devices, scheduler_address):
-    # Below sets gpu_num
+    from model import Sage, train_model
+
+    # Below sets gpu_number
     dev_id = devices[rank]
     initalize_pytorch_worker(dev_id)
     device = torch.device(f"cuda:{dev_id}")
@@ -162,6 +167,7 @@ def run_workflow(rank, devices, scheduler_address):
             train_idx,
             valid_idx,
             test_idx,
+            num_classes,
         ) = create_cugraph_graphstore_from_dgl_dataset(
             "ogbn-products", single_gpu=False
         )
@@ -169,6 +175,7 @@ def run_workflow(rank, devices, scheduler_address):
         client.publish_dataset(train_idx=train_idx)
         client.publish_dataset(valid_idx=valid_idx)
         client.publish_dataset(test_idx=test_idx)
+        client.publish_dataset(num_classes=num_classes)
         event.set()
     else:
         if event.wait(timeout=1000):
@@ -176,6 +183,7 @@ def run_workflow(rank, devices, scheduler_address):
             train_idx = client.get_dataset("train_idx")
             valid_idx = client.get_dataset("valid_idx")
             test_idx = client.get_dataset("test_idx")
+            num_classes = client.get_dataset("num_classes")
         else:
             raise RuntimeError(f"Fetch cugraph_gs to worker_id {rank} failed")
 
@@ -183,26 +191,20 @@ def run_workflow(rank, devices, scheduler_address):
     print(f"Loading cugraph_store to worker {rank} is complete", flush=True)
     dataloader = create_dataloader(gs, train_idx, device)
     print("Data Loading Complete", flush=True)
-    del gs  # Clean up gs reference
-    # Comment below
-    st = time.time()
-    for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
-        pass
-    et = time.time()
-    print(f"Warmup loading took = {et-st} s on worker = {rank}")
+    num_feats = gs.ndata["feat"]["_N"].shape[1]
+    hid_size = 256
+    # Load Training example
+    model = Sage(num_feats, hid_size, num_classes).to(device)
+    model = torch.nn.parallel.DistributedDataParallel(
+        model,
+        device_ids=[device],
+        output_device=device,
+    )
     torch.distributed.barrier()
-
-    n_epochs = 30
+    n_epochs = 10
     total_st = time.time()
-    for i in range(0, n_epochs):
-        st = time.time()
-        for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
-            pass
-            # print(len(input_nodes))
-            # print(len(seeds))
-            # train_model()
-        et = time.time()
-        print(f"Data Loading took = {et-st} s for epoch = {i} on worker = {rank}")
+    opt = torch.optim.Adam(model.parameters(), lr=0.01)
+    train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx)
     torch.distributed.barrier()
     total_et = time.time()
     print(
@@ -217,17 +219,26 @@ def run_workflow(rank, devices, scheduler_address):
         client.unpublish_dataset("valid_idx")
         client.unpublish_dataset("test_idx")
         event.clear()
+    print("Workflow completed")
+    print("---" * 10)
+    Comms.destroy()
 
 
 if __name__ == "__main__":
+    # Load dummy first
+    # because new environments
+    # require dataset download
+    load_dgl_dataset()
     dask_worker_devices = [5, 6]
-    scheduler_address = setup_cluster(dask_worker_devices)
+    cluster = setup_cluster(dask_worker_devices)
 
     trainer_devices = [0, 1, 2]
     import torch.multiprocessing as mp
 
     mp.spawn(
         run_workflow,
-        args=(trainer_devices, scheduler_address),
+        args=(trainer_devices, cluster.scheduler_address),
         nprocs=len(trainer_devices),
     )
+    Comms.destroy()
+    cluster.close()
diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml
index e5e56e58321..04d41c60591 100644
--- a/python/cugraph-dgl/pyproject.toml
+++ b/python/cugraph-dgl/pyproject.toml
@@ -10,17 +10,17 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cugraph-dgl"
-version = "23.04.01"
+version = "23.06.00"
 description = "cugraph extensions for DGL"
 readme = { file = "README.md", content-type = "text/markdown" }
 authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 dependencies = [
-    "cugraph==23.4.*",
-    "numba>=0.56.2",
+    "cugraph==23.6.*",
+    "numba>=0.57",
     "numpy>=1.21",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py
index c1f4841a905..dc6b7db9b45 100644
--- a/python/cugraph-dgl/tests/conftest.py
+++ b/python/cugraph-dgl/tests/conftest.py
@@ -11,30 +11,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import pytest
 
-from dask.distributed import Client
-from cugraph.dask.comms import comms as Comms
-from cugraph.testing.mg_utils import stop_dask_client, start_dask_client
+from cugraph.testing.mg_utils import (
+    start_dask_client,
+    stop_dask_client,
+)
 
 
 @pytest.fixture(scope="module")
 def dask_client():
-    dask_scheduler_file = os.environ.get("SCHEDULER_FILE")
-
-    if dask_scheduler_file is not None:
-        dask_client = Client(scheduler_file=dask_scheduler_file)
-        dask_cluster = None
-    else:
-        dask_client, dask_cluster = start_dask_client(
-            dask_worker_devices="0", protocol="tcp"
-        )
-
-    if not Comms.is_initialized():
-        Comms.initialize(p2p=True)
+    # start_dask_client will check for the SCHEDULER_FILE and
+    # DASK_WORKER_DEVICES env vars and use them when creating a client if
+    # set. start_dask_client will also initialize the Comms singleton.
+    dask_client, dask_cluster = start_dask_client(
+        dask_worker_devices="0", protocol="tcp"
+    )
 
     yield dask_client
 
     stop_dask_client(dask_client, dask_cluster)
-    print("\ndask_client fixture: client.close() called")
diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py
index 332ba2f3657..7ed65645a28 100644
--- a/python/cugraph-dgl/tests/nn/test_gatconv.py
+++ b/python/cugraph-dgl/tests/nn/test_gatconv.py
@@ -11,7 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=too-many-arguments, too-many-locals
-from itertools import product
+
 import pytest
 
 try:
@@ -25,42 +25,53 @@
 torch = import_optional("torch")
 dgl = import_optional("dgl")
 
-options = {
-    "idtype_int": [False, True],
-    "max_in_degree": [None, 8],
-    "num_heads": [1, 2, 3, 7],
-    "to_block": [False, True],
-}
-
 
-@pytest.mark.parametrize(",".join(options.keys()), product(*options.values()))
-def test_gatconv_equality(idtype_int, max_in_degree, num_heads, to_block):
+@pytest.mark.parametrize("bipartite", [False, True])
+@pytest.mark.parametrize("idtype_int", [False, True])
+@pytest.mark.parametrize("max_in_degree", [None, 8])
+@pytest.mark.parametrize("num_heads", [1, 2, 7])
+@pytest.mark.parametrize("to_block", [False, True])
+def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_block):
     GATConv = dgl.nn.GATConv
     CuGraphGATConv = cugraph_dgl.nn.GATConv
     device = "cuda"
-
-    in_feat, out_feat = 10, 2
-    args = (in_feat, out_feat, num_heads)
-    kwargs = {"bias": False}
     g = create_graph1().to(device)
+
     if idtype_int:
         g = g.int()
+
     if to_block:
         g = dgl.to_block(g)
-    feat = torch.rand(g.num_src_nodes(), in_feat).to(device)
 
-    torch.manual_seed(0)
+    if bipartite:
+        in_feats = (10, 3)
+        nfeat = (
+            torch.rand(g.num_src_nodes(), in_feats[0], device=device),
+            torch.rand(g.num_dst_nodes(), in_feats[1], device=device),
+        )
+    else:
+        in_feats = 10
+        nfeat = torch.rand(g.num_src_nodes(), in_feats, device=device)
+    out_feats = 2
+
+    args = (in_feats, out_feats, num_heads)
+    kwargs = {"bias": False}
+
     conv1 = GATConv(*args, **kwargs, allow_zero_in_degree=True).to(device)
-    out1 = conv1(g, feat)
+    out1 = conv1(g, nfeat)
 
-    torch.manual_seed(0)
     conv2 = CuGraphGATConv(*args, **kwargs).to(device)
-    dim = num_heads * out_feat
+    dim = num_heads * out_feats
     with torch.no_grad():
         conv2.attn_weights.data[:dim] = conv1.attn_l.data.flatten()
         conv2.attn_weights.data[dim:] = conv1.attn_r.data.flatten()
-        conv2.fc.weight.data[:] = conv1.fc.weight.data
-    out2 = conv2(g, feat, max_in_degree=max_in_degree)
+        if bipartite:
+            conv2.fc_src.weight.data = conv1.fc_src.weight.data.detach().clone()
+            conv2.fc_dst.weight.data = conv1.fc_dst.weight.data.detach().clone()
+        else:
+            conv2.fc.weight.data = conv1.fc.weight.data.detach().clone()
+    out2 = conv2(g, nfeat, max_in_degree=max_in_degree)
+
     assert torch.allclose(out1, out2, atol=1e-6)
 
     grad_out1 = torch.rand_like(out1)
@@ -68,9 +79,63 @@ def test_gatconv_equality(idtype_int, max_in_degree, num_heads, to_block):
     out1.backward(grad_out1)
     out2.backward(grad_out2)
 
-    assert torch.allclose(conv1.fc.weight.grad, conv2.fc.weight.grad, atol=1e-6)
+    if bipartite:
+        assert torch.allclose(
+            conv1.fc_src.weight.grad, conv2.fc_src.weight.grad, atol=1e-6
+        )
+        assert torch.allclose(
+            conv1.fc_dst.weight.grad, conv2.fc_dst.weight.grad, atol=1e-6
+        )
+    else:
+        assert torch.allclose(conv1.fc.weight.grad, conv2.fc.weight.grad, atol=1e-6)
+
     assert torch.allclose(
         torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0),
-        conv2.attn_weights.grad.view(2, num_heads, out_feat),
+        conv2.attn_weights.grad.view(2, num_heads, out_feats),
         atol=1e-6,
     )
+
+
+@pytest.mark.parametrize("bias", [False, True])
+@pytest.mark.parametrize("bipartite", [False, True])
+@pytest.mark.parametrize("concat", [False, True])
+@pytest.mark.parametrize("max_in_degree", [None, 8, 800])
+@pytest.mark.parametrize("num_heads", [1, 2, 7])
+@pytest.mark.parametrize("to_block", [False, True])
+@pytest.mark.parametrize("use_edge_feats", [False, True])
+def test_gatconv_edge_feats(
+    bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats
+):
+    from cugraph_dgl.nn import GATConv
+
+    device = "cuda"
+    g = create_graph1().to(device)
+
+    if to_block:
+        g = dgl.to_block(g)
+
+    if bipartite:
+        in_feats = (10, 3)
+        nfeat = (
+            torch.rand(g.num_src_nodes(), in_feats[0], device=device),
+            torch.rand(g.num_dst_nodes(), in_feats[1], device=device),
+        )
+    else:
+        in_feats = 10
+        nfeat = torch.rand(g.num_src_nodes(), in_feats, device=device)
+    out_feats = 2
+
+    if use_edge_feats:
+        edge_feats = 3
+        efeat = torch.rand(g.num_edges(), edge_feats, device=device)
+    else:
+        edge_feats = None
+        efeat = None
+
+    conv = GATConv(
+        in_feats, out_feats, num_heads, concat=concat, edge_feats=edge_feats, bias=bias
+    ).to(device)
+    out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree)
+
+    grad_out = torch.rand_like(out)
+    out.backward(grad_out)
diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py
index abde1e642d6..d2ae6a23978 100644
--- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py
+++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py
@@ -11,7 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=too-many-arguments, too-many-locals
-from itertools import product
+
 import pytest
 
 try:
@@ -25,17 +25,13 @@
 torch = import_optional("torch")
 dgl = import_optional("dgl")
 
-options = {
-    "idtype_int": [False, True],
-    "max_in_degree": [None, 8],
-    "num_bases": [1, 2, 5],
-    "regularizer": [None, "basis"],
-    "self_loop": [False, True],
-    "to_block": [False, True],
-}
-
 
-@pytest.mark.parametrize(",".join(options.keys()), product(*options.values()))
+@pytest.mark.parametrize("idtype_int", [False, True])
+@pytest.mark.parametrize("max_in_degree", [None, 8])
+@pytest.mark.parametrize("num_bases", [1, 2, 5])
+@pytest.mark.parametrize("regularizer", [None, "basis"])
+@pytest.mark.parametrize("self_loop", [False, True])
+@pytest.mark.parametrize("to_block", [False, True])
 def test_relgraphconv_equality(
     idtype_int, max_in_degree, num_bases, regularizer, self_loop, to_block
 ):
diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py
index 6baa5fb5287..38cb020b8bb 100644
--- a/python/cugraph-dgl/tests/nn/test_sageconv.py
+++ b/python/cugraph-dgl/tests/nn/test_sageconv.py
@@ -11,7 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=too-many-arguments, too-many-locals
-from itertools import product
+
 import pytest
 
 try:
@@ -25,15 +25,11 @@
 torch = import_optional("torch")
 dgl = import_optional("dgl")
 
-options = {
-    "bias": [False, True],
-    "idtype_int": [False, True],
-    "max_in_degree": [None, 8],
-    "to_block": [False, True],
-}
-
 
-@pytest.mark.parametrize(",".join(options.keys()), product(*options.values()))
+@pytest.mark.parametrize("bias", [False, True])
+@pytest.mark.parametrize("idtype_int", [False, True])
+@pytest.mark.parametrize("max_in_degree", [None, 8])
+@pytest.mark.parametrize("to_block", [False, True])
 def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block):
     SAGEConv = dgl.nn.SAGEConv
     CuGraphSAGEConv = cugraph_dgl.nn.SAGEConv
diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py
new file mode 100644
index 00000000000..64af795231c
--- /dev/null
+++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+try:
+    from cugraph_dgl.nn import TransformerConv
+except ModuleNotFoundError:
+    pytest.skip("cugraph_dgl not available", allow_module_level=True)
+
+from cugraph.utilities.utils import import_optional
+from .common import create_graph1
+
+torch = import_optional("torch")
+dgl = import_optional("dgl")
+
+
+@pytest.mark.parametrize("beta", [False, True])
+@pytest.mark.parametrize("bipartite", [False, True])
+@pytest.mark.parametrize("concat", [False, True])
+@pytest.mark.parametrize("idtype_int", [False, True])
+@pytest.mark.parametrize("num_heads", [1, 2, 3, 4])
+@pytest.mark.parametrize("to_block", [False, True])
+@pytest.mark.parametrize("use_edge_feats", [False, True])
+def test_TransformerConv(
+    beta, bipartite, concat, idtype_int, num_heads, to_block, use_edge_feats
+):
+    device = "cuda"
+    g = create_graph1().to(device)
+
+    if idtype_int:
+        g = g.int()
+
+    if to_block:
+        g = dgl.to_block(g)
+
+    if bipartite:
+        in_node_feats = (5, 3)
+        nfeat = (
+            torch.rand(g.num_src_nodes(), in_node_feats[0], device=device),
+            torch.rand(g.num_dst_nodes(), in_node_feats[1], device=device),
+        )
+    else:
+        in_node_feats = 3
+        nfeat = torch.rand(g.num_src_nodes(), in_node_feats, device=device)
+    out_node_feats = 2
+
+    if use_edge_feats:
+        edge_feats = 3
+        efeat = torch.rand(g.num_edges(), edge_feats, device=device)
+    else:
+        edge_feats = None
+        efeat = None
+
+    conv = TransformerConv(
+        in_node_feats,
+        out_node_feats,
+        num_heads=num_heads,
+        concat=concat,
+        beta=beta,
+        edge_feats=edge_feats,
+    ).to(device)
+
+    out = conv(g, nfeat, efeat)
+    grad_out = torch.rand_like(out)
+    out.backward(grad_out)
diff --git a/python/cugraph-pyg/cugraph_pyg/__init__.py b/python/cugraph-pyg/cugraph_pyg/__init__.py
index 12fc8de9fef..f5e317bfafd 100644
--- a/python/cugraph-pyg/cugraph_pyg/__init__.py
+++ b/python/cugraph-pyg/cugraph_pyg/__init__.py
@@ -11,4 +11,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "23.04.01"
+__version__ = "23.06.00"
diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
index 300c56fe6a7..e3eb4a85a85 100644
--- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
+++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
@@ -33,12 +33,23 @@
 dask_cudf = import_optional("dask_cudf")
 
 torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
 
 Tensor = None if isinstance(torch, MissingModule) else torch.Tensor
 NdArray = None if isinstance(cupy, MissingModule) else cupy.ndarray
 DaskCudfSeries = None if isinstance(dask_cudf, MissingModule) else dask_cudf.Series
 
 TensorType = Union[Tensor, NdArray, cudf.Series, DaskCudfSeries]
+NodeType = (
+    None
+    if isinstance(torch_geometric, MissingModule)
+    else torch_geometric.typing.NodeType
+)
+EdgeType = (
+    None
+    if isinstance(torch_geometric, MissingModule)
+    else torch_geometric.typing.EdgeType
+)
 
 
 class EdgeLayout(Enum):
@@ -216,20 +227,25 @@ def __init__(
         G: dict[str, tuple[TensorType]] or dict[str, int] (Required)
             Dictionary of edge indices.
             Option 1 (graph in memory):
-                Pass the edge indices
-                i.e. {
-                    ('author', 'writes', 'paper'): [[0,1,2],[2,0,1]],
-                    ('author', 'affiliated', 'institution'): [[0,1],[0,1]]
+
+                Pass the edge indices: i.e.
+                {
+                ('author', 'writes', 'paper'): [[0,1,2],[2,0,1]],
+                ('author', 'affiliated', 'institution'): [[0,1],[0,1]]
                 }
+
+
             Option 2 (graph not in memory):
-                Pass the number of edges
-                i.e. {
-                    ('author', 'writes', 'paper'): 2,
-                    ('author', 'affiliated', 'institution'): 2
+
+                Pass the number of edges: i.e.
+                {
+                ('author', 'writes', 'paper'): 2,
+                ('author', 'affiliated', 'institution'): 2
                 }
                 If the graph is not in memory, manipulating the edge indices
                 or calling sampling is not possible.  This is for cases where
                 sampling has already been done and samples were written to disk.
+
             Note: the internal cugraph representation will use
             offsetted vertex and edge ids.
 
@@ -266,7 +282,7 @@ def __init__(
 
         self.__infer_offsets(num_nodes_dict, num_edges_dict)
         self.__infer_existing_tensors(F)
-        self.__infer_edge_types(num_edges_dict)
+        self.__infer_edge_types(num_nodes_dict, num_edges_dict)
 
         self._edge_attr_cls = CuGraphEdgeAttr
 
@@ -415,8 +431,6 @@ def __construct_graph(
             {
                 "src": pandas.Series(na_src),
                 "dst": pandas.Series(na_dst),
-                "w": pandas.Series(np.zeros(len(na_src))),
-                "eid": pandas.Series(np.arange(len(na_src))),
                 "etp": pandas.Series(na_etp),
             }
         )
@@ -436,7 +450,7 @@ def __construct_graph(
                 df,
                 source="src",
                 destination="dst",
-                edge_attr=["w", "eid", "etp"],
+                edge_type="etp",
             )
             distributed.get_client().publish_dataset(cugraph_graph=graph)
         else:
@@ -444,7 +458,7 @@ def __construct_graph(
                 df,
                 source="src",
                 destination="dst",
-                edge_attr=["w", "eid", "etp"],
+                edge_type="etp",
             )
 
         return graph
@@ -453,12 +467,29 @@ def __construct_graph(
     def _edge_types_to_attrs(self) -> dict:
         return dict(self.__edge_types_to_attrs)
 
+    @property
+    def node_types(self) -> List[NodeType]:
+        return list(self.__vertex_type_offsets["type"])
+
+    @property
+    def edge_types(self) -> List[EdgeType]:
+        return list(self.__edge_types_to_attrs.keys())
+
+    def canonical_edge_type_to_numeric(self, etype: EdgeType) -> int:
+        return np.searchsorted(self.__edge_type_offsets["type"], "__".join(etype))
+
+    def numeric_edge_type_to_canonical(self, etype: int) -> EdgeType:
+        return tuple(self.__edge_type_offsets["type"][etype].split("__"))
+
     @cached_property
     def _is_delayed(self):
         if self.__graph is None:
             return False
         return self.__graph.is_multi_gpu()
 
+    def _numeric_vertex_type_from_name(self, vertex_type_name: str) -> int:
+        return np.searchsorted(self.__vertex_type_offsets["type"], vertex_type_name)
+
     def get_vertex_index(self, vtypes) -> TensorType:
         if isinstance(vtypes, str):
             vtypes = [vtypes]
@@ -556,12 +587,12 @@ def _get_edge_index(self, attr: CuGraphEdgeAttr) -> Tuple[TensorType, TensorType
             src_type, _, dst_type = attr.edge_type
             src_offset = int(
                 self.__vertex_type_offsets["start"][
-                    np.searchsorted(self.__vertex_type_offsets["type"], src_type)
+                    self._numeric_vertex_type_from_name(src_type)
                 ]
             )
             dst_offset = int(
                 self.__vertex_type_offsets["start"][
-                    np.searchsorted(self.__vertex_type_offsets["type"], dst_type)
+                    self._numeric_vertex_type_from_name(dst_type)
                 ]
             )
             coli = np.searchsorted(
@@ -651,23 +682,21 @@ def _get_vertex_groups_from_sample(
         self, nodes_of_interest: TensorType, is_sorted: bool = False
     ) -> dict:
         """
-        Given a cudf (NOT dask_cudf) Series of nodes of interest, this
+        Given a tensor of nodes of interest, this
         method a single dictionary, noi_index.
 
         noi_index is the original vertex ids grouped by vertex type.
 
-        Example Input: [5, 2, 10, 11, 8]
-        Output: {'red_vertex': [5, 8], 'blue_vertex': [2], 'green_vertex': [10, 11]}
+        Example Input: [5, 2, 1, 10, 11, 8]
+        Output: {'red_vertex': [5, 1, 8], 'blue_vertex': [2], 'green_vertex': [10, 11]}
 
         """
-        if not is_sorted:
-            nodes_of_interest, _ = torch.sort(nodes_of_interest)
 
         noi_index = {}
 
         vtypes = cudf.Series(self.__vertex_type_offsets["type"])
         if len(vtypes) == 1:
-            noi_index[vtypes[0]] = nodes_of_interest
+            noi_index[vtypes.iloc[0]] = nodes_of_interest
         else:
             noi_type_indices = torch.searchsorted(
                 torch.as_tensor(self.__vertex_type_offsets["stop"], device="cuda"),
@@ -690,6 +719,29 @@ def _get_vertex_groups_from_sample(
 
         return noi_index
 
+    def _get_sample_from_vertex_groups(
+        self, vertex_groups: Dict[str, TensorType]
+    ) -> TensorType:
+        """
+        Inverse of _get_vertex_groups_from_sample() (although with de-offsetted ids).
+        Given a dictionary of node types and de-offsetted node ids, return
+        the global (non-renumbered) vertex ids.
+
+        Example Input: {'horse': [1, 3, 5], 'duck': [1, 2]}
+        Output: [1, 3, 5, 14, 15]
+        """
+        t = torch.tensor([], dtype=torch.int64, device="cuda")
+
+        for group_name, ix in vertex_groups.items():
+            type_id = self._numeric_vertex_type_from_name(group_name)
+            if not ix.is_cuda:
+                ix = ix.cuda()
+            offset = self.__vertex_type_offsets["start"][type_id]
+            u = ix + offset
+            t = torch.concatenate([t, u])
+
+        return t
+
     def _get_renumbered_edge_groups_from_sample(
         self, sampling_results: cudf.DataFrame, noi_index: dict
     ) -> Tuple[dict, dict]:
@@ -736,15 +788,26 @@ def _get_renumbered_edge_groups_from_sample(
             t_pyg_type = list(self.__edge_types_to_attrs.values())[0].edge_type
             src_type, _, dst_type = t_pyg_type
 
-            sources = torch.as_tensor(sampling_results.sources, device="cuda")
+            dst_id_table = noi_index[dst_type]
+            dst_id_map = (
+                cudf.Series(cupy.asarray(dst_id_table), name="dst")
+                .reset_index()
+                .rename(columns={"index": "new_id"})
+                .set_index("dst")
+            )
+            dst = dst_id_map["new_id"].loc[sampling_results.destinations]
+            col_dict[t_pyg_type] = torch.as_tensor(dst.values, device="cuda")
+
             src_id_table = noi_index[src_type]
-            src = torch.searchsorted(src_id_table, sources)
-            row_dict[t_pyg_type] = src
+            src_id_map = (
+                cudf.Series(cupy.asarray(src_id_table), name="src")
+                .reset_index()
+                .rename(columns={"index": "new_id"})
+                .set_index("src")
+            )
+            src = src_id_map["new_id"].loc[sampling_results.sources]
+            row_dict[t_pyg_type] = torch.as_tensor(src.values, device="cuda")
 
-            destinations = torch.as_tensor(sampling_results.destinations, device="cuda")
-            dst_id_table = noi_index[dst_type]
-            dst = torch.searchsorted(dst_id_table, destinations)
-            col_dict[t_pyg_type] = dst
         else:
             # This will retrieve the single string representation.
             # It needs to be converted to a tuple in the for loop below.
@@ -762,7 +825,7 @@ def _get_renumbered_edge_groups_from_sample(
 
                 # Get the de-offsetted sources
                 sources = torch.as_tensor(
-                    sampling_results.sources.iloc[ix], device="cuda"
+                    sampling_results.sources.iloc[ix].values, device="cuda"
                 )
                 sources_ix = torch.searchsorted(
                     self.__vertex_type_offsets["stop"], sources
@@ -771,12 +834,18 @@ def _get_renumbered_edge_groups_from_sample(
 
                 # Create the row entry for this type
                 src_id_table = noi_index[src_type]
-                src = torch.searchsorted(src_id_table, sources)
-                row_dict[pyg_can_edge_type] = src
+                src_id_map = (
+                    cudf.Series(cupy.asarray(src_id_table), name="src")
+                    .reset_index()
+                    .rename(columns={"index": "new_id"})
+                    .set_index("src")
+                )
+                src = src_id_map["new_id"].loc[cupy.asarray(sources)]
+                row_dict[pyg_can_edge_type] = torch.as_tensor(src.values, device="cuda")
 
                 # Get the de-offsetted destinations
                 destinations = torch.as_tensor(
-                    sampling_results.destinations.iloc[ix], device="cuda"
+                    sampling_results.destinations.iloc[ix].values, device="cuda"
                 )
                 destinations_ix = torch.searchsorted(
                     self.__vertex_type_offsets["stop"], destinations
@@ -785,8 +854,14 @@ def _get_renumbered_edge_groups_from_sample(
 
                 # Create the col entry for this type
                 dst_id_table = noi_index[dst_type]
-                dst = torch.searchsorted(dst_id_table, destinations)
-                col_dict[pyg_can_edge_type] = dst
+                dst_id_map = (
+                    cudf.Series(cupy.asarray(dst_id_table), name="dst")
+                    .reset_index()
+                    .rename(columns={"index": "new_id"})
+                    .set_index("dst")
+                )
+                dst = dst_id_map["new_id"].loc[cupy.asarray(destinations)]
+                col_dict[pyg_can_edge_type] = torch.as_tensor(dst.values, device="cuda")
 
         return row_dict, col_dict
 
@@ -818,16 +893,21 @@ def create_named_tensor(
             )
         )
 
-    def __infer_edge_types(self, num_edges_dict) -> None:
+    def __infer_edge_types(
+        self,
+        num_nodes_dict: Dict[str, int],
+        num_edges_dict: Dict[Tuple[str, str, str], int],
+    ) -> None:
         self.__edge_types_to_attrs = {}
 
         for pyg_can_edge_type in sorted(num_edges_dict.keys()):
-            sz = num_edges_dict[pyg_can_edge_type]
+            sz_src = num_nodes_dict[pyg_can_edge_type[0]]
+            sz_dst = num_nodes_dict[pyg_can_edge_type[-1]]
             self.__edge_types_to_attrs[pyg_can_edge_type] = CuGraphEdgeAttr(
                 edge_type=pyg_can_edge_type,
                 layout=EdgeLayout.COO,
                 is_sorted=False,
-                size=(sz, sz),
+                size=(sz_src, sz_dst),
             )
 
     def __infer_existing_tensors(self, F) -> None:
@@ -857,22 +937,25 @@ def _get_tensor(self, attr: CuGraphTensorAttr) -> TensorType:
         cols = attr.properties
 
         idx = attr.index
-        if feature_backend == "torch":
-            if not isinstance(idx, torch.Tensor):
-                raise TypeError(
-                    f"Type {type(idx)} invalid"
-                    f" for feature store backend {feature_backend}"
-                )
-            idx = idx.cpu()
-        elif feature_backend == "numpy":
-            # allow indexing through cupy arrays
-            if isinstance(idx, cupy.ndarray):
-                idx = idx.get()
-            elif isinstance(idx, torch.Tensor):
-                idx = np.asarray(idx.cpu())
+        if idx is not None:
+            if feature_backend == "torch":
+                if not isinstance(idx, torch.Tensor):
+                    raise TypeError(
+                        f"Type {type(idx)} invalid"
+                        f" for feature store backend {feature_backend}"
+                    )
+                idx = idx.cpu()
+            elif feature_backend == "numpy":
+                # allow feature indexing through cupy arrays
+                if isinstance(idx, cupy.ndarray):
+                    idx = idx.get()
+                elif isinstance(idx, torch.Tensor):
+                    idx = np.asarray(idx.cpu())
 
         if cols is None:
             t = self.__features.get_data(idx, attr.group_name, attr.attr_name)
+            if idx is None:
+                t = t[-1]
 
             if isinstance(t, np.ndarray):
                 t = torch.as_tensor(t, device="cuda")
diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
index fa02ac78f43..e0d3b0a9fca 100644
--- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
@@ -14,6 +14,7 @@
 import tempfile
 
 import os
+import re
 
 import cupy
 import cudf
@@ -25,26 +26,34 @@
 from cugraph_pyg.loader.filter import _filter_cugraph_store
 from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results
 
-from typing import Union, Tuple, Sequence, List
+from typing import Union, Tuple, Sequence, List, Dict
 
 torch_geometric = import_optional("torch_geometric")
+InputNodes = (
+    Sequence
+    if isinstance(torch_geometric, MissingModule)
+    else torch_geometric.typing.InputNodes
+)
 
 
 class EXPERIMENTAL__BulkSampleLoader:
+
+    __ex_parquet_file = re.compile(r"batch=([0-9]+)\-([0-9]+)\.parquet")
+
     def __init__(
         self,
         feature_store: CuGraphStore,
         graph_store: CuGraphStore,
-        all_indices: Union[Sequence, int],
+        input_nodes: InputNodes = None,
         batch_size: int = 0,
-        shuffle=False,
+        shuffle: bool = False,
         edge_types: Sequence[Tuple[str]] = None,
-        directory=None,
-        rank=0,
-        starting_batch_id=0,
-        batches_per_partition=100,
+        directory: Union[str, tempfile.TemporaryDirectory] = None,
+        input_files: List[str] = None,
+        starting_batch_id: int = 0,
+        batches_per_partition: int = 100,
         # Sampler args
-        num_neighbors: List[int] = [1, 1],
+        num_neighbors: Union[List[int], Dict[Tuple[str, str, str], List[int]]] = None,
         replace: bool = True,
         # Other kwargs for the BulkSampler
         **kwargs,
@@ -61,9 +70,9 @@ def __init__(
         graph_store: CuGraphStore
             The graph store containing the graph structure.
 
-        all_indices: Union[Tensor, int]
+        input_nodes: InputNodes
             The input nodes associated with this sampler.
-            If this is an integer N , this loader will load N batches
+            If None, this loader will load batches
             from disk rather than performing sampling in memory.
 
         batch_size: int
@@ -84,46 +93,71 @@ def __init__(
             The path of the directory to write samples to.
             Defaults to a new generated temporary directory.
 
-        rank: int (optional, default=0)
-            The rank of the current worker.  Should be provided
-            when there are multiple workers.
+        input_files: List[str] (optional, default=None)
+            The input files to read from the directory containing
+            samples.  This argument is only used when loading
+            alread-sampled batches from disk.
 
         starting_batch_id: int (optional, default=0)
             The starting id for each batch.  Defaults to 0.
-            Generally used when loading previously-sampled
-            batches from disk.
 
         batches_per_partition: int (optional, default=100)
             The number of batches in each output partition.
             Defaults to 100.  Gets passed to the bulk
             sampler if there is one; otherwise, this argument
             is used to determine which files to read.
+
+        num_neighbors: Union[List[int],
+                 Dict[Tuple[str, str, str], List[int]]] (required)
+            The number of neighbors to sample for each node in each iteration.
+            If an entry is set to -1, all neighbors will be included.
+            In heterogeneous graphs, may also take in a dictionary denoting
+            the number of neighbors to sample for each individual edge type.
+
+            Note: in cuGraph, only one value of num_neighbors is currently supported.
+            Passing in a dictionary will result in an exception.
         """
 
         self.__feature_store = feature_store
         self.__graph_store = graph_store
-        self.__rank = rank
-        self.__next_batch = starting_batch_id
-        self.__end_exclusive = starting_batch_id
+        self.__next_batch = -1
+        self.__end_exclusive = -1
         self.__batches_per_partition = batches_per_partition
         self.__starting_batch_id = starting_batch_id
 
-        if isinstance(all_indices, int):
+        if input_nodes is None:
             # Will be loading from disk
-            self.__num_batches = all_indices
+            self.__num_batches = input_nodes
             self.__directory = directory
+            if input_files is None:
+                if isinstance(self.__directory, str):
+                    self.__input_files = iter(os.listdir(self.__directory))
+                else:
+                    self.__input_files = iter(os.listdir(self.__directory.name))
+            else:
+                self.__input_files = iter(input_files)
             return
 
+        input_type, input_nodes = torch_geometric.loader.utils.get_input_nodes(
+            (feature_store, graph_store), input_nodes
+        )
+        if input_type is not None:
+            input_nodes = graph_store._get_sample_from_vertex_groups(
+                {input_type: input_nodes}
+            )
+
         if batch_size is None or batch_size < 1:
             raise ValueError("Batch size must be >= 1")
 
         self.__directory = tempfile.TemporaryDirectory(dir=directory)
 
+        if isinstance(num_neighbors, dict):
+            raise ValueError("num_neighbors dict is currently unsupported!")
+
         bulk_sampler = BulkSampler(
             batch_size,
             self.__directory.name,
             self.__graph_store._subgraph(edge_types),
-            rank=rank,
             fanout_vals=num_neighbors,
             with_replacement=replace,
             batches_per_partition=self.__batches_per_partition,
@@ -131,21 +165,21 @@ def __init__(
         )
 
         # Make sure indices are in cupy
-        all_indices = cupy.asarray(all_indices)
+        input_nodes = cupy.asarray(input_nodes)
 
         # Shuffle
         if shuffle:
-            cupy.random.shuffle(all_indices)
+            cupy.random.shuffle(input_nodes)
 
         # Truncate if we can't evenly divide the input array
-        stop = (len(all_indices) // batch_size) * batch_size
-        all_indices = all_indices[:stop]
+        stop = (len(input_nodes) // batch_size) * batch_size
+        input_nodes = input_nodes[:stop]
 
         # Split into batches
-        all_indices = cupy.split(all_indices, len(all_indices) // batch_size)
+        input_nodes = cupy.split(input_nodes, len(input_nodes) // batch_size)
 
         self.__num_batches = 0
-        for batch_num, batch_i in enumerate(all_indices):
+        for batch_num, batch_i in enumerate(input_nodes):
             self.__num_batches += 1
             bulk_sampler.add_batches(
                 cudf.DataFrame(
@@ -161,40 +195,49 @@ def __init__(
             )
 
         bulk_sampler.flush()
+        self.__input_files = iter(os.listdir(self.__directory.name))
 
     def __next__(self):
-        # Quit iterating if there are no batches left
-        if self.__next_batch >= self.__num_batches + self.__starting_batch_id:
-            raise StopIteration
-
         # Load the next set of sampling results if necessary
         if self.__next_batch >= self.__end_exclusive:
+            if self.__directory is None:
+                raise StopIteration
+
             # Read the next parquet file into memory
             dir_path = (
                 self.__directory
                 if isinstance(self.__directory, str)
                 else self.__directory.name
             )
-            rank_path = os.path.join(dir_path, f"rank={self.__rank}")
 
-            file_end_batch_incl = min(
-                self.__end_exclusive + self.__batches_per_partition - 1,
-                self.__starting_batch_id + self.__num_batches - 1,
-            )
+            # Will raise StopIteration if there are no files left
+            try:
+                fname = next(self.__input_files)
+            except StopIteration as ex:
+                # Won't delete a non-temp dir (since it would just be deleting a string)
+                del self.__directory
+                self.__directory = None
+                raise StopIteration(ex)
+
+            m = self.__ex_parquet_file.match(fname)
+            if m is None:
+                raise ValueError(f"Invalid parquet filename {fname}")
+
+            self.__next_batch, end_inclusive = [int(g) for g in m.groups()]
+            self.__end_exclusive = end_inclusive + 1
+
             parquet_path = os.path.join(
-                rank_path,
-                f"batch={self.__end_exclusive}" f"-{file_end_batch_incl}.parquet",
+                dir_path,
+                fname,
             )
 
-            self.__end_exclusive += self.__batches_per_partition
-
             columns = {
                 "sources": "int64",
                 "destinations": "int64",
                 # 'edge_id':'int64',
                 "edge_type": "int32",
                 "batch_id": "int32",
-                # 'hop_id':'int32'
+                "hop_id": "int32",
             }
             self.__data = cudf.read_parquet(parquet_path)
             self.__data = self.__data[list(columns.keys())].astype(columns)
@@ -207,11 +250,7 @@ def __next__(self):
         )
 
         # Get ready for next iteration
-        # If there is no next iteration, make sure results are deleted
         self.__next_batch += 1
-        if self.__next_batch >= self.__num_batches + self.__starting_batch_id:
-            # Won't delete a non-temp dir (since it would just be deleting a string)
-            del self.__directory
 
         # Get and return the sampled subgraph
         if isinstance(torch_geometric, MissingModule):
@@ -244,8 +283,8 @@ class EXPERIMENTAL__CuGraphNeighborLoader:
     def __init__(
         self,
         data: Union[CuGraphStore, Tuple[CuGraphStore, CuGraphStore]],
-        input_nodes: Sequence,
-        batch_size: int,
+        input_nodes: Union[InputNodes, int] = None,
+        batch_size: int = None,
         **kwargs,
     ):
         """
@@ -254,12 +293,11 @@ def __init__(
         data: CuGraphStore or (CuGraphStore, CuGraphStore)
             The CuGraphStore or stores where the graph/feature data is held.
 
-        batch_size: int
+        batch_size: int (required)
             The number of input nodes in each batch.
 
-        input_nodes: Tensor
-            The input nodes for *this* loader.  If there are multiple loaders,
-            the appropriate split should be given for this loader.
+        input_nodes: Union[InputNodes, int] (required)
+            The input nodes associated with this sampler.
 
         **kwargs: kwargs
             Keyword arguments to pass through for sampling.
@@ -267,6 +305,11 @@ def __init__(
             See BulkSampleLoader.
         """
 
+        if input_nodes is None:
+            raise ValueError("input_nodes is required")
+        if batch_size is None:
+            raise ValueError("batch_size is required")
+
         # Allow passing in a feature store and graph store as a tuple, as
         # in the standard PyG API.  If only one is passed, it is assumed
         # it is behaving as both a graph store and a feature store.
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/__init__.py
new file mode 100644
index 00000000000..331b49ebec0
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/nn/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .conv import *
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py
new file mode 100644
index 00000000000..0c94be5e12b
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .gat_conv import GATConv
+from .gatv2_conv import GATv2Conv
+from .transformer_conv import TransformerConv
+
+__all__ = [
+    "GATConv",
+    "GATv2Conv",
+    "TransformerConv",
+]
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py
new file mode 100644
index 00000000000..bec50792131
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from typing import Any, Optional, Tuple, Union
+
+from cugraph.utilities.utils import import_optional
+
+torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
+
+try:  # pragma: no cover
+    from pylibcugraphops.pytorch import (
+        BipartiteCSC,
+        SampledCSC,
+        SampledHeteroCSC,
+        StaticCSC,
+        StaticHeteroCSC,
+    )
+
+    HAS_PYLIBCUGRAPHOPS = True
+except ImportError:
+    HAS_PYLIBCUGRAPHOPS = False
+
+
+class BaseConv(torch.nn.Module):  # pragma: no cover
+    r"""An abstract base class for implementing cugraph-ops message passing layers."""
+
+    def __init__(self):
+        super().__init__()
+
+        if HAS_PYLIBCUGRAPHOPS is False:
+            raise ModuleNotFoundError(
+                f"'{self.__class__.__name__}' requires " f"'pylibcugraphops>=23.04'"
+            )
+
+    def reset_parameters(self):
+        r"""Resets all learnable parameters of the module."""
+        pass
+
+    @staticmethod
+    def to_csc(
+        edge_index: torch.Tensor,
+        size: Optional[Tuple[int, int]] = None,
+        edge_attr: Optional[torch.Tensor] = None,
+    ) -> Union[
+        Tuple[torch.Tensor, torch.Tensor, int],
+        Tuple[Tuple[torch.Tensor, torch.Tensor, int], torch.Tensor],
+    ]:
+        r"""Returns a CSC representation of an :obj:`edge_index` tensor to be
+        used as input to cugraph-ops conv layers.
+
+        Args:
+            edge_index (torch.Tensor): The edge indices.
+            size ((int, int), optional). The shape of :obj:`edge_index` in each
+                dimension. (default: :obj:`None`)
+            edge_attr (torch.Tensor, optional): The edge features.
+                (default: :obj:`None`)
+        """
+        if size is None:
+            warnings.warn(
+                f"Inferring the graph size from 'edge_index' causes "
+                f"a decline in performance and does not work for "
+                f"bipartite graphs. To suppress this warning, pass "
+                f"the 'size' explicitly in '{__name__}.to_csc()'."
+            )
+            num_src_nodes = num_dst_nodes = int(edge_index.max()) + 1
+        else:
+            num_src_nodes, num_dst_nodes = size
+
+        row, col = edge_index
+        col, perm = torch_geometric.utils.index_sort(col, max_value=num_dst_nodes)
+        row = row[perm]
+
+        colptr = torch_geometric.utils.sparse.index2ptr(col, num_dst_nodes)
+
+        if edge_attr is not None:
+            return (row, colptr, num_src_nodes), edge_attr[perm]
+
+        return row, colptr, num_src_nodes
+
+    def get_cugraph(
+        self,
+        csc: Tuple[torch.Tensor, torch.Tensor, int],
+        bipartite: bool = False,
+        max_num_neighbors: Optional[int] = None,
+    ) -> Any:
+        r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation.
+        Supports both bipartite and non-bipartite graphs.
+
+        Args:
+            csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC
+                representation of a graph, given as a tuple of
+                :obj:`(row, colptr, num_src_nodes)`. Use the
+                :meth:`to_csc` method to convert an :obj:`edge_index`
+                representation to the desired format.
+            bipartite (bool): If set to :obj:`True`, will create the bipartite
+                structure in cugraph-ops. (default: :obj:`False`)
+            max_num_neighbors (int, optional): The maximum number of neighbors
+                of a target node. It is only effective when operating in a
+                bipartite graph. When not given, will be computed on-the-fly,
+                leading to slightly worse performance. (default: :obj:`None`)
+        """
+        row, colptr, num_src_nodes = csc
+
+        if not row.is_cuda:
+            raise RuntimeError(
+                f"'{self.__class__.__name__}' requires GPU-"
+                f"based processing (got CPU tensor)"
+            )
+
+        if bipartite:
+            return BipartiteCSC(colptr, row, num_src_nodes)
+
+        if num_src_nodes != colptr.numel() - 1:
+            if max_num_neighbors is None:
+                max_num_neighbors = int((colptr[1:] - colptr[:-1]).max())
+
+            return SampledCSC(colptr, row, max_num_neighbors, num_src_nodes)
+
+        return StaticCSC(colptr, row)
+
+    def get_typed_cugraph(
+        self,
+        csc: Tuple[torch.Tensor, torch.Tensor, int],
+        edge_type: torch.Tensor,
+        num_edge_types: Optional[int] = None,
+        bipartite: bool = False,
+        max_num_neighbors: Optional[int] = None,
+    ) -> Any:
+        r"""Constructs a typed :obj:`cugraph` graph object from a CSC
+        representation where each edge corresponds to a given edge type.
+        Supports both bipartite and non-bipartite graphs.
+
+        Args:
+            csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC
+                representation of a graph, given as a tuple of
+                :obj:`(row, colptr, num_src_nodes)`. Use the
+                :meth:`to_csc` method to convert an :obj:`edge_index`
+                representation to the desired format.
+            edge_type (torch.Tensor): The edge type.
+            num_edge_types (int, optional): The maximum number of edge types.
+                When not given, will be computed on-the-fly, leading to
+                slightly worse performance. (default: :obj:`None`)
+            bipartite (bool): If set to :obj:`True`, will create the bipartite
+                structure in cugraph-ops. (default: :obj:`False`)
+            max_num_neighbors (int, optional): The maximum number of neighbors
+                of a target node. It is only effective when operating in a
+                bipartite graph. When not given, will be computed on-the-fly,
+                leading to slightly worse performance. (default: :obj:`None`)
+        """
+        if num_edge_types is None:
+            num_edge_types = int(edge_type.max()) + 1
+
+        row, colptr, num_src_nodes = csc
+        edge_type = edge_type.int()
+
+        if bipartite:
+            raise NotImplementedError
+
+        if num_src_nodes != colptr.numel() - 1:
+            if max_num_neighbors is None:
+                max_num_neighbors = int((colptr[1:] - colptr[:-1]).max())
+
+            return SampledHeteroCSC(
+                colptr, row, edge_type, max_num_neighbors, num_src_nodes, num_edge_types
+            )
+
+        return StaticHeteroCSC(colptr, row, edge_type, num_edge_types)
+
+    def forward(
+        self,
+        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        csc: Tuple[torch.Tensor, torch.Tensor, int],
+    ) -> torch.Tensor:
+        r"""Runs the forward pass of the module.
+
+        Args:
+            x (torch.Tensor): The node features.
+            csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC
+                representation of a graph, given as a tuple of
+                :obj:`(row, colptr, num_src_nodes)`. Use the
+                :meth:`to_csc` method to convert an :obj:`edge_index`
+                representation to the desired format.
+        """
+        raise NotImplementedError
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py
new file mode 100644
index 00000000000..4bf37cf3e72
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py
@@ -0,0 +1,247 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+from pylibcugraphops.pytorch.operators import mha_gat_n2n, mha_gat_n2n_bipartite
+
+from cugraph.utilities.utils import import_optional
+
+from .base import BaseConv
+
+torch = import_optional("torch")
+nn = import_optional("torch.nn")
+torch_geometric = import_optional("torch_geometric")
+
+
+class GATConv(BaseConv):
+    r"""The graph attentional operator from the `"Graph Attention Networks"
+    <https://arxiv.org/abs/1710.10903>`_ paper.
+
+    .. math::
+        \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} +
+        \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j},
+
+    where the attention coefficients :math:`\alpha_{i,j}` are computed as
+
+    .. math::
+        \alpha_{i,j} =
+        \frac{
+        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
+        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j]
+        \right)\right)}
+        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
+        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
+        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k]
+        \right)\right)}.
+
+    If the graph has multi-dimensional edge features :math:`\mathbf{e}_{i,j}`,
+    the attention coefficients :math:`\alpha_{i,j}` are computed as
+
+    .. math::
+        \alpha_{i,j} =
+        \frac{
+        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
+        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j
+        \, \Vert \, \mathbf{\Theta}_{e} \mathbf{e}_{i,j}]\right)\right)}
+        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
+        \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
+        [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k
+        \, \Vert \, \mathbf{\Theta}_{e} \mathbf{e}_{i,k}]\right)\right)}.
+
+    Args:
+        in_channels (int or tuple): Size of each input sample, or :obj:`-1` to
+            derive the size from the first input(s) to the forward method.
+            A tuple corresponds to the sizes of source and target
+            dimensionalities.
+        out_channels (int): Size of each output sample.
+        heads (int, optional): Number of multi-head-attentions.
+            (default: :obj:`1`)
+        concat (bool, optional): If set to :obj:`False`, the multi-head
+            attentions are averaged instead of concatenated.
+            (default: :obj:`True`)
+        negative_slope (float, optional): LeakyReLU angle of the negative
+            slope. (default: :obj:`0.2`)
+        edge_dim (int, optional): Edge feature dimensionality (in case
+            there are any). (default: :obj:`None`)
+        bias (bool, optional): If set to :obj:`False`, the layer will not learn
+            an additive bias. (default: :obj:`True`)
+    """
+
+    def __init__(
+        self,
+        in_channels: Union[int, Tuple[int, int]],
+        out_channels: int,
+        heads: int = 1,
+        concat: bool = True,
+        negative_slope: float = 0.2,
+        edge_dim: Optional[int] = None,
+        bias: bool = True,
+    ):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.heads = heads
+        self.concat = concat
+        self.negative_slope = negative_slope
+        self.edge_dim = edge_dim
+
+        Linear = torch_geometric.nn.Linear
+
+        if isinstance(in_channels, int):
+            self.lin = Linear(
+                in_channels,
+                heads * out_channels,
+                bias=False,
+                weight_initializer="glorot",
+            )
+        else:
+            self.lin_src = Linear(
+                in_channels[0],
+                heads * out_channels,
+                bias=False,
+                weight_initializer="glorot",
+            )
+            self.lin_dst = Linear(
+                in_channels[1],
+                heads * out_channels,
+                bias=False,
+                weight_initializer="glorot",
+            )
+
+        if edge_dim is not None:
+            self.lin_edge = Linear(
+                edge_dim,
+                heads * out_channels,
+                bias=False,
+                weight_initializer="glorot",
+            )
+            self.att = nn.Parameter(torch.Tensor(3 * heads * out_channels))
+        else:
+            self.register_parameter("lin_edge", None)
+            self.att = nn.Parameter(torch.Tensor(2 * heads * out_channels))
+
+        if bias and concat:
+            self.bias = nn.Parameter(torch.Tensor(heads * out_channels))
+        elif bias and not concat:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        if isinstance(self.in_channels, int):
+            self.lin.reset_parameters()
+        else:
+            self.lin_src.reset_parameters()
+            self.lin_dst.reset_parameters()
+
+        torch_geometric.nn.inits.glorot(
+            self.att.view(-1, self.heads, self.out_channels)
+        )
+
+        if self.lin_edge is not None:
+            self.lin_edge.reset_parameters()
+
+        torch_geometric.nn.inits.zeros(self.bias)
+
+    def forward(
+        self,
+        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        csc: Tuple[torch.Tensor, torch.Tensor, int],
+        edge_attr: Optional[torch.Tensor] = None,
+        max_num_neighbors: Optional[int] = None,
+    ) -> torch.Tensor:
+        r"""Runs the forward pass of the module.
+
+        Args:
+            x (torch.Tensor or tuple): The node features. Can be a tuple of
+                tensors denoting source and destination node features.
+            csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC
+                representation of a graph, given as a tuple of
+                :obj:`(row, colptr, num_src_nodes)`. Use the
+                :meth:`to_csc` method to convert an :obj:`edge_index`
+                representation to the desired format.
+            edge_attr: (torch.Tensor, optional) The edge features.
+            max_num_neighbors (int, optional): The maximum number of neighbors
+                of a target node. It is only effective when operating in a
+                bipartite graph. When not given, will be computed on-the-fly,
+                leading to slightly worse performance. (default: :obj:`None`)
+        """
+        bipartite = not isinstance(x, torch.Tensor)
+        graph = self.get_cugraph(
+            csc, bipartite=bipartite, max_num_neighbors=max_num_neighbors
+        )
+
+        if edge_attr is not None:
+            if self.lin_edge is None:
+                raise RuntimeError(
+                    f"{self.__class__.__name__}.edge_dim must be set to accept "
+                    f"edge features."
+                )
+            if edge_attr.dim() == 1:
+                edge_attr = edge_attr.view(-1, 1)
+            edge_attr = self.lin_edge(edge_attr)
+
+        if bipartite:
+            if not hasattr(self, "lin_src"):
+                raise RuntimeError(
+                    f"{self.__class__.__name__}.in_channels must be a pair of "
+                    f"integers to allow bipartite node features, but got "
+                    f"{self.in_channels}."
+                )
+            x_src = self.lin_src(x[0])
+            x_dst = self.lin_dst(x[1])
+
+            out = mha_gat_n2n_bipartite(
+                x_src,
+                x_dst,
+                self.att,
+                graph,
+                num_heads=self.heads,
+                activation="LeakyReLU",
+                negative_slope=self.negative_slope,
+                concat_heads=self.concat,
+                edge_feat=edge_attr,
+            )
+
+        else:
+            if not hasattr(self, "lin"):
+                raise RuntimeError(
+                    f"{self.__class__.__name__}.in_channels is expected to be an "
+                    f"integer, but got {self.in_channels}."
+                )
+            x = self.lin(x)
+
+            out = mha_gat_n2n(
+                x,
+                self.att,
+                graph,
+                num_heads=self.heads,
+                activation="LeakyReLU",
+                negative_slope=self.negative_slope,
+                concat_heads=self.concat,
+                edge_feat=edge_attr,
+            )
+
+        if self.bias is not None:
+            out = out + self.bias
+
+        return out
+
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}({self.in_channels}, "
+            f"{self.out_channels}, heads={self.heads})"
+        )
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py
new file mode 100644
index 00000000000..66d962b3f86
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n, mha_gat_v2_n2n_bipartite
+
+from cugraph.utilities.utils import import_optional
+
+from .base import BaseConv
+
+torch = import_optional("torch")
+nn = import_optional("torch.nn")
+torch_geometric = import_optional("torch_geometric")
+
+
+class GATv2Conv(BaseConv):
+    r"""The GATv2 operator from the `"How Attentive are Graph Attention
+    Networks?" <https://arxiv.org/abs/2105.14491>`_ paper, which fixes the
+    static attention problem of the standard
+    :class:`~torch_geometric.conv.GATConv` layer.
+    Since the linear layers in the standard GAT are applied right after each
+    other, the ranking of attended nodes is unconditioned on the query node.
+    In contrast, in :class:`GATv2`, every node can attend to any other node.
+
+    .. math::
+        \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} +
+        \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j},
+
+    where the attention coefficients :math:`\alpha_{i,j}` are computed as
+
+    .. math::
+        \alpha_{i,j} =
+        \frac{
+        \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta}
+        [\mathbf{x}_i \, \Vert \, \mathbf{x}_j]
+        \right)\right)}
+        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
+        \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta}
+        [\mathbf{x}_i \, \Vert \, \mathbf{x}_k]
+        \right)\right)}.
+
+    If the graph has multi-dimensional edge features :math:`\mathbf{e}_{i,j}`,
+    the attention coefficients :math:`\alpha_{i,j}` are computed as
+
+    .. math::
+        \alpha_{i,j} =
+        \frac{
+        \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta}
+        [\mathbf{x}_i \, \Vert \, \mathbf{x}_j \, \Vert \, \mathbf{e}_{i,j}]
+        \right)\right)}
+        {\sum_{k \in \mathcal{N}(i) \cup \{ i \}}
+        \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta}
+        [\mathbf{x}_i \, \Vert \, \mathbf{x}_k \, \Vert \, \mathbf{e}_{i,k}]
+        \right)\right)}.
+
+    Args:
+        in_channels (int or tuple): Size of each input sample, or :obj:`-1` to
+            derive the size from the first input(s) to the forward method.
+            A tuple corresponds to the sizes of source and target
+            dimensionalities.
+        out_channels (int): Size of each output sample.
+        heads (int, optional): Number of multi-head-attentions.
+            (default: :obj:`1`)
+        concat (bool, optional): If set to :obj:`False`, the multi-head
+            attentions are averaged instead of concatenated.
+            (default: :obj:`True`)
+        negative_slope (float, optional): LeakyReLU angle of the negative
+            slope. (default: :obj:`0.2`)
+        edge_dim (int, optional): Edge feature dimensionality (in case
+            there are any). (default: :obj:`None`)
+        bias (bool, optional): If set to :obj:`False`, the layer will not learn
+            an additive bias. (default: :obj:`True`)
+        share_weights (bool, optional): If set to :obj:`True`, the same matrix
+            will be applied to the source and the target node of every edge.
+            (default: :obj:`False`)
+    """
+
+    def __init__(
+        self,
+        in_channels: Union[int, Tuple[int, int]],
+        out_channels: int,
+        heads: int = 1,
+        concat: bool = True,
+        negative_slope: float = 0.2,
+        edge_dim: Optional[int] = None,
+        bias: bool = True,
+        share_weights: bool = False,
+    ):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.heads = heads
+        self.concat = concat
+        self.negative_slope = negative_slope
+        self.edge_dim = edge_dim
+        self.share_weights = share_weights
+
+        Linear = torch_geometric.nn.Linear
+
+        if isinstance(in_channels, int):
+            self.lin_src = Linear(
+                in_channels,
+                heads * out_channels,
+                bias=bias,
+                weight_initializer="glorot",
+            )
+
+            if share_weights:
+                self.lin_dst = self.lin_src
+            else:
+                self.lin_dst = Linear(
+                    in_channels,
+                    heads * out_channels,
+                    bias=bias,
+                    weight_initializer="glorot",
+                )
+        else:
+            self.lin_src = Linear(
+                in_channels[0],
+                heads * out_channels,
+                bias=bias,
+                weight_initializer="glorot",
+            )
+            self.lin_dst = Linear(
+                in_channels[1],
+                heads * out_channels,
+                bias=bias,
+                weight_initializer="glorot",
+            )
+
+        self.att = nn.Parameter(torch.Tensor(heads * out_channels))
+
+        if edge_dim is not None:
+            self.lin_edge = Linear(
+                edge_dim, heads * out_channels, bias=False, weight_initializer="glorot"
+            )
+        else:
+            self.register_parameter("lin_edge", None)
+
+        if bias and concat:
+            self.bias = nn.Parameter(torch.Tensor(heads * out_channels))
+        elif bias and not concat:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        self.lin_src.reset_parameters()
+        self.lin_dst.reset_parameters()
+        if self.lin_edge is not None:
+            self.lin_edge.reset_parameters()
+
+        torch_geometric.nn.inits.glorot(
+            self.att.view(-1, self.heads, self.out_channels)
+        )
+
+        torch_geometric.nn.inits.zeros(self.bias)
+
+    def forward(
+        self,
+        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        csc: Tuple[torch.Tensor, torch.Tensor, int],
+        edge_attr: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        r"""Runs the forward pass of the module.
+
+        Args:
+            x (torch.Tensor or tuple): The node features. Can be a tuple of
+                tensors denoting source and destination node features.
+            csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC
+                representation of a graph, given as a tuple of
+                :obj:`(row, colptr, num_src_nodes)`. Use the
+                :meth:`to_csc` method to convert an :obj:`edge_index`
+                representation to the desired format.
+            edge_attr: (torch.Tensor, optional) The edge features.
+        """
+        bipartite = not isinstance(x, torch.Tensor)
+        graph = self.get_cugraph(csc, bipartite=bipartite or not self.share_weights)
+
+        if edge_attr is not None:
+            if self.lin_edge is None:
+                raise RuntimeError(
+                    f"{self.__class__.__name__}.edge_dim must be set to accept "
+                    f"edge features."
+                )
+            if edge_attr.dim() == 1:
+                edge_attr = edge_attr.view(-1, 1)
+            edge_attr = self.lin_edge(edge_attr)
+
+        if not bipartite and self.share_weights:
+            x = self.lin_src(x)
+
+            out = mha_gat_v2_n2n(
+                x,
+                self.att,
+                graph,
+                num_heads=self.heads,
+                activation="LeakyReLU",
+                negative_slope=self.negative_slope,
+                concat_heads=self.concat,
+                edge_feat=edge_attr,
+            )
+        else:
+            if bipartite:
+                x_src = self.lin_src(x[0])
+                x_dst = self.lin_dst(x[1])
+            else:
+                x_src = self.lin_src(x)
+                x_dst = self.lin_dst(x)
+
+            out = mha_gat_v2_n2n_bipartite(
+                x_src,
+                x_dst,
+                self.att,
+                graph,
+                num_heads=self.heads,
+                activation="LeakyReLU",
+                negative_slope=self.negative_slope,
+                concat_heads=self.concat,
+                edge_feat=edge_attr,
+            )
+
+        if self.bias is not None:
+            out = out + self.bias
+
+        return out
+
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}({self.in_channels}, "
+            f"{self.out_channels}, heads={self.heads})"
+        )
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py
new file mode 100644
index 00000000000..aeb51c028ae
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+from pylibcugraphops.pytorch.operators import mha_simple_n2n as TransformerConvAgg
+
+from cugraph.utilities.utils import import_optional
+
+from .base import BaseConv
+
+torch = import_optional("torch")
+nn = import_optional("torch.nn")
+torch_geometric = import_optional("torch_geometric")
+
+
+class TransformerConv(BaseConv):
+    r"""The graph transformer operator from the `"Masked Label Prediction:
+    Unified Message Passing Model for Semi-Supervised Classification"
+    <https://arxiv.org/abs/2009.03509>`_ paper.
+
+    .. math::
+        \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i +
+        \sum_{j \in \mathcal{N}(i)} \alpha_{i,j} \mathbf{W}_2 \mathbf{x}_{j},
+
+    where the attention coefficients :math:`\alpha_{i,j}` are computed via
+    multi-head dot product attention:
+
+    .. math::
+        \alpha_{i,j} = \textrm{softmax} \left(
+        \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top} (\mathbf{W}_4\mathbf{x}_j)}
+        {\sqrt{d}} \right)
+
+    Args:
+        in_channels (int or tuple): Size of each input sample, or :obj:`-1` to
+            derive the size from the first input(s) to the forward method.
+            A tuple corresponds to the sizes of source and target
+            dimensionalities.
+        out_channels (int): Size of each output sample.
+        heads (int, optional): Number of multi-head-attentions.
+            (default: :obj:`1`)
+        concat (bool, optional): If set to :obj:`False`, the multi-head
+            attentions are averaged instead of concatenated.
+            (default: :obj:`True`)
+        beta (bool, optional): If set, will combine aggregation and
+            skip information via
+
+            .. math::
+                \mathbf{x}^{\prime}_i = \beta_i \mathbf{W}_1 \mathbf{x}_i +
+                (1 - \beta_i) \underbrace{\left(\sum_{j \in \mathcal{N}(i)}
+                \alpha_{i,j} \mathbf{W}_2 \vec{x}_j \right)}_{=\mathbf{m}_i}
+
+            with :math:`\beta_i = \textrm{sigmoid}(\mathbf{w}_5^{\top}
+            [ \mathbf{W}_1 \mathbf{x}_i, \mathbf{m}_i, \mathbf{W}_1
+            \mathbf{x}_i - \mathbf{m}_i ])` (default: :obj:`False`)
+        edge_dim (int, optional): Edge feature dimensionality (in case
+            there are any). Edge features are added to the keys after
+            linear transformation, that is, prior to computing the
+            attention dot product. They are also added to final values
+            after the same linear transformation. The model is:
+
+            .. math::
+                \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i +
+                \sum_{j \in \mathcal{N}(i)} \alpha_{i,j} \left(
+                \mathbf{W}_2 \mathbf{x}_{j} + \mathbf{W}_6 \mathbf{e}_{ij}
+                \right),
+
+            where the attention coefficients :math:`\alpha_{i,j}` are now
+            computed via:
+
+            .. math::
+                \alpha_{i,j} = \textrm{softmax} \left(
+                \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top}
+                (\mathbf{W}_4\mathbf{x}_j + \mathbf{W}_6 \mathbf{e}_{ij})}
+                {\sqrt{d}} \right)
+
+            (default :obj:`None`)
+        bias (bool, optional): If set to :obj:`False`, the layer will not learn
+            an additive bias. (default: :obj:`True`)
+        root_weight (bool, optional): If set to :obj:`False`, the layer will
+            not add the transformed root node features to the output and the
+            option  :attr:`beta` is set to :obj:`False`. (default: :obj:`True`)
+    """
+
+    def __init__(
+        self,
+        in_channels: Union[int, Tuple[int, int]],
+        out_channels: int,
+        heads: int = 1,
+        concat: bool = True,
+        beta: bool = False,
+        edge_dim: Optional[int] = None,
+        bias: bool = True,
+        root_weight: bool = True,
+    ):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.heads = heads
+        self.beta = beta and root_weight
+        self.root_weight = root_weight
+        self.concat = concat
+        self.edge_dim = edge_dim
+
+        if isinstance(in_channels, int):
+            in_channels = (in_channels, in_channels)
+
+        Linear = torch_geometric.nn.Linear
+
+        self.lin_key = Linear(in_channels[0], heads * out_channels)
+        self.lin_query = Linear(in_channels[1], heads * out_channels)
+        self.lin_value = Linear(in_channels[0], heads * out_channels)
+        if edge_dim is not None:
+            self.lin_edge = Linear(edge_dim, heads * out_channels, bias=False)
+        else:
+            self.lin_edge = self.register_parameter("lin_edge", None)
+
+        if concat:
+            self.lin_skip = Linear(in_channels[1], heads * out_channels, bias=bias)
+            if self.beta:
+                self.lin_beta = Linear(3 * heads * out_channels, 1, bias=False)
+            else:
+                self.lin_beta = self.register_parameter("lin_beta", None)
+        else:
+            self.lin_skip = Linear(in_channels[1], out_channels, bias=bias)
+            if self.beta:
+                self.lin_beta = Linear(3 * out_channels, 1, bias=False)
+            else:
+                self.lin_beta = self.register_parameter("lin_beta", None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        self.lin_key.reset_parameters()
+        self.lin_query.reset_parameters()
+        self.lin_value.reset_parameters()
+        if self.lin_edge is not None:
+            self.lin_edge.reset_parameters()
+        self.lin_skip.reset_parameters()
+        if self.lin_beta is not None:
+            self.lin_beta.reset_parameters()
+
+    def forward(
+        self,
+        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        csc: Tuple[torch.Tensor, torch.Tensor, int],
+        edge_attr: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        r"""Runs the forward pass of the module.
+
+        Args:
+            x (torch.Tensor or tuple): The node features. Can be a tuple of
+                tensors denoting source and destination node features.
+            csc ((torch.Tensor, torch.Tensor, int)): A tuple containing the CSC
+                representation of a graph, given as a tuple of
+                :obj:`(row, colptr, num_src_nodes)`. Use the
+                :meth:`to_csc` method to convert an :obj:`edge_index`
+                representation to the desired format.
+            edge_attr: (torch.Tensor, optional) The edge features.
+        """
+        bipartite = not isinstance(x, torch.Tensor)
+        graph = self.get_cugraph(csc, bipartite=bipartite)
+
+        if not bipartite:
+            x = (x, x)
+
+        query = self.lin_query(x[1])
+        key = self.lin_key(x[0])
+        value = self.lin_value(x[0])
+
+        if edge_attr is not None:
+            if self.lin_edge is None:
+                raise RuntimeError(
+                    f"{self.__class__.__name__}.edge_dim must be set to accept "
+                    f"edge features."
+                )
+            edge_attr = self.lin_edge(edge_attr)
+
+        out = TransformerConvAgg(
+            key,
+            query,
+            value,
+            graph,
+            self.heads,
+            self.concat,
+            edge_emb=edge_attr,
+            norm_by_dim=True,
+            score_bias=None,
+        )
+
+        if self.root_weight:
+            x_r = self.lin_skip(x[1])
+            if self.lin_beta is not None:
+                beta = self.lin_beta(torch.cat([out, x_r, out - x_r], dim=-1))
+                beta = beta.sigmoid()
+                out = beta * x_r + (1 - beta) * out
+            else:
+                out = out + x_r
+
+        return out
+
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}({self.in_channels}, "
+            f"{self.out_channels}, heads={self.heads})"
+        )
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py
index 86d01f17d60..2ec68a8b4ac 100644
--- a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py
+++ b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py
@@ -10,11 +10,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from cugraph.utilities.api_tools import experimental_warning_wrapper
-
-from cugraph_pyg.sampler.cugraph_sampler import (
-    EXPERIMENTAL__CuGraphSampler,
-)
-
-CuGraphSampler = experimental_warning_wrapper(EXPERIMENTAL__CuGraphSampler)
diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
index b6ec932abbe..655edd27f65 100644
--- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
+++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py
@@ -11,13 +11,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import cugraph
 
-
-from typing import Tuple, List, Union, Sequence, Dict
+from typing import Sequence
 
 from cugraph_pyg.data import CuGraphStore
-from cugraph_pyg.data.cugraph_store import TensorType
 
 from cugraph.utilities.utils import import_optional, MissingModule
 import cudf
@@ -34,11 +31,63 @@
 )
 
 
+def _count_unique_nodes(
+    sampling_results: cudf.DataFrame,
+    graph_store: CuGraphStore,
+    node_type: str,
+    node_position: str,
+) -> int:
+    """
+    Counts the number of unique nodes of a given node type.
+
+    Parameters
+    ----------
+    sampling_results: cudf.DataFrame
+        The dataframe containing sampling results or filtered sampling results
+        (i.e. sampling results for hop 2)
+    graph_store: CuGraphStore
+        The graph store containing the structure of the sampled graph.
+    node_type: str
+        The node type to count the number of unique nodes of.
+    node_position: str ('src' or 'dst')
+        Whether to examine source or destination nodes.
+
+    Returns
+    -------
+    int
+        The number of unique nodes of the given node type.
+    """
+    if node_position == "src":
+        edge_index = "sources"
+        edge_sel = 0
+    elif node_position == "dst":
+        edge_index = "destinations"
+        edge_sel = -1
+    else:
+        raise ValueError(f"Illegal value {node_position} for node_position")
+
+    etypes = [
+        graph_store.canonical_edge_type_to_numeric(et)
+        for et in graph_store.edge_types
+        if et[edge_sel] == node_type
+    ]
+    if len(etypes) > 0:
+        f = sampling_results.edge_type == etypes[0]
+        for et in etypes[1:]:
+            f |= sampling_results.edge_type == et
+
+        sampling_results_node = sampling_results[f]
+    else:
+        return 0
+
+    return sampling_results_node[edge_index].nunique()
+
+
 def _sampler_output_from_sampling_results(
     sampling_results: cudf.DataFrame,
     graph_store: CuGraphStore,
     metadata: Sequence = None,
-) -> Union[HeteroSamplerOutput, Dict[str, dict]]:
+) -> HeteroSamplerOutput:
     """
     Parameters
     ----------
@@ -51,23 +100,62 @@ def _sampler_output_from_sampling_results(
 
     Returns
     -------
-    HeteroSamplerOutput, if PyG is installed.
-    dict, if PyG is not installed.
+    HeteroSamplerOutput
     """
-    nodes_of_interest = torch.unique(
-        torch.stack(
-            [
-                torch.as_tensor(sampling_results.destinations, device="cuda"),
-                torch.as_tensor(sampling_results.sources, device="cuda"),
-            ]
+
+    hops = torch.arange(sampling_results.hop_id.max() + 1, device="cuda")
+    hops = torch.searchsorted(
+        torch.as_tensor(sampling_results.hop_id.values, device="cuda"), hops
+    )
+
+    num_nodes_per_hop_dict = {}
+    num_edges_per_hop_dict = {}
+
+    # Fill out hop 0 in num_nodes_per_hop_dict, which is based on src instead of dst
+    sampling_results_hop_0 = sampling_results.iloc[
+        0 : (hops[1] if len(hops) > 1 else len(sampling_results))
+    ]
+    for node_type in graph_store.node_types:
+        if len(graph_store.node_types) == 1:
+            num_unique_nodes = sampling_results_hop_0.sources.nunique()
+        else:
+            num_unique_nodes = _count_unique_nodes(
+                sampling_results_hop_0, graph_store, node_type, "src"
+            )
+
+        if num_unique_nodes > 0:
+            num_nodes_per_hop_dict[node_type] = torch.zeros(
+                len(hops) + 1, dtype=torch.int64
+            )
+            num_nodes_per_hop_dict[node_type][0] = num_unique_nodes
+
+    # Calculate nodes of interest based on unique nodes in order of appearance
+    # Use hop 0 sources since those are the only ones not included in destinations
+    # Use torch.concat based on benchmark performance (vs. cudf.concat)
+    nodes_of_interest = (
+        cudf.Series(
+            torch.concat(
+                [
+                    torch.as_tensor(
+                        sampling_results_hop_0.sources.values, device="cuda"
+                    ),
+                    torch.as_tensor(
+                        sampling_results.destinations.values, device="cuda"
+                    ),
+                ]
+            ),
+            name="nodes_of_interest",
         )
+        .drop_duplicates()
+        .sort_index()
     )
-    # unique will always sort this array
+    del sampling_results_hop_0
 
     # Get the grouped node index (for creating the renumbered grouped edge index)
     noi_index = graph_store._get_vertex_groups_from_sample(
-        nodes_of_interest, is_sorted=True
+        torch.as_tensor(nodes_of_interest.values, device="cuda")
     )
+    del nodes_of_interest
 
     # Get the new edge index (by type as expected for HeteroData)
     # FIXME handle edge ids/types after the C++ updates
@@ -75,136 +163,59 @@ def _sampler_output_from_sampling_results(
         sampling_results, noi_index
     )
 
-    out = (noi_index, row_dict, col_dict, None)
-
-    # FIXME no longer allow torch_geometric to be missing.
-    if isinstance(torch_geometric, MissingModule):
-        return {"out": out, "metadata": metadata}
-    else:
-        return HeteroSamplerOutput(*out, metadata=metadata)
-
-
-class EXPERIMENTAL__CuGraphSampler:
-    """
-    Duck-typed version of PyG's BaseSampler
-    """
-
-    UNIFORM_NEIGHBOR = "uniform_neighbor"
-    SAMPLING_METHODS = [
-        UNIFORM_NEIGHBOR,
-    ]
-
-    def __init__(
-        self,
-        data: Tuple[CuGraphStore, CuGraphStore],
-        method: str = UNIFORM_NEIGHBOR,
-        **kwargs,
-    ):
-        if method not in self.SAMPLING_METHODS:
-            raise ValueError(f"{method} is not a valid sampling method")
-        self.__method = method
-        self.__sampling_args = kwargs
-
-        fs, gs = data
-        self.__feature_store = fs
-        self.__graph_store = gs
-
-    # FIXME Make HeteroSamplerOutput the only return type
-    # after PyG becomes a hard requirement
-    def sample_from_nodes(
-        self, sampler_input: Tuple[TensorType, TensorType, TensorType]
-    ) -> Union[HeteroSamplerOutput, dict]:
-        """
-        Sample nodes using this CuGraphSampler's sampling method
-        (which is set at initialization)
-        and the input node data passed to this function.  Matches
-        the interface provided by PyG's NodeSamplerInput.
-
-        Parameters
-        ----------
-        sampler_input: tuple(index, input_nodes, input_time)
-            index: The sample indices to store as metadata
-            input_nodes: Input nodes to pass to the sampler
-            input_time: Node timestamps (if performing temporal
-            sampling which is currently not supported)
-
-        Returns
-        -------
-        HeteroSamplerOutput, if PyG is installed.
-        dict, if PyG is not installed.
-        """
-        index, input_nodes, input_time = sampler_input
-
-        if input_time is not None:
-            raise ValueError("Temporal sampling is currently unsupported in cuGraph")
-
-        if self.__method == self.UNIFORM_NEIGHBOR:
-            return self.__neighbor_sample(
-                input_nodes, **self.__sampling_args, metadata=index
+    for hop in range(len(hops)):
+        hop_ix_start = hops[hop]
+        hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results)
+        sampling_results_hop = sampling_results.iloc[hop_ix_start:hop_ix_end]
+
+        for node_type in graph_store.node_types:
+            if len(graph_store.node_types) == 1:
+                num_unique_nodes = sampling_results_hop.destinations.nunique()
+            else:
+                num_unique_nodes = _count_unique_nodes(
+                    sampling_results_hop, graph_store, node_type, "dst"
+                )
+
+            if num_unique_nodes > 0:
+                if node_type not in num_nodes_per_hop_dict:
+                    num_nodes_per_hop_dict[node_type] = torch.zeros(
+                        len(hops) + 1, dtype=torch.int64
+                    )
+                num_nodes_per_hop_dict[node_type][hop + 1] = num_unique_nodes
+
+        if len(graph_store.edge_types) == 1:
+            edge_type = graph_store.edge_types[0]
+            if edge_type not in num_edges_per_hop_dict:
+                num_edges_per_hop_dict[edge_type] = torch.zeros(
+                    len(hops), dtype=torch.int64
+                )
+            num_edges_per_hop_dict[graph_store.edge_types[0]][hop] = len(
+                sampling_results_hop
             )
-
-    def sample_from_edges(self, index):
-        raise NotImplementedError("Edge sampling currently unsupported")
-
-    @property
-    def method(self) -> str:
-        return self.__method
-
-    @property
-    def edge_permutation(self):
-        return None
-
-    """
-    SAMPLER IMPLEMENTATIONS
-    """
-
-    def __neighbor_sample(
-        self,
-        index: TensorType,
-        num_neighbors: List[int],
-        replace: bool = True,
-        directed: bool = True,
-        edge_types: List[str] = None,
-        metadata=None,
-        **kwargs,
-    ) -> Union[dict, HeteroSamplerOutput]:
-        if not directed:
-            raise ValueError("Undirected sampling not currently supported")
-
-        if edge_types is None:
-            edge_types = [
-                attr.edge_type for attr in self.__graph_store.get_all_edge_attrs()
-            ]
-
-        if isinstance(num_neighbors, dict):
-            # FIXME support variable num neighbors per edge type
-            num_neighbors = list(num_neighbors.values())[0]
-
-        if not index.is_cuda:
-            index = index.cuda()
-
-        G = self.__graph_store._subgraph(edge_types)
-
-        index = cudf.Series(index)
-
-        sample_fn = (
-            cugraph.dask.uniform_neighbor_sample
-            if self.__graph_store._is_delayed
-            else cugraph.uniform_neighbor_sample
-        )
-
-        sampling_results = sample_fn(
-            G,
-            index,
-            # conversion required by cugraph api
-            list(num_neighbors),
-            replace,
-            with_edge_properties=True,
-        )
-
-        if self.__graph_store._is_delayed:
-            sampling_results = sampling_results.compute()
-
-        return _sampler_output_from_sampling_results(
-            sampling_results, self.__graph_store, metadata
-        )
+        else:
+            numeric_etypes, counts = torch.unique(
+                torch.as_tensor(sampling_results_hop.edge_type.values, device="cuda"),
+                return_counts=True,
+            )
+            numeric_etypes = list(numeric_etypes)
+            counts = list(counts)
+            for num_etype, count in zip(numeric_etypes, counts):
+                can_etype = graph_store.numeric_edge_type_to_canonical(num_etype)
+                if can_etype not in num_edges_per_hop_dict:
+                    num_edges_per_hop_dict[can_etype] = torch.zeros(
+                        len(hops), dtype=torch.int64
+                    )
+                num_edges_per_hop_dict[can_etype][hop] = count
+
+    if HeteroSamplerOutput is None:
+        raise ImportError("Error importing from pyg")
+
+    return HeteroSamplerOutput(
+        node=noi_index,
+        row=row_dict,
+        col=col_dict,
+        edge=None,
+        num_sampled_nodes=num_nodes_per_hop_dict,
+        num_sampled_edges=num_edges_per_hop_dict,
+        metadata=metadata,
+    )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
index 2aa6221cc3f..3270dd0bf93 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
@@ -23,7 +23,6 @@
 
 import torch
 import numpy as np
-import cudf
 from cugraph.gnn import FeatureStore
 from cugraph.experimental.datasets import karate
 
@@ -79,7 +78,7 @@ def karate_gnn():
     el = karate.get_edgelist().reset_index(drop=True)
     el.src = el.src.astype("int64")
     el.dst = el.dst.astype("int64")
-    all_vertices = np.array_split(cudf.concat([el.src, el.dst]).unique().values_host, 2)
+    all_vertices = np.array_split(np.arange(34), 2)
 
     F = FeatureStore(backend="torch")
     F.add_data(
@@ -103,16 +102,16 @@ def karate_gnn():
     G = {
         ("type0", "et01", "type1"): el[
             el.src.isin(all_vertices[0]) & el.dst.isin(all_vertices[1])
-        ],
+        ].reset_index(drop=True),
         ("type1", "et10", "type0"): el[
             el.src.isin(all_vertices[1]) & el.dst.isin(all_vertices[0])
-        ],
+        ].reset_index(drop=True),
         ("type0", "et00", "type0"): el[
             el.src.isin(all_vertices[0]) & el.dst.isin(all_vertices[0])
         ],
         ("type1", "et11", "type1"): el[
             el.src.isin(all_vertices[1]) & el.dst.isin(all_vertices[1])
-        ],
+        ].reset_index(drop=True),
     }
 
     G = {
@@ -232,3 +231,37 @@ def multi_edge_multi_vertex_no_graph_1():
     F.add_data(np.array([2, 1]), type_name="black", feat_name="prop2")
 
     return F, G, N
+
+
+@pytest.fixture
+def abc_graph():
+    N = {
+        "A": 2,  # 0, 1
+        "B": 3,  # 2, 3, 4
+        "C": 4,  # 5, 6, 7, 8
+    }
+
+    G = {
+        # (0->2, 0->3, 1->3)
+        ("A", "ab", "B"): [
+            torch.tensor([0, 0, 1], dtype=torch.int64),
+            torch.tensor([0, 1, 1], dtype=torch.int64),
+        ],
+        # (2->0, 2->1, 3->1, 4->0)
+        ("B", "ba", "A"): [
+            torch.tensor([0, 0, 1, 2], dtype=torch.int64),
+            torch.tensor([0, 1, 1, 0], dtype=torch.int64),
+        ],
+        # (2->6, 2->8, 3->5, 3->7, 4->5, 4->8)
+        ("B", "bc", "C"): [
+            torch.tensor([0, 0, 1, 1, 2, 2], dtype=torch.int64),
+            torch.tensor([1, 3, 0, 2, 0, 3], dtype=torch.int64),
+        ],
+    }
+
+    F = FeatureStore()
+    F.add_data(
+        torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1"
+    )
+
+    return F, G, N
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py
index 5a043acd300..e29f3aea512 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py
@@ -47,3 +47,30 @@ def test_cugraph_loader_basic(dask_client, karate_gnn):
         if "type1" in sample:
             for prop in sample["type1"]["prop0"].tolist():
                 assert prop % 41 == 0
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+def test_cugraph_loader_hetero(dask_client, karate_gnn):
+    F, G, N = karate_gnn
+    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+    loader = CuGraphNeighborLoader(
+        (cugraph_store, cugraph_store),
+        input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")),
+        batch_size=2,
+        num_neighbors=[4, 4],
+        random_state=62,
+        replace=False,
+    )
+
+    samples = [s for s in loader]
+
+    assert len(samples) == 2
+    for sample in samples:
+        print(sample)
+        if "type0" in sample:
+            for prop in sample["type0"]["prop0"].tolist():
+                assert prop % 31 == 0
+
+        if "type1" in sample:
+            for prop in sample["type1"]["prop0"].tolist():
+                assert prop % 41 == 0
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py
index 66dfa89aece..a553a5ec624 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py
@@ -11,141 +11,200 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cugraph_pyg.sampler import CuGraphSampler
-
 import cudf
 import cupy
 
 import pytest
 
 from cugraph_pyg.data import CuGraphStore
+from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results
+
+from cugraph.gnn import FeatureStore
 
 from cugraph.utilities.utils import import_optional, MissingModule
+from cugraph.dask import uniform_neighbor_sample
 
 torch = import_optional("torch")
 
 
 @pytest.mark.cugraph_ops
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-def test_neighbor_sample(basic_graph_1, dask_client):
+def test_neighbor_sample(dask_client, basic_graph_1):
     F, G, N = basic_graph_1
     cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
 
-    sampler = CuGraphSampler(
-        (cugraph_store, cugraph_store),
-        num_neighbors=[-1],
-        replace=True,
-        directed=True,
-        edge_types=[v.edge_type for v in cugraph_store._edge_types_to_attrs.values()],
+    sampling_results = (
+        uniform_neighbor_sample(
+            cugraph_store._subgraph(),
+            cudf.Series([0, 1, 2, 3, 4], dtype="int64"),
+            fanout_vals=[-1],
+            with_replacement=False,
+            with_edge_properties=True,
+            batch_id_list=cudf.Series(cupy.zeros(5, dtype="int32")),
+            random_state=62,
+            return_offsets=False,
+        )
+        .sort_values(by=["sources", "destinations"])
+        .compute()
     )
 
-    out_dict = sampler.sample_from_nodes(
-        (
-            torch.arange(6, dtype=torch.int64),
-            torch.tensor([0, 1, 2, 3, 4], dtype=torch.int64),
-            None,
-        )
+    out = _sampler_output_from_sampling_results(
+        sampling_results=sampling_results,
+        graph_store=cugraph_store,
+        metadata=torch.arange(6, dtype=torch.int64),
     )
 
-    if isinstance(out_dict, dict):
-        noi_groups, row_dict, col_dict, _ = out_dict["out"]
-        metadata = out_dict["metadata"]
-    else:
-        noi_groups = out_dict.node
-        row_dict = out_dict.row
-        col_dict = out_dict.col
-        metadata = out_dict.metadata
+    noi_groups = out.node
+    row_dict = out.row
+    col_dict = out.col
+    metadata = out.metadata
 
     assert metadata.tolist() == list(range(6))
 
     for node_type, node_ids in noi_groups.items():
         actual_vertex_ids = torch.arange(N[node_type])
 
-        assert node_ids.tolist() == actual_vertex_ids.tolist()
+        assert sorted(node_ids.tolist()) == actual_vertex_ids.tolist()
 
-    for edge_type, ei in G.items():
-        expected_df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(ei[0]),
-                "dst": cupy.asarray(ei[1]),
-            }
-        )
+    assert (
+        row_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][0].tolist()
+    )
+    assert (
+        col_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][1].tolist()
+    )
 
-        results_df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(row_dict[edge_type]),
-                "dst": cupy.asarray(col_dict[edge_type]),
-            }
-        )
+    # check the hop dictionaries
+    assert len(out.num_sampled_nodes) == 1
+    assert out.num_sampled_nodes["vt1"].tolist() == [4, 4]
 
-        expected_df = expected_df.drop_duplicates().sort_values(by=["src", "dst"])
-        results_df = results_df.drop_duplicates().sort_values(by=["src", "dst"])
-        assert (
-            expected_df.src.values_host.tolist() == results_df.src.values_host.tolist()
-        )
-        assert (
-            expected_df.dst.values_host.tolist() == results_df.dst.values_host.tolist()
-        )
+    assert len(out.num_sampled_edges) == 1
+    assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6]
 
 
 @pytest.mark.cugraph_ops
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1, dask_client):
+def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph_1):
     F, G, N = multi_edge_multi_vertex_graph_1
     cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
 
-    sampler = CuGraphSampler(
-        (cugraph_store, cugraph_store),
-        num_neighbors=[-1],
-        replace=True,
-        directed=True,
-        edge_types=[v.edge_type for v in cugraph_store._edge_types_to_attrs.values()],
+    sampling_results = (
+        uniform_neighbor_sample(
+            cugraph_store._subgraph(),
+            cudf.Series([0, 1, 2, 3, 4], dtype="int64"),
+            fanout_vals=[-1],
+            with_replacement=False,
+            with_edge_properties=True,
+            batch_id_list=cudf.Series(cupy.zeros(5, dtype="int32")),
+            random_state=62,
+            return_offsets=False,
+        )
+        .sort_values(by=["sources", "destinations"])
+        .compute()
     )
 
-    out_dict = sampler.sample_from_nodes(
-        (
-            torch.arange(6, dtype=torch.int64),
-            torch.tensor([0, 1, 2, 3, 4], dtype=torch.int64),
-            None,
-        )
+    out = _sampler_output_from_sampling_results(
+        sampling_results=sampling_results,
+        graph_store=cugraph_store,
+        metadata=torch.arange(6, dtype=torch.int64),
     )
 
-    if isinstance(out_dict, dict):
-        noi_groups, row_dict, col_dict, _ = out_dict["out"]
-        metadata = out_dict["metadata"]
-    else:
-        noi_groups = out_dict.node
-        row_dict = out_dict.row
-        col_dict = out_dict.col
-        metadata = out_dict.metadata
+    noi_groups = out.node
+    row_dict = out.row
+    col_dict = out.col
+    metadata = out.metadata
 
     assert metadata.tolist() == list(range(6))
 
     for node_type, node_ids in noi_groups.items():
         actual_vertex_ids = torch.arange(N[node_type])
 
-        assert node_ids.tolist() == actual_vertex_ids.tolist()
+        assert node_ids.tolist() == sorted(actual_vertex_ids.tolist())
 
     for edge_type, ei in G.items():
-        expected_df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(ei[0]),
-                "dst": cupy.asarray(ei[1]),
-            }
-        )
+        assert sorted(row_dict[edge_type].tolist()) == sorted(ei[0].tolist())
+        assert sorted(col_dict[edge_type].tolist()) == sorted(ei[1].tolist())
 
-        results_df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(row_dict[edge_type]),
-                "dst": cupy.asarray(col_dict[edge_type]),
-            }
-        )
+    # check the hop dictionaries
+    assert len(out.num_sampled_nodes) == 2
+    assert out.num_sampled_nodes["black"].tolist() == [2, 2]
+    assert out.num_sampled_nodes["brown"].tolist() == [3, 2]
 
-        expected_df = expected_df.drop_duplicates().sort_values(by=["src", "dst"])
-        results_df = results_df.drop_duplicates().sort_values(by=["src", "dst"])
-        assert (
-            expected_df.src.values_host.tolist() == results_df.src.values_host.tolist()
-        )
-        assert (
-            expected_df.dst.values_host.tolist() == results_df.dst.values_host.tolist()
-        )
+    assert len(out.num_sampled_edges) == 5
+    assert out.num_sampled_edges[("brown", "horse", "brown")].tolist() == [2]
+    assert out.num_sampled_edges[("brown", "tortoise", "black")].tolist() == [3]
+    assert out.num_sampled_edges[("brown", "mongoose", "black")].tolist() == [2]
+    assert out.num_sampled_edges[("black", "cow", "brown")].tolist() == [2]
+    assert out.num_sampled_edges[("black", "snake", "black")].tolist() == [1]
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+def test_neighbor_sample_mock_sampling_results(dask_client):
+    N = {
+        "A": 2,  # 0, 1
+        "B": 3,  # 2, 3, 4
+        "C": 4,  # 5, 6, 7, 8
+    }
+
+    G = {
+        # (0->2, 0->3, 1->3)
+        ("A", "ab", "B"): [
+            torch.tensor([0, 0, 1], dtype=torch.int64),
+            torch.tensor([0, 1, 1], dtype=torch.int64),
+        ],
+        # (2->0, 2->1, 3->1, 4->0)
+        ("B", "ba", "A"): [
+            torch.tensor([0, 0, 1, 2], dtype=torch.int64),
+            torch.tensor([0, 1, 1, 0], dtype=torch.int64),
+        ],
+        # (2->6, 2->8, 3->5, 3->7, 4->5, 4->8)
+        ("B", "bc", "C"): [
+            torch.tensor([0, 0, 1, 1, 2, 2], dtype=torch.int64),
+            torch.tensor([1, 3, 0, 2, 0, 3], dtype=torch.int64),
+        ],
+    }
+
+    F = FeatureStore()
+    F.add_data(
+        torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1"
+    )
+
+    graph_store = CuGraphStore(F, G, N, multi_gpu=True)
+
+    # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
+    mock_sampling_results = cudf.DataFrame(
+        {
+            "sources": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"),
+            "destinations": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"),
+            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"),
+            "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"),
+        }
+    )
+
+    out = _sampler_output_from_sampling_results(
+        mock_sampling_results, graph_store, None
+    )
+
+    assert out.metadata is None
+    assert len(out.node) == 3
+    assert out.node["A"].tolist() == [0, 1]
+    assert out.node["B"].tolist() == [0, 1]
+    assert out.node["C"].tolist() == [3, 2, 0]
+
+    assert len(out.row) == 3
+    assert len(out.col) == 3
+    assert out.row[("A", "ab", "B")].tolist() == [0, 0, 1, 1]
+    assert out.col[("A", "ab", "B")].tolist() == [0, 1, 1, 1]
+    assert out.row[("B", "bc", "C")].tolist() == [0, 1, 1, 1]
+    assert out.col[("B", "bc", "C")].tolist() == [0, 1, 2, 1]
+    assert out.row[("B", "ba", "A")].tolist() == [1, 1]
+    assert out.col[("B", "ba", "A")].tolist() == [1, 1]
+
+    assert len(out.num_sampled_nodes) == 3
+    assert out.num_sampled_nodes["A"].tolist() == [2, 0, 1, 0, 1]
+    assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 1, 0]
+    assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 2]
+
+    assert len(out.num_sampled_edges) == 3
+    assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0]
+    assert out.num_sampled_edges[("B", "ba", "A")].tolist() == [0, 1, 0, 1]
+    assert out.num_sampled_edges[("B", "bc", "C")].tolist() == [0, 2, 0, 2]
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py
index c09a311bd66..a5a59623710 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py
@@ -24,14 +24,13 @@
 import cupy
 import numpy as np
 
-from random import randint
-
 from cugraph.utilities.utils import import_optional, MissingModule
 
 import pytest
 
 
 torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
@@ -152,7 +151,10 @@ def test_edge_types(graph, dask_client):
     assert eta.keys() == G.keys()
 
     for attr_name, attr_repr in eta.items():
-        assert len(G[attr_name][0]) == attr_repr.size[-1]
+        src_size = N[attr_name[0]]
+        dst_size = N[attr_name[-1]]
+        assert src_size == attr_repr.size[0]
+        assert dst_size == attr_repr.size[-1]
         assert attr_name == attr_repr.edge_type
 
 
@@ -184,7 +186,7 @@ def test_renumber_vertices_basic(single_vertex_graph, dask_client):
     )
 
     index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest)
-    assert index["vt1"].tolist() == sorted(nodes_of_interest.tolist())
+    assert index["vt1"].tolist() == nodes_of_interest.tolist()
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
@@ -210,84 +212,39 @@ def test_renumber_vertices_multi_edge_multi_vertex(
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-def test_renumber_edges(graph, dask_client):
-    """
-    FIXME this test is not very good and should be replaced,
-    probably with a test that uses known good values.
-    """
-
-    F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
-
-    v_offsets = [N[v] for v in sorted(N.keys())]
-    v_offsets = np.array(v_offsets)
-
-    cumsum = v_offsets.cumsum(0)
-    v_offsets = cumsum - v_offsets
-    v_offsets = {k: int(v_offsets[i]) for i, k in enumerate(sorted(N.keys()))}
-
-    e_num = {
-        pyg_can_edge_type: i for i, pyg_can_edge_type in enumerate(sorted(G.keys()))
-    }
+def test_renumber_edges(abc_graph, dask_client):
+    F, G, N = abc_graph
 
-    eoi_src = np.array([], dtype="int64")
-    eoi_dst = np.array([], dtype="int64")
-    eoi_type = np.array([], dtype="int32")
-    for pyg_can_edge_type, ei in G.items():
-        src_type, _, dst_type = pyg_can_edge_type
-
-        c = randint(0, len(ei[0]))  # number to select
-        sel = np.random.randint(0, len(ei[0]), c)
-
-        src_i = np.array(ei[0][sel]) + v_offsets[src_type]
-        dst_i = np.array(ei[1][sel]) + v_offsets[dst_type]
-        eoi_src = np.concatenate([eoi_src, src_i])
-        eoi_dst = np.concatenate([eoi_dst, dst_i])
-        eoi_type = np.concatenate([eoi_type, np.array([e_num[pyg_can_edge_type]] * c)])
-
-    nodes_of_interest, _ = torch.sort(
-        torch.as_tensor(
-            np.unique(np.concatenate([eoi_src, eoi_dst])),
-        ).cuda()
-    )
-
-    noi_index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest)
+    graph_store = CuGraphStore(F, G, N, multi_gpu=True)
 
-    sdf = cudf.DataFrame(
+    # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
+    mock_sampling_results = cudf.DataFrame(
         {
-            "sources": eoi_src,
-            "destinations": eoi_dst,
-            "edge_type": eoi_type,
+            "sources": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"),
+            "destinations": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"),
+            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"),
+            "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"),
         }
-    ).reset_index(drop=True)
-
-    row, col = cugraph_store._get_renumbered_edge_groups_from_sample(sdf, noi_index)
-
-    for pyg_can_edge_type in G:
-        df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(G[pyg_can_edge_type][0]),
-                "dst": cupy.asarray(G[pyg_can_edge_type][1]),
-            }
-        )
-
-        G[pyg_can_edge_type] = df
+    )
 
-    for pyg_can_edge_type in row:
-        stype, _, dtype = pyg_can_edge_type
-        src = noi_index[stype][row[pyg_can_edge_type]]
-        dst = noi_index[dtype][col[pyg_can_edge_type]]
-        assert len(src) == len(dst)
+    mock_noi_index = {
+        "A": torch.tensor([0, 1], device="cuda"),
+        "B": torch.tensor([0, 1], device="cuda"),
+        "C": torch.tensor([3, 2, 0], device="cuda"),
+    }
 
-        for i in range(len(src)):
-            src_i = int(src[i])
-            dst_i = int(dst[i])
+    row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample(
+        mock_sampling_results, mock_noi_index
+    )
 
-            df = G[pyg_can_edge_type]
-            df = df[df.src == src_i]
-            df = df[df.dst == dst_i]
-            # Ensure only 1 entry matches
-            assert len(df) == 1
+    assert len(row_dict) == 3
+    assert len(col_dict) == 3
+    assert row_dict[("A", "ab", "B")].tolist() == [0, 0, 1, 1]
+    assert col_dict[("A", "ab", "B")].tolist() == [0, 1, 1, 1]
+    assert row_dict[("B", "bc", "C")].tolist() == [0, 1, 1, 1]
+    assert col_dict[("B", "bc", "C")].tolist() == [0, 1, 2, 1]
+    assert row_dict[("B", "ba", "A")].tolist() == [1, 1]
+    assert col_dict[("B", "ba", "A")].tolist() == [1, 1]
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
@@ -311,6 +268,17 @@ def test_get_tensor(graph, dask_client):
             assert tsr == base_series
 
 
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+def test_get_tensor_empty_idx(karate_gnn, dask_client):
+    F, G, N = karate_gnn
+    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+
+    t = cugraph_store.get_tensor(
+        CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None)
+    )
+    assert t.tolist() == (torch.arange(17, dtype=torch.float32) * 31).tolist()
+
+
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
 def test_multi_get_tensor(graph, dask_client):
     F, G, N = graph
@@ -397,6 +365,22 @@ def test_get_tensor_size(graph, dask_client):
         assert cugraph_store.get_tensor_size(tensor_attr) == torch.Size((sz,))
 
 
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.skipif(
+    isinstance(torch_geometric, MissingModule), reason="pyg not available"
+)
+def test_get_input_nodes(karate_gnn, dask_client):
+    F, G, N = karate_gnn
+    cugraph_store = CuGraphStore(F, G, N, multi_gpu=True)
+
+    node_type, input_nodes = torch_geometric.loader.utils.get_input_nodes(
+        (cugraph_store, cugraph_store), "type0"
+    )
+
+    assert node_type == "type0"
+    assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist()
+
+
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
 def test_mg_frame_handle(graph, dask_client):
     F, G, N = graph
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py
new file mode 100644
index 00000000000..ae5fd73c438
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+try:
+    from torch_geometric.nn import GATConv
+except ModuleNotFoundError:
+    pytest.skip("PyG not available", allow_module_level=True)
+
+from cugraph.utilities.utils import import_optional
+from cugraph_pyg.nn import GATConv as CuGraphGATConv
+
+torch = import_optional("torch")
+
+
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("bipartite", [True, False])
+@pytest.mark.parametrize("concat", [True, False])
+@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
+@pytest.mark.parametrize("max_num_neighbors", [8, None])
+@pytest.mark.parametrize("use_edge_attr", [True, False])
+def test_gat_conv_equality(
+    bias, bipartite, concat, heads, max_num_neighbors, use_edge_attr
+):
+    atol = 1e-6
+    edge_index = torch.tensor(
+        [
+            [7, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 8, 9],
+            [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7],
+        ],
+    ).cuda()
+    size = (10, 10)
+
+    if bipartite:
+        in_channels = (5, 3)
+        x = (
+            torch.rand(size[0], in_channels[0]).cuda(),
+            torch.rand(size[1], in_channels[1]).cuda(),
+        )
+    else:
+        in_channels = 5
+        x = torch.rand(size[0], in_channels).cuda()
+    out_channels = 2
+
+    if use_edge_attr:
+        edge_dim = 3
+        edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda()
+        csc, edge_attr_perm = CuGraphGATConv.to_csc(
+            edge_index, size, edge_attr=edge_attr
+        )
+    else:
+        edge_dim = None
+        edge_attr = edge_attr_perm = None
+        csc = CuGraphGATConv.to_csc(edge_index, size)
+
+    kwargs = dict(bias=bias, concat=concat, edge_dim=edge_dim)
+
+    conv1 = GATConv(
+        in_channels, out_channels, heads, add_self_loops=False, **kwargs
+    ).cuda()
+    conv2 = CuGraphGATConv(in_channels, out_channels, heads, **kwargs).cuda()
+
+    out_dim = heads * out_channels
+    with torch.no_grad():
+        if bipartite:
+            conv2.lin_src.weight.data = conv1.lin_src.weight.data.detach().clone()
+            conv2.lin_dst.weight.data = conv1.lin_dst.weight.data.detach().clone()
+        else:
+            conv2.lin.weight.data = conv1.lin_src.weight.data.detach().clone()
+
+        conv2.att.data[:out_dim] = conv1.att_src.data.flatten()
+        conv2.att.data[out_dim : 2 * out_dim] = conv1.att_dst.data.flatten()
+        if use_edge_attr:
+            conv2.att.data[2 * out_dim :] = conv1.att_edge.data.flatten()
+            conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone()
+
+    out1 = conv1(x, edge_index, edge_attr=edge_attr)
+    out2 = conv2(x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors)
+    assert torch.allclose(out1, out2, atol=atol)
+
+    grad_output = torch.rand_like(out1)
+    out1.backward(grad_output)
+    out2.backward(grad_output)
+
+    if bipartite:
+        assert torch.allclose(
+            conv1.lin_src.weight.grad, conv2.lin_src.weight.grad, atol=atol
+        )
+        assert torch.allclose(
+            conv1.lin_dst.weight.grad, conv2.lin_dst.weight.grad, atol=atol
+        )
+    else:
+        assert torch.allclose(
+            conv1.lin_src.weight.grad, conv2.lin.weight.grad, atol=atol
+        )
+
+    assert torch.allclose(
+        conv1.att_src.grad.flatten(), conv2.att.grad[:out_dim], atol=atol
+    )
+    assert torch.allclose(
+        conv1.att_dst.grad.flatten(), conv2.att.grad[out_dim : 2 * out_dim], atol=atol
+    )
+
+    if use_edge_attr:
+        assert torch.allclose(
+            conv1.att_edge.grad.flatten(), conv2.att.grad[2 * out_dim :], atol=atol
+        )
+        assert torch.allclose(
+            conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=atol
+        )
+
+    if bias:
+        assert torch.allclose(conv1.bias.grad, conv2.bias.grad, atol=atol)
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
new file mode 100644
index 00000000000..1c4f241304e
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+try:
+    from torch_geometric.nn import GATv2Conv
+except ModuleNotFoundError:
+    pytest.skip("PyG not available", allow_module_level=True)
+
+from cugraph.utilities.utils import import_optional
+from cugraph_pyg.nn import GATv2Conv as CuGraphGATv2Conv
+
+torch = import_optional("torch")
+
+
+@pytest.mark.parametrize("bipartite", [True, False])
+@pytest.mark.parametrize("concat", [True, False])
+@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
+@pytest.mark.parametrize("use_edge_attr", [True, False])
+def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr):
+    atol = 1e-6
+    edge_index = torch.tensor(
+        [
+            [7, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 8, 9],
+            [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7],
+        ],
+    ).cuda()
+    size = (10, 10)
+
+    if bipartite:
+        in_channels = (5, 3)
+        x = (
+            torch.rand(size[0], in_channels[0]).cuda(),
+            torch.rand(size[1], in_channels[1]).cuda(),
+        )
+    else:
+        in_channels = 5
+        x = torch.rand(size[0], in_channels).cuda()
+    out_channels = 2
+
+    if use_edge_attr:
+        edge_dim = 3
+        edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda()
+        csc, edge_attr_perm = CuGraphGATv2Conv.to_csc(
+            edge_index, size, edge_attr=edge_attr
+        )
+    else:
+        edge_dim = None
+        edge_attr = edge_attr_perm = None
+        csc = CuGraphGATv2Conv.to_csc(edge_index, size)
+
+    kwargs = dict(bias=False, concat=concat, edge_dim=edge_dim)
+
+    conv1 = GATv2Conv(
+        in_channels, out_channels, heads, add_self_loops=False, **kwargs
+    ).cuda()
+    conv2 = CuGraphGATv2Conv(in_channels, out_channels, heads, **kwargs).cuda()
+
+    with torch.no_grad():
+        conv2.lin_src.weight.data = conv1.lin_l.weight.data.detach().clone()
+        conv2.lin_dst.weight.data = conv1.lin_r.weight.data.detach().clone()
+
+        conv2.att.data = conv1.att.data.flatten().detach().clone()
+
+        if use_edge_attr:
+            conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone()
+
+    out1 = conv1(x, edge_index, edge_attr=edge_attr)
+    out2 = conv2(x, csc, edge_attr=edge_attr_perm)
+    assert torch.allclose(out1, out2, atol=atol)
+
+    grad_output = torch.rand_like(out1)
+    out1.backward(grad_output)
+    out2.backward(grad_output)
+
+    assert torch.allclose(conv1.lin_l.weight.grad, conv2.lin_src.weight.grad, atol=atol)
+    assert torch.allclose(conv1.lin_r.weight.grad, conv2.lin_dst.weight.grad, atol=atol)
+
+    assert torch.allclose(conv1.att.grad.flatten(), conv2.att.grad, atol=atol)
+
+    if use_edge_attr:
+        assert torch.allclose(
+            conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=atol
+        )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
new file mode 100644
index 00000000000..a2153ee7891
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+try:
+    from torch_geometric.nn import TransformerConv
+except ModuleNotFoundError:
+    pytest.skip("PyG not available", allow_module_level=True)
+
+from cugraph.utilities.utils import import_optional
+from cugraph_pyg.nn import TransformerConv as CuGraphTransformerConv
+
+torch = import_optional("torch")
+
+
+@pytest.mark.parametrize("bipartite", [True, False])
+@pytest.mark.parametrize("concat", [True, False])
+@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
+def test_transformer_conv_equality(bipartite, concat, heads):
+    out_channels = 2
+    size = (10, 10)
+    kwargs = dict(concat=concat, bias=False, root_weight=False)
+
+    if bipartite:
+        in_channels = (5, 3)
+        x = (
+            torch.rand(size[0], in_channels[0], device="cuda"),
+            torch.rand(size[1], in_channels[1], device="cuda"),
+        )
+    else:
+        in_channels = 5
+        x = torch.rand(size[0], in_channels, device="cuda")
+
+    edge_index = torch.tensor(
+        [
+            [7, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 8, 9, 3, 4, 5],
+            [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 6],
+        ],
+        device="cuda",
+    )
+
+    conv1 = TransformerConv(in_channels, out_channels, heads, **kwargs).cuda()
+    conv2 = CuGraphTransformerConv(in_channels, out_channels, heads, **kwargs).cuda()
+
+    with torch.no_grad():
+        conv2.lin_query.weight.data = conv1.lin_query.weight.data.detach().clone()
+        conv2.lin_key.weight.data = conv1.lin_key.weight.data.detach().clone()
+        conv2.lin_value.weight.data = conv1.lin_value.weight.data.detach().clone()
+        conv2.lin_query.bias.data = conv1.lin_query.bias.data.detach().clone()
+        conv2.lin_key.bias.data = conv1.lin_key.bias.data.detach().clone()
+        conv2.lin_value.bias.data = conv1.lin_value.bias.data.detach().clone()
+
+    out1 = conv1(x, edge_index)
+    csc = CuGraphTransformerConv.to_csc(edge_index, size)
+    out2 = conv2(x, csc)
+
+    atol = 1e-6
+
+    assert torch.allclose(out1, out2, atol=atol)
+
+    grad_output = torch.rand_like(out1)
+    out1.backward(grad_output)
+    out2.backward(grad_output)
+
+    assert torch.allclose(
+        conv1.lin_query.weight.grad, conv2.lin_query.weight.grad, atol=atol
+    )
+    assert torch.allclose(
+        conv1.lin_key.weight.grad, conv2.lin_key.weight.grad, atol=atol
+    )
+    assert torch.allclose(
+        conv1.lin_value.weight.grad, conv2.lin_value.weight.grad, atol=atol
+    )
+    assert torch.allclose(
+        conv1.lin_query.bias.grad, conv2.lin_query.bias.grad, atol=atol
+    )
+    assert torch.allclose(conv1.lin_key.bias.grad, conv2.lin_key.bias.grad, atol=atol)
+    assert torch.allclose(
+        conv1.lin_value.bias.grad, conv2.lin_value.bias.grad, atol=atol
+    )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
index 0eae6e08a0d..e0a943aeca3 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py
@@ -13,8 +13,16 @@
 
 import pytest
 
+import tempfile
+import os
+
+import cudf
+import cupy
+
 from cugraph_pyg.loader import CuGraphNeighborLoader
+from cugraph_pyg.loader import BulkSampleLoader
 from cugraph_pyg.data import CuGraphStore
+from cugraph.gnn import FeatureStore
 from cugraph.utilities.utils import import_optional, MissingModule
 
 torch = import_optional("torch")
@@ -44,3 +52,114 @@ def test_cugraph_loader_basic(karate_gnn):
         if "type1" in sample:
             for prop in sample["type1"]["prop0"].tolist():
                 assert prop % 41 == 0
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+def test_cugraph_loader_hetero(karate_gnn):
+    F, G, N = karate_gnn
+    cugraph_store = CuGraphStore(F, G, N)
+    loader = CuGraphNeighborLoader(
+        (cugraph_store, cugraph_store),
+        input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")),
+        batch_size=2,
+        num_neighbors=[4, 4],
+        random_state=62,
+        replace=False,
+    )
+
+    samples = [s for s in loader]
+
+    assert len(samples) == 2
+    for sample in samples:
+        if "type0" in sample:
+            for prop in sample["type0"]["prop0"].tolist():
+                assert prop % 31 == 0
+
+        if "type1" in sample:
+            for prop in sample["type1"]["prop0"].tolist():
+                assert prop % 41 == 0
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+def test_cugraph_loader_from_disk():
+    F = FeatureStore()
+    F.add_data(torch.tensor([1, 2, 3, 4, 5, 6, 7]), "t0", "x")
+
+    G = {("t0", "knows", "t0"): 7}
+    N = {"t0": 7}
+
+    cugraph_store = CuGraphStore(F, G, N)
+
+    bogus_samples = cudf.DataFrame(
+        {
+            "sources": [0, 1, 2, 3, 4, 5, 6],
+            "destinations": [6, 4, 3, 2, 2, 1, 5],
+            "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0], dtype="int32"),
+            "edge_id": [5, 10, 15, 20, 25, 30, 35],
+            "hop_id": cudf.Series([0, 0, 0, 1, 1, 2, 2], dtype="int32"),
+        }
+    )
+
+    tempdir = tempfile.TemporaryDirectory()
+    for s in range(256):
+        bogus_samples["batch_id"] = cupy.int32(s)
+        bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet"))
+
+    loader = BulkSampleLoader(
+        feature_store=cugraph_store,
+        graph_store=cugraph_store,
+        directory=tempdir,
+    )
+
+    num_samples = 0
+    for sample in loader:
+        num_samples += 1
+        assert sample["t0"]["num_nodes"] == 7
+        # correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6]
+        assert sample["t0"]["x"].tolist() == [1, 2, 3, 7, 5, 4, 6]
+        assert list(sample[("t0", "knows", "t0")]["edge_index"].shape) == [2, 7]
+
+    assert num_samples == 256
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+def test_cugraph_loader_from_disk_subset():
+    F = FeatureStore()
+    F.add_data(torch.tensor([1, 2, 3, 4, 5, 6, 7]), "t0", "x")
+
+    G = {("t0", "knows", "t0"): 7}
+    N = {"t0": 7}
+
+    cugraph_store = CuGraphStore(F, G, N)
+
+    bogus_samples = cudf.DataFrame(
+        {
+            "sources": [0, 1, 2, 3, 4, 5, 6],
+            "destinations": [6, 4, 3, 2, 2, 1, 5],
+            "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0], dtype="int32"),
+            "edge_id": [5, 10, 15, 20, 25, 30, 35],
+            "hop_id": cudf.Series([0, 0, 0, 1, 1, 2, 2], dtype="int32"),
+        }
+    )
+
+    tempdir = tempfile.TemporaryDirectory()
+    for s in range(256):
+        bogus_samples["batch_id"] = cupy.int32(s)
+        bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet"))
+
+    loader = BulkSampleLoader(
+        feature_store=cugraph_store,
+        graph_store=cugraph_store,
+        directory=tempdir,
+        input_files=list(os.listdir(tempdir.name))[100:200],
+    )
+
+    num_samples = 0
+    for sample in loader:
+        num_samples += 1
+        assert sample["t0"]["num_nodes"] == 7
+        # correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6]
+        assert sample["t0"]["x"].tolist() == [1, 2, 3, 7, 5, 4, 6]
+        assert list(sample[("t0", "knows", "t0")]["edge_index"].shape) == [2, 7]
+
+    assert num_samples == 100
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py
index c9981f5f715..b4057727582 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py
@@ -11,16 +11,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cugraph_pyg.sampler import CuGraphSampler
-
 import cudf
 import cupy
 
 import pytest
 
 from cugraph_pyg.data import CuGraphStore
+from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results
 
 from cugraph.utilities.utils import import_optional, MissingModule
+from cugraph import uniform_neighbor_sample
 
 torch = import_optional("torch")
 
@@ -31,61 +31,48 @@ def test_neighbor_sample(basic_graph_1):
     F, G, N = basic_graph_1
     cugraph_store = CuGraphStore(F, G, N)
 
-    sampler = CuGraphSampler(
-        (cugraph_store, cugraph_store),
-        num_neighbors=[-1],
-        replace=True,
-        directed=True,
-        edge_types=[v.edge_type for v in cugraph_store._edge_types_to_attrs.values()],
-    )
-
-    out_dict = sampler.sample_from_nodes(
-        (
-            torch.arange(6, dtype=torch.int64),
-            torch.tensor([0, 1, 2, 3, 4], dtype=torch.int64),
-            None,
-        )
+    sampling_results = uniform_neighbor_sample(
+        cugraph_store._subgraph(),
+        cudf.Series([0, 1, 2, 3, 4], dtype="int64"),
+        fanout_vals=[-1],
+        with_replacement=False,
+        with_edge_properties=True,
+        batch_id_list=cudf.Series(cupy.zeros(5, dtype="int32")),
+        random_state=62,
+        return_offsets=False,
+    ).sort_values(by=["sources", "destinations"])
+
+    out = _sampler_output_from_sampling_results(
+        sampling_results=sampling_results,
+        graph_store=cugraph_store,
+        metadata=torch.arange(6, dtype=torch.int64),
     )
 
-    if isinstance(out_dict, dict):
-        noi_groups, row_dict, col_dict, _ = out_dict["out"]
-        metadata = out_dict["metadata"]
-    else:
-        noi_groups = out_dict.node
-        row_dict = out_dict.row
-        col_dict = out_dict.col
-        metadata = out_dict.metadata
+    noi_groups = out.node
+    row_dict = out.row
+    col_dict = out.col
+    metadata = out.metadata
 
     assert metadata.tolist() == list(range(6))
 
     for node_type, node_ids in noi_groups.items():
         actual_vertex_ids = torch.arange(N[node_type])
 
-        assert node_ids.tolist() == actual_vertex_ids.tolist()
+        assert sorted(node_ids.tolist()) == actual_vertex_ids.tolist()
 
-    for edge_type, ei in G.items():
-        expected_df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(ei[0]),
-                "dst": cupy.asarray(ei[1]),
-            }
-        )
-
-        results_df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(row_dict[edge_type]),
-                "dst": cupy.asarray(col_dict[edge_type]),
-            }
-        )
-
-        expected_df = expected_df.drop_duplicates().sort_values(by=["src", "dst"])
-        results_df = results_df.drop_duplicates().sort_values(by=["src", "dst"])
-        assert (
-            expected_df.src.values_host.tolist() == results_df.src.values_host.tolist()
-        )
-        assert (
-            expected_df.dst.values_host.tolist() == results_df.dst.values_host.tolist()
-        )
+    assert (
+        row_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][0].tolist()
+    )
+    assert (
+        col_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][1].tolist()
+    )
+
+    # check the hop dictionaries
+    assert len(out.num_sampled_nodes) == 1
+    assert out.num_sampled_nodes["vt1"].tolist() == [4, 4]
+
+    assert len(out.num_sampled_edges) == 1
+    assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6]
 
 
 @pytest.mark.cugraph_ops
@@ -94,58 +81,93 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1):
     F, G, N = multi_edge_multi_vertex_graph_1
     cugraph_store = CuGraphStore(F, G, N)
 
-    sampler = CuGraphSampler(
-        (cugraph_store, cugraph_store),
-        num_neighbors=[-1],
-        replace=True,
-        directed=True,
-        edge_types=[v.edge_type for v in cugraph_store._edge_types_to_attrs.values()],
-    )
-
-    out_dict = sampler.sample_from_nodes(
-        (
-            torch.arange(6, dtype=torch.int64),
-            torch.tensor([0, 1, 2, 3, 4], dtype=torch.int64),
-            None,
-        )
+    sampling_results = uniform_neighbor_sample(
+        cugraph_store._subgraph(),
+        cudf.Series([0, 1, 2, 3, 4], dtype="int64"),
+        fanout_vals=[-1],
+        with_replacement=False,
+        with_edge_properties=True,
+        batch_id_list=cudf.Series(cupy.zeros(5, dtype="int32")),
+        random_state=62,
+        return_offsets=False,
+    ).sort_values(by=["sources", "destinations"])
+
+    out = _sampler_output_from_sampling_results(
+        sampling_results=sampling_results,
+        graph_store=cugraph_store,
+        metadata=torch.arange(6, dtype=torch.int64),
     )
 
-    if isinstance(out_dict, dict):
-        noi_groups, row_dict, col_dict, _ = out_dict["out"]
-        metadata = out_dict["metadata"]
-    else:
-        noi_groups = out_dict.node
-        row_dict = out_dict.row
-        col_dict = out_dict.col
-        metadata = out_dict.metadata
+    noi_groups = out.node
+    row_dict = out.row
+    col_dict = out.col
+    metadata = out.metadata
 
     assert metadata.tolist() == list(range(6))
 
     for node_type, node_ids in noi_groups.items():
         actual_vertex_ids = torch.arange(N[node_type])
 
-        assert node_ids.tolist() == actual_vertex_ids.tolist()
+        assert node_ids.tolist() == sorted(actual_vertex_ids.tolist())
 
     for edge_type, ei in G.items():
-        expected_df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(ei[0]),
-                "dst": cupy.asarray(ei[1]),
-            }
-        )
-
-        results_df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(row_dict[edge_type]),
-                "dst": cupy.asarray(col_dict[edge_type]),
-            }
-        )
-
-        expected_df = expected_df.drop_duplicates().sort_values(by=["src", "dst"])
-        results_df = results_df.drop_duplicates().sort_values(by=["src", "dst"])
-        assert (
-            expected_df.src.values_host.tolist() == results_df.src.values_host.tolist()
-        )
-        assert (
-            expected_df.dst.values_host.tolist() == results_df.dst.values_host.tolist()
-        )
+        assert sorted(row_dict[edge_type].tolist()) == sorted(ei[0].tolist())
+        assert sorted(col_dict[edge_type].tolist()) == sorted(ei[1].tolist())
+
+    # check the hop dictionaries
+    assert len(out.num_sampled_nodes) == 2
+    assert out.num_sampled_nodes["black"].tolist() == [2, 2]
+    assert out.num_sampled_nodes["brown"].tolist() == [3, 2]
+
+    assert len(out.num_sampled_edges) == 5
+    assert out.num_sampled_edges[("brown", "horse", "brown")].tolist() == [2]
+    assert out.num_sampled_edges[("brown", "tortoise", "black")].tolist() == [3]
+    assert out.num_sampled_edges[("brown", "mongoose", "black")].tolist() == [2]
+    assert out.num_sampled_edges[("black", "cow", "brown")].tolist() == [2]
+    assert out.num_sampled_edges[("black", "snake", "black")].tolist() == [1]
+
+
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+def test_neighbor_sample_mock_sampling_results(abc_graph):
+    F, G, N = abc_graph
+
+    graph_store = CuGraphStore(F, G, N)
+
+    # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
+    mock_sampling_results = cudf.DataFrame(
+        {
+            "sources": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"),
+            "destinations": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"),
+            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"),
+            "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"),
+        }
+    )
+
+    out = _sampler_output_from_sampling_results(
+        mock_sampling_results, graph_store, None
+    )
+
+    assert out.metadata is None
+    assert len(out.node) == 3
+    assert out.node["A"].tolist() == [0, 1]
+    assert out.node["B"].tolist() == [0, 1]
+    assert out.node["C"].tolist() == [3, 2, 0]
+
+    assert len(out.row) == 3
+    assert len(out.col) == 3
+    assert out.row[("A", "ab", "B")].tolist() == [0, 0, 1, 1]
+    assert out.col[("A", "ab", "B")].tolist() == [0, 1, 1, 1]
+    assert out.row[("B", "bc", "C")].tolist() == [0, 1, 1, 1]
+    assert out.col[("B", "bc", "C")].tolist() == [0, 1, 2, 1]
+    assert out.row[("B", "ba", "A")].tolist() == [1, 1]
+    assert out.col[("B", "ba", "A")].tolist() == [1, 1]
+
+    assert len(out.num_sampled_nodes) == 3
+    assert out.num_sampled_nodes["A"].tolist() == [2, 0, 1, 0, 1]
+    assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 1, 0]
+    assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 2]
+
+    assert len(out.num_sampled_edges) == 3
+    assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0]
+    assert out.num_sampled_edges[("B", "ba", "A")].tolist() == [0, 1, 0, 1]
+    assert out.num_sampled_edges[("B", "bc", "C")].tolist() == [0, 2, 0, 2]
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py
index 347d35a3172..289dd69a829 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py
@@ -22,7 +22,6 @@
 import cudf
 import cupy
 import numpy as np
-from random import randint
 
 from cugraph.utilities.utils import import_optional, MissingModule
 
@@ -30,6 +29,7 @@
 
 
 torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
@@ -137,7 +137,10 @@ def test_edge_types(graph):
     assert eta.keys() == G.keys()
 
     for attr_name, attr_repr in eta.items():
-        assert len(G[attr_name][0]) == attr_repr.size[-1]
+        src_size = N[attr_name[0]]
+        dst_size = N[attr_name[-1]]
+        assert src_size == attr_repr.size[0]
+        assert dst_size == attr_repr.size[-1]
         assert attr_name == attr_repr.edge_type
 
 
@@ -169,7 +172,7 @@ def test_renumber_vertices_basic(single_vertex_graph):
     )
 
     index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest)
-    assert index["vt1"].tolist() == sorted(nodes_of_interest.tolist())
+    assert index["vt1"].tolist() == nodes_of_interest.tolist()
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
@@ -193,84 +196,39 @@ def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
-def test_renumber_edges(graph):
-    """
-    FIXME this test is not very good and should be replaced,
-    probably with a test that uses known good values.
-    """
-
-    F, G, N = graph
-    cugraph_store = CuGraphStore(F, G, N)
+def test_renumber_edges(abc_graph):
+    F, G, N = abc_graph
 
-    v_offsets = [N[v] for v in sorted(N.keys())]
-    v_offsets = np.array(v_offsets)
+    graph_store = CuGraphStore(F, G, N)
 
-    cumsum = v_offsets.cumsum(0)
-    v_offsets = cumsum - v_offsets
-    v_offsets = {k: int(v_offsets[i]) for i, k in enumerate(sorted(N.keys()))}
-
-    e_num = {
-        pyg_can_edge_type: i for i, pyg_can_edge_type in enumerate(sorted(G.keys()))
-    }
-
-    eoi_src = np.array([], dtype="int64")
-    eoi_dst = np.array([], dtype="int64")
-    eoi_type = np.array([], dtype="int32")
-    for pyg_can_edge_type, ei in G.items():
-        src_type, _, dst_type = pyg_can_edge_type
-
-        c = randint(0, len(ei[0]))  # number to select
-        sel = np.random.randint(0, len(ei[0]), c)
-
-        src_i = np.array(ei[0][sel]) + v_offsets[src_type]
-        dst_i = np.array(ei[1][sel]) + v_offsets[dst_type]
-        eoi_src = np.concatenate([eoi_src, src_i])
-        eoi_dst = np.concatenate([eoi_dst, dst_i])
-        eoi_type = np.concatenate([eoi_type, np.array([e_num[pyg_can_edge_type]] * c)])
-
-    nodes_of_interest, _ = torch.sort(
-        torch.as_tensor(
-            np.unique(np.concatenate([eoi_src, eoi_dst])),
-        ).cuda()
-    )
-
-    noi_index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest)
-
-    sdf = cudf.DataFrame(
+    # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3]
+    mock_sampling_results = cudf.DataFrame(
         {
-            "sources": eoi_src,
-            "destinations": eoi_dst,
-            "edge_type": eoi_type,
+            "sources": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"),
+            "destinations": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"),
+            "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"),
+            "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"),
         }
-    ).reset_index(drop=True)
-
-    row, col = cugraph_store._get_renumbered_edge_groups_from_sample(sdf, noi_index)
-
-    for pyg_can_edge_type in G:
-        df = cudf.DataFrame(
-            {
-                "src": cupy.asarray(G[pyg_can_edge_type][0]),
-                "dst": cupy.asarray(G[pyg_can_edge_type][1]),
-            }
-        )
-
-        G[pyg_can_edge_type] = df
+    )
 
-    for pyg_can_edge_type in row:
-        stype, _, dtype = pyg_can_edge_type
-        src = noi_index[stype][row[pyg_can_edge_type]]
-        dst = noi_index[dtype][col[pyg_can_edge_type]]
-        assert len(src) == len(dst)
+    mock_noi_index = {
+        "A": torch.tensor([0, 1], device="cuda"),
+        "B": torch.tensor([0, 1], device="cuda"),
+        "C": torch.tensor([3, 2, 0], device="cuda"),
+    }
 
-        for i in range(len(src)):
-            src_i = int(src[i])
-            dst_i = int(dst[i])
+    row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample(
+        mock_sampling_results, mock_noi_index
+    )
 
-            df = G[pyg_can_edge_type]
-            df = df[df.src == src_i]
-            df = df[df.dst == dst_i]
-            # Ensure only 1 entry matches
-            assert len(df) == 1
+    assert len(row_dict) == 3
+    assert len(col_dict) == 3
+    assert row_dict[("A", "ab", "B")].tolist() == [0, 0, 1, 1]
+    assert col_dict[("A", "ab", "B")].tolist() == [0, 1, 1, 1]
+    assert row_dict[("B", "bc", "C")].tolist() == [0, 1, 1, 1]
+    assert col_dict[("B", "bc", "C")].tolist() == [0, 1, 2, 1]
+    assert row_dict[("B", "ba", "A")].tolist() == [1, 1]
+    assert col_dict[("B", "ba", "A")].tolist() == [1, 1]
 
 
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
@@ -294,6 +252,17 @@ def test_get_tensor(graph):
             assert tsr == base_series
 
 
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+def test_get_tensor_empty_idx(karate_gnn):
+    F, G, N = karate_gnn
+    cugraph_store = CuGraphStore(F, G, N)
+
+    t = cugraph_store.get_tensor(
+        CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None)
+    )
+    assert t.tolist() == (torch.arange(17, dtype=torch.float32) * 31).tolist()
+
+
 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
 def test_multi_get_tensor(graph):
     F, G, N = graph
@@ -388,6 +357,22 @@ def test_get_tensor_size(graph):
         assert cugraph_store.get_tensor_size(tensor_attr) == torch.Size((sz,))
 
 
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.skipif(
+    isinstance(torch_geometric, MissingModule), reason="pyg not available"
+)
+def test_get_input_nodes(karate_gnn):
+    F, G, N = karate_gnn
+    cugraph_store = CuGraphStore(F, G, N)
+
+    node_type, input_nodes = torch_geometric.loader.utils.get_input_nodes(
+        (cugraph_store, cugraph_store), "type0"
+    )
+
+    assert node_type == "type0"
+    assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist()
+
+
 def test_serialize(multi_edge_multi_vertex_no_graph_1):
     import pickle
 
@@ -399,8 +384,9 @@ def test_serialize(multi_edge_multi_vertex_no_graph_1):
     for tensor_attr in cugraph_store.get_all_tensor_attrs():
         sz = cugraph_store.get_tensor_size(tensor_attr)[0]
         tensor_attr.index = np.arange(sz)
-        assert cugraph_store.get_tensor(tensor_attr) == cugraph_store_copy.get_tensor(
-            tensor_attr
+        assert (
+            cugraph_store.get_tensor(tensor_attr).tolist()
+            == cugraph_store_copy.get_tensor(tensor_attr).tolist()
         )
 
     # Currently does not store edgelist properly for SG
diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml
index afce27a5509..d5feac351ab 100644
--- a/python/cugraph-pyg/pyproject.toml
+++ b/python/cugraph-pyg/pyproject.toml
@@ -12,21 +12,23 @@ testpaths = ["cugraph_pyg/tests"]
 
 [project]
 name = "cugraph_pyg"
-version = "23.04.01"
+version = "23.06.00"
 description = "cugraph_pyg - PyG support for cuGraph massive-scale, ultra-fast GPU graph analytics."
 authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
 ]
 dependencies = [
-    "thriftpy2",
+    "cugraph==23.6.*",
+    "numba>=0.57",
+    "numpy>=1.21",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
diff --git a/python/cugraph-service/client/cugraph_service_client/__init__.py b/python/cugraph-service/client/cugraph_service_client/__init__.py
index f38ed95633d..0680aacfe52 100644
--- a/python/cugraph-service/client/cugraph_service_client/__init__.py
+++ b/python/cugraph-service/client/cugraph_service_client/__init__.py
@@ -35,4 +35,4 @@
 from cugraph_service_client.client import CugraphServiceClient
 from cugraph_service_client.remote_graph import RemoteGraph
 
-__version__ = "23.04.01"
+__version__ = "23.06.00"
diff --git a/python/cugraph-service/client/cugraph_service_client/types.py b/python/cugraph-service/client/cugraph_service_client/types.py
index a78e06169ad..509508042b4 100644
--- a/python/cugraph-service/client/cugraph_service_client/types.py
+++ b/python/cugraph-service/client/cugraph_service_client/types.py
@@ -35,7 +35,7 @@
 class UnionWrapper:
     """
     Provides easy conversions between py objs and Thrift "unions". This is used
-    as a base class for the "*Wrapper" classes below. Together with the derived
+    as a base class for the "Wrapper" classes below. Together with the derived
     classes below, these objects allow the caller to go from py objects/Thrift
     unions to Thrift unions/py objects.
     """
diff --git a/python/cugraph-service/client/pyproject.toml b/python/cugraph-service/client/pyproject.toml
index 9c9e6227e7d..e3933e673c8 100644
--- a/python/cugraph-service/client/pyproject.toml
+++ b/python/cugraph-service/client/pyproject.toml
@@ -10,21 +10,21 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cugraph-service-client"
-version = "23.04.01"
+version = "23.06.00"
 description = "cuGraph Service client"
 readme = { file = "README.md", content-type = "text/markdown" }
 authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 dependencies = [
     "thriftpy2",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
 ]
 
diff --git a/python/cugraph-service/server/cugraph_service_server/__init__.py b/python/cugraph-service/server/cugraph_service_server/__init__.py
index b527d915a68..5ab860e822f 100644
--- a/python/cugraph-service/server/cugraph_service_server/__init__.py
+++ b/python/cugraph-service/server/cugraph_service_server/__init__.py
@@ -61,4 +61,4 @@ def start_server_blocking(
     server.serve()  # blocks until Ctrl-C (kill -2)
 
 
-__version__ = "23.04.01"
+__version__ = "23.06.00"
diff --git a/python/cugraph-service/server/cugraph_service_server/cugraph_handler.py b/python/cugraph-service/server/cugraph_service_server/cugraph_handler.py
index 6e9559dab56..6cdf0d793d4 100644
--- a/python/cugraph-service/server/cugraph_service_server/cugraph_handler.py
+++ b/python/cugraph-service/server/cugraph_service_server/cugraph_handler.py
@@ -222,7 +222,7 @@ def get_server_info(self):
 
     def load_graph_creation_extensions(self, extension_dir_or_mod_path):
         """
-        Loads ("imports") all modules matching the pattern *_extension.py in the
+        Loads ("imports") all modules matching the pattern '_extension.py' in the
         directory specified by extension_dir_or_mod_path. extension_dir_or_mod_path
         can be either a path to a directory on disk, or a python import path to a
         package.
@@ -257,7 +257,7 @@ def load_graph_creation_extensions(self, extension_dir_or_mod_path):
 
     def load_extensions(self, extension_dir_or_mod_path):
         """
-        Loads ("imports") all modules matching the pattern *_extension.py in the
+        Loads ("imports") all modules matching the pattern _extension.py in the
         directory specified by extension_dir_or_mod_path. extension_dir_or_mod_path
         can be either a path to a directory on disk, or a python import path to a
         package.
diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml
index 345060ff437..d238d45616c 100644
--- a/python/cugraph-service/server/pyproject.toml
+++ b/python/cugraph-service/server/pyproject.toml
@@ -10,32 +10,33 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cugraph-service-server"
-version = "23.04.01"
+version = "23.06.00"
 description = "cuGraph Service server"
 readme = { file = "README.md", content-type = "text/markdown" }
 authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 dependencies = [
-    "cudf==23.4.*",
-    "cugraph-service-client==23.4.*",
-    "cugraph==23.4.*",
-    "cupy-cuda11x>=9.5.0,<12.0.0a0",
-    "dask-cuda==23.4.*",
-    "dask-cudf==23.4.*",
+    "cudf==23.6.*",
+    "cugraph-service-client==23.6.*",
+    "cugraph==23.6.*",
+    "cupy-cuda11x>=12.0.0",
+    "dask-cuda==23.6.*",
+    "dask-cudf==23.6.*",
     "dask==2023.3.2",
     "distributed==2023.3.2.1",
+    "numba>=0.57",
     "numpy>=1.21",
-    "rmm==23.4.*",
+    "rmm==23.6.*",
     "thriftpy2",
-    "ucx-py==0.31.*",
+    "ucx-py==0.32.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
 ]
 dynamic = ["entry-points"]
diff --git a/python/cugraph-service/tests/test_remote_graph.py b/python/cugraph-service/tests/test_remote_graph.py
index 3d6dd515579..4a1b70d4c78 100644
--- a/python/cugraph-service/tests/test_remote_graph.py
+++ b/python/cugraph-service/tests/test_remote_graph.py
@@ -392,9 +392,11 @@ def test_extract_subgraph(
 
     assert remote_sg.get_num_vertices() == sg.number_of_vertices()
 
-    expected_vertex_ids = cudf.concat(
-        [sg.edgelist.edgelist_df["src"], sg.edgelist.edgelist_df["dst"]]
-    ).unique()
+    expected_vertex_ids = (
+        cudf.concat([sg.edgelist.edgelist_df["src"], sg.edgelist.edgelist_df["dst"]])
+        .unique()
+        .sort_values()
+    )
     if renumber:
         expected_vertex_ids = sg.unrenumber(
             cudf.DataFrame({"v": expected_vertex_ids}), "v"
diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt
index 4c24c37ea8c..c62daef93b2 100644
--- a/python/cugraph/CMakeLists.txt
+++ b/python/cugraph/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
-set(cugraph_version 23.04.01)
+set(cugraph_version 23.06.00)
 
 include(../../fetch_rapids.cmake)
 
diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py
index 1e6b8393d39..d7324a379c0 100644
--- a/python/cugraph/cugraph/__init__.py
+++ b/python/cugraph/cugraph/__init__.py
@@ -119,4 +119,4 @@
 from cugraph import gnn
 
 
-__version__ = "23.04.01"
+__version__ = "23.06.00"
diff --git a/python/cugraph/cugraph/centrality/degree_centrality.py b/python/cugraph/cugraph/centrality/degree_centrality.py
index 5d6a0a02bab..66946afded2 100644
--- a/python/cugraph/cugraph/centrality/degree_centrality.py
+++ b/python/cugraph/cugraph/centrality/degree_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -36,8 +36,10 @@ def degree_centrality(G, normalized=True):
     df : cudf.DataFrame or Dictionary if using NetworkX
         GPU data frame containing two cudf.Series of size V: the vertex
         identifiers and the corresponding degree centrality values.
+
         df['vertex'] : cudf.Series
             Contains the vertex identifiers
+
         df['degree_centrality'] : cudf.Series
             Contains the degree centrality of vertices
 
diff --git a/python/cugraph/cugraph/centrality/eigenvector_centrality.py b/python/cugraph/cugraph/centrality/eigenvector_centrality.py
index ef2f4104cc4..07cbfefaaf1 100644
--- a/python/cugraph/cugraph/centrality/eigenvector_centrality.py
+++ b/python/cugraph/cugraph/centrality/eigenvector_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -59,8 +59,10 @@ def eigenvector_centrality(G, max_iter=100, tol=1.0e-6):
     df : cudf.DataFrame or Dictionary if using NetworkX
         GPU data frame containing two cudf.Series of size V: the vertex
         identifiers and the corresponding eigenvector centrality values.
+
         df['vertex'] : cudf.Series
             Contains the vertex identifiers
+
         df['eigenvector_centrality'] : cudf.Series
             Contains the eigenvector centrality of vertices
 
diff --git a/python/cugraph/cugraph/centrality/katz_centrality.py b/python/cugraph/cugraph/centrality/katz_centrality.py
index 7a6b240ba24..ffede18b5d2 100644
--- a/python/cugraph/cugraph/centrality/katz_centrality.py
+++ b/python/cugraph/cugraph/centrality/katz_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -97,6 +97,7 @@ def katz_centrality(
     df : cudf.DataFrame or Dictionary if using NetworkX
         GPU data frame containing two cudf.Series of size V: the vertex
         identifiers and the corresponding katz centrality values.
+
         df['vertex'] : cudf.Series
             Contains the vertex identifiers
         df['katz_centrality'] : cudf.Series
diff --git a/python/cugraph/cugraph/community/CMakeLists.txt b/python/cugraph/cugraph/community/CMakeLists.txt
index 6461cc60a7c..185f6accbab 100644
--- a/python/cugraph/cugraph/community/CMakeLists.txt
+++ b/python/cugraph/cugraph/community/CMakeLists.txt
@@ -14,7 +14,6 @@
 
 set(cython_sources
     ktruss_subgraph_wrapper.pyx
-    leiden_wrapper.pyx
 )
 
 set(linked_libraries cugraph::cugraph)
diff --git a/python/cugraph/cugraph/community/__init__.py b/python/cugraph/cugraph/community/__init__.py
index 2aa782e7080..78491b383f2 100644
--- a/python/cugraph/cugraph/community/__init__.py
+++ b/python/cugraph/cugraph/community/__init__.py
@@ -13,6 +13,7 @@
 
 from cugraph.community.louvain import louvain
 from cugraph.community.leiden import leiden
+
 from cugraph.community.ecg import ecg
 from cugraph.community.spectral_clustering import (
     spectralBalancedCutClustering,
diff --git a/python/cugraph/cugraph/community/egonet.py b/python/cugraph/cugraph/community/egonet.py
index f39ed8bf86c..684ae92febd 100644
--- a/python/cugraph/cugraph/community/egonet.py
+++ b/python/cugraph/cugraph/community/egonet.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -132,13 +132,14 @@ def ego_graph(G, n, radius=1, center=True, undirected=None, distance=None):
     df = cudf.DataFrame()
     df["src"] = source
     df["dst"] = destination
-    df["weight"] = weight
+    if weight is not None:
+        df["weight"] = weight
 
     if G.renumbered:
         df, src_names = G.unrenumber(df, "src", get_column_names=True)
         df, dst_names = G.unrenumber(df, "dst", get_column_names=True)
     else:
-        # FIXME: THe original 'src' and 'dst' are not stored in 'simpleGraph'
+        # FIXME: The original 'src' and 'dst' are not stored in 'simpleGraph'
         src_names = "src"
         dst_names = "dst"
 
diff --git a/python/cugraph/cugraph/community/induced_subgraph.py b/python/cugraph/cugraph/community/induced_subgraph.py
index d64200d0423..d82e9d466a8 100644
--- a/python/cugraph/cugraph/community/induced_subgraph.py
+++ b/python/cugraph/cugraph/community/induced_subgraph.py
@@ -53,7 +53,7 @@ def ensure_valid_dtype(input_graph: Graph, input: cudf.Series, input_name: str):
 
 
 def induced_subgraph(
-    G: Graph,
+    G: Union[Graph, "networkx.Graph"],
     vertices: Union[cudf.Series, cudf.DataFrame],
     offsets: Union[list, cudf.Series] = None,
 ) -> Tuple[Union[Graph, "networkx.Graph"], cudf.Series]:
diff --git a/python/cugraph/cugraph/community/leiden.pxd b/python/cugraph/cugraph/community/leiden.pxd
deleted file mode 100644
index 871dc826c06..00000000000
--- a/python/cugraph/cugraph/community/leiden.pxd
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# cython: profile=False
-# distutils: language = c++
-# cython: embedsignature = True
-# cython: language_level = 3
-
-
-from libcpp.utility cimport pair
-from cugraph.structure.graph_primtypes cimport *
-
-
-cdef extern from "cugraph/algorithms.hpp" namespace "cugraph":
-
-    cdef pair[size_t, weight_t] leiden[vertex_t,edge_t,weight_t](
-        const handle_t &handle,
-        const GraphCSRView[vertex_t,edge_t,weight_t] &graph,
-        vertex_t *leiden_parts,
-        size_t max_level,
-        weight_t resolution) except +
diff --git a/python/cugraph/cugraph/community/leiden.py b/python/cugraph/cugraph/community/leiden.py
index 1269e7dce9e..1caa5476623 100644
--- a/python/cugraph/cugraph/community/leiden.py
+++ b/python/cugraph/cugraph/community/leiden.py
@@ -11,14 +11,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cugraph.community import leiden_wrapper
+from pylibcugraph import leiden as pylibcugraph_leiden
+from pylibcugraph import ResourceHandle
+from cugraph.structure import Graph
+import cudf
+from typing import Union, Tuple
 from cugraph.utilities import (
     ensure_cugraph_obj_for_nx,
     df_score_to_dictionary,
 )
-
-
-def leiden(G, max_iter=100, resolution=1.0):
+from cugraph.utilities.utils import import_optional
+
+# FIXME: the networkx.Graph type used in the type annotation for
+# leiden() is specified using a string literal to avoid depending on
+# and importing networkx. Instead, networkx is imported optionally, which may
+# cause a problem for a type checker if run in an environment where networkx is
+# not installed.
+networkx = import_optional("networkx")
+
+
+def leiden(
+    G: Union[Graph, "networkx.Graph"],
+    max_iter: int = 100,
+    resolution: float = 1.0,
+    random_state: int = None,
+    theta: int = 1.0,
+) -> Tuple[cudf.DataFrame, float]:
     """
     Compute the modularity optimizing partition of the input graph using the
     Leiden algorithm
@@ -44,12 +62,21 @@ def leiden(G, max_iter=100, resolution=1.0):
         than the specified number of iterations. No error occurs when the
         algorithm terminates early in this manner.
 
-    resolution: float/double, optional (default=1.0)
+    resolution: float, optional (default=1.0)
         Called gamma in the modularity formula, this changes the size
         of the communities.  Higher resolutions lead to more smaller
         communities, lower resolutions lead to fewer larger communities.
         Defaults to 1.
 
+    random_state: int, optional(default=None)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+
+    theta: float, optional (default=1.0)
+        Called theta in the Leiden algorithm, this is used to scale
+        modularity gain in Leiden refinement phase, to compute
+        the probability of joining a random leiden community.
+
     Returns
     -------
     parts : cudf.DataFrame
@@ -77,12 +104,26 @@ def leiden(G, max_iter=100, resolution=1.0):
     if G.is_directed():
         raise ValueError("input graph must be undirected")
 
-    parts, modularity_score = leiden_wrapper.leiden(G, max_iter, resolution)
+    vertex, partition, modularity_score = pylibcugraph_leiden(
+        resource_handle=ResourceHandle(),
+        random_state=random_state,
+        graph=G._plc_graph,
+        max_level=max_iter,
+        resolution=resolution,
+        theta=theta,
+        do_expensive_check=False,
+    )
+
+    df = cudf.DataFrame()
+    df["vertex"] = vertex
+    df["partition"] = partition
 
     if G.renumbered:
-        parts = G.unrenumber(parts, "vertex")
+        parts = G.unrenumber(df, "vertex")
+    else:
+        parts = df
 
     if isNx is True:
-        parts = df_score_to_dictionary(parts, "partition")
+        parts = df_score_to_dictionary(df, "partition")
 
     return parts, modularity_score
diff --git a/python/cugraph/cugraph/community/leiden_wrapper.pyx b/python/cugraph/cugraph/community/leiden_wrapper.pyx
deleted file mode 100644
index 1b41134c625..00000000000
--- a/python/cugraph/cugraph/community/leiden_wrapper.pyx
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# cython: profile=False
-# distutils: language = c++
-# cython: embedsignature = True
-# cython: language_level = 3
-
-from cugraph.community.leiden cimport leiden as c_leiden
-from cugraph.structure.graph_primtypes cimport *
-from cugraph.structure import graph_primtypes_wrapper
-from libc.stdint cimport uintptr_t
-
-import cudf
-import numpy as np
-
-
-def leiden(input_graph, max_iter, resolution):
-    """
-    Call leiden
-    """
-    if not input_graph.adjlist:
-        input_graph.view_adj_list()
-
-    cdef unique_ptr[handle_t] handle_ptr
-    handle_ptr.reset(new handle_t())
-
-    weights = None
-    final_modularity = None
-
-    [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32])
-
-    num_verts = input_graph.number_of_vertices()
-    num_edges = input_graph.number_of_edges(directed_edges=True)
-
-    if input_graph.adjlist.weights is not None:
-        [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64])
-    else:
-        weights = cudf.Series(np.full(num_edges, 1.0, dtype=np.float32))
-
-    # Create the output dataframe
-    df = cudf.DataFrame()
-    df['vertex'] = cudf.Series(np.zeros(num_verts, dtype=np.int32))
-    df['partition'] = cudf.Series(np.zeros(num_verts,dtype=np.int32))
-
-    cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_identifier = df['vertex'].__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_partition = df['partition'].__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_weights = weights.__cuda_array_interface__['data'][0]
-
-    cdef GraphCSRView[int,int,float] graph_float
-    cdef GraphCSRView[int,int,double] graph_double
-
-    cdef float final_modularity_float = 1.0
-    cdef double final_modularity_double = 1.0
-    cdef int num_level = 0
-
-    if weights.dtype == np.float32:
-        graph_float = GraphCSRView[int,int,float](<int*>c_offsets, <int*>c_indices,
-                                                  <float*>c_weights, num_verts, num_edges)
-
-        graph_float.get_vertex_identifiers(<int*>c_identifier)
-        num_level, final_modularity_float = c_leiden(handle_ptr.get()[0],
-                                                     graph_float,
-                                                     <int*> c_partition,
-                                                     <int> max_iter,
-                                                     <float> resolution)
-
-        final_modularity = final_modularity_float
-    else:
-        graph_double = GraphCSRView[int,int,double](<int*>c_offsets, <int*>c_indices,
-                                                    <double*>c_weights, num_verts, num_edges)
-
-        graph_double.get_vertex_identifiers(<int*>c_identifier)
-        num_level, final_modularity_double = c_leiden(handle_ptr.get()[0],
-                                                      graph_double,
-                                                      <int*> c_partition,
-                                                      <int> max_iter,
-                                                      <double> resolution)
-        final_modularity = final_modularity_double
-
-    return df, final_modularity
diff --git a/python/cugraph/cugraph/community/subgraph_extraction.py b/python/cugraph/cugraph/community/subgraph_extraction.py
index efd2f49829f..601b6365e5d 100644
--- a/python/cugraph/cugraph/community/subgraph_extraction.py
+++ b/python/cugraph/cugraph/community/subgraph_extraction.py
@@ -28,7 +28,7 @@
 
 
 def subgraph(
-    G,
+    G: Union[Graph, "networkx.Graph"],
     vertices: Union[cudf.Series, cudf.DataFrame],
 ) -> Union[Graph, "networkx.Graph"]:
     """
diff --git a/python/cugraph/cugraph/components/connectivity.py b/python/cugraph/cugraph/components/connectivity.py
index 3b12c8cb5e0..e235c6c92d4 100644
--- a/python/cugraph/cugraph/components/connectivity.py
+++ b/python/cugraph/cugraph/components/connectivity.py
@@ -84,7 +84,7 @@ def _convert_df_to_output_type(df, input_type, return_labels):
         #       The number of connected components (number of unique labels).
         #   labels: ndarray
         #       The length-N array of labels of the connected components.
-        n_components = len(df["labels"].unique())
+        n_components = df["labels"].nunique()
         sorted_df = df.sort_values("vertex")
         if return_labels:
             if is_cp_matrix_type(input_type):
diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py
index 63b171d8ee6..f639856f929 100644
--- a/python/cugraph/cugraph/dask/__init__.py
+++ b/python/cugraph/cugraph/dask/__init__.py
@@ -32,3 +32,4 @@
 from .link_prediction.jaccard import jaccard
 from .link_prediction.sorensen import sorensen
 from .link_prediction.overlap import overlap
+from .community.leiden import leiden
diff --git a/python/cugraph/cugraph/dask/centrality/eigenvector_centrality.py b/python/cugraph/cugraph/dask/centrality/eigenvector_centrality.py
index f2ac8cebdc6..0dcd2b38546 100644
--- a/python/cugraph/cugraph/dask/centrality/eigenvector_centrality.py
+++ b/python/cugraph/cugraph/dask/centrality/eigenvector_centrality.py
@@ -89,6 +89,7 @@ def eigenvector_centrality(input_graph, max_iter=100, tol=1.0e-6):
     df : dask_cudf.DataFrame
         GPU data frame containing two cudf.Series of size V: the vertex
         identifiers and the corresponding eigenvector centrality values.
+
         df['vertex'] : cudf.Series
             Contains the vertex identifiers
         df['eigenvector_centrality'] : cudf.Series
diff --git a/python/cugraph/cugraph/dask/common/input_utils.py b/python/cugraph/cugraph/dask/common/input_utils.py
index 147ae3b1848..00e94fd1b42 100644
--- a/python/cugraph/cugraph/dask/common/input_utils.py
+++ b/python/cugraph/cugraph/dask/common/input_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,6 @@
 
 
 from collections.abc import Sequence
-
 from collections import OrderedDict
 from dask_cudf.core import DataFrame as dcDataFrame
 from dask_cudf.core import Series as daskSeries
@@ -101,7 +100,7 @@ def create(cls, data, client=None, batch_enabled=False):
         if isinstance(first(data) if multiple else data, (dcDataFrame, daskSeries)):
             datatype = "cudf"
         else:
-            raise Exception("Graph data must be dask-cudf dataframe")
+            raise TypeError("Graph data must be dask-cudf dataframe")
 
         gpu_futures = client.sync(
             _extract_partitions, data, client, batch_enabled=batch_enabled
diff --git a/python/cugraph/cugraph/dask/common/mg_utils.py b/python/cugraph/cugraph/dask/common/mg_utils.py
index 5ab884a5b34..6acda48c9da 100644
--- a/python/cugraph/cugraph/dask/common/mg_utils.py
+++ b/python/cugraph/cugraph/dask/common/mg_utils.py
@@ -13,11 +13,8 @@
 
 import os
 
-import rmm
 import numba.cuda
 
-from dask_cuda import LocalCUDACluster
-from dask.distributed import Client
 
 # FIXME: this raft import breaks the library if ucx-py is
 # not available. They are necessary only when doing MG work.
@@ -32,11 +29,6 @@
         default_client = MissingUCXPy()
     else:
         raise
-# FIXME: cugraph/__init__.py also imports the comms module, but
-# depending on the import environment, cugraph/comms/__init__.py
-# may be imported instead. The following imports the comms.py
-# module directly
-from cugraph.dask.comms import comms as Comms
 
 
 # FIXME: We currently look for the default client from dask, as such is the
@@ -76,42 +68,3 @@ def get_visible_devices():
     else:
         visible_devices = _visible_devices.strip().split(",")
     return visible_devices
-
-
-def setup_local_dask_cluster(p2p=True):
-    """
-    Performs steps to setup a Dask cluster using LocalCUDACluster and returns
-    the LocalCUDACluster and corresponding client instance.
-    """
-    cluster = LocalCUDACluster()
-    client = Client(cluster)
-    client.wait_for_workers(len(get_visible_devices()))
-    Comms.initialize(p2p=p2p)
-
-    return (cluster, client)
-
-
-def teardown_local_dask_cluster(cluster, client):
-    """
-    Performs steps to destroy a Dask cluster and a corresponding client
-    instance.
-    """
-    Comms.destroy()
-    client.close()
-    cluster.close()
-
-
-def start_dask_client():
-    n_devices = os.getenv("DASK_NUM_WORKERS", 2)
-    n_devices = int(n_devices)
-
-    visible_devices = ",".join([str(i) for i in range(1, n_devices + 1)])
-
-    cluster = LocalCUDACluster(
-        protocol="ucx", rmm_pool_size="25GB", CUDA_VISIBLE_DEVICES=visible_devices
-    )
-    client = Client(cluster)
-    Comms.initialize(p2p=True)
-    rmm.reinitialize(pool_allocator=True)
-
-    return cluster, client
diff --git a/python/cugraph/cugraph/dask/common/part_utils.py b/python/cugraph/cugraph/dask/common/part_utils.py
index e62b82df7e6..3afb8a8463d 100644
--- a/python/cugraph/cugraph/dask/common/part_utils.py
+++ b/python/cugraph/cugraph/dask/common/part_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -79,6 +79,41 @@ def persist_distributed_data(dask_df, client):
     return parts
 
 
+def _create_empty_dask_df_future(meta_df, client, worker):
+    df_future = client.scatter(meta_df.head(0), workers=[worker])
+    wait(df_future)
+    return [df_future]
+
+
+def get_persisted_df_worker_map(dask_df, client):
+    ddf_keys = futures_of(dask_df)
+    output_map = {}
+    for w, w_keys in client.has_what().items():
+        output_map[w] = [ddf_k for ddf_k in ddf_keys if str(ddf_k.key) in w_keys]
+        if len(output_map[w]) == 0:
+            output_map[w] = _create_empty_dask_df_future(dask_df._meta, client, w)
+    return output_map
+
+
+def _chunk_lst(ls, num_parts):
+    return [ls[i::num_parts] for i in range(num_parts)]
+
+
+def persist_dask_df_equal_parts_per_worker(dask_df, client):
+    ddf_keys = dask_df.to_delayed()
+    workers = client.scheduler_info()["workers"].keys()
+    ddf_keys_ls = _chunk_lst(ddf_keys, len(workers))
+    persisted_keys = []
+    for w, ddf_k in zip(workers, ddf_keys_ls):
+        persisted_keys.extend(
+            client.persist(ddf_k, workers=w, allow_other_workers=False)
+        )
+    dask_df = dask_cudf.from_delayed(persisted_keys, meta=dask_df._meta).persist()
+    wait(dask_df)
+    client.rebalance(dask_df)
+    return dask_df
+
+
 async def _extract_partitions(dask_obj, client=None, batch_enabled=False):
     client = default_client() if client is None else client
     worker_list = Comms.get_workers()
diff --git a/python/cugraph/cugraph/dask/community/__init__.py b/python/cugraph/cugraph/dask/community/__init__.py
index b963edfa1cc..657d9df101b 100644
--- a/python/cugraph/cugraph/dask/community/__init__.py
+++ b/python/cugraph/cugraph/dask/community/__init__.py
@@ -14,3 +14,4 @@
 from .louvain import louvain
 from .triangle_count import triangle_count
 from .induced_subgraph import induced_subgraph
+from .leiden import leiden
diff --git a/python/cugraph/cugraph/dask/community/egonet.py b/python/cugraph/cugraph/dask/community/egonet.py
index 2d0d07b59ce..06f5d5b9a79 100644
--- a/python/cugraph/cugraph/dask/community/egonet.py
+++ b/python/cugraph/cugraph/dask/community/egonet.py
@@ -66,7 +66,10 @@ def convert_to_cudf(cp_arrays):
     df = cudf.DataFrame()
     df["src"] = cp_src
     df["dst"] = cp_dst
-    df["weight"] = cp_weight
+    if cp_weight is None:
+        df["weight"] = None
+    else:
+        df["weight"] = cp_weight
 
     offsets = cudf.Series(cp_offsets)
 
diff --git a/python/cugraph/cugraph/dask/community/leiden.py b/python/cugraph/cugraph/dask/community/leiden.py
new file mode 100644
index 00000000000..75582fa48f7
--- /dev/null
+++ b/python/cugraph/cugraph/dask/community/leiden.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import annotations
+
+from dask.distributed import wait, default_client
+import cugraph.dask.comms.comms as Comms
+import dask_cudf
+import dask
+from dask import delayed
+import cudf
+
+from pylibcugraph import ResourceHandle
+from pylibcugraph import leiden as pylibcugraph_leiden
+import numpy
+import cupy as cp
+from typing import Tuple, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from cugraph import Graph
+
+
+def convert_to_cudf(result: cp.ndarray) -> Tuple[cudf.DataFrame, float]:
+    """
+    Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
+    """
+    cupy_vertex, cupy_partition, modularity = result
+    df = cudf.DataFrame()
+    df["vertex"] = cupy_vertex
+    df["partition"] = cupy_partition
+
+    return df, modularity
+
+
+def _call_plc_leiden(
+    sID: bytes,
+    mg_graph_x,
+    max_iter: int,
+    resolution: int,
+    random_state: int,
+    theta: int,
+    do_expensive_check: bool,
+) -> Tuple[cp.ndarray, cp.ndarray, float]:
+    return pylibcugraph_leiden(
+        resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
+        random_state=random_state,
+        graph=mg_graph_x,
+        max_level=max_iter,
+        resolution=resolution,
+        theta=theta,
+        do_expensive_check=do_expensive_check,
+    )
+
+
+def leiden(
+    input_graph: Graph,
+    max_iter: int = 100,
+    resolution: int = 1.0,
+    random_state: int = None,
+    theta: int = 1.0,
+) -> Tuple[dask_cudf.DataFrame, float]:
+    """
+    Compute the modularity optimizing partition of the input graph using the
+    Leiden method
+
+    Traag, V. A., Waltman, L., & van Eck, N. J. (2019). From Louvain to Leiden:
+    guaranteeing well-connected communities. Scientific reports, 9(1), 5233.
+    doi: 10.1038/s41598-019-41695-z
+
+    Parameters
+    ----------
+    G : cugraph.Graph
+        The graph descriptor should contain the connectivity information
+        and weights. The adjacency list will be computed if not already
+        present.
+        The current implementation only supports undirected graphs.
+
+    max_iter : integer, optional (default=100)
+        This controls the maximum number of levels/iterations of the Leiden
+        algorithm. When specified the algorithm will terminate after no more
+        than the specified number of iterations. No error occurs when the
+        algorithm terminates early in this manner.
+
+    resolution: float, optional (default=1.0)
+        Called gamma in the modularity formula, this changes the size
+        of the communities.  Higher resolutions lead to more smaller
+        communities, lower resolutions lead to fewer larger communities.
+        Defaults to 1.
+
+    random_state: int, optional(default=None)
+        Random state to use when generating samples.  Optional argument,
+        defaults to a hash of process id, time, and hostname.
+
+    theta: float, optional (default=1.0)
+        Called theta in the Leiden algorithm, this is used to scale
+        modularity gain in Leiden refinement phase, to compute
+        the probability of joining a random leiden community.
+
+    Returns
+    -------
+    parts : dask_cudf.DataFrame
+        GPU data frame of size V containing two columns the vertex id and the
+        partition id it is assigned to.
+
+        ddf['vertex'] : cudf.Series
+            Contains the vertex identifiers
+        ddf['partition'] : cudf.Series
+            Contains the partition assigned to the vertices
+
+    modularity_score : float
+        a floating point number containing the global modularity score of the
+        partitioning.
+
+    Examples
+    --------
+    >>> from cugraph.experimental.datasets import karate
+    >>> G = karate.get_graph(fetch=True)
+    >>> parts, modularity_score = cugraph.leiden(G)
+
+    """
+
+    if input_graph.is_directed():
+        raise ValueError("input graph must be undirected")
+
+    # Return a client if one has started
+    client = default_client()
+
+    do_expensive_check = False
+
+    result = [
+        client.submit(
+            _call_plc_leiden,
+            Comms.get_session_id(),
+            input_graph._plc_graph[w],
+            max_iter,
+            resolution,
+            random_state,
+            theta,
+            do_expensive_check,
+            workers=[w],
+            allow_other_workers=False,
+        )
+        for w in Comms.get_workers()
+    ]
+
+    wait(result)
+
+    part_mod_score = [client.submit(convert_to_cudf, r) for r in result]
+    wait(part_mod_score)
+
+    vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0]
+    empty_df = cudf.DataFrame(
+        {
+            "vertex": numpy.empty(shape=0, dtype=vertex_dtype),
+            "partition": numpy.empty(shape=0, dtype="int32"),
+        }
+    )
+
+    part_mod_score = [delayed(lambda x: x, nout=2)(r) for r in part_mod_score]
+
+    ddf = dask_cudf.from_delayed(
+        [r[0] for r in part_mod_score], meta=empty_df, verify_meta=False
+    ).persist()
+
+    mod_score = dask.array.from_delayed(
+        part_mod_score[0][1], shape=(1,), dtype=float
+    ).compute()
+
+    wait(ddf)
+    wait(mod_score)
+
+    wait([r.release() for r in part_mod_score])
+
+    if input_graph.renumbered:
+        ddf = input_graph.unrenumber(ddf, "vertex")
+
+    return ddf, mod_score
diff --git a/python/cugraph/cugraph/dask/community/louvain.py b/python/cugraph/cugraph/dask/community/louvain.py
index 07980e1816d..c003939f5eb 100644
--- a/python/cugraph/cugraph/dask/community/louvain.py
+++ b/python/cugraph/cugraph/dask/community/louvain.py
@@ -13,28 +13,40 @@
 # limitations under the License.
 #
 
+from __future__ import annotations
+
 from dask.distributed import wait, default_client
 import cugraph.dask.comms.comms as Comms
 import dask_cudf
+import dask
+from dask import delayed
 import cudf
-import operator as op
+import cupy as cp
+import numpy
 
 from pylibcugraph import ResourceHandle
 from pylibcugraph import louvain as pylibcugraph_louvain
+from typing import Tuple, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from cugraph import Graph
 
 
-def convert_to_cudf(cupy_vertex, cupy_partition):
+def convert_to_cudf(result: cp.ndarray) -> Tuple[cudf.DataFrame, float]:
     """
     Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
     """
+    cupy_vertex, cupy_partition, modularity = result
     df = cudf.DataFrame()
     df["vertex"] = cupy_vertex
     df["partition"] = cupy_partition
 
-    return df
+    return df, modularity
 
 
-def _call_plc_louvain(sID, mg_graph_x, max_iter, resolution, do_expensive_check):
+def _call_plc_louvain(
+    sID: bytes, mg_graph_x, max_iter: int, resolution: int, do_expensive_check: bool
+) -> Tuple[cp.ndarray, cp.ndarray, float]:
     return pylibcugraph_louvain(
         resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
         graph=mg_graph_x,
@@ -44,7 +56,9 @@ def _call_plc_louvain(sID, mg_graph_x, max_iter, resolution, do_expensive_check)
     )
 
 
-def louvain(input_graph, max_iter=100, resolution=1.0):
+def louvain(
+    input_graph: Graph, max_iter: int = 100, resolution: int = 1.0
+) -> Tuple[dask_cudf.DataFrame, float]:
     """
     Compute the modularity optimizing partition of the input graph using the
     Louvain method
@@ -122,32 +136,31 @@ def louvain(input_graph, max_iter=100, resolution=1.0):
 
     wait(result)
 
-    # futures is a list of Futures containing tuples of (DataFrame, mod_score),
-    # unpack using separate calls to client.submit with a callable to get
-    # individual items.
-    # FIXME: look into an alternate way (not returning a tuples, accessing
-    # tuples differently, etc.) since multiple client.submit() calls may not be
-    # optimal.
-    result_vertex = [client.submit(op.getitem, f, 0) for f in result]
-    result_partition = [client.submit(op.getitem, f, 1) for f in result]
-    mod_score = [client.submit(op.getitem, f, 2) for f in result]
-
-    cudf_result = [
-        client.submit(convert_to_cudf, cp_vertex_arrays, cp_partition_arrays)
-        for cp_vertex_arrays, cp_partition_arrays in zip(
-            result_vertex, result_partition
-        )
-    ]
+    part_mod_score = [client.submit(convert_to_cudf, r) for r in result]
+    wait(part_mod_score)
+
+    vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0]
+    empty_df = cudf.DataFrame(
+        {
+            "vertex": numpy.empty(shape=0, dtype=vertex_dtype),
+            "partition": numpy.empty(shape=0, dtype="int32"),
+        }
+    )
+
+    part_mod_score = [delayed(lambda x: x, nout=2)(r) for r in part_mod_score]
+
+    ddf = dask_cudf.from_delayed(
+        [r[0] for r in part_mod_score], meta=empty_df, verify_meta=False
+    ).persist()
 
-    wait(cudf_result)
-    # Each worker should have computed the same mod_score
-    mod_score = mod_score[0].result()
+    mod_score = dask.array.from_delayed(
+        part_mod_score[0][1], shape=(1,), dtype=float
+    ).compute()
 
-    ddf = dask_cudf.from_delayed(cudf_result).persist()
     wait(ddf)
+    wait(mod_score)
 
-    # Wait until the inactive futures are released
-    wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])
+    wait([r.release() for r in part_mod_score])
 
     if input_graph.renumbered:
         ddf = input_graph.unrenumber(ddf, "vertex")
diff --git a/python/cugraph/cugraph/dask/link_analysis/pagerank.py b/python/cugraph/cugraph/dask/link_analysis/pagerank.py
index 75d5b6d16c6..4aba5725c1b 100644
--- a/python/cugraph/cugraph/dask/link_analysis/pagerank.py
+++ b/python/cugraph/cugraph/dask/link_analysis/pagerank.py
@@ -178,8 +178,10 @@ def pagerank(
     personalization : cudf.Dataframe, optional (default=None)
         GPU Dataframe containing the personalization information.
         (a performance optimization)
+
         personalization['vertex'] : cudf.Series
             Subset of vertices of graph for personalization
+
         personalization['values'] : cudf.Series
             Personalization values for vertices
 
@@ -187,8 +189,10 @@ def pagerank(
         GPU Dataframe containing the precomputed vertex out weight
         (a performance optimization)
         information.
+
         precomputed_vertex_out_weight['vertex'] : cudf.Series
             Subset of vertices of graph for precomputed_vertex_out_weight
+
         precomputed_vertex_out_weight['sums'] : cudf.Series
             Corresponding precomputed sum of outgoing vertices weight
 
@@ -211,8 +215,10 @@ def pagerank(
     nstart : cudf.Dataframe, optional (default=None)
         GPU Dataframe containing the initial guess for pagerank.
         (a performance optimization)
+
         nstart['vertex'] : cudf.Series
             Subset of vertices of graph for initial guess for pagerank values
+
         nstart['values'] : cudf.Series
             Pagerank values for vertices
 
@@ -234,6 +240,7 @@ def pagerank(
 
         ddf['vertex'] : dask_cudf.Series
             Contains the vertex identifiers
+
         ddf['pagerank'] : dask_cudf.Series
             Contains the PageRank score
 
diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py
index 15d109452eb..7d8972a7385 100644
--- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py
+++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py
@@ -141,10 +141,11 @@ def convert_to_cudf(cp_arrays, weight_t, with_edge_properties, return_offsets=Fa
         df[dst_n] = cupy_destinations
         df[indices_n] = cupy_indices
 
-        if weight_t == "int32":
-            df.indices = df.indices.astype("int32")
-        elif weight_t == "int64":
-            df.indices = df.indices.astype("int64")
+        if cupy_indices is not None:
+            if weight_t == "int32":
+                df.indices = df.indices.astype("int32")
+            elif weight_t == "int64":
+                df.indices = df.indices.astype("int64")
 
         return df
 
@@ -296,6 +297,7 @@ def uniform_neighbor_sample(
         List of output GPUs (by rank) corresponding to batch
         id labels in the label list.  Used to assign each batch
         id to a GPU.
+        Must be in ascending order (i.e. [0, 0, 1, 2]).
 
     random_state: int, optional
         Random seed to use when making sampling calls.
diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py
index d81d40597dc..dafa198b6f6 100644
--- a/python/cugraph/cugraph/dask/structure/mg_property_graph.py
+++ b/python/cugraph/cugraph/dask/structure/mg_property_graph.py
@@ -379,7 +379,11 @@ def get_vertices(self, selection=None):
         vert_sers = self.__get_all_vertices_series()
         if vert_sers:
             if self.__series_type is dask_cudf.Series:
-                return dask_cudf.concat(vert_sers, ignore_index=True).unique()
+                return (
+                    dask_cudf.concat(vert_sers, ignore_index=True)
+                    .unique()
+                    .sort_values()
+                )
             else:
                 raise TypeError("dataframe must be a CUDF Dask dataframe.")
         return self.__series_type()
diff --git a/python/cugraph/cugraph/dask/traversal/sssp.py b/python/cugraph/cugraph/dask/traversal/sssp.py
index b55f48dc86b..053a93fb42a 100644
--- a/python/cugraph/cugraph/dask/traversal/sssp.py
+++ b/python/cugraph/cugraph/dask/traversal/sssp.py
@@ -20,7 +20,6 @@
 import cudf
 import dask_cudf
 from pylibcugraph import sssp as pylibcugraph_sssp, ResourceHandle
-import warnings
 
 
 def _call_plc_sssp(
@@ -102,12 +101,12 @@ def sssp(input_graph, source, cutoff=None, check_source=True):
 
     # FIXME: Implement a better way to check if the graph is weighted similar
     # to 'simpleGraph'
-    if len(input_graph.edgelist.edgelist_df.columns) != 3:
-        warning_msg = (
-            "'SSSP' requires the input graph to be weighted: Unweighted "
-            "graphs will not be supported in the next release."
+    if not input_graph.weighted:
+        err_msg = (
+            "'SSSP' requires the input graph to be weighted."
+            "'BFS' should be used instead of 'SSSP' for unweighted graphs."
         )
-        warnings.warn(warning_msg, PendingDeprecationWarning)
+        raise ValueError(err_msg)
 
     client = default_client()
 
diff --git a/python/cugraph/cugraph/experimental/datasets/__init__.py b/python/cugraph/cugraph/experimental/datasets/__init__.py
index d12248c99ff..a1dd45b3d9f 100644
--- a/python/cugraph/cugraph/experimental/datasets/__init__.py
+++ b/python/cugraph/cugraph/experimental/datasets/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -15,7 +15,6 @@
 from cugraph.experimental.datasets.dataset import (
     Dataset,
     load_all,
-    set_config,
     set_download_dir,
     get_download_dir,
     default_download_dir,
diff --git a/python/cugraph/cugraph/experimental/datasets/dataset.py b/python/cugraph/cugraph/experimental/datasets/dataset.py
index 36e6de487c0..6b395d50fef 100644
--- a/python/cugraph/cugraph/experimental/datasets/dataset.py
+++ b/python/cugraph/cugraph/experimental/datasets/dataset.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -70,31 +70,94 @@ class Dataset:
         properties
     """
 
-    def __init__(self, meta_data_file_name):
-        with open(meta_data_file_name, "r") as file:
-            self.metadata = yaml.safe_load(file)
-
+    def __init__(
+        self,
+        metadata_yaml_file=None,
+        csv_file=None,
+        csv_header=None,
+        csv_delim=" ",
+        csv_col_names=None,
+        csv_col_dtypes=None,
+    ):
+        self._metadata_file = None
         self._dl_path = default_download_dir
         self._edgelist = None
-        self._graph = None
         self._path = None
+
+        if metadata_yaml_file is not None and csv_file is not None:
+            raise ValueError("cannot specify both metadata_yaml_file and csv_file")
+
+        elif metadata_yaml_file is not None:
+            with open(metadata_yaml_file, "r") as file:
+                self.metadata = yaml.safe_load(file)
+                self._metadata_file = Path(metadata_yaml_file)
+
+        elif csv_file is not None:
+            if csv_col_names is None or csv_col_dtypes is None:
+                raise ValueError(
+                    "csv_col_names and csv_col_dtypes must both be "
+                    "not None when csv_file is specified."
+                )
+            self._path = Path(csv_file)
+            if self._path.exists() is False:
+                raise FileNotFoundError(csv_file)
+            self.metadata = {
+                "name": self._path.with_suffix("").name,
+                "file_type": ".csv",
+                "url": None,
+                "header": csv_header,
+                "delim": csv_delim,
+                "col_names": csv_col_names,
+                "col_types": csv_col_dtypes,
+            }
+
+        else:
+            raise ValueError("must specify either metadata_yaml_file or csv_file")
+
+    def __str__(self):
         """
-        self._path = self._dl_path.path / (self.metadata['name'] +
-                                           self.metadata['file_type'])
+        Use the basename of the meta_data_file the instance was constructed with,
+        without any extension, as the string repr.
         """
+        # The metadata file is likely to have a more descriptive file name, so
+        # use that one first if present.
+        # FIXME: this may need to provide a more unique or descriptive string repr
+        if self._metadata_file is not None:
+            return self._metadata_file.with_suffix("").name
+        else:
+            return self.get_path().with_suffix("").name
 
     def __download_csv(self, url):
+        """
+        Downloads the .csv file from url to the current download path
+        (self._dl_path), updates self._path with the full path to the
+        downloaded file, and returns the latest value of self._path.
+        """
         self._dl_path.path.mkdir(parents=True, exist_ok=True)
 
         filename = self.metadata["name"] + self.metadata["file_type"]
         if self._dl_path.path.is_dir():
             df = cudf.read_csv(url)
-            df.to_csv(self._dl_path.path / filename, index=False)
+            self._path = self._dl_path.path / filename
+            df.to_csv(self._path, index=False)
 
         else:
             raise RuntimeError(
                 f"The directory {self._dl_path.path.absolute()}" "does not exist"
             )
+        return self._path
+
+    def unload(self):
+
+        """
+        Remove all saved internal objects, forcing them to be re-created when
+        accessed.
+
+        NOTE: This will cause calls to get_*() to re-read the dataset file from
+        disk. The caller should ensure the file on disk has not moved/been
+        deleted/changed.
+        """
+        self._edgelist = None
 
     def get_edgelist(self, fetch=False):
         """
@@ -106,12 +169,11 @@ def get_edgelist(self, fetch=False):
             Automatically fetch for the dataset from the 'url' location within
             the YAML file.
         """
-
         if self._edgelist is None:
             full_path = self.get_path()
             if not full_path.is_file():
                 if fetch:
-                    self.__download_csv(self.metadata["url"])
+                    full_path = self.__download_csv(self.metadata["url"])
                 else:
                     raise RuntimeError(
                         f"The datafile {full_path} does not"
@@ -131,7 +193,13 @@ def get_edgelist(self, fetch=False):
 
         return self._edgelist
 
-    def get_graph(self, fetch=False, create_using=Graph, ignore_weights=False):
+    def get_graph(
+        self,
+        fetch=False,
+        create_using=Graph,
+        ignore_weights=False,
+        store_transposed=False,
+    ):
         """
         Return a Graph object.
 
@@ -156,13 +224,13 @@ def get_graph(self, fetch=False, create_using=Graph, ignore_weights=False):
             self.get_edgelist(fetch)
 
         if create_using is None:
-            self._graph = Graph()
+            G = Graph()
         elif isinstance(create_using, Graph):
             # what about BFS if trnaposed is True
             attrs = {"directed": create_using.is_directed()}
-            self._graph = type(create_using)(**attrs)
+            G = type(create_using)(**attrs)
         elif type(create_using) is type:
-            self._graph = create_using()
+            G = create_using()
         else:
             raise TypeError(
                 "create_using must be a cugraph.Graph "
@@ -171,23 +239,30 @@ def get_graph(self, fetch=False, create_using=Graph, ignore_weights=False):
             )
 
         if len(self.metadata["col_names"]) > 2 and not (ignore_weights):
-            self._graph.from_cudf_edgelist(
-                self._edgelist, source="src", destination="dst", edge_attr="wgt"
+            G.from_cudf_edgelist(
+                self._edgelist,
+                source="src",
+                destination="dst",
+                edge_attr="wgt",
+                store_transposed=store_transposed,
             )
         else:
-            self._graph.from_cudf_edgelist(
-                self._edgelist, source="src", destination="dst"
+            G.from_cudf_edgelist(
+                self._edgelist,
+                source="src",
+                destination="dst",
+                store_transposed=store_transposed,
             )
-
-        return self._graph
+        return G
 
     def get_path(self):
         """
         Returns the location of the stored dataset file
         """
-        self._path = self._dl_path.path / (
-            self.metadata["name"] + self.metadata["file_type"]
-        )
+        if self._path is None:
+            self._path = self._dl_path.path / (
+                self.metadata["name"] + self.metadata["file_type"]
+            )
 
         return self._path.absolute()
 
@@ -218,20 +293,6 @@ def load_all(force=False):
                     df.to_csv(save_to, index=False)
 
 
-def set_config(cfgpath):
-    """
-    Read in a custom config file.
-
-    Parameters
-    ----------
-    cfgfile : String
-        Read the custom config file given its path, and override the default
-    """
-    with open(Path(cfgpath), "r") as file:
-        cfg = yaml.safe_load(file)
-        default_download_dir.path = Path(cfg["download_dir"])
-
-
 def set_download_dir(path):
     """
     Set the download directory for fetching datasets
diff --git a/python/cugraph/cugraph/generators/rmat.py b/python/cugraph/cugraph/generators/rmat.py
index 2c9167a5217..e9f7515e92e 100644
--- a/python/cugraph/cugraph/generators/rmat.py
+++ b/python/cugraph/cugraph/generators/rmat.py
@@ -277,13 +277,17 @@ def rmat(
         Number of edges to generate
 
     a : float
-        Probability of the first partition
+        Probability of the edge being in the first partition
+        The Graph 500 spec sets this value to 0.57
 
     b : float
-        Probability of the second partition
+        Probability of the edge being in the second partition
+        The Graph 500 spec sets this value to 0.19
+
 
     c : float
-        Probability of the third partition
+        Probability of the edge being in the third partition
+        The Graph 500 spec sets this value to 0.19
 
     seed : int
         Seed value for the random number generator
diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py
index 95fab240eb2..0257a56ba08 100644
--- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py
+++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py
@@ -15,11 +15,16 @@
 
 from typing import Union
 
+import cupy
 import cudf
 import dask_cudf
+import cugraph.dask as dask_cugraph
+
 import cugraph
 import pylibcugraph
 
+from cugraph.gnn.data_loading.bulk_sampler_io import write_samples
+
 
 class EXPERIMENTAL__BulkSampler:
     start_col_name = "_START_"
@@ -32,7 +37,6 @@ def __init__(
         graph,
         seeds_per_call: int = 200_000,
         batches_per_partition=100,
-        rank: int = 0,
         **kwargs,
     ):
         """
@@ -51,9 +55,6 @@ def __init__(
             a single sampling call.
         batches_per_partition: int (optional, default=100)
             The number of batches outputted to a single parquet partition.
-        rank: int (optional, default=0)
-            The rank of this sampler.  Used to isolate this sampler from
-            others that may be running on other nodes.
         kwargs: kwargs
             Keyword arguments to be passed to the sampler (i.e. fanout).
         """
@@ -75,14 +76,9 @@ def __init__(
         self.__graph = graph
         self.__seeds_per_call = seeds_per_call
         self.__batches_per_partition = batches_per_partition
-        self.__rank = rank
         self.__batches = None
         self.__sample_call_args = kwargs
 
-    @property
-    def rank(self) -> int:
-        return self.__rank
-
     @property
     def seeds_per_call(self) -> int:
         return self.__seeds_per_call
@@ -195,18 +191,29 @@ def flush(self) -> None:
             sample_fn = cugraph.uniform_neighbor_sample
         else:
             sample_fn = cugraph.dask.uniform_neighbor_sample
-            self.__sample_call_args["_multiple_clients"] = True
+            self.__sample_call_args.update(
+                {
+                    "_multiple_clients": True,
+                    "label_to_output_comm_rank": self.__get_label_to_output_comm_rank(
+                        min_batch_id, max_batch_id
+                    ),
+                    "label_list": cupy.arange(
+                        min_batch_id, max_batch_id + 1, dtype="int32"
+                    ),
+                }
+            )
 
-        samples = sample_fn(
+        samples, offsets = sample_fn(
             self.__graph,
             **self.__sample_call_args,
             start_list=self.__batches[self.start_col_name][batch_id_filter],
             batch_id_list=self.__batches[self.batch_col_name][batch_id_filter],
             with_edge_properties=True,
+            return_offsets=True,
         )
 
         self.__batches = self.__batches[~batch_id_filter]
-        self.__write(samples, min_batch_id, npartitions)
+        self.__write(samples, offsets)
 
         if self.size > 0:
             self.flush()
@@ -214,36 +221,19 @@ def flush(self) -> None:
     def __write(
         self,
         samples: Union[cudf.DataFrame, dask_cudf.DataFrame],
-        min_batch_id: int,
-        npartitions: int,
+        offsets: Union[cudf.DataFrame, dask_cudf.DataFrame],
     ) -> None:
-        # Ensure each rank writes to its own partition so there is no conflict
-        outer_partition = f"rank={self.__rank}"
-        outer_partition_path = os.path.join(self.__output_path, outer_partition)
-        os.makedirs(outer_partition_path, exist_ok=True)
-
-        for partition_k in range(npartitions):
-            ix_partition_start_inclusive = (
-                min_batch_id + partition_k * self.batches_per_partition
-            )
-            ix_partition_end_inclusive = (
-                min_batch_id + (partition_k + 1) * self.batches_per_partition - 1
-            )
-            f = (samples.batch_id >= ix_partition_start_inclusive) & (
-                samples.batch_id <= ix_partition_end_inclusive
-            )
-            if len(samples[f]) == 0:
-                break
-
-            ix_partition_end_inclusive = samples[f].batch_id.max()
-            if hasattr(ix_partition_end_inclusive, "compute"):
-                ix_partition_end_inclusive = ix_partition_end_inclusive.compute()
-            ix_partition_end_inclusive = int(ix_partition_end_inclusive)
-
-            inner_path = os.path.join(
-                outer_partition_path,
-                f"batch={ix_partition_start_inclusive}-{ix_partition_end_inclusive}"
-                ".parquet",
-            )
+        os.makedirs(self.__output_path, exist_ok=True)
+        write_samples(
+            samples, offsets, self.__batches_per_partition, self.__output_path
+        )
+
+    def __get_label_to_output_comm_rank(self, min_batch_id, max_batch_id):
+        num_workers = dask_cugraph.get_n_workers()
+        num_batches = max_batch_id - min_batch_id + 1
+        z = cupy.zeros(num_batches, dtype="int32")
+        s = cupy.array_split(cupy.arange(num_batches), num_workers)
+        for i, t in enumerate(s):
+            z[t] = i
 
-            samples[f].to_parquet(inner_path, index=False)
+        return cudf.Series(z)
diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py
new file mode 100644
index 00000000000..673b53838c5
--- /dev/null
+++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cudf
+import cupy
+
+from typing import Union, Optional
+
+
+def _write_samples_to_parquet(
+    results: cudf.DataFrame,
+    offsets: cudf.DataFrame,
+    batches_per_partition: int,
+    output_path: str,
+    partition_info: Optional[Union[dict, str]] = None,
+) -> None:
+    """
+    Writes the samples to parquet.
+    results: cudf.DataFrame
+        The results dataframe containing the sampled minibatches.
+    offsets: cudf.DataFrame
+        The offsets dataframe indicating the start/end of each minibatch
+        in the reuslts dataframe.
+    batches_per_partition: int
+        The maximum number of minibatches allowed per written parquet partition.
+    output_path: str
+        The output path (where parquet files should be written to).
+    partition_info: Union[dict, str]
+        Either a dictionary containing partition data from dask, the string 'sg'
+        indicating that this is a single GPU write, or None indicating that this
+        function should perform a no-op (required by dask).
+    """
+
+    # Required by dask; need to skip dummy partitions.
+    if partition_info is None or len(results) == 0:
+        return
+    if partition_info != "sg" and (not isinstance(partition_info, dict)):
+        raise ValueError("Invalid value of partition_info")
+
+    max_batch_id = offsets.batch_id.max()
+
+    for p in range(0, len(offsets), batches_per_partition):
+        offsets_p = offsets.iloc[p : p + batches_per_partition]
+        start_batch_id = offsets_p.batch_id.iloc[0]
+        end_batch_id = offsets_p.batch_id.iloc[-1]
+
+        start_ix = offsets_p.offsets.iloc[0]
+        if end_batch_id == max_batch_id:
+            end_ix = len(results)
+        else:
+            end_ix = offsets.offsets[offsets.batch_id == (end_batch_id + 1)].iloc[0]
+
+        full_output_path = os.path.join(
+            output_path, f"batch={start_batch_id}-{end_batch_id}.parquet"
+        )
+        results_p = results.iloc[start_ix:end_ix]
+
+        results_p["batch_id"] = offsets_p.batch_id.repeat(
+            cupy.diff(offsets_p.offsets.values, append=end_ix)
+        ).values
+        results_p.to_parquet(full_output_path)
+
+
+def write_samples(
+    results: cudf.DataFrame,
+    offsets: cudf.DataFrame,
+    batches_per_partition: cudf.DataFrame,
+    output_path: str,
+):
+    """
+    Writes the samples to parquet.
+    results: cudf.DataFrame
+        The results dataframe containing the sampled minibatches.
+    offsets: cudf.DataFrame
+        The offsets dataframe indicating the start/end of each minibatch
+        in the reuslts dataframe.
+    batches_per_partition: int
+        The maximum number of minibatches allowed per written parquet partition.
+    output_path: str
+        The output path (where parquet files should be written to).
+    """
+    if hasattr(results, "compute"):
+        results.map_partitions(
+            _write_samples_to_parquet,
+            offsets,
+            batches_per_partition,
+            output_path,
+            align_dataframes=False,
+        ).compute()
+    else:
+        _write_samples_to_parquet(
+            results, offsets, batches_per_partition, output_path, partition_info="sg"
+        )
diff --git a/python/cugraph/cugraph/internals/callbacks_implems.hpp b/python/cugraph/cugraph/internals/callbacks_implems.hpp
index 10a8fc7503a..34846e29a6c 100644
--- a/python/cugraph/cugraph/internals/callbacks_implems.hpp
+++ b/python/cugraph/cugraph/internals/callbacks_implems.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,57 +22,51 @@
 #include <iostream>
 
 namespace cugraph {
-    namespace  internals {
+namespace internals {
 
-        class DefaultGraphBasedDimRedCallback : public GraphBasedDimRedCallback {
-            public:
+class DefaultGraphBasedDimRedCallback : public GraphBasedDimRedCallback {
+ public:
+  PyObject* get_numba_matrix(void* positions)
+  {
+    PyObject* pycl = (PyObject*)this->pyCallbackClass;
 
-                PyObject* get_numba_matrix(void *positions)
-                {
-                    PyObject* pycl = (PyObject*)this->pyCallbackClass;
-
-                    if (isFloat)
-                    {
-                        return PyObject_CallMethod(pycl,
-                        "get_numba_matrix", "(l(ll)s)", positions,
-                        n, n_components, "float32");
-                    } else {
-                        return PyObject_CallMethod(pycl,
-                        "get_numba_matrix", "(l(ll)s)", positions,
-                        n, n_components, "float64");
-                    }
-                }
+    if (isFloat) {
+      return PyObject_CallMethod(
+        pycl, "get_numba_matrix", "(l(ll)s)", positions, n, n_components, "float32");
+    } else {
+      return PyObject_CallMethod(
+        pycl, "get_numba_matrix", "(l(ll)s)", positions, n, n_components, "float64");
+    }
+  }
 
-                void on_preprocess_end(void *positions) override
-                {
-                    PyObject* numba_matrix = get_numba_matrix(positions);
-                    PyObject* res = PyObject_CallMethod(this->pyCallbackClass,
-                        "on_preprocess_end", "(O)", numba_matrix);
-                    Py_DECREF(numba_matrix);
-                    Py_DECREF(res);
-                }
+  void on_preprocess_end(void* positions) override
+  {
+    PyObject* numba_matrix = get_numba_matrix(positions);
+    PyObject* res =
+      PyObject_CallMethod(this->pyCallbackClass, "on_preprocess_end", "(O)", numba_matrix);
+    Py_DECREF(numba_matrix);
+    Py_DECREF(res);
+  }
 
-                void on_epoch_end(void *positions) override
-                {
-                    PyObject* numba_matrix = get_numba_matrix(positions);
-                    PyObject* res = PyObject_CallMethod(this->pyCallbackClass,
-                        "on_epoch_end", "(O)", numba_matrix);
-                    Py_DECREF(numba_matrix);
-                    Py_DECREF(res);
-              }
+  void on_epoch_end(void* positions) override
+  {
+    PyObject* numba_matrix = get_numba_matrix(positions);
+    PyObject* res = PyObject_CallMethod(this->pyCallbackClass, "on_epoch_end", "(O)", numba_matrix);
+    Py_DECREF(numba_matrix);
+    Py_DECREF(res);
+  }
 
-                void on_train_end(void *positions) override
-                {
-                    PyObject* numba_matrix = get_numba_matrix(positions);
-                    PyObject* res = PyObject_CallMethod(this->pyCallbackClass,
-                        "on_train_end", "(O)", numba_matrix);
-                    Py_DECREF(numba_matrix);
-                    Py_DECREF(res);
-               }
+  void on_train_end(void* positions) override
+  {
+    PyObject* numba_matrix = get_numba_matrix(positions);
+    PyObject* res = PyObject_CallMethod(this->pyCallbackClass, "on_train_end", "(O)", numba_matrix);
+    Py_DECREF(numba_matrix);
+    Py_DECREF(res);
+  }
 
-            public:
-                PyObject* pyCallbackClass;
-        };
+ public:
+  PyObject* pyCallbackClass;
+};
 
-    }
-}
+}  // namespace internals
+}  // namespace cugraph
diff --git a/python/cugraph/cugraph/link_analysis/pagerank.py b/python/cugraph/cugraph/link_analysis/pagerank.py
index 83b8af35e4c..6696512dcf0 100644
--- a/python/cugraph/cugraph/link_analysis/pagerank.py
+++ b/python/cugraph/cugraph/link_analysis/pagerank.py
@@ -112,16 +112,20 @@ def pagerank(
     personalization : cudf.Dataframe, optional (default=None)
         GPU Dataframe containing the personalization information.
         (a performance optimization)
+
         personalization['vertex'] : cudf.Series
             Subset of vertices of graph for personalization
+
         personalization['values'] : cudf.Series
             Personalization values for vertices
 
     precomputed_vertex_out_weight : cudf.Dataframe, optional (default=None)
         GPU Dataframe containing the precomputed vertex out weight
         information(a performance optimization).
+
         precomputed_vertex_out_weight['vertex'] : cudf.Series
             Subset of vertices of graph for precomputed_vertex_out_weight
+
         precomputed_vertex_out_weight['sums'] : cudf.Series
             Corresponding precomputed sum of outgoing vertices weight
 
@@ -144,8 +148,10 @@ def pagerank(
     nstart : cudf.Dataframe, optional (default=None)
         GPU Dataframe containing the initial guess for pagerank.
         (a performance optimization).
+
         nstart['vertex'] : cudf.Series
             Subset of vertices of graph for initial guess for pagerank values
+
         nstart['values'] : cudf.Series
             Pagerank values for vertices
 
@@ -175,6 +181,7 @@ def pagerank(
 
         df['vertex'] : cudf.Series
             Contains the vertex identifiers
+
         df['pagerank'] : cudf.Series
             Contains the PageRank score
 
diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py
index 03db9b74db0..20238e10464 100644
--- a/python/cugraph/cugraph/link_prediction/sorensen.py
+++ b/python/cugraph/cugraph/link_prediction/sorensen.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -60,9 +60,11 @@ def sorensen(input_graph, vertex_pair=None):
 
         df['first'] : cudf.Series
             The first vertex ID of each pair (will be identical to first if specified)
+
         df['second'] : cudf.Series
             The second vertex ID of each pair (will be identical to second if
             specified)
+
         df['sorensen_coeff'] : cudf.Series
             The computed Sorensen coefficient between the source and
             destination vertices
diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py
index 87498c72e51..c7d4f56a428 100644
--- a/python/cugraph/cugraph/link_prediction/woverlap.py
+++ b/python/cugraph/cugraph/link_prediction/woverlap.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -42,6 +42,7 @@ def overlap_w(input_graph, weights, vertex_pair=None):
 
         weights['vertex'] : cudf.Series
             Contains the vertex identifiers
+
         weights['weight'] : cudf.Series
             Contains the weights of vertices
 
@@ -60,8 +61,10 @@ def overlap_w(input_graph, weights, vertex_pair=None):
 
          df['first'] : cudf.Series
             The first vertex ID of each pair.
+
         df['second'] : cudf.Series
             The second vertex ID of each pair.
+
         df['overlap_coeff'] : cudf.Series
             The computed weighted Overlap coefficient between the first and the
             second vertex ID.
@@ -84,7 +87,6 @@ def overlap_w(input_graph, weights, vertex_pair=None):
     >>> weights['weight'] = [random.random() for w in range(
     ...                      len(weights['vertex']))]
     >>> df = cugraph.overlap_w(G, weights)
-
     """
 
     if type(vertex_pair) == cudf.DataFrame:
diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py
index 00c89370106..c017463a294 100644
--- a/python/cugraph/cugraph/link_prediction/wsorensen.py
+++ b/python/cugraph/cugraph/link_prediction/wsorensen.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -38,6 +38,7 @@ def sorensen_w(input_graph, weights, vertex_pair=None):
 
         weights['vertex'] : cudf.Series
             Contains the vertex identifiers
+
         weights['weight'] : cudf.Series
             Contains the weights of vertices
 
@@ -56,8 +57,10 @@ def sorensen_w(input_graph, weights, vertex_pair=None):
 
          df['first'] : cudf.Series
             The first vertex ID of each pair.
+
         df['second'] : cudf.Series
             The second vertex ID of each pair.
+
         df['sorensen_coeff'] : cudf.Series
             The computed weighted Sorensen coefficient between the first and the
             second vertex ID.
diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py
index 653bccccabd..a5e2a0371b3 100644
--- a/python/cugraph/cugraph/sampling/random_walks.py
+++ b/python/cugraph/cugraph/sampling/random_walks.py
@@ -12,17 +12,32 @@
 # limitations under the License.
 
 import cudf
+import cupy as cp
 from pylibcugraph import ResourceHandle
 from pylibcugraph import (
     uniform_random_walks as pylibcugraph_uniform_random_walks,
 )
 
 from cugraph.utilities import ensure_cugraph_obj_for_nx
+from cugraph.structure import Graph
 
 import warnings
-
-
-def uniform_random_walks(G, start_vertices, max_depth):
+from cugraph.utilities.utils import import_optional
+from typing import Union, Tuple
+
+# FIXME: the networkx.Graph type used in the type annotation for
+# induced_subgraph() is specified using a string literal to avoid depending on
+# and importing networkx. Instead, networkx is imported optionally, which may
+# cause a problem for a type checker if run in an environment where networkx is
+# not installed.
+networkx = import_optional("networkx")
+
+
+def uniform_random_walks(
+    G: Graph,
+    start_vertices: Union[int, list, cudf.Series, cudf.DataFrame] = None,
+    max_depth: int = None,
+) -> Tuple[cp.ndarray, cp.ndarray, int]:
     return pylibcugraph_uniform_random_walks(
         resource_handle=ResourceHandle(),
         input_graph=G._plc_graph,
@@ -32,19 +47,26 @@ def uniform_random_walks(G, start_vertices, max_depth):
 
 
 def random_walks(
-    G,
-    random_walks_type="uniform",
-    start_vertices=None,
-    max_depth=None,
-    use_padding=False,
-    legacy_result_type=True,
-):
+    G: Union[Graph, "networkx.Graph"],
+    random_walks_type: str = "uniform",
+    start_vertices: Union[int, list, cudf.Series, cudf.DataFrame] = None,
+    max_depth: int = None,
+    use_padding: bool = False,
+    legacy_result_type: bool = True,
+) -> Tuple[cudf.Series, cudf.Series, Union[None, int, cudf.Series]]:
     """
-    # FIXME: make the padded value for vertices with outgoing edges
-    # consistent in both SG and MG implementation.
-    compute random walks for each nodes in 'start_vertices' and returns
+    Compute random walks for each nodes in 'start_vertices' and returns
     either a padded or a coalesced result. For the padded case, vertices
-    with no outgoing edges will be padded with NA
+    with no outgoing edges will be padded with -1.
+
+    When 'use_padding' is 'False', 'random_walks' returns a coalesced
+    result which is a compressed version of the padded one. In the padded
+    form, sources with no out_going edges are padded with -1s in the
+    'vertex_paths' array and their corresponding edges('edge_weight_paths')
+    with 0.0s (when 'legacy_result_type' is 'True'). If 'legacy_result_type'
+    is 'False', 'random_walks' returns padded results (vertex_paths,
+    edge_weight_paths) but instead of 'sizes = None', returns the 'max_path_lengths'.
+    When 'legacy_result_type' is 'False', the arhument 'use_padding' is ignored.
 
     parameters
     ----------
@@ -63,6 +85,9 @@ def random_walks(
     max_depth : int
         The maximum depth of the random walks
 
+        When 'legacy_result_type' is set to False, 'max_depth' is relative to
+        the number of edges otherwised, it is relative to the number of vertices.
+
     use_padding : bool, optional (default=False)
         If True, padded paths are returned else coalesced paths are returned.
 
@@ -81,11 +106,11 @@ def random_walks(
         returned vertex_paths
 
     and
-    sizes: int
-        The path size in case of coalesced paths.
+    sizes: None or cudf.Series
+        The path sizes in case of 'coalesced' paths or None if 'padded'.
     or
     max_path_length : int
-        The maximum path length
+        The maximum path length if 'legacy_result_type' is 'False'
 
     Examples
     --------
@@ -96,11 +121,12 @@ def random_walks(
     >>> _, _, _ = cugraph.random_walks(G, "uniform", start_vertices, 3)
 
     """
+
     if legacy_result_type:
         warning_msg = (
-            "Coalesced path results is deprecated and will no longer be "
-            "supported in the next releases. only padded paths will be "
-            "returned instead"
+            "Coalesced path results, returned when setting legacy_result_type=True, "
+            "is deprecated and will no longer be supported in the next releases. "
+            "only padded paths will be returned instead"
         )
         warnings.warn(warning_msg, PendingDeprecationWarning)
 
@@ -149,13 +175,10 @@ def random_walks(
 
     edge_wgt_paths = cudf.Series(edge_wgt_paths)
 
-    # FIXME: Also add a warning here saying that the lesser path will
-    # be no longer be supported
     # The PLC uniform random walks returns an extra vertex along with an extra
     # edge per path. In fact, the max depth is relative to the number of vertices
     # for the legacy implementation and edges for the PLC implementation
 
-    # Get a list of extra vertex and edge index to drop
     if legacy_result_type:
         warning_msg = (
             "The 'max_depth' is relative to the number of vertices and will be "
@@ -164,56 +187,51 @@ def random_walks(
         )
         warnings.warn(warning_msg, PendingDeprecationWarning)
 
-        drop_vertex = [i for i in range(max_depth, len(vertex_paths), max_depth + 1)]
-        drop_edge_wgt = [
-            i - 1 for i in range(max_depth, len(edge_wgt_paths), max_depth)
-        ]
-
-        vertex_paths = vertex_paths.drop(vertex_paths.index[drop_vertex]).reset_index(
-            drop=True
-        )
+        # Drop the last vertex and and edge weight from each vertex and edge weight
+        # paths.
+        vertex_paths = vertex_paths.drop(
+            index=vertex_paths[max_depth :: max_depth + 1].index
+        ).reset_index(drop=True)
 
         edge_wgt_paths = edge_wgt_paths.drop(
-            edge_wgt_paths.index[drop_edge_wgt]
+            index=edge_wgt_paths[max_depth - 1 :: max_depth].index
         ).reset_index(drop=True)
 
         if use_padding:
             sizes = None
-            edge_wgt_paths_sz = (max_depth - 1) * len(start_vertices)
-            # FIXME: Is it necessary to bound the 'edge_wgt_paths'?
-            return vertex_paths, edge_wgt_paths[:edge_wgt_paths_sz], sizes
+            # FIXME: Is it necessary to slice it with 'edge_wgt_paths_sz'?
+            return vertex_paths, edge_wgt_paths, sizes
 
         # If 'use_padding' is False, compute the sizes of the unpadded results
-        sizes = [
-            len(vertex_paths.iloc[i : i + max_depth].dropna())
-            for i in range(0, len(vertex_paths), max_depth)
-        ]
-        sizes = cudf.Series(sizes, dtype=vertex_paths.dtype)
-
-        # Compress the 'vertex_paths' by dropping 'NA' values which is
-        # representative of vertices with no outgoing link
-        vertex_paths = vertex_paths.dropna().reset_index(drop=True)
-        # Compress the 'edge_wgt_paths' by dropping 'NA'
-        edge_wgt_paths.replace(0.0, None, inplace=True)
-        edge_wgt_paths = edge_wgt_paths.dropna().reset_index(drop=True)
-
-        vertex_paths_sz = sizes.sum()
-        edge_wgt_paths_sz = vertex_paths_sz - len(start_vertices)
-        # FIXME: Is it necessary to bound the 'vertex_paths' and 'edge_wgt_paths'?
-        return vertex_paths[:vertex_paths_sz], edge_wgt_paths[:edge_wgt_paths_sz], sizes
+
+        sizes = (
+            vertex_paths.apply(lambda x: 1 if x != -1 else 0)
+            .groupby(vertex_paths.index // max_depth, sort=True)
+            .sum()
+            .reset_index(drop=True)
+        )
+
+        # Drop the -1 values which are representative of no outgoing edges
+        vertex_paths = vertex_paths.pipe(lambda x: x[x != -1]).reset_index(drop=True)
+
+        # Drop the 0.0 values which are representative of no edges.
+        edge_wgt_paths = edge_wgt_paths.pipe(lambda x: x[x != 0.0]).reset_index(
+            drop=True
+        )
+
+        return vertex_paths, edge_wgt_paths, sizes
 
     else:
-        vertex_paths_sz = sizes.sum()
-        edge_wgt_paths_sz = vertex_paths_sz - len(start_vertices)
-        # FIXME: Is it necessary to bound the 'vertex_paths' and 'edge_wgt_paths'?
         return (
-            vertex_paths[:vertex_paths_sz],
-            edge_wgt_paths[:edge_wgt_paths_sz],
+            vertex_paths,
+            edge_wgt_paths,
             max_path_length,
         )
 
 
-def rw_path(num_paths, sizes):
+def rw_path(
+    num_paths: int, sizes: cudf.Series
+) -> Tuple[cudf.Series, cudf.Series, cudf.Series]:
     """
     Retrieve more information on the obtained paths in case use_padding
     is False.
diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py
index a7dad6c01a6..d6acaa550eb 100644
--- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py
+++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py
@@ -233,13 +233,16 @@ def uniform_neighbor_sample(
         df["sources"] = sources
         df["destinations"] = destinations
 
-        df["indices"] = indices
-        if weight_t == "int32":
-            df["indices"] = indices.astype("int32")
-        elif weight_t == "int64":
-            df["indices"] = indices.astype("int64")
+        if indices is None:
+            df["indices"] = None
         else:
             df["indices"] = indices
+            if weight_t == "int32":
+                df["indices"] = indices.astype("int32")
+            elif weight_t == "int64":
+                df["indices"] = indices.astype("int64")
+            else:
+                df["indices"] = indices
 
     if G.renumbered:
         df = G.unrenumber(df, "sources", preserve_order=True)
diff --git a/python/cugraph/cugraph/structure/convert_matrix.py b/python/cugraph/cugraph/structure/convert_matrix.py
index 1b46f7db970..ca8e93c482b 100644
--- a/python/cugraph/cugraph/structure/convert_matrix.py
+++ b/python/cugraph/cugraph/structure/convert_matrix.py
@@ -77,12 +77,22 @@ def from_edgelist(
 
     if df_type is cudf.DataFrame:
         return from_cudf_edgelist(
-            df, source, destination, edge_attr, create_using, renumber
+            df,
+            source,
+            destination,
+            edge_attr=edge_attr,
+            create_using=create_using,
+            renumber=renumber,
         )
 
     elif (pd is not None) and (df_type is pd.DataFrame):
         return from_pandas_edgelist(
-            df, source, destination, edge_attr, create_using, renumber
+            df,
+            source,
+            destination,
+            edge_attr=edge_attr,
+            create_using=create_using,
+            renumber=renumber,
         )
 
     elif df_type is dask_cudf.core.DataFrame:
@@ -99,7 +109,9 @@ def from_edgelist(
                 "(or subclass) type or instance, got: "
                 f"{type(create_using)}"
             )
-        G.from_dask_cudf_edgelist(df, source, destination, edge_attr, renumber)
+        G.from_dask_cudf_edgelist(
+            df, source, destination, edge_attr=edge_attr, renumber=renumber
+        )
         return G
 
     else:
diff --git a/python/cugraph/cugraph/structure/graph_classes.py b/python/cugraph/cugraph/structure/graph_classes.py
index 5fd398124b8..b89ada9bf50 100644
--- a/python/cugraph/cugraph/structure/graph_classes.py
+++ b/python/cugraph/cugraph/structure/graph_classes.py
@@ -107,6 +107,9 @@ def from_cudf_edgelist(
         source="source",
         destination="destination",
         edge_attr=None,
+        weight=None,
+        edge_id=None,
+        edge_type=None,
         renumber=True,
         store_transposed=False,
         legacy_renum_only=False,
@@ -137,8 +140,21 @@ def from_cudf_edgelist(
         destination : str or array-like, optional (default='destination')
             destination column name or array of column names
 
-        edge_attr : str or None, optional (default=None)
-            the weights column name.
+        edge_attr : str or List[str], optional (default=None)
+            Names of the edge attributes.  Can either be a single string
+            representing the weight column name, or a list of length 3
+            holding [weight, edge_id, edge_type].  If this argument is
+            provided, then the weight/edge_id/edge_type arguments must
+            be left empty.
+
+        weight : str, optional (default=None)
+            Name of the weight column in the input dataframe.
+
+        edge_id : str, optional (default=None)
+            Name of the edge id column in the input dataframe.
+
+        edge_type : str, optional (default=None)
+            Name of the edge type column in the input dataframe.
 
         renumber : bool, optional (default=True)
             Indicate whether or not to renumber the source and destination
@@ -176,6 +192,9 @@ def from_cudf_edgelist(
             source=source,
             destination=destination,
             edge_attr=edge_attr,
+            weight=weight,
+            edge_id=edge_id,
+            edge_type=edge_type,
             renumber=renumber,
             store_transposed=store_transposed,
             legacy_renum_only=legacy_renum_only,
@@ -254,6 +273,9 @@ def from_dask_cudf_edgelist(
         source="source",
         destination="destination",
         edge_attr=None,
+        weight=None,
+        edge_id=None,
+        edge_type=None,
         renumber=True,
         store_transposed=False,
         legacy_renum_only=False,
@@ -280,8 +302,21 @@ def from_dask_cudf_edgelist(
         destination : str, optional (default='destination')
             Destination column name or array of column names
 
-        edge_attr : str, optional (default=None)
-            Weights column name
+        edge_attr : str or List[str], optional (default=None)
+            Names of the edge attributes.  Can either be a single string
+            representing the weight column name, or a list of length 3
+            holding [weight, edge_id, edge_type].  If this argument is
+            provided, then the weight/edge_id/edge_type arguments must
+            be left empty.
+
+        weight : str, optional (default=None)
+            Name of the weight column in the input dataframe.
+
+        edge_id : str, optional (default=None)
+            Name of the edge id column in the input dataframe.
+
+        edge_type : str, optional (default=None)
+            Name of the edge type column in the input dataframe.
 
         renumber : bool, optional (default=True)
             If source and destination indices are not in range 0 to V where V
@@ -308,12 +343,15 @@ def from_dask_cudf_edgelist(
             raise RuntimeError("Graph already has values")
         self._Impl._simpleDistributedGraphImpl__from_edgelist(
             input_ddf,
-            source,
-            destination,
-            edge_attr,
-            renumber,
-            store_transposed,
-            legacy_renum_only,
+            source=source,
+            destination=destination,
+            edge_attr=edge_attr,
+            weight=weight,
+            edge_id=edge_id,
+            edge_type=edge_type,
+            renumber=renumber,
+            store_transposed=store_transposed,
+            legacy_renum_only=legacy_renum_only,
         )
 
     # Move to Compat Module
@@ -323,6 +361,9 @@ def from_pandas_edgelist(
         source="source",
         destination="destination",
         edge_attr=None,
+        weight=None,
+        edge_id=None,
+        edge_type=None,
         renumber=True,
     ):
         """
@@ -334,7 +375,9 @@ def from_pandas_edgelist(
         of vertices.  If the input vertices are a single column of integers
         in the range [0, V), renumbering can be disabled and the original
         external vertex ids will be used.
-        If weights are present, edge_attr argument is the weights column name.
+        Weights, edge ids, and edge types can be passed through either the
+        edge_attr argument or individually as separate keyword arguments.
+        All three are optional.
 
         Parameters
         ----------
@@ -347,8 +390,21 @@ def from_pandas_edgelist(
         destination : str or array-like, optional (default='destination')
             Destination column name or array of column names
 
-        edge_attr : str or None, optional (default=None)
-            The weights column name
+        edge_attr : str or List[str], optional (default=None)
+            Names of the edge attributes.  Can either be a single string
+            representing the weight column name, or a list of length 3
+            holding [weight, edge_id, edge_type].  If this argument is
+            provided, then the weight/edge_id/edge_type arguments must
+            be left empty.
+
+        weight : str, optional (default=None)
+            Name of the weight column in the input dataframe.
+
+        edge_id : str, optional (default=None)
+            Name of the edge id column in the input dataframe.
+
+        edge_type : str, optional (default=None)
+            Name of the edge type column in the input dataframe.
 
         renumber : bool, optional (default=True)
             Indicate whether or not to renumber the source and destination
@@ -376,6 +432,9 @@ def from_pandas_edgelist(
             source=source,
             destination=destination,
             edge_attr=edge_attr,
+            weight=weight,
+            edge_id=edge_id,
+            edge_type=edge_type,
             renumber=renumber,
         )
 
@@ -497,7 +556,7 @@ def lookup_internal_vertex_id(self, df, column_name=None):
             Name of the column containing the external vertex ids
 
         Returns
-        ---------
+        -------
         series : cudf.Series or dask_cudf.Series
             The internal vertex identifiers
         """
@@ -536,7 +595,7 @@ def add_internal_vertex_id(
             Preserve the order of the data frame (requires an extra sort)
 
         Returns
-        ---------
+        -------
         df : cudf.DataFrame or dask_cudf.DataFrame
             Original DataFrame with new column containing internal vertex
             id
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
index 4b33cf4c847..c0efb425b75 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
@@ -15,7 +15,6 @@
 from cugraph.structure.graph_primtypes_wrapper import Direction
 from cugraph.structure.number_map import NumberMap
 from cugraph.structure.symmetrize import symmetrize
-import cupy
 import cudf
 import warnings
 import dask_cudf
@@ -23,7 +22,7 @@
 import dask
 from typing import Union
 import numpy as np
-
+import gc
 from pylibcugraph import (
     MGGraph,
     ResourceHandle,
@@ -31,12 +30,17 @@
 )
 
 from dask.distributed import wait, default_client
+from cugraph.dask.common.part_utils import (
+    get_persisted_df_worker_map,
+    persist_dask_df_equal_parts_per_worker,
+)
 from cugraph.dask.common.input_utils import get_distributed_data
 from pylibcugraph import (
     get_two_hop_neighbors as pylibcugraph_get_two_hop_neighbors,
     select_random_vertices as pylibcugraph_select_random_vertices,
 )
 import cugraph.dask.comms.comms as Comms
+from dask import delayed
 
 
 class simpleDistributedGraphImpl:
@@ -86,33 +90,44 @@ def _make_plc_graph(
         num_edges,
     ):
 
+        weights = None
+        edge_ids = None
+        edge_types = None
+
         if simpleDistributedGraphImpl.edgeWeightCol in edata_x[0]:
-            values = edata_x[0][simpleDistributedGraphImpl.edgeWeightCol]
-            if values.dtype == "int32":
-                values = values.astype("float32")
-            elif values.dtype == "int64":
-                values = values.astype("float64")
-        else:
-            # Some algos require the graph to be weighted
-            values = cudf.Series(cupy.ones(len(edata_x[0]), dtype="float32"))
+            weights = _get_column_from_ls_dfs(
+                edata_x, simpleDistributedGraphImpl.edgeWeightCol
+            )
+            if weights.dtype == "int32":
+                weights = weights.astype("float32")
+            elif weights.dtype == "int64":
+                weights = weights.astype("float64")
 
         if simpleDistributedGraphImpl.edgeIdCol in edata_x[0]:
-            if simpleDistributedGraphImpl.edgeTypeCol not in edata_x[0]:
-                raise ValueError("Must provide both edge id and edge type")
-
-            values_id = edata_x[0][simpleDistributedGraphImpl.edgeIdCol]
-            values_etype = edata_x[0][simpleDistributedGraphImpl.edgeTypeCol]
-        else:
-            values_id, values_etype = None, None
+            edge_ids = _get_column_from_ls_dfs(
+                edata_x, simpleDistributedGraphImpl.edgeIdCol
+            )
+            if edata_x[0][src_col_name].dtype == "int64" and edge_ids.dtype != "int64":
+                edge_ids = edge_ids.astype("int64")
+                warnings.warn(
+                    f"Vertex type is int64 but edge id type is {edge_ids.dtype}"
+                    ", automatically casting edge id type to int64. "
+                    "This may cause extra memory usage.  Consider passing"
+                    " a int64 list of edge ids instead."
+                )
+        if simpleDistributedGraphImpl.edgeTypeCol in edata_x[0]:
+            edge_types = _get_column_from_ls_dfs(
+                edata_x, simpleDistributedGraphImpl.edgeTypeCol
+            )
 
         return MGGraph(
             resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
             graph_properties=graph_props,
-            src_array=edata_x[0][src_col_name],
-            dst_array=edata_x[0][dst_col_name],
-            weight_array=values,
-            edge_id_array=values_id,
-            edge_type_array=values_etype,
+            src_array=_get_column_from_ls_dfs(edata_x, src_col_name),
+            dst_array=_get_column_from_ls_dfs(edata_x, dst_col_name),
+            weight_array=weights,
+            edge_id_array=edge_ids,
+            edge_type_array=edge_types,
             store_transposed=store_transposed,
             num_edges=num_edges,
             do_expensive_check=False,
@@ -125,6 +140,9 @@ def __from_edgelist(
         source="source",
         destination="destination",
         edge_attr=None,
+        weight=None,
+        edge_id=None,
+        edge_type=None,
         renumber=True,
         store_transposed=False,
         legacy_renum_only=False,
@@ -157,12 +175,20 @@ def __from_edgelist(
                 "and destination parameters"
             )
         ddf_columns = s_col + d_col
-
+        _client = default_client()
+        workers = _client.scheduler_info()["workers"]
+        # Repartition to 2 partitions per GPU for memory efficient process
+        input_ddf = input_ddf.repartition(npartitions=len(workers) * 2)
         # The dataframe will be symmetrized iff the graph is undirected
         # otherwise, the inital dataframe will be returned
         if edge_attr is not None:
+            if weight is not None or edge_id is not None or edge_type is not None:
+                raise ValueError(
+                    "If specifying edge_attr, cannot specify weight/edge_id/edge_type"
+                )
             if isinstance(edge_attr, str):
-                edge_attr = [edge_attr]
+                weight = edge_attr
+                edge_attr = [weight]
             if not (set(edge_attr).issubset(set(input_ddf.columns))):
                 raise ValueError(
                     "edge_attr column name not found in input."
@@ -198,21 +224,39 @@ def __from_edgelist(
                         "undirected graph."
                     )
 
-            source_col, dest_col, value_col = symmetrize(
+        else:
+            value_col_names = {}
+            if weight is not None:
+                value_col_names[weight] = self.edgeWeightCol
+                self.properties.weighted = True
+            if edge_id is not None:
+                value_col_names[edge_id] = self.edgeIdCol
+            if edge_type is not None:
+                value_col_names[edge_type] = self.edgeTypeCol
+
+            if len(value_col_names.keys()) > 0:
+                input_ddf = input_ddf.rename(columns=value_col_names)
+            value_col_names = list(value_col_names.values())
+
+        ddf_columns += value_col_names
+        input_ddf = input_ddf[ddf_columns]
+
+        if len(value_col_names) == 0:
+            source_col, dest_col = symmetrize(
                 input_ddf,
                 source,
                 destination,
-                value_col_names,
                 multi=self.properties.multi_edge,
                 symmetrize=not self.properties.directed,
             )
-
+            value_col = None
         else:
-            input_ddf = input_ddf[ddf_columns]
-            source_col, dest_col = symmetrize(
+
+            source_col, dest_col, value_col = symmetrize(
                 input_ddf,
                 source,
                 destination,
+                value_col_names,
                 multi=self.properties.multi_edge,
                 symmetrize=not self.properties.directed,
             )
@@ -227,11 +271,9 @@ def __from_edgelist(
             # Multi column dask_cudf dataframe
             input_ddf = dask_cudf.concat([source_col, dest_col], axis=1)
 
-        if edge_attr is not None:
-            input_ddf[self.edgeWeightCol] = value_col[self.edgeWeightCol]
-            if len(edge_attr) == 3:
-                input_ddf[self.edgeIdCol] = value_col[self.edgeIdCol]
-                input_ddf[self.edgeTypeCol] = value_col[self.edgeTypeCol]
+        if value_col is not None:
+            for vc in value_col_names:
+                input_ddf[vc] = value_col[vc]
 
         self.input_df = input_ddf
 
@@ -268,19 +310,18 @@ def __from_edgelist(
             dst_col_name = self.renumber_map.renumbered_dst_col_name
 
         ddf = self.edgelist.edgelist_df
-
-        num_edges = len(ddf)
-        edge_data = get_distributed_data(ddf)
-
         graph_props = GraphProperties(
             is_multigraph=self.properties.multi_edge,
             is_symmetric=not self.properties.directed,
         )
-
-        _client = default_client()
-        self._plc_graph = {
-            w: _client.submit(
-                simpleDistributedGraphImpl._make_plc_graph,
+        ddf = ddf.repartition(npartitions=len(workers) * 2)
+        ddf = ddf.map_partitions(lambda df: df.copy())
+        ddf = persist_dask_df_equal_parts_per_worker(ddf, _client)
+        num_edges = len(ddf)
+        self._number_of_edges = num_edges
+        ddf = get_persisted_df_worker_map(ddf, _client)
+        delayed_tasks_d = {
+            w: delayed(simpleDistributedGraphImpl._make_plc_graph)(
                 Comms.get_session_id(),
                 edata,
                 graph_props,
@@ -288,12 +329,17 @@ def __from_edgelist(
                 dst_col_name,
                 store_transposed,
                 num_edges,
-                workers=[w],
             )
-            for w, edata in edge_data.worker_to_parts.items()
+            for w, edata in ddf.items()
         }
-
-        wait(self._plc_graph)
+        del ddf
+        self._plc_graph = {
+            w: _client.compute(delayed_task, workers=w, allow_other_workers=False)
+            for w, delayed_task in delayed_tasks_d.items()
+        }
+        wait(list(self._plc_graph.values()))
+        del delayed_tasks_d
+        _client.run(gc.collect)
 
     @property
     def renumbered(self):
@@ -391,7 +437,7 @@ def number_of_edges(self, directed_edges=False):
         Get the number of edges in the graph.
         """
         if self.edgelist is not None:
-            return len(self.edgelist.edgelist_df)
+            return self._number_of_edges
         else:
             raise RuntimeError("Graph is Empty")
 
@@ -1097,3 +1143,18 @@ def vertex_column_size(self):
     @property
     def _npartitions(self) -> int:
         return len(self._plc_graph)
+
+
+def _get_column_from_ls_dfs(lst_df, col_name):
+    """
+    This function concatenates the column
+    and drops it from the input list
+    """
+    len_df = sum([len(df) for df in lst_df])
+    if len_df == 0:
+        return lst_df[0][col_name]
+    output_col = cudf.concat([df[col_name] for df in lst_df], ignore_index=True)
+    for df in lst_df:
+        df.drop(columns=[col_name], inplace=True)
+    gc.collect()
+    return output_col
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
index 7ad694e62f5..d0c0ded5eb4 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
@@ -16,7 +16,6 @@
 from cugraph.structure.symmetrize import symmetrize
 from cugraph.structure.number_map import NumberMap
 import cugraph.dask.common.mg_utils as mg_utils
-import cupy
 import cudf
 import dask_cudf
 import cugraph.dask.comms.comms as Comms
@@ -24,7 +23,7 @@
 import numpy as np
 import warnings
 from cugraph.dask.structure import replication
-from typing import Union
+from typing import Union, Dict
 from pylibcugraph import (
     get_two_hop_neighbors as pylibcugraph_get_two_hop_neighbors,
     select_random_vertices as pylibcugraph_select_random_vertices,
@@ -46,25 +45,30 @@ class simpleGraphImpl:
     dstCol = "dst"
 
     class EdgeList:
-        def __init__(self, source, destination, edge_attr=None):
+        def __init__(
+            self,
+            source: str,
+            destination: str,
+            edge_attr: Union[cudf.DataFrame, Dict[str, cudf.DataFrame]] = None,
+        ):
             self.edgelist_df = cudf.DataFrame()
             self.edgelist_df[simpleGraphImpl.srcCol] = source
             self.edgelist_df[simpleGraphImpl.dstCol] = destination
             self.weights = False
             if edge_attr is not None:
-                self.weights = True
-                if isinstance(edge_attr, (list, tuple)):
-                    if len(edge_attr) == 3:
-                        self.edgelist_df[simpleGraphImpl.edgeWeightCol] = edge_attr[0]
-                        self.edgelist_df[simpleGraphImpl.edgeIdCol] = edge_attr[1]
-                        self.edgelist_df[simpleGraphImpl.edgeTypeCol] = edge_attr[2]
-                    elif len(edge_attr) == 1:
-                        self.edgelist_df[simpleGraphImpl.edgeWeightCol] = edge_attr[0]
-                    else:
-                        raise ValueError(
-                            "Illegal # of arguments provided" "for edge_attr"
-                        )
+                if isinstance(edge_attr, dict):
+                    if edge_attr[simpleGraphImpl.edgeWeightCol] is not None:
+                        self.weights = True
+
+                    for ea in [
+                        simpleGraphImpl.edgeIdCol,
+                        simpleGraphImpl.edgeTypeCol,
+                        simpleGraphImpl.edgeWeightCol,
+                    ]:
+                        if edge_attr[ea] is not None:
+                            self.edgelist_df[ea] = edge_attr[ea]
                 else:
+                    self.weights = True
                     self.edgelist_df[simpleGraphImpl.edgeWeightCol] = edge_attr
 
     class AdjList:
@@ -115,6 +119,9 @@ def __from_edgelist(
         source="source",
         destination="destination",
         edge_attr=None,
+        weight=None,
+        edge_id=None,
+        edge_type=None,
         renumber=True,
         legacy_renum_only=True,
         store_transposed=False,
@@ -144,15 +151,19 @@ def __from_edgelist(
         df_columns = s_col + d_col
 
         if edge_attr is not None:
+            if weight is not None or edge_id is not None or edge_type is not None:
+                raise ValueError(
+                    "If specifying edge_attr, cannot specify weight/edge_id/edge_type"
+                )
             if isinstance(edge_attr, str):
-                edge_attr = [edge_attr]
+                weight = edge_attr
+                edge_attr = [weight]
             if not (set(edge_attr).issubset(set(input_df.columns))):
                 raise ValueError(
                     f"edge_attr column {edge_attr} not found in input."
                     "Recheck the edge_attr parameter"
                 )
             self.properties.weighted = True
-            df_columns += edge_attr
 
             if len(edge_attr) != 1 and len(edge_attr) != 3:
                 raise ValueError(
@@ -170,6 +181,18 @@ def __from_edgelist(
                         "undirected graph."
                     )
 
+                weight, edge_id, edge_type = edge_attr
+        else:
+            edge_attr = []
+            if weight is not None:
+                edge_attr.append(weight)
+                self.properties.weighted = True
+            if edge_id is not None:
+                edge_attr.append(edge_id)
+            if edge_type is not None:
+                edge_attr.append(edge_type)
+
+        df_columns += edge_attr
         input_df = input_df[df_columns]
         # FIXME: check if the consolidated graph fits on the
         # device before gathering all the edge lists
@@ -178,22 +201,21 @@ def __from_edgelist(
         if isinstance(input_df, cudf.DataFrame):
             if len(input_df[source]) > 2147483100:
                 raise ValueError(
-                    "cudf dataFrame edge list is too big " "to fit in a single GPU"
+                    "cudf dataFrame edge list is too big to fit in a single GPU"
                 )
             elist = input_df
         elif isinstance(input_df, dask_cudf.DataFrame):
             if len(input_df[source]) > 2147483100:
                 raise ValueError(
-                    "dask_cudf dataFrame edge list is too big " "to fit in a single GPU"
+                    "dask_cudf dataFrame edge list is too big to fit in a single GPU"
                 )
             elist = input_df.compute().reset_index(drop=True)
         else:
-            raise TypeError(
-                "input should be a cudf.DataFrame or " "a dask_cudf dataFrame"
-            )
+            raise TypeError("input should be a cudf.DataFrame or a dask_cudf dataFrame")
 
         # Original, unmodified input dataframe.
         self.input_df = elist
+
         # Renumbering
         self.renumber_map = None
         self.store_transposed = store_transposed
@@ -232,6 +254,7 @@ def __from_edgelist(
                 multi=self.properties.multi_edge,
                 symmetrize=not self.properties.directed,
             )
+
             if isinstance(value_col, cudf.DataFrame):
                 value_dict = {}
                 for i in value_col.columns:
@@ -248,7 +271,13 @@ def __from_edgelist(
             )
 
         if isinstance(value_col, dict):
-            value_col = [value_col[ea] for ea in edge_attr]
+            value_col = {
+                self.edgeWeightCol: value_col[weight] if weight in value_col else None,
+                self.edgeIdCol: value_col[edge_id] if edge_id in value_col else None,
+                self.edgeTypeCol: value_col[edge_type]
+                if edge_type in value_col
+                else None,
+            }
 
         self.edgelist = simpleGraphImpl.EdgeList(source_col, dest_col, value_col)
 
@@ -361,11 +390,14 @@ def view_edge_list(self):
         -------
         df : cudf.DataFrame
             This cudf.DataFrame wraps source, destination and weight
+
             df[src] : cudf.Series
                 contains the source index for each edge
+
             df[dst] : cudf.Series
                 contains the destination index for each edge
-            df[weight] : cusd.Series
+
+            df[weight] : cudf.Series
                 Column is only present for weighted Graph,
                 then containing the weight value for each edge
         """
@@ -745,9 +777,11 @@ def in_degree(self, vertex_subset=None):
             vertices (vertex_subset) containing the in_degree. The ordering is
             relative to the adjacency list, or that given by the specified
             vertex_subset.
+
             df[vertex] : cudf.Series
                 The vertex IDs (will be identical to vertex_subset if
                 specified).
+
             df[degree] : cudf.Series
                 The computed in-degree of the corresponding vertex.
 
@@ -785,9 +819,11 @@ def out_degree(self, vertex_subset=None):
             vertices (vertex_subset) containing the out_degree. The ordering is
             relative to the adjacency list, or that given by the specified
             vertex_subset.
+
             df[vertex] : cudf.Series
                 The vertex IDs (will be identical to vertex_subset if
                 specified).
+
             df[degree] : cudf.Series
                 The computed out-degree of the corresponding vertex.
 
@@ -824,9 +860,11 @@ def degree(self, vertex_subset=None):
             vertices (vertex_subset) containing the degree. The ordering is
             relative to the adjacency list, or that given by the specified
             vertex_subset.
+
             df['vertex'] : cudf.Series
                 The vertex IDs (will be identical to vertex_subset if
                 specified).
+
             df['degree'] : cudf.Series
                 The computed degree of the corresponding vertex.
 
@@ -863,11 +901,14 @@ def degrees(self, vertex_subset=None):
             vertices (vertex_subset) containing the degrees. The ordering is
             relative to the adjacency list, or that given by the specified
             vertex_subset.
+
             df['vertex'] : cudf.Series
                 The vertex IDs (will be identical to vertex_subset if
                 specified).
+
             df['in_degree'] : cudf.Series
                 The in-degree of the vertex.
+
             df['out_degree'] : cudf.Series
                 The out-degree of the vertex.
 
@@ -945,16 +986,20 @@ def _degree(self, vertex_subset, direction=Direction.ALL):
 
         return df
 
-    def _make_plc_graph(self, value_col=None, store_transposed=False, renumber=True):
+    def _make_plc_graph(
+        self,
+        value_col: Dict[str, cudf.DataFrame] = None,
+        store_transposed: bool = False,
+        renumber: bool = True,
+    ):
         """
         Parameters
         ----------
-        value_col : cudf.DataFrame or tuple[cudf.DataFrame]
+        value_col : cudf.DataFrame or dict[str, cudf.DataFrame]
             If a single dataframe is provided, this is assumed
             to contain the edge weight values.
-            If a tuple of dataframes is provided, then it is
-            assumed to contain edge weights, edge ids, and
-            edge types, in that order.
+            If a dictionary of dataframes is provided, then it is
+            assumed to contain edge properties.
         store_transposed : bool (default=False)
             Whether to store the graph in a transposed
             format.  Required by some algorithms.
@@ -968,11 +1013,10 @@ def _make_plc_graph(self, value_col=None, store_transposed=False, renumber=True)
             weight_col, id_col, type_col = None, None, None
         elif isinstance(value_col, (cudf.DataFrame, cudf.Series)):
             weight_col, id_col, type_col = value_col, None, None
-        elif isinstance(value_col, list):
-            if len(value_col) == 3:
-                weight_col, id_col, type_col = value_col
-            elif len(value_col) == 1:
-                weight_col, id_col, type_col = value_col[0], None, None
+        elif isinstance(value_col, dict):
+            weight_col = value_col[self.edgeWeightCol]
+            id_col = value_col[self.edgeIdCol]
+            type_col = value_col[self.edgeTypeCol]
         else:
             raise ValueError(f"Illegal value col {type(value_col)}")
 
@@ -985,20 +1029,12 @@ def _make_plc_graph(self, value_col=None, store_transposed=False, renumber=True)
             input_array_format = "COO"
             src_or_offset_array = self.edgelist.edgelist_df[simpleGraphImpl.srcCol]
             dst_or_index_array = self.edgelist.edgelist_df[simpleGraphImpl.dstCol]
-            if weight_col is None:
-                # Some algos require the graph to be weighted
-                weight_col = cudf.Series(
-                    cupy.ones(len(self.edgelist.edgelist_df), dtype="float32")
-                )
+
         elif self.adjlist is not None:
             input_array_format = "CSR"
             src_or_offset_array = self.adjlist.offsets
             dst_or_index_array = self.adjlist.indices
-            if weight_col is None:
-                # Some algos require the graph to be weighted
-                weight_col = cudf.Series(
-                    cupy.ones(len(self.adjlist.indices), dtype="float32")
-                )
+
         else:
             raise TypeError(
                 "Edges need to be represented in either in COO or CSR format."
@@ -1012,6 +1048,16 @@ def _make_plc_graph(self, value_col=None, store_transposed=False, renumber=True)
             if weight_t == "int64":
                 weight_col = weight_col.astype("float64")
 
+        if id_col is not None:
+            if src_or_offset_array.dtype == "int64" and id_col.dtype != "int64":
+                id_col = id_col.astype("int64")
+                warnings.warn(
+                    f"Vertex type is int64 but edge id type is {id_col.dtype}"
+                    ", automatically casting edge id type to int64. "
+                    "This may cause extra memory usage.  Consider passing"
+                    " a int64 list of edge ids instead."
+                )
+
         self._plc_graph = SGGraph(
             resource_handle=ResourceHandle(),
             graph_properties=graph_props,
@@ -1088,14 +1134,8 @@ def has_node(self, n):
         """
         Returns True if the graph contains the node n.
         """
-        if self.properties.renumbered:
-            tmp = self.renumber_map.to_internal_vertex_id(cudf.Series([n]))
-            return tmp[0] is not cudf.NA and tmp[0] >= 0
-        else:
-            df = self.edgelist.edgelist_df[
-                [simpleGraphImpl.srcCol, simpleGraphImpl.dstCol]
-            ]
-            return (df == n).any().any()
+
+        return (self.nodes() == n).any().any()
 
     def has_edge(self, u, v):
         """
@@ -1139,7 +1179,8 @@ def edges(self):
 
     def nodes(self):
         """
-        Returns all the nodes in the graph as a cudf.Series.
+        Returns all the nodes in the graph as a cudf.Series, in order of appearance
+        in the edgelist (source column first, then destination column).
         If multi columns vertices, return a cudf.DataFrame.
         """
         if self.edgelist is not None:
diff --git a/python/cugraph/cugraph/structure/graph_utilities.pxd b/python/cugraph/cugraph/structure/graph_utilities.pxd
index 74edb61fafa..0bf0f829d1b 100644
--- a/python/cugraph/cugraph/structure/graph_utilities.pxd
+++ b/python/cugraph/cugraph/structure/graph_utilities.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -28,64 +28,6 @@ from pylibraft.common.handle cimport handle_t
 
 # C++ graph utilities
 cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":
-
-    ctypedef enum numberTypeEnum:
-        int32Type "cugraph::cython::numberTypeEnum::int32Type"
-        int64Type "cugraph::cython::numberTypeEnum::int64Type"
-        floatType "cugraph::cython::numberTypeEnum::floatType"
-        doubleType "cugraph::cython::numberTypeEnum::doubleType"
-
-    cdef cppclass graph_container_t:
-       pass
-
-    cdef void populate_graph_container(
-        graph_container_t &graph_container,
-        handle_t &handle,
-        void *src_vertices,
-        void *dst_vertices,
-        void *weights,
-        void *vertex_partition_offsets,
-        void *segment_offsets,
-        size_t num_segments,
-        numberTypeEnum vertexType,
-        numberTypeEnum edgeType,
-        numberTypeEnum weightType,
-        size_t num_local_edges,
-        size_t num_global_vertices,
-        size_t num_global_edges,
-        bool is_weighted,
-        bool is_symmetric,
-        bool transposed,
-        bool multi_gpu) except +
-
-    ctypedef enum graphTypeEnum:
-        LegacyCSR "cugraph::cython::graphTypeEnum::LegacyCSR"
-        LegacyCSC "cugraph::cython::graphTypeEnum::LegacyCSC"
-        LegacyCOO "cugraph::cython::graphTypeEnum::LegacyCOO"
-
-    cdef cppclass cy_multi_edgelists_t:
-        size_t number_of_vertices
-        size_t number_of_edges
-        size_t number_of_subgraph
-        unique_ptr[device_buffer] src_indices
-        unique_ptr[device_buffer] dst_indices
-        unique_ptr[device_buffer] edge_data
-        unique_ptr[device_buffer] subgraph_offsets
-
-    cdef cppclass random_walk_ret_t:
-        size_t coalesced_sz_v_
-        size_t coalesced_sz_w_
-        size_t num_paths_
-        size_t max_depth_
-        unique_ptr[device_buffer] d_coalesced_v_
-        unique_ptr[device_buffer] d_coalesced_w_
-        unique_ptr[device_buffer] d_sizes_
-
-    cdef cppclass random_walk_path_t:
-        unique_ptr[device_buffer] d_v_offsets
-        unique_ptr[device_buffer] d_w_sizes
-        unique_ptr[device_buffer] d_w_offsets
-
     cdef cppclass graph_generator_t:
         unique_ptr[device_buffer] d_source
         unique_ptr[device_buffer] d_destination
@@ -93,91 +35,4 @@ cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":
 cdef extern from "<utility>" namespace "std" nogil:
     cdef device_buffer move(device_buffer)
     cdef unique_ptr[device_buffer] move(unique_ptr[device_buffer])
-    cdef cy_multi_edgelists_t move(cy_multi_edgelists_t)
-    cdef unique_ptr[cy_multi_edgelists_t] move(unique_ptr[cy_multi_edgelists_t])
-
-# renumber_edgelist() interface utilities:
-#
-#
-# 1. `cdef extern partition_t`:
-#
-cdef extern from "cugraph/graph_view.hpp" namespace "cugraph":
-
-    cdef cppclass partition_t[vertex_t]:
-        pass
-
-
-# 2. return type for shuffle:
-#
-cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":
-
-    cdef cppclass major_minor_weights_t[vertex_t, edge_t, weight_t]:
-        major_minor_weights_t(const handle_t &handle)
-        pair[unique_ptr[device_buffer], size_t] get_major_wrap()
-        pair[unique_ptr[device_buffer], size_t] get_minor_wrap()
-        pair[unique_ptr[device_buffer], size_t] get_weights_wrap()
-        unique_ptr[vector[edge_t]] get_edge_counts_wrap()
-
-
-ctypedef fused shuffled_vertices_t:
-    major_minor_weights_t[int, int, float]
-    major_minor_weights_t[int, int, double]
-    major_minor_weights_t[int, long, float]
-    major_minor_weights_t[int, long, double]
-    major_minor_weights_t[long, long, float]
-    major_minor_weights_t[long, long, double]
-
-# 3. return type for renumber:
-#
-cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":
-
-    cdef cppclass renum_tuple_t[vertex_t, edge_t]:
-        renum_tuple_t(const handle_t &handle)
-        pair[unique_ptr[device_buffer], size_t] get_dv_wrap()
-        vertex_t& get_num_vertices()
-        edge_t& get_num_edges()
-        vector[vertex_t]& get_segment_offsets()
-        unique_ptr[vector[vertex_t]] get_segment_offsets_wrap()
-        int get_part_row_size()
-        int get_part_col_size()
-        int get_part_comm_rank()
-        unique_ptr[vector[vertex_t]] get_partition_offsets_wrap()
-        pair[vertex_t, vertex_t] get_part_local_vertex_range()
-        vertex_t get_part_local_vertex_first()
-        vertex_t get_part_local_vertex_last()
-        pair[vertex_t, vertex_t] get_part_vertex_partition_range(size_t vertex_partition_idx)
-        vertex_t get_part_vertex_partition_first(size_t vertex_partition_idx)
-        vertex_t get_part_vertex_partition_last(size_t vertex_partition_idx)
-        vertex_t get_part_vertex_partition_size(size_t vertex_partition_idx)
-        size_t get_part_number_of_matrix_partitions()
-        vertex_t get_part_matrix_partition_major_first(size_t partition_idx)
-        vertex_t get_part_matrix_partition_major_last(size_t partition_idx)
-        vertex_t get_part_matrix_partition_major_value_start_offset(size_t partition_idx)
-        pair[vertex_t, vertex_t] get_part_matrix_partition_minor_range()
-        vertex_t get_part_matrix_partition_minor_first()
-        vertex_t get_part_matrix_partition_minor_last()
-
-# 4. `sort_and_shuffle_values()` wrapper:
-#
-cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":
-
-    cdef unique_ptr[major_minor_weights_t[vertex_t, edge_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t](
-        const handle_t &handle,
-        vertex_t *edgelist_major_vertices,
-        vertex_t *edgelist_minor_vertices,
-        weight_t* edgelist_weights,
-        edge_t num_edges,
-        bool is_weighted) except +
-
-# 5. `renumber_edgelist()` wrapper
-#
-cdef extern from "cugraph/utilities/cython.hpp" namespace "cugraph::cython":
 
-    cdef unique_ptr[renum_tuple_t[vertex_t, edge_t]] call_renumber[vertex_t, edge_t](
-        const handle_t &handle,
-        vertex_t *edgelist_major_vertices,
-        vertex_t *edgelist_minor_vertices,
-        const vector[edge_t]& edge_counts,
-        bool store_transposed,
-        bool do_check,
-        bool multi_gpu) except +
diff --git a/python/cugraph/cugraph/structure/hypergraph.py b/python/cugraph/cugraph/structure/hypergraph.py
index 0397905b2d0..4e9975e6b8a 100644
--- a/python/cugraph/cugraph/structure/hypergraph.py
+++ b/python/cugraph/cugraph/structure/hypergraph.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -325,7 +325,7 @@ def _create_entity_nodes(
 
     for key, col in events[columns].items():
         cat = categories.get(key, key)
-        col = col.unique()
+        col = col.unique().sort_values()
         col = col.nans_to_nulls().dropna() if dropna else col
         if len(col) == 0:
             continue
diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py
index 3bf7faea6cc..36ce5baa212 100644
--- a/python/cugraph/cugraph/structure/property_graph.py
+++ b/python/cugraph/cugraph/structure/property_graph.py
@@ -488,7 +488,7 @@ def get_num_edges(self, type=None):
     def get_vertices(self, selection=None):
         """
         Return a Series containing the unique vertex IDs contained in both
-        the vertex and edge property data.
+        the vertex and edge property data in ascending order.
         Selection is not yet supported.
 
         Parameters
@@ -530,12 +530,11 @@ def get_vertices(self, selection=None):
         if vert_sers:
             if self.__series_type is cudf.Series:
                 return self.__series_type(
-                    cudf.concat(vert_sers, ignore_index=True).unique()
+                    cudf.concat(vert_sers, ignore_index=True).unique().sort_values()
                 )
             else:
-                return self.__series_type(
-                    pd.concat(vert_sers, ignore_index=True).unique()
-                )
+                x = pd.Series(pd.concat(vert_sers, ignore_index=True).unique())
+                return self.__series_type(x.sort_values())
         return self.__series_type()
 
     def vertices_ids(self):
diff --git a/python/cugraph/cugraph/structure/symmetrize.py b/python/cugraph/cugraph/structure/symmetrize.py
index dd2dea090ee..4b159b279c0 100644
--- a/python/cugraph/cugraph/structure/symmetrize.py
+++ b/python/cugraph/cugraph/structure/symmetrize.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -14,6 +14,7 @@
 from cugraph.structure import graph_classes as csg
 import cudf
 import dask_cudf
+from dask.distributed import default_client
 
 
 def symmetrize_df(
@@ -77,13 +78,7 @@ def symmetrize_df(
         weight_name = [weight_name]
 
     if symmetrize:
-        if weight_name:
-            df2 = df[[*dst_name, *src_name, *weight_name]]
-            df2.columns = [*src_name, *dst_name, *weight_name]
-        else:
-            df2 = df[[*dst_name, *src_name]]
-            df2.columns = [*src_name, *dst_name]
-        result = cudf.concat([df, df2]).reset_index(drop=True)
+        result = _add_reverse_edges(df, src_name, dst_name, weight_name)
     else:
         result = df
     if multi:
@@ -155,6 +150,8 @@ def symmetrize_ddf(
 
     """
     # FIXME: Uncomment out the above (broken) example
+    _client = default_client()
+    workers = _client.scheduler_info()["workers"]
 
     if not isinstance(src_name, list):
         src_name = [src_name]
@@ -164,29 +161,16 @@ def symmetrize_ddf(
         weight_name = [weight_name]
 
     if symmetrize:
-        if weight_name:
-            ddf2 = ddf[[*dst_name, *src_name, *weight_name]]
-            ddf2.columns = [*src_name, *dst_name, *weight_name]
-        else:
-            ddf2 = ddf[[*dst_name, *src_name]]
-            ddf2.columns = [*src_name, *dst_name]
-        result = dask_cudf.concat([ddf, ddf2]).reset_index(drop=True)
+        result = ddf.map_partitions(_add_reverse_edges, src_name, dst_name, weight_name)
     else:
         result = ddf
     if multi:
-        # The concat call doubles the number of partitions therefore,
-        # repartition the result so that the number of partitions equals
-        # the number of workers
-        result = result.repartition(npartitions=ddf.npartitions)
         return result
     else:
         vertex_col_name = src_name + dst_name
-        result = (
-            result.groupby(by=[*vertex_col_name])
-            .min(split_out=ddf.npartitions)
-            .reset_index()
+        result = _memory_efficient_drop_duplicates(
+            result, vertex_col_name, len(workers)
         )
-
         return result
 
 
@@ -283,3 +267,38 @@ def symmetrize(
             )
 
     return output_df[source_col_name], output_df[dest_col_name]
+
+
+def _add_reverse_edges(df, src_name, dst_name, weight_name):
+    """
+    Add reverse edges to the input dataframe.
+    args:
+        df: cudf.DataFrame or dask_cudf.DataFrame
+        src_name: str
+            source column name
+        dst_name: str
+            destination column name
+        weight_name: str
+            weight column name
+    """
+    if weight_name:
+        reverse_df = df[[*dst_name, *src_name, *weight_name]]
+        reverse_df.columns = [*src_name, *dst_name, *weight_name]
+    else:
+        reverse_df = df[[*dst_name, *src_name]]
+        reverse_df.columns = [*src_name, *dst_name]
+    return cudf.concat([df, reverse_df], ignore_index=True)
+
+
+def _memory_efficient_drop_duplicates(ddf, vertex_col_name, num_workers):
+    """
+    Drop duplicate edges from the input dataframe.
+    """
+    # drop duplicates has a 5x+ overhead
+    # and does not seem to be working as expected
+    # TODO: Triage an MRE
+    ddf = ddf.reset_index(drop=True).repartition(npartitions=num_workers * 2)
+    ddf = ddf.groupby(by=[*vertex_col_name], as_index=False).min(
+        split_out=num_workers * 2
+    )
+    return ddf
diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py
index e69de29bb2d..db1c574de21 100644
--- a/python/cugraph/cugraph/testing/__init__.py
+++ b/python/cugraph/cugraph/testing/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.testing.utils import (
+    RAPIDS_DATASET_ROOT_DIR_PATH,
+)
diff --git a/python/cugraph/cugraph/testing/mg_utils.py b/python/cugraph/cugraph/testing/mg_utils.py
index 614840a1536..1e1a481e4d6 100644
--- a/python/cugraph/cugraph/testing/mg_utils.py
+++ b/python/cugraph/cugraph/testing/mg_utils.py
@@ -13,21 +13,93 @@
 
 import os
 import tempfile
-
+from pprint import pformat
+import time
+from dask.distributed import wait, default_client
+from dask import persist
 from dask.distributed import Client
+from dask.base import is_dask_collection
 from dask_cuda import LocalCUDACluster
 from dask_cuda.initialize import initialize
-
 from cugraph.dask.comms import comms as Comms
 from cugraph.dask.common.mg_utils import get_visible_devices
+from cugraph.generators import rmat
+import numpy as np
 
 
 def start_dask_client(
     protocol=None,
     rmm_pool_size=None,
     dask_worker_devices=None,
+    jit_unspill=False,
+    device_memory_limit=0.8,
 ):
+    """
+    Creates a new dask client, and possibly also a cluster, and returns them as
+    a tuple (client, cluster).
+
+    If the env var SCHEDULER_FILE is set, it is assumed to contain the path to
+    a JSON file generated by a running dask scheduler that can be used to
+    configure the new dask client (the new client object returned will be a
+    client to that scheduler), and the value of cluster will be None. If
+    SCHEDULER_FILE is not set, a new LocalCUDACluster will be created and
+    returned as the value of cluster.
+
+    If the env var DASK_WORKER_DEVICES is set, it will be assumed to be a list
+    of comma-separated GPU devices (ex. "0,1,2" for those 3 devices) for the
+    LocalCUDACluster to use when setting up individual workers (1 worker per
+    device). If not set, the parameter dask_worker_devices will be used the
+    same way instead. If neither are set, the new LocalCUDACluster instance
+    will default to one worker per device visible to this process.
+
+    If the env var DASK_LOCAL_DIRECTORY is set, it will be used as the
+    "local_directory" arg to LocalCUDACluster, for all temp files generated.
+
+    Upon successful creation of a client (either to a LocalCUDACluster or
+    otherwise), the cugraph.dask.comms.comms singleton is initialized using
+    "p2p=True".
+
+    Parameters
+    ----------
+    protocol : str or None, default None
+        The "protocol" arg to LocalCUDACluster (ex. "tcp"), see docs for
+        dask_cuda.LocalCUDACluster for details. This parameter is ignored if
+        the env var SCHEDULER_FILE is set which implies the dask cluster has
+        already been created.
+
+    rmm_pool_size : int, str or None, default None
+        The "rmm_pool_size" arg to LocalCUDACluster (ex. "20GB"), see docs for
+        dask_cuda.LocalCUDACluster for details. This parameter is ignored if
+        the env var SCHEDULER_FILE is set which implies the dask cluster has
+        already been created.
+
+    dask_worker_devices : str, list of int, or None, default None
+        GPUs to restrict activity to. Can be a string (like ``"0,1,2,3"``),
+        list (like ``[0, 1, 2, 3]``), or ``None`` to use all available GPUs.
+        This parameter is overridden by the value of env var
+        DASK_WORKER_DEVICES. This parameter is ignored if the env var
+        SCHEDULER_FILE is set which implies the dask cluster has already been
+        created.
+
+    jit_unspill : bool or None, default None
+        The "jit_unspill" arg to LocalCUDACluster to enable just-in-time
+        spilling, see docs for dask_cuda.LocalCUDACluster for details. This
+        parameter is ignored if the env var SCHEDULER_FILE is set which implies
+        the dask cluster has already been created.
+
+    device_memory_limit : int, float, str, or None, default 0.8
+        The "device_memory_limit" arg to LocalCUDACluster to determine when
+        workers start spilling to host memory, see docs for
+        dask_cuda.LocalCUDACluster for details. This parameter is ignored if
+        the env var SCHEDULER_FILE is set which implies the dask cluster has
+        already been created.
+    """
     dask_scheduler_file = os.environ.get("SCHEDULER_FILE")
+    dask_local_directory = os.getenv("DASK_LOCAL_DIRECTORY")
+    # Allow the DASK_WORKER_DEVICES env var to override a value passed in. If
+    # neither are set, this will be None.
+    dask_worker_devices = os.getenv("DASK_WORKER_DEVICES", dask_worker_devices)
+
     cluster = None
     client = None
     tempdir_object = None
@@ -48,23 +120,45 @@ def start_dask_client(
                 f"WARNING: {dask_worker_devices=} is ignored in start_dask_client() "
                 "when using dask SCHEDULER_FILE"
             )
-
         initialize()
         client = Client(scheduler_file=dask_scheduler_file)
-        print("\ndask_client created using " f"{dask_scheduler_file}")
+        # FIXME: use proper logging, INFO or DEBUG level
+        print("\nDask client created using " f"{dask_scheduler_file}")
     else:
-        # The tempdir created by tempdir_object should be cleaned up once
-        # tempdir_object goes out-of-scope and is deleted.
-        tempdir_object = tempfile.TemporaryDirectory()
+        if dask_local_directory is None:
+            # The tempdir created by tempdir_object should be cleaned up once
+            # tempdir_object is deleted.
+            tempdir_object = tempfile.TemporaryDirectory()
+            local_directory = tempdir_object.name
+        else:
+            local_directory = dask_local_directory
+
         cluster = LocalCUDACluster(
-            local_directory=tempdir_object.name,
+            local_directory=local_directory,
             protocol=protocol,
             rmm_pool_size=rmm_pool_size,
             CUDA_VISIBLE_DEVICES=dask_worker_devices,
+            jit_unspill=jit_unspill,
+            device_memory_limit=device_memory_limit,
         )
         client = Client(cluster)
-        client.wait_for_workers(len(get_visible_devices()))
-        print("\ndask_client created using LocalCUDACluster")
+
+        if dask_worker_devices is None:
+            num_workers = len(get_visible_devices())
+        else:
+            if isinstance(dask_worker_devices, list):
+                num_workers = len(dask_worker_devices)
+            else:
+                # FIXME: this assumes a properly formatted string with commas
+                num_workers = len(dask_worker_devices.split(","))
+
+        client.wait_for_workers(num_workers)
+        # Add a reference to tempdir_object to the client to prevent it from
+        # being deleted when this function returns. This will be deleted in
+        # stop_dask_client()
+        client.tempdir_object = tempdir_object
+        # FIXME: use proper logging, INFO or DEBUG level
+        print("\nDask client/cluster created using LocalCUDACluster")
 
     Comms.initialize(p2p=True)
 
@@ -72,8 +166,222 @@ def start_dask_client(
 
 
 def stop_dask_client(client, cluster=None):
+    """
+    Shutdown/cleanup a client and possibly cluster object returned from
+    start_dask_client(). This also stops the cugraph.dask.comms.comms
+    singleton.
+    """
     Comms.destroy()
     client.close()
     if cluster:
         cluster.close()
-    print("\ndask_client closed.")
+    # Remove a TemporaryDirectory object that may have been assigned to the
+    # client, which should remove it and all the contents from disk.
+    if hasattr(client, "tempdir_object"):
+        del client.tempdir_object
+    # FIXME: use proper logging, INFO or DEBUG level
+    print("\nDask client closed.")
+
+
+def restart_client(client):
+    """
+    Restart the Dask client
+    """
+    Comms.destroy()
+    client.restart()
+    client = client.run(enable_spilling)
+    Comms.initialize(p2p=True)
+
+
+def enable_spilling():
+    import cudf
+
+    cudf.set_option("spill", True)
+
+
+def generate_edgelist_rmat(
+    scale,
+    edgefactor,
+    seed=None,
+    unweighted=False,
+    mg=True,
+):
+    """
+    Returns a cudf/dask_cudf DataFrame created using the R-MAT graph generator.
+
+    The resulting graph is weighted with random values of a uniform distribution
+    from the interval [0, 1)
+    Args:
+        scale:
+            scale is used to determine the number of vertices to be generated (num_verts
+            = 2^scale), which is also used to determine the data type for the vertex ID
+            values in the DataFrame.
+        edgefactor:
+            edgefactor determies the number of edges (num_edges = num_edges*edgefactor)
+        seed:
+            seed, if specified, will be used as the seed to the RNG.
+        unweighted:
+            unweighted determines if the resulting edgelist will have randomly-generated
+            weights ranging in value between [0, 1). If True, an edgelist with only 2
+            columns is returned.
+        mg:
+            mg determines if the resulting edgelist will be a multi-GPU edgelist.
+            If True, returns a dask_cudf.DataFrame and
+            if False, returns a cudf.DataFrame.
+    """
+    ddf = rmat(
+        scale,
+        (2**scale) * edgefactor,
+        0.57,  # from Graph500
+        0.19,  # from Graph500
+        0.19,  # from Graph500
+        seed or 42,
+        clip_and_flip=False,
+        scramble_vertex_ids=True,
+        create_using=None,  # return edgelist instead of Graph instance
+        mg=mg,
+    )
+    if not unweighted:
+        rng = np.random.default_rng(seed)
+        ddf["weight"] = ddf.map_partitions(lambda df: rng.random(size=len(df)))
+    return ddf
+
+
+def set_statistics_adaptor():
+    """
+    Sets the current device resource to a StatisticsResourceAdaptor
+    """
+    import rmm
+
+    rmm.mr.set_current_device_resource(
+        rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())
+    )
+
+
+def _get_allocation_counts():
+    """
+    Returns the allocation counts from the current device resource
+    """
+    import rmm
+
+    mr = rmm.mr.get_current_device_resource()
+    if not hasattr(mr, "allocation_counts"):
+        if hasattr(mr, "upstream_mr"):
+            return _get_allocation_counts(mr.upstream_mr)
+        else:
+            return -1
+    else:
+        return mr.allocation_counts
+
+
+def persist_dask_object(arg):
+    """
+    Persist if it is a dask object
+    """
+    if is_dask_collection(arg) or hasattr(arg, "persist"):
+        arg = persist(arg)
+        wait(arg)
+        arg = arg[0]
+    return arg
+
+
+# Function to convert bytes into human readable format
+def sizeof_fmt(num, suffix="B"):
+    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
+        if abs(num) < 1024.0:
+            return "%3.1f%s%s" % (num, unit, suffix)
+        num /= 1024.0
+    return "%.1f%s%s" % (num, "Yi", suffix)
+
+
+def _parse_allocation_counts(allocation_counts):
+    """
+    Parses the allocation counts from the current device resource
+    into human readable format
+    """
+    return {k: sizeof_fmt(v) for k, v in allocation_counts.items() if "bytes" in k}
+
+
+# Decorator to set the statistics adaptor
+# and calls the allocation_counts function
+def get_allocation_counts_dask_lazy(return_allocations=False, logging=True):
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            client = default_client()
+            client.run(set_statistics_adaptor)
+            st = time.time()
+            return_val = func(*args, **kwargs)
+            et = time.time()
+            allocation_counts = client.run(_get_allocation_counts)
+            if logging:
+                _print_allocation_statistics(
+                    func, args, kwargs, et - st, allocation_counts
+                )
+            client.run(set_statistics_adaptor)
+            if return_allocations:
+                return return_val, allocation_counts
+            else:
+                return return_val
+
+        return wrapper
+
+    return decorator
+
+
+def get_allocation_counts_dask_persist(return_allocations=False, logging=True):
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            args = [persist_dask_object(a) for a in args]
+            kwargs = {k: persist_dask_object(v) for k, v in kwargs.items()}
+            client = default_client()
+            client.run(set_statistics_adaptor)
+            st = time.time()
+            return_val = func(*args, **kwargs)
+            return_val = persist_dask_object(return_val)
+            if isinstance(return_val, (list, tuple)):
+                return_val = [persist_dask_object(d) for d in return_val]
+            et = time.time()
+            allocation_counts = client.run(_get_allocation_counts)
+            if logging:
+                _print_allocation_statistics(
+                    func, args, kwargs, et - st, allocation_counts
+                )
+            client.run(set_statistics_adaptor)
+            if return_allocations:
+                return return_val, allocation_counts
+            else:
+                return return_val
+
+        return wrapper
+
+    return decorator
+
+
+def _get_allocation_stats_string(func, args, kwargs, execution_time, allocation_counts):
+    allocation_counts_parsed = {
+        worker_id: _parse_allocation_counts(worker_allocations)
+        for worker_id, worker_allocations in allocation_counts.items()
+    }
+    return (
+        f"function:  {func.__name__}\n"
+        + f"function args: {args} kwargs: {kwargs}\n"
+        + f"execution_time: {execution_time}\n"
+        + "allocation_counts:\n"
+        + f"{pformat(allocation_counts_parsed, indent=4, width=1, compact=True)}"
+    )
+
+
+def _print_allocation_statistics(func, args, kwargs, execution_time, allocation_counts):
+    print(
+        _get_allocation_stats_string(
+            func, args, kwargs, execution_time, allocation_counts
+        )
+    )
+
+
+def get_peak_output_ratio_across_workers(allocation_counts):
+    peak_ratio = -1
+    for w_allocations in allocation_counts.values():
+        w_peak_ratio = w_allocations["peak_bytes"] / w_allocations["current_bytes"]
+        peak_ratio = max(w_peak_ratio, peak_ratio)
+    return peak_ratio
diff --git a/python/cugraph/cugraph/testing/utils.py b/python/cugraph/cugraph/testing/utils.py
index ce9acbca9b6..0dae17ed14e 100644
--- a/python/cugraph/cugraph/testing/utils.py
+++ b/python/cugraph/cugraph/testing/utils.py
@@ -131,7 +131,6 @@
 
 
 def read_csv_for_nx(csv_file, read_weights_in_sp=True, read_weights=True):
-    print("Reading " + str(csv_file) + "...")
     if read_weights:
         if read_weights_in_sp is True:
             df = pd.read_csv(
@@ -224,25 +223,23 @@ def create_obj_from_csv(
 
 
 def read_csv_file(csv_file, read_weights_in_sp=True):
-    print("Reading " + str(csv_file) + "...")
     if read_weights_in_sp is True:
         return cudf.read_csv(
             csv_file,
             delimiter=" ",
-            dtype=["int32", "int32", "float32"],
+            dtype={"0": "int32", "1": "int32", "2": "float32"},
             header=None,
         )
     else:
         return cudf.read_csv(
             csv_file,
             delimiter=" ",
-            dtype=["int32", "int32", "float64"],
+            dtype={"0": "int32", "1": "int32", "2": "float64"},
             header=None,
         )
 
 
 def read_dask_cudf_csv_file(csv_file, read_weights_in_sp=True, single_partition=True):
-    print("Reading " + str(csv_file) + "...")
     if read_weights_in_sp is True:
         if single_partition:
             chunksize = os.path.getsize(csv_file)
diff --git a/python/cugraph/cugraph/tests/community/test_leiden.py b/python/cugraph/cugraph/tests/community/test_leiden.py
index 4a04eac3500..9cbe0df2532 100644
--- a/python/cugraph/cugraph/tests/community/test_leiden.py
+++ b/python/cugraph/cugraph/tests/community/test_leiden.py
@@ -18,18 +18,135 @@
 
 import networkx as nx
 import cugraph
+import cudf
 from cugraph.testing import utils
-from cugraph.experimental.datasets import DATASETS, karate_asymmetric
+from cugraph.experimental.datasets import DATASETS_UNDIRECTED, karate_asymmetric
 
-# Temporarily suppress warnings till networkX fixes deprecation warnings
-# (Using or importing the ABCs from 'collections' instead of from
-# 'collections.abc' is deprecated, and in 3.8 it will stop working) for
-# python 3.7.  Also, these import community and import networkx need to be
-# relocated in the third-party group once this gets fixed.
-import warnings
+from cudf.testing.testing import assert_series_equal
 
-with warnings.catch_warnings():
-    warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+# =============================================================================
+# Test data
+# =============================================================================
+
+_test_data = {
+    "data_1": {
+        "graph": {
+            "src_or_offset_array": [0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5],
+            "dst_or_index_array": [1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4],
+            # fmt: off
+            "weight": [0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2, 0.1, 2.1, 1.1, 5.1,
+                       3.1, 4.1, 7.2, 3.2],
+            # fmt: on
+        },
+        "max_level": 10,
+        "resolution": 1.0,
+        "input_type": "COO",
+        "expected_output": {
+            "partition": [1, 0, 1, 2, 2, 2],
+            "modularity_score": 0.1757322,
+        },
+    },
+    "data_2": {
+        "graph": {
+            # fmt: off
+            "src_or_offset_array": [0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66,
+                                    67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93,
+                                    98, 101, 104, 106, 110, 113, 117, 121, 127, 139,
+                                    156],
+
+            "dst_or_index_array": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21,
+                                   31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8,
+                                   9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0,
+                                   6, 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32,
+                                   33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33,
+                                   32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1,
+                                   32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31,
+                                   29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1,
+                                   8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18,
+                                   20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, 18, 19,
+                                   20, 22, 23, 26, 27, 28, 29, 30, 31, 32],
+            "weight": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            # fmt: on
+        },
+        "max_level": 40,
+        "resolution": 1.0,
+        "input_type": "CSR",
+        "expected_output": {
+            # fmt: off
+            "partition": [6, 6, 3, 3, 1, 5, 5, 3, 0, 3, 1, 6, 3, 3, 4, 4, 5, 6, 4, 6, 4,
+                          6, 4, 4, 2, 2, 4, 4, 2, 4, 0, 2, 4, 4],
+            # fmt: on
+            "modularity_score": 0.3468113,
+        },
+    },
+}
+
+
+# =============================================================================
+# Pytest fixtures
+# =============================================================================
+@pytest.fixture(
+    scope="module",
+    params=[pytest.param(value, id=key) for (key, value) in _test_data.items()],
+)
+def input_and_expected_output(request):
+    d = request.param.copy()
+
+    input_graph_data = d.pop("graph")
+    input_type = d.pop("input_type")
+    src_or_offset_array = cudf.Series(
+        input_graph_data["src_or_offset_array"], dtype="int32"
+    )
+    dst_or_index_array = cudf.Series(
+        input_graph_data["dst_or_index_array"], dtype="int32"
+    )
+    weight = cudf.Series(input_graph_data["weight"], dtype="float32")
+
+    max_level = d.pop("max_level")
+    resolution = d.pop("resolution")
+    output = d
+
+    G = cugraph.Graph()
+
+    if input_type == "COO":
+        # Create graph from an edgelist
+        df = cudf.DataFrame()
+        df["src"] = src_or_offset_array
+        df["dst"] = dst_or_index_array
+        df["weight"] = cudf.Series(weight, dtype="float32")
+        G.from_cudf_edgelist(
+            df,
+            source="src",
+            destination="dst",
+            edge_attr="weight",
+            store_transposed=False,
+        )
+
+    elif input_type == "CSR":
+        # Create graph from csr
+        offsets = src_or_offset_array
+        indices = dst_or_index_array
+        G.from_cudf_adjlist(offsets, indices, weight)
+
+    parts, mod = cugraph.leiden(G, max_level, resolution)
+
+    parts = parts.sort_values("vertex").reset_index(drop=True)
+
+    output["result_output"] = {"partition": parts["partition"], "modularity_score": mod}
+
+    return output
 
 
 # =============================================================================
@@ -62,7 +179,7 @@ def cugraph_louvain(G):
 
 
 @pytest.mark.sg
-@pytest.mark.parametrize("graph_file", DATASETS)
+@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED)
 def test_leiden(graph_file):
     edgevals = True
 
@@ -70,15 +187,12 @@ def test_leiden(graph_file):
     leiden_parts, leiden_mod = cugraph_leiden(G)
     louvain_parts, louvain_mod = cugraph_louvain(G)
 
-    # Calculating modularity scores for comparison
-    # FIXME: If the datasets is not renumbered, the leiden parts will
-    # also include isolated vertices which will be reflected in the modularity
-    # score.
-    assert leiden_mod >= (0.97 * louvain_mod)
+    # Leiden modularity score is smaller than Louvain's
+    assert leiden_mod >= (0.75 * louvain_mod)
 
 
 @pytest.mark.sg
-@pytest.mark.parametrize("graph_file", DATASETS)
+@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED)
 def test_leiden_nx(graph_file):
     dataset_path = graph_file.get_path()
     NM = utils.read_csv_for_nx(dataset_path)
@@ -91,10 +205,8 @@ def test_leiden_nx(graph_file):
     louvain_parts, louvain_mod = cugraph_louvain(G)
 
     # Calculating modularity scores for comparison
-    # FIXME: If the datasets is not renumbered, the leiden parts will
-    # also include isolated vertices which will be reflected in the modularity
-    # score.
-    assert leiden_mod >= (0.97 * louvain_mod)
+    # Leiden modularity score is smaller than Louvain's
+    assert leiden_mod >= (0.75 * louvain_mod)
 
 
 @pytest.mark.sg
@@ -107,3 +219,20 @@ def test_leiden_directed_graph():
 
     with pytest.raises(ValueError):
         parts, mod = cugraph_leiden(G)
+
+
+@pytest.mark.sg
+def test_leiden_golden_results(input_and_expected_output):
+    expected_partition = cudf.Series(
+        input_and_expected_output["expected_output"]["partition"]
+    )
+    expected_mod = input_and_expected_output["expected_output"]["modularity_score"]
+
+    result_partition = input_and_expected_output["result_output"]["partition"]
+    result_mod = input_and_expected_output["result_output"]["modularity_score"]
+
+    assert abs(expected_mod - result_mod) < 0.0001
+
+    assert_series_equal(
+        expected_partition, result_partition, check_dtype=False, check_names=False
+    )
diff --git a/python/cugraph/cugraph/tests/community/test_leiden_mg.py b/python/cugraph/cugraph/tests/community/test_leiden_mg.py
new file mode 100644
index 00000000000..e76696e5769
--- /dev/null
+++ b/python/cugraph/cugraph/tests/community/test_leiden_mg.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import cugraph.dask as dcg
+
+import cugraph
+import dask_cudf
+from cugraph.testing import utils
+
+# from cugraph.dask.common.mg_utils import is_single_gpu
+
+try:
+    from rapids_pytest_benchmark import setFixtureParamNames
+except ImportError:
+    print(
+        "\n\nWARNING: rapids_pytest_benchmark is not installed, "
+        "falling back to pytest_benchmark fixtures.\n"
+    )
+
+    # if rapids_pytest_benchmark is not available, just perfrom time-only
+    # benchmarking and replace the util functions with nops
+    import pytest_benchmark
+
+    gpubenchmark = pytest_benchmark.plugin.benchmark
+
+    def setFixtureParamNames(*args, **kwargs):
+        pass
+
+
+# =============================================================================
+# Parameters
+# =============================================================================
+DATASETS_ASYMMETRIC = [utils.RAPIDS_DATASET_ROOT_DIR_PATH / "karate-asymmetric.csv"]
+
+
+###############################################################################
+# Fixtures
+# @pytest.mark.skipif(
+#    is_single_gpu(), reason="skipping MG testing on Single GPU system"
+# )
+@pytest.fixture(
+    scope="module",
+    params=DATASETS_ASYMMETRIC,
+    ids=[f"dataset={d.as_posix()}" for d in DATASETS_ASYMMETRIC],
+)
+def daskGraphFromDataset(request, dask_client):
+    """
+    Returns a new dask dataframe created from the dataset file param.
+    This creates a directed Graph.
+    """
+    # Since parameterized fixtures do not assign param names to param values,
+    # manually call the helper to do so.
+    setFixtureParamNames(request, ["dataset"])
+    dataset = request.param
+
+    chunksize = dcg.get_chunksize(dataset)
+    ddf = dask_cudf.read_csv(
+        dataset,
+        chunksize=chunksize,
+        delimiter=" ",
+        names=["src", "dst", "value"],
+        dtype=["int32", "int32", "float32"],
+    )
+
+    dg = cugraph.Graph(directed=True)
+    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")
+    return dg
+
+
+@pytest.fixture(
+    scope="module",
+    params=utils.DATASETS_UNDIRECTED,
+    ids=[f"dataset={d.as_posix()}" for d in utils.DATASETS_UNDIRECTED],
+)
+def uddaskGraphFromDataset(request, dask_client):
+    """
+    Returns a new dask dataframe created from the dataset file param.
+    This creates an undirected Graph.
+    """
+    # Since parameterized fixtures do not assign param names to param
+    # values, manually call the helper to do so.
+    setFixtureParamNames(request, ["dataset"])
+    dataset = request.param
+
+    chunksize = dcg.get_chunksize(dataset)
+    ddf = dask_cudf.read_csv(
+        dataset,
+        chunksize=chunksize,
+        delimiter=" ",
+        names=["src", "dst", "value"],
+        dtype=["int32", "int32", "float32"],
+    )
+
+    dg = cugraph.Graph(directed=False)
+    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")
+    return dg
+
+
+###############################################################################
+# Tests
+# @pytest.mark.skipif(
+#    is_single_gpu(), reason="skipping MG testing on Single GPU system"
+# )
+# FIXME: Implement more robust tests
+@pytest.mark.mg
+def test_mg_leiden_with_edgevals_directed_graph(daskGraphFromDataset):
+    # Directed graphs are not supported by Leiden and a ValueError should be
+    # raised
+    with pytest.raises(ValueError):
+        parts, mod = dcg.leiden(daskGraphFromDataset)
+
+
+###############################################################################
+# Tests
+# @pytest.mark.skipif(
+#    is_single_gpu(), reason="skipping MG testing on Single GPU system"
+# )
+# FIXME: Implement more robust tests
+@pytest.mark.mg
+def test_mg_leiden_with_edgevals_undirected_graph(uddaskGraphFromDataset):
+    parts, mod = dcg.leiden(uddaskGraphFromDataset)
+
+    # FIXME: either call Nx with the same dataset and compare results, or
+    # hardcode golden results to compare to.
+    print()
+    print(parts.compute())
+    print(mod)
+    print()
diff --git a/python/cugraph/cugraph/tests/conftest.py b/python/cugraph/cugraph/tests/conftest.py
index 388a90d4e98..fece006c4b8 100644
--- a/python/cugraph/cugraph/tests/conftest.py
+++ b/python/cugraph/cugraph/tests/conftest.py
@@ -12,9 +12,9 @@
 # limitations under the License.
 
 import pytest
-from cugraph.dask.common.mg_utils import (
-    setup_local_dask_cluster,
-    teardown_local_dask_cluster,
+from cugraph.testing.mg_utils import (
+    start_dask_client,
+    stop_dask_client,
 )
 
 # module-wide fixtures
@@ -34,7 +34,11 @@ def gpubenchmark():
 
 @pytest.fixture(scope="module")
 def dask_client():
-    cluster, client = setup_local_dask_cluster()
-    yield client
+    # start_dask_client will check for the SCHEDULER_FILE and
+    # DASK_WORKER_DEVICES env vars and use them when creating a client if
+    # set. start_dask_client will also initialize the Comms singleton.
+    dask_client, dask_cluster = start_dask_client()
 
-    teardown_local_dask_cluster(cluster, client)
+    yield dask_client
+
+    stop_dask_client(dask_client, dask_cluster)
diff --git a/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py b/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py
index bc526380957..8bc2da37e89 100644
--- a/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py
+++ b/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py
@@ -21,6 +21,7 @@
 from cudf.testing import assert_frame_equal, assert_series_equal
 from cupy.testing import assert_array_equal
 from pylibcugraph.testing.utils import gen_fixture_params_product
+from cugraph.dask.common.mg_utils import is_single_gpu
 
 import cugraph.dask as dcg
 from cugraph.experimental.datasets import cyber
@@ -991,6 +992,7 @@ def test_renumber_vertices_edges_dtypes(dask_client):
 
 
 @pytest.mark.mg
+@pytest.mark.skipif(is_single_gpu(), reason="FIXME: MG test fails on single-GPU")
 @pytest.mark.parametrize("set_index", [True, False])
 def test_add_data_noncontiguous(dask_client, set_index):
     from cugraph.experimental import MGPropertyGraph
diff --git a/python/cugraph/cugraph/tests/generators/test_rmat_mg.py b/python/cugraph/cugraph/tests/generators/test_rmat_mg.py
index 22403a189b8..d5d6db4d70f 100644
--- a/python/cugraph/cugraph/tests/generators/test_rmat_mg.py
+++ b/python/cugraph/cugraph/tests/generators/test_rmat_mg.py
@@ -16,10 +16,12 @@
 
 import dask_cudf
 
+from cugraph.testing.mg_utils import (
+    start_dask_client,
+    stop_dask_client,
+)
 from cugraph.dask.common.mg_utils import (
     is_single_gpu,
-    setup_local_dask_cluster,
-    teardown_local_dask_cluster,
 )
 from cugraph.generators import rmat
 import cugraph
@@ -61,13 +63,13 @@ def setup_module():
     global _client
     global _visible_devices
     if not _is_single_gpu:
-        (_cluster, _client) = setup_local_dask_cluster(p2p=True)
+        (_client, _cluster) = start_dask_client()
         _visible_devices = _client.scheduler_info()["workers"]
 
 
 def teardown_module():
     if not _is_single_gpu:
-        teardown_local_dask_cluster(_cluster, _client)
+        stop_dask_client(_client, _cluster)
 
 
 ###############################################################################
diff --git a/python/cugraph/cugraph/tests/internals/test_renumber_mg.py b/python/cugraph/cugraph/tests/internals/test_renumber_mg.py
index f1c503ab6f7..cc7ee0368a5 100644
--- a/python/cugraph/cugraph/tests/internals/test_renumber_mg.py
+++ b/python/cugraph/cugraph/tests/internals/test_renumber_mg.py
@@ -113,7 +113,7 @@ def test_mg_renumber_add_internal_vertex_id(graph_file, dask_client):
     gdf["dst_old"] = destinations
     gdf["src"] = sources + translate
     gdf["dst"] = destinations + translate
-    gdf["weight"] = gdf.index.astype(np.float)
+    gdf["weight"] = gdf.index.astype(np.float64)
 
     ddf = dask.dataframe.from_pandas(
         gdf, npartitions=len(dask_client.scheduler_info()["workers"])
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
index af1a1a35ec7..35f17d99184 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py
@@ -17,7 +17,6 @@
 import pytest
 import dask_cudf
 from pylibcugraph.testing import gen_fixture_params_product
-from cugraph.dask.common.mg_utils import is_single_gpu
 import cugraph.dask as dcg
 import cugraph
 from cugraph.testing import utils
@@ -119,7 +118,6 @@ def input_expected_output(input_combo):
 
 
 @pytest.mark.mg
-@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system")
 def test_dask_jaccard(dask_client, benchmark, input_expected_output):
 
     dg = input_expected_output["MGGraph"]
@@ -155,7 +153,7 @@ def test_dask_jaccard(dask_client, benchmark, input_expected_output):
 
 
 @pytest.mark.mg
-def test_dask_weighted_jaccard():
+def test_dask_weighted_jaccard(dask_client):
     input_data_path = datasets[0]
     chunksize = dcg.get_chunksize(input_data_path)
     ddf = dask_cudf.read_csv(
diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
index 4627b749426..7c84fce989b 100644
--- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
+++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py
@@ -156,7 +156,7 @@ def test_dask_sorensen(dask_client, benchmark, input_expected_output):
 
 
 @pytest.mark.mg
-def test_dask_weighted_sorensen():
+def test_dask_weighted_sorensen(dask_client):
     input_data_path = datasets[0]
     chunksize = dcg.get_chunksize(input_data_path)
     ddf = dask_cudf.read_csv(
diff --git a/python/cugraph/cugraph/tests/nx/test_nx_convert.py b/python/cugraph/cugraph/tests/nx/test_nx_convert.py
index de70b52f1fe..ee14bfe361c 100644
--- a/python/cugraph/cugraph/tests/nx/test_nx_convert.py
+++ b/python/cugraph/cugraph/tests/nx/test_nx_convert.py
@@ -139,7 +139,7 @@ def test_nx_convert_weighted(graph_file):
     assert nx.is_directed(nxG) is True
     assert nx.is_weighted(nxG) is True
 
-    cuG = cugraph.utilities.convert_from_nx(nxG)
+    cuG = cugraph.utilities.convert_from_nx(nxG, weight="weight")
     assert cugraph.is_directed(cuG) is True
     assert cugraph.is_weighted(cuG) is True
 
diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py
index bc801cab0a2..c25b5297e18 100644
--- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py
+++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py
@@ -23,7 +23,6 @@
 
 
 @pytest.mark.sg
-@pytest.mark.skip("work in progress")
 def test_bulk_sampler_simple():
     el = karate.get_edgelist().reset_index().rename(columns={"index": "eid"})
     el["eid"] = el["eid"].astype("int32")
@@ -56,7 +55,7 @@ def test_bulk_sampler_simple():
     bs.add_batches(batches, start_col_name="start", batch_col_name="batch")
     bs.flush()
 
-    recovered_samples = cudf.read_parquet(os.path.join(tempdir_object.name, "rank=0"))
+    recovered_samples = cudf.read_parquet(tempdir_object.name)
 
     for b in batches["batch"].unique().values_host.tolist():
         assert b in recovered_samples["batch_id"].values_host.tolist()
@@ -103,9 +102,8 @@ def test_bulk_sampler_remainder():
     bs.add_batches(batches, start_col_name="start", batch_col_name="batch")
     bs.flush()
 
-    tld = os.path.join(tempdir_object.name, "rank=0")
+    tld = tempdir_object.name
     recovered_samples = cudf.read_parquet(tld)
-    print(os.listdir(tld))
 
     for b in batches["batch"].unique().values_host.tolist():
         assert b in recovered_samples["batch_id"].values_host.tolist()
@@ -123,7 +121,6 @@ def test_bulk_sampler_remainder():
 
 
 @pytest.mark.sg
-@pytest.mark.skip("work in progress")
 def test_bulk_sampler_large_batch_size():
     el = karate.get_edgelist().reset_index().rename(columns={"index": "eid"})
     el["eid"] = el["eid"].astype("int32")
@@ -156,7 +153,7 @@ def test_bulk_sampler_large_batch_size():
     bs.add_batches(batches, start_col_name="start", batch_col_name="batch")
     bs.flush()
 
-    recovered_samples = cudf.read_parquet(os.path.join(tempdir_object.name, "rank=0"))
+    recovered_samples = cudf.read_parquet(tempdir_object.name)
 
     for b in batches["batch"].unique().values_host.tolist():
         assert b in recovered_samples["batch_id"].values_host.tolist()
diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py
new file mode 100644
index 00000000000..83d20ea2cf5
--- /dev/null
+++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import cudf
+import tempfile
+import os
+
+from cugraph.gnn.data_loading.bulk_sampler_io import write_samples
+
+
+@pytest.mark.sg
+def test_bulk_sampler_io():
+    results = cudf.DataFrame(
+        {
+            "sources": [0, 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7],
+            "destinations": [1, 2, 3, 3, 3, 4, 1, 1, 6, 7, 2, 3],
+            "edge_id": None,
+            "edge_type": None,
+            "weight": None,
+            "hop_id": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
+        }
+    )
+
+    offsets = cudf.DataFrame({"offsets": [0, 8], "batch_id": [0, 1]})
+
+    tempdir_object = tempfile.TemporaryDirectory()
+    write_samples(results, offsets, 1, tempdir_object.name)
+
+    assert len(os.listdir(tempdir_object.name)) == 2
+
+    df = cudf.read_parquet(os.path.join(tempdir_object.name, "batch=0-0.parquet"))
+    assert len(df) == 8
+
+    assert (
+        df.sources.values_host.tolist()
+        == results.sources.iloc[0:8].values_host.tolist()
+    )
+    assert (
+        df.destinations.values_host.tolist()
+        == results.destinations.iloc[0:8].values_host.tolist()
+    )
+    assert (
+        df.hop_id.values_host.tolist() == results.hop_id.iloc[0:8].values_host.tolist()
+    )
+    assert (df.batch_id == 0).all()
+
+    df = cudf.read_parquet(os.path.join(tempdir_object.name, "batch=1-1.parquet"))
+    assert len(df) == 4
+    assert (
+        df.sources.values_host.tolist()
+        == results.sources.iloc[8:12].values_host.tolist()
+    )
+    assert (
+        df.destinations.values_host.tolist()
+        == results.destinations.iloc[8:12].values_host.tolist()
+    )
+    assert (
+        df.hop_id.values_host.tolist() == results.hop_id.iloc[8:12].values_host.tolist()
+    )
+    assert (df.batch_id == 1).all()
diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py
new file mode 100644
index 00000000000..eacd697b7b3
--- /dev/null
+++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import cudf
+import dask_cudf
+
+import tempfile
+import os
+
+from cugraph.gnn.data_loading.bulk_sampler_io import write_samples
+
+
+@pytest.mark.mg
+def test_bulk_sampler_io():
+    results = cudf.DataFrame(
+        {
+            "sources": [0, 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7],
+            "destinations": [1, 2, 3, 3, 3, 4, 1, 1, 6, 7, 2, 3],
+            "edge_id": None,
+            "edge_type": None,
+            "weight": None,
+            "hop_id": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
+        }
+    )
+    results = dask_cudf.from_cudf(results, npartitions=1).repartition(
+        divisions=[0, 8, 11]
+    )
+
+    offsets = cudf.DataFrame({"offsets": [0, 0], "batch_id": [0, 1]})
+    offsets = dask_cudf.from_cudf(offsets, npartitions=2)
+
+    tempdir_object = tempfile.TemporaryDirectory()
+    write_samples(results, offsets, 1, tempdir_object.name)
+
+    assert len(os.listdir(tempdir_object.name)) == 2
+
+    df = cudf.read_parquet(os.path.join(tempdir_object.name, "batch=0-0.parquet"))
+    assert len(df) == 8
+
+    results = results.compute()
+    assert (
+        df.sources.values_host.tolist()
+        == results.sources.iloc[0:8].values_host.tolist()
+    )
+    assert (
+        df.destinations.values_host.tolist()
+        == results.destinations.iloc[0:8].values_host.tolist()
+    )
+    assert (
+        df.hop_id.values_host.tolist() == results.hop_id.iloc[0:8].values_host.tolist()
+    )
+    assert (df.batch_id == 0).all()
+
+    df = cudf.read_parquet(os.path.join(tempdir_object.name, "batch=1-1.parquet"))
+    assert len(df) == 4
+    assert (
+        df.sources.values_host.tolist()
+        == results.sources.iloc[8:12].values_host.tolist()
+    )
+    assert (
+        df.destinations.values_host.tolist()
+        == results.destinations.iloc[8:12].values_host.tolist()
+    )
+    assert (
+        df.hop_id.values_host.tolist() == results.hop_id.iloc[8:12].values_host.tolist()
+    )
+    assert (df.batch_id == 1).all()
diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py
index 8bb16e03252..f717d452403 100644
--- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py
+++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py
@@ -20,7 +20,6 @@
 from cugraph.experimental import BulkSampler
 
 import tempfile
-import os
 
 
 @pytest.mark.mg
@@ -59,75 +58,12 @@ def test_bulk_sampler_simple(dask_client):
     bs.add_batches(batches, start_col_name="start", batch_col_name="batch")
     bs.flush()
 
-    recovered_samples = cudf.read_parquet(os.path.join(tempdir_object.name, "rank=0"))
+    recovered_samples = cudf.read_parquet(tempdir_object.name)
 
     for b in batches["batch"].unique().compute().values_host.tolist():
         assert b in recovered_samples["batch_id"].values_host.tolist()
 
 
-@pytest.mark.mg
-def test_bulk_sampler_remainder(dask_client):
-    el = karate.get_edgelist().reset_index().rename(columns={"index": "eid"})
-    el["eid"] = el["eid"].astype("int32")
-    el["etp"] = cupy.int32(0)
-
-    G = cugraph.Graph(directed=True)
-    G.from_dask_cudf_edgelist(
-        dask_cudf.from_cudf(el, npartitions=2),
-        source="src",
-        destination="dst",
-        edge_attr=["wgt", "eid", "etp"],
-    )
-
-    tempdir_object = tempfile.TemporaryDirectory()
-    bs = BulkSampler(
-        batch_size=2,
-        output_path=tempdir_object.name,
-        graph=G,
-        seeds_per_call=7,
-        batches_per_partition=2,
-        fanout_vals=[2, 2],
-        with_replacement=False,
-    )
-
-    # Should process batch (0, 1, 2) then (3, 4, 5) then 6
-
-    batches = dask_cudf.from_cudf(
-        cudf.DataFrame(
-            {
-                "start": cudf.Series(
-                    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype="int32"
-                ),
-                "batch": cudf.Series(
-                    [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], dtype="int32"
-                ),
-            }
-        ),
-        npartitions=2,
-    )
-
-    bs.add_batches(batches, start_col_name="start", batch_col_name="batch")
-    bs.flush()
-
-    tld = os.path.join(tempdir_object.name, "rank=0")
-    print(os.listdir(tld))
-    recovered_samples = cudf.read_parquet(tld)
-
-    for b in batches["batch"].compute().unique().values_host.tolist():
-        assert b in recovered_samples["batch_id"].values_host.tolist()
-
-    for x in range(0, 6, 2):
-        subdir = f"{x}-{x+1}"
-        df = cudf.read_parquet(os.path.join(tld, f"batch={subdir}.parquet"))
-
-        assert ((df.batch_id == x) | (df.batch_id == (x + 1))).all()
-        assert ((df.hop_id == 0) | (df.hop_id == 1)).all()
-
-    assert (
-        cudf.read_parquet(os.path.join(tld, "batch=6-6.parquet")).batch_id == 6
-    ).all()
-
-
 @pytest.mark.mg
 def test_bulk_sampler_mg_graph_sg_input(dask_client):
     el = karate.get_edgelist().reset_index().rename(columns={"index": "eid"})
@@ -161,7 +97,7 @@ def test_bulk_sampler_mg_graph_sg_input(dask_client):
     bs.add_batches(batches, start_col_name="start", batch_col_name="batch")
     bs.flush()
 
-    recovered_samples = cudf.read_parquet(os.path.join(tempdir_object.name, "rank=0"))
+    recovered_samples = cudf.read_parquet(tempdir_object.name)
 
     for b in batches["batch"].unique().values_host.tolist():
         assert b in recovered_samples["batch_id"].values_host.tolist()
diff --git a/python/cugraph/cugraph/tests/sampling/test_egonet_mg.py b/python/cugraph/cugraph/tests/sampling/test_egonet_mg.py
index 96b5ec2ac3a..172296c07f9 100644
--- a/python/cugraph/cugraph/tests/sampling/test_egonet_mg.py
+++ b/python/cugraph/cugraph/tests/sampling/test_egonet_mg.py
@@ -76,7 +76,9 @@ def input_expected_output(input_combo):
     directed = input_combo["directed"]
     seeds = input_combo["seeds"]
     radius = input_combo["radius"]
-    G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed)
+    G = utils.generate_cugraph_graph_from_file(
+        input_data_path, directed=directed, edgevals=True
+    )
 
     sg_cugraph_ego_graphs = cugraph.batched_ego_graphs(G, seeds=seeds, radius=radius)
 
diff --git a/python/cugraph/cugraph/tests/sampling/test_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_random_walks.py
index 5573c9cff82..508f927c296 100644
--- a/python/cugraph/cugraph/tests/sampling/test_random_walks.py
+++ b/python/cugraph/cugraph/tests/sampling/test_random_walks.py
@@ -18,6 +18,9 @@
 from cudf.testing import assert_series_equal
 
 import cugraph
+import cudf
+import networkx as nx
+from cugraph.utilities import ensure_cugraph_obj_for_nx
 from cugraph.experimental.datasets import DATASETS, DATASETS_SMALL
 
 # =============================================================================
@@ -36,7 +39,7 @@ def setup_function():
     gc.collect()
 
 
-def calc_random_walks(graph_file, directed=False, max_depth=None, use_padding=False):
+def calc_random_walks(G, max_depth=None, use_padding=False, legacy_result_type=True):
     """
     compute random walks for each nodes in 'start_vertices'
 
@@ -70,25 +73,32 @@ def calc_random_walks(graph_file, directed=False, max_depth=None, use_padding=Fa
     sizes: int
         The path size in case of coalesced paths.
     """
-    G = graph_file.get_graph(create_using=cugraph.Graph(directed=directed))
     assert G is not None
 
-    k = random.randint(1, 10)
+    G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="weights")
+
+    k = random.randint(1, 6)
+
     random_walks_type = "uniform"
-    start_vertices = random.sample(range(G.number_of_vertices()), k)
+
+    start_vertices = G.select_random_vertices(num_vertices=k)
+
+    print("\nstart_vertices is \n", start_vertices)
     vertex_paths, edge_weights, vertex_path_sizes = cugraph.random_walks(
-        G, random_walks_type, start_vertices, max_depth, use_padding
+        G, random_walks_type, start_vertices, max_depth, use_padding, legacy_result_type
     )
 
     return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices
 
 
-def check_random_walks(path_data, seeds, df_G=None):
+def check_random_walks(path_data, seeds, G):
     invalid_edge = 0
     invalid_seeds = 0
     offsets_idx = 0
     next_path_idx = 0
     v_paths = path_data[0]
+    df_G = G.input_df
+
     sizes = path_data[2].to_numpy().tolist()
 
     for s in sizes:
@@ -107,7 +117,7 @@ def check_random_walks(path_data, seeds, df_G=None):
             (df_G["src"] == (src)) & (df_G["dst"] == (dst))
         ].reset_index(drop=True)
 
-        if not (exp_edge["src"].loc[0], exp_edge["dst"].loc[0]) == (src, dst):
+        if len(exp_edge) == 0:
             print(
                 "[ERR] Invalid edge: " "There is no edge src {} dst {}".format(src, dst)
             )
@@ -117,15 +127,94 @@ def check_random_walks(path_data, seeds, df_G=None):
     assert invalid_seeds == 0
 
 
+def check_random_walks_padded(G, path_data, seeds, max_depth, legacy_result_type=True):
+    invalid_edge = 0
+    invalid_seeds = 0
+    invalid_edge_wgt = 0
+    v_paths = path_data[0]
+    e_wgt_paths = path_data[1]
+    e_wgt_idx = 0
+
+    G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="weights")
+    df_G = G.input_df
+    if "weight" in df_G.columns:
+        df_G = df_G.rename(columns={"weight": "wgt"})
+
+    total_depth = (max_depth) * len(seeds)
+
+    for i in range(total_depth - 1):
+        vertex_1, vertex_2 = v_paths.iloc[i], v_paths.iloc[i + 1]
+
+        # Every max_depth'th vertex in 'v_paths' is a seed
+        # instead of 'seeds[i // (max_depth)]', could have just pop the first element
+        # of the seeds array once there is a match and compare it to 'vertex_1'
+        if i % (max_depth) == 0 and vertex_1 != seeds[i // (max_depth)]:
+            invalid_seeds += 1
+            print(
+                "[ERR] Invalid seed: "
+                " src {} != src {}".format(vertex_1, seeds[i // (max_depth)])
+            )
+
+        if (i % (max_depth)) != (max_depth - 1):
+            # These are the edges
+            src = vertex_1
+            dst = vertex_2
+
+            if src != -1 and dst != -1:
+                # check for valid edge.
+                edge = df_G.loc[
+                    (df_G["src"] == (src)) & (df_G["dst"] == (dst))
+                ].reset_index(drop=True)
+
+                if len(edge) == 0:
+                    print(
+                        "[ERR] Invalid edge: "
+                        "There is no edge src {} dst {}".format(src, dst)
+                    )
+                    invalid_edge += 1
+
+                else:
+                    # check valid edge wgt
+                    expected_wgt = edge["wgt"].iloc[0]
+                    result_wgt = e_wgt_paths.iloc[e_wgt_idx]
+
+                    if expected_wgt != result_wgt:
+                        print(
+                            "[ERR] Invalid edge wgt: "
+                            "The edge src {} dst {} has wgt {} but got {}".format(
+                                src, dst, expected_wgt, result_wgt
+                            )
+                        )
+                        invalid_edge_wgt += 1
+            e_wgt_idx += 1
+
+            if src != -1 and dst == -1:
+                # ensure there is no outgoing edges from 'src'
+                assert G.out_degree([src])["degree"].iloc[0] == 0
+
+    assert invalid_seeds == 0
+    assert invalid_edge == 0
+    assert invalid_edge_wgt == 0
+    assert len(v_paths) == (max_depth) * len(seeds)
+    assert len(e_wgt_paths) == (max_depth - 1) * len(seeds)
+
+    if legacy_result_type:
+        sizes = path_data[2]
+        assert sizes is None
+    else:
+        max_path_lenth = path_data[2]
+        assert max_path_lenth == max_depth - 1
+
+
 @pytest.mark.sg
 @pytest.mark.parametrize("graph_file", DATASETS_SMALL)
 @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
 @pytest.mark.parametrize("max_depth", [None])
 def test_random_walks_invalid_max_dept(graph_file, directed, max_depth):
+
+    input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed))
     with pytest.raises(TypeError):
-        df, offsets, seeds = calc_random_walks(
-            graph_file, directed=directed, max_depth=max_depth
-        )
+        _, _, _ = calc_random_walks(input_graph, max_depth=max_depth)
 
 
 @pytest.mark.sg
@@ -134,9 +223,13 @@ def test_random_walks_invalid_max_dept(graph_file, directed, max_depth):
 @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
 def test_random_walks_coalesced(graph_file, directed):
     max_depth = random.randint(2, 10)
-    df_G = graph_file.get_edgelist()
-    path_data, seeds = calc_random_walks(graph_file, directed, max_depth=max_depth)
-    check_random_walks(path_data, seeds, df_G)
+
+    input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed))
+
+    path_data, seeds = calc_random_walks(
+        input_graph, max_depth=max_depth, use_padding=False
+    )
+    check_random_walks(path_data, seeds, input_graph)
 
     # Check path query output
     df = cugraph.rw_path(len(seeds), path_data[2])
@@ -152,15 +245,69 @@ def test_random_walks_coalesced(graph_file, directed):
 @pytest.mark.cugraph_ops
 @pytest.mark.parametrize("graph_file", DATASETS_SMALL)
 @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
-def test_random_walks_padded(graph_file, directed):
+def test_random_walks_padded_0(graph_file, directed):
+    max_depth = random.randint(2, 10)
+    print("max_depth is ", max_depth)
+    input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed))
+
+    path_data, seeds = calc_random_walks(
+        input_graph, max_depth=max_depth, use_padding=True
+    )
+
+    check_random_walks_padded(input_graph, path_data, seeds, max_depth)
+
+    # test for 'legacy_result_type=False'
+    path_data, seeds = calc_random_walks(
+        input_graph, max_depth=max_depth, use_padding=True, legacy_result_type=False
+    )
+    # Non 'legacy_result_type' has an extra edge 'path_data'
+    check_random_walks_padded(
+        input_graph, path_data, seeds, max_depth + 1, legacy_result_type=False
+    )
+
+
+@pytest.mark.sg
+@pytest.mark.cugraph_ops
+def test_random_walks_padded_1():
     max_depth = random.randint(2, 10)
+
+    df = cudf.DataFrame()
+    df["src"] = [1, 2, 4, 7, 3]
+    df["dst"] = [5, 4, 1, 5, 2]
+    df["wgt"] = [0.4, 0.5, 0.6, 0.7, 0.8]
+
+    input_graph = cugraph.Graph(directed=True)
+
+    input_graph.from_cudf_edgelist(
+        df, source="src", destination="dst", edge_attr="wgt", renumber=True
+    )
+
     path_data, seeds = calc_random_walks(
-        graph_file, directed, max_depth=max_depth, use_padding=True
+        input_graph, max_depth=max_depth, use_padding=True
     )
-    v_paths = path_data[0]
-    e_weights = path_data[1]
-    assert len(v_paths) == max_depth * len(seeds)
-    assert len(e_weights) == (max_depth - 1) * len(seeds)
+
+    check_random_walks_padded(input_graph, path_data, seeds, max_depth)
+
+
+@pytest.mark.sg
+@pytest.mark.cugraph_ops
+@pytest.mark.parametrize("graph_file", DATASETS_SMALL)
+def test_random_walks_nx(graph_file):
+    G = graph_file.get_graph(create_using=cugraph.Graph(directed=True))
+
+    M = G.to_pandas_edgelist()
+
+    Gnx = nx.from_pandas_edgelist(
+        M,
+        source="src",
+        target="dst",
+        edge_attr="weights",
+        create_using=nx.DiGraph(),
+    )
+    max_depth = random.randint(2, 10)
+    path_data, seeds = calc_random_walks(Gnx, max_depth=max_depth, use_padding=True)
+
+    check_random_walks_padded(Gnx, path_data, seeds, max_depth)
 
 
 """@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py
index 6fe16d97713..5d2f050bce9 100644
--- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py
+++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py
@@ -420,6 +420,66 @@ def test_uniform_neighbor_sample_edge_properties_self_loops():
     assert sorted(sampling_results.hop_id.values_host.tolist()) == [0, 0, 0, 1, 1, 1]
 
 
+@pytest.mark.sg
+def test_uniform_neighbor_sample_hop_id_order():
+    df = cudf.DataFrame(
+        {
+            "src": [0, 1, 2, 3, 3, 6],
+            "dst": [2, 3, 4, 5, 6, 7],
+        }
+    )
+
+    G = cugraph.Graph(directed=True)
+    G.from_cudf_edgelist(df, source="src", destination="dst")
+
+    sampling_results = cugraph.uniform_neighbor_sample(
+        G,
+        cudf.Series([0, 1], dtype="int64"),
+        fanout_vals=[2, 2, 2],
+        with_replacement=False,
+        with_edge_properties=True,
+    )
+
+    assert (
+        sorted(sampling_results.hop_id.values_host.tolist())
+        == sampling_results.hop_id.values_host.tolist()
+    )
+
+
+@pytest.mark.sg
+def test_uniform_neighbor_sample_hop_id_order_multi_batch():
+    df = cudf.DataFrame(
+        {
+            "src": [0, 1, 2, 3, 3, 6],
+            "dst": [2, 3, 4, 5, 6, 7],
+        }
+    )
+
+    G = cugraph.Graph(directed=True)
+    G.from_cudf_edgelist(df, source="src", destination="dst")
+
+    sampling_results = cugraph.uniform_neighbor_sample(
+        G,
+        cudf.Series([0, 1], dtype="int64"),
+        fanout_vals=[2, 2, 2],
+        batch_id_list=cudf.Series([0, 1], dtype="int32"),
+        with_replacement=False,
+        with_edge_properties=True,
+    )
+
+    for b in range(2):
+        assert (
+            sorted(
+                sampling_results[
+                    sampling_results.batch_id == b
+                ].hop_id.values_host.tolist()
+            )
+            == sampling_results[
+                sampling_results.batch_id == b
+            ].hop_id.values_host.tolist()
+        )
+
+
 @pytest.mark.sg
 def test_uniform_neighbor_sample_empty_start_list():
     df = cudf.DataFrame(
diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py
index 76657eb634f..033b96487c4 100644
--- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py
+++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py
@@ -19,6 +19,7 @@
 import cudf
 import dask_cudf
 from pylibcugraph.testing.utils import gen_fixture_params_product
+from cugraph.dask.common.mg_utils import is_single_gpu
 
 import cugraph.dask as dcg
 import cugraph
@@ -261,6 +262,7 @@ def test_mg_uniform_neighbor_sample_tree(dask_client, directed):
 
 
 @pytest.mark.mg
+@pytest.mark.skipif(is_single_gpu(), reason="FIXME: MG test fails on single-GPU")
 @pytest.mark.cugraph_ops
 def test_mg_uniform_neighbor_sample_unweighted(dask_client):
     df = cudf.DataFrame(
@@ -295,6 +297,7 @@ def test_mg_uniform_neighbor_sample_unweighted(dask_client):
 
 
 @pytest.mark.mg
+@pytest.mark.skipif(is_single_gpu(), reason="FIXME: MG test fails on single-GPU")
 @pytest.mark.cugraph_ops
 def test_mg_uniform_neighbor_sample_ensure_no_duplicates(dask_client):
     # See issue #2760
@@ -324,6 +327,8 @@ def test_mg_uniform_neighbor_sample_ensure_no_duplicates(dask_client):
 @pytest.mark.cugraph_ops
 @pytest.mark.parametrize("return_offsets", [True, False])
 def test_uniform_neighbor_sample_edge_properties(dask_client, return_offsets):
+    if len(dask_client.scheduler_info()["workers"]) <= 1:
+        pytest.skip("Test only valid for MG environments")
     edgelist_df = dask_cudf.from_cudf(
         cudf.DataFrame(
             {
@@ -473,10 +478,78 @@ def test_uniform_neighbor_sample_edge_properties_self_loops(dask_client):
     assert sorted(sampling_results.hop_id.values_host.tolist()) == [0, 0, 0, 1, 1, 1]
 
 
+@pytest.mark.mg
+def test_uniform_neighbor_sample_hop_id_order():
+    df = dask_cudf.from_cudf(
+        cudf.DataFrame(
+            {
+                "src": [0, 1, 2, 3, 3, 6],
+                "dst": [2, 3, 4, 5, 6, 7],
+            }
+        ),
+        npartitions=2,
+    )
+
+    G = cugraph.Graph(directed=True)
+    G.from_dask_cudf_edgelist(df, source="src", destination="dst")
+
+    sampling_results = cugraph.dask.uniform_neighbor_sample(
+        G,
+        cudf.Series([0, 1], dtype="int64"),
+        fanout_vals=[2, 2, 2],
+        with_replacement=False,
+        with_edge_properties=True,
+    )
+
+    for p in range(sampling_results.npartitions):
+        sampling_results_p = sampling_results.get_partition(p).compute()
+        assert (
+            sorted(sampling_results_p.hop_id.values_host.tolist())
+            == sampling_results_p.hop_id.values_host.tolist()
+        )
+
+
+@pytest.mark.mg
+def test_uniform_neighbor_sample_hop_id_order_multi_batch():
+    df = dask_cudf.from_cudf(
+        cudf.DataFrame(
+            {
+                "src": [0, 1, 2, 3, 3, 6],
+                "dst": [2, 3, 4, 5, 6, 7],
+            }
+        ),
+        npartitions=2,
+    )
+
+    G = cugraph.Graph(directed=True)
+    G.from_dask_cudf_edgelist(df, source="src", destination="dst")
+
+    sampling_results = cugraph.dask.uniform_neighbor_sample(
+        G,
+        cudf.Series([0, 1], dtype="int64"),
+        fanout_vals=[2, 2, 2],
+        batch_id_list=cudf.Series([0, 1], dtype="int32"),
+        with_replacement=False,
+        with_edge_properties=True,
+    )
+
+    for p in range(sampling_results.npartitions):
+        sampling_results_p = sampling_results.get_partition(p)
+        for b in range(2):
+            sampling_results_pb = sampling_results_p[
+                sampling_results_p.batch_id == b
+            ].compute()
+            assert (
+                sorted(sampling_results_pb.hop_id.values_host.tolist())
+                == sampling_results_pb.hop_id.values_host.tolist()
+            )
+
+
 @pytest.mark.mg
 @pytest.mark.parametrize("with_replacement", [True, False])
 @pytest.mark.skipif(
-    int(os.getenv("DASK_NUM_WORKERS", 2)) < 2, reason="too few workers to test"
+    len(os.getenv("DASK_WORKER_DEVICES", "0").split(",")) < 2,
+    reason="too few workers to test",
 )
 def test_uniform_neighbor_edge_properties_sample_small_start_list(
     dask_client, with_replacement
diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py
index 3734f6c9586..02219002a7e 100644
--- a/python/cugraph/cugraph/tests/structure/test_graph.py
+++ b/python/cugraph/cugraph/tests/structure/test_graph.py
@@ -646,7 +646,7 @@ def test_bipartite_api(graph_file):
     # This test only tests the functionality of adding set of nodes and
     # retrieving them. The datasets currently used are not truly bipartite.
     cu_M = utils.read_csv_file(graph_file)
-    nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique()
+    nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique().sort_values()
 
     # Create set of nodes for partition
     set1_exp = cudf.Series(nodes[0 : int(len(nodes) / 2)])
@@ -863,3 +863,28 @@ def test_select_random_vertices(graph_file, random_state, num_vertices):
     )
 
     assert len(join) == len(sampled_vertices)
+
+
+@pytest.mark.sg
+@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
+@pytest.mark.parametrize(
+    "edge_props",
+    [
+        ["edge_id", "edge_type", "weight"],
+        ["edge_id", "edge_type"],
+        ["edge_type", "weight"],
+        ["edge_id"],
+        ["weight"],
+    ],
+)
+def test_graph_creation_edge_properties(graph_file, edge_props):
+    df = utils.read_csv_file(graph_file)
+
+    df["edge_id"] = cupy.arange(len(df), dtype="int32")
+    df["edge_type"] = cupy.int32(3)
+    df["weight"] = 0.5
+
+    prop_keys = {k: k for k in edge_props}
+
+    G = cugraph.Graph(directed=True)
+    G.from_cudf_edgelist(df, source="0", destination="1", **prop_keys)
diff --git a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py
index b1b8d65c5a6..ebaae38a8a4 100644
--- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py
+++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py
@@ -335,3 +335,30 @@ def test_mg_select_random_vertices(
     )
 
     assert len(join) == len(sampled_vertices)
+
+
+@pytest.mark.sg
+@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
+@pytest.mark.parametrize(
+    "edge_props",
+    [
+        ["edge_id", "edge_type", "weight"],
+        ["edge_id", "edge_type"],
+        ["edge_type", "weight"],
+        ["edge_id"],
+        ["weight"],
+    ],
+)
+def test_graph_creation_edge_properties(dask_client, graph_file, edge_props):
+    df = utils.read_csv_file(graph_file)
+
+    df["edge_id"] = cupy.arange(len(df), dtype="int32")
+    df["edge_type"] = cupy.int32(3)
+    df["weight"] = 0.5
+
+    df = dask_cudf.from_cudf(df, npartitions=2)
+
+    prop_keys = {k: k for k in edge_props}
+
+    G = cugraph.Graph(directed=True)
+    G.from_dask_cudf_edgelist(df, source="0", destination="1", **prop_keys)
diff --git a/python/cugraph/cugraph/tests/traversal/test_sssp.py b/python/cugraph/cugraph/tests/traversal/test_sssp.py
index 5e977be570a..1c99123f866 100644
--- a/python/cugraph/cugraph/tests/traversal/test_sssp.py
+++ b/python/cugraph/cugraph/tests/traversal/test_sssp.py
@@ -18,6 +18,7 @@
 import pytest
 import pandas as pd
 import cupy as cp
+import cupyx
 from cupyx.scipy.sparse import coo_matrix as cp_coo_matrix
 from cupyx.scipy.sparse import csr_matrix as cp_csr_matrix
 from cupyx.scipy.sparse import csc_matrix as cp_csc_matrix
@@ -26,6 +27,7 @@
 from scipy.sparse import csc_matrix as sp_csc_matrix
 import cudf
 from pylibcugraph.testing.utils import gen_fixture_params_product
+from cugraph.experimental.datasets import DATASETS_UNDIRECTED
 
 import cugraph
 from cugraph.testing import utils
@@ -143,6 +145,7 @@ def networkx_call(graph_file, source, edgevals=True):
     M = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True)
     # Directed NetworkX graph
     edge_attr = "weight" if edgevals else None
+
     Gnx = nx.from_pandas_edgelist(
         M,
         source="0",
@@ -162,7 +165,7 @@ def networkx_call(graph_file, source, edgevals=True):
         nx_paths = nx.single_source_dijkstra_path_length(Gnx, source)
 
     G = graph_file.get_graph(
-        create_using=cugraph.Graph(directed=True), ignore_weights=True
+        create_using=cugraph.Graph(directed=True), ignore_weights=not edgevals
     )
 
     t2 = time.time() - t1
@@ -379,6 +382,18 @@ def test_sssp_data_type_conversion(graph_file, source):
     assert err == 0
 
 
+@pytest.mark.sg
+def test_sssp_networkx_edge_attr():
+    G = nx.Graph()
+    G.add_edge(0, 1, other=10)
+    G.add_edge(1, 2, other=20)
+    df = cugraph.sssp(G, 0, edge_attr="other")
+    df = df.set_index("vertex")
+    assert df.loc[0, "distance"] == 0
+    assert df.loc[1, "distance"] == 10
+    assert df.loc[2, "distance"] == 30
+
+
 @pytest.mark.sg
 def test_scipy_api_compat():
     graph_file = datasets.DATASETS[0]
@@ -450,11 +465,55 @@ def test_scipy_api_compat():
 
 
 @pytest.mark.sg
-def test_sssp_with_no_edgevals():
-    G = datasets.karate.get_graph(ignore_weights=True)
-    warning_msg = (
-        "'SSSP' requires the input graph to be weighted: Unweighted "
-        "graphs will not be supported in the next release."
+@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED)
+def test_sssp_csr_graph(graph_file):
+    df = graph_file.get_edgelist()
+
+    M = cupyx.scipy.sparse.coo_matrix(
+        (df["wgt"].to_cupy(), (df["src"].to_cupy(), df["dst"].to_cupy()))
+    )
+    M = M.tocsr()
+
+    offsets = cudf.Series(M.indptr)
+    indices = cudf.Series(M.indices)
+    weights = cudf.Series(M.data)
+    G_csr = cugraph.Graph()
+    G_coo = graph_file.get_graph()
+
+    source = G_coo.select_random_vertices(num_vertices=1)[0]
+
+    print("source = ", source)
+
+    G_csr.from_cudf_adjlist(offsets, indices, weights)
+
+    result_csr = cugraph.sssp(G_csr, source)
+    result_coo = cugraph.sssp(G_coo, source)
+
+    result_csr = result_csr.sort_values("vertex").reset_index(drop=True)
+    result_sssp = (
+        result_coo.sort_values("vertex")
+        .reset_index(drop=True)
+        .rename(columns={"distance": "distance_coo", "predecessor": "predecessor_coo"})
+    )
+    result_sssp["distance_csr"] = result_csr["distance"]
+    result_sssp["predecessor_csr"] = result_csr["predecessor"]
+
+    distance_diffs = result_sssp.query("distance_csr != distance_coo")
+    predecessor_diffs = result_sssp.query("predecessor_csr != predecessor_coo")
+
+    assert len(distance_diffs) == 0
+    assert len(predecessor_diffs) == 0
+
+
+@pytest.mark.sg
+def test_sssp_unweighted_graph():
+    karate = DATASETS_UNDIRECTED[0]
+    G = karate.get_graph(ignore_weights=True)
+
+    error_msg = (
+        "'SSSP' requires the input graph to be weighted."
+        "'BFS' should be used instead of 'SSSP' for unweighted graphs."
     )
-    with pytest.warns(PendingDeprecationWarning, match=warning_msg):
+
+    with pytest.raises(RuntimeError, match=error_msg):
         cugraph.sssp(G, 1)
diff --git a/python/cugraph/cugraph/tests/traversal/test_sssp_mg.py b/python/cugraph/cugraph/tests/traversal/test_sssp_mg.py
index 0a138fd95ed..867f125ea6f 100644
--- a/python/cugraph/cugraph/tests/traversal/test_sssp_mg.py
+++ b/python/cugraph/cugraph/tests/traversal/test_sssp_mg.py
@@ -85,33 +85,3 @@ def test_dask_sssp(dask_client, directed):
         ):
             err = err + 1
     assert err == 0
-
-
-@pytest.mark.mg
-def test_dask_unweighted_sssp(dask_client):
-    input_data_path = input_data_path = (
-        RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv"
-    ).as_posix()
-    chunksize = dcg.get_chunksize(input_data_path)
-    ddf = dask_cudf.read_csv(
-        input_data_path,
-        chunksize=chunksize,
-        delimiter=" ",
-        names=["src", "dst", "value"],
-        dtype=["int32", "int32", "float32"],
-    )
-
-    dg = cugraph.Graph(directed=False)
-    dg.from_dask_cudf_edgelist(
-        ddf,
-        source="src",
-        destination="dst",
-        store_transposed=True,
-    )
-
-    warning_msg = (
-        "'SSSP' requires the input graph to be weighted: Unweighted "
-        "graphs will not be supported in the next release."
-    )
-    with pytest.warns(PendingDeprecationWarning, match=warning_msg):
-        dcg.sssp(dg, 0)
diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py
index 6a145833c7f..e72de2ecf8a 100644
--- a/python/cugraph/cugraph/tests/utils/test_dataset.py
+++ b/python/cugraph/cugraph/tests/utils/test_dataset.py
@@ -12,60 +12,71 @@
 # limitations under the License.
 
 
-import pytest
-import yaml
 import os
 from pathlib import Path
-from tempfile import NamedTemporaryFile, TemporaryDirectory
-from cugraph.experimental.datasets import ALL_DATASETS, ALL_DATASETS_WGT, SMALL_DATASETS
-from cugraph.structure import Graph
+from tempfile import TemporaryDirectory
+import gc
 
+import pytest
 
-# =============================================================================
-# Pytest Setup / Teardown - called for each test function
-# =============================================================================
+from cugraph.structure import Graph
+from cugraph.testing import RAPIDS_DATASET_ROOT_DIR_PATH
+from cugraph.experimental.datasets import (
+    ALL_DATASETS,
+    ALL_DATASETS_WGT,
+    SMALL_DATASETS,
+)
+from cugraph.experimental import datasets
 
-dataset_path = Path(__file__).parents[4] / "datasets"
+# Add the sg marker to all tests in this module.
+pytestmark = pytest.mark.sg
 
 
-# Use this to simulate a fresh API import
-@pytest.fixture
-def datasets():
-    from cugraph.experimental import datasets
+###############################################################################
+# Fixtures
 
-    yield datasets
-    del datasets
-    clear_locals()
+# module fixture - called once for this module
+@pytest.fixture(scope="module")
+def tmpdir():
+    """
+    Create a tmp dir for downloads, etc., run a test, then cleanup when the
+    test is done.
+    """
+    tmpd = TemporaryDirectory()
+    yield tmpd
+    # teardown
+    tmpd.cleanup()
 
 
-def clear_locals():
+# function fixture - called once for each function in this module
+@pytest.fixture(scope="function", autouse=True)
+def setup(tmpdir):
+    """
+    Fixture used for individual test setup and teardown. This ensures each
+    Dataset object starts with the same state and cleans up when the test is
+    done.
+    """
+    # FIXME: this relies on dataset features (unload) which themselves are
+    # being tested in this module.
     for dataset in ALL_DATASETS:
-        dataset._edgelist = None
-        dataset._graph = None
-        dataset._path = None
+        dataset.unload()
+    gc.collect()
 
+    datasets.set_download_dir(tmpdir.name)
 
-# We use this to create tempfiles that act as config files when we call
-# set_config(). Arguments passed will act as custom download directories
-def create_config(custom_path="custom_storage_location"):
-    config_yaml = """
-                    fetch: False
-                    force: False
-                    download_dir: None
-                    """
-    c = yaml.safe_load(config_yaml)
-    c["download_dir"] = custom_path
+    yield
 
-    outfile = NamedTemporaryFile()
-    with open(outfile.name, "w") as f:
-        yaml.dump(c, f, sort_keys=False)
+    # teardown
+    for dataset in ALL_DATASETS:
+        dataset.unload()
+    gc.collect()
 
-    return outfile
 
+###############################################################################
+# Tests
 
 # setting download_dir to None effectively re-initialized the default
-@pytest.mark.sg
-def test_env_var(datasets):
+def test_env_var():
     os.environ["RAPIDS_DATASET_ROOT_DIR"] = "custom_storage_location"
     datasets.set_download_dir(None)
 
@@ -75,26 +86,14 @@ def test_env_var(datasets):
     del os.environ["RAPIDS_DATASET_ROOT_DIR"]
 
 
-@pytest.mark.sg
-def test_home_dir(datasets):
+def test_home_dir():
     datasets.set_download_dir(None)
     expected_path = Path.home() / ".cugraph/datasets"
 
     assert datasets.get_download_dir() == expected_path
 
 
-@pytest.mark.sg
-def test_set_config(datasets):
-    cfg = create_config()
-    datasets.set_config(cfg.name)
-
-    assert datasets.get_download_dir() == Path("custom_storage_location").absolute()
-
-    cfg.close()
-
-
-@pytest.mark.sg
-def test_set_download_dir(datasets):
+def test_set_download_dir():
     tmpd = TemporaryDirectory()
     datasets.set_download_dir(tmpd.name)
 
@@ -103,59 +102,26 @@ def test_set_download_dir(datasets):
     tmpd.cleanup()
 
 
-@pytest.mark.sg
-@pytest.mark.skip(
-    reason="Timeout errors; see: https://github.com/rapidsai/cugraph/issues/2810"
-)
-def test_load_all(datasets):
-    tmpd = TemporaryDirectory()
-    cfg = create_config(custom_path=tmpd.name)
-    datasets.set_config(cfg.name)
-    datasets.load_all()
-
-    for data in datasets.ALL_DATASETS:
-        file_path = Path(tmpd.name) / (
-            data.metadata["name"] + data.metadata["file_type"]
-        )
-        assert file_path.is_file()
-
-    tmpd.cleanup()
-
-
-@pytest.mark.sg
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
-def test_fetch(dataset, datasets):
-    tmpd = TemporaryDirectory()
-    cfg = create_config(custom_path=tmpd.name)
-    datasets.set_config(cfg.name)
-
+def test_fetch(dataset):
     E = dataset.get_edgelist(fetch=True)
 
     assert E is not None
     assert dataset.get_path().is_file()
 
-    tmpd.cleanup()
 
-
-@pytest.mark.sg
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
-def test_get_edgelist(dataset, datasets):
-    datasets.set_download_dir(dataset_path)
+def test_get_edgelist(dataset):
     E = dataset.get_edgelist(fetch=True)
-
     assert E is not None
 
 
-@pytest.mark.sg
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
-def test_get_graph(dataset, datasets):
-    datasets.set_download_dir(dataset_path)
+def test_get_graph(dataset):
     G = dataset.get_graph(fetch=True)
-
     assert G is not None
 
 
-@pytest.mark.sg
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
 def test_metadata(dataset):
     M = dataset.metadata
@@ -163,9 +129,8 @@ def test_metadata(dataset):
     assert M is not None
 
 
-@pytest.mark.sg
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
-def test_get_path(dataset, datasets):
+def test_get_path(dataset):
     tmpd = TemporaryDirectory()
     datasets.set_download_dir(tmpd.name)
     dataset.get_edgelist(fetch=True)
@@ -174,27 +139,103 @@ def test_get_path(dataset, datasets):
     tmpd.cleanup()
 
 
-@pytest.mark.sg
 @pytest.mark.parametrize("dataset", ALL_DATASETS_WGT)
-def test_weights(dataset, datasets):
-    datasets.set_download_dir(dataset_path)
-
-    G_w = dataset.get_graph(fetch=True)
+def test_weights(dataset):
+    G = dataset.get_graph(fetch=True)
+    assert G.is_weighted()
     G = dataset.get_graph(fetch=True, ignore_weights=True)
-
-    assert G_w.is_weighted()
     assert not G.is_weighted()
 
 
-@pytest.mark.sg
 @pytest.mark.parametrize("dataset", SMALL_DATASETS)
-def test_create_using(dataset, datasets):
-    datasets.set_download_dir(dataset_path)
+def test_create_using(dataset):
+    G = dataset.get_graph(fetch=True)
+    assert not G.is_directed()
+    G = dataset.get_graph(fetch=True, create_using=Graph)
+    assert not G.is_directed()
+    G = dataset.get_graph(fetch=True, create_using=Graph(directed=True))
+    assert G.is_directed()
 
-    G_d = dataset.get_graph()
-    G_t = dataset.get_graph(create_using=Graph)
-    G = dataset.get_graph(create_using=Graph(directed=True))
 
-    assert not G_d.is_directed()
-    assert not G_t.is_directed()
-    assert G.is_directed()
+def test_ctor_with_datafile():
+    from cugraph.experimental.datasets import karate
+
+    karate_csv = RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv"
+
+    # test that only a metadata file or csv can be specified, not both
+    with pytest.raises(ValueError):
+        datasets.Dataset(metadata_yaml_file="metadata_file", csv_file=karate_csv)
+
+    # ensure at least one arg is provided
+    with pytest.raises(ValueError):
+        datasets.Dataset()
+
+    # ensure csv file has all other required args (col names and col dtypes)
+    with pytest.raises(ValueError):
+        datasets.Dataset(csv_file=karate_csv)
+
+    with pytest.raises(ValueError):
+        datasets.Dataset(csv_file=karate_csv, csv_col_names=["src", "dst", "wgt"])
+
+    # test with file that DNE
+    with pytest.raises(FileNotFoundError):
+        datasets.Dataset(
+            csv_file="/some/file/that/does/not/exist",
+            csv_col_names=["src", "dst", "wgt"],
+            csv_col_dtypes=["int32", "int32", "float32"],
+        )
+
+    expected_karate_edgelist = karate.get_edgelist(fetch=True)
+
+    # test with file path as string, ensure fetch=True does not break
+    ds = datasets.Dataset(
+        csv_file=karate_csv.as_posix(),
+        csv_col_names=["src", "dst", "wgt"],
+        csv_col_dtypes=["int32", "int32", "float32"],
+    )
+    # cudf.testing.testing.assert_frame_equal() would be good to use to
+    # compare, but for some reason it seems to be holding a reference to a
+    # dataframe and gc.collect() does not free everything
+    el = ds.get_edgelist()
+    assert len(el) == len(expected_karate_edgelist)
+    assert str(ds) == "karate"
+    assert ds.get_path() == karate_csv
+
+    # test with file path as Path object
+    ds = datasets.Dataset(
+        csv_file=karate_csv,
+        csv_col_names=["src", "dst", "wgt"],
+        csv_col_dtypes=["int32", "int32", "float32"],
+    )
+    el = ds.get_edgelist()
+    assert len(el) == len(expected_karate_edgelist)
+    assert str(ds) == "karate"
+    assert ds.get_path() == karate_csv
+
+
+def test_unload():
+    email_csv = RAPIDS_DATASET_ROOT_DIR_PATH / "email-Eu-core.csv"
+
+    ds = datasets.Dataset(
+        csv_file=email_csv.as_posix(),
+        csv_col_names=["src", "dst", "wgt"],
+        csv_col_dtypes=["int32", "int32", "float32"],
+    )
+
+    # FIXME: another (better?) test would be to check free memory and assert
+    # the memory use increases after get_*(), then returns to the pre-get_*()
+    # level after unload(). However, that type of test may fail for several
+    # reasons (the device being monitored is accidentally also being used by
+    # another process, and the use of memory pools to name two). Instead, just
+    # test that the internal members get cleared on unload().
+    assert ds._edgelist is None
+
+    ds.get_edgelist()
+    assert ds._edgelist is not None
+    ds.unload()
+    assert ds._edgelist is None
+
+    ds.get_graph()
+    assert ds._edgelist is not None
+    ds.unload()
+    assert ds._edgelist is None
diff --git a/python/cugraph/cugraph/traversal/bfs.py b/python/cugraph/cugraph/traversal/bfs.py
index a200ba9b5d8..f2c1f5c5662 100644
--- a/python/cugraph/cugraph/traversal/bfs.py
+++ b/python/cugraph/cugraph/traversal/bfs.py
@@ -126,7 +126,7 @@ def bfs(
 ):
     """
     Find the distances and predecessors for a breadth first traversal of a
-    graph.
+    graph.  Unlike SSSP, BFS supports unweighted graphs.
 
     Parameters
     ----------
diff --git a/python/cugraph/cugraph/traversal/sssp.py b/python/cugraph/cugraph/traversal/sssp.py
index 9557650cbbc..c2705b70383 100644
--- a/python/cugraph/cugraph/traversal/sssp.py
+++ b/python/cugraph/cugraph/traversal/sssp.py
@@ -12,7 +12,6 @@
 # limitations under the License.
 
 import numpy as np
-import warnings
 
 import cudf
 from cugraph.structure import Graph, MultiGraph
@@ -126,11 +125,6 @@ def _convert_df_to_output_type(df, input_type, return_predecessors):
         raise TypeError(f"input type {input_type} is not a supported type.")
 
 
-# FIXME: if G is a Nx type, the weight attribute is assumed to be "weight", if
-# set. An additional optional parameter for the weight attr name when accepting
-# Nx graphs may be needed.  From the Nx docs:
-# |      Many NetworkX algorithms designed for weighted graphs use
-# |      an edge attribute (by default `weight`) to hold a numerical value.
 def sssp(
     G,
     source=None,
@@ -141,6 +135,7 @@ def sssp(
     overwrite=None,
     indices=None,
     cutoff=None,
+    edge_attr="weight",
 ):
     """
     Compute the distance and predecessors for shortest paths from the specified
@@ -150,7 +145,9 @@ def sssp(
     unreachable will have a distance of infinity denoted by the maximum value
     of the data type and the predecessor set as -1. The source vertex's
     predecessor is also set to -1. Graphs with negative weight cycles are not
-    supported.
+    supported.  Unweighted graphs are also unsupported.
+
+    For finding shortest paths on an unweighted graph, use BFS instead.
 
     Parameters
     ----------
@@ -161,8 +158,12 @@ def sssp(
         The current implementation only supports weighted graphs.
     source : int
         Index of the source vertex.
-    cutoff : double, optional (default = None)
+    cutoff : double, optional (default=None)
         Maximum edge weight sum considered by the algorithm
+    edge_attr : str, optional (default='weight')
+        The name of the edge attribute that represents the weight of an edge.
+        This currently applies only when G is a NetworkX Graph.
+        Default value is 'weight', which follows NetworkX convention.
 
     Returns
     -------
@@ -211,17 +212,16 @@ def sssp(
         G, source, method, directed, return_predecessors, unweighted, overwrite, indices
     )
 
-    # FIXME: allow nx_weight_attr to be specified
     (G, input_type) = ensure_cugraph_obj(
-        G, nx_weight_attr="weight", matrix_graph_type=Graph(directed=directed)
+        G, nx_weight_attr=edge_attr, matrix_graph_type=Graph(directed=directed)
     )
 
-    if not G.edgelist.weights:
-        warning_msg = (
-            "'SSSP' requires the input graph to be weighted: Unweighted "
-            "graphs will not be supported in the next release."
+    if not G.is_weighted():
+        err_msg = (
+            "'SSSP' requires the input graph to be weighted."
+            "'BFS' should be used instead of 'SSSP' for unweighted graphs."
         )
-        warnings.warn(warning_msg, PendingDeprecationWarning)
+        raise RuntimeError(err_msg)
 
     if not G.has_node(source):
         raise ValueError("Graph does not contain source vertex")
diff --git a/python/cugraph/cugraph/utilities/nx_factory.py b/python/cugraph/cugraph/utilities/nx_factory.py
index d712e902df2..2448a511229 100644
--- a/python/cugraph/cugraph/utilities/nx_factory.py
+++ b/python/cugraph/cugraph/utilities/nx_factory.py
@@ -125,7 +125,7 @@ def convert_from_nx(
             f"nxG must be either a NetworkX Graph or DiGraph, got {type(nxG)}"
         )
 
-    is_weighted = nx.is_weighted(nxG)
+    is_weighted = nx.is_weighted(nxG, weight=weight)
 
     if is_weighted is False:
         _gdf = convert_unweighted_to_gdf(nxG, vertex_type)
diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml
index 12ca224ab76..ee4b67ee9e4 100644
--- a/python/cugraph/pyproject.toml
+++ b/python/cugraph/pyproject.toml
@@ -6,10 +6,10 @@ requires = [
     "cmake>=3.23.1,!=3.25.0",
     "cython>=0.29,<0.30",
     "ninja",
-    "pylibcugraph==23.4.*",
-    "pylibraft==23.4.*",
-    "rmm==23.4.*",
-    "scikit-build>=0.13.1",
+    "pylibcugraph==23.6.*",
+    "pylibraft==23.6.*",
+    "rmm==23.6.*",
+    "scikit-build>=0.13.1,<0.17.2",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -20,32 +20,31 @@ testpaths = ["cugraph/tests"]
 
 [project]
 name = "cugraph"
-version = "23.04.01"
+version = "23.06.00"
 description = "cuGraph - RAPIDS GPU Graph Analytics"
 readme = { file = "README.md", content-type = "text/markdown" }
 authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 dependencies = [
-    "cuda-python>=11.7.1,<12.0",
-    "cudf==23.4.*",
-    "cupy-cuda11x>=9.5.0,<12.0.0a0",
-    "dask-cuda==23.4.*",
-    "dask-cudf==23.4.*",
+    "cudf==23.6.*",
+    "cupy-cuda11x>=12.0.0",
+    "dask-cuda==23.6.*",
+    "dask-cudf==23.6.*",
     "dask==2023.3.2",
     "distributed==2023.3.2.1",
-    "numba>=0.56.2",
-    "pylibcugraph==23.4.*",
-    "raft-dask==23.4.*",
-    "rmm==23.4.*",
-    "ucx-py==0.31.*",
+    "numba>=0.57",
+    "pylibcugraph==23.6.*",
+    "raft-dask==23.6.*",
+    "rmm==23.6.*",
+    "ucx-py==0.32.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
 ]
 
diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt
index ff1bc2d1a34..21097983a1b 100644
--- a/python/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
-set(pylibcugraph_version 23.04.01)
+set(pylibcugraph_version 23.06.00)
 
 include(../../fetch_rapids.cmake)
 
diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
index f4a9f40431d..7c50456eb4d 100644
--- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt
@@ -36,6 +36,7 @@ set(cython_sources
     sorensen_coefficients.pyx
     overlap_coefficients.pyx
     katz_centrality.pyx
+    leiden.pyx
     louvain.pyx
     node2vec.pyx
     pagerank.pyx
diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py
index 2c07b7caf7f..6b985633f9f 100644
--- a/python/pylibcugraph/pylibcugraph/__init__.py
+++ b/python/pylibcugraph/pylibcugraph/__init__.py
@@ -77,6 +77,8 @@
 
 from pylibcugraph.random import CuGraphRandomState
 
+from pylibcugraph.leiden import leiden
+
 from pylibcugraph.select_random_vertices import select_random_vertices
 
-__version__ = "23.04.01"
+__version__ = "23.06.00"
diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd
index be58072e1d9..67ba43bf611 100644
--- a/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd
+++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd
@@ -34,6 +34,9 @@ from pylibcugraph._cugraph_c.graph_functions cimport (
     cugraph_induced_subgraph_result_t,
 )
 
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t,
+)
 
 cdef extern from "cugraph_c/community_algorithms.h":
     ###########################################################################
@@ -113,6 +116,41 @@ cdef extern from "cugraph_c/community_algorithms.h":
             cugraph_error_t** error
         )
     
+    # leiden
+    ctypedef struct cugraph_hierarchical_clustering_result_t:
+        pass
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_hierarchical_clustering_result_get_vertices(
+            cugraph_hierarchical_clustering_result_t* result
+        )
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        cugraph_hierarchical_clustering_result_get_clusters(
+            cugraph_hierarchical_clustering_result_t* result
+        )
+    
+    cdef double cugraph_hierarchical_clustering_result_get_modularity(
+        cugraph_hierarchical_clustering_result_t* result
+        )
+
+    cdef void \
+        cugraph_hierarchical_clustering_result_free(
+            cugraph_hierarchical_clustering_result_t* result
+        )
+
+    cdef cugraph_error_code_t \
+        cugraph_leiden(
+            const cugraph_resource_handle_t* handle,
+            cugraph_rng_state_t* rng_state,
+            cugraph_graph_t* graph,
+            size_t max_level,
+            double resolution,
+            double theta,
+            bool_t do_expensive_check,
+            cugraph_hierarchical_clustering_result_t** result,
+            cugraph_error_t** error
+        )
     ###########################################################################
     # ECG
     cdef cugraph_error_code_t \
diff --git a/python/pylibcugraph/pylibcugraph/bfs.pyx b/python/pylibcugraph/pylibcugraph/bfs.pyx
index 8af3f48736b..b9d17f15cc5 100644
--- a/python/pylibcugraph/pylibcugraph/bfs.pyx
+++ b/python/pylibcugraph/pylibcugraph/bfs.pyx
@@ -102,47 +102,45 @@ def bfs(ResourceHandle handle, _GPUGraph graph,
 
     Examples
     --------
-
-    M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
-                      dtype=['int32', 'int32', 'float32'], header=None)
-    G = cugraph.Graph()
-    G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
-
-    handle = ResourceHandle()
-
-    srcs = G.edgelist.edgelist_df['src']
-    dsts = G.edgelist.edgelist_df['dst']
-    weights = G.edgelist.edgelist_df['weights']
-
-    sg = SGGraph(
-        resource_handle = handle, 
-        graph_properties = GraphProperties(is_multigraph=G.is_multigraph()), 
-        src_array = srcs, 
-        dst_array = dsts, 
-        weight_array = weights,
-        store_transposed=False,
-        renumber=False,
-        do_expensive_check=do_expensive_check
-    )
-
-    res = pylibcugraph_bfs(
-            handle,    
-            sg,
-            cudf.Series([0], dtype='int32'),
-            False,
-            10,
-            True,
-            False
-    )
-
-    distances, predecessors, vertices = res
-    
-    final_results = cudf.DataFrame({
-        'distance': cudf.Series(distances),
-        'vertex': cudf.Series(vertices),
-        'predecessor': cudf.Series(predecessors),
-    })
-
+    >>> M = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
+    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
+    >>> G = cugraph.Graph()
+    >>>  G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
+    >>>
+    >>> handle = ResourceHandle()
+    >>>
+    >>> srcs = G.edgelist.edgelist_df['src']
+    >>> dsts = G.edgelist.edgelist_df['dst']
+    >>> weights = G.edgelist.edgelist_df['weights']
+    >>>
+    >>> sg = SGGraph(
+    >>>     resource_handle = handle, 
+    >>>     graph_properties = GraphProperties(is_multigraph=G.is_multigraph()), 
+    >>>     src_array = srcs, 
+    >>>     dst_array = dsts, 
+    >>>     weight_array = weights,
+    >>>     store_transposed=False,
+    >>>     renumber=False,
+    >>>     do_expensive_check=do_expensive_check
+    >>> )
+    >>>
+    >>> res = pylibcugraph_bfs(
+    >>>         handle,    
+    >>>         sg,
+    >>>         cudf.Series([0], dtype='int32'),
+    >>>         False,
+    >>>         10,
+    >>>         True,
+    >>>         False
+    >>> )
+    >>>
+    >>> distances, predecessors, vertices = res
+    >>>
+    f>>> inal_results = cudf.DataFrame({
+    >>>     'distance': cudf.Series(distances),
+    >>>     'vertex': cudf.Series(vertices),
+    >>>     'predecessor': cudf.Series(predecessors),
+    >>> })
     """
 
     try:
diff --git a/python/pylibcugraph/pylibcugraph/egonet.pyx b/python/pylibcugraph/pylibcugraph/egonet.pyx
index 779aa0028b3..d011d946e46 100644
--- a/python/pylibcugraph/pylibcugraph/egonet.pyx
+++ b/python/pylibcugraph/pylibcugraph/egonet.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -153,10 +153,16 @@ def ego_graph(ResourceHandle resource_handle,
     # for perfomance improvement
     cupy_sources = copy_to_cupy_array(
         c_resource_handle_ptr, sources_ptr)
+    
     cupy_destinations = copy_to_cupy_array(
         c_resource_handle_ptr, destinations_ptr)
-    cupy_edge_weights = copy_to_cupy_array(
-        c_resource_handle_ptr, edge_weights_ptr)
+    
+    if edge_weights_ptr is not NULL:
+        cupy_edge_weights = copy_to_cupy_array(
+            c_resource_handle_ptr, edge_weights_ptr)
+    else:
+        cupy_edge_weights = None
+
     cupy_subgraph_offsets = copy_to_cupy_array(
         c_resource_handle_ptr, subgraph_offsets_ptr)
 
diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx
index 8aed98cb98a..dfbbf09129b 100644
--- a/python/pylibcugraph/pylibcugraph/graphs.pyx
+++ b/python/pylibcugraph/pylibcugraph/graphs.pyx
@@ -142,7 +142,7 @@ cdef class SGGraph(_GPUGraph):
                   GraphProperties graph_properties,
                   src_or_offset_array,
                   dst_or_index_array,
-                  weight_array,
+                  weight_array=None,
                   store_transposed=False,
                   renumber=False,
                   do_expensive_check=False,
@@ -177,18 +177,22 @@ cdef class SGGraph(_GPUGraph):
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 src_or_offset_array
             )
+        
         cdef cugraph_type_erased_device_array_view_t* dsts_or_indices_view_ptr = \
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 dst_or_index_array
             )
+
         cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 weight_array
             )
+        
         cdef cugraph_type_erased_device_array_view_t* edge_id_view_ptr = \
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 edge_id_array
             )
+        
         cdef cugraph_type_erased_device_array_view_t* edge_type_view_ptr = \
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 edge_type_array
@@ -306,7 +310,7 @@ cdef class MGGraph(_GPUGraph):
                   GraphProperties graph_properties,
                   src_array,
                   dst_array,
-                  weight_array,
+                  weight_array=None,
                   store_transposed=False,
                   num_edges=-1,
                   do_expensive_check=False,
@@ -354,18 +358,14 @@ cdef class MGGraph(_GPUGraph):
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 weight_array
             )
-        cdef cugraph_type_erased_device_array_view_t* edge_id_view_ptr = NULL
-        if edge_id_array is not None:
-            edge_id_view_ptr = \
-                create_cugraph_type_erased_device_array_view_from_py_obj(
-                    edge_id_array
-                )
-        cdef cugraph_type_erased_device_array_view_t* edge_type_view_ptr = NULL
-        if edge_type_array is not None:
-            edge_type_view_ptr = \
-                create_cugraph_type_erased_device_array_view_from_py_obj(
-                    edge_type_array
-                )
+        cdef cugraph_type_erased_device_array_view_t* edge_id_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                edge_id_array
+            )
+        cdef cugraph_type_erased_device_array_view_t* edge_type_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                edge_type_array
+            )
 
         error_code = cugraph_mg_graph_create(
             resource_handle.c_resource_handle_ptr,
diff --git a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
index 55b3862774f..1391bbc9236 100644
--- a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
+++ b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx
@@ -85,6 +85,10 @@ cdef class SamplingResult:
         cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
             cugraph_sample_result_get_edge_weight(self.c_sample_result_ptr)
         )
+
+        if device_array_view_ptr is NULL:
+            return None
+
         return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
                                                      self)
 
@@ -98,6 +102,10 @@ cdef class SamplingResult:
         cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
             cugraph_sample_result_get_edge_id(self.c_sample_result_ptr)
         )
+
+        if device_array_view_ptr is NULL:
+            return None
+
         return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
                                                      self)
 
@@ -108,6 +116,10 @@ cdef class SamplingResult:
         cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = (
             cugraph_sample_result_get_edge_type(self.c_sample_result_ptr)
         )
+
+        if device_array_view_ptr is NULL:
+            return None
+
         return create_cupy_array_view_for_device_ptr(device_array_view_ptr,
                                                      self)
     
diff --git a/python/pylibcugraph/pylibcugraph/k_core.pyx b/python/pylibcugraph/pylibcugraph/k_core.pyx
index 50344469b11..c47cfef7a7a 100644
--- a/python/pylibcugraph/pylibcugraph/k_core.pyx
+++ b/python/pylibcugraph/pylibcugraph/k_core.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -156,12 +156,16 @@ def k_core(ResourceHandle resource_handle,
         cugraph_k_core_result_get_src_vertices(k_core_result_ptr)
     cdef cugraph_type_erased_device_array_view_t* dst_vertices_ptr = \
         cugraph_k_core_result_get_dst_vertices(k_core_result_ptr)
-    cdef cugraph_type_erased_device_array_view_t* weigths_ptr = \
+    cdef cugraph_type_erased_device_array_view_t* weights_ptr = \
         cugraph_k_core_result_get_weights(k_core_result_ptr)
 
     cupy_src_vertices = copy_to_cupy_array(c_resource_handle_ptr, src_vertices_ptr)
     cupy_dst_vertices = copy_to_cupy_array(c_resource_handle_ptr, dst_vertices_ptr)
-    cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weigths_ptr)
+
+    if weights_ptr is not NULL:
+        cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr)
+    else:
+        cupy_weights = None
 
     cugraph_k_core_result_free(k_core_result_ptr)
     cugraph_core_result_free(core_result_ptr)
diff --git a/python/pylibcugraph/pylibcugraph/leiden.pyx b/python/pylibcugraph/pylibcugraph/leiden.pyx
new file mode 100644
index 00000000000..87286234f16
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/leiden.pyx
@@ -0,0 +1,167 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+from pylibcugraph._cugraph_c.community_algorithms cimport (
+    cugraph_hierarchical_clustering_result_t,
+    cugraph_leiden,
+    cugraph_hierarchical_clustering_result_get_vertices,
+    cugraph_hierarchical_clustering_result_get_clusters,
+    cugraph_hierarchical_clustering_result_get_modularity,
+    cugraph_hierarchical_clustering_result_free,
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+)
+from pylibcugraph._cugraph_c.random cimport (
+    cugraph_rng_state_t
+)
+from pylibcugraph.random cimport (
+    CuGraphRandomState
+)
+
+
+def leiden(ResourceHandle resource_handle,
+           random_state,
+           _GPUGraph graph,
+           size_t max_level,
+           double resolution,
+           double theta,
+           bool_t do_expensive_check):
+    """
+    Compute the modularity optimizing partition of the input graph using the
+    Leiden method.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    random_state : int , optional
+        Random state to use when generating samples. Optional argument,
+        defaults to a hash of process id, time, and hostname.
+        (See pylibcugraph.random.CuGraphRandomState)
+
+    graph : SGGraph or MGGraph
+        The input graph.
+
+    max_level: size_t
+        This controls the maximum number of levels/iterations of the leiden
+        algorithm. When specified the algorithm will terminate after no more
+        than the specified number of iterations. No error occurs when the
+        algorithm terminates early in this manner.
+
+    resolution: double
+        Called gamma in the modularity formula, this changes the size
+        of the communities.  Higher resolutions lead to more smaller
+        communities, lower resolutions lead to fewer larger communities.
+        Defaults to 1.
+
+    theta: double
+        Called theta in the Leiden algorithm, this is used to scale
+        modularity gain in Leiden refinement phase, to compute
+        the probability of joining a random leiden community.
+
+    do_expensive_check : bool_t
+        If True, performs more extensive tests on the inputs to ensure
+        validitity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple containing the hierarchical clustering vertices, clusters and
+    modularity score
+
+    Examples
+    --------
+    >>> import pylibcugraph, cupy, numpy
+    >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32)
+    >>> dsts = cupy.asarray([1, 2, 0], dtype=numpy.int32)
+    >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32)
+    >>> resource_handle = pylibcugraph.ResourceHandle()
+    >>> graph_props = pylibcugraph.GraphProperties(
+    ...     is_symmetric=True, is_multigraph=False)
+    >>> G = pylibcugraph.SGGraph(
+    ...     resource_handle, graph_props, srcs, dsts, weights,
+    ...     store_transposed=True, renumber=False, do_expensive_check=False)
+    >>> (vertices, clusters, modularity) = pylibcugraph.Leiden(
+                                resource_handle, G, 100, 1., False)
+    >>> vertices
+    [0, 1, 2]
+    >>> clusters
+    [0, 0, 0]
+    >>> modularity
+    0.0
+
+    """
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+    cdef cugraph_hierarchical_clustering_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    cg_rng_state = CuGraphRandomState(resource_handle, random_state)
+
+    cdef cugraph_rng_state_t* rng_state_ptr = cg_rng_state.rng_state_ptr
+
+    error_code = cugraph_leiden(c_resource_handle_ptr,
+                                rng_state_ptr,
+                                c_graph_ptr,
+                                max_level,
+                                resolution,
+                                theta,
+                                do_expensive_check,
+                                &result_ptr,
+                                &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_leiden")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* vertices_ptr = \
+        cugraph_hierarchical_clustering_result_get_vertices(result_ptr)
+    cdef cugraph_type_erased_device_array_view_t* clusters_ptr = \
+        cugraph_hierarchical_clustering_result_get_clusters(result_ptr)
+    cdef double modularity = \
+        cugraph_hierarchical_clustering_result_get_modularity(result_ptr)
+
+    cupy_vertices = copy_to_cupy_array(c_resource_handle_ptr, vertices_ptr)
+    cupy_clusters = copy_to_cupy_array(c_resource_handle_ptr, clusters_ptr)
+
+    cugraph_hierarchical_clustering_result_free(result_ptr)
+
+    return (cupy_vertices, cupy_clusters, modularity)
diff --git a/python/pylibcugraph/pylibcugraph/louvain.pyx b/python/pylibcugraph/pylibcugraph/louvain.pyx
index d535c52190a..ecae7e700b4 100644
--- a/python/pylibcugraph/pylibcugraph/louvain.pyx
+++ b/python/pylibcugraph/pylibcugraph/louvain.pyx
@@ -14,11 +14,8 @@
 # Have cython use python 3 syntax
 # cython: language_level = 3
 
-from libc.stdint cimport uintptr_t
-
 from pylibcugraph._cugraph_c.resource_handle cimport (
     bool_t,
-    data_type_id_t,
     cugraph_resource_handle_t,
 )
 from pylibcugraph._cugraph_c.error cimport (
@@ -27,7 +24,6 @@ from pylibcugraph._cugraph_c.error cimport (
 )
 from pylibcugraph._cugraph_c.array cimport (
     cugraph_type_erased_device_array_view_t,
-    cugraph_type_erased_device_array_view_create,
 )
 from pylibcugraph._cugraph_c.graph cimport (
     cugraph_graph_t,
@@ -48,10 +44,7 @@ from pylibcugraph.graphs cimport (
 )
 from pylibcugraph.utils cimport (
     assert_success,
-    assert_CAI_type,
     copy_to_cupy_array,
-    get_c_type_from_numpy_type,
-    create_cugraph_type_erased_device_array_view_from_py_obj,
 )
 
 
diff --git a/python/pylibcugraph/pylibcugraph/testing/utils.py b/python/pylibcugraph/pylibcugraph/testing/utils.py
index f578a146f4b..50fe18fe13d 100644
--- a/python/pylibcugraph/pylibcugraph/testing/utils.py
+++ b/python/pylibcugraph/pylibcugraph/testing/utils.py
@@ -36,17 +36,17 @@ def gen_fixture_params(*param_values):
     combination passed in, or a callable that accepts a list of values and
     returns a string.
 
-    gen_fixture_params( (pytest.param(True, marks=[pytest.mark.A_good], id="A=True"),
-                         pytest.param(False, marks=[pytest.mark.B_bad], id="B=False")),
-                        (pytest.param(False, marks=[pytest.mark.A_bad], id="A=False"),
-                         pytest.param(True, marks=[pytest.mark.B_good], id="B=True")),
+    gen_fixture_params( (pytest.param(True, marks=[pytest.mark.A_good], id="A:True"),
+                         pytest.param(False, marks=[pytest.mark.B_bad], id="B:False")),
+                        (pytest.param(False, marks=[pytest.mark.A_bad], id="A:False"),
+                         pytest.param(True, marks=[pytest.mark.B_good], id="B:True")),
                        )
 
 
     results in fixture param combinations:
 
-    True, False  - marks=[A_good, B_bad]  - id="A=True,B=False"
-    False, False - marks=[A_bad, B_bad]   - id="A=False,B=True"
+    True, False  - marks=[A_good, B_bad]  - id="A:True-B:False"
+    False, False - marks=[A_bad, B_bad]   - id="A:False-B:True"
     """
     fixture_params = []
     param_type = pytest.param().__class__  #
@@ -89,10 +89,10 @@ def gen_fixture_params_product(*args):
 
     results in fixture param combinations:
 
-    True, True   - marks=[A_good, B_good] - id="A=True,B=True"
-    True, False  - marks=[A_good, B_bad]  - id="A=True,B=False"
-    False, True  - marks=[A_bad, B_good]  - id="A=False,B=True"
-    False, False - marks=[A_bad, B_bad]   - id="A=False,B=False"
+    True, True   - marks=[A_good, B_good] - id="A:True-B:True"
+    True, False  - marks=[A_good, B_bad]  - id="A:True-B:False"
+    False, True  - marks=[A_bad, B_good]  - id="A:False-B:True"
+    False, False - marks=[A_bad, B_bad]   - id="A:False-B:False"
 
     Simply using itertools.product on the lists would result in a list of
     sublists of individual param objects (ie. not "merged"), which would not be
@@ -124,9 +124,9 @@ def gen_fixture_params_product(*args):
         for (p, paramId) in zip(paramCombo, ids):
             # Assume paramId is either a string or a callable
             if isinstance(paramId, str):
-                id_strings.append("%s=%s" % (paramId, p.values[0]))
+                id_strings.append("%s:%s" % (paramId, p.values[0]))
             else:
                 id_strings.append(paramId(p.values[0]))
-        comboid = ",".join(id_strings)
+        comboid = "-".join(id_strings)
         retList.append(pytest.param(values, marks=marks, id=comboid))
     return retList
diff --git a/python/pylibcugraph/pylibcugraph/tests/test_connected_components.py b/python/pylibcugraph/pylibcugraph/tests/test_connected_components.py
index b742d8232ec..3e1781e5ef0 100644
--- a/python/pylibcugraph/pylibcugraph/tests/test_connected_components.py
+++ b/python/pylibcugraph/pylibcugraph/tests/test_connected_components.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -446,7 +446,7 @@ def test_invalid_input_wcc():
     sp_indices = scipy_csr.indices  # unsupported
 
     resource_handle = ResourceHandle()
-    graph_props = GraphProperties(is_symmetric=True, is_multigraph=False)
+    graph_props = GraphProperties(is_symmetric=False, is_multigraph=False)
     with pytest.raises(TypeError):
         pylibcugraph.weakly_connected_components(
             resource_handle, None, sp_offsets, sp_indices, None, False
diff --git a/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py b/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py
index f4298cc9b36..74aa6830d24 100644
--- a/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py
+++ b/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py
@@ -25,6 +25,11 @@
 )
 from pylibcugraph import uniform_neighbor_sample
 
+# Set to True to disable memory leak assertions. This may be necessary when
+# running in environments that share a GPU (pytest-xdist), are using memory
+# pools, or other reasons which may cause the memory leak assertions to
+# improperly fail.
+mem_leak_assert_disabled = True
 
 # =============================================================================
 # Pytest fixtures
@@ -256,7 +261,7 @@ def test_neighborhood_sampling_large_sg_graph(gpubenchmark):
     expected_delta = free_memory_before - free_before_cleanup
     leak = expected_delta - actual_delta
     print(f"  {result_bytes=} {actual_delta=} {expected_delta=} {leak=}")
-    assert free_memory_before == device.mem_info[0]
+    assert (free_memory_before == device.mem_info[0]) or mem_leak_assert_disabled
 
 
 def test_sample_result():
@@ -289,7 +294,7 @@ def test_sample_result():
         device_batch_label=cp.arange(1e8 + 6, dtype="int32"),
     )
 
-    assert free_memory_before > device.mem_info[0]
+    assert (free_memory_before > device.mem_info[0]) or mem_leak_assert_disabled
 
     sources = sampling_result.get_sources()
     destinations = sampling_result.get_destinations()
@@ -304,7 +309,7 @@ def test_sample_result():
     # keeping the refcount >0.
     del sampling_result
     gc.collect()
-    assert free_memory_before > device.mem_info[0]
+    assert (free_memory_before > device.mem_info[0]) or mem_leak_assert_disabled
 
     # Check that the data is still valid
     assert sources[999] == 999
@@ -324,9 +329,9 @@ def test_sample_result():
 
     # sources2 should be keeping the data alive
     assert sources2[999] == 999
-    assert free_memory_before > device.mem_info[0]
+    assert (free_memory_before > device.mem_info[0]) or mem_leak_assert_disabled
 
     # All memory should be freed once the last reference is deleted
     del sources2
     gc.collect()
-    assert free_memory_before == device.mem_info[0]
+    assert (free_memory_before == device.mem_info[0]) or mem_leak_assert_disabled
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index ce16263b21e..ac4786653b5 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -6,9 +6,9 @@ requires = [
     "cmake>=3.23.1,!=3.25.0",
     "cython>=0.29,<0.30",
     "ninja",
-    "pylibraft==23.4.*",
-    "rmm==23.4.*",
-    "scikit-build>=0.13.1",
+    "pylibraft==23.6.*",
+    "rmm==23.6.*",
+    "scikit-build>=0.13.1,<0.17.2",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -19,28 +19,28 @@ testpaths = ["pylibcugraph/tests"]
 
 [project]
 name = "pylibcugraph"
-version = "23.04.01"
+version = "23.06.00"
 description = "pylibcugraph - Python bindings for the libcugraph cuGraph C/C++/CUDA library"
 readme = { file = "README.md", content-type = "text/markdown" }
 authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 dependencies = [
-    "pylibraft==23.4.*",
-    "rmm==23.4.*",
+    "pylibraft==23.6.*",
+    "rmm==23.6.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
 ]
 
 [project.optional-dependencies]
 test = [
-    "cudf==23.4.*",
+    "cudf==23.6.*",
     "networkx>=2.5.1",
     "numpy>=1.21",
     "pandas",
diff --git a/readme_pages/CONTRIBUTING.md b/readme_pages/CONTRIBUTING.md
index 8ab7162f6e4..4b736b25155 100644
--- a/readme_pages/CONTRIBUTING.md
+++ b/readme_pages/CONTRIBUTING.md
@@ -1,5 +1,5 @@
 # Contributing to cuGraph
-cuGraph, for the most part, is an open-source project where we encourage community involvement.  The cugraph-ops package is the expection being a closed-source package. 
+cuGraph, for the most part, is an open-source project where we encourage community involvement.  The cugraph-ops package is the expection being a closed-source package.
 
 There are multiple ways to be involved and contribute to the cuGraph community, the top paths are listed below:
 
@@ -11,8 +11,8 @@ If you are ready to contribute, jump right to the [Contribute Code](https://docs
 
 
 __Style Formatting Tools:__
-* `clang-format`  version 11.1+
-* `flake8`        version 3.5.0+
+* `clang-format`  version 16.0+
+* `flake8`        version 6.0.0+
 
 
 
@@ -36,7 +36,7 @@ If there is a feature or enhancement to an existing feature, please file an issu
 -	describing what you want to see added or changed.  For new features, if there is a white paper on the analytic, please include a reference to it
 
 ***Ask a Question***
-There are several ways to ask questions, including [Stack Overflow]( https://stackoverflow.com/), the quickest is by submiting a GitHub question issue.  
+There are several ways to ask questions, including [Stack Overflow]( https://stackoverflow.com/), the quickest is by submiting a GitHub question issue.
 
 -	Select Question
 -	describing your question
@@ -45,7 +45,7 @@ There are several ways to ask questions, including [Stack Overflow]( https://sta
 
 ## 2) Propose a New Feature and Implement It <a name="implement"></a>
 
-We love when people want to get involved, and if you have a suggestion for a new feature or enhancement and want to be the one doing the development work, we fully encourage that.  
+We love when people want to get involved, and if you have a suggestion for a new feature or enhancement and want to be the one doing the development work, we fully encourage that.
 
 - Submit a New Feature Issue (see above) and state that you are working on it.
 - The team will give feedback on the issue and happy to make suggestions
@@ -72,20 +72,21 @@ If you need more context on a particular issue, please ask.
 2. Read the RAPIDS [Code of Conduct](https://docs.rapids.ai/resources/conduct/)
 3. Find or submit an issue to work on (include a comment that you are working issue)
 4. Fork the cuGraph [repo](#fork) and Code (make sure to add unit tests)!
-5. When done, and code passes local CI, create your pull request (PR)
+5. All RAPIDS projects are released under the Apache-2.0 license, so also make sure all source files that support comments include a copyright and the Apache-2.0 license text.
+6. When done, and code passes local CI, create your pull request (PR)
    1. Update the CHANGELOG.md with PR number - see [Changelog formatting](https://docs.rapids.ai/resources/changelog/)
    2. Ensure that the PR has the proper [tags](./PRTAGS.md)
-   3. Ensure the code matches out [style guide](https://docs.rapids.ai/resources/style/) 
-6. Verify that cuGraph CI passes all [status checks](https://help.github.com/articles/about-status-checks/). Fix if needed
-7. Wait for other developers to review your code and update code as needed
-8. Once reviewed and approved, a RAPIDS developer will merge your pull request
+   3. Ensure the code matches out [style guide](https://docs.rapids.ai/resources/style/)
+7. Verify that cuGraph CI passes all [status checks](https://help.github.com/articles/about-status-checks/). Fix if needed
+8. Wait for other developers to review your code and update code as needed
+9. Once reviewed and approved, a RAPIDS developer will merge your pull request
 
 Remember, if you are unsure about anything, don't hesitate to comment on issues
 and ask for clarifications!
 
 **The _FIXME_** comment<pr>
 
-Use the _FIXME_ comment to capture technical debt.  It should not be used to flag bugs since those need to be cleaned up before code is submitted.   
+Use the _FIXME_ comment to capture technical debt.  It should not be used to flag bugs since those need to be cleaned up before code is submitted.
 We are implementing a script to count and track the number of FIXME in the code.  Usage of TODO or any other tag will not be accepted.
 
 
@@ -94,11 +95,11 @@ We are implementing a script to count and track the number of FIXME in the code.
 The RAPIDS cuGraph repo cannot directly be modified.  Contributions must come in the form of a *Pull Request* from a forked version of cugraph.    GitHub as a nice write up ion the process:  https://help.github.com/en/github/getting-started-with-github/fork-a-repo
 
 1. Fork the cugraph repo to your GitHub account
-2. clone your version 
+2. clone your version
 ```git clone https://github.com/<YOUR GITHUB NAME>/cugraph.git```
 
 
-Read the section on [building cuGraph from source](./SOURCEBUILD.md) to validate that the environment is correct.  
+Read the section on [building cuGraph from source](./SOURCEBUILD.md) to validate that the environment is correct.
 
 **Pro Tip** add an upstream remote repository so that you can keep your forked repo in sync
 ```git remote add upstream https://github.com/rapidsai/cugraph.git```
@@ -113,11 +114,11 @@ cuGraph only allows contribution to the current branch and not main or a future
    1. commit your code
     ```git push```
 6. From the GitHub web page, open a Pull Request
-   1. follow the Pull Request [tagging policy](./PRTAGS.md) 
+   1. follow the Pull Request [tagging policy](./PRTAGS.md)
 
 ### Development Environment
 
-There is no recommended or preferred development environment.  There are a few *must have* conditions on GPU hardware and library versions.  But for the most part, users can work in the environment that they are familiar and comfortable with.  
+There is no recommended or preferred development environment.  There are a few *must have* conditions on GPU hardware and library versions.  But for the most part, users can work in the environment that they are familiar and comfortable with.
 
 **Hardware**
 
diff --git a/readme_pages/SOURCEBUILD.md b/readme_pages/SOURCEBUILD.md
deleted file mode 100644
index 122b8b0d0ee..00000000000
--- a/readme_pages/SOURCEBUILD.md
+++ /dev/null
@@ -1,273 +0,0 @@
-# Building from Source
-
-The following instructions are for users wishing to build cuGraph from source code.  These instructions are tested on supported distributions of Linux, CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html) for list of supported environments.  Other operating systems _might be_ compatible, but are not currently tested.
-
-The cuGraph package include both a C/C++ CUDA portion and a python portion.  Both libraries need to be installed in order for cuGraph to operate correctly.
-
-## Prerequisites
-
-__Compiler__:
-* `gcc`         version 9.3+
-* `nvcc`        version 11.0+
-* `cmake`       version 3.20.1+
-
-__CUDA:__
-* CUDA 11.0+
-* NVIDIA driver 450.80.02+
-* Pascal architecture or better
-
-
-You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).
-
-
-
-## Building cuGraph
-To install cuGraph from source, ensure the dependencies are met.
-
-
-### Clone Repo and Configure Conda Environment
-__GIT clone a version of the repository__
-
-  ```bash
-  # Set the localtion to cuGraph in an environment variable CUGRAPH_HOME
-  export CUGRAPH_HOME=$(pwd)/cugraph
-
-  # Download the cuGraph repo - if you have a folked version, use that path here instead
-  git clone https://github.com/rapidsai/cugraph.git $CUGRAPH_HOME
-
-  cd $CUGRAPH_HOME
-  ```
-
-__Create the conda development environment__
-
-```bash
-# create the conda environment (assuming in base `cugraph` directory)
-
-# for CUDA 11.5
-conda env create --name cugraph_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
-
-# activate the environment
-conda activate cugraph_dev
-
-# to deactivate an environment
-conda deactivate
-```
-
-  - The environment can be updated as development includes/changes the dependencies. To do so, run:
-
-
-```bash
-
-# Where XXX is the CUDA 11 version
-conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.XXX.yml
-
-conda activate cugraph_dev
-```
-
-
-### Build and Install Using the `build.sh` Script
-Using the `build.sh` script make compiling and installing cuGraph a breeze.  To build and install, simply do:
-
-```bash
-$ cd $CUGRAPH_HOME
-$ ./build.sh clean
-$ ./build.sh libcugraph
-$ ./build.sh cugraph
-```
-
-There are several other options available on the build script for advanced users.
-`build.sh` options:
-```bash
-build.sh [<target> ...] [<flag> ...]
- where <target> is:
-   clean                      - remove all existing build artifacts and configuration (start over)
-   uninstall                  - uninstall libcugraph and cugraph from a prior build/install (see also -n)
-   libcugraph                 - build libcugraph.so and SG test binaries
-   libcugraph_etl             - build libcugraph_etl.so and SG test binaries
-   pylibcugraph               - build the pylibcugraph Python package
-   cugraph                    - build the cugraph Python package
-   cugraph-service            - build the cugraph-service_client and cugraph-service_server Python package
-   cpp-mgtests                - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency.
-   cugraph-dgl                - build the cugraph-dgl extensions for DGL
-   cugraph-pyg                - build the cugraph-dgl extensions for PyG
-   docs                       - build the docs
- and <flag> is:
-   -v                         - verbose build mode
-   -g                         - build for debug
-   -n                         - do not install after a successful build
-   --pydevelop                - use setup.py develop instead of install
-   --allgpuarch               - build for all supported GPU architectures
-   --skip_cpp_tests           - do not build the SG test binaries as part of the libcugraph and libcugraph_etl targets
-   --without_cugraphops       - do not build algos that require cugraph-ops
-   --cmake_default_generator  - use the default cmake generator instead of ninja
-   --clean                    - clean an individual target (note: to do a complete rebuild, use the clean target described above)
-   -h                         - print this text
-
- default action (no args) is to build and install 'libcugraph' then 'libcugraph_etl' then 'pylibcugraph' then 'cugraph' then 'cugraph-service' targets
-
-examples:
-$ ./build.sh clean                        # remove prior build artifacts (start over)
-$ ./build.sh libcugraph -v                # compile and install libcugraph with verbose output
-$ ./build.sh libcugraph -g                # compile and install libcugraph for debug
-$ ./build.sh libcugraph -n                # compile libcugraph but do not install
-
-# make parallelism options can also be defined: Example build jobs using 4 threads (make -j4)
-$ PARALLEL_LEVEL=4 ./build.sh libcugraph
-
-Note that the libraries will be installed to the location set in `$PREFIX` if set (i.e. `export PREFIX=/install/path`), otherwise to `$CONDA_PREFIX`.
-```
-
-
-## Building each section independently
-#### Build and Install the C++/CUDA `libcugraph` Library
-CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`.
-
-This project uses cmake for building the C/C++ library. To configure cmake, run:
-
-  ```bash
-  # Set the localtion to cuGraph in an environment variable CUGRAPH_HOME
-  export CUGRAPH_HOME=$(pwd)/cugraph
-
-  cd $CUGRAPH_HOME
-  cd cpp                                        # enter cpp directory
-  mkdir build                                   # create build directory
-  cd build                                      # enter the build directory
-  cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
-
-  # now build the code
-  make -j                                       # "-j" starts multiple threads
-  make install                                  # install the libraries
-  ```
-The default installation locations are `$CMAKE_INSTALL_PREFIX/lib` and `$CMAKE_INSTALL_PREFIX/include/cugraph` respectively.
-
-#### Updating the RAFT branch
-
-`libcugraph` uses the [RAFT](https://github.com/rapidsai/raft) library and there are times when it might be desirable to build against a different RAFT branch, such as when working on new features that might span both RAFT and cuGraph.
-
-For local development, the `CPM_raft_SOURCE=<path/to/raft/source>` option can be passed to the `cmake` command to enable `libcugraph` to use the local RAFT branch.
-
-To have CI test a `cugraph` pull request against a different RAFT branch, modify the bottom of the `cpp/cmake/thirdparty/get_raft.cmake` file as follows:
-
-```cmake
-# Change pinned tag and fork here to test a commit in CI
-# To use a different RAFT locally, set the CMake variable
-# RPM_raft_SOURCE=/path/to/local/raft
-find_and_configure_raft(VERSION    ${CUGRAPH_MIN_VERSION_raft}
-                        FORK       <your_git_fork>
-                        PINNED_TAG <your_git_branch_or_tag>
-
-                        # When PINNED_TAG above doesn't match cugraph,
-                        # force local raft clone in build directory
-                        # even if it's already installed.
-                        CLONE_ON_PIN     ON
-                        )
-```
-
-When the above change is pushed to a pull request, the continuous integration servers will use the specified RAFT branch to run the cuGraph tests. After the changes in the RAFT branch are merged to the release branch, remember to revert the `get_raft.cmake` file back to the original cuGraph branch.
-
-### Building and installing the Python package
-
-2) Install the Python packages to your Python path:
-
-```bash
-cd $CUGRAPH_HOME
-cd python
-cd pylibcugraph
-python setup.py build_ext --inplace
-python setup.py install    # install pylibcugraph
-cd ../cugraph
-python setup.py build_ext --inplace
-python setup.py install    # install cugraph python bindings
-
-```
-
-
-
-## Run tests
-
-If you already have the datasets:
-
-   ```bash
-   export RAPIDS_DATASET_ROOT_DIR=<path_to_ccp_test_and_reference_data>
-   ```
-   If you do not have the datasets:
-
-   ```bash
-   cd $CUGRAPH_HOME/datasets
-   source get_test_data.sh #This takes about 10 minutes and downloads 1GB data (>5 GB uncompressed)
-   ```
-
-Run either the C++ or the Python tests with datasets
-
-  - **Python tests with datasets**
-
-
-    ```bash
-    pip install python-louvain #some tests require this package to run
-    cd $CUGRAPH_HOME
-    cd python
-    pytest
-    ```
-  - **C++ stand alone tests**
-
-    From the build directory :
-
-    ```bash
-    # Run the cugraph tests
-    cd $CUGRAPH_HOME
-    cd cpp/build
-    gtests/GDFGRAPH_TEST		# this is an executable file
-    ```
- - **C++ tests with larger datasets**
-
-
-
-   Run the C++ tests on large input:
-
-   ```bash
-   cd $CUGRAPH_HOME/cpp/build
-   #test one particular analytics (eg. pagerank)
-   gtests/PAGERANK_TEST
-   #test everything
-   make test
-   ```
-
-Note: This conda installation only applies to Linux and Python versions 3.8/3.10.
-
-### (OPTIONAL) Set environment variable on activation
-
-It is possible to configure the conda environment to set environmental variables on activation. Providing instructions to set PATH to include the CUDA toolkit bin directory and LD_LIBRARY_PATH to include the CUDA lib64 directory will be helpful.
-
-```bash
-cd  ~/anaconda3/envs/cugraph_dev
-
-mkdir -p ./etc/conda/activate.d
-mkdir -p ./etc/conda/deactivate.d
-touch ./etc/conda/activate.d/env_vars.sh
-touch ./etc/conda/deactivate.d/env_vars.sh
-```
-
-Next the env_vars.sh file needs to be edited
-
-```bash
-vi ./etc/conda/activate.d/env_vars.sh
-
-#!/bin/bash
-export PATH=/usr/local/cuda-11.0/bin:$PATH # or cuda-11.1 if using CUDA 11.1 and cuda-11.2 if using CUDA 11.2, respectively
-export LD_LIBRARY_PATH=/usr/local/cuda-11.0/lib64:$LD_LIBRARY_PATH # or cuda-11.1 if using CUDA 11.1 and cuda-11.2 if using CUDA 11.2, respectively
-```
-
-```
-vi ./etc/conda/deactivate.d/env_vars.sh
-
-#!/bin/bash
-unset PATH
-unset LD_LIBRARY_PATH
-```
-
-## Creating documentation
-
-Python API documentation can be generated from [docs](docs) directory.
-
-## Attribution
-Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md
diff --git a/readme_pages/algorithms.md b/readme_pages/algorithms.md
deleted file mode 100644
index fa2e7cc9553..00000000000
--- a/readme_pages/algorithms.md
+++ /dev/null
@@ -1,85 +0,0 @@
-# List of Supported and Planned Algorithms
-
-## Supported Graph
-
-| Type       | Description                                                 |
-| ---------- | ----------------------------------------------------------- |
-| Graph      | A directed or undirected Graph (use directed={True, False}) |
-| Multigraph | A Graph with multiple edges between a vertex pair           |
-|            |                                                             |
-
-ALL Algorithms support Graphs and MultiGraph (directed and undirected)
-
----
-
-<br>
-
-# Supported Algorithms
-
-_Italic_ algorithms are planned for future releases.
-
-Note: Multi-GPU, or MG, includes support for Multi-Node Multi-GPU (also called MNMG).
-
-| Category          | Algorithm                          | Scale               | Notes                                                           |
-| ----------------- | ---------------------------------- | ------------------- | --------------------------------------------------------------- |
-| Centrality        |                                    |                     |                                                                 |
-|                   | Katz                               | __Multi-GPU__ |                                                                 |
-|                   | Betweenness Centrality             | Single-GPU          | MG planned for 23.02                                            |
-|                   | Edge Betweenness Centrality        | Single-GPU          | MG planned for 23.02                                            |
-|                   | Eigenvector Centrality             | __Multi-GPU__ |                                                                 |
-|                   | Degree Centrality                  | __Multi-GPU__ | Python only                                                     |
-| Community         |                                    |                     |                                                                 |
-|                   | Leiden                             | Single-GPU          | MG planned for 23.02                                            |
-|                   | Louvain                            | __Multi-GPU__ |                                                                 |
-|                   | Ensemble Clustering for Graphs     | Single-GPU          |                                                                 |
-|                   | Spectral-Clustering - Balanced Cut | Single-GPU          |                                                                 |
-|                   | Spectral-Clustering - Modularity   | Single-GPU          |                                                                 |
-|                   | Subgraph Extraction                | Single-GPU          |                                                                 |
-|                   | Triangle Counting                  | __Multi-GPU__ |                                                                 |
-|                   | K-Truss                            | Single-GPU          |                                                                 |
-| Components        |                                    |                     |                                                                 |
-|                   | Weakly Connected Components        | __Multi-GPU__ |                                                                 |
-|                   | Strongly Connected Components      | Single-GPU          |                                                                 |
-| Core              |                                    |                     |                                                                 |
-|                   | K-Core                             | **Multi-GPU** |                                                                 |
-|                   | Core Number                        | **Multi-GPU** |                                                                 |
-| _Flow_          |                                    |                     |                                                                 |
-|                   | _MaxFlow_                        | ---                 |                                                                 |
-| _Influence_     |                                    |                     |                                                                 |
-|                   | _Influence Maximization_         | ---                 |                                                                 |
-| Layout            |                                    |                     |                                                                 |
-|                   | Force Atlas 2                      | Single-GPU          |                                                                 |
-| Linear Assignment |                                    |                     |                                                                 |
-|                   | Hungarian                          | Single-GPU          | [README](cpp/src/linear_assignment/README-hungarian.md)            |
-| Link Analysis     |                                    |                     |                                                                 |
-|                   | Pagerank                           | __Multi-GPU__ | [C++ README](cpp/src/centrality/README.md#Pagerank)                |
-|                   | Personal Pagerank                  | __Multi-GPU__ | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank)   |
-|                   | HITS                               | __Multi-GPU__ |                                                                 |
-| Link Prediction   |                                    |                     |                                                                 |
-|                   | Jaccard Similarity                 | **Multi-GPU**      | MG as of 22.12<br />Directed graph only                         |
-|                   | Weighted Jaccard Similarity        | Single-GPU          |                                                                 |
-|                   | Overlap Similarity                 | **Multi-GPU** | MG as of 22.12                                                  |
-|                   | Sorensen Coefficient               | **Multi-GPU** | MG as of 22.12                                                  |
-|                   | _Local Clustering Coefficient_   | ---                 |                                                                 |
-| Sampling          |                                    |                     |                                                                 |
-|                   | Uniform Random Walks (RW)          | **Multi-GPU** |                                                                 |
-|                   | *Biased Random Walks (RW)*       | ---                 |                                                                 |
-|                   | Egonet                             | **Multi-GPU** |                                                                 |
-|                   | Node2Vec                           | Single-GPU          | MG planned for 23.02                                            |
-|                   | Uniform Neighborhood sampling      | __Multi-GPU__ |                                                                 |
-| Traversal         |                                    |                     |                                                                 |
-|                   | Breadth First Search (BFS)         | __Multi-GPU__ | with cutoff support``[C++ README](cpp/src/traversal/README.md#BFS) |
-|                   | Single Source Shortest Path (SSSP) | __Multi-GPU__ | [C++ README](cpp/src/traversal/README.md#SSSP)                     |
-|                   | _ASSP / APSP_                    | ---                 |                                                                 |
-| Tree              |                                    |                     |                                                                 |
-|                   | Minimum Spanning Tree              | Single-GPU          |                                                                 |
-|                   | Maximum Spanning Tree              | Single-GPU          |                                                                 |
-| Other             |                                    |                     |                                                                 |
-|                   | Renumbering                        | __Multi-GPU__ | multiple columns, any data type                                 |
-|                   | Symmetrize                         | __Multi-GPU__ |                                                                 |
-|                   | Path Extraction                    |                     | Extract paths from BFS/SSP results in parallel                  |
-|                   | Two Hop Neighbors                  | __Multi-GPU__ |                                                                 |
-| Data Generator    |                                    |                     |                                                                 |
-|                   | RMAT                               | __Multi-GPU__ |                                                                 |
-|                   | _Barabasi-Albert_                | ---                 |                                                                 |
-|                   |                                    |                     |                                                                 |
diff --git a/readme_pages/getting_cugraph.md b/readme_pages/getting_cugraph.md
deleted file mode 100644
index dc8e33bead7..00000000000
--- a/readme_pages/getting_cugraph.md
+++ /dev/null
@@ -1,58 +0,0 @@
-
-# Getting cuGraph Packages
-
-There are 4 ways to get cuGraph packages:
-1. [Quick start with Docker Repo](#docker)
-2. [Conda Installation](#conda)
-3. [Pip Installation](#pip)
-4. [Build from Source](#SOURCE)
-
-Or checkout the [RAPIDS install selector](https://rapids.ai/start.html) for a pick list of install options.
-
-<br>
-
-## Docker
-The RAPIDS Docker containers contain all RAPIDS packages, including all from cuGraph, as well as all required supporting packages.   To download a container, please see the [Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running.  This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize all of the RAPIDS libraries: cuDF, cuML, and cuGraph.
-
-<br>
-
-
-## Conda
-It is easy to install cuGraph using conda. You can get a minimal conda installation with [Miniconda](https://conda.io/miniconda.html) or get the full installation with [Anaconda](https://www.anaconda.com/download).
-
-cuGraph Conda packages
- * cugraph - this will also import:
-   * pylibcugraph
-   * libcugraph
- * cugraph_service_client
- * cugraph_service_server
- * cugraph_dgl
- * cugraph_pyg
-
-Replace the package name in the example below to the one you want to install.
-
-
-Install and update cuGraph using the conda command:
-
-```bash
-conda install -c rapidsai -c numba -c conda-forge -c nvidia cugraph cudatoolkit=11.8
-```
-
-Note: This conda installation only applies to Linux and Python versions 3.8/3.10.
-
-<br>
-
-## PIP
-cuGraph, and all of RAPIDS, is available via pip.
-
-```
-pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
-```
-
-pip packages for other packages are being worked and should be available in early 2023
-
-<br>
-
-## SOURCE
-cuGraph can be build directly from source. First check to make sure you have or can configure a supported environment.
-Instructions for building from source is in our [source build](./SOURCEBUILD.md) page.
diff --git a/thirdparty/LICENSES/LICENSE.cugraph-ops.NVIDIA b/thirdparty/LICENSES/LICENSE.cugraph-ops.NVIDIA
new file mode 100644
index 00000000000..f2d6802958f
--- /dev/null
+++ b/thirdparty/LICENSES/LICENSE.cugraph-ops.NVIDIA
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2020-2023 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */