Implement optimized support for vector I/O in Subfiling VFD (#3896)
Vector I/O requests are now processed within a single
set of I/O call batches, rather than each I/O vector
entry (a tuple constructed from the types, addrs, sizes
and bufs arrays) being processed individually. This allows
I/O to be parallelized more efficiently among the I/O
concentrator processes during large I/O requests.

* Fixed some calculations and added test cases for issues spotted during review

* Removed a variable that was compensating for previous miscalculations
jhendersonHDF authored Dec 27, 2023
1 parent 695efa9 commit 6ffc55c
Showing 8 changed files with 2,190 additions and 1,060 deletions.
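To make concrete what the commit message above means by an I/O vector entry, here is a minimal sketch (not part of this commit) of a vector write issued through the VFD-level H5FDwrite_vector() developer API. The file handle, dxpl, addresses and buffer contents are illustrative, and the header name and exact signature are assumptions to be checked against H5FDdevelop.h.

#include <string.h>
#include "hdf5.h"
#include "H5FDdevelop.h" /* VFD developer API; assumed to declare H5FD_t and H5FDwrite_vector() */

/* Each "I/O vector entry" is the tuple (types[i], addrs[i], sizes[i], bufs[i]).
 * With this commit, the Subfiling VFD forwards all entries to the I/O
 * concentrators as one batch instead of issuing one request per entry. */
static herr_t
write_three_blocks(H5FD_t *file, hid_t dxpl_id)
{
    static unsigned char blk0[4096], blk1[1024], blk2[512];

    H5FD_mem_t  types[3] = {H5FD_MEM_DRAW, H5FD_MEM_DRAW, H5FD_MEM_DRAW};
    haddr_t     addrs[3] = {4096, 8192, 65536}; /* illustrative file offsets */
    size_t      sizes[3] = {sizeof(blk0), sizeof(blk1), sizeof(blk2)};
    const void *bufs[3]  = {blk0, blk1, blk2};

    memset(blk0, 0xAA, sizeof(blk0));
    memset(blk1, 0xBB, sizeof(blk1));
    memset(blk2, 0xCC, sizeof(blk2));

    return H5FDwrite_vector(file, dxpl_id, 3, types, addrs, sizes, bufs);
}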
10 changes: 10 additions & 0 deletions release_docs/RELEASE.txt
@@ -278,6 +278,16 @@ New Features

Library:
--------
- Implemented optimized support for vector I/O in the Subfiling VFD

Previously, the Subfiling VFD would handle vector I/O requests by
breaking them down into individual I/O requests, one for each entry
in the I/O vectors provided. This could result in poor I/O performance
for features in HDF5 that utilize vector I/O, such as parallel I/O
to filtered datasets. The Subfiling VFD now properly handles vector
I/O requests in their entirety, resulting in fewer I/O calls, improved
vector I/O performance and improved vector I/O memory efficiency.

- Added a simple cache to the read-only S3 (ros3) VFD

The read-only S3 VFD now caches the first N bytes of a file stored
50 changes: 37 additions & 13 deletions src/H5FDsubfiling/H5FDioc.c
@@ -1610,12 +1610,14 @@ H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t H5_ATT
H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file;
io_req_t **sf_io_reqs = NULL;
int64_t sf_context_id = -1;
size_t io_size = 0;
bool extend_sizes = false;
herr_t ret_value = SUCCEED;

assert(_file);
assert(addrs);
assert(sizes);
assert(bufs);
assert((addrs) || (count == 0));
assert((sizes) || (count == 0));
assert((bufs) || (count == 0));

if (count == 0)
H5_SUBFILING_GOTO_DONE(SUCCEED);
@@ -1648,12 +1650,22 @@ H5FD__ioc_write_vector_internal(H5FD_t *_file, uint32_t count, H5FD_mem_t H5_ATT
for (size_t i = 0; i < (size_t)count; i++) {
herr_t write_status;

if (sizes[i] == 0)
if (!extend_sizes) {
if ((i > 0) && (sizes[i] == 0)) {
extend_sizes = true;
io_size = sizes[i - 1];
}
else {
io_size = sizes[i];
}
}

if (io_size == 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "invalid size argument of 0");

H5_CHECK_OVERFLOW(addrs[i], haddr_t, int64_t);
H5_CHECK_OVERFLOW(sizes[i], size_t, int64_t);
write_status = ioc__write_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)sizes[i],
H5_CHECK_OVERFLOW(io_size, size_t, int64_t);
write_status = ioc__write_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)io_size,
bufs[i], &sf_io_reqs[i]);

if (write_status < 0)
@@ -1691,12 +1703,14 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s
H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file;
io_req_t **sf_io_reqs = NULL;
int64_t sf_context_id = -1;
size_t io_size = 0;
bool extend_sizes = false;
herr_t ret_value = SUCCEED;

assert(_file);
assert(addrs);
assert(sizes);
assert(bufs);
assert((addrs) || (count == 0));
assert((sizes) || (count == 0));
assert((bufs) || (count == 0));

if (count == 0)
H5_SUBFILING_GOTO_DONE(SUCCEED);
@@ -1720,12 +1734,22 @@ H5FD__ioc_read_vector_internal(H5FD_t *_file, uint32_t count, haddr_t addrs[], s
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't allocate MPI request array");

for (size_t i = 0; i < (size_t)count; i++) {
int read_status;
herr_t read_status;

if (!extend_sizes) {
if ((i > 0) && (sizes[i] == 0)) {
extend_sizes = true;
io_size = sizes[i - 1];
}
else {
io_size = sizes[i];
}
}

H5_CHECK_OVERFLOW(addrs[i], haddr_t, int64_t);
H5_CHECK_OVERFLOW(sizes[i], size_t, int64_t);
read_status = ioc__read_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)sizes[i],
bufs[i], &sf_io_reqs[i]);
H5_CHECK_OVERFLOW(io_size, size_t, int64_t);
read_status = ioc__read_independent_async(sf_context_id, (int64_t)addrs[i], (int64_t)io_size, bufs[i],
&sf_io_reqs[i]);

if (read_status < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "couldn't queue read operation");
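The extend_sizes/io_size handling added in both loops above implements the vector I/O size-extension convention: sizes[] may be shorter than count, and a zero entry at index i > 0 means sizes[i - 1] applies to that entry and to all remaining entries. A standalone sketch of the same resolution logic (hypothetical helper name, not part of the commit):

#include <stdbool.h>
#include <stddef.h>

/* Resolve the effective I/O size for entry i of a vector request, mirroring
 * the extend_sizes/io_size logic above: once a zero is seen at i > 0, the
 * previous size is reused for all remaining entries. */
static size_t
effective_io_size(const size_t sizes[], size_t i, bool *extend_sizes, size_t *io_size)
{
    if (!*extend_sizes) {
        if ((i > 0) && (sizes[i] == 0)) {
            *extend_sizes = true;
            *io_size      = sizes[i - 1];
        }
        else
            *io_size = sizes[i];
    }

    return *io_size;
}

For example, with count = 8 and sizes = {4096, 0}, entries 1 through 7 all resolve to 4096, which is why the write loop above checks the resolved io_size, rather than sizes[i], against 0.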
8 changes: 6 additions & 2 deletions src/H5FDsubfiling/H5FDioc_int.c
@@ -297,9 +297,13 @@ ioc__read_independent_async(int64_t context_id, int64_t offset, int64_t elements
* unpredictable order. However, if some IOCs own more than
* 1 subfile, we need to associate each read with a unique
* message tag to make sure the data is received in the
* correct order.
* correct order. We also need a unique message tag in the
* case where only 1 subfile is used in total. In this case,
* vector I/O calls are passed directly down to this VFD without
* being split up into multiple I/O requests, so we need the
* tag to distinguish each I/O request.
*/
need_data_tag = num_subfiles != num_io_concentrators;
need_data_tag = (num_subfiles == 1) || (num_subfiles != num_io_concentrators);
if (!need_data_tag)
data_tag = READ_INDEP_DATA;
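As a generic illustration of the comment above (the tag values, helper name and counts are hypothetical, not the VFD's actual scheme), giving each outstanding request its own MPI tag lets replies from the same IOC rank complete in any order and still land in the correct buffers:

#include <mpi.h>

#define BASE_DATA_TAG 100 /* hypothetical base tag for illustration only */

/* Post nreqs non-blocking receives from one IOC rank, one tag per request,
 * so out-of-order replies are matched to the correct buffers. */
static void
post_reads_from_one_ioc(MPI_Comm comm, int ioc_rank, int nreqs, char *bufs[], const int sizes[],
                        MPI_Request reqs[])
{
    for (int i = 0; i < nreqs; i++)
        MPI_Irecv(bufs[i], sizes[i], MPI_BYTE, ioc_rank, BASE_DATA_TAG + i, comm, &reqs[i]);

    MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE);
}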

12 changes: 9 additions & 3 deletions src/H5FDsubfiling/H5FDioc_threads.c
@@ -456,8 +456,9 @@ translate_opcode(io_op_t op)
case LOGGING_OP:
return "LOGGING_OP";
break;
default:
return "unknown";
}
return "unknown";
}
#endif

@@ -873,9 +874,14 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int ioc_idx, int source, MPI_C
* unpredictable order. However, if some IOCs own more than
* 1 subfile, we need to associate each read with a unique
* message tag to make sure the data is received in the
* correct order.
* correct order. We also need a unique message tag in the
* case where only 1 subfile is used in total. In this case,
* vector I/O calls are passed directly down to this VFD without
* being split up into multiple I/O requests, so we need the
* tag to distinguish each I/O request.
*/
need_data_tag = sf_context->sf_num_subfiles != sf_context->topology->n_io_concentrators;
need_data_tag = (sf_context->sf_num_subfiles == 1) ||
(sf_context->sf_num_subfiles != sf_context->topology->n_io_concentrators);
if (!need_data_tag)
send_tag = READ_INDEP_DATA;
