diff --git a/src/CartLatticeLauncher.hpp.Rt b/src/CartLatticeLauncher.hpp.Rt
index a8fa47f8b..0fbad39ff 100644
--- a/src/CartLatticeLauncher.hpp.Rt
+++ b/src/CartLatticeLauncher.hpp.Rt
@@ -153,13 +153,13 @@ public:
 template < eOperationType I, eCalculateGlobals G, eStage S >
 void CartLatticeLauncher::RunInterior(CudaStream_t stream) const {
   const CartInteriorExecutor< I, G, S > executor(container, data);
-  LaunchExecutorNoWait(executor, stream);
+  LaunchExecutorAsync(executor, stream);
 }
 
 template < eOperationType I, eCalculateGlobals G, eStage S >
 void CartLatticeLauncher::RunBorder(CudaStream_t stream) const {
   const CartBorderExecutor< I, G, S > executor(container, data);
-  LaunchExecutorNoWait(executor, stream);
+  LaunchExecutorAsync(executor, stream);
 }
 
 
diff --git a/src/GetThreads.h b/src/GetThreads.h
index 79524f365..0e2f7e8fc 100644
--- a/src/GetThreads.h
+++ b/src/GetThreads.h
@@ -98,7 +98,7 @@ void LaunchExecutor(const EX& executor) {
 }
 
 template<class EX>
-void LaunchExecutorNoWait(const EX& executor, CudaStream_t stream) {
+void LaunchExecutorAsync(const EX& executor, CudaStream_t stream) {
   const auto exec_params = ComputeLaunchParams(executor);
-  CudaKernelRunNoWait(Kernel< EX >, exec_params.blx, exec_params.thr, stream, executor);
+  CudaKernelRunAsync(Kernel< EX >, exec_params.blx, exec_params.thr, stream, executor);
 }
diff --git a/src/cross.h b/src/cross.h
index 77c0b1855..5d5b424e7 100644
--- a/src/cross.h
+++ b/src/cross.h
@@ -56,16 +56,16 @@
       #ifndef CROSS_HIP 
        #define CudaKernelRun(a__,b__,c__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() )
        #ifdef CROSS_SYNC
-         #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() );
+         #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() );
        #else
-         #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
+         #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__); HANDLE_ERROR( cudaGetLastError() ); /* report launch errors (bad config/stream) right away; no device sync is added */
        #endif
       #else
        #define CudaKernelRun(a__,b__,c__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() )
        #ifdef CROSS_SYNC
-         #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() );
+         #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() );
        #else
-         #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
+         #define CudaKernelRunAsync(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__); HANDLE_ERROR( hipGetLastError() ); /* report launch errors (bad config/stream) right away; no device sync is added */
        #endif
       #endif
       #define CudaBlock blockIdx
@@ -302,7 +302,7 @@
     }
 
     template <typename F, typename ...P>
-    inline void CudaKernelRunNoWait(F &&func, const dim3& blocks, const dim3& threads, CudaStream_t stream, P &&... args) {
+    inline void CudaKernelRunAsync(F &&func, const dim3& blocks, const dim3& threads, CudaStream_t stream, P &&... args) {
       CPUKernelRun(func, blocks, std::forward<P>(args)...);
     }