From 462478ce885a146ee17896209d31576234228ad4 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Fri, 10 Oct 2025 21:50:48 +0000 Subject: [PATCH] transport: Deprecate enforce_cst CUDA 11.3 released cuFlushGPUDirectRDMAWrites API which takes the place of the host transport enforce_cst api. NVSHMEM no longer supports CUDA 11, so these legacy API's can be removed. Signed-off-by: Seth Zegelstein --- src/host/proxy/proxy.cpp | 50 +++--------- .../internal/host_transport/transport.h | 1 - src/modules/transport/ibdevx/ibdevx.cpp | 41 ---------- src/modules/transport/ibgda/ibgda.cpp | 1 - src/modules/transport/ibrc/ibrc.cpp | 1 - src/modules/transport/libfabric/libfabric.cpp | 79 ------------------- src/modules/transport/libfabric/libfabric.h | 5 -- src/modules/transport/ucx/ucx.cpp | 62 --------------- 8 files changed, 11 insertions(+), 229 deletions(-) diff --git a/src/host/proxy/proxy.cpp b/src/host/proxy/proxy.cpp index ccb3f26..2b81707 100644 --- a/src/host/proxy/proxy.cpp +++ b/src/host/proxy/proxy.cpp @@ -686,51 +686,23 @@ int process_channel_amo(proxy_state_t *state, proxy_channel_t *ch, int *is_proce } void enforce_cst(proxy_state_t *proxy_state) { -#if defined(NVSHMEM_X86_64) - nvshmemi_state_t *state = proxy_state->nvshmemi_state; -#endif - int status = 0; if (nvshmemi_options.BYPASS_FLUSH) return; - if (proxy_state->is_consistency_api_supported) { - if (CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER > proxy_state->gdr_device_native_ordering && - CUPFN(nvshmemi_cuda_syms, cuFlushGPUDirectRDMAWrites)) { - status = - CUPFN(nvshmemi_cuda_syms, - cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX, - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER)); - /** We would want to use cudaFlushGPUDirectRDMAWritesToAllDevices when we enable - consistent access of data on any GPU (and not just self GPU) with - wait_until, quiet, barrier, etc. **/ - if (status != CUDA_SUCCESS) { - NVSHMEMI_ERROR_EXIT("cuFlushGPUDirectRDMAWrites() failed in the proxy thread \n"); - } - } - return; - } -#if defined(NVSHMEM_PPC64LE) - status = cudaEventRecord(proxy_state->cuev, proxy_state->stream); - if (unlikely(status != CUDA_SUCCESS)) { - NVSHMEMI_ERROR_EXIT("cuEventRecord() failed in the proxy thread \n"); - } -#elif defined(NVSHMEM_X86_64) - for (int i = 0; i < state->num_initialized_transports; i++) { - if (!((state->transport_bitmap) & (1 << i))) continue; - struct nvshmem_transport *tcurr = state->transports[i]; - if (!tcurr->host_ops.enforce_cst) continue; - - // assuming the transport is connected - IB RC - if (tcurr->attr & NVSHMEM_TRANSPORT_ATTR_CONNECTED) { - status = tcurr->host_ops.enforce_cst(tcurr); - if (status) { - NVSHMEMI_ERROR_PRINT("aborting due to error in progress_cst \n"); - exit(-1); - } + if (CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER > proxy_state->gdr_device_native_ordering && + CUPFN(nvshmemi_cuda_syms, cuFlushGPUDirectRDMAWrites)) { + status = + CUPFN(nvshmemi_cuda_syms, + cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX, + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER)); + /** We would want to use cudaFlushGPUDirectRDMAWritesToAllDevices when we enable + consistent access of data on any GPU (and not just self GPU) with + wait_until, quiet, barrier, etc. **/ + if (status != CUDA_SUCCESS) { + NVSHMEMI_ERROR_EXIT("cuFlushGPUDirectRDMAWrites() failed in the proxy thread \n"); } } -#endif } inline void quiet_ack_channels(proxy_state_t *proxy_state) { diff --git a/src/include/internal/host_transport/transport.h b/src/include/internal/host_transport/transport.h index 2487ca1..c607189 100644 --- a/src/include/internal/host_transport/transport.h +++ b/src/include/internal/host_transport/transport.h @@ -148,7 +148,6 @@ struct nvshmem_transport_host_ops { fence_handle fence; quiet_handle quiet; put_signal_handle put_signal; - int (*enforce_cst)(struct nvshmem_transport *transport); int (*enforce_cst_at_target)(struct nvshmem_transport *transport); int (*add_device_remote_mem_handles)(struct nvshmem_transport *transport, int transport_stride, nvshmem_mem_handle_t *mem_handles, uint64_t heap_offset, diff --git a/src/modules/transport/ibdevx/ibdevx.cpp b/src/modules/transport/ibdevx/ibdevx.cpp index b2b1699..e3ae5ec 100644 --- a/src/modules/transport/ibdevx/ibdevx.cpp +++ b/src/modules/transport/ibdevx/ibdevx.cpp @@ -1440,46 +1440,6 @@ int nvshmemt_ibdevx_amo(struct nvshmem_transport *tcurr, int pe, void *curetptr, return status; } -int nvshmemt_ibdevx_enforce_cst_at_target(struct nvshmem_transport *tcurr) { - nvshmemt_ib_common_state_t ibdevx_state = (nvshmemt_ib_common_state_t)tcurr->state; - struct ibdevx_ep *ep = (struct ibdevx_ep *)ibdevx_state->cst_ep; - struct ibdevx_rw_wqe *wqe; - - int status = 0; - - uintptr_t wqe_bb_idx_64 = ep->wqe_bb_idx; - uint32_t wqe_bb_idx_32 = ep->wqe_bb_idx; - size_t wqe_size; - - wqe = (struct ibdevx_rw_wqe *)((char *)ep->wq_buf + - ((wqe_bb_idx_64 % get_ibdevx_qp_depth(ibdevx_state)) - << NVSHMEMT_IBDEVX_WQE_BB_SHIFT)); - wqe_size = sizeof(struct ibdevx_rw_wqe); - memset(wqe, 0, sizeof(struct ibdevx_rw_wqe)); - - wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; - wqe->ctrl.qpn_ds = - htobe32((uint32_t)(wqe_size / NVSHMEMT_IBDEVX_MLX5_SEND_WQE_DS) | ep->qpid << 8); - wqe->ctrl.opmod_idx_opcode = htobe32(MLX5_OPCODE_RDMA_READ | (wqe_bb_idx_32 << 8)); - - wqe->raddr.raddr = htobe64((uintptr_t)local_dummy_mr.mr->addr); - wqe->raddr.rkey = htobe32(local_dummy_mr.rkey); - - wqe->data.data_seg.byte_count = htobe32((uint32_t)4); - wqe->data.data_seg.lkey = htobe32(local_dummy_mr.lkey); - wqe->data.data_seg.addr = htobe64((uintptr_t)local_dummy_mr.mr->addr); - - assert(wqe_size <= MLX5_SEND_WQE_BB); - ep->wqe_bb_idx++; - nvshmemt_ibdevx_post_send(ep, (void *)wqe, 1); - - status = nvshmemt_ib_common_check_poll_avail(tcurr, ep, NVSHMEMT_IB_COMMON_WAIT_ALL); - NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, "check_poll failed \n"); - -out: - return status; -} - // Using common fence and quiet functions from transport_ib_common int nvshmemt_ibdevx_ep_create(struct ibdevx_ep **ep, int devid, nvshmem_transport_t t, @@ -1932,7 +1892,6 @@ int nvshmemt_init(nvshmem_transport_t *t, struct nvshmemi_cuda_fn_table *table, transport->host_ops.finalize = nvshmemt_ibdevx_finalize; transport->host_ops.show_info = nvshmemt_ibdevx_show_info; transport->host_ops.progress = nvshmemt_ibdevx_progress; - transport->host_ops.enforce_cst = nvshmemt_ibdevx_enforce_cst_at_target; transport->host_ops.put_signal = nvshmemt_put_signal; transport->attr = NVSHMEM_TRANSPORT_ATTR_CONNECTED; diff --git a/src/modules/transport/ibgda/ibgda.cpp b/src/modules/transport/ibgda/ibgda.cpp index f69ee8a..c666efe 100644 --- a/src/modules/transport/ibgda/ibgda.cpp +++ b/src/modules/transport/ibgda/ibgda.cpp @@ -4915,7 +4915,6 @@ int nvshmemt_init(nvshmem_transport_t *t, struct nvshmemi_cuda_fn_table *table, transport->host_ops.amo = NULL; transport->host_ops.fence = NULL; transport->host_ops.quiet = NULL; - transport->host_ops.enforce_cst = NULL; transport->host_ops.add_device_remote_mem_handles = nvshmemt_ibgda_add_device_remote_mem_handles; transport->host_ops.put_signal = NULL; diff --git a/src/modules/transport/ibrc/ibrc.cpp b/src/modules/transport/ibrc/ibrc.cpp index 07a75e1..f8dfd3e 100644 --- a/src/modules/transport/ibrc/ibrc.cpp +++ b/src/modules/transport/ibrc/ibrc.cpp @@ -1800,7 +1800,6 @@ int nvshmemt_init(nvshmem_transport_t *t, struct nvshmemi_cuda_fn_table *table, transport->host_ops.progress = nvshmemt_ibrc_progress; transport->host_ops.put_signal = nvshmemt_put_signal; - transport->host_ops.enforce_cst = nvshmemt_ibrc_enforce_cst_at_target; #if !defined(NVSHMEM_PPC64LE) && !defined(NVSHMEM_AARCH64) if (!use_gdrcopy) #endif diff --git a/src/modules/transport/libfabric/libfabric.cpp b/src/modules/transport/libfabric/libfabric.cpp index a7a4d6c..9d03c7b 100644 --- a/src/modules/transport/libfabric/libfabric.cpp +++ b/src/modules/transport/libfabric/libfabric.cpp @@ -1071,71 +1071,6 @@ int nvshmemt_put_signal_unordered(struct nvshmem_transport *tcurr, int pe, rma_v return status; } -static int nvshmemt_libfabric_enforce_cst(struct nvshmem_transport *tcurr) { - nvshmemt_libfabric_state_t *libfabric_state = (nvshmemt_libfabric_state_t *)tcurr->state; - uint64_t num_retries = 0; - int status; - int target_ep; - int mype = tcurr->my_pe; - -#ifdef NVSHMEM_USE_GDRCOPY - if (use_gdrcopy) { - if (libfabric_state->provider != NVSHMEMT_LIBFABRIC_PROVIDER_SLINGSHOT) { - int temp; - nvshmemt_libfabric_memhandle_info_t *mem_handle_info; - - mem_handle_info = - (nvshmemt_libfabric_memhandle_info_t *)nvshmemt_mem_handle_cache_get_by_idx( - libfabric_state->cache, 0); - if (!mem_handle_info) { - goto skip; - } - gdrcopy_ftable.copy_from_mapping(mem_handle_info->mh, &temp, mem_handle_info->cpu_ptr, - sizeof(int)); - } - } - -skip: -#endif - - target_ep = mype * NVSHMEMT_LIBFABRIC_DEFAULT_NUM_EPS + NVSHMEMT_LIBFABRIC_PROXY_EP_IDX; - do { - struct fi_msg_rma msg; - struct iovec l_iov; - struct fi_rma_iov r_iov; - void *desc = libfabric_state->local_mr_desc[NVSHMEMT_LIBFABRIC_PROXY_EP_IDX]; - uint64_t flags = 0; - - memset(&msg, 0, sizeof(struct fi_msg_rma)); - memset(&l_iov, 0, sizeof(struct iovec)); - memset(&r_iov, 0, sizeof(struct fi_rma_iov)); - - l_iov.iov_base = libfabric_state->local_mem_ptr; - l_iov.iov_len = 8; - - r_iov.addr = 0; // Zero offset - r_iov.len = 8; - r_iov.key = libfabric_state->local_mr_key[NVSHMEMT_LIBFABRIC_PROXY_EP_IDX]; - - msg.msg_iov = &l_iov; - msg.desc = &desc; - msg.iov_count = 1; - msg.rma_iov = &r_iov; - msg.rma_iov_count = 1; - msg.context = NULL; - msg.data = 0; - - if (libfabric_state->prov_info->caps & FI_FENCE) flags |= FI_FENCE; - - status = - fi_readmsg(libfabric_state->eps[NVSHMEMT_LIBFABRIC_PROXY_EP_IDX].endpoint, &msg, flags); - /* This try_again makes an assumption that enforce_cst is only for proxy threaded ops*/ - } while (try_again(tcurr, &status, &num_retries, 1)); - - libfabric_state->eps[target_ep].submitted_ops++; - return status; -} - static int nvshmemt_libfabric_release_mem_handle(nvshmem_mem_handle_t *mem_handle, nvshmem_transport_t t) { nvshmemt_libfabric_state_t *libfabric_state = (nvshmemt_libfabric_state_t *)t->state; @@ -1177,9 +1112,6 @@ static int nvshmemt_libfabric_release_mem_handle(nvshmem_mem_handle_t *mem_handl max_reg = 1; for (int i = 0; i < max_reg; i++) { - if (libfabric_state->local_mr[i] == fabric_handle->hdls[i].mr) - libfabric_state->local_mr[i] = NULL; - int status = fi_close(&fabric_handle->hdls[i].mr->fid); if (status) { NVSHMEMI_WARN_PRINT("Error releasing mem handle idx %d (%d): %s\n", i, status, @@ -1359,15 +1291,6 @@ static int nvshmemt_libfabric_get_mem_handle(nvshmem_mem_handle_t *mem_handle, v } while (curr_ptr < (char *)buf + length); } - if (libfabric_state->local_mr[0] == NULL && !local_only) { - for (int i = 0; i < NVSHMEMT_LIBFABRIC_DEFAULT_NUM_EPS; i++) { - libfabric_state->local_mr[i] = fabric_handle->hdls[i].mr; - libfabric_state->local_mr_key[i] = fabric_handle->hdls[i].key; - libfabric_state->local_mr_desc[i] = fabric_handle->hdls[i].local_desc; - } - libfabric_state->local_mem_ptr = buf; - } - out: if (status) { if (handle_info) { @@ -2100,8 +2023,6 @@ int nvshmemt_init(nvshmem_transport_t *t, struct nvshmemi_cuda_fn_table *table, transport->host_ops.finalize = nvshmemt_libfabric_finalize; transport->host_ops.show_info = nvshmemt_libfabric_show_info; transport->host_ops.progress = nvshmemt_libfabric_progress; - transport->host_ops.enforce_cst = nvshmemt_libfabric_enforce_cst; - transport->attr = NVSHMEM_TRANSPORT_ATTR_CONNECTED; transport->is_successfully_initialized = true; diff --git a/src/modules/transport/libfabric/libfabric.h b/src/modules/transport/libfabric/libfabric.h index f2b5931..17cb7c7 100644 --- a/src/modules/transport/libfabric/libfabric.h +++ b/src/modules/transport/libfabric/libfabric.h @@ -278,11 +278,6 @@ typedef struct { struct fid_domain *domain; struct fid_av *addresses[NVSHMEMT_LIBFABRIC_DEFAULT_NUM_EPS]; nvshmemt_libfabric_endpoint_t *eps; - /* local_mr is used only for consistency ops. */ - struct fid_mr *local_mr[2]; - uint64_t local_mr_key[2]; - void *local_mr_desc[2]; - void *local_mem_ptr; nvshmemt_libfabric_domain_name_t *domain_names; int num_domains; nvshmemt_libfabric_provider provider; diff --git a/src/modules/transport/ucx/ucx.cpp b/src/modules/transport/ucx/ucx.cpp index 47b9f54..8ac071e 100644 --- a/src/modules/transport/ucx/ucx.cpp +++ b/src/modules/transport/ucx/ucx.cpp @@ -1179,67 +1179,6 @@ int nvshmemt_ucx_finalize(nvshmem_transport_t transport) { return 0; } -int nvshmemt_ucx_enforce_cst_at_target(struct nvshmem_transport *tcurr) { - transport_ucx_state_t *ucx_state = (transport_ucx_state_t *)tcurr->state; - nvshmemt_ucx_mem_handle_info_t *mem_handle_info; - - mem_handle_info = - (nvshmemt_ucx_mem_handle_info_t *)nvshmemt_mem_handle_cache_get_by_idx(ucx_state->cache, 0); - - if (!mem_handle_info) return 0; -#ifdef NVSHMEM_USE_GDRCOPY - if (use_gdrcopy) { - int temp; - gdrcopy_ftable.copy_from_mapping(mem_handle_info->mh, &temp, mem_handle_info->cpu_ptr, - sizeof(int)); - return 0; - } -#endif - int mype = tcurr->my_pe; - int ep_index = (ucx_state->ep_count * mype + ucx_state->proxy_ep_idx); - ucp_ep_h ep = ucx_state->endpoints[ep_index]; - ucp_request_param_t param; - ucs_status_ptr_t ucs_ptr_rc = NULL; - ucs_status_t ucs_rc; - nvshmemt_ucx_mem_handle_t *mem_handle; - ucp_rkey_h rkey; - int local_int; - - mem_handle = mem_handle_info->mem_handle; - if (unlikely(mem_handle->ep_rkey_host == NULL)) { - ucs_rc = ucp_ep_rkey_unpack(ep, mem_handle->rkey_packed_buf, &mem_handle->ep_rkey_host); - if (ucs_rc != UCS_OK) { - NVSHMEMI_ERROR_EXIT("Unable to unpack rkey in UCS transport! Exiting.\n"); - } - } - rkey = mem_handle->ep_rkey_host; - - param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK; - param.cb.send = nvshmemt_ucx_send_request_cb; - - ucs_ptr_rc = - ucp_get_nbx(ep, &local_int, sizeof(int), (uint64_t)mem_handle_info->ptr, rkey, ¶m); - - /* Wait for completion of get. */ - if (ucs_ptr_rc != NULL) { - if (UCS_PTR_IS_ERR(ucs_ptr_rc)) { - NVSHMEMI_ERROR_PRINT("UCX CST request completed with error.\n"); - return NVSHMEMX_ERROR_INTERNAL; - } else { - do { - ucs_rc = ucp_request_check_status(ucs_ptr_rc); - ucp_worker_progress(ucx_state->worker_context); - } while (ucs_rc == UCS_INPROGRESS); - if (ucs_rc != UCS_OK) { - NVSHMEMI_ERROR_PRINT("UCX CST request completed with error.\n"); - return NVSHMEMX_ERROR_INTERNAL; - } - } - } - - return 0; -} - int nvshmemt_ucx_show_info(struct nvshmem_transport *transport, int style) { NVSHMEMI_ERROR_PRINT("UCX show info not implemented"); return 0; @@ -1445,7 +1384,6 @@ int nvshmemt_init(nvshmem_transport_t *t, struct nvshmemi_cuda_fn_table *table, transport->host_ops.finalize = nvshmemt_ucx_finalize; transport->host_ops.show_info = nvshmemt_ucx_show_info; transport->host_ops.progress = nvshmemt_ucx_progress; - transport->host_ops.enforce_cst = nvshmemt_ucx_enforce_cst_at_target; transport->host_ops.enforce_cst_at_target = NULL; transport->host_ops.put_signal = nvshmemt_put_signal; transport->attr = NVSHMEM_TRANSPORT_ATTR_CONNECTED;