From 9c0adf9a15e57218097a0d6b8ed5f79b7573508c Mon Sep 17 00:00:00 2001 From: Leonid Belyaev Date: Wed, 13 Sep 2023 13:40:03 -0400 Subject: [PATCH 1/3] REVERT potential resource leak fix --- .../mpi-wrappers/mpi_group_wrappers.cpp | 19 ------------------ mpi-proxy-split/p2p_drain_send_recv.cpp | 20 ++++++++++--------- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/mpi-proxy-split/mpi-wrappers/mpi_group_wrappers.cpp b/mpi-proxy-split/mpi-wrappers/mpi_group_wrappers.cpp index 2ceb0fcca..e6e403191 100644 --- a/mpi-proxy-split/mpi-wrappers/mpi_group_wrappers.cpp +++ b/mpi-proxy-split/mpi-wrappers/mpi_group_wrappers.cpp @@ -50,25 +50,6 @@ USER_DEFINED_WRAPPER(int, Comm_group, (MPI_Comm) comm, (MPI_Group *) group) return retval; } -// Calls MPI_Comm_group to define a new group for internal purposes. -// See: p2p_drain_send_recv.cpp -int -MPI_Comm_internal_virt_group(MPI_Comm comm, MPI_Group *group) -{ - int retval; - DMTCP_PLUGIN_DISABLE_CKPT(); - MPI_Comm realComm = VIRTUAL_TO_REAL_COMM(comm); - JUMP_TO_LOWER_HALF(lh_info.fsaddr); - retval = NEXT_FUNC(Comm_group)(realComm, group); - RETURN_TO_UPPER_HALF(); - if (retval == MPI_SUCCESS) { - MPI_Group virtGroup = ADD_NEW_GROUP(*group); - *group = virtGroup; - } - DMTCP_PLUGIN_ENABLE_CKPT(); - return retval; -} - USER_DEFINED_WRAPPER(int, Group_size, (MPI_Group) group, (int *) size) { int retval; diff --git a/mpi-proxy-split/p2p_drain_send_recv.cpp b/mpi-proxy-split/p2p_drain_send_recv.cpp index 84a29b3bd..c125eedbc 100644 --- a/mpi-proxy-split/p2p_drain_send_recv.cpp +++ b/mpi-proxy-split/p2p_drain_send_recv.cpp @@ -45,7 +45,6 @@ extern int MPI_Comm_create_group_internal(MPI_Comm comm, MPI_Group group, extern int MPI_Comm_free_internal(MPI_Comm *comm); extern int MPI_Comm_group_internal(MPI_Comm comm, MPI_Group *group); extern int MPI_Group_free_internal(MPI_Group *group); -extern int MPI_Comm_internal_virt_group(MPI_Comm comm, MPI_Group *group); int *g_sendBytesByRank; // Number of bytes sent to other ranks int *g_rsendBytesByRank; // Number of bytes sent to other ranks by MPI_rsend int *g_bytesSentToUsByRank; // Number of bytes other ranks sent to us @@ -76,7 +75,15 @@ registerLocalSendsAndRecvs() // Get a copy of MPI_COMM_WORLD MPI_Group group_world; MPI_Comm mana_comm; - MPI_Comm_internal_virt_group(MPI_COMM_WORLD, &group_world); + // MPI_Comm_group creates the same real id (and therefore, virtual id) for + // identical calls. If the application already has this virtual id, no extra + // resource is created. + // + // See mpi-proxy-split/virtual-ids.h, onCreate(id), usage of realIdExists. + // + // FIXME: This Comm_group can cause an extra LOG_CALL and LOG_REPLAY. But, it + // needs to happen if the application moves to create the same vid /later/. + MPI_Comm_group(MPI_COMM_WORLD, &group_world); MPI_Comm_create_group_internal(MPI_COMM_WORLD, group_world, 1, &mana_comm); // broadcast sendBytes and recvBytes @@ -85,15 +92,10 @@ registerLocalSendsAndRecvs() g_bytesSentToUsByRank[g_world_rank] = 0; // Free resources - // mana_comm is a real id, and MPI_Comm_free_internal expects a - // virtual id, but it works out because virtualToReal(real_id) is - // defined to be real_id. MPI_Comm_free_internal(&mana_comm); - // Because group_world is a virtual group, we have to free both its - // virtual and real id to clean up correctly. - MPI_Group_free_internal(&group_world); - REMOVE_OLD_GROUP(group_world); + // We do NOT free group_world, because if the application also made this + // call, there is a collision. } // status was received by MPI_Iprobe From 9c15f1829a5d2a5c299c481353472220dec4e625 Mon Sep 17 00:00:00 2001 From: Kapil Arya Date: Wed, 13 Sep 2023 20:33:43 -0700 Subject: [PATCH 2/3] Simplified mtcp-restart plugin interface. This uses the newly proposed environment variable based mechanism. --- dmtcp | 2 +- restart_plugin/dmtcp_restart_plugin.cpp | 26 +++---- restart_plugin/mtcp_restart_plugin.c | 95 +++++++++++++++---------- 3 files changed, 68 insertions(+), 55 deletions(-) diff --git a/dmtcp b/dmtcp index 339d6ffba..04c6a4a54 160000 --- a/dmtcp +++ b/dmtcp @@ -1 +1 @@ -Subproject commit 339d6ffbabdf5b1ed6fe763c7ec8007e3bca32ba +Subproject commit 04c6a4a540a2122bc804c1bb48870fdfd65c69ec diff --git a/restart_plugin/dmtcp_restart_plugin.cpp b/restart_plugin/dmtcp_restart_plugin.cpp index 422b67f1b..fc0342587 100644 --- a/restart_plugin/dmtcp_restart_plugin.cpp +++ b/restart_plugin/dmtcp_restart_plugin.cpp @@ -1,6 +1,7 @@ #include "workerstate.h" #include "dmtcp_restart.h" #include "jassert.h" +#include "jconvert.h" #include "jfilesystem.h" #include "util.h" @@ -39,29 +40,20 @@ void dmtcp_restart_plugin(const string &restartDir, // Also, create the DMTCP shared-memory area. t->initialize(); - vector mtcpArgs = getMtcpArgs(); - mtcpArgs.push_back((char *)"--mpi"); - - const map &kvmap = t->getKeyValueMap(); - - mtcpArgs.push_back((char*) "--minLibsStart"); - mtcpArgs.push_back((char*) kvmap.at("MANA_MinLibsStart").c_str()); - - mtcpArgs.push_back((char*) "--maxLibsEnd"); - mtcpArgs.push_back((char*) kvmap.at("MANA_MaxLibsEnd").c_str()); - - mtcpArgs.push_back((char*) "--minHighMemStart"); - mtcpArgs.push_back((char*) kvmap.at("MANA_MinHighMemStart").c_str()); + publishKeyValueMapToMtcpEnvironment(t); if (!restartDir.empty()) { - mtcpArgs.push_back((char *)"--restartdir"); - mtcpArgs.push_back((char *)restartDir.c_str()); + setenv("MANA_RestartDir=", restartDir.c_str(), 1); } - for (const string &image : ckptImages) { - mtcpArgs.push_back((char*) image.c_str()); + for (size_t i = 0; i < ckptImages.size(); i++) { + string key = "MANA_CkptImage_Rank_" + jalib::XToString(i); + setenv(key.c_str(), ckptImages[i].c_str(), 1); } + vector mtcpArgs = getMtcpArgs(); + mtcpArgs.push_back((char *)"--mpi"); + mtcpArgs.push_back(NULL); execvp(mtcpArgs[0], &mtcpArgs[0]); JASSERT(false)(mtcpArgs[0]).Text("execvp failed!"); diff --git a/restart_plugin/mtcp_restart_plugin.c b/restart_plugin/mtcp_restart_plugin.c index 862ab4f6a..fd62caf84 100644 --- a/restart_plugin/mtcp_restart_plugin.c +++ b/restart_plugin/mtcp_restart_plugin.c @@ -48,13 +48,19 @@ NO_OPTIMIZE char* -getCkptImageByRank(int rank, char **argv) +getCkptImageByRank(int rank, char **environ) { - char *fname = NULL; - if (rank >= 0) { - fname = argv[rank]; + if (rank < 0) { + return NULL; } - return fname; + + char *fname = NULL; + char envKey[64] = {0}; + char rankStr[20] = {0}; + mtcp_itoa(rankStr, rank); + mtcp_strcpy(envKey, "MANA_CkptImage_Rank_"); + mtcp_strncat(envKey, rankStr, mtcp_strlen(rankStr)); + return mtcp_getenv(envKey, environ); } static inline int @@ -191,17 +197,17 @@ int my_memcmp(const void *buffer1, const void *buffer2, size_t len) { // FIXME: Many style rules broken. Code never reviewed by skilled programmer. int getCkptImageByDir(RestoreInfo *rinfo, char *buffer, size_t buflen, int rank) { - if(!rinfo->restartDir) { + if(!rinfo->pluginInfo.restartDir) { MTCP_PRINTF("***ERROR No restart directory found - cannot find checkpoint image by directory!"); return -1; } - size_t len = mtcp_strlen(rinfo->restartDir); + size_t len = mtcp_strlen(rinfo->pluginInfo.restartDir); if(len >= buflen){ MTCP_PRINTF("***ERROR Restart directory would overflow given buffer!"); return -1; } - mtcp_strcpy(buffer, rinfo->restartDir); // start with directory + mtcp_strcpy(buffer, rinfo->pluginInfo.restartDir); // start with directory // ensure directory ends with / if(buffer[len - 1] != '/') { @@ -437,6 +443,36 @@ bool is_overlap(char *start1, char *end1, char *start2, char *end2) { return end1 >= start2 || end2 >= start1; } +void populate_plugin_info(RestoreInfo *rinfo) +{ + // FIXME: Eventually, mpi-proxy-split/mpi_plugin.cpp should + // directly write to rinfo->pluginInfo, and we won't + // need to do this extra copy, here. + // Copy the upper-half info to rinfo->pluginInfo + + const char *minLibsStartStr = mtcp_getenv("MANA_MinLibsStart", rinfo->environ); + if (minLibsStartStr != NULL) { + rinfo->pluginInfo.minLibsStart = (char*) mtcp_strtoll(minLibsStartStr); + } + + const char *maxLibsEndStr = mtcp_getenv("MANA_MaxLibsEnd", rinfo->environ); + if (maxLibsEndStr != NULL) { + rinfo->pluginInfo.maxLibsEnd = (char*) mtcp_strtoll(maxLibsEndStr); + } + + const char *minHighMemStartStr = mtcp_getenv("MANA_MinHighMemStart", rinfo->environ); + if (minHighMemStartStr != NULL) { + rinfo->pluginInfo.minHighMemStart = (char*) mtcp_strtoll(minHighMemStartStr); + } + + const char *maxHighMemEndStr = mtcp_getenv("MANA_MaxHighMemEnd", rinfo->environ); + if (maxHighMemEndStr != NULL) { + rinfo->pluginInfo.maxHighMemEnd = (char*) mtcp_strtoll(maxHighMemEndStr); + } + + rinfo->pluginInfo.restartDir = mtcp_getenv("MANA_RestarDir", rinfo->environ); +} + #ifdef SINGLE_CART_REORDER int load_cartesian_properties(char *filename, CartesianProperties *cp) @@ -535,15 +571,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo) // when calling mtcp:restorememoryareas(). rinfo->pluginInfo.lh_info_addr = lh_info_addr; - // FIXME: Eventually, mpi-proxy-split/mpi_plugin.cpp should - // directly write to rinfo->pluginInfo, and we won't - // need to do this extra copy, here. - // Copy the upper-half info to rinfo->pluginInfo - rinfo->pluginInfo.minLibsStart = rinfo->minLibsStart; - rinfo->pluginInfo.maxLibsEnd = rinfo->maxLibsEnd; - rinfo->pluginInfo.minHighMemStart = rinfo->minHighMemStart; - rinfo->pluginInfo.maxHighMemEnd = rinfo->maxHighMemEnd; - rinfo->pluginInfo.restartDir = rinfo->restartDir; + populate_plugin_info(rinfo); // Reserve first 500 file descriptors for the Upper-half int reserved_fds[500]; @@ -560,18 +588,18 @@ mtcp_plugin_hook(RestoreInfo *rinfo) */ { // minLibsStart was chosen with extra space below, for future libs, mmap. - start1 = rinfo->minLibsStart; // first lib of upper half + start1 = rinfo->pluginInfo.minLibsStart; // first lib of upper half // Either lh_info_addr->memRange is a region between 1 GB and 2 GB below // the end of stack in the lower half; or else it is at an unusual // address for which we hope there is no address conflict. // The latter holds if USE_LH_FIXED_ADDRESS was defined in // mtcp_split_process.c, in both restart_plugin and mpi-proxy-split dirs. - end1 = rinfo->maxLibsEnd; + end1 = rinfo->pluginInfo.maxLibsEnd; // Reserve 8MB above min high memory region. That should include space for // stack, argv, env, auxvec. - start2 = rinfo->minHighMemStart - 1 * GB; // Allow for stack to grow - end2 = rinfo->minHighMemStart + 8 * MB; + start2 = rinfo->pluginInfo.minHighMemStart - 1 * GB; // Allow for stack to grow + end2 = rinfo->pluginInfo.minHighMemStart + 8 * MB; // Ignore region start2:end2 if it is overlapped with region start1:end1 if (is_overlap(start1, end1, start2, end2)) { if (end1 < end2) { end1 = end2; } @@ -670,7 +698,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo) char *filename = "./ckpt_rank_0/cartesian.info"; char full_filename[PATH_MAX]; - set_header_filepath(full_filename, rinfo->restartDir); + set_header_filepath(full_filename, rinfo->pluginInfo.restartDir); ManaHeader m_header; MTCP_ASSERT(load_mana_header(full_filename, &m_header) == 0); @@ -719,7 +747,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo) ckpt_image_rank_to_be_restored) == -1) { mtcp_strncpy( rinfo->ckptImage, - getCkptImageByRank(ckpt_image_rank_to_be_restored, rinfo->argv), + getCkptImageByRank(ckpt_image_rank_to_be_restored, rinfo->environ), PATH_MAX); } @@ -755,14 +783,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo) // when calling mtcp:restorememoryareas(). rinfo->pluginInfo.lh_info_addr = lh_info_addr; - // FIXME: Eventually, mpi-proxy-split/mpi_plugin.cpp should - // directly write to rinfo->pluginInfo, and we won't - // need to do this extra copy, here. - // Copy the upper-half info to rinfo->pluginInfo - rinfo->pluginInfo.minLibsStart = rinfo->minLibsStart; - rinfo->pluginInfo.maxLibsEnd = rinfo->maxLibsEnd; - rinfo->pluginInfo.minHighMemStart = rinfo->minHighMemStart; - rinfo->pluginInfo.restartDir = rinfo->restartDir; + populate_plugin_info(rinfo); // Reserve first 500 file descriptors for the Upper-half int reserved_fds[500]; @@ -779,18 +800,18 @@ mtcp_plugin_hook(RestoreInfo *rinfo) */ { // minLibsStart was chosen with extra space below, for future libs, mmap. - start1 = rinfo->minLibsStart; // first lib of upper half + start1 = rinfo->pluginInfo.minLibsStart; // first lib of upper half // Either lh_info_addr->memRange is a region between 1 GB and 2 GB below // the end of stack in the lower half; or else it is at an unusual // address for which we hope there is no address conflict. // The latter holds if USE_LH_FIXED_ADDRESS was defined in // mtcp_split_process.c, in both restart_plugin and mpi-proxy-split dirs. - end1 = rinfo->maxLibsEnd; + end1 = rinfo->pluginInfo.maxLibsEnd; // Reserve 8MB above min high memory region. That should include space for // stack, argv, env, auxvec. - start2 = rinfo->minHighMemStart - 1 * GB; // Allow for stack to grow - end2 = rinfo->minHighMemStart + 8 * MB; + start2 = rinfo->pluginInfo.minHighMemStart - 1 * GB; // Allow for stack to grow + end2 = rinfo->pluginInfo.minHighMemStart + 8 * MB; // Ignore region start2:end2 if it is overlapped with region start1:end1 if (is_overlap(start1, end1, start2, end2)) { if (end1 < end2) { end1 = end2; } @@ -882,7 +903,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo) # endif char full_filename[PATH_MAX]; - set_header_filepath(full_filename, rinfo->restartDir); + set_header_filepath(full_filename, rinfo->pluginInfo.restartDir); ManaHeader m_header; MTCP_ASSERT(load_mana_header(full_filename, &m_header) == 0); @@ -899,7 +920,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo) if (getCkptImageByDir(rinfo, rinfo->ckptImage, 512, rank) == -1) { mtcp_strncpy(rinfo->ckptImage, - getCkptImageByRank(rank, rinfo->argv), + getCkptImageByRank(rank, rinfo->environ), PATH_MAX); } From 23fc9aaa2eb23cacd36abca0108b91068ba2c8a1 Mon Sep 17 00:00:00 2001 From: Leonid Belyaev Date: Sun, 17 Sep 2023 16:00:51 -0400 Subject: [PATCH 3/3] end2 value now tracks start2 value --- restart_plugin/mtcp_restart_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/restart_plugin/mtcp_restart_plugin.c b/restart_plugin/mtcp_restart_plugin.c index fd62caf84..26f05addc 100644 --- a/restart_plugin/mtcp_restart_plugin.c +++ b/restart_plugin/mtcp_restart_plugin.c @@ -599,7 +599,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo) // Reserve 8MB above min high memory region. That should include space for // stack, argv, env, auxvec. start2 = rinfo->pluginInfo.minHighMemStart - 1 * GB; // Allow for stack to grow - end2 = rinfo->pluginInfo.minHighMemStart + 8 * MB; + end2 = start2 + 8 * MB; // Ignore region start2:end2 if it is overlapped with region start1:end1 if (is_overlap(start1, end1, start2, end2)) { if (end1 < end2) { end1 = end2; } @@ -811,7 +811,7 @@ mtcp_plugin_hook(RestoreInfo *rinfo) // Reserve 8MB above min high memory region. That should include space for // stack, argv, env, auxvec. start2 = rinfo->pluginInfo.minHighMemStart - 1 * GB; // Allow for stack to grow - end2 = rinfo->pluginInfo.minHighMemStart + 8 * MB; + end2 = start2 + 8 * MB; // Ignore region start2:end2 if it is overlapped with region start1:end1 if (is_overlap(start1, end1, start2, end2)) { if (end1 < end2) { end1 = end2; }