[mvapich-discuss] Patch to retry psm_ep_open
Adam T. Moody
moody20 at llnl.gov
Tue Mar 10 19:13:34 EDT 2015
My last patch contained some code for a second, unrelated patch. Here's
a cleaned-up version for just the psm_ep_open retry portion.
-Adam
Adam T. Moody wrote:
> Hello MVAPICH team,
> We have some people who run a sequence of up to 1000 independent MPI
> jobs within a single SLURM allocation as a suite of application
> regression tests. All job steps are submitted to SLURM at once, and
> they rely on SLURM to schedule the job steps to run in turn once
> earlier jobs finish and free up resources. It seems that some of
> these job steps start before the previous job steps have fully
> released their PSM contexts, which then leads to a failure in
> psm_ep_open() in the new job step. It's not clear whether the problem
> lies with our (old) version of SLURM in starting the next job too
> early or whether the node / network card driver is just slow to free
> up contexts.
>
> Anyway, as a workaround for such cases, the attached patch retries
> psm_ep_open multiple times, sleeping for some time between
> retries. The user can tune the total number of retries and the time
> between retries with environment variables
> (MV2_PSM_EP_OPEN_RETRY_COUNT and MV2_PSM_EP_OPEN_RETRY_SECS). This
> workaround is rather hacky, but it helps on our machines. I thought
> I'd send it your way in case it's useful to others with PSM.
>
> My original patch was for MVAPICH-1.2, and I've ported this to
> MVAPICH2-2.0.1. I checked that it compiles; however, if you want to
> include it, please verify that it does what you'd expect. In
> particular, please look at the warning it prints in case you have a
> better format for that.
> Thanks,
> -Adam
>
>------------------------------------------------------------------------
>
>--- src/mpid/ch3/channels/psm/src/psm_entry.c.orig 2015-02-27 16:53:09.085710000 -0800
>+++ src/mpid/ch3/channels/psm/src/psm_entry.c 2015-03-10 15:35:07.180223000 -0700
>@@ -32,10 +32,17 @@
> int g_mv2_show_env_info = 0;
> mv2_arch_hca_type g_mv2_arch_hca_type = 0;
>
>+/* some jobs start up before contexts from previous job are released,
>+ * so we use this sleep and retry hack as a work around */
>+static int mv2_psm_ep_open_retry_count = 10; /* number of retry attempts if psm_ep_open fails */
>+static int mv2_psm_ep_open_retry_secs = 10; /* number of seconds to sleep between psm_ep_open retries */
>+
> static char scratch[WRBUFSZ];
> static char *kvsid;
> static psm_uuid_t psm_uuid;
>
>+static int mv2_psm_rank = -1;
>+
> static int psm_bcast_uuid(int pg_size, int pg_rank);
> static int psm_allgather_epid(psm_epid_t *list, int pg_size, int pg_rank);
> static void psm_other_init(MPIDI_PG_t *pg);
>@@ -182,6 +189,15 @@
> return g_mv2_arch_hca_type;
> }
>
>+/* print error string to stderr, flush stderr, and return error */
>+static psm_error_t mv2_psm_err_handler(psm_ep_t ep, const psm_error_t error, const char* error_string, psm_error_token_t token)
>+{
>+ /* print error and flush stderr */
>+ fprintf(stderr, "ERROR: Rank %d: PSM error handler: %s : %s\n",
>+ mv2_psm_rank, psm_error_get_string(error), error_string);
>+ fflush(stderr);
>+ return error;
>+}
>
> #undef FUNCNAME
> #define FUNCNAME psm_doinit
>@@ -194,6 +210,10 @@
> int heterogeneity = 0;
> psm_epid_t myid, *epidlist = NULL;
> psm_error_t *errs = NULL, err;
>+ struct psm_ep_open_opts psm_opts;
>+
>+ /* record our global rank to print rank in error messages */
>+ mv2_psm_rank = pg_rank;
>
> /* Override split_type */
> MPID_Comm_fns = &comm_fns;
>@@ -245,7 +265,9 @@
> }
>
> psm_preinit(pg_size);
>- psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER);
>+
>+ /* override global error handler so we can print error messages */
>+ psm_error_register_handler(NULL, mv2_psm_err_handler);
>
> err = psm_init(&verno_major, &verno_minor);
> if(err != PSM_OK) {
>@@ -253,9 +275,45 @@
> MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psminit");
> }
>
>- if((err = psm_ep_open(psm_uuid, NULL, &psmdev_cw.ep, &myid)) != PSM_OK) {
>- fprintf(stderr, "psm_ep_open failed with error %s\n", psm_error_get_string(err));
>- MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psmepopen");
>+ /* By default, PSM sets cpu affinity on a process if it's not
>+ * already set. We disable cpu affinity in PSM here. MVAPICH
>+ * or the process launcher will set affinity, unless the user
>+ * disabled it, but in that case, he probably doesn't want
>+ * PSM to set it either. In particular, we don't want PSM to
>+ * set affinity on singleton MPI jobs (single-process MPI run
>+ * w/o mpirun), since PSM will bind all such jobs to core 0. */
>+ psm_ep_open_opts_get_defaults(&psm_opts);
>+ psm_opts.affinity = PSM_EP_OPEN_AFFINITY_SKIP;
>+
>+ if((err = psm_ep_open(psm_uuid, &psm_opts, &psmdev_cw.ep, &myid)) != PSM_OK) {
>+ /* some jobs start up before contexts from previous job are released,
>+ * so we use this sleep and retry hack as a work around, a negative
>+ * retry count means we should retry forever */
>+ int success = 0; /* we'll set this to 1 if psm_ep_open is successful */
>+ int retries = 1;
>+ while (mv2_psm_ep_open_retry_count < 0 || retries <= mv2_psm_ep_open_retry_count) {
>+ /* print warning to stdout, use stdout instead of stderr,
>+ * since apps check stderr for fatal events and this may
>+ * not be fatal, if we fail the retries error_abort_all
>+ * prints to stderr */
>+ fprintf(stdout, "[%d] MV2_WARNING at %s:%d: Failed to open an end-point: %s, retry attempt %d of %d in %d seconds...",
>+ pg_rank, __FILE__, __LINE__,
>+ psm_error_get_string(err), retries, mv2_psm_ep_open_retry_count, mv2_psm_ep_open_retry_secs
>+ );
>+ fflush(stdout);
>+ sleep(mv2_psm_ep_open_retry_secs);
>+ if((err = psm_ep_open(psm_uuid, &psm_opts, &psmdev_cw.ep, &myid)) == PSM_OK) {
>+ success = 1;
>+ break;
>+ }
>+ retries++;
>+ }
>+
>+ /* bail with fatal error if we couldn't open the device */
>+ if (! success) {
>+ fprintf(stderr, "psm_ep_open failed with error %s\n", psm_error_get_string(err));
>+ MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psmepopen");
>+ }
> }
> epidlist = (psm_epid_t *)MPIU_Malloc(pg_size * sizeof(psm_epid_t));
> if(epidlist == NULL) {
>@@ -544,6 +602,20 @@
> if ((flag = getenv("MV2_SHOW_ENV_INFO")) != NULL) {
> g_mv2_show_env_info = atoi(flag);
> }
>+
>+ /* number of times to retry psm_ep_open upon failure */
>+ if ((flag = getenv("MV2_PSM_EP_OPEN_RETRY_COUNT")) != NULL) {
>+ mv2_psm_ep_open_retry_count = atoi(flag);
>+ }
>+
>+ /* sleep time in seconds between open retries */
>+ if ((flag = getenv("MV2_PSM_EP_OPEN_RETRY_SECS")) != NULL) {
>+ mv2_psm_ep_open_retry_secs = atoi(flag);
>+ if (mv2_psm_ep_open_retry_secs < 0) {
>+ /* TODO: print warning to user that secs can't be negative */
>+ mv2_psm_ep_open_retry_secs = 0;
>+ }
>+ }
> }
>
> /* Ch3 expects channel to initialize VC fields.
>
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: psm_ep_open_retry.patch
Type: text/x-patch
Size: 3381 bytes
Desc: not available
URL: <http://mailman.cse.ohio-state.edu/pipermail/mvapich-discuss/attachments/20150310/8e984375/attachment.bin>
More information about the mvapich-discuss
mailing list