[mvapich-discuss] Patch to retry psm_ep_open

Adam T. Moody moody20 at llnl.gov
Tue Mar 10 19:13:34 EDT 2015


My last patch accidentally included code for a second, unrelated change.  Here's 
a cleaned-up version containing just the psm_ep_open retry portion.
-Adam


Adam T. Moody wrote:

> Hello MVAPICH team,
> We have some people who run a sequence of up to 1000 independent MPI 
> jobs within a single SLURM allocation as a suite of application 
> regression tests.  All job steps are submitted to SLURM at once, and 
> they rely on SLURM to schedule the job steps to run in turn once 
> earlier jobs finish and free up resources.  It seems that some of 
> these job steps start before the previous job steps have fully 
> released their PSM contexts, which then leads to a failure in 
> psm_ep_open() in the new job step.  It's not clear whether the problem 
> lies with our (old) version of SLURM starting the next job step too 
> early or with the node's network card driver being slow to free up 
> contexts.
>
> Anyway, as a workaround for such cases, the attached patch retries 
> psm_ep_open multiple times, sleeping between attempts.  The user can 
> tune the total number of retries and the sleep time between them with 
> environment variables (a standalone sketch of the retry loop follows 
> the patch below).  This workaround is rather hacky, but it helps on 
> our machines.  I thought I'd send it your way in case it's useful to 
> others with PSM.
>
> My original patch was for MVAPICH-1.2, and I've ported it to 
> MVAPICH2-2.0.1.  I checked that it compiles; however, if you want to 
> include it, please verify that it does what you'd expect.  In 
> particular, please look at the warning it prints in case you have a 
> better format for that.
> Thanks,
> -Adam
>
>------------------------------------------------------------------------
>
>--- src/mpid/ch3/channels/psm/src/psm_entry.c.orig	2015-02-27 16:53:09.085710000 -0800
>+++ src/mpid/ch3/channels/psm/src/psm_entry.c	2015-03-10 15:35:07.180223000 -0700
>@@ -32,10 +32,17 @@
> int g_mv2_show_env_info = 0;
> mv2_arch_hca_type g_mv2_arch_hca_type = 0;
> 
>+/* some jobs start up before contexts from the previous job are released,
>+ * so we use this sleep-and-retry hack as a workaround */
>+static int mv2_psm_ep_open_retry_count = 10; /* number of retry attempts if psm_ep_open fails */
>+static int mv2_psm_ep_open_retry_secs  = 10; /* number of seconds to sleep between psm_ep_open retries */
>+
> static char    scratch[WRBUFSZ];
> static char             *kvsid;
> static psm_uuid_t       psm_uuid;
> 
>+static int mv2_psm_rank = -1;
>+
> static int  psm_bcast_uuid(int pg_size, int pg_rank);
> static int  psm_allgather_epid(psm_epid_t *list, int pg_size, int pg_rank);
> static void psm_other_init(MPIDI_PG_t *pg);
>@@ -182,6 +189,15 @@
>     return g_mv2_arch_hca_type;
> }
> 
>+/* print error string to stderr, flush stderr, and return error */
>+static psm_error_t mv2_psm_err_handler(psm_ep_t ep, const psm_error_t error, const char* error_string, psm_error_token_t token)
>+{
>+    /* print error and flush stderr */
>+    fprintf(stderr, "ERROR: Rank %d: PSM error handler: %s : %s\n",
>+            mv2_psm_rank, psm_error_get_string(error), error_string);
>+    fflush(stderr);
>+    return error;
>+}
> 
> #undef FUNCNAME
> #define FUNCNAME psm_doinit
>@@ -194,6 +210,10 @@
>     int heterogeneity = 0; 
>     psm_epid_t myid, *epidlist = NULL;
>     psm_error_t *errs = NULL, err;
>+    struct psm_ep_open_opts psm_opts;
>+
>+    /* record our global rank to print rank in error messages */
>+    mv2_psm_rank = pg_rank;
> 
>     /* Override split_type */
>     MPID_Comm_fns = &comm_fns;
>@@ -245,7 +265,9 @@
>     }
> 
>     psm_preinit(pg_size);
>-    psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER);
>+
>+    /* override global error handler so we can print error messages */
>+    psm_error_register_handler(NULL, mv2_psm_err_handler);
> 
>     err = psm_init(&verno_major, &verno_minor);
>     if(err != PSM_OK) {
>@@ -253,9 +275,45 @@
>         MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psminit");
>     }
> 
>-    if((err = psm_ep_open(psm_uuid, NULL, &psmdev_cw.ep, &myid)) != PSM_OK) {
>-        fprintf(stderr, "psm_ep_open failed with error %s\n", psm_error_get_string(err));
>-        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psmepopen");
>+    /* By default, PSM sets cpu affinity on a process if it's not
>+     * already set.  We disable cpu affinity in PSM here.  MVAPICH
>+     * or the process launcher will set affinity, unless the user
>+     * disabled it, but in that case, they probably don't want
>+     * PSM to set it either.  In particular, we don't want PSM to
>+     * set affinity on singleton MPI jobs (single-process MPI run
>+     * w/o mpirun), since PSM will bind all such jobs to core 0. */
>+    psm_ep_open_opts_get_defaults(&psm_opts);
>+    psm_opts.affinity = PSM_EP_OPEN_AFFINITY_SKIP;
>+
>+    if((err = psm_ep_open(psm_uuid, &psm_opts, &psmdev_cw.ep, &myid)) != PSM_OK) {
>+        /* some jobs start up before contexts from the previous job are
>+         * released, so we use this sleep-and-retry hack as a workaround;
>+         * a negative retry count means we should retry forever */
>+        int success = 0; /* we'll set this to 1 if psm_ep_open is successful */
>+        int retries = 1;
>+        while (mv2_psm_ep_open_retry_count < 0 || retries <= mv2_psm_ep_open_retry_count) {
>+            /* print this warning to stdout rather than stderr, since
>+             * apps check stderr for fatal events and a failed attempt
>+             * here may not be fatal; if we exhaust the retries,
>+             * error_abort_all prints the fatal error to stderr */
>+            fprintf(stdout, "[%d] MV2_WARNING at %s:%d: Failed to open an endpoint: %s, retry attempt %d of %d in %d seconds...\n",
>+                    pg_rank, __FILE__, __LINE__,
>+                    psm_error_get_string(err), retries, mv2_psm_ep_open_retry_count, mv2_psm_ep_open_retry_secs
>+            );
>+            fflush(stdout);
>+            sleep(mv2_psm_ep_open_retry_secs);
>+            if((err = psm_ep_open(psm_uuid, &psm_opts, &psmdev_cw.ep, &myid)) == PSM_OK) {
>+                success = 1;
>+                break;
>+            }
>+            retries++;
>+        }
>+
>+        /* bail with fatal error if we couldn't open the device */
>+        if (!success) {
>+            fprintf(stderr, "psm_ep_open failed with error %s\n", psm_error_get_string(err));
>+            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psmepopen");
>+        }
>     }
>     epidlist = (psm_epid_t *)MPIU_Malloc(pg_size * sizeof(psm_epid_t));
>     if(epidlist == NULL) {
>@@ -544,6 +602,20 @@
>     if ((flag = getenv("MV2_SHOW_ENV_INFO")) != NULL) {
>         g_mv2_show_env_info = atoi(flag);
>     }
>+
>+    /* number of times to retry psm_ep_open upon failure */
>+    if ((flag = getenv("MV2_PSM_EP_OPEN_RETRY_COUNT")) != NULL) {
>+        mv2_psm_ep_open_retry_count = atoi(flag);
>+    }
>+
>+    /* sleep time in seconds between open retries */
>+    if ((flag = getenv("MV2_PSM_EP_OPEN_RETRY_SECS")) != NULL) {
>+        mv2_psm_ep_open_retry_secs = atoi(flag);
>+        if (mv2_psm_ep_open_retry_secs < 0) {
>+            /* TODO: print warning to user that secs can't be negative */
>+            mv2_psm_ep_open_retry_secs = 0;
>+        }
>+    }
> }
> 
> /* Ch3 expects channel to initialize VC fields.
>  
>
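
The patch above replaces the PSM_ERRHANDLER_NO_HANDLER registration with a
handler that logs errors before returning them.  Stripped of the MVAPICH
plumbing, the same pattern looks like the sketch below; the name
log_psm_error and the main() wrapper are illustrative, while the handler
signature, psm_error_register_handler, psm_error_get_string, and the
register-before-psm_init ordering come straight from the patch:

    /* psm_err_logger.c: minimal sketch of the error-handler half of the
     * patch, assuming the PSM 1.x headers are installed */
    #include <psm.h>
    #include <stdio.h>

    static psm_error_t log_psm_error(psm_ep_t ep, const psm_error_t error,
                                     const char *error_string,
                                     psm_error_token_t token)
    {
        (void)ep; (void)token;   /* unused in this sketch */
        fprintf(stderr, "PSM error: %s : %s\n",
                psm_error_get_string(error), error_string);
        fflush(stderr);          /* flush before any later abort */
        return error;            /* hand the error back unchanged */
    }

    int main(void)
    {
        /* register globally (NULL endpoint) before psm_init, as the
         * patch does, so even initialization failures get logged */
        psm_error_register_handler(NULL, log_psm_error);

        int vmaj = PSM_VERNO_MAJOR, vmin = PSM_VERNO_MINOR;
        return (psm_init(&vmaj, &vmin) == PSM_OK) ? 0 : 1;
    }

Returning the error unchanged keeps the handler purely observational:
callers such as the retry loop still see the failure and can decide what
to do with it.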

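The retry loop itself can also be exercised standalone.  The sketch below
mirrors the patch's defaults (10 retries, 10 seconds apart) and reads the
same MV2_PSM_EP_OPEN_RETRY_COUNT / MV2_PSM_EP_OPEN_RETRY_SECS variables;
ep_open_with_retry is an illustrative name, and the single-process main()
sidesteps the UUID exchange that a real MPI job needs (the patch
broadcasts psm_uuid via psm_bcast_uuid):

    /* retry_ep_open.c: standalone sketch of the sleep-and-retry loop;
     * link against the PSM 1.x library (typically -lpsm_infinipath) */
    #include <psm.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    static psm_error_t ep_open_with_retry(psm_uuid_t uuid, psm_ep_t *ep,
                                          psm_epid_t *epid)
    {
        int count = 10;   /* default retry count, as in the patch */
        int secs  = 10;   /* default sleep seconds, as in the patch */
        const char *s;

        if ((s = getenv("MV2_PSM_EP_OPEN_RETRY_COUNT")) != NULL)
            count = atoi(s);
        if ((s = getenv("MV2_PSM_EP_OPEN_RETRY_SECS")) != NULL) {
            secs = atoi(s);
            if (secs < 0)
                secs = 0;   /* the patch clamps negative sleep times */
        }

        /* skip PSM's cpu affinity so the launcher's binding (or the
         * user's choice to disable binding) is respected */
        struct psm_ep_open_opts opts;
        psm_ep_open_opts_get_defaults(&opts);
        opts.affinity = PSM_EP_OPEN_AFFINITY_SKIP;

        psm_error_t err = psm_ep_open(uuid, &opts, ep, epid);
        int attempt = 0;
        while (err != PSM_OK && (count < 0 || attempt < count)) {
            attempt++;
            /* warn on stdout: a failed attempt may not be fatal */
            printf("WARNING: psm_ep_open failed: %s, retry %d of %d in %d s\n",
                   psm_error_get_string(err), attempt, count, secs);
            fflush(stdout);
            sleep((unsigned)secs);
            err = psm_ep_open(uuid, &opts, ep, epid);
        }
        return err;
    }

    int main(void)
    {
        int vmaj = PSM_VERNO_MAJOR, vmin = PSM_VERNO_MINOR;
        if (psm_init(&vmaj, &vmin) != PSM_OK)
            return 1;

        /* single-process demo: a real MPI job must use one uuid on all ranks */
        psm_uuid_t uuid;
        psm_uuid_generate(uuid);

        psm_ep_t ep;
        psm_epid_t epid;
        if (ep_open_with_retry(uuid, &ep, &epid) != PSM_OK) {
            fprintf(stderr, "psm_ep_open failed after all retries\n");
            return 1;
        }
        /* teardown (psm_ep_close, psm_finalize) omitted for brevity */
        return 0;
    }

With the patch applied the knobs behave the same way: for example,
MV2_PSM_EP_OPEN_RETRY_COUNT=30 MV2_PSM_EP_OPEN_RETRY_SECS=5 allows up to
30 attempts five seconds apart, and a negative count retries until the
open succeeds.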
-------------- next part --------------
A non-text attachment was scrubbed...
Name: psm_ep_open_retry.patch
Type: text/x-patch
Size: 3381 bytes
Desc: not available
URL: <http://mailman.cse.ohio-state.edu/pipermail/mvapich-discuss/attachments/20150310/8e984375/attachment.bin>

