[mvapich-discuss] Patch to retry psm_ep_open

Hari Subramoni subramoni.1 at osu.edu
Tue Mar 10 21:54:35 EDT 2015


Hi Adam,

Thanks a lot for the patch! We will take this in.

Regards,
Hari

On Tuesday, March 10, 2015, Adam T. Moody <moody20 at llnl.gov> wrote:

> My last patch contained some code for a second, unrelated patch.  Here's a
> cleaned up version for just the psm_ep_open retry portion.
> -Adam
>
>
> Adam T. Moody wrote:
>
>  Hello MVAPICH team,
>> We have some people who run a sequence of up to 1000 independent MPI jobs
>> within a single SLURM allocation as a suite of application regression
>> tests.  All job steps are submitted to SLURM at once, and they rely on
>> SLURM to schedule the job steps to run in turn once earlier jobs finish and
>> free up resources.  It seems that some of these job steps start before the
>> previous job steps have fully released their PSM contexts, which then leads
>> to a failure in psm_ep_open() in the new job step.  It's not clear whether
>> the problem lies with our (old) version of SLURM in starting the next job
>> too early or whether the node / network card driver is just slow to free up
>> contexts.
>>
>> Anyway, as a workaround for such cases, the attached patch retries
>> psm_ep_open multiple times after sleeping for some time between retries.
>> The user can tune the total number of retries and the time between retries
>> with environment variables.  This workaround is rather hacky, but it helps
>> on our machines.  I thought I'd send it your way in case it's useful to
>> others with PSM.
>>
>> My original patch was for MVAPICH-1.2, and I've ported this to
>> MVAPICH2-2.0.1.  I checked that it compiles, however, if you want to
>> include it, please verify that it does what you'd expect.  In particular,
>> please look at the warning it prints in case you have a better format for
>> that.
>> Thanks,
>> -Adam
>>
>> ------------------------------------------------------------------------
>>
>> --- src/mpid/ch3/channels/psm/src/psm_entry.c.orig      2015-02-27
>> 16:53:09.085710000 -0800
>> +++ src/mpid/ch3/channels/psm/src/psm_entry.c   2015-03-10
>> 15:35:07.180223000 -0700
>> @@ -32,10 +32,17 @@
>> int g_mv2_show_env_info = 0;
>> mv2_arch_hca_type g_mv2_arch_hca_type = 0;
>>
>> +/* some jobs start up before contexts from previous job are released,
>> + * so we use this sleep and retry hack as a work around */
>> +static int mv2_psm_ep_open_retry_count = 10; /* number of retry attempts
>> if psm_ep_open fails */
>> +static int mv2_psm_ep_open_retry_secs  = 10; /* number of seconds to
>> sleep between psm_ep_open retries */
>> +
>> static char    scratch[WRBUFSZ];
>> static char             *kvsid;
>> static psm_uuid_t       psm_uuid;
>>
>> +static int mv2_psm_rank = -1;
>> +
>> static int  psm_bcast_uuid(int pg_size, int pg_rank);
>> static int  psm_allgather_epid(psm_epid_t *list, int pg_size, int
>> pg_rank);
>> static void psm_other_init(MPIDI_PG_t *pg);
>> @@ -182,6 +189,15 @@
>>     return g_mv2_arch_hca_type;
>> }
>>
>> +/* print error string to stderr, flush stderr, and return error */
>> +static psm_error_t mv2_psm_err_handler(psm_ep_t ep, const psm_error_t
>> error, const char* error_string, psm_error_token_t token)
>> +{
>> +    /* print error and flush stderr */
>> +    fprintf(stderr, "ERROR: Rank %d: PSM error handler: %s : %s\n",
>> +            mv2_psm_rank, psm_error_get_string(error), error_string);
>> +    fflush(stderr);
>> +    return error;
>> +}
>>
>> #undef FUNCNAME
>> #define FUNCNAME psm_doinit
>> @@ -194,6 +210,10 @@
>>     int heterogeneity = 0;     psm_epid_t myid, *epidlist = NULL;
>>     psm_error_t *errs = NULL, err;
>> +    struct psm_ep_open_opts psm_opts;
>> +
>> +    /* record our global rank to print rank in error messages */
>> +    mv2_psm_rank = pg_rank;
>>
>>     /* Override split_type */
>>     MPID_Comm_fns = &comm_fns;
>> @@ -245,7 +265,9 @@
>>     }
>>
>>     psm_preinit(pg_size);
>> -    psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER);
>> +
>> +    /* override global error handler so we can print error messages */
>> +    psm_error_register_handler(NULL, mv2_psm_err_handler);
>>
>>     err = psm_init(&verno_major, &verno_minor);
>>     if(err != PSM_OK) {
>> @@ -253,9 +275,45 @@
>>         MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psminit");
>>     }
>>
>> -    if((err = psm_ep_open(psm_uuid, NULL, &psmdev_cw.ep, &myid)) !=
>> PSM_OK) {
>> -        fprintf(stderr, "psm_ep_open failed with error %s\n",
>> psm_error_get_string(err));
>> -        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psmepopen");
>> +    /* By default, PSM sets cpu affinity on a process if it's not
>> +     * already set.  We disable cpu affinity in PSM here.  MVAPICH
>> +     * or the process launcher will set affinity, unless the user
>> +     * disabled it, but in that case, he probably doesn't want
>> +     * PSM to set it either.  In particular, we don't want PSM to
>> +     * set affinity on singleton MPI jobs (single-process MPI run
>> +     * w/o mpirun), since PSM will bind all such jobs to core 0. */
>> +    psm_ep_open_opts_get_defaults(&psm_opts);
>> +    psm_opts.affinity = PSM_EP_OPEN_AFFINITY_SKIP;
>> +
>> +    if((err = psm_ep_open(psm_uuid, &psm_opts, &psmdev_cw.ep, &myid)) !=
>> PSM_OK) {
>> +        /* some jobs start up before contexts from previous job are
>> released,
>> +         * so we use this sleep and retry hack as a work around, a
>> negative
>> +         * retry count means we should retry forever */
>> +        int success = 0; /* we'll set this to 1 if psm_ep_open is
>> successful */
>> +        int retries = 1;
>> +        while (mv2_psm_ep_open_retry_count < 0 || retries <=
>> mv2_psm_ep_open_retry_count) {
>> +            /* print warning to stdout, use stdout instead of stderr,
>> +             * since apps check stderr for fatal events and this may
>> +             * not be fatal, if we fail the retries error_abort_all
>> +             * prints to stderr */
>> +            fprintf(stdout, "[%d] MV2_WARNING at %s:%d: Failed to open
>> an end-point: %s, retry attempt %d of %d in %d seconds...",
>> +                    pg_rank, __FILE__, __LINE__,
>> +                    psm_error_get_string(err), retries,
>> mv2_psm_ep_open_retry_count, mv2_psm_ep_open_retry_secs
>> +            );
>> +            fflush(stdout);
>> +            sleep(mv2_psm_ep_open_retry_secs);
>> +            if((err = psm_ep_open(psm_uuid, &psm_opts, &psmdev_cw.ep,
>> &myid)) == PSM_OK) {
>> +                success = 1;
>> +                break;
>> +            }
>> +            retries++;
>> +        }
>> +
>> +        /* bail with fatal error if we couldn't open the device */
>> +        if (! success) {
>> +            fprintf(stderr, "psm_ep_open failed with error %s\n",
>> psm_error_get_string(err));
>> +            MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psmepopen");
>> +        }
>>     }
>>     epidlist = (psm_epid_t *)MPIU_Malloc(pg_size * sizeof(psm_epid_t));
>>     if(epidlist == NULL) {
>> @@ -544,6 +602,20 @@
>>     if ((flag = getenv("MV2_SHOW_ENV_INFO")) != NULL) {
>>         g_mv2_show_env_info = atoi(flag);
>>     }
>> +
>> +    /* number of times to retry psm_ep_open upon failure */
>> +    if ((flag = getenv("MV2_PSM_EP_OPEN_RETRY_COUNT")) != NULL) {
>> +        mv2_psm_ep_open_retry_count = atoi(flag);
>> +    }
>> +
>> +    /* sleep time in seconds between open retries */
>> +    if ((flag = getenv("MV2_PSM_EP_OPEN_RETRY_SECS")) != NULL) {
>> +        mv2_psm_ep_open_retry_secs = atoi(flag);
>> +        if (mv2_psm_ep_open_retry_secs < 0) {
>> +            /* TODO: print warning to user that secs can't be negative */
>> +            mv2_psm_ep_open_retry_secs = 0;
>> +        }
>> +    }
>> }
>>
>> /* Ch3 expects channel to initialize VC fields.
>>
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.cse.ohio-state.edu/pipermail/mvapich-discuss/attachments/20150310/cc6944d9/attachment-0001.html>


More information about the mvapich-discuss mailing list