[mvapich-discuss] Patch to retry psm_ep_open
Hari Subramoni
subramoni.1 at osu.edu
Tue Mar 10 21:54:35 EDT 2015
Hi Adam,
Thanks a lot for the patch! We will take this in.
Regards,
Hari
On Tuesday, March 10, 2015, Adam T. Moody <moody20 at llnl.gov> wrote:
> My last patch contained some code for a second, unrelated patch. Here's a
> cleaned up version for just the psm_ep_open retry portion.
> -Adam
>
>
> Adam T. Moody wrote:
>
> Hello MVAPICH team,
>> We have some people who run a sequence of up to 1000 independent MPI jobs
>> within a single SLURM allocation as a suite of application regression
>> tests. All job steps are submitted to SLURM at once, and they rely on
>> SLURM to schedule the job steps to run in turn once earlier jobs finish and
>> free up resources. It seems that some of these job steps start before the
>> previous job steps have fully released their PSM contexts, which then leads
>> to a failure in psm_ep_open() in the new job step. It's not clear whether
>> the problem lies with our (old) version of SLURM in starting the next job
>> too early or whether the node / network card driver is just slow to free up
>> contexts.
>>
>> Anyway, as a work around for such cases, the attached patch retries
>> psm_ep_open multiple times after sleeping for some time between retries.
>> The user can tune the total number of retries and the time between retries
>> with environment variables. This work around is rather hacky, but it helps
>> on our machines. I thought I'd send it your way in case it's useful to
>> others with PSM.
>>
>> My original patch was for MVAPICH-1.2, and I've ported this to
>> MVAPICH2-2.0.1. I checked that it compiles, however, if you want to
>> include it, please verify that it does what you'd expect. In particular,
>> please look at the warning it prints in case you have a better format for
>> that.
>> Thanks,
>> -Adam
>>
>> ------------------------------------------------------------------------
>>
>> --- src/mpid/ch3/channels/psm/src/psm_entry.c.orig 2015-02-27
>> 16:53:09.085710000 -0800
>> +++ src/mpid/ch3/channels/psm/src/psm_entry.c 2015-03-10
>> 15:35:07.180223000 -0700
>> @@ -32,10 +32,17 @@
>> int g_mv2_show_env_info = 0;
>> mv2_arch_hca_type g_mv2_arch_hca_type = 0;
>>
>> +/* some jobs start up before contexts from previous job are released,
>> + * so we use this sleep and retry hack as a work around */
>> +static int mv2_psm_ep_open_retry_count = 10; /* number of retry attempts
>> if psm_ep_open fails */
>> +static int mv2_psm_ep_open_retry_secs = 10; /* number of seconds to
>> sleep between psm_ep_open retries */
>> +
>> static char scratch[WRBUFSZ];
>> static char *kvsid;
>> static psm_uuid_t psm_uuid;
>>
>> +static int mv2_psm_rank = -1;
>> +
>> static int psm_bcast_uuid(int pg_size, int pg_rank);
>> static int psm_allgather_epid(psm_epid_t *list, int pg_size, int
>> pg_rank);
>> static void psm_other_init(MPIDI_PG_t *pg);
>> @@ -182,6 +189,15 @@
>> return g_mv2_arch_hca_type;
>> }
>>
>> +/* print error string to stderr, flush stderr, and return error */
>> +static psm_error_t mv2_psm_err_handler(psm_ep_t ep, const psm_error_t
>> error, const char* error_string, psm_error_token_t token)
>> +{
>> + /* print error and flush stderr */
>> + fprintf(stderr, "ERROR: Rank %d: PSM error handler: %s : %s\n",
>> + mv2_psm_rank, psm_error_get_string(error), error_string);
>> + fflush(stderr);
>> + return error;
>> +}
>>
>> #undef FUNCNAME
>> #define FUNCNAME psm_doinit
>> @@ -194,6 +210,10 @@
>> int heterogeneity = 0; psm_epid_t myid, *epidlist = NULL;
>> psm_error_t *errs = NULL, err;
>> + struct psm_ep_open_opts psm_opts;
>> +
>> + /* record our global rank to print rank in error messages */
>> + mv2_psm_rank = pg_rank;
>>
>> /* Override split_type */
>> MPID_Comm_fns = &comm_fns;
>> @@ -245,7 +265,9 @@
>> }
>>
>> psm_preinit(pg_size);
>> - psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER);
>> +
>> + /* override global error handler so we can print error messages */
>> + psm_error_register_handler(NULL, mv2_psm_err_handler);
>>
>> err = psm_init(&verno_major, &verno_minor);
>> if(err != PSM_OK) {
>> @@ -253,9 +275,45 @@
>> MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psminit");
>> }
>>
>> - if((err = psm_ep_open(psm_uuid, NULL, &psmdev_cw.ep, &myid)) !=
>> PSM_OK) {
>> - fprintf(stderr, "psm_ep_open failed with error %s\n",
>> psm_error_get_string(err));
>> - MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psmepopen");
>> + /* By default, PSM sets cpu affinity on a process if it's not
>> + * already set. We disable cpu affinity in PSM here. MVAPICH
>> + * or the process launcher will set affinity, unless the user
>> + * disabled it, but in that case, he probably doesn't want
>> + * PSM to set it either. In particular, we don't want PSM to
>> + * set affinity on singleton MPI jobs (single-process MPI run
>> + * w/o mpirun), since PSM will bind all such jobs to core 0. */
>> + psm_ep_open_opts_get_defaults(&psm_opts);
>> + psm_opts.affinity = PSM_EP_OPEN_AFFINITY_SKIP;
>> +
>> + if((err = psm_ep_open(psm_uuid, &psm_opts, &psmdev_cw.ep, &myid)) !=
>> PSM_OK) {
>> + /* some jobs start up before contexts from previous job are
>> released,
>> + * so we use this sleep and retry hack as a work around, a
>> negative
>> + * retry count means we should retry forever */
>> + int success = 0; /* we'll set this to 1 if psm_ep_open is
>> successful */
>> + int retries = 1;
>> + while (mv2_psm_ep_open_retry_count < 0 || retries <=
>> mv2_psm_ep_open_retry_count) {
>> + /* print warning to stdout, use stdout instead of stderr,
>> + * since apps check stderr for fatal events and this may
>> + * not be fatal, if we fail the retries error_abort_all
>> + * prints to stderr */
>> + fprintf(stdout, "[%d] MV2_WARNING at %s:%d: Failed to open
>> an end-point: %s, retry attempt %d of %d in %d seconds...",
>> + pg_rank, __FILE__, __LINE__,
>> + psm_error_get_string(err), retries,
>> mv2_psm_ep_open_retry_count, mv2_psm_ep_open_retry_secs
>> + );
>> + fflush(stdout);
>> + sleep(mv2_psm_ep_open_retry_secs);
>> + if((err = psm_ep_open(psm_uuid, &psm_opts, &psmdev_cw.ep,
>> &myid)) == PSM_OK) {
>> + success = 1;
>> + break;
>> + }
>> + retries++;
>> + }
>> +
>> + /* bail with fatal error if we couldn't open the device */
>> + if (! success) {
>> + fprintf(stderr, "psm_ep_open failed with error %s\n",
>> psm_error_get_string(err));
>> + MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**psmepopen");
>> + }
>> }
>> epidlist = (psm_epid_t *)MPIU_Malloc(pg_size * sizeof(psm_epid_t));
>> if(epidlist == NULL) {
>> @@ -544,6 +602,20 @@
>> if ((flag = getenv("MV2_SHOW_ENV_INFO")) != NULL) {
>> g_mv2_show_env_info = atoi(flag);
>> }
>> +
>> + /* number of times to retry psm_ep_open upon failure */
>> + if ((flag = getenv("MV2_PSM_EP_OPEN_RETRY_COUNT")) != NULL) {
>> + mv2_psm_ep_open_retry_count = atoi(flag);
>> + }
>> +
>> + /* sleep time in seconds between open retries */
>> + if ((flag = getenv("MV2_PSM_EP_OPEN_RETRY_SECS")) != NULL) {
>> + mv2_psm_ep_open_retry_secs = atoi(flag);
>> + if (mv2_psm_ep_open_retry_secs < 0) {
>> + /* TODO: print warning to user that secs can't be negative */
>> + mv2_psm_ep_open_retry_secs = 0;
>> + }
>> + }
>> }
>>
>> /* Ch3 expects channel to initialize VC fields.
>>
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.cse.ohio-state.edu/pipermail/mvapich-discuss/attachments/20150310/cc6944d9/attachment-0001.html>
More information about the mvapich-discuss
mailing list