[mvapich-discuss] problem with jobstartup with mvapich 0.96/0.97 and mpiexec

Jimmy Tang jtang at tchpc.tcd.ie
Wed Mar 15 19:04:00 EST 2006


Hi,

With the latest release, 0.97, I was testing it against some codes that
we have locally on our systems. After some frustration that mpiexec
hasn't been working since 0.96, I decided to investigate why mpiexec
fails with mvapich 0.96/0.97, which led me to email the mvapich mailing
list for a solution.

The mpiexec developer pointed me to the source file:

	mpid/vapi/process/pmgr_client_mpirun_rsh.c

There is a block of code which repeats itself 3 times (and which also
breaks things); removing that code allows mvapich to function correctly
with mpiexec.

I'd like to ask if it's possible to remove the block of code from the
source, or at least to put an #ifdef in to disable that code by default?

I would imagine most people probably run (torque|openpbs) + mpiexec +
mvapich, and with that code block present this is exactly the setup in
which mvapich fails to start up under mpiexec. It would be nice if
mvapich played nicely with mpiexec out of the box, without the need to
remove "cruft" that breaks things.

Attached is a diff with the code block removed.

Thanks,
Jimmy


-- 
Jimmy Tang
Trinity Centre for High Performance Computing,
Lloyd Building, Trinity College Dublin.
http://www.tchpc.tcd.ie/
-------------- next part --------------
--- mpid/vapi/process/pmgr_client_mpirun_rsh.c-orig	2006-03-15 23:25:56.560024884 +0000
+++ mpid/vapi/process/pmgr_client_mpirun_rsh.c	2006-03-15 23:28:08.862319174 +0000
@@ -167,315 +167,6 @@
     *id_p = pmgr_id;
     *processes_p = pmgr_processes;
 
-     /*
-      *  Route stdout and stderr to mpiexec if applicable  - dskinner at nersc.gov
-      *  if MPIEXEC_STDOUT_PORT and MPIEXEC_STDERR_PORT not detected
-      *  do nothing.  What's on the other side is described at
-      *  Route stdin stdout and stderr to mpiexec  - dskinner at nersc.gov
-      *  if MPIEXEC_STDOUT_PORT MPIEXEC_STDOUT_PORT and MPIEXEC_STDERR_PORT are
-      *  not detected no sockets are created and stdout/stderr are left as is.
-      *  no conditional recompilation should be required due to the above fact.
-      *  What's on the other side of these sockets is described at
-      *  http://www.osc.edu/~pw/mpiexec/
-      *
-      */
-
-   str = getenv("MPIEXEC_STDOUT_PORT");
-   if(str) {
-     mpirun_stdout_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDOUT_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stdout_socket = socket(AF_INET, SOCK_STREAM, 0);
-    if (mpirun_stdout_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stdout_port);
-
-    if (connect(mpirun_stdout_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stdout);
-    dup2(mpirun_stdout_socket,1);
-    close(mpirun_stdout_socket);
-
-    /* we have now connected stdout to the mpiexec program */
-   }
-
-   str = getenv("MPIEXEC_STDERR_PORT");
-   if(str) {
-     mpirun_stderr_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDERR_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stderr_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-    if (mpirun_stderr_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stderr_port);
-
-    if (connect(mpirun_stderr_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stderr);
-    dup2(mpirun_stderr_socket,2);
-    close(mpirun_stderr_socket);
-
-    /* we have now connected stderr to the mpiexec program */
-   }
-   str = getenv("MPIEXEC_STDIN_PORT");
-   if(str) {
-     mpirun_stdin_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDIN_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stdin_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-    if (mpirun_stdin_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stdin_port);
-
-    if (connect(mpirun_stdin_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stderr);
-    dup2(mpirun_stdin_socket,0);
-    close(mpirun_stdin_socket);
-
-    /* we have now connected stdin to the mpiexec program */
-   }
-
-
-     /*
-      *  Route stdout and stderr to mpiexec if applicable  - dskinner at nersc.gov
-      *  if MPIEXEC_STDOUT_PORT and MPIEXEC_STDERR_PORT not detected
-      *  do nothing.  What's on the other side is described at
-      *  Route stdin stdout and stderr to mpiexec  - dskinner at nersc.gov
-      *  if MPIEXEC_STDOUT_PORT MPIEXEC_STDOUT_PORT and MPIEXEC_STDERR_PORT are
-      *  not detected no sockets are created and stdout/stderr are left as is.
-      *  no conditional recompilation should be required due to the above fact.
-      *  What's on the other side of these sockets is described at
-      *  http://www.osc.edu/~pw/mpiexec/
-      *
-      */
-
-   str = getenv("MPIEXEC_STDOUT_PORT");
-   if(str) {
-     mpirun_stdout_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDOUT_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stdout_socket = socket(AF_INET, SOCK_STREAM, 0);
-    if (mpirun_stdout_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stdout_port);
-
-    if (connect(mpirun_stdout_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stdout);
-    dup2(mpirun_stdout_socket,1);
-    close(mpirun_stdout_socket);
-
-    /* we have now connected stdout to the mpiexec program */
-   }
-
-   str = getenv("MPIEXEC_STDERR_PORT");
-   if(str) {
-     mpirun_stderr_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDERR_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stderr_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-    if (mpirun_stderr_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stderr_port);
-
-    if (connect(mpirun_stderr_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stderr);
-    dup2(mpirun_stderr_socket,2);
-    close(mpirun_stderr_socket);
-
-    /* we have now connected stderr to the mpiexec program */
-   }
-   str = getenv("MPIEXEC_STDIN_PORT");
-   if(str) {
-     mpirun_stdin_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDIN_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stdin_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-    if (mpirun_stdin_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stdin_port);
-
-    if (connect(mpirun_stdin_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stderr);
-    dup2(mpirun_stdin_socket,0);
-    close(mpirun_stdin_socket);
-
-    /* we have now connected stdin to the mpiexec program */
-   }
-
-
-     /*
-      *  Route stdout and stderr to mpiexec if applicable  - dskinner at nersc.gov
-      *  if MPIEXEC_STDOUT_PORT and MPIEXEC_STDERR_PORT not detected
-      *  do nothing.  What's on the other side is described at
-      *  Route stdin stdout and stderr to mpiexec  - dskinner at nersc.gov
-      *  if MPIEXEC_STDOUT_PORT MPIEXEC_STDOUT_PORT and MPIEXEC_STDERR_PORT are
-      *  not detected no sockets are created and stdout/stderr are left as is.
-      *  no conditional recompilation should be required due to the above fact.
-      *  What's on the other side of these sockets is described at
-      *  http://www.osc.edu/~pw/mpiexec/
-      *
-      */
-
-   str = getenv("MPIEXEC_STDOUT_PORT");
-   if(str) {
-     mpirun_stdout_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDOUT_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stdout_socket = socket(AF_INET, SOCK_STREAM, 0);
-    if (mpirun_stdout_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stdout_port);
-
-    if (connect(mpirun_stdout_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stdout);
-    dup2(mpirun_stdout_socket,1);
-    close(mpirun_stdout_socket);
-
-    /* we have now connected stdout to the mpiexec program */
-   }
-
-   str = getenv("MPIEXEC_STDERR_PORT");
-   if(str) {
-     mpirun_stderr_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDERR_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stderr_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-    if (mpirun_stderr_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stderr_port);
-
-    if (connect(mpirun_stderr_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stderr);
-    dup2(mpirun_stderr_socket,2);
-    close(mpirun_stderr_socket);
-
-    /* we have now connected stderr to the mpiexec program */
-   }
-   str = getenv("MPIEXEC_STDIN_PORT");
-   if(str) {
-     mpirun_stdin_port = atoi(str);
-     if (mpirun_port <= 0) {
-         fprintf(stderr, "Invalid MPIEXEC_STDIN_PORT port %s\n", str);
-         exit(1);
-     }
-    mpirun_stdin_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
-    if (mpirun_stdin_socket < 0) {
-        perror("socket");
-        exit(1);
-    }
-
-    sockaddr.sin_family = AF_INET;
-    sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list);
-    sockaddr.sin_port = htons(mpirun_stdin_port);
-
-    if (connect(mpirun_stdin_socket, (struct sockaddr *) &sockaddr,
-                sizeof(sockaddr)) < 0) {
-        perror("connect");
-        exit(1);
-    }
-
-    fflush(stderr);
-    dup2(mpirun_stdin_socket,0);
-    close(mpirun_stdin_socket);
-
-    /* we have now connected stdin to the mpiexec program */
-   }
-
-
     return 1;
 }
 


More information about the mvapich-discuss mailing list