[mvapich-discuss] NaNs from non-blocking comms

Dan Kokron daniel.kokron at nasa.gov
Tue Apr 5 15:00:41 EDT 2011


I'm resending this because some scanner didn't like my previous attachment.

This is with mvapich2-1.6, configured and built on x86_64 Linux with the
Intel 11.0.083 compiler suite:

./configure CC=icc CXX=icpc F77=ifort F90=ifort CFLAGS=-fpic
CXXFLAGS=-fpic FFLAGS=-fpic F90FLAGS=-fpic
--prefix=/home/dkokron/play/mvapich2-1.6/install/intel --enable-f77
--enable-f90 --enable-cxx --enable-romio --with-hwloc

The attached example code gives NaNs as output from the MPI_Recv if
MV2_ON_DEMAND_THRESHOLD is set to less than the number of processes
used.

The example also gives NaNs with IntelMPI-4.0.1.002 if
I_MPI_USE_DYNAMIC_CONNECTIONS is set to enable.

See the 'commands' file in the tarball for more information.
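
To make the trigger conditions concrete, the failing runs look roughly like
the following, mirroring the 'commands' file (the threshold value of 36 is
only illustrative; per the above, anything below the 72 processes used
should trigger the NaNs):

  # mvapich2-1.6: threshold below the process count => NaNs
  setenv MV2_ON_DEMAND_THRESHOLD 36
  mpiexec.hydra -prepend-rank -launcher-exec /usr/bin/sshmpi -np 72 ./a.out

  # IntelMPI-4.0.1.002: dynamic connections enabled => NaNs
  setenv I_MPI_USE_DYNAMIC_CONNECTIONS enable
  mpiexec -l -perhost 8 -n 72 ./a.out
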
-- 
Dan Kokron
Global Modeling and Assimilation Office
NASA Goddard Space Flight Center
Greenbelt, MD 20771
Daniel.S.Kokron at nasa.gov
Phone: (301) 614-5192
Fax:   (301) 614-5304
-------------- next part --------------
module load comp/intel-11.0.083 mpi/mv2-1.6/intel-11.0.083

mpif90 -g -O0 -ftz -align all -fno-alias -traceback -debug -nolib-inline -fno-inline-functions -assume protect_parens,minus0 -prec-div -prec-sqrt -check bounds -check uninit -fp-stack-check -ftrapuv TestNonBlocking.F90

# This env var is critical to allow running under MVAPICH2-1.6
setenv MV2_ON_DEMAND_THRESHOLD 72
mpiexec.hydra -prepend-rank -launcher-exec /usr/bin/sshmpi -np 72 ./a.out
[3]  NaN found           65           9         640
[69]  NaN found           65           8         568

#### Intel MPI
module load comp/intel-11.0.083 mpi/impi-4.0.1.002-beta
mpiifort -g -O0 -ftz -align all -fno-alias -traceback -debug -nolib-inline -fno-inline-functions -assume protect_parens,minus0 -prec-div -prec-sqrt -check bounds -check uninit -fp-stack-check -ftrapuv TestNonBlocking.F90
set NUMNODES=`sort -u $PBS_NODEFILE | wc -l`
mpdboot -v --totalnum=$NUMNODES --file=$PBS_NODEFILE --rsh=sshmpi
# This env var is critical to allow running under IntelMPI
setenv I_MPI_USE_DYNAMIC_CONNECTIONS 0
mpiexec -l -perhost 8 -n 72 ./a.out
-------------- next part --------------
program nonblocking
  implicit none
  include "mpif.h"

  real(kind=4), parameter           :: pi = 3.14159265358979323846
  real(kind=4), allocatable, dimension (:  )     :: var
  real(kind=4), allocatable, dimension (:,:)     :: array, DstArray
  integer, parameter      :: im_world=288
  integer, parameter      :: jm_world=181
  integer, parameter      :: NX=4
  integer, parameter      :: NY=18
  integer, allocatable    :: i1(:),in(:),j1(:),jn(:),im(:),jm(:)
  integer, allocatable    :: IMs(:), JMs(:)
  integer                 :: i,j,k,n
  integer                 :: npes, mype, col, row, kount
  character(len=128), parameter :: FILENAME="output.dat"
  character(len=128)            :: FNAME
  integer status(MPI_STATUS_SIZE)
  integer (kind=4)        :: ierr

  call mpi_init(ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD,npes,ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD,mype,ierr)

  allocate( IMs(0:NX-1), JMs(0:NY-1), stat=ierr)
  call MAPL_DecomposeDim ( im_world, IMs, NX )
  call MAPL_DecomposeDim ( jm_world, JMs, NY )
!  print*,'My ims ',ims
!  print*,'My jms ',jms

  allocate (i1(0:npes-1),  stat=ierr)
  allocate (in(0:npes-1),  stat=ierr)
  allocate (j1(0:npes-1),  stat=ierr)
  allocate (jn(0:npes-1),  stat=ierr)
  allocate (im(0:npes-1),  stat=ierr)
  allocate (jm(0:npes-1),  stat=ierr)

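  ! Global index range (i1:in, j1:jn) and tile size (im x jm) owned by each
  ! rank of the NX x NY process grid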
  do n=0,npes-1
     col = mod(n,NX)
     i1(n) = sum(IMs(:col))-IMs(col)+1
     in(n) = sum(IMs(:col))
!     print*,'My i1 in ',i1,in
     row = n/NX
     j1(n) = sum(JMs(:row))-JMs(row)+1
     jn(n) = sum(JMs(:row))
!     print*,'My j1 jn ',j1,jn
     im(n) = in(n) - i1(n) + 1
     jm(n) = jn(n) - j1(n) + 1
  end do

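  ! This rank's tile of the global array, pre-filled with a 32-bit NaN bit
  ! pattern (typeless-constant assignment, an Intel Fortran extension)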
  allocate(DstArray(IM(mype), JM(mype)    ),stat=ierr)
  DstArray = Z'7FA00000'

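  ! Rank 0 fills the global array and posts a non-blocking send of each remote
  ! rank's subdomain; every other rank receives its tile into a flat buffer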
  if(mype == 0) then
     allocate(array(im_world,jm_world))
     do j=1,jm_world
        do i=1,im_world
           array(i,j) = sin(2*pi*float(i)/float(im_world))*cos(pi*float(j)/float(jm_world))
        end do
     end do

     do n=0,npes-1
        kount = IM(n)*JM(n)
        if(n == mype) then
           DstArray = array(i1(n):in(n),j1(n):jn(n))
        else
!           print*,'ISend: ',n,i1(n),in(n),j1(n),jn(n),kount
           call MPI_ISend(array(i1(n):in(n),j1(n):jn(n)), kount, MPI_REAL, &
                n, 1, MPI_COMM_WORLD, status,  ierr)
        end if
     end do
  else
     allocate(var(0:IM(mype)*JM(mype)-1)     ,stat=ierr)
     var = Z'7FA00000'
     call MPI_Recv(var, size(var), MPI_REAL, 0, 1, MPI_COMM_WORLD, status, ierr)
!     print*,'Recv: ',status
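     ! Unpack the flat receive buffer into the 2-D tile, flagging any NaNs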
     k=0 
     do J=1,JM(mype)
        do I=1,IM(mype)
           if(isnan(var(k))) print*,'NaN found ',i,j,k
           DstArray(I,J) = var(k)
           k = k+1
        end do
     end do
     deallocate(var)
  end if

  call MPI_Barrier(MPI_COMM_WORLD, ierr)

  if(mype == 0) deallocate(array)
  deallocate(i1)
  deallocate(in)
  deallocate(j1)
  deallocate(jn)
  deallocate(im)
  deallocate(jm)
  deallocate(DstArray)
  call MPI_FINALIZE(ierr)

end program nonblocking

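! Split dim_world as evenly as possible into NDEs pieces; the first
! mod(dim_world,NDEs) pieces get one extra element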
subroutine MAPL_DecomposeDim ( dim_world,dim,NDEs )
   implicit   none
   integer    dim_world, NDEs
   integer    dim(0:NDEs-1)
   integer    n,im,rm
   im = dim_world/NDEs
   rm = dim_world-NDEs*im
   do n=0,NDEs-1
      dim(n) = im
      if( n.le.rm-1 ) dim(n) = im+1
   enddo
end subroutine MAPL_DecomposeDim

