[mvapich-discuss] SIGSEV in F90: An MPI bug?

David Stuebe dstuebe at umassd.edu
Wed Jan 23 14:29:54 EST 2008


Hello MVAPICH
I have found a strange bug in MVAPICH2 using IFORT. The behavior is very
strange indeed - it seems to be related to how ifort handles passing
pointers through the MVAPICH Fortran 90 interface.
The MPI call returns successfully, but a later call to a dummy subroutine
causes a SIGSEGV.
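
To make the contiguity issue concrete, here is a minimal sketch (the names
buf and p are illustrative only, not taken from the test program below) of
how pointer association to an array section can be contiguous or not:

  integer, allocatable, target :: buf(:,:)
  integer, pointer             :: p(:,:)

  allocate(buf(210,200))
  p => buf(1:210,1:200) ! first dimension spans its full extent: contiguous
  p => buf(1:200,1:200) ! first dimension is a sub-range: NON-contiguous -
                        ! each section column is separated by a 10-element gap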

 Please look at the following code:

!=================================================================================
!=================================================================================
!=================================================================================
! TEST CODE FOR A POSSIBLE BUG IN MVAPICH2 COMPILED WITH IFORT
! WRITTEN BY: DAVID STUEBE
! DATE: JAN 23, 2008
!
! COMPILE WITH: mpif90 -xP mpi_prog.f90 -o xtest
!
! KNOWN BEHAVIOR:
! PASSING A NON-CONTIGUOUS POINTER TO MPI_BCAST CAUSES FAILURE OF
! SUBROUTINES USING MULTI-DIMENSIONAL EXPLICIT-SHAPE ARRAYS WITHOUT AN
! INTERFACE - EVEN THOUGH THE MPI_BCAST COMPLETES SUCCESSFULLY,
! RETURNING VALID DATA.
!
! COMMENTS:
! I REALIZE PASSING NON-CONTIGUOUS POINTERS IS DANGEROUS - SHAME ON
! ME FOR MAKING THAT MISTAKE. HOWEVER, IT SHOULD EITHER WORK OR NOT.
! RETURNING SUCCESSFULLY BUT CAUSING INTERFACE ERRORS LATER IS
! EXTREMELY DIFFICULT TO DEBUG! (A WORKAROUND SKETCH FOLLOWS THE LISTING.)
!
! CONDITIONS FOR OCCURRENCE:
!    COMPILER MUST OPTIMIZE USING 'VECTORIZATION'
!    ARRAY MUST BE 'LARGE' - SYSTEM DEPENDENT?
!    MUST BE RUN ON MORE THAN ONE NODE TO CAUSE A CRASH...
!    i.e. running inside one SMP box does not crash.
!
!    RUNNING UNDER MPD, ALL PROCESSES SIGSEGV
!    RUNNING UNDER MPIEXEC0.82 FOR PBS,
!       ONLY SOME PROCESSES SIGSEGV ?
!
! ENVIRONMENTAL INFO:
! NODES: DELL 1850 3.0GHZ, 2GB RAM, INFINIBAND PCI-EX 4X
! SYSTEM: ROCKS 4.2
! gcc version 3.4.6 20060404 (Red Hat 3.4.6-3)
!
! IFORT/ICC:
!   Intel(R) Fortran Compiler for Intel(R) EM64T-based applications,
!   Version 9.1 Build 20061101 Package ID: l_fc_c_9.1.040
!
! MVAPICH2: mpif90 for mvapich2-1.0
! ./configure --prefix=/usr/local/share/mvapich2/1.0
!    --with-device=osu_ch3:mrail --with-rdma=vapi --with-pm=mpd --enable-f90
!    --enable-cxx --disable-romio --without-mpe
!
!=================================================================================
!=================================================================================
!=================================================================================

Module vars
  USE MPI
  implicit none


  integer :: n,m,MYID,NPROCS
  integer :: ipt

  integer, allocatable, target :: data(:,:)

  contains

    subroutine alloc_vars
      implicit none

      integer Status

      allocate(data(n,m),stat=status)
      if (status /=0) then
         write(ipt,*) "allocation error"
         stop
      end if

      data = 0

    end subroutine alloc_vars

   SUBROUTINE INIT_MPI_ENV(ID,NP)
!===================================================================================|
!  INITIALIZE MPI ENVIRONMENT                                                       |
!===================================================================================|
     INTEGER, INTENT(OUT) :: ID,NP
     INTEGER IERR

     IERR=0

     CALL MPI_INIT(IERR)
     IF(IERR/=0) WRITE(*,*) "BAD MPI_INIT", ID
     CALL MPI_COMM_RANK(MPI_COMM_WORLD,ID,IERR)
     IF(IERR/=0) WRITE(*,*) "BAD MPI_COMM_RANK", ID
     CALL MPI_COMM_SIZE(MPI_COMM_WORLD,NP,IERR)
     IF(IERR/=0) WRITE(*,*) "BAD MPI_COMM_SIZE", ID

   END SUBROUTINE INIT_MPI_ENV


!==============================================================================|
  SUBROUTINE PSHUTDOWN

!==============================================================================|
    INTEGER IERR

    IERR=0
    CALL MPI_FINALIZE(IERR)
    if(ierr /=0) write(ipt,*) "BAD MPI_FINALIZE", MYID
    close(IPT)
    STOP

  END SUBROUTINE PSHUTDOWN


  SUBROUTINE CONTIGUOUS_WORKS
    IMPLICIT NONE
    INTEGER, pointer :: ptest(:,:)
    INTEGER :: IERR, I,J


    write(ipt,*) "START CONTIGUOUS:"
    n=2000 ! Set size here...
    m=n+10

    call alloc_vars
    write(ipt,*) "ALLOCATED DATA"
    ptest => data(1:N,1:N)

    IF (MYID == 0) ptest=6
    write(ipt,*) "Made POINTER"

    call MPI_BCAST(ptest,N*N,MPI_INTEGER,0,MPI_COMM_WORLD,IERR)
    IF(IERR /= 0) WRITE(IPT,*) "BAD BCAST", MYID

    write(ipt,*) "BROADCAST Data; a value:",data(1,6)

    DO I = 1,N
       DO J = 1,N
          if(data(I,J) /= 6) &
               & write(ipt,*) "INCORRECT VALUE!", I,J,data(I,J)
       END DO

       DO J = N+1,M
          if(data(I,J) /= 0) &
               & write(ipt,*) "INCORRECT VALUE!", I,J,data(I,J)
       END DO

    END DO

    ! CALL THREE DIFFERENT EXAMPLES OF SUBROUTINES WITHOUT AN INTERFACE
    ! THAT USE AN EXPLICIT-SHAPE ARRAY
    write(ipt,*) "CALLING DUMMY1"
    CALL DUMMY1

    write(ipt,*) "CALLING DUMMY2"
    call Dummy2(m,n)

    write(ipt,*) "CALLING DUMMY3"
    call Dummy3
    write(ipt,*) "FINISHED!"

  END SUBROUTINE CONTIGUOUS_WORKS

  SUBROUTINE NON_CONTIGUOUS_FAILS
    IMPLICIT NONE
    INTEGER, pointer :: ptest(:,:)
    INTEGER :: IERR, I,J


    write(ipt,*) "START NON_CONTIGUOUS:"

    m=200 ! Set size here - crash is size dependent!
    n=m+10

    call alloc_vars
    write(ipt,*) "ALLOCATED DATA"
    ptest => data(1:M,1:M)

!===================================================
! IF YOU CALL DUMMY2 HERE TOO, THEN EVERYTHING PASSES  ???
!===================================================
!    CALL DUMMY1      ! THIS ONE HAS NO EFFECT
!    CALL DUMMY2(M,N) ! THIS ONE 'FIXES' THE BUG

    IF (MYID == 0) ptest=6
    write(ipt,*) "Made POINTER"

    call MPI_BCAST(ptest,M*M,MPI_INTEGER,0,MPI_COMM_WORLD,IERR)
    IF(IERR /= 0) WRITE(IPT,*) "BAD BCAST"

    write(ipt,*) "BROADCAST Data; a value:",data(1,6)

    DO I = 1,M
       DO J = 1,M
          if(data(J,I) /= 6) &
               & write(ipt,*) "INCORRECT VALUE!",I,J,data(J,I)
       END DO

       DO J = M+1,N
          if(data(J,I) /= 0) &
               & write(ipt,*) "INCORRECT VALUE!",I,J,data(J,I)
       END DO
    END DO

    ! CALL THREE DIFFERENT EXAMPLES OF SUBROUTINES WITHOUT AN INTERFACE
    ! THAT USE AN EXPLICIT-SHAPE ARRAY
    write(ipt,*) "CALLING DUMMY1"
    CALL DUMMY1

    write(ipt,*) "CALLING DUMMY2"
    call Dummy2(m,n) ! SHOULD CRASH HERE!

    write(ipt,*) "CALLING DUMMY3"
    call Dummy3
    write(ipt,*) "FINISHED!"

  END SUBROUTINE NON_CONTIGUOUS_FAILS


  End Module vars


Program main
  USE vars
  implicit none


  CALL INIT_MPI_ENV(MYID,NPROCS)

  ipt=myid+10
  OPEN(ipt)


  write(ipt,*) "Start memory test!"

  CALL NON_CONTIGUOUS_FAILS

!  CALL CONTIGUOUS_WORKS

  write(ipt,*) "End memory test!"

  CALL PSHUTDOWN

END Program main



! THREE DUMMY SUBROUTINES WITH EXPLICIT-SHAPE ARRAYS
! DUMMY1 DECLARES A VECTOR                    - THIS ONE NEVER CAUSES FAILURE
! DUMMY2 DECLARES AN ARRAY FROM ITS ARGUMENTS - THIS ONE CAUSES FAILURE
! DUMMY3 DECLARES AN ARRAY FROM MODULE VARIABLES

SUBROUTINE DUMMY1
  USE vars
  implicit none
  real, dimension(m) :: my_data

  write(ipt,*) "m,n",m,n

  write(ipt,*) "DUMMY 1", size(my_data)

END SUBROUTINE DUMMY1


SUBROUTINE DUMMY2(i,j)
  USE vars
  implicit none
  INTEGER, INTENT(IN) ::i,j


  real, dimension(i,j) :: my_data

  write(ipt,*) "start: DUMMY 2", size(my_data)


END SUBROUTINE DUMMY2

SUBROUTINE DUMMY3
  USE vars
  implicit none


  real, dimension(m,n) :: my_data


  write(ipt,*) "start: DUMMY 3", size(my_data)


END SUBROUTINE DUMMY3
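
For completeness, here is a minimal workaround sketch (illustrative only -
the name SAFE_BCAST and the temporary tmp are my own, not part of the test
program above): broadcast through a contiguous temporary, so that MPI_BCAST
only ever sees contiguous storage. The explicit copy in and out of the
section is the price for never handing the F90 interface a non-contiguous
buffer.

SUBROUTINE SAFE_BCAST
  USE vars
  implicit none
  INTEGER, ALLOCATABLE :: tmp(:,:)
  INTEGER :: IERR

  ALLOCATE(tmp(M,M))
  ! COPY THE NON-CONTIGUOUS SECTION INTO CONTIGUOUS STORAGE ON THE ROOT
  IF (MYID == 0) tmp = data(1:M,1:M)
  CALL MPI_BCAST(tmp,M*M,MPI_INTEGER,0,MPI_COMM_WORLD,IERR)
  IF(IERR /= 0) WRITE(IPT,*) "BAD BCAST", MYID
  ! COPY THE RECEIVED VALUES BACK INTO THE ARRAY SECTION
  data(1:M,1:M) = tmp
  DEALLOCATE(tmp)
END SUBROUTINE SAFE_BCAST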