[mvapich-discuss] Passive target communication with PSM-CH3

Nathan Weeks weeks at iastate.edu
Fri Oct 16 11:51:05 EDT 2015


Does MVAPICH 2.1 support passive target communication with the PSM-CH3
(ch3:psm) device? We're using Intel InfiniPath_QLE7340 HCAs, and it seems
that an MPI_Win_lock()/MPI_Fetch_and_op()/MPI_Win_unlock() sequence executes
slowly when the target rank is busy doing computation.

MVAPICH 2.1 was configured thus (where the Intel compiler is version
15.0.1):

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
./configure --prefix=/path/to/mvapich/2.1 \
  --enable-fortran=yes \
  --with-device=ch3:psm \
  --enable-threads=multiple \
  CC=icc CXX=icpc FC=ifort F77=ifort
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
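
(To confirm which build gets picked up at run time, a small throwaway program
like the following can print the library version string; this assumes the
MPI-3 MPI_Get_library_version routine, which an MVAPICH 2.1 build should
provide. For MPICH-derived libraries the string typically includes the
release and the configure options.)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
program check_mpi_version
   use mpi
   implicit none
   character(len=MPI_MAX_LIBRARY_VERSION_STRING) :: version
   integer :: resultlen, ierr

   call MPI_Init(ierr)
   ! Print the library version string, which typically identifies the
   ! release and the configure options it was built with.
   call MPI_Get_library_version(version, resultlen, ierr)
   write(*,'(A)') version(1:resultlen)
   call MPI_Finalize(ierr)
end program check_mpi_version
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~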

In the following test program, all ranks fetch & increment a value on rank
0. Rank 0 also does computation if RANK0_POLL isn't defined at compile time.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
program test_fetch_and_op
   use iso_fortran_env, only: int64
   use, intrinsic :: iso_c_binding, only: c_ptr, c_f_pointer
   use mpi
   implicit none

   integer :: info, integer_size, i, win, ierr, rank, world_size, &
              n_i=0, sum_i=0, reduce_sum_i
   integer, pointer :: iteration
   integer(kind=int64) :: count_rate, time1 = 0, time2 = 0, &
                          time_fetch_and_op = 0
   integer, parameter :: one = 1, n = 1024, max_i = 2**16-1
   integer, allocatable :: rank_i(:)
   type(c_ptr) :: p_iteration
   double precision :: A(n,n), B(n,n), A_min
   double precision, allocatable :: rank_time(:)

   call random_number(A)
   call random_number(B)
   call system_clock(COUNT_RATE = count_rate)

   call MPI_Init(ierr)
   call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
   call MPI_Comm_size(MPI_COMM_WORLD, world_size, ierr)
   call MPI_Info_create(info, ierr)
   call MPI_Info_set(info, "accumulate_ops", "same_op", ierr)
   call MPI_Sizeof(i, integer_size, ierr)
   call MPI_Win_allocate(INT(integer_size, KIND=MPI_ADDRESS_KIND), 1, info, &
                         MPI_COMM_WORLD, p_iteration, win, ierr)
   call C_F_POINTER(p_iteration, iteration)
   if (rank == 0) then
      iteration = 1
      allocate(rank_i(world_size))
      allocate(rank_time(world_size))
   else
      ! avoid Intel Fortran error in MPI_Gather when -check is used
      allocate(rank_i(1))
      allocate(rank_time(1))
   end if

   call MPI_Barrier(MPI_COMM_WORLD,ierr)

   do while (.true.)
#ifdef RANK0_POLL
      if (rank == 0) then
         call MPI_WIN_LOCK(MPI_LOCK_SHARED, 0, 0, win, ierr)
         call MPI_Get(i, 1, MPI_INTEGER, 0, 0_MPI_ADDRESS_KIND, 1, &
                      MPI_INTEGER, win, ierr)
         call MPI_WIN_UNLOCK(0, win, ierr)
         if (i > max_i) exit
      else
#endif
      call system_clock(time1)
      call MPI_WIN_LOCK(MPI_LOCK_EXCLUSIVE, 0, 0, win, ierr)
      call MPI_FETCH_AND_OP(one, i, MPI_INTEGER, 0, 0_MPI_ADDRESS_KIND, &
                            MPI_SUM, win, ierr)
      call MPI_WIN_UNLOCK(0, win, ierr)
      call system_clock(time2)
      if (i > max_i) exit
      time_fetch_and_op = time_fetch_and_op + (time2-time1)
      A = A + A * B ! do some computation
      n_i = n_i + 1
      sum_i = sum_i + i
#ifdef RANK0_POLL
      end if
#endif
   end do

   call MPI_Reduce(A(1,1), A_min, 1, MPI_DOUBLE_PRECISION, MPI_MIN, 0, &
                   MPI_COMM_WORLD, ierr)
   call MPI_Gather(n_i, 1, MPI_INTEGER, rank_i, 1, MPI_INTEGER, 0, &
                   MPI_COMM_WORLD, ierr)
   call MPI_Gather(DBLE(time_fetch_and_op)/count_rate, 1, MPI_DOUBLE_PRECISION, &
                   rank_time, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
   call MPI_Reduce(sum_i, reduce_sum_i, 1, MPI_INTEGER, MPI_SUM, 0, &
                   MPI_COMM_WORLD, ierr)

   if (rank == 0) then
      write(*,*) 'minval(A(1,1)[:]) = ', A_min
      do i = 1, SIZE(rank_i)
         write(*,*) 'rank ', i-1, rank_i(i), rank_time(i)
      end do
      write(*,*) 'SUM(rank_i)', SUM(rank_i)
      write(*,*) 'reduce_sum_i', reduce_sum_i
   end if

   call MPI_Win_free(win, ierr)
   call MPI_Finalize(ierr)
end program test_fetch_and_op
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Compiled thus:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mpif90 -o test_fetch_and_op test_fetch_and_op.F90
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Invoked from a job script to run on 2 nodes (16 ranks/node):

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mpirun -n 32 ./test_fetch_and_op
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The output indicates that ranks spend varying amounts of (cumulative) time
waiting for the MPI_Win_lock()/MPI_Fetch_and_op()/MPI_Win_unlock() sequence to
complete (4th column, in seconds). The third column indicates the number of
iterations executed by that rank.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 minval(A(1,1)[:]) =   3.013034008757427E+118
 rank            0        4585  0.988093000000000
 rank            1        2293   4.60248800000000
 rank            2        2293   1.99486700000000
 rank            3        2294   4.69011900000000
 rank            4        2294   4.74206200000000
 rank            5        2309   8.53757700000000
 rank            6        4147   6.24735400000000
 rank            7        2293   4.54687600000000
 rank            8        2417   8.60026900000000
 rank            9        2293   8.49449000000000
 rank           10        2294   8.48404600000000
 rank           11        2293   4.56649700000000
 rank           12        2293   1.91861300000000
 rank           13        2291   4.60065300000000
 rank           14        2293   1.90240000000000
 rank           15        2293   8.51508700000000
 rank           16        2292   2.43570500000000
 rank           17        2292   2.40850700000000
 rank           18        2292   2.47852800000000
 rank           19         765   3.55292800000000
 rank           20        2291   2.34879100000000
 rank           21         765   3.48627200000000
 rank           22         765   3.41105000000000
 rank           23        2292   2.61309200000000
 rank           24         765   3.45583100000000
 rank           25         765   3.56492300000000
 rank           26         870   4.03421000000000
 rank           27        2292   2.45449000000000
 rank           28        2292   2.57531800000000
 rank           29        2292   2.41837300000000
 rank           30         765   3.47952800000000
 rank           31         765   3.48015500000000
 SUM(rank_i)       65535
 reduce_sum_i  2147450880

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
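
(To put the 4th column in per-call terms: rank 5, for example, accumulated
8.54 s over 2309 lock/fetch_and_op/unlock sequences, i.e. roughly 3.7 ms per
sequence. The reduce_sum_i value of 2147450880 = 65535*65536/2 confirms that
each counter value from 1 to 65535 was fetched exactly once.)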

Compiling the code with RANK0_POLL defined, so that the rank 0 process only
polls the counter and doesn't do any computation, produces much lower
cumulative wait times for the other ranks:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mpif90 -DRANK0_POLL -o test_fetch_and_op test_fetch_and_op.F90
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 minval(A(1,1)[:]) =   3.920868194323862E-007
 rank            0           0  0.000000000000000E+000
 rank            1        2812  0.122509000000000
 rank            2        3824  0.175148000000000
 rank            3        1051  5.798800000000000E-002
 rank            4        1305  4.384800000000000E-002
 rank            5        1041  5.300300000000000E-002
 rank            6        1585  4.846100000000000E-002
 rank            7        1055  4.719500000000000E-002
 rank            8        1329  4.898600000000000E-002
 rank            9        3836  0.158367000000000
 rank           10        1052  5.660700000000000E-002
 rank           11        4087  8.280400000000000E-002
 rank           12        1488  5.919000000000000E-002
 rank           13        4076  0.109046000000000
 rank           14        1008  5.416400000000000E-002
 rank           15        1487  5.333600000000000E-002
 rank           16        2142  0.137507000000000
 rank           17        1553  0.117416000000000
 rank           18        2175  0.138576000000000
 rank           19        2592  0.187258000000000
 rank           20        2102  0.136301000000000
 rank           21        2174  0.136671000000000
 rank           22        2013  0.166204000000000
 rank           23        2531  0.195296000000000
 rank           24        2057  0.147237000000000
 rank           25        2127  0.138742000000000
 rank           26        2591  0.193355000000000
 rank           27        2151  0.156355000000000
 rank           28        2009  0.171320000000000
 rank           29        2121  0.140748000000000
 rank           30        1573  0.118845000000000
 rank           31        2588  0.159494000000000
 SUM(rank_i)       65535
 reduce_sum_i  2147450880
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
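
In case it helps frame the question: below is a rough, untested sketch of the
kind of change we've considered for the compute step, assuming (and this is
only an assumption) that the lock/fetch_and_op/unlock issued by other ranks
only completes promptly while the target rank is inside an MPI call. It would
replace the "A = A + A * B" line in the program above; flag, status, and chunk
would be additional local variables.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   ! Additional declarations:
   logical :: flag
   integer :: status(MPI_STATUS_SIZE), chunk

   ! Replacement for "A = A + A * B": compute in column blocks and make a
   ! cheap MPI call between blocks so the MPI progress engine gets a chance
   ! to service passive-target RMA aimed at this rank.
   do chunk = 1, n, 64
      A(:, chunk:min(chunk+63, n)) = A(:, chunk:min(chunk+63, n)) &
         + A(:, chunk:min(chunk+63, n)) * B(:, chunk:min(chunk+63, n))
      call MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, flag, &
                      status, ierr)
   end do
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The block size of 64 columns is arbitrary; the MPI_Iprobe is only meant to
enter the MPI library reasonably often without giving up too much of the
computation.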

Any advice would be appreciated. Thanks!

--
Nathan Weeks
Systems Analyst
Iowa State University -- Department of Mathematics
http://weeks.public.iastate.edu/