[mvapich-discuss] MVAPICH 2.1a GDR cuda-aware/GDR data corrupted?

Jens Glaser jsglaser at umich.edu
Tue Mar 31 21:10:47 EDT 2015


Hi,

I am observing corrupted received data with MVAPICH2-GDR 2.1a in non-blocking point-to-point communication.
Host-to-host communication is fine, but both CUDA-aware MPI and CUDA-aware MPI with GPUDirect RDMA fail the check.
I have additional data showing similar behavior with MVAPICH2-GDR 2.0.

Jens

DETAILS:

1. To test communication correctness, I modify the MPI_Irecv calls in the bandwidth test of the OSU micro-benchmarks (v4.4)
so that, for a given message size, the data received by each request in a window is written contiguously
at a distinct offset of an expanded receive buffer. After each window completes, I check whether the received
characters match the expected value ('a').

Patch to osu_bw.c:
--- osu_bw.c	2015-03-31 20:29:32.000000000 -0400
+++ osu_bw_expanded_buf.c	2015-03-31 20:24:22.000000000 -0400
@@ -42,9 +42,11 @@
 
 #define MAX_REQ_NUM 1000
 
+#define WINDOW_SIZE 64
+
 #define MAX_ALIGNMENT 65536
 #define MAX_MSG_SIZE (1<<22)
-#define MYBUFSIZE (MAX_MSG_SIZE + MAX_ALIGNMENT)
+#define MYBUFSIZE (MAX_MSG_SIZE*WINDOW_SIZE + MAX_ALIGNMENT)
 
 #define LOOP_LARGE  20
 #define WINDOW_SIZE_LARGE  64
@@ -98,6 +100,7 @@
 int allocate_memory (char **sbuf, char **rbuf, int rank);
 void print_header (int rank);
 void touch_data (void *sbuf, void *rbuf, int rank, size_t size);
+void check_data (void *buf, size_t size);
 void free_memory (void *sbuf, void *rbuf, int rank);
 int init_accel (void);
 int cleanup_accel (void);
@@ -110,7 +113,7 @@
     char *s_buf, *r_buf;
     double t_start = 0.0, t_end = 0.0, t = 0.0;
     int loop = 100;
-    int window_size = 64;
+    int window_size = WINDOW_SIZE;
     int skip = 10;
     int po_ret = process_options(argc, argv);
 
@@ -205,12 +208,16 @@
         else if(myid == 1) {
             for(i = 0; i < loop + skip; i++) {
                 for(j = 0; j < window_size; j++) {
-                    MPI_Irecv(r_buf, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD,
+                    MPI_Irecv(r_buf + j*size, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD,
                             request + j);
                 }
 
                 MPI_Waitall(window_size, request, reqstat);
                 MPI_Send(s_buf, 4, MPI_CHAR, 0, 101, MPI_COMM_WORLD);
+
+                check_data(r_buf, size*window_size);
+
+
             }
         }
 
@@ -564,6 +571,39 @@
     }
 }
 
+void
+check_data (void * buf, size_t size)
+{
+    char *h_rbuf;
+    #ifdef _ENABLE_CUDA_
+    if ('D' == options.dst) {
+        h_rbuf = malloc(size);
+        cudaError_t cuerr = cudaMemcpy(h_rbuf, buf, size, cudaMemcpyDeviceToHost);
+        if (cudaSuccess != cuerr) {
+            fprintf(stderr, "Error copying D2H\n");
+            return;
+        }
+    } else
+    #endif
+        {
+        h_rbuf = buf;
+    }
+
+    unsigned int i;
+    for (i = 0; i < size; ++i)
+        {
+        if (h_rbuf[i] != 'a')
+            {
+            printf("Message byte %d, %c != %c\n", i, h_rbuf[i], 'a');
+            break;
+            }
+        }
+    if ('D' == options.dst) {
+        free(h_rbuf);
+        }
+}
+
+
 int
 free_device_buffer (void * buf)
 {
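
For reference, the same communication pattern can also be exercised outside the OSU harness. The following is only an untested sketch of the idea (message size, window size, and tag chosen arbitrarily), not the code I actually ran:

/* Untested sketch: rank 0 sends WINDOW messages filled with 'a' from a
 * device buffer; rank 1 posts WINDOW MPI_Irecv's into disjoint offsets
 * of a device buffer and validates the received bytes on the host. */
#include <mpi.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define MSG_SIZE (1 << 16)   /* 64 KB, one of the failing sizes */
#define WINDOW   64

int main(int argc, char **argv)
{
    int rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char *d_buf;
    cudaMalloc((void **)&d_buf, (size_t) MSG_SIZE * WINDOW);

    MPI_Request req[WINDOW];
    if (rank == 0) {
        /* fill the send buffer with 'a' on the device */
        cudaMemset(d_buf, 'a', (size_t) MSG_SIZE * WINDOW);
        cudaDeviceSynchronize();
        for (int j = 0; j < WINDOW; j++)
            MPI_Isend(d_buf + (size_t) j * MSG_SIZE, MSG_SIZE, MPI_CHAR,
                      1, 100, MPI_COMM_WORLD, &req[j]);
        MPI_Waitall(WINDOW, req, MPI_STATUSES_IGNORE);
    } else if (rank == 1) {
        for (int j = 0; j < WINDOW; j++)
            MPI_Irecv(d_buf + (size_t) j * MSG_SIZE, MSG_SIZE, MPI_CHAR,
                      0, 100, MPI_COMM_WORLD, &req[j]);
        MPI_Waitall(WINDOW, req, MPI_STATUSES_IGNORE);

        /* copy the full receive window back and validate on the host */
        char *h_buf = malloc((size_t) MSG_SIZE * WINDOW);
        cudaMemcpy(h_buf, d_buf, (size_t) MSG_SIZE * WINDOW,
                   cudaMemcpyDeviceToHost);
        for (size_t i = 0; i < (size_t) MSG_SIZE * WINDOW; i++) {
            if (h_buf[i] != 'a') {
                printf("Byte %zu: %c != a\n", i, h_buf[i]);
                break;
            }
        }
        free(h_buf);
    }

    cudaFree(d_buf);
    MPI_Finalize();
    return 0;
}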


2. I execute the test on a dual-rail node with two GPUs and two HCAs, each GPU/HCA pair attached to a different PCIe root complex (see the nvidia-smi topology below).
Specifically, I am testing on the Wilkes cluster. The configurations tested are:
Host-Host, Device-Device CUDA-aware, and Device-Device GDR (with and without GPUDirect loopback). The CUDA toolkit version is 6.5.

These are the results:

a) Host-Host
mpirun -np $SLURM_NTASKS -ppn 2  -genvall \
-genv MV2_RAIL_SHARING_POLICY FIXED_MAPPING -genv MV2_PROCESS_TO_RAIL_MAPPING 0:1 \
-genv MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD 1G \
-genv MV2_ENABLE_AFFINITY 1 -genv MV2_CPU_BINDING_LEVEL SOCKET -genv MV2_CPU_BINDING_POLICY SCATTER \
-genv MV2_USE_SHARED_MEM 0 \
-genv MV2_USE_CUDA 1 -genv MV2_USE_GPUDIRECT 0 -genv MV2_GPUDIRECT_GDRCOPY_LIB ${GDRCOPY_LIBRARY_PATH}/libgdrapi.so -genv MV2_CUDA_IPC 0 \
sh /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/get_local_rank /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw H H

# OSU MPI-CUDA Bandwidth Test v4.4
# Send Buffer on HOST (H) and Receive Buffer on HOST (H)
# Size      Bandwidth (MB/s)
1                       1.11
2                       2.20
4                       4.43
8                       8.89
16                     17.84
32                     35.64
64                     70.33
128                   133.84
256                   242.58
512                   359.18
1024                  578.63
2048                  828.26
4096                 1011.72
8192                 1134.18
16384                1205.19
32768                1261.87
65536                1272.95
131072               1279.46
262144               1275.65
524288               1275.42
1048576              1275.61
2097152              1277.70
4194304              1278.82

-> OK

b) Device-Device cuda-aware

mpirun -np $SLURM_NTASKS -ppn 2  -genvall \
-genv MV2_RAIL_SHARING_POLICY FIXED_MAPPING -genv MV2_PROCESS_TO_RAIL_MAPPING 0:1 \
-genv MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD 1G \
-genv MV2_ENABLE_AFFINITY 1 -genv MV2_CPU_BINDING_LEVEL SOCKET -genv MV2_CPU_BINDING_POLICY SCATTER \
-genv MV2_USE_SHARED_MEM 0 \
-genv MV2_USE_CUDA 1 -genv MV2_USE_GPUDIRECT 0 -genv MV2_GPUDIRECT_GDRCOPY_LIB ${GDRCOPY_LIBRARY_PATH}/libgdrapi.so -genv MV2_CUDA_IPC 0 \
sh /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/get_local_rank /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw D D

# OSU MPI-CUDA Bandwidth Test v4.4
# Send Buffer on DEVICE (D) and Receive Buffer on DEVICE (D)
# Size      Bandwidth (MB/s)
Warning *** The GPU and IB selected are not on the same socket. Do not delever the best performance 
1                       0.07
2                       0.13
4                       0.34
8                       0.69
16                      1.36
32                      2.73
64                      5.42
128                    10.87
256                    21.70
512                    43.21
1024                   84.83
2048                  161.70
4096                  299.68
8192                  412.03
16384                 501.18
32768                 543.28
Message byte 0, b != a
65536                 661.09
Message byte 0, b != a
131072                739.19
Message byte 0, b != a
262144                770.89
Message byte 0, b != a
524288                761.48
1048576               756.53
2097152               757.82
Message byte 0, b != a
4194304               755.51

-> FAIL

c) Device-Device GDR
unset MV2_GPUDIRECT_GDRCOPY_LIB
mpirun -np $SLURM_NTASKS -ppn 2  -genvall \
-genv MV2_RAIL_SHARING_POLICY FIXED_MAPPING -genv MV2_PROCESS_TO_RAIL_MAPPING 0:1 \
-genv MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD 1G \
-genv MV2_ENABLE_AFFINITY 1 -genv MV2_CPU_BINDING_LEVEL SOCKET -genv MV2_CPU_BINDING_POLICY SCATTER \
-genv MV2_USE_SHARED_MEM 0 \
-genv MV2_USE_CUDA 1 -genv MV2_USE_GPUDIRECT 1 -genv MV2_CUDA_IPC 0 \
sh /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/get_local_rank /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw D D

# OSU MPI-CUDA Bandwidth Test v4.4
# Send Buffer on DEVICE (D) and Receive Buffer on DEVICE (D)
# Size      Bandwidth (MB/s)
Warning *** The GPU and IB selected are not on the same socket. Do not delever the best performance 
1                       0.01
2                       0.03
4                       0.05
8                       0.11
16                      0.22
32                      0.84
64                      1.69
128                     3.35
256                     6.61
512                    13.22
1024                   25.67
2048                   49.59
4096                   92.64
Message byte 0, b != a
[... the line "Message byte 0, b != a" appears 109 times in total here, between the 4096 and 8192 results ...]
8192                   14.81
Message byte 0, b != a
16384                 421.67
32768                 608.24
65536                 721.74
131072                792.72
262144                795.85
524288                780.61
1048576               776.48
2097152               160.07
4194304               401.23

-> FAIL

d) Device-Device GDR (no loopback)
unset MV2_GPUDIRECT_GDRCOPY_LIB
mpirun -np $SLURM_NTASKS -ppn 2  -genvall \
-genv MV2_RAIL_SHARING_POLICY FIXED_MAPPING -genv MV2_PROCESS_TO_RAIL_MAPPING 0:1 \
-genv MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD 1G \
-genv MV2_ENABLE_AFFINITY 1 -genv MV2_CPU_BINDING_LEVEL SOCKET -genv MV2_CPU_BINDING_POLICY SCATTER \
-genv MV2_USE_SHARED_MEM 0 \
-genv MV2_USE_CUDA 1 -genv MV2_USE_GPUDIRECT 1 -genv MV2_CUDA_IPC 0 -genv MV2_USE_GPUDIRECT_LOOPBACK_LIMIT 9999999 \
sh /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/get_local_rank /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw D D

# OSU MPI-CUDA Bandwidth Test v4.4
# Send Buffer on DEVICE (D) and Receive Buffer on DEVICE (D)
# Size      Bandwidth (MB/s)
Warning *** The GPU and IB selected are not on the same socket. Do not delever the best performance 
1                       0.01
2                       0.03
4                       0.05
8                       0.11
16                      0.22
32                      0.83
64                      1.67
128                     3.33
256                     6.57
512                    13.08
1024                   25.40
2048                   49.38
4096                   91.31
8192                  595.21
16384                 666.12
Message byte 0, b != a
32768                 605.65
65536                 721.52
131072                791.46
262144                794.08
524288                779.70
1048576               776.23
2097152               187.64
4194304               196.25

-> FAIL
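
As a possible follow-up diagnostic (not yet run), check_data could be extended to report the extent of the corruption instead of stopping at the first bad byte, with a helper along these lines (hypothetical sketch, operating on the host-side copy of the receive buffer):

/* Hypothetical helper: count corrupted bytes and report the first and
 * last bad offset, instead of stopping at the first mismatch. */
static void report_corruption(const char *h_rbuf, size_t size)
{
    size_t first = 0, last = 0, count = 0;
    for (size_t i = 0; i < size; ++i) {
        if (h_rbuf[i] != 'a') {
            if (count == 0)
                first = i;
            last = i;
            count++;
        }
    }
    if (count > 0)
        printf("%zu corrupted bytes (first at offset %zu, last at %zu of %zu)\n",
               count, first, last, size);
}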

3. Additional info:

MVAPICH2 Version:     	2.1a
MVAPICH2 Release date:	Sun Sep 21 12:00:00 EDT 2014
MVAPICH2 Device:      	ch3:mrail
MVAPICH2 configure:   	--build=x86_64-unknown-linux-gnu --host=x86_64-unknown-linux-gnu --target=x86_64-redhat-linux-gnu --program-prefix= --prefix=/opt/mvapich2/gdr/2.1a/gnu --exec-prefix=/opt/mvapich2/gdr/2.1a/gnu --bindir=/opt/mvapich2/gdr/2.1a/gnu/bin --sbindir=/opt/mvapich2/gdr/2.1a/gnu/sbin --sysconfdir=/opt/mvapich2/gdr/2.1a/gnu/etc --datadir=/opt/mvapich2/gdr/2.1a/gnu/share --includedir=/opt/mvapich2/gdr/2.1a/gnu/include --libdir=/opt/mvapich2/gdr/2.1a/gnu/lib64 --libexecdir=/opt/mvapich2/gdr/2.1a/gnu/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/opt/mvapich2/gdr/2.1a/gnu/share/man --infodir=/opt/mvapich2/gdr/2.1a/gnu/share/info --disable-rpath --disable-static --enable-shared --disable-rdma-cm --disable-mcast --enable-cuda --without-hydra-ckpointlib CPPFLAGS=-I/usr/local/cuda/include LDFLAGS=-L/usr/local/cuda/lib64 -Wl,-rpath,/usr/local/cuda/lib64 -Wl,-rpath,XORIGIN/placeholder
MVAPICH2 CC:  	gcc -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic   -DNDEBUG -DNVALGRIND -O2
MVAPICH2 CXX: 	g++ -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic  -DNDEBUG -DNVALGRIND -O2
MVAPICH2 F77: 	gfortran -L/lib -L/lib -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -I/opt/mvapich2/gdr/2.1a/gnu/lib64/gfortran/modules  -O2
MVAPICH2 FC:  	gfortran   -O2

ldd /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw
    linux-vdso.so.1 =>  (0x00007fff4ec9a000)
    libmpi.so.12 => /usr/local/Cluster-Apps/mvapich2-GDR/gnu/2.1a_cuda-6.5/lib64/libmpi.so.12 (0x00007fd83ab34000)
    libc.so.6 => /lib64/libc.so.6 (0x00007fd83a776000)
    libcudart.so.6.5 => /usr/local/Cluster-Apps/cuda/6.5/lib64/libcudart.so.6.5 (0x00007fd83a526000)
    libcuda.so.1 => /usr/lib64/libcuda.so.1 (0x00007fd8395b4000)
    libstdc++.so.6 => /usr/local/Cluster-Apps/gcc/4.8.1/lib64/libstdc++.so.6 (0x00007fd8392ab000)
    libnuma.so.1 => /usr/lib64/libnuma.so.1 (0x00007fd8390a0000)
    libibumad.so.3 => /usr/lib64/libibumad.so.3 (0x00007fd838e98000)
    libibverbs.so.1 => /usr/lib64/libibverbs.so.1 (0x00007fd838c82000)
    libdl.so.2 => /lib64/libdl.so.2 (0x00007fd838a7e000)
    librt.so.1 => /lib64/librt.so.1 (0x00007fd838875000)
    libgfortran.so.3 => /usr/local/Cluster-Apps/gcc/4.8.1/lib64/libgfortran.so.3 (0x00007fd83855f000)
    libm.so.6 => /lib64/libm.so.6 (0x00007fd8382db000)
    libpthread.so.0 => /lib64/libpthread.so.0 (0x00007fd8380bd000)
    libgcc_s.so.1 => /usr/local/Cluster-Apps/gcc/4.8.1/lib64/libgcc_s.so.1 (0x00007fd837ea8000)
    /lib64/ld-linux-x86-64.so.2 (0x00007fd83b1f2000)
    libnl.so.1 => /lib64/libnl.so.1 (0x00007fd837c55000)
    libquadmath.so.0 => /usr/local/Cluster-Apps/gcc/4.8.1/lib/../lib64/libquadmath.so.0 (0x00007fd837a1a000)

[hpcgla1 at tesla80 qc_spiga]$ nvidia-smi topo -m
        GPU0 GPU1    mlx5_0  mlx5_1  CPU Affinity
GPU0     X      SOC     PHB     SOC     0-0,2-2,4-4,6-6,8-8,10-10
GPU1    SOC      X      SOC     PHB     1-1,3-3,5-5,7-7,9-9,11-11
mlx5_0  PHB     SOC      X      SOC     
mlx5_1  SOC     PHB     SOC      X      

Legend:

  X   = Self
  SOC = Path traverses a socket-level link (e.g. QPI)
  PHB = Path traverses a PCIe host bridge
  PXB = Path traverses multiple PCIe internal switches
  PIX = Path traverses a PCIe internal switch

The warning message
Warning *** The GPU and IB selected are not on the same socket. Do not delever the best performance
goes away if I set MV2_CPU_MAPPING to 0:1, but the behavior is otherwise unchanged.
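
For context, GPU selection in these runs goes through the get_local_rank wrapper shipped with the benchmark package, which exports the MPI local rank so that each process picks its own GPU. A simplified sketch of that selection logic (assuming the launcher provides MV2_COMM_WORLD_LOCAL_RANK; this is not the actual script/benchmark code):

/* Simplified sketch of local-rank-based GPU selection. Assumes the
 * launcher exports MV2_COMM_WORLD_LOCAL_RANK, which the get_local_rank
 * wrapper re-exports as LOCAL_RANK for the benchmark to consume. */
#include <cuda_runtime.h>
#include <stdlib.h>

static void select_gpu_by_local_rank(void)
{
    const char *env = getenv("LOCAL_RANK");
    int local_rank = env ? atoi(env) : 0;
    int ndev = 0;

    cudaGetDeviceCount(&ndev);
    if (ndev > 0)
        cudaSetDevice(local_rank % ndev);
}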

Additional details (IB configuration, loaded modules, OFED version, ...) available upon request.