[mvapich-discuss] MVAPICH 2.1a GDR cuda-aware/GDR data corrupted?
Jens Glaser
jsglaser at umich.edu
Tue Mar 31 21:10:47 EDT 2015
Hi,
I am observing bad data with MVAPICH 2.1a GDR in non-blocking, point-to-point communication.
Host-host communication is fine, but both cuda-aware MPI and cuda-aware MPI with GPUDirect RDMA fail.
I have additional data showing similar behavior for MVAPICH 2.0 GDR.
Jens
DETAILS:
1. To test communication correctness, I modify the MPI_recv call in the bandwidth test of the OSU micro benchmarks (4.4)
in such a way that received data for different iterations of the benchmark for a given message size
is written into an expanded output buffer in contiguous fashion. Then I check if the received characters match
the expected values (‘a’).
Patch to osu_bw.c:
--- osu_bw.c 2015-03-31 20:29:32.000000000 -0400
+++ osu_bw_expanded_buf.c 2015-03-31 20:24:22.000000000 -0400
@@ -42,9 +42,11 @@
#define MAX_REQ_NUM 1000
+#define WINDOW_SIZE 64
+
#define MAX_ALIGNMENT 65536
#define MAX_MSG_SIZE (1<<22)
-#define MYBUFSIZE (MAX_MSG_SIZE + MAX_ALIGNMENT)
+#define MYBUFSIZE (MAX_MSG_SIZE*WINDOW_SIZE + MAX_ALIGNMENT)
#define LOOP_LARGE 20
#define WINDOW_SIZE_LARGE 64
@@ -98,6 +100,7 @@
int allocate_memory (char **sbuf, char **rbuf, int rank);
void print_header (int rank);
void touch_data (void *sbuf, void *rbuf, int rank, size_t size);
+void check_data (void *buf, size_t size);
void free_memory (void *sbuf, void *rbuf, int rank);
int init_accel (void);
int cleanup_accel (void);
@@ -110,7 +113,7 @@
char *s_buf, *r_buf;
double t_start = 0.0, t_end = 0.0, t = 0.0;
int loop = 100;
- int window_size = 64;
+ int window_size = WINDOW_SIZE;
int skip = 10;
int po_ret = process_options(argc, argv);
@@ -205,12 +208,16 @@
else if(myid == 1) {
for(i = 0; i < loop + skip; i++) {
for(j = 0; j < window_size; j++) {
- MPI_Irecv(r_buf, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD,
+ MPI_Irecv(r_buf + j*size, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD,
request + j);
}
MPI_Waitall(window_size, request, reqstat);
MPI_Send(s_buf, 4, MPI_CHAR, 0, 101, MPI_COMM_WORLD);
+
+ check_data(r_buf, size*window_size);
+
+
}
}
@@ -564,6 +571,39 @@
}
}
+void
+check_data (void * buf, size_t size)
+{
+ char *h_rbuf;
+ #ifdef _ENABLE_CUDA_
+ if ('D' == options.dst) {
+ h_rbuf = malloc(size);
+ cudaError_t cuerr = cudaMemcpy(h_rbuf, buf, size, cudaMemcpyDeviceToHost);
+ if (cudaSuccess != cuerr) {
+ fprintf(stderr, "Error copying D2H\n");
+ return;
+ }
+ } else
+ #endif
+ {
+ h_rbuf = buf;
+ }
+
+ unsigned int i;
+ for (i = 0; i < size; ++i)
+ {
+ if (h_rbuf[i] != 'a')
+ {
+ printf("Message byte %d, %c != %c\n", i, h_rbuf[i], 'a');
+ break;
+ }
+ }
+ if ('D' == options.dst) {
+ free(h_rbuf);
+ }
+}
+
+
int
free_device_buffer (void * buf)
{
2. I execute the test on a dual rail configuration node, with two GPUs and two HCAs on different segments of PCIe.
Specifically, I am testing on the Wilkes cluster. The three different configurations are:
Host-Host, Device-Device cuda-aware, and Device-Device GDR. The CUDA toolkit version is 6.5.
These are the results:
a) Host-Host
mpirun -np $SLURM_NTASKS -ppn 2 -genvall \
-genv MV2_RAIL_SHARING_POLICY FIXED_MAPPING -genv MV2_PROCESS_TO_RAIL_MAPPING 0:1 \
-genv MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD 1G \
-genv MV2_ENABLE_AFFINITY 1 -genv MV2_CPU_BINDING_LEVEL SOCKET -genv MV2_CPU_BINDING_POLICY SCATTER \
-genv MV2_USE_SHARED_MEM 0 \
-genv MV2_USE_CUDA 1 -genv MV2_USE_GPUDIRECT 0 -genv MV2_GPUDIRECT_GDRCOPY_LIB ${GDRCOPY_LIBRARY_PATH}/libgdrapi.so -genv MV2_CUDA_IPC 0 \
sh /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/get_local_rank /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw H H
ldd /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw
# OSU MPI-CUDA Bandwidth Test v4.4
# Send Buffer on HOST (H) and Receive Buffer on HOST (H)
# Size Bandwidth (MB/s)
1 1.11
2 2.20
4 4.43
8 8.89
16 17.84
32 35.64
64 70.33
128 133.84
256 242.58
512 359.18
1024 578.63
2048 828.26
4096 1011.72
8192 1134.18
16384 1205.19
32768 1261.87
65536 1272.95
131072 1279.46
262144 1275.65
524288 1275.42
1048576 1275.61
2097152 1277.70
4194304 1278.82
-> OK
b) Device-Device cuda-aware
mpirun -np $SLURM_NTASKS -ppn 2 -genvall \
-genv MV2_RAIL_SHARING_POLICY FIXED_MAPPING -genv MV2_PROCESS_TO_RAIL_MAPPING 0:1 \
-genv MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD 1G \
-genv MV2_ENABLE_AFFINITY 1 -genv MV2_CPU_BINDING_LEVEL SOCKET -genv MV2_CPU_BINDING_POLICY SCATTER \
-genv MV2_USE_SHARED_MEM 0 \
-genv MV2_USE_CUDA 1 -genv MV2_USE_GPUDIRECT 0 -genv MV2_GPUDIRECT_GDRCOPY_LIB ${GDRCOPY_LIBRARY_PATH}/libgdrapi.so -genv MV2_CUDA_IPC 0 \
sh /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/get_local_rank /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw D D
# OSU MPI-CUDA Bandwidth Test v4.4
# Send Buffer on DEVICE (D) and Receive Buffer on DEVICE (D)
# Size Bandwidth (MB/s)
Warning *** The GPU and IB selected are not on the same socket. Do not delever the best performance
1 0.07
2 0.13
4 0.34
8 0.69
16 1.36
32 2.73
64 5.42
128 10.87
256 21.70
512 43.21
1024 84.83
2048 161.70
4096 299.68
8192 412.03
16384 501.18
32768 543.28
Message byte 0, b != a
65536 661.09
Message byte 0, b != a
131072 739.19
Message byte 0, b != a
262144 770.89
Message byte 0, b != a
524288 761.48
1048576 756.53
2097152 757.82
Message byte 0, b != a
4194304 755.51
-> FAIL
c) Device-device GDR
unset MV2_GPUDIRECT_GDRCOPY_LIB
mpirun -np $SLURM_NTASKS -ppn 2 -genvall \
-genv MV2_RAIL_SHARING_POLICY FIXED_MAPPING -genv MV2_PROCESS_TO_RAIL_MAPPING 0:1 \
-genv MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD 1G \
-genv MV2_ENABLE_AFFINITY 1 -genv MV2_CPU_BINDING_LEVEL SOCKET -genv MV2_CPU_BINDING_POLICY SCATTER \
-genv MV2_USE_SHARED_MEM 0 \
-genv MV2_USE_CUDA 1 -genv MV2_USE_GPUDIRECT 1 -genv MV2_CUDA_IPC 0 \
sh /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/get_local_rank /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw D D
# OSU MPI-CUDA Bandwidth Test v4.4
# Send Buffer on DEVICE (D) and Receive Buffer on DEVICE (D)
# Size Bandwidth (MB/s)
Warning *** The GPU and IB selected are not on the same socket. Do not delever the best performance
1 0.01
2 0.03
4 0.05
8 0.11
16 0.22
32 0.84
64 1.69
128 3.35
256 6.61
512 13.22
1024 25.67
2048 49.59
4096 92.64
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
Message byte 0, b != a
8192 14.81
Message byte 0, b != a
16384 421.67
32768 608.24
65536 721.74
131072 792.72
262144 795.85
524288 780.61
1048576 776.48
2097152 160.07
4194304 401.23
-> FAIL
d) Device-Device GDR (no loopback)
unset MV2_GPUDIRECT_GDRCOPY_LIB
mpirun -np $SLURM_NTASKS -ppn 2 -genvall \
-genv MV2_RAIL_SHARING_POLICY FIXED_MAPPING -genv MV2_PROCESS_TO_RAIL_MAPPING 0:1 \
-genv MV2_RAIL_SHARING_LARGE_MSG_THRESHOLD 1G \
-genv MV2_ENABLE_AFFINITY 1 -genv MV2_CPU_BINDING_LEVEL SOCKET -genv MV2_CPU_BINDING_POLICY SCATTER \
-genv MV2_USE_SHARED_MEM 0 \
-genv MV2_USE_CUDA 1 -genv MV2_USE_GPUDIRECT 1 -genv MV2_CUDA_IPC 0 -genv MV2_USE_GPUDIRECT_LOOPBACK_LIMIT 9999999 \
sh /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/get_local_rank /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw D D
# OSU MPI-CUDA Bandwidth Test v4.4
# Send Buffer on DEVICE (D) and Receive Buffer on DEVICE (D)
# Size Bandwidth (MB/s)
Warning *** The GPU and IB selected are not on the same socket. Do not delever the best performance
1 0.01
2 0.03
4 0.05
8 0.11
16 0.22
32 0.83
64 1.67
128 3.33
256 6.57
512 13.08
1024 25.40
2048 49.38
4096 91.31
8192 595.21
16384 666.12
Message byte 0, b != a
32768 605.65
65536 721.52
131072 791.46
262144 794.08
524288 779.70
1048576 776.23
2097152 187.64
4194304 196.25
-> FAIL
3. Additional info:
MVAPICH2 Version: 2.1a
MVAPICH2 Release date: Sun Sep 21 12:00:00 EDT 2014
MVAPICH2 Device: ch3:mrail
MVAPICH2 configure: --build=x86_64-unknown-linux-gnu --host=x86_64-unknown-linux-gnu --target=x86_64-redhat-linux-gnu --program-prefix= --prefix=/opt/mvapich2/gdr/2.1a/gnu --exec-prefix=/opt/mvapich2/gdr/2.1a/gnu --bindir=/opt/mvapich2/gdr/2.1a/gnu/bin --sbindir=/opt/mvapich2/gdr/2.1a/gnu/sbin --sysconfdir=/opt/mvapich2/gdr/2.1a/gnu/etc --datadir=/opt/mvapich2/gdr/2.1a/gnu/share --includedir=/opt/mvapich2/gdr/2.1a/gnu/include --libdir=/opt/mvapich2/gdr/2.1a/gnu/lib64 --libexecdir=/opt/mvapich2/gdr/2.1a/gnu/libexec --localstatedir=/var --sharedstatedir=/var/lib --mandir=/opt/mvapich2/gdr/2.1a/gnu/share/man --infodir=/opt/mvapich2/gdr/2.1a/gnu/share/info --disable-rpath --disable-static --enable-shared --disable-rdma-cm --disable-mcast --enable-cuda --without-hydra-ckpointlib CPPFLAGS=-I/usr/local/cuda/include LDFLAGS=-L/usr/local/cuda/lib64 -Wl,-rpath,/usr/local/cuda/lib64 -Wl,-rpath,XORIGIN/placeholder
MVAPICH2 CC: gcc -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -DNDEBUG -DNVALGRIND -O2
MVAPICH2 CXX: g++ -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -DNDEBUG -DNVALGRIND -O2
MVAPICH2 F77: gfortran -L/lib -L/lib -O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -m64 -mtune=generic -I/opt/mvapich2/gdr/2.1a/gnu/lib64/gfortran/modules -O2
MVAPICH2 FC: gfortran -O2
ldd /home/hpcgla1/osu-micro-benchmarks-4.4-expanded/mpi/pt2pt/osu_bw
linux-vdso.so.1 => (0x00007fff4ec9a000)
libmpi.so.12 => /usr/local/Cluster-Apps/mvapich2-GDR/gnu/2.1a_cuda-6.5/lib64/libmpi.so.12 (0x00007fd83ab34000)
libc.so.6 => /lib64/libc.so.6 (0x00007fd83a776000)
libcudart.so.6.5 => /usr/local/Cluster-Apps/cuda/6.5/lib64/libcudart.so.6.5 (0x00007fd83a526000)
libcuda.so.1 => /usr/lib64/libcuda.so.1 (0x00007fd8395b4000)
libstdc++.so.6 => /usr/local/Cluster-Apps/gcc/4.8.1/lib64/libstdc++.so.6 (0x00007fd8392ab000)
libnuma.so.1 => /usr/lib64/libnuma.so.1 (0x00007fd8390a0000)
libibumad.so.3 => /usr/lib64/libibumad.so.3 (0x00007fd838e98000)
libibverbs.so.1 => /usr/lib64/libibverbs.so.1 (0x00007fd838c82000)
libdl.so.2 => /lib64/libdl.so.2 (0x00007fd838a7e000)
librt.so.1 => /lib64/librt.so.1 (0x00007fd838875000)
libgfortran.so.3 => /usr/local/Cluster-Apps/gcc/4.8.1/lib64/libgfortran.so.3 (0x00007fd83855f000)
libm.so.6 => /lib64/libm.so.6 (0x00007fd8382db000)
libpthread.so.0 => /lib64/libpthread.so.0 (0x00007fd8380bd000)
libgcc_s.so.1 => /usr/local/Cluster-Apps/gcc/4.8.1/lib64/libgcc_s.so.1 (0x00007fd837ea8000)
/lib64/ld-linux-x86-64.so.2 (0x00007fd83b1f2000)
libnl.so.1 => /lib64/libnl.so.1 (0x00007fd837c55000)
libquadmath.so.0 => /usr/local/Cluster-Apps/gcc/4.8.1/lib/../lib64/libquadmath.so.0 (0x00007fd837a1a000)
[hpcgla1 at tesla80 qc_spiga]$ nvidia-smi topo -m
GPU0 GPU1 mlx5_0 mlx5_1 CPU Affinity
GPU0 X SOC PHB SOC 0-0,2-2,4-4,6-6,8-8,10-10
GPU1 SOC X SOC PHB 1-1,3-3,5-5,7-7,9-9,11-11
mlx5_0 PHB SOC X SOC
mlx5_1 SOC PHB SOC X
Legend:
X = Self
SOC = Path traverses a socket-level link (e.g. QPI)
PHB = Path traverses a PCIe host bridge
PXB = Path traverses multiple PCIe internal switches
PIX = Path traverses a PCIe internal switch
The warning message
Warning *** The GPU and IB selected are not on the same socket. Do not delever the best performance
goes away if I set MV2_CPU_MAPPING 0:1, but behavior is unchanged otherwise.
Additional details (ib configuration, loaded modules, ofed version, ...) upon request.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.cse.ohio-state.edu/pipermail/mvapich-discuss/attachments/20150331/ae28fd3d/attachment-0001.html>
More information about the mvapich-discuss
mailing list