[mvapich-discuss] mvapich2-2.3.2 crash when CPU affinity is enabled

Honggang LI <honli@redhat.com>
Tue Mar 24 23:45:13 EDT 2020


This is a regression: the crash reproduces with mvapich2-2.3.2 and
mvapich2-2.3.3, while mvapich2-2.3 works for me.

Thanks

[root@rdma-qe-06 ~]$ /opt/mvapich2-2.3.3/bin/mpirun -genv MV2_ENABLE_AFFINITY 0 -hostfile /root/hfile_one_core -np 2 /opt/mvapich2-2.3.3/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bw
# OSU MPI Bandwidth Test v5.6.2
# Size      Bandwidth (MB/s)
1                       3.36
2                       6.67
4                      13.36
8                      26.65
16                     53.25
32                    102.68
64                    242.07
128                   434.11
256                   894.55
512                  1515.48
1024                 2071.42
2048                 2617.76
4096                 2577.08
8192                 3383.27
16384                6082.21
32768                6291.01
65536                6402.05
131072               6464.33
262144               6496.74
524288               6513.39
1048576              6521.92
2097152              6525.98
4194304              6528.13
[root@rdma-qe-06 ~]$


[root@rdma-qe-06 ~]$
[root@rdma-qe-06 ~]$ /opt/mvapich2-2.3.3/bin/mpirun -genv MV2_ENABLE_AFFINITY 1 -genv MV2_DEBUG_SHOW_BACKTRACE 1 -hostfile /root/hfile_one_core -np 2 /opt/mvapich2-2.3.3/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bw
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][error_sighandler] Caught error: Floating point exception (signal 8)
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][error_sighandler] Caught error: Floating point exception (signal 8)
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   0: /opt/mvapich2-2.3.3/lib/libmpi.so.12(print_backtrace+0x21) [0x14789ccb78a1]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   1: /opt/mvapich2-2.3.3/lib/libmpi.so.12(error_sighandler+0x67) [0x14789ccb79c7]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   2: /lib64/libpthread.so.0(+0x12dd0) [0x14789b91add0]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   3: /opt/mvapich2-2.3.3/lib/libmpi.so.12(MPIDI_CH3I_set_affinity+0x2ff) [0x14789cd0c0ef]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   4: /opt/mvapich2-2.3.3/lib/libmpi.so.12(MPID_Init+0x299) [0x14789cc55d09]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   5: /opt/mvapich2-2.3.3/lib/libmpi.so.12(MPIR_Init_thread+0x2ff) [0x14789cbbee6f]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   6: /opt/mvapich2-2.3.3/lib/libmpi.so.12(MPI_Init+0xc1) [0x14789cbbe8f1]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   7: /opt/mvapich2-2.3.3/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bw() [0x40130c]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   8: /lib64/libc.so.6(__libc_start_main+0xf3) [0x14789ac3a6a3]
[rdma-qe-07.lab.bos.redhat.com:mpi_rank_1][print_backtrace]   9: /opt/mvapich2-2.3.3/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bw() [0x4018fe]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   0: /opt/mvapich2-2.3.3/lib/libmpi.so.12(print_backtrace+0x21) [0x1469961c78a1]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   1: /opt/mvapich2-2.3.3/lib/libmpi.so.12(error_sighandler+0x67) [0x1469961c79c7]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   2: /lib64/libpthread.so.0(+0x12dd0) [0x146994e2add0]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   3: /opt/mvapich2-2.3.3/lib/libmpi.so.12(MPIDI_CH3I_set_affinity+0x2ff) [0x14699621c0ef]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   4: /opt/mvapich2-2.3.3/lib/libmpi.so.12(MPID_Init+0x299) [0x146996165d09]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   5: /opt/mvapich2-2.3.3/lib/libmpi.so.12(MPIR_Init_thread+0x2ff) [0x1469960cee6f]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   6: /opt/mvapich2-2.3.3/lib/libmpi.so.12(MPI_Init+0xc1) [0x1469960ce8f1]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   7: /opt/mvapich2-2.3.3/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bw() [0x40130c]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   8: /lib64/libc.so.6(__libc_start_main+0xf3) [0x14699414a6a3]
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][print_backtrace]   9: /opt/mvapich2-2.3.3/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bw() [0x4018fe]

===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   PID 2347 RUNNING AT 172.31.0.7
=   EXIT CODE: 136
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
[proxy:0:0@rdma-qe-06.lab.bos.redhat.com] HYD_pmcd_pmip_control_cmd_cb (pm/pmiserv/pmip_cb.c:911): assert (!closed) failed
[proxy:0:0@rdma-qe-06.lab.bos.redhat.com] HYDT_dmxu_poll_wait_for_event (tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:0@rdma-qe-06.lab.bos.redhat.com] main (pm/pmiserv/pmip.c:202): demux engine error waiting for event
[mpiexec@rdma-qe-06.lab.bos.redhat.com] HYDT_bscu_wait_for_completion (tools/bootstrap/utils/bscu_wait.c:76): one of the processes terminated badly; aborting
[mpiexec@rdma-qe-06.lab.bos.redhat.com] HYDT_bsci_wait_for_completion (tools/bootstrap/src/bsci_wait.c:23): launcher returned error waiting for completion
[mpiexec@rdma-qe-06.lab.bos.redhat.com] HYD_pmci_wait_for_completion (pm/pmiserv/pmiserv_pmci.c:218): launcher returned error waiting for completion
[mpiexec@rdma-qe-06.lab.bos.redhat.com] main (ui/mpich/mpiexec.c:340): process manager error waiting for completion
[root@rdma-qe-06 ~]$



[root@rdma-qe-06 ~]$
[root@rdma-qe-06 ~]$ /opt/mvapich2-2.3.3/bin/mpichversion
MVAPICH2 Version:     	2.3.3
MVAPICH2 Release date:	Thu January 09 22:00:00 EST 2019
MVAPICH2 Device:      	ch3:mrail
MVAPICH2 configure:   	--prefix=/opt/mvapich2-2.3.3 --enable-error-messages=all --enable-g=dbg,debug
MVAPICH2 CC:  	gcc    -DNDEBUG -DNVALGRIND -g -O2
MVAPICH2 CXX: 	g++   -DNDEBUG -DNVALGRIND -g -O2
MVAPICH2 F77: 	gfortran -L/lib -L/lib   -g -O2
MVAPICH2 FC:  	gfortran   -g -O2
[root@rdma-qe-06 ~]$


[root@rdma-qe-06 ~]$
[root@rdma-qe-06 ~]$ /opt/mvapich2-2.3.3/bin/mpivars
[rdma-qe-06.lab.bos.redhat.com:mpi_rank_0][error_sighandler] Caught error: Floating point exception (signal 8)
Floating point exception (core dumped)
[root@rdma-qe-06 ~]$
[root@rdma-qe-06 ~]$ cp /var/lib/systemd/coredump/core.mpivars.0.380e3c4c3b64419d881807dc764ddcfa.2378.1585106338000000.lz4 core.mpivars.lz4

[root@rdma-qe-06 ~]$
[root@rdma-qe-06 ~]$ lz4 -d core.mpivars.lz4
Decoding file core.mpivars 
core.mpivars.lz4     : decoded 3739648 bytes                                   
[root@rdma-qe-06 ~]$


[root@rdma-qe-06 ~]$ cgdb /opt/mvapich2-2.3.3/bin/mpivars core.mpivars


 62│ static __hwloc_inline int
 63│ hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type)
 64│ {
 65│   int depth = hwloc_get_type_depth(topology, type);
 66│   if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
 67│     return 0;
 68│   if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
 69│     return -1; /* FIXME: agregate nbobjs from different levels? */
 70├>  return hwloc_get_nbobjs_by_depth(topology, depth);
 71│ }
 72│
 73│ static __hwloc_inline hwloc_obj_t
 74│ hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx)
 75│ {
 76│   int depth = hwloc_get_type_depth(topology, type);
 77│   if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
 78│     return NULL;
/root/mvapich2-2.3.3/contrib/hwloc/include/hwloc/inlines.h                                                                              
Missing separate debuginfos, use: yum debuginfo-install glibc-2.28-101.el8.x86_64 infiniband-diags-26.0-8.el8.x86_64 libibumad-26.0-8.el8.x86_64 libibverbs-26.0-8.el8.x86_64 libnl3-3.5.0-1.el8.x86_64 librdmacm-26.0-8.el8.x86_64
#0  hwloc_get_nbobjs_by_type (type=HWLOC_OBJ_PU, topology=0x26dccd0) at contrib/hwloc/include/hwloc/inlines.h:70
(gdb) bt
#0  0x000000000048de9f in hwloc_get_nbobjs_by_type (type=HWLOC_OBJ_PU, topology=0x26dccd0) at contrib/hwloc/include/hwloc/inlines.h:70
#1  mv2_generate_implicit_cpu_mapping (num_app_threads=1, local_procs=1) at src/mpid/ch3/channels/common/src/affinity/hwloc_bind.c:2747
#2  MPIDI_CH3I_set_affinity (pg=pg@entry=0x2685480, pg_rank=pg_rank@entry=0) at src/mpid/ch3/channels/common/src/affinity/hwloc_bind.c:3102
#3  0x0000000000423e31 in MPID_Init (argc=argc@entry=0x7fff335025dc, argv=argv@entry=0x7fff335025d0, requested=requested@entry=0, provided=provided@entry=0x7fff33502568, has_args=has_args@entry=0x7fff33502560, has_env=has_env@entry=0x7fff33502564) at src/mpid/ch3/src/mpid_init.c:402
#4  0x000000000040879e in MPIR_Init_thread (argc=argc@entry=0x7fff335025dc, argv=argv@entry=0x7fff335025d0, required=required@entry=0, provided=provided@entry=0x7fff335025e8) at src/mpi/init/initthread.c:488
#5  0x0000000000408ac0 in PMPI_Init_thread (argc=argc@entry=0x7fff335025dc, argv=argv@entry=0x7fff335025d0, required=required@entry=0, provided=provided@entry=0x7fff335025e8) at src/mpi/init/initthread.c:675
#6  0x0000000000406460 in main (argc=<optimized out>, argv=<optimized out>) at src/env/mpivars.c:70
(gdb) 
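
The backtrace suggests an unchecked integer division: per the inlines.h source above, hwloc_get_nbobjs_by_type() returns 0 when the type's depth is unknown (and -1 for HWLOC_TYPE_DEPTH_MULTIPLE), and dividing by that result inside mv2_generate_implicit_cpu_mapping() would raise exactly this SIGFPE (integer division by zero on x86 is delivered as SIGFPE). A minimal sketch of the suspected pattern; the function and variable names below are illustrative, not the actual hwloc_bind.c code:

/* Hypothetical sketch of the suspected failure mode; pus_per_core() and
 * its variables are illustrative names, not the real MVAPICH2 source. */
#include <hwloc.h>

static int pus_per_core(hwloc_topology_t topo)
{
    int num_pu   = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);
    int num_core = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);

    /* hwloc_get_nbobjs_by_type() returns 0 for HWLOC_TYPE_DEPTH_UNKNOWN
     * and -1 for HWLOC_TYPE_DEPTH_MULTIPLE. Without this guard, a zero
     * count turns the division below into SIGFPE, which is how the
     * crash in mv2_generate_implicit_cpu_mapping() presents. */
    if (num_core <= 0)
        return -1;

    return num_pu / num_core;
}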


[root@rdma-qe-06 ~]$ ibstat
CA 'mlx5_0'
	CA type: MT4113
	Number of ports: 2
	Firmware version: 10.16.1200
	Hardware version: 0
	Node GUID: 0xf452140300085ef0
	System image GUID: 0xf452140300085ef0
	Port 1:
		State: Active
		Physical state: LinkUp
		Rate: 56
		Base lid: 3
		LMC: 0
		SM lid: 13
		Capability mask: 0x26596848
		Port GUID: 0xf452140300085ef0
		Link layer: InfiniBand
	Port 2:
		State: Active
		Physical state: LinkUp
		Rate: 56
		Base lid: 26
		LMC: 0
		SM lid: 1
		Capability mask: 0x26596848
		Port GUID: 0xf452140300085ef8
		Link layer: InfiniBand
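
To see what the topology on these nodes actually reports, here is a small standalone probe. I am assuming the system hwloc here (build with: gcc probe.c -o probe -lhwloc), which may differ from the copy bundled in mvapich2's contrib/hwloc:

/* probe.c: print the object counts the affinity code relies on. */
#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topo;

    if (hwloc_topology_init(&topo) < 0 || hwloc_topology_load(topo) < 0) {
        fprintf(stderr, "hwloc topology init/load failed\n");
        return 1;
    }

    printf("packages: %d\n", hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PACKAGE));
    printf("cores:    %d\n", hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE));
    printf("PUs:      %d\n", hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU));

    hwloc_topology_destroy(topo);
    return 0;
}

If any of these counts comes out as 0 on rdma-qe-06/07, that would back the divide-by-zero theory above.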



