[mvapich-discuss] problems with MVAPICH2 over 10GbE

Jonathan Perkins perkinjo at cse.ohio-state.edu
Thu Aug 18 21:10:39 EDT 2011


Hello Jeffrey, we looked into this a bit more.  After applying the
attached patch and rebuilding mvapich2 you can use an environment
variable called MV2_DEFAULT_PORT to control which port is used.

First apply the patch using `patch -p0 < multi_port_roce_patch.txt' at
the top level of the mvapich2 source tree, then rebuild the library.
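A rough sketch of the full sequence (assuming the same configure
options as your original build; adjust paths as needed):

cd mvapich2-1.7rc1
patch -p0 < multi_port_roce_patch.txt
./configure --with-device=ch3:mrail --with-rdma=gen2
make && make install

To run in RDMAoE mode on the 2nd port, do something like: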

mpirun_rsh -np 2 node1 node2 MV2_USE_RDMAOE=1 MV2_DEFAULT_PORT=2 ./prog
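
If more than one HCA is present you can additionally pin the device
with MV2_IBA_HCA (as suggested in my earlier mail quoted below), with
something like:

mpirun_rsh -np 2 node1 node2 MV2_USE_RDMAOE=1 MV2_IBA_HCA=mlx4_0 MV2_DEFAULT_PORT=2 ./prog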

Please let us know if this works for you.
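
By the way, if you want to double-check which port number carries the
Ethernet link layer (the value to pass as MV2_DEFAULT_PORT), a small
standalone libibverbs probe like the one below can help.  This is just
an illustrative sketch, not part of MVAPICH2 or of the patch:

/* probe.c: print the link layer of each port on the first HCA so the
 * right value for MV2_DEFAULT_PORT can be chosen.
 * Build with: gcc probe.c -o probe -libverbs */
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
    struct ibv_device **devs;
    struct ibv_context *ctx;
    struct ibv_device_attr dev_attr;
    struct ibv_port_attr port_attr;
    int num_devices, port;

    devs = ibv_get_device_list(&num_devices);
    if (devs == NULL || num_devices == 0) {
        fprintf(stderr, "no RDMA devices found\n");
        return 1;
    }

    ctx = ibv_open_device(devs[0]);
    if (ctx == NULL || ibv_query_device(ctx, &dev_attr)) {
        fprintf(stderr, "cannot open or query device\n");
        return 1;
    }

    /* ibverbs port numbers start at 1 */
    for (port = 1; port <= dev_attr.phys_port_cnt; port++) {
        if (ibv_query_port(ctx, port, &port_attr))
            continue;
        printf("%s port %d: %s\n", ibv_get_device_name(devs[0]), port,
               port_attr.link_layer == IBV_LINK_LAYER_ETHERNET ?
                   "Ethernet" : "IB");
    }

    ibv_close_device(ctx);
    ibv_free_device_list(devs);
    return 0;
}

On your nodes this should report Ethernet for port 2, matching the
ibstat output quoted below.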

On Wed, Aug 17, 2011 at 4:48 PM, Konz, Jeffrey (SSA Solution Centers)
<jeffrey.konz at hp.com> wrote:
> Jonathan,
>
> One issue is selecting the right port on the Mellanox NIC; it has two ports: one IB, one 10GigE.
> Not sure how to do that.
>
> #ibstat
> CA 'mlx4_0'
>        CA type: MT26438
>        Number of ports: 2
>        Firmware version: 2.7.9100
>        Hardware version: b0
>        Node GUID: 0x78e7d10300214bbc
>        System image GUID: 0x78e7d10300214bbf
>        Port 1:
>                State: Active
>                Physical state: LinkUp
>                Rate: 40
>                Base lid: 13
>                LMC: 0
>                SM lid: 1
>                Capability mask: 0x02510868
>                Port GUID: 0x78e7d10300214bbd
>                Link layer: IB
>        Port 2:
>                State: Active
>                Physical state: LinkUp
>                Rate: 40
>                Base lid: 0
>                LMC: 0
>                SM lid: 0
>                Capability mask: 0x00010000
>                Port GUID: 0x7ae7d1fffe214bbd
>                Link layer: Ethernet
>
>
> #ibv_devinfo -v
> hca_id: mlx4_0
>        transport:                      InfiniBand (0)
>        fw_ver:                         2.7.9100
>        node_guid:                      78e7:d103:0021:4bbc
>        sys_image_guid:                 78e7:d103:0021:4bbf
>        vendor_id:                      0x02c9
>        vendor_part_id:                 26438
>        hw_ver:                         0xB0
>        board_id:                       HP_0200000003
>        phys_port_cnt:                  2
>        max_mr_size:                    0xffffffffffffffff
>        page_size_cap:                  0xfffffe00
>        max_qp:                         260032
>        max_qp_wr:                      16351
>        device_cap_flags:               0x007c9c76
>        max_sge:                        32
>        max_sge_rd:                     0
>        max_cq:                         65408
>        max_cqe:                        4194303
>        max_mr:                         524272
>        max_pd:                         32764
>        max_qp_rd_atom:                 16
>        max_ee_rd_atom:                 0
>        max_res_rd_atom:                4160512
>        max_qp_init_rd_atom:            128
>        max_ee_init_rd_atom:            0
>        atomic_cap:                     ATOMIC_HCA (1)
>        max_ee:                         0
>        max_rdd:                        0
>        max_mw:                         0
>        max_raw_ipv6_qp:                0
>        max_raw_ethy_qp:                2
>        max_mcast_grp:                  8192
>        max_mcast_qp_attach:            56
>        max_total_mcast_qp_attach:      458752
>        max_ah:                         0
>        max_fmr:                        0
>        max_srq:                        65472
>        max_srq_wr:                     16383
>        max_srq_sge:                    31
>        max_pkeys:                      128
>        local_ca_ack_delay:             15
>                port:   1
>                        state:                  PORT_ACTIVE (4)
>                        max_mtu:                2048 (4)
>                        active_mtu:             2048 (4)
>                        sm_lid:                 1
>                        port_lid:               13
>                        port_lmc:               0x00
>                        link_layer:             IB
>                        max_msg_sz:             0x40000000
>                        port_cap_flags:         0x02510868
>                        max_vl_num:             8 (4)
>                        bad_pkey_cntr:          0x0
>                        qkey_viol_cntr:         0x0
>                        sm_sl:                  0
>                        pkey_tbl_len:           128
>                        gid_tbl_len:            128
>                        subnet_timeout:         18
>                        init_type_reply:        0
>                        active_width:           4X (2)
>                        active_speed:           10.0 Gbps (4)
>                        phys_state:             LINK_UP (5)
>                        GID[  0]:               fe80:0000:0000:0000:78e7:d103:0021:4bbd
>
>                port:   2
>                        state:                  PORT_ACTIVE (4)
>                        max_mtu:                2048 (4)
>                        active_mtu:             1024 (3)
>                        sm_lid:                 0
>                        port_lid:               0
>                        port_lmc:               0x00
>                        link_layer:             Ethernet
>                        max_msg_sz:             0x40000000
>                        port_cap_flags:         0x00010000
>                        max_vl_num:             8 (4)
>                        bad_pkey_cntr:          0x0
>                        qkey_viol_cntr:         0x0
>                        sm_sl:                  0
>                        pkey_tbl_len:           1
>                        gid_tbl_len:            128
>                        subnet_timeout:         0
>                        init_type_reply:        0
>                        active_width:           4X (2)
>                        active_speed:           10.0 Gbps (4)
>                        phys_state:             LINK_UP (5)
>                        GID[  0]:               fe80:0000:0000:0000:7ae7:d1ff:fe21:4bbd
>
>
> -Jeff
>
>
>> -----Original Message-----
>> From: Jonathan Perkins [mailto:perkinjo at cse.ohio-state.edu]
>> Sent: Wednesday, August 17, 2011 12:25 PM
>> To: Konz, Jeffrey (SSA Solution Centers)
>> Cc: mvapich-discuss at cse.ohio-state.edu
>> Subject: Re: [mvapich-discuss] problems with MVAPICH2 over 10GbE
>>
>> Thanks for your report; I'm checking with some of the other
>> developers to verify the way this should work.  I believe that you do
>> not need to use the IP addresses of the RDMAoE port but can instead
>> specify the HCA name using MV2_IBA_HCA in addition to the
>> MV2_USE_RDMAOE=1 option.
>>
>> The name of the HCA can be found by using the ibstat command and
>> should look something like mlx4_...
>>
>> On Wed, Aug 17, 2011 at 11:17 AM, Konz, Jeffrey (SSA Solution Centers)
>> <jeffrey.konz at hp.com> wrote:
>> > I am running on a cluster with the Mellanox LOM that supports both
>> > IB and 10 GbE.
>> > Both ports on the interface are active; one is on the IB network,
>> > the other on the 10 GbE network.
>> >
>> > I built mvapich2-1.7rc1 with these options: "--with-device=ch3:mrail
>> > --with-rdma=gen2"
>> >
>> > Running over IB works fine.
>> >
>> > When I try to run over the 10GbE network with the "MV2_USE_RDMAOE=1"
>> > option I get this error:
>> >
>> > Fatal error in MPI_Init:
>> > Internal MPI error!
>> >
>> > [atl3-13:mpispawn_0][readline] Unexpected End-Of-File on file descriptor 5. MPI process died?
>> > [atl3-13:mpispawn_0][mtpmi_processops] Error while reading PMI socket. MPI process died?
>> > [atl3-13:mpispawn_0][child_handler] MPI process (rank: 0, pid: 23500) exited with status 1
>> > [atl3-13:mpirun_rsh][process_mpispawn_connection] mpispawn_0 from node 10.10.0.149 aborted: Error while reading a PMI socket (4)
>> >
>> > In the hostfile I specified the IP addresses of the 10 GbE ports.
>> >
>> > Am I running incorrectly, or have I not built mvapich2 with the
>> > correct options?
>> >
>> > Thanks,
>> >
>> > -Jeff
>> >
>> >
>> > _______________________________________________
>> > mvapich-discuss mailing list
>> > mvapich-discuss at cse.ohio-state.edu
>> > http://mail.cse.ohio-state.edu/mailman/listinfo/mvapich-discuss
>> >
>> >
>>
>>
>>
>> --
>> Jonathan Perkins
>> http://www.cse.ohio-state.edu/~perkinjo
>
>



-- 
Jonathan Perkins
http://www.cse.ohio-state.edu/~perkinjo
-------------- next part --------------
Index: src/mpid/ch3/channels/mrail/src/gen2/rdma_iba_priv.c
===================================================================
--- src/mpid/ch3/channels/mrail/src/gen2/rdma_iba_priv.c	(revision 4848)
+++ src/mpid/ch3/channels/mrail/src/gen2/rdma_iba_priv.c	(working copy)
@@ -855,8 +855,8 @@
             ports[i][0] = rdma_default_port;
 
             if (use_iboeth) {
-                if (ibv_query_gid(MPIDI_CH3I_RDMA_Process.nic_context[i], 0,
-                                    0, &gids[i][0])) {
+                if (ibv_query_gid(MPIDI_CH3I_RDMA_Process.nic_context[i],
+                                    rdma_default_port, 0, &gids[i][0])) {
                     MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                         "**fail", "Failed to retrieve gid on rank %d", pg_rank);
                 }
Index: src/mpid/ch3/channels/mrail/src/gen2/ring_startup.c
===================================================================
--- src/mpid/ch3/channels/mrail/src/gen2/ring_startup.c	(revision 4848)
+++ src/mpid/ch3/channels/mrail/src/gen2/ring_startup.c	(working copy)
@@ -400,19 +400,29 @@
     union ibv_gid gid;
     int mpi_errno = MPI_SUCCESS;
     int port;
+    char *value = NULL;
 
-
     if (!ring_rdma_open_hca(proc)) {
         MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto out,
                 "**fail", "**fail %s", "cannot open hca device");
     }
         
-    port = _find_active_port(proc->boot_context);
-    if (port < 0) {
-        MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto out, "**fail",
-                "**fail %s", "could not find active port");
+    if ((value = getenv("MV2_DEFAULT_PORT")) != NULL) {
+        rdma_default_port = atoi(value);
     }
 
+    if (rdma_default_port < 0 || rdma_num_ports > 1) {
+        /* Find active port if user has not asked us to use one */
+        port = _find_active_port(proc->boot_context);
+        if (port < 0) {
+            MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto out, "**fail",
+                    "**fail %s", "could not find active port");
+        }
+    } else {
+        /* Use port specified by user */
+        port = rdma_default_port;
+    }
+
     proc->boot_cq_hndl = ibv_create_cq(proc->boot_context,
                                        rdma_default_max_cq_size,
                                        NULL, NULL, 0);

