[mvapich-discuss] rdma_iba_priv.c + Error posting recv

Matthew Koop koop at cse.ohio-state.edu
Mon Nov 6 12:21:49 EST 2006


Vishwas,

I've attached a patch to address the issue you're seeing with the VAPI
device. Can you try this out and verify that it solves your problem?

I'd also like to strongly suggest that you move towards the OFED/Gen2
stack, since it has more support from vendors and the overall community.
Our support for OFED/Gen2 also has various features not found in the VAPI
version.

Let us know if you have any other questions.

Thanks,

Matt


On Mon, 6 Nov 2006, Vishwas Vasisht wrote:

> Hi,
>
> I have a 65-node Opteron cluster with a total of 260 cores (64 nodes + 1
> master, each dual-processor, dual-core). I was trying to submit a job
> (cpi, jobfarming, ...) using -np greater than 260. It worked up to
> -np 300, but above 300 I get these errors several
> times.
>
> --------------------------------------------------------------------------
> [rdma_iba_priv.c:406] error(-236): Error posting recv!
> rank 12 in job 7  masternode_33851   caused collective abort of all ranks
>   exit status of rank 12: killed by signal 9
> --------------------------------------------------------------------------
>
> Can you please help me sort this out?
>
> Regards
> Vishwas
>
-------------- next part --------------
Index: src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_priv.c
===================================================================
--- src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_priv.c	(revision 705)
+++ src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_priv.c	(working copy)
@@ -320,6 +320,55 @@
     }
 }
 
+void post_ring_recv(int post_id, int pg_size, int pg_rank,
+        int unit_length, char *alladdrs, VIP_MEM_HANDLE * mem_handle) 
+{
+    VAPI_rr_desc_t rr;
+    VAPI_sg_lst_entry_t sg_entry_r;
+    char *recv_addr;
+    int ret;
+    struct MPIDI_CH3I_RDMA_Process_t *proc;
+    proc = &MPIDI_CH3I_RDMA_Process;
+
+    if(post_id - 1 >= pg_size) {
+        return;
+    }
+
+    if ((post_id + 1) < pg_size) {
+        recv_addr = alladdrs + unit_length *
+            ((pg_rank + pg_size - post_id - 1) % pg_size);
+    } else if (post_id < pg_size) {
+        recv_addr = alladdrs + unit_length * pg_rank;
+    } else {
+        recv_addr = alladdrs + unit_length * pg_rank + QPLEN_XDR;
+    }
+
+    /* Fillup a recv descriptor */
+    rr.comp_type = VAPI_SIGNALED;
+    rr.opcode = VAPI_RECEIVE;
+    rr.id = post_id;
+    rr.sg_lst_len = 1;
+    rr.sg_lst_p = &(sg_entry_r);
+    sg_entry_r.lkey = mem_handle->lkey;
+    sg_entry_r.addr = (VAPI_virt_addr_t) (virt_addr_t) recv_addr;
+
+    if ((post_id + 1) >= pg_size)
+        sg_entry_r.len = (post_id + 2 - pg_size) * QPLEN_XDR;
+    else
+        sg_entry_r.len = unit_length;
+
+    /* Post the recv descriptor */
+    if (post_id < pg_size) {
+        ret =
+            VAPI_post_rr(proc->nic[0], proc->boot_qp_hndl[0], &rr);
+    } else {
+        ret =
+            VAPI_post_rr(proc->nic[0], proc->boot_qp_hndl[1], &rr);
+    }
+    CHECK_RETURN(ret, "Error posting recv!");
+}
+
+
 /* Using a ring of queue pairs to exchange all the queue_pairs,
  * If mpd is used,  only info about lhs and rsh are provided. */
 static void ib_vapi_bootstrap_ring(int lhs, int rhs,
@@ -338,8 +387,6 @@
     /* Register alladdrs and post receive descriptors */
     {
         /* work entries related variables */
-        VAPI_rr_desc_t rr;
-        VAPI_sg_lst_entry_t sg_entry_r;
         VAPI_sr_desc_t sr;
         VAPI_sg_lst_entry_t sg_entry_s;
 
@@ -352,9 +399,11 @@
 
         int unit_length;
         char *dest_loc;
-        char *recv_addr;
         char *send_addr;
 
+        int prepost = 15;
+        int post_recv = 0;
+
         struct MPIDI_CH3I_RDMA_Process_t *proc;
         proc = &MPIDI_CH3I_RDMA_Process;
 
@@ -366,44 +415,8 @@
         dest_loc = alladdrs + pg_rank * unit_length;
         strncpy(dest_loc, local_addr, unit_length);
 
-        recv_index = 0;
-
-        /* Post receive for all_addr */
-        for (j = 0; j < pg_size + 1; j++) {
-            /* The last two entries are used for a barrier,
-             * they overlap the local address */
-            if ((j + 1) < pg_size) {
-                recv_addr = alladdrs + unit_length *
-                    ((pg_rank + pg_size - j - 1) % pg_size);
-            } else if (j < pg_size) {
-                recv_addr = alladdrs + unit_length * pg_rank;
-            } else {
-                recv_addr = alladdrs + unit_length * pg_rank + QPLEN_XDR;
-            }
-
-            /* Fillup a recv descriptor */
-            rr.comp_type = VAPI_SIGNALED;
-            rr.opcode = VAPI_RECEIVE;
-            rr.id = j;
-            rr.sg_lst_len = 1;
-            rr.sg_lst_p = &(sg_entry_r);
-            sg_entry_r.lkey = mem_handle->lkey;
-            sg_entry_r.addr = (VAPI_virt_addr_t) (virt_addr_t) recv_addr;
-
-            if ((j + 1) >= pg_size)
-                sg_entry_r.len = (j + 2 - pg_size) * QPLEN_XDR;
-            else
-                sg_entry_r.len = unit_length;
-
-            /* Post the recv descriptor */
-            if (j < pg_size) {
-                ret =
-                    VAPI_post_rr(proc->nic[0], proc->boot_qp_hndl[0], &rr);
-            } else {
-                ret =
-                    VAPI_post_rr(proc->nic[0], proc->boot_qp_hndl[1], &rr);
-            }
-            CHECK_RETURN(ret, "Error posting recv!");
+        for (post_recv = 0;  post_recv < prepost; post_recv++) {
+            post_ring_recv(post_recv, pg_size, pg_rank, unit_length, alladdrs, mem_handle);
         }
 
         /* synchronize all the processes */
@@ -454,6 +467,11 @@
                     } else {
                         DEBUG_PRINT("expected message %d\n", rc.id);
                         recv_index = rc.id + 1;
+
+                        post_ring_recv(post_recv, pg_size, pg_rank, 
+                                unit_length, alladdrs, mem_handle);
+                        post_recv++;
+                        
                     }
                 } else if (ret == VAPI_EINVAL_HCA_HNDL
                            || ret == VAPI_EINVAL_CQ_HNDL) {


More information about the mvapich-discuss mailing list