[mvapich-discuss] "Too many open files" error
Lei Chai
chai.15 at osu.edu
Tue Mar 10 01:50:16 EDT 2009
Hi Mike,
Sorry to know you are having this problem. Could you try the following
things:
- Which Pallas version are you running? The latest one is IMB-3.2; could
you try it and see if the problem disappears?
- What are the "ulimit -n" and "cat /proc/sys/fs/file-max" outputs on
your system?
- Could you try the env variable VIADEV_USE_SHMEM_BCAST=0?
- If the problem is still there could you try the patch below?
Thanks,
Lei
===================================================================
--- mpid/ch_gen2/mpid_smpi.c 2009-03-10 03:16:35 UTC (rev 3233)
+++ mpid/ch_gen2/mpid_smpi.c 2009-03-10 05:30:15 UTC (rev 3234)
@@ -86,6 +86,8 @@
#define MPID_PROGRESSION_LOCK()
#define MPID_PROGRESSION_UNLOCK()
+unsigned int g_shmem_size = 0;
+unsigned int g_shmem_size_pool = 0;
int smp_eagersize = SMP_EAGERSIZE;
int smpi_length_queue = SMPI_LENGTH_QUEUE;
int expect_cancel_ack = 0;
@@ -759,9 +761,8 @@
void
smpi_init (void)
{
- unsigned int i, j, size, pool, pid, wait;
+ unsigned int i, j, pool, pid, wait;
int local_num, sh_size, pid_len, rq_len, param_len, limit_len;
- unsigned int size_pool;
struct stat file_status, file_status_pool;
char *shmem_file = NULL;
char *pool_file = NULL;
@@ -846,11 +847,11 @@
sh_size = sizeof(struct shared_mem) + pid_len + param_len + rq_len +
limit_len + SMPI_CACHE_LINE_SIZE * 4;
- size = (SMPI_CACHE_LINE_SIZE + sh_size + pagesize +
+ g_shmem_size = (SMPI_CACHE_LINE_SIZE + sh_size + pagesize +
(smpi.num_local_nodes * (smpi.num_local_nodes - 1) *
(SMPI_ALIGN (smpi_length_queue + pagesize))));
- size_pool =
+ g_shmem_size_pool =
SMPI_ALIGN (sizeof (SEND_BUF_T) * smp_num_send_buffer +
pagesize) * smpi.num_local_nodes + SMPI_CACHE_LINE_SIZE;
@@ -867,7 +868,7 @@
}
/* set file size, without touching pages */
- if (ftruncate (smpi.fd, size)) {
+ if (ftruncate (smpi.fd, g_shmem_size)) {
/* to clean up tmp shared file */
unlink (shmem_file);
error_abort_all (GEN_EXIT_ERR,
@@ -886,7 +887,7 @@
}
- if (ftruncate (smpi.fd_pool, size_pool)) {
+ if (ftruncate (smpi.fd_pool, g_shmem_size_pool)) {
/* to clean up tmp shared file */
unlink (pool_file);
error_abort_all (GEN_EXIT_ERR,
@@ -898,8 +899,8 @@
#ifndef _X86_64_
{
char *buf;
- buf = (char *) calloc (size + 1, sizeof (char));
- if (write (smpi.fd, buf, size) != size) {
+ buf = (char *) calloc (g_shmem_size + 1, sizeof (char));
+ if (write (smpi.fd, buf, g_shmem_size) != g_shmem_size) {
error_abort_all (GEN_EXIT_ERR,
"[%d] smpi_init:error in writing "
"shared memory file: %d\n",
@@ -910,8 +911,8 @@
{
char *buf;
- buf = (char *) calloc (size_pool + 1, sizeof (char));
- if (write (smpi.fd_pool, buf, size_pool) != size_pool) {
+ buf = (char *) calloc (g_shmem_size_pool + 1, sizeof (char));
+ if (write (smpi.fd_pool, buf, g_shmem_size_pool) != g_shmem_size_pool) {
error_abort_all (GEN_EXIT_ERR,
"[%d] smpi_init:error in writing "
"shared pool file: %d\n",
@@ -959,14 +960,14 @@
}
usleep (10);
}
- while (file_status.st_size != size ||
- file_status_pool.st_size != size_pool);
+ while (file_status.st_size != g_shmem_size ||
+ file_status_pool.st_size != g_shmem_size_pool);
smpi_shmem = (struct shared_mem *)malloc(sizeof(struct shared_mem));
smpi_malloc_assert(smpi_shmem, "smpi_init", "SMPI_SHMEM");
/* mmap of the shared memory file */
- smpi.mmap_ptr = mmap (0, size,
+ smpi.mmap_ptr = mmap (0, g_shmem_size,
(PROT_READ | PROT_WRITE), (MAP_SHARED), smpi.fd, 0);
if (smpi.mmap_ptr == (void *) -1) {
/* to clean up tmp shared file */
@@ -976,7 +977,7 @@
"shared memory: %d\n", MPID_MyWorldRank, errno);
}
- smpi.send_buf_pool_ptr = mmap (0, size_pool, (PROT_READ | PROT_WRITE),
+ smpi.send_buf_pool_ptr = mmap (0, g_shmem_size_pool, (PROT_READ | PROT_WRITE),
(MAP_SHARED), smpi.fd_pool, 0);
if (smpi.send_buf_pool_ptr == (void *) -1) {
@@ -1217,14 +1218,12 @@
MPID_SMP_Check_incoming ();
}
/* unmap the shared memory file */
- munmap (smpi.mmap_ptr, (SMPI_CACHE_LINE_SIZE +
- sizeof (struct shared_mem) +
- (smpi.num_local_nodes *
- (smpi.num_local_nodes -
- 1) * (smpi_length_queue +
- SMPI_CACHE_LINE_SIZE))));
-
+ munmap (smpi.mmap_ptr, g_shmem_size);
close (smpi.fd);
+
+ munmap(smpi.send_buf_pool_ptr, g_shmem_size_pool);
+ close(smpi.fd_pool);
+
smpi_send_fifo_ptr = smpi.send_fifo_head;
while (smpi_send_fifo_ptr) {
free (smpi_send_fifo_ptr);
===================================================
Mike Heinz wrote:
>
> Hey, we're QA testing a release of OFED 1.4, including MVAPICH, and
> the testers just ran into the following problem -- they're running
> Pallas across 44 nodes when, part way through the run, machines
> start failing with a "too many open files" error (see below).
>
> At first blush, this sounds like a ulimit problem, and I'm trying to
> get access to the failing machines to test that theory -- but is there
> some known condition where mvapich will leak file handles?
>
> [root at st28]# /usr/mpi/gcc/mvapich-1.1.0/bin/mpirun -np 44 -machinefile
>
> (prior test cases trimmed)
>
> #----------------------------------------------------------------
>
> # Benchmarking Bcast
>
> # ( #processes = 8 )
>
> # ( 36 additional processes waiting in MPI_Barrier)
>
> #----------------------------------------------------------------
>
> #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec]
>
> 0 1000 0.05 0.07 0.05
>
> 1 1000 8.70 8.71 8.71
>
> 2 1000 8.16 8.18 8.17
>
> 4 1000 8.17 8.19 8.18
>
> 8 1000 7.83 7.84 7.83
>
> 16 1000 8.08 8.10 8.09
>
> 32 1000 8.36 8.38 8.37
>
> 64 1000 8.28 8.30 8.29
>
> 128 1000 9.02 9.03 9.03
>
> 256 1000 9.33 9.35 9.34
>
> 512 1000 10.13 10.14 10.13
>
> 1024 1000 12.33 12.35 12.33
>
> 2048 1000 14.86 14.89 14.87
>
> 4096 1000 20.21 20.23 20.22
>
> 8192 1000 33.47 33.51 33.49
>
> 16384 1000 126.25 126.32 126.27
>
> open: Too many open files
>
> [5820] shmem_coll_init:error in opening shared memory file
>
> </tmp/ib_shmem_bcast_coll-5820-st28-0-1.tmp>: 24
>
> open: Too many open files
>
> [5820] shmem_coll_init:error in opening shared memory file
>
> </tmp/ib_shmem_bcast_coll-5820-st37-0-1.tmp>: 24
>
> open: Too many open files
>
> open: Too many open files
>
> open: Too many open files
>
> open: Too many open files
>
> [5820] shmem_coll_init:error in opening shared memory file
>
> </tmp/ib_shmem_bcast_coll-5820-st30-0-1.tmp>: 24
>
> open: Too many open files
>
> [5820] shmem_coll_init:error in opening shared memory file
>
> </tmp/ib_shmem_bcast_coll-5820-st46-0-1.tmp>: 24
>
> [0] shmem_coll_mmap:error in mmapping shared memory: 2
>
> open: Too many open files
>
> [5820] shmem_coll_init:error in opening shared memory file
>
> </tmp/ib_shmem_bcast_coll-5820-st47-0-1.tmp>: 24
>
> --
>
> Michael Heinz
>
> Principal Engineer, Qlogic Corporation
>
> King of Prussia, Pennsylvania
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> mvapich-discuss mailing list
> mvapich-discuss at cse.ohio-state.edu
> http://mail.cse.ohio-state.edu/mailman/listinfo/mvapich-discuss
>
More information about the mvapich-discuss
mailing list