Blame SOURCES/db-5.3.21-mutex_leak.patch

7809d0
diff -U 5 -r db-5.3.21.old/src/dbinc_auto/int_def.in db-5.3.21/src/dbinc_auto/int_def.in
7809d0
--- db-5.3.21.old/src/dbinc_auto/int_def.in	2012-05-12 01:57:53.000000000 +0800
7809d0
+++ db-5.3.21/src/dbinc_auto/int_def.in	2016-10-25 22:40:58.000000000 +0800
7809d0
@@ -1371,10 +1371,11 @@
7809d0
 #define	__memp_failchk __memp_failchk@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_bhwrite __memp_bhwrite@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_pgread __memp_pgread@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_pg __memp_pg@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_bhfree __memp_bhfree@DB_VERSION_UNIQUE_NAME@
7809d0
+#define	__memp_bh_clear_dirty __memp_bh_clear_dirty@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fget_pp __memp_fget_pp@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fget __memp_fget@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fcreate_pp __memp_fcreate_pp@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fcreate __memp_fcreate@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_set_clear_len __memp_set_clear_len@DB_VERSION_UNIQUE_NAME@
7809d0
@@ -1395,10 +1396,11 @@
7809d0
 #define	__memp_fopen __memp_fopen@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fclose_pp __memp_fclose_pp@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fclose __memp_fclose@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_mf_discard __memp_mf_discard@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_inmemlist __memp_inmemlist@DB_VERSION_UNIQUE_NAME@
7809d0
+#define	__memp_mf_mark_dead __memp_mf_mark_dead@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fput_pp __memp_fput_pp@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fput __memp_fput@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_unpin_buffers __memp_unpin_buffers@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_dirty __memp_dirty@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_shared __memp_shared@DB_VERSION_UNIQUE_NAME@
7809d0
@@ -1453,10 +1455,11 @@
7809d0
 #define	__memp_fsync_pp __memp_fsync_pp@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_fsync __memp_fsync@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__mp_xxx_fh __mp_xxx_fh@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_sync_int __memp_sync_int@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_mf_sync __memp_mf_sync@DB_VERSION_UNIQUE_NAME@
7809d0
+#define	__memp_purge_dead_files __memp_purge_dead_files@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__memp_trickle_pp __memp_trickle_pp@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__mutex_alloc __mutex_alloc@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__mutex_alloc_int __mutex_alloc_int@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__mutex_free __mutex_free@DB_VERSION_UNIQUE_NAME@
7809d0
 #define	__mutex_free_int __mutex_free_int@DB_VERSION_UNIQUE_NAME@
7809d0
diff -U 5 -r db-5.3.21.old/src/dbinc_auto/mp_ext.h db-5.3.21/src/dbinc_auto/mp_ext.h
7809d0
--- db-5.3.21.old/src/dbinc_auto/mp_ext.h	2012-05-12 01:57:53.000000000 +0800
7809d0
+++ db-5.3.21/src/dbinc_auto/mp_ext.h	2016-10-25 22:40:58.000000000 +0800
7809d0
@@ -14,10 +14,11 @@
7809d0
 int __memp_failchk __P((ENV *));
7809d0
 int __memp_bhwrite __P((DB_MPOOL *, DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
7809d0
 int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
7809d0
 int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int));
7809d0
 int __memp_bhfree __P((DB_MPOOL *, REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
7809d0
+void __memp_bh_clear_dirty __P((ENV*, DB_MPOOL_HASH *, BH *));
7809d0
 int __memp_fget_pp __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
7809d0
 int __memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
7809d0
 int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
7809d0
 int __memp_fcreate __P((ENV *, DB_MPOOLFILE **));
7809d0
 int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t));
7809d0
@@ -38,10 +39,11 @@
7809d0
 int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *, const char *, const char **, u_int32_t, int, size_t));
7809d0
 int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t));
7809d0
 int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
7809d0
 int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *, int));
7809d0
 int __memp_inmemlist __P((ENV *, char ***, int *));
7809d0
+void __memp_mf_mark_dead __P((DB_MPOOL *, MPOOLFILE *, int*));
7809d0
 int __memp_fput_pp __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
7809d0
 int __memp_fput __P((DB_MPOOLFILE *, DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
7809d0
 int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *));
7809d0
 int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t));
7809d0
 int __memp_shared __P((DB_MPOOLFILE *, void *));
7809d0
@@ -96,10 +98,11 @@
7809d0
 int __memp_fsync_pp __P((DB_MPOOLFILE *));
7809d0
 int __memp_fsync __P((DB_MPOOLFILE *));
7809d0
 int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
7809d0
 int __memp_sync_int __P((ENV *, DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
7809d0
 int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
7809d0
+int __memp_purge_dead_files __P((ENV *));
7809d0
 int __memp_trickle_pp __P((DB_ENV *, int, int *));
7809d0
 
7809d0
 #if defined(__cplusplus)
7809d0
 }
7809d0
 #endif
7809d0
diff -U 5 -r db-5.3.21.old/src/mp/mp_bh.c db-5.3.21/src/mp/mp_bh.c
7809d0
--- db-5.3.21.old/src/mp/mp_bh.c	2012-05-12 01:57:53.000000000 +0800
7809d0
+++ db-5.3.21/src/mp/mp_bh.c	2016-10-25 17:09:35.000000000 +0800
7809d0
@@ -472,15 +472,12 @@
7809d0
 	 * a shared latch.
7809d0
 	 */
7809d0
 	if (F_ISSET(bhp, BH_DIRTY | BH_TRASH)) {
7809d0
 		MUTEX_LOCK(env, hp->mtx_hash);
7809d0
 		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
7809d0
-		if (ret == 0 && F_ISSET(bhp, BH_DIRTY)) {
7809d0
-			F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
7809d0
-			DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
7809d0
-			atomic_dec(env, &hp->hash_page_dirty);
7809d0
-		}
7809d0
+		if (ret == 0)
7809d0
+			__memp_bh_clear_dirty(env, hp, bhp);
7809d0
 
7809d0
 		/* put the page back if necessary. */
7809d0
 		if ((ret != 0 || BH_REFCOUNT(bhp) > 1) &&
7809d0
 		    F_ISSET(bhp, BH_TRASH)) {
7809d0
 			ret = __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1);
7809d0
@@ -686,5 +683,31 @@
7809d0
 	} else
7809d0
 		MUTEX_UNLOCK(env, mfp->mutex);
7809d0
 
7809d0
 	return (ret);
7809d0
 }
7809d0
+
7809d0
+/*
7809d0
+ * __memp_bh_clear_dirty --
7809d0
+ *	Clear the dirty flag of a buffer. Calls on the same buffer must be
7809d0
+ *	serialized to get the accounting correct. This can be achieved by
7809d0
+ *	acquiring an exclusive lock on the buffer, a shared lock on the
7809d0
+ *	buffer plus an exclusive lock on the hash bucket, or some other
7809d0
+ *	mechanism that guarantees single-thread access to the entire region
7809d0
+ *	(e.g. during __memp_region_bhfree()).
7809d0
+ *
7809d0
+ * PUBLIC: void __memp_bh_clear_dirty __P((ENV*, DB_MPOOL_HASH *, BH *));
7809d0
+ */
7809d0
+void
7809d0
+__memp_bh_clear_dirty(env, hp, bhp)
7809d0
+	ENV *env;
7809d0
+	DB_MPOOL_HASH *hp;
7809d0
+	BH *bhp;
7809d0
+{
7809d0
+	COMPQUIET(env, env);
7809d0
+	if (F_ISSET(bhp, BH_DIRTY)) {
7809d0
+		F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
7809d0
+		DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
7809d0
+		(void)atomic_dec(env, &hp->hash_page_dirty);
7809d0
+	}
7809d0
+}
7809d0
+
7809d0
diff -U 5 -r db-5.3.21.old/src/mp/mp_fget.c db-5.3.21/src/mp/mp_fget.c
7809d0
--- db-5.3.21.old/src/mp/mp_fget.c	2012-05-12 01:57:53.000000000 +0800
7809d0
+++ db-5.3.21/src/mp/mp_fget.c	2016-10-25 17:11:08.000000000 +0800
7809d0
@@ -437,16 +437,11 @@
7809d0
 		 * complain and get out.
7809d0
 		 */
7809d0
 		if (flags == DB_MPOOL_FREE) {
7809d0
 freebuf:		MUTEX_LOCK(env, hp->mtx_hash);
7809d0
 			h_locked = 1;
7809d0
-			if (F_ISSET(bhp, BH_DIRTY)) {
7809d0
-				F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
7809d0
-				DB_ASSERT(env,
7809d0
-				   atomic_read(&hp->hash_page_dirty) > 0);
7809d0
-				atomic_dec(env, &hp->hash_page_dirty);
7809d0
-			}
7809d0
+			__memp_bh_clear_dirty(env, hp, bhp);
7809d0
 
7809d0
 			/*
7809d0
 			 * If the buffer we found is already freed, we're done.
7809d0
 			 * If the ref count is not 1 then someone may be
7809d0
 			 * peeking at the buffer.  We cannot free it until they
7809d0
diff -U 5 -r db-5.3.21.old/src/mp/mp_fopen.c db-5.3.21/src/mp/mp_fopen.c
7809d0
--- db-5.3.21.old/src/mp/mp_fopen.c	2012-05-12 01:57:53.000000000 +0800
7809d0
+++ db-5.3.21/src/mp/mp_fopen.c	2016-10-25 22:31:05.000000000 +0800
7809d0
@@ -12,10 +12,11 @@
7809d0
 #include "dbinc/log.h"
7809d0
 #include "dbinc/mp.h"
7809d0
 #include "dbinc/db_page.h"
7809d0
 #include "dbinc/hash.h"
7809d0
 
7809d0
+static int __memp_count_dead_mutex __P((DB_MPOOL *, u_int32_t *));
7809d0
 static int __memp_mpf_alloc __P((DB_MPOOL *,
7809d0
     DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **));
7809d0
 static int __memp_mpf_find __P((ENV *,
7809d0
     DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **));
7809d0
 
7809d0
@@ -709,11 +710,15 @@
7809d0
 		 * We should be able to set mfp to NULL and break out of the
7809d0
 		 * loop, but I like the idea of checking all the entries.
7809d0
 		 */
7809d0
 		if (LF_ISSET(DB_TRUNCATE)) {
7809d0
 			MUTEX_LOCK(env, mfp->mutex);
7809d0
-			mfp->deadfile = 1;
7809d0
+			/*
7809d0
+			 * We cannot purge dead files here, because the caller
7809d0
+			 * is holding the mutex of the hash bucket of mfp.
7809d0
+			 */
7809d0
+			__memp_mf_mark_dead(dbmp, mfp, NULL);
7809d0
 			MUTEX_UNLOCK(env, mfp->mutex);
7809d0
 			continue;
7809d0
 		}
7809d0
 
7809d0
 		/*
7809d0
@@ -907,14 +912,15 @@
7809d0
 	DB_MPOOL *dbmp;
7809d0
 	ENV *env;
7809d0
 	MPOOLFILE *mfp;
7809d0
 	char *rpath;
7809d0
 	u_int32_t ref;
7809d0
-	int deleted, ret, t_ret;
7809d0
+	int deleted, purge_dead, ret, t_ret;
7809d0
 
7809d0
 	env = dbmfp->env;
7809d0
 	dbmp = env->mp_handle;
7809d0
+	purge_dead = 0;
7809d0
 	ret = 0;
7809d0
 
7809d0
 	/*
7809d0
 	 * Remove the DB_MPOOLFILE from the process' list.
7809d0
 	 *
7809d0
@@ -1004,11 +1010,11 @@
7809d0
 	}
7809d0
 	DB_ASSERT(env, mfp->neutral_cnt < mfp->mpf_cnt);
7809d0
 	if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) {
7809d0
 		if (LF_ISSET(DB_MPOOL_DISCARD) ||
7809d0
 		    F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) {
7809d0
-			mfp->deadfile = 1;
7809d0
+			__memp_mf_mark_dead(dbmp, mfp, &purge_dead);
7809d0
 		}
7809d0
 		if (mfp->unlink_on_close) {
7809d0
 			if ((t_ret = __db_appname(dbmp->env, DB_APP_DATA,
7809d0
 			    R_ADDR(dbmp->reginfo, mfp->path_off), NULL,
7809d0
 			    &rpath)) != 0 && ret == 0)
7809d0
@@ -1037,10 +1043,12 @@
7809d0
 			deleted = 1;
7809d0
 		}
7809d0
 	}
7809d0
 	if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK))
7809d0
 		MUTEX_UNLOCK(env, mfp->mutex);
7809d0
+	if (purge_dead)
7809d0
+		(void)__memp_purge_dead_files(env);
7809d0
 
7809d0
 done:	/* Discard the DB_MPOOLFILE structure. */
7809d0
 	if (dbmfp->pgcookie != NULL) {
7809d0
 		__os_free(env, dbmfp->pgcookie->data);
7809d0
 		__os_free(env, dbmfp->pgcookie);
7809d0
@@ -1091,11 +1099,11 @@
7809d0
 	/*
7809d0
 	 * We have to release the MPOOLFILE mutex before acquiring the region
7809d0
 	 * mutex so we don't deadlock.  Make sure nobody ever looks at this
7809d0
 	 * structure again.
7809d0
 	 */
7809d0
-	mfp->deadfile = 1;
7809d0
+	__memp_mf_mark_dead(dbmp, mfp, NULL);
7809d0
 
7809d0
 	/* Discard the mutex we're holding and return it too the pool. */
7809d0
 	MUTEX_UNLOCK(env, mfp->mutex);
7809d0
 	if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0)
7809d0
 		ret = t_ret;
7809d0
@@ -1216,5 +1224,106 @@
7809d0
 	/* Make sure we don't return any garbage. */
7809d0
 	*cntp = 0;
7809d0
 	*namesp = NULL;
7809d0
 	return (ret);
7809d0
 }
7809d0
+
7809d0
+/*
7809d0
+ * __memp_mf_mark_dead --
7809d0
+ *	Mark an MPOOLFILE as dead because its contents are no longer necessary.
7809d0
+ *	This happens when removing, truncating, or closing an unnamed in-memory
7809d0
+ *	database. Return, in the purgep parameter, whether the caller should
7809d0
+ *	call __memp_purge_dead_files() after the lock on mfp is released. The
7809d0
+ *	caller must hold an exclusive lock on the mfp handle.
7809d0
+ *
7809d0
+ * PUBLIC: void __memp_mf_mark_dead __P((DB_MPOOL *, MPOOLFILE *, int*));
7809d0
+ */
7809d0
+void
7809d0
+__memp_mf_mark_dead(dbmp, mfp, purgep)
7809d0
+	DB_MPOOL *dbmp;	
7809d0
+	MPOOLFILE *mfp;
7809d0
+	int *purgep;
7809d0
+{
7809d0
+	ENV *env;
7809d0
+#ifdef HAVE_MUTEX_SUPPORT
7809d0
+	REGINFO *infop;
7809d0
+	DB_MUTEXREGION *mtxregion;
7809d0
+	u_int32_t mutex_max, mutex_inuse, dead_mutex;
7809d0
+#endif
7809d0
+
7809d0
+	if (purgep != NULL)
7809d0
+		*purgep = 0;
7809d0
+
7809d0
+	env = dbmp->env;
7809d0
+
7809d0
+#ifdef HAVE_MUTEX_SUPPORT
7809d0
+	MUTEX_REQUIRED(env, mfp->mutex);
7809d0
+
7809d0
+	if (MUTEX_ON(env) && mfp->deadfile == 0) {
7809d0
+		infop = &env->mutex_handle->reginfo;
7809d0
+		mtxregion = infop->primary;
7809d0
+
7809d0
+		mutex_inuse = mtxregion->stat.st_mutex_inuse;
7809d0
+		if ((mutex_max = env->dbenv->mutex_max) == 0)
7809d0
+			mutex_max = infop->rp->max / mtxregion->mutex_size;
7809d0
+
7809d0
+		/*
7809d0
+		 * Purging dead pages requires a full scan of the entire cache
7809d0
+		 * buffer, so it is a slow operation. We only want to do it
7809d0
+		 * when it is necessary and provides enough benefits. Below is
7809d0
+		 * a simple heuristic that determines when to purge all dead
7809d0
+		 * pages.
7809d0
+		 */
7809d0
+		if (purgep != NULL && mutex_inuse > mutex_max - 200) {
7809d0
+			/*
7809d0
+			 * If the mutex region is almost full and there are
7809d0
+			 * many mutexes held by dead files, purge dead files.
7809d0
+			 */
7809d0
+			(void)__memp_count_dead_mutex(dbmp, &dead_mutex);
7809d0
+			dead_mutex += mfp->block_cnt + 1;
7809d0
+
7809d0
+			if (dead_mutex > mutex_inuse / 20)
7809d0
+				*purgep = 1;
7809d0
+		}
7809d0
+	}
7809d0
+#endif
7809d0
+
7809d0
+	mfp->deadfile = 1;
7809d0
+}
7809d0
+
7809d0
+/*
7809d0
+ * __memp_count_dead_mutex --
7809d0
+ *	Estimate the number of mutexes held by dead files.
7809d0
+ */
7809d0
+static int
7809d0
+__memp_count_dead_mutex(dbmp, dead_mutex)
7809d0
+	DB_MPOOL *dbmp;
7809d0
+	u_int32_t *dead_mutex;
7809d0
+{
7809d0
+	ENV *env;
7809d0
+	DB_MPOOL_HASH *hp;
7809d0
+	MPOOL *mp;
7809d0
+	MPOOLFILE *mfp;
7809d0
+	u_int32_t mutex_per_file;
7809d0
+	int busy, i;
7809d0
+
7809d0
+	env = dbmp->env;
7809d0
+	*dead_mutex = 0;
7809d0
+	mutex_per_file = 1;
7809d0
+#ifndef HAVE_ATOMICFILEREAD
7809d0
+	mutex_per_file = 2;
7809d0
+#endif
7809d0
+	mp = dbmp->reginfo[0].primary;
7809d0
+	hp = R_ADDR(dbmp->reginfo, mp->ftab);
7809d0
+	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
7809d0
+		busy = MUTEX_TRYLOCK(env, hp->mtx_hash);
7809d0
+		if (busy)
7809d0
+			continue;
7809d0
+		SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
7809d0
+			if (mfp->deadfile)
7809d0
+				*dead_mutex += mfp->block_cnt + mutex_per_file;
7809d0
+		}
7809d0
+		MUTEX_UNLOCK(env, hp->mtx_hash);
7809d0
+	}
7809d0
+
7809d0
+	return (0);
7809d0
+}
7809d0
diff -U 5 -r db-5.3.21.old/src/mp/mp_method.c db-5.3.21/src/mp/mp_method.c
7809d0
--- db-5.3.21.old/src/mp/mp_method.c	2012-05-12 01:57:53.000000000 +0800
7809d0
+++ db-5.3.21/src/mp/mp_method.c	2016-10-25 17:22:23.000000000 +0800
7809d0
@@ -638,11 +638,11 @@
7809d0
 	DB_MPOOL_HASH *hp, *nhp;
7809d0
 	MPOOL *mp;
7809d0
 	MPOOLFILE *mfp;
7809d0
 	roff_t newname_off;
7809d0
 	u_int32_t bucket;
7809d0
-	int locked, ret;
7809d0
+	int locked, purge_dead, ret;
7809d0
 	size_t nlen;
7809d0
 	void *p;
7809d0
 
7809d0
 #undef	op_is_remove
7809d0
 #define	op_is_remove	(newname == NULL)
7809d0
@@ -655,10 +655,11 @@
7809d0
 	dbmp = NULL;
7809d0
 	mfp = NULL;
7809d0
 	nhp = NULL;
7809d0
 	p = NULL;
7809d0
 	locked = ret = 0;
7809d0
+	purge_dead = 0;
7809d0
 
7809d0
 	if (!MPOOL_ON(env))
7809d0
 		goto fsop;
7809d0
 
7809d0
 	dbmp = env->mp_handle;
7809d0
@@ -747,11 +748,11 @@
7809d0
 		 * they do not get reclaimed as long as they exist.  Since we
7809d0
 		 * are now deleting the database, we need to dec that count.
7809d0
 		 */
7809d0
 		if (mfp->no_backing_file)
7809d0
 			mfp->mpf_cnt--;
7809d0
-		mfp->deadfile = 1;
7809d0
+		__memp_mf_mark_dead(dbmp, mfp, &purge_dead);
7809d0
 		MUTEX_UNLOCK(env, mfp->mutex);
7809d0
 	} else {
7809d0
 		/*
7809d0
 		 * Else, it's a rename.  We've allocated memory for the new
7809d0
 		 * name.  Swap it with the old one.  If it's in memory we
7809d0
@@ -806,10 +807,16 @@
7809d0
 	if (locked == 1) {
7809d0
 		MUTEX_UNLOCK(env, hp->mtx_hash);
7809d0
 		if (nhp != NULL && nhp != hp)
7809d0
 			MUTEX_UNLOCK(env, nhp->mtx_hash);
7809d0
 	}
7809d0
+	/* 
7809d0
+	 * __memp_purge_dead_files() must be called when the hash bucket is
7809d0
+	 * unlocked.
7809d0
+	 */
7809d0
+	if (purge_dead)
7809d0
+		(void)__memp_purge_dead_files(env);
7809d0
 	return (ret);
7809d0
 }
7809d0
 
7809d0
 /*
7809d0
  * __memp_ftruncate __
7809d0
diff -U 5 -r db-5.3.21.old/src/mp/mp_sync.c db-5.3.21/src/mp/mp_sync.c
7809d0
--- db-5.3.21.old/src/mp/mp_sync.c	2012-05-12 01:57:53.000000000 +0800
7809d0
+++ db-5.3.21/src/mp/mp_sync.c	2016-10-25 17:26:58.000000000 +0800
7809d0
@@ -24,10 +24,11 @@
7809d0
 static int __bhcmp __P((const void *, const void *));
7809d0
 static int __memp_close_flush_files __P((ENV *, int));
7809d0
 static int __memp_sync_files __P((ENV *));
7809d0
 static int __memp_sync_file __P((ENV *,
7809d0
 		MPOOLFILE *, void *, u_int32_t *, u_int32_t));
7809d0
+static inline void __update_err_ret(int, int*);
7809d0
 
7809d0
 /*
7809d0
  * __memp_walk_files --
7809d0
  * PUBLIC: int __memp_walk_files __P((ENV *, MPOOL *,
7809d0
  * PUBLIC:	int (*) __P((ENV *, MPOOLFILE *, void *,
7809d0
@@ -961,5 +962,125 @@
7809d0
 		return (-1);
7809d0
 	if (bhp1->track_pgno > bhp2->track_pgno)
7809d0
 		return (1);
7809d0
 	return (0);
7809d0
 }
7809d0
+
7809d0
+/*
7809d0
+ * __memp_purge_dead_files --
7809d0
+ *	Remove all dead files and their buffers from the mpool. The caller
7809d0
+ *	cannot hold any lock on the dead MPOOLFILE handles, their buffers
7809d0
+ *	or their hash buckets.
7809d0
+ *
7809d0
+ * PUBLIC: int __memp_purge_dead_files __P((ENV *));
7809d0
+ */
7809d0
+int
7809d0
+__memp_purge_dead_files(env)
7809d0
+	ENV *env;
7809d0
+{
7809d0
+	BH *bhp;
7809d0
+	DB_MPOOL *dbmp;
7809d0
+	DB_MPOOL_HASH *hp, *hp_end;
7809d0
+	REGINFO *infop;
7809d0
+	MPOOL *c_mp, *mp;
7809d0
+	MPOOLFILE *mfp;
7809d0
+	u_int32_t i_cache;
7809d0
+	int ret, t_ret, h_lock;
7809d0
+
7809d0
+	if (!MPOOL_ON(env))
7809d0
+		return (0);
7809d0
+
7809d0
+	dbmp = env->mp_handle;
7809d0
+	mp = dbmp->reginfo[0].primary;
7809d0
+	ret = t_ret = h_lock = 0;
7809d0
+
7809d0
+	/*
7809d0
+	 * Walk each cache's list of buffers and free all buffers whose
7809d0
+	 * MPOOLFILE is marked as dead.
7809d0
+	 */
7809d0
+	for (i_cache = 0; i_cache < mp->nreg; i_cache++) {
7809d0
+		infop = &dbmp->reginfo[i_cache]; 
7809d0
+		c_mp = infop->primary;
7809d0
+
7809d0
+		hp = R_ADDR(infop, c_mp->htab);
7809d0
+		hp_end = &hp[c_mp->htab_buckets];
7809d0
+		for (; hp < hp_end; hp++) {
7809d0
+			/* Skip empty buckets. */
7809d0
+			if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
7809d0
+				continue;
7809d0
+
7809d0
+			/* 
7809d0
+			 * Search for a dead buffer. Other places that call
7809d0
+			 * __memp_bhfree() acquire the buffer lock before the
7809d0
+			 * hash bucket lock. Even though we acquire the two
7809d0
+			 * locks in reverse order, we cannot deadlock here
7809d0
+			 * because we don't block waiting for the locks.
7809d0
+			 */
7809d0
+			t_ret = MUTEX_TRYLOCK(env, hp->mtx_hash);
7809d0
+			if (t_ret != 0) {
7809d0
+				__update_err_ret(t_ret, &ret);
7809d0
+				continue;
7809d0
+			}
7809d0
+			h_lock = 1;
7809d0
+			SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
7809d0
+				/* Skip buffers that are being used. */
7809d0
+				if (BH_REFCOUNT(bhp) > 0)
7809d0
+					continue;
7809d0
+
7809d0
+				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
7809d0
+				if (!mfp->deadfile)
7809d0
+					continue;
7809d0
+
7809d0
+				/* Found a dead buffer. Prepare to free it. */
7809d0
+				t_ret = MUTEX_TRYLOCK(env, bhp->mtx_buf);
7809d0
+				if (t_ret != 0) {
7809d0
+					__update_err_ret(t_ret, &ret);
7809d0
+					continue;
7809d0
+				}
7809d0
+
7809d0
+				DB_ASSERT(env, (!F_ISSET(bhp, BH_EXCLUSIVE) &&
7809d0
+				    BH_REFCOUNT(bhp) == 0));
7809d0
+				F_SET(bhp, BH_EXCLUSIVE);
7809d0
+				(void)atomic_inc(env, &bhp->ref);
7809d0
+
7809d0
+				__memp_bh_clear_dirty(env, hp, bhp);
7809d0
+
7809d0
+				/*
7809d0
+				 * Free the buffer. The buffer and hash bucket
7809d0
+				 * are unlocked by __memp_bhfree.
7809d0
+				 */
7809d0
+				if ((t_ret = __memp_bhfree(dbmp, infop, mfp,
7809d0
+				    hp, bhp, BH_FREE_FREEMEM)) == 0)
7809d0
+					/*
7809d0
+					 * Decrement hp, so the next turn will
7809d0
+					 * search the same bucket again.
7809d0
+					 */
7809d0
+					hp--;
7809d0
+				else
7809d0
+					__update_err_ret(t_ret, &ret);
7809d0
+
7809d0
+				/*
7809d0
+				 * The hash bucket is unlocked, we need to
7809d0
+				 * start over again.
7809d0
+				 */
7809d0
+				h_lock = 0;
7809d0
+				break;
7809d0
+			}
7809d0
+
7809d0
+			if (h_lock) {
7809d0
+				MUTEX_UNLOCK(env, hp->mtx_hash);
7809d0
+				h_lock = 0;
7809d0
+			}
7809d0
+		}
7809d0
+	}
7809d0
+
7809d0
+	return (ret);
7809d0
+}
7809d0
+
7809d0
+static inline void
7809d0
+__update_err_ret(t_ret, retp)
7809d0
+	int t_ret;
7809d0
+	int *retp;
7809d0
+{
7809d0
+	if (t_ret != 0 && t_ret != DB_LOCK_NOTGRANTED && *retp == 0)
7809d0
+		*retp = t_ret;
7809d0
+}
7809d0
diff -U 5 -r db-5.3.21.old/src/mp/mp_trickle.c db-5.3.21/src/mp/mp_trickle.c
7809d0
--- db-5.3.21.old/src/mp/mp_trickle.c	2012-05-12 01:57:53.000000000 +0800
7809d0
+++ db-5.3.21/src/mp/mp_trickle.c	2016-10-25 17:27:57.000000000 +0800
7809d0
@@ -65,10 +65,14 @@
7809d0
 	    "DB_ENV->memp_trickle: %d: percent must be between 1 and 100",
7809d0
 		    "%d"), pct);
7809d0
 		return (EINVAL);
7809d0
 	}
7809d0
 
7809d0
+	/* First we purge all dead files and their buffers. */
7809d0
+	if ((ret = __memp_purge_dead_files(env)) != 0)
7809d0
+		return (ret);
7809d0
+
7809d0
 	/*
7809d0
 	 * Loop through the caches counting total/dirty buffers.
7809d0
 	 *
7809d0
 	 * XXX
7809d0
 	 * Using hash_page_dirty is our only choice at the moment, but it's not
7809d0
diff -U 5 -r db-5.3.21.old/src/mutex/mut_region.c db-5.3.21/src/mutex/mut_region.c
7809d0
--- db-5.3.21.old/src/mutex/mut_region.c	2012-05-12 01:57:54.000000000 +0800
7809d0
+++ db-5.3.21/src/mutex/mut_region.c	2016-10-25 17:34:22.000000000 +0800
7809d0
@@ -15,11 +15,11 @@
7809d0
 #include "dbinc/txn.h"
7809d0
 
7809d0
 static db_size_t __mutex_align_size __P((ENV *));
7809d0
 static int __mutex_region_init __P((ENV *, DB_MUTEXMGR *));
7809d0
 static size_t __mutex_region_size __P((ENV *));
7809d0
-static size_t __mutex_region_max __P((ENV *));
7809d0
+static size_t __mutex_region_max __P((ENV *, u_int32_t));
7809d0
 
7809d0
 /*
7809d0
  * __mutex_open --
7809d0
  *	Open a mutex region.
7809d0
  *
7809d0
@@ -32,11 +32,11 @@
7809d0
 {
7809d0
 	DB_ENV *dbenv;
7809d0
 	DB_MUTEXMGR *mtxmgr;
7809d0
 	DB_MUTEXREGION *mtxregion;
7809d0
 	size_t size;
7809d0
-	u_int32_t cpu_count;
7809d0
+	u_int32_t cpu_count, mutex_needed;
7809d0
 	int ret;
7809d0
 #ifndef HAVE_ATOMIC_SUPPORT
7809d0
 	u_int i;
7809d0
 #endif
7809d0
 
7809d0
@@ -59,23 +59,24 @@
7809d0
 		    cpu_count : cpu_count * MUTEX_SPINS_PER_PROCESSOR)) != 0)
7809d0
 			return (ret);
7809d0
 	}
7809d0
 
7809d0
 	/*
7809d0
-	 * If the user didn't set an absolute value on the number of mutexes
7809d0
-	 * we'll need, figure it out.  We're conservative in our allocation,
7809d0
-	 * we need mutexes for DB handles, group-commit queues and other things
7809d0
-	 * applications allocate at run-time.  The application may have kicked
7809d0
-	 * up our count to allocate its own mutexes, add that in.
7809d0
+	 * Figure out the number of mutexes we'll need.  We're conservative in
7809d0
+	 * our allocation, we need mutexes for DB handles, group-commit queues
7809d0
+	 * and other things applications allocate at run-time.  The application
7809d0
+	 * may have kicked up our count to allocate its own mutexes, add that
7809d0
+	 * in.
7809d0
 	 */
7809d0
+	mutex_needed =
7809d0
+	    __lock_region_mutex_count(env) +
7809d0
+	    __log_region_mutex_count(env) +
7809d0
+	    __memp_region_mutex_count(env) +
7809d0
+	    __txn_region_mutex_count(env);
7809d0
 	if (dbenv->mutex_cnt == 0 &&
7809d0
 	    F_ISSET(env, ENV_PRIVATE | ENV_THREAD) != ENV_PRIVATE)
7809d0
-		dbenv->mutex_cnt =
7809d0
-		    __lock_region_mutex_count(env) +
7809d0
-		    __log_region_mutex_count(env) +
7809d0
-		    __memp_region_mutex_count(env) +
7809d0
-		    __txn_region_mutex_count(env);
7809d0
+		dbenv->mutex_cnt = mutex_needed;
7809d0
 
7809d0
 	if (dbenv->mutex_max != 0 && dbenv->mutex_cnt > dbenv->mutex_max)
7809d0
 		dbenv->mutex_cnt = dbenv->mutex_max;
7809d0
 
7809d0
 	/* Create/initialize the mutex manager structure. */
7809d0
@@ -88,12 +89,12 @@
7809d0
 	mtxmgr->reginfo.id = INVALID_REGION_ID;
7809d0
 	mtxmgr->reginfo.flags = REGION_JOIN_OK;
7809d0
 	size = __mutex_region_size(env);
7809d0
 	if (create_ok)
7809d0
 		F_SET(&mtxmgr->reginfo, REGION_CREATE_OK);
7809d0
-	if ((ret = __env_region_attach(env,
7809d0
-	    &mtxmgr->reginfo, size, size + __mutex_region_max(env))) != 0)
7809d0
+	if ((ret = __env_region_attach(env, &mtxmgr->reginfo,
7809d0
+	    size, size + __mutex_region_max(env, mutex_needed))) != 0)
7809d0
 		goto err;
7809d0
 
7809d0
 	/* If we created the region, initialize it. */
7809d0
 	if (F_ISSET(&mtxmgr->reginfo, REGION_CREATE))
7809d0
 		if ((ret = __mutex_region_init(env, mtxmgr)) != 0)
7809d0
@@ -350,44 +351,62 @@
7809d0
 
7809d0
 	dbenv = env->dbenv;
7809d0
 
7809d0
 	s = sizeof(DB_MUTEXMGR) + 1024;
7809d0
 
7809d0
-	/* We discard one mutex for the OOB slot. */
7809d0
+	/* 
7809d0
+	 * We discard one mutex for the OOB slot. Make sure mutex_cnt doesn't
7809d0
+	 * overflow.
7809d0
+	 */
7809d0
 	s += __env_alloc_size(
7809d0
-	    (dbenv->mutex_cnt + 1) *__mutex_align_size(env));
7809d0
+	    (dbenv->mutex_cnt + (dbenv->mutex_cnt == UINT32_MAX ? 0 : 1)) *
7809d0
+	    __mutex_align_size(env));
7809d0
 
7809d0
 	return (s);
7809d0
 }
7809d0
 
7809d0
 /*
7809d0
  * __mutex_region_max --
7809d0
  *	 Return the amount of space needed to reach the maximum size.
7809d0
  */
7809d0
 static size_t
7809d0
-__mutex_region_max(env)
7809d0
+__mutex_region_max(env, mutex_needed)
7809d0
 	ENV *env;
7809d0
+	u_int32_t mutex_needed;
7809d0
 {
7809d0
 	DB_ENV *dbenv;
7809d0
-	u_int32_t max;
7809d0
+	u_int32_t max, mutex_cnt;
7809d0
 
7809d0
 	dbenv = env->dbenv;
7809d0
+	mutex_cnt = dbenv->mutex_cnt;
7809d0
 
7809d0
-	if ((max = dbenv->mutex_max) == 0) {
7809d0
+	/*
7809d0
+	 * We want to limit the region size to accommodate at most UINT32_MAX
7809d0
+	 * mutexes. If mutex_cnt is UINT32_MAX, no more space is allowed.
7809d0
+	 */
7809d0
+	if ((max = dbenv->mutex_max) == 0 && mutex_cnt != UINT32_MAX)
7809d0
 		if (F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE)
7809d0
-			max = dbenv->mutex_inc + 1;
7809d0
-		else
7809d0
+			if (dbenv->mutex_inc + 1 < UINT32_MAX - mutex_cnt)
7809d0
+				max = dbenv->mutex_inc + 1 + mutex_cnt;
7809d0
+			else
7809d0
+				max = UINT32_MAX;
7809d0
+		else {
7809d0
 			max = __lock_region_mutex_max(env) +
7809d0
 			    __txn_region_mutex_max(env) +
7809d0
 			    __log_region_mutex_max(env) +
7809d0
 			    dbenv->mutex_inc + 100;
7809d0
-	} else if (max <= dbenv->mutex_cnt)
7809d0
+			if (max < UINT32_MAX - mutex_needed)
7809d0
+				max += mutex_needed;
7809d0
+			else
7809d0
+				max = UINT32_MAX;
7809d0
+		}
7809d0
+
7809d0
+	if (max <= mutex_cnt)
7809d0
 		return (0);
7809d0
 	else
7809d0
-		max -= dbenv->mutex_cnt;
7809d0
-
7809d0
-	return ( __env_alloc_size(max * __mutex_align_size(env)));
7809d0
+		return (__env_alloc_size(
7809d0
+		    (max - mutex_cnt) * __mutex_align_size(env)));
7809d0
 }
7809d0
 
7809d0
 #ifdef	HAVE_MUTEX_SYSTEM_RESOURCES
7809d0
 /*
7809d0
  * __mutex_resource_return
7809d0