3604df
From 3f88e480aedac049b90d94321a5e0a0eddecef4c Mon Sep 17 00:00:00 2001
3604df
From: Milind Changire <mchangir@redhat.com>
3604df
Date: Fri, 2 Dec 2016 11:14:32 +0530
3604df
Subject: [PATCH 214/227] cluster/tier: handle fast demotions
3604df
3604df
Demote files on priority if hi-watermark has been breached and continue
3604df
to demote until the watermark drops below hi-watermark.
3604df
3604df
Monitor watermark more frequently.
3604df
Trigger demotion as soon as hi-watermark is breached.
3604df
Add cluster.tier-query-limit option to limit number
3604df
of files returned from the database query for every iteration of
3604df
tier_migrate_using_query_file(). If watermark hasn't dropped below
3604df
hi-watermark during the first iteration, the next iteration will be
3604df
triggered approximately 1 second after tier_demote() returns to the
3604df
main tiering loop.
3604df
Update changetimerecorder xlator to handle query for emergency demote
3604df
mode.
3604df
3604df
Add tier-ctr-interface.h:
3604df
Move tier and ctr interface specific macros and struct definition from
3604df
libglusterfs/src/gfdb/gfdb_data_store.h to new header
3604df
libglusterfs/src/tier-ctr-interface.h
3604df
3604df
Fix op-version for tier-query-limit option.
3604df
3604df
> Reviewed-on: http://review.gluster.org/15158
3604df
> Smoke: Gluster Build System <jenkins@build.gluster.org>
3604df
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
3604df
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
3604df
> Reviewed-by: Dan Lambright <dlambrig@redhat.com>
3604df
(cherry picked from commit 460016428cf27484c333227f534c2e2f73a37fb1)
3604df
3604df
> BUG: 1394482
3604df
> Reviewed-on: http://review.gluster.org/15835
3604df
> Smoke: Gluster Build System <jenkins@build.gluster.org>
3604df
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
3604df
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
3604df
> Reviewed-by: Dan Lambright <dlambrig@redhat.com>
3604df
(cherry picked from commit 1001a4f7705f991f3ae1611997cf01b341ac453a)
3604df
3604df
> BUG: 1394482
3604df
> Reviewed-on: http://review.gluster.org/16000
3604df
(merged commit from release-3.9 branch)
3604df
3604df
Change-Id: If56af78c6c81d37529b9b6e65ae606ba5c99a811
3604df
BUG: 1361759
3604df
Signed-off-by: Milind Changire <mchangir@redhat.com>
3604df
Reviewed-on: https://code.engineering.redhat.com/gerrit/91861
3604df
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
3604df
---
3604df
 libglusterfs/src/Makefile.am                       |  3 +-
3604df
 libglusterfs/src/gfdb/gfdb_data_store.c            |  8 +-
3604df
 libglusterfs/src/gfdb/gfdb_data_store.h            | 48 +++--------
3604df
 libglusterfs/src/gfdb/gfdb_data_store_types.h      |  8 +-
3604df
 libglusterfs/src/gfdb/gfdb_sqlite3.c               | 37 +++++++--
3604df
 libglusterfs/src/gfdb/gfdb_sqlite3.h               |  3 +-
3604df
 libglusterfs/src/globals.h                         |  4 +-
3604df
 libglusterfs/src/tier-ctr-interface.h              | 44 ++++++++++
3604df
 xlators/cluster/dht/src/dht-common.h               |  1 +
3604df
 xlators/cluster/dht/src/dht-shared.c               |  4 +
3604df
 xlators/cluster/dht/src/tier.c                     | 93 ++++++++++++++++++----
3604df
 xlators/cluster/dht/src/tier.h                     |  3 +-
3604df
 .../changetimerecorder/src/changetimerecorder.c    | 17 +++-
3604df
 xlators/mgmt/glusterd/src/glusterd-volume-set.c    | 15 +++-
3604df
 14 files changed, 218 insertions(+), 70 deletions(-)
3604df
 create mode 100644 libglusterfs/src/tier-ctr-interface.h
3604df
3604df
diff --git a/libglusterfs/src/Makefile.am b/libglusterfs/src/Makefile.am
3604df
index 849cb97..b16b87a 100644
3604df
--- a/libglusterfs/src/Makefile.am
3604df
+++ b/libglusterfs/src/Makefile.am
3604df
@@ -60,7 +60,8 @@ noinst_HEADERS = unittest/unittest.h \
3604df
 	$(CONTRIBDIR)/rbtree/rb.h \
3604df
 	$(CONTRIBDIR)/mount/mntent_compat.h \
3604df
 	$(CONTRIBDIR)/libexecinfo/execinfo_compat.h \
3604df
-	$(CONTRIBDIR)/timer-wheel/timer-wheel.h
3604df
+	$(CONTRIBDIR)/timer-wheel/timer-wheel.h \
3604df
+	tier-ctr-interface.h
3604df
 
3604df
 if !HAVE_LIBUUID
3604df
 # FIXME: unbundle libuuid, see compat-uuid.h.
3604df
diff --git a/libglusterfs/src/gfdb/gfdb_data_store.c b/libglusterfs/src/gfdb/gfdb_data_store.c
3604df
index 9c042f9..7b346ae 100644
3604df
--- a/libglusterfs/src/gfdb/gfdb_data_store.c
3604df
+++ b/libglusterfs/src/gfdb/gfdb_data_store.c
3604df
@@ -444,12 +444,14 @@ delete_record (gfdb_conn_node_t *_conn_node,
3604df
  *                        for every record found
3604df
  *      _query_cbk_args : Custom argument passed for the call back
3604df
  *                        function query_callback
3604df
+ *      query_limit     : number to limit number of rows returned by the query
3604df
  * Returns : if successful return 0 or
3604df
  *          -ve value in case of failure*/
3604df
 int
3604df
 find_all (gfdb_conn_node_t      *_conn_node,
3604df
           gf_query_callback_t   query_callback,
3604df
-          void                  *_query_cbk_args)
3604df
+          void                  *_query_cbk_args,
3604df
+          int                   query_limit)
3604df
 {
3604df
         int ret                                 = 0;
3604df
         gfdb_db_operations_t *db_operations_t   = NULL;
3604df
@@ -463,7 +465,8 @@ find_all (gfdb_conn_node_t      *_conn_node,
3604df
         if (db_operations_t->find_all_op) {
3604df
                 ret = db_operations_t->find_all_op (gf_db_connection,
3604df
                                                     query_callback,
3604df
-                                                    _query_cbk_args);
3604df
+                                                    _query_cbk_args,
3604df
+                                                    query_limit);
3604df
                 if (ret) {
3604df
                         gf_msg (GFDB_DATA_STORE, GF_LOG_ERROR, 0,
3604df
                                 LG_MSG_FIND_OP_FAILED, "Find all operation "
3604df
@@ -814,6 +817,7 @@ void get_gfdb_methods (gfdb_methods_t *methods)
3604df
 {
3604df
         methods->init_db = init_db;
3604df
         methods->fini_db = fini_db;
3604df
+        methods->find_all = find_all;
3604df
         methods->find_unchanged_for_time = find_unchanged_for_time;
3604df
         methods->find_recently_changed_files = find_recently_changed_files;
3604df
         methods->find_unchanged_for_time_freq = find_unchanged_for_time_freq;
3604df
diff --git a/libglusterfs/src/gfdb/gfdb_data_store.h b/libglusterfs/src/gfdb/gfdb_data_store.h
3604df
index eacb852..085df69 100644
3604df
--- a/libglusterfs/src/gfdb/gfdb_data_store.h
3604df
+++ b/libglusterfs/src/gfdb/gfdb_data_store.h
3604df
@@ -20,42 +20,6 @@
3604df
 
3604df
 #include "gfdb_data_store_types.h"
3604df
 
3604df
-#define GFDB_IPC_CTR_KEY "gfdb.ipc-ctr-op"
3604df
-
3604df
-/*
3604df
- * CTR IPC OPERATIONS
3604df
- *
3604df
- *
3604df
- */
3604df
-#define GFDB_IPC_CTR_QUERY_OPS "gfdb.ipc-ctr-query-op"
3604df
-#define GFDB_IPC_CTR_CLEAR_OPS "gfdb.ipc-ctr-clear-op"
3604df
-#define GFDB_IPC_CTR_GET_DB_PARAM_OPS "gfdb.ipc-ctr-get-db-parm"
3604df
-#define GFDB_IPC_CTR_GET_DB_VERSION_OPS "gfdb.ipc-ctr-get-db-version"
3604df
-
3604df
-/*
3604df
- * CTR IPC INPUT/OUTPUT
3604df
- *
3604df
- *
3604df
- */
3604df
-#define GFDB_IPC_CTR_GET_QFILE_PATH "gfdb.ipc-ctr-get-qfile-path"
3604df
-#define GFDB_IPC_CTR_GET_QUERY_PARAMS "gfdb.ipc-ctr-get-query-parms"
3604df
-#define GFDB_IPC_CTR_RET_QUERY_COUNT "gfdb.ipc-ctr-ret-rec-count"
3604df
-#define GFDB_IPC_CTR_GET_DB_KEY "gfdb.ipc-ctr-get-params-key"
3604df
-#define GFDB_IPC_CTR_RET_DB_VERSION "gfdb.ipc-ctr-ret-db-version"
3604df
-
3604df
-/*
3604df
- * gfdb ipc ctr params for query
3604df
- *
3604df
- *
3604df
- */
3604df
-typedef struct gfdb_ipc_ctr_params {
3604df
-        gf_boolean_t is_promote;
3604df
-        int write_freq_threshold;
3604df
-        int read_freq_threshold;
3604df
-        gfdb_time_t time_stamp;
3604df
-} gfdb_ipc_ctr_params_t;
3604df
-
3604df
-
3604df
 /* GFDB Connection Node:
3604df
  * ~~~~~~~~~~~~~~~~~~~~
3604df
  * Represents the connection to the database while using libgfdb
3604df
@@ -146,11 +110,20 @@ delete_record(gfdb_conn_node_t *, gfdb_db_record_t *gfdb_db_record);
3604df
  *                        for every record found
3604df
  *      _query_cbk_args : Custom argument passed for the call back
3604df
  *                        function query_callback
3604df
+ *      query_limit     : 0 - umlimited,
3604df
+ *                        any positive value - adds the LIMIT clause
3604df
+ *                        to the SQL query
3604df
+ *
3604df
  * Returns : if successful return 0 or
3604df
  *          -ve value in case of failure*/
3604df
 int find_all(gfdb_conn_node_t *, gf_query_callback_t query_callback,
3604df
-                void *_query_cbk_args);
3604df
+                void *_query_cbk_args,
3604df
+                int query_limit);
3604df
 
3604df
+typedef int (*find_all_t) (gfdb_conn_node_t *,
3604df
+                           gf_query_callback_t query_callback,
3604df
+                           void *_query_cbk_args,
3604df
+                           int query_limit);
3604df
 
3604df
 
3604df
 
3604df
@@ -353,6 +326,7 @@ typedef int (*set_db_params_t)(gfdb_conn_node_t *db_conn,
3604df
 typedef struct gfdb_methods_s {
3604df
         init_db_t                       init_db;
3604df
         fini_db_t                       fini_db;
3604df
+        find_all_t                      find_all;
3604df
         find_unchanged_for_time_t       find_unchanged_for_time;
3604df
         find_recently_changed_files_t   find_recently_changed_files;
3604df
         find_unchanged_for_time_freq_t  find_unchanged_for_time_freq;
3604df
diff --git a/libglusterfs/src/gfdb/gfdb_data_store_types.h b/libglusterfs/src/gfdb/gfdb_data_store_types.h
3604df
index 1acbdf2..5341f51 100644
3604df
--- a/libglusterfs/src/gfdb/gfdb_data_store_types.h
3604df
+++ b/libglusterfs/src/gfdb/gfdb_data_store_types.h
3604df
@@ -381,12 +381,18 @@ typedef int
3604df
  *                        for every record found
3604df
  *      _query_cbk_args : Custom argument passed for the call back
3604df
  *                        function query_callback
3604df
+ *      query_limit     : 0 - list all files
3604df
+ *                        positive value - add the LIMIT clause to
3604df
+ *                        the SQL query to limit the number of records
3604df
+ *                        returned
3604df
+ *
3604df
  * Returns : if successful return 0 or
3604df
  *          -ve value in case of failure*/
3604df
 typedef int
3604df
 (*gfdb_find_all_t)(void *db_conn,
3604df
                                gf_query_callback_t query_callback,
3604df
-                               void *_cbk_args);
3604df
+                               void *_cbk_args,
3604df
+                               int query_limit);
3604df
 
3604df
 
3604df
 
3604df
diff --git a/libglusterfs/src/gfdb/gfdb_sqlite3.c b/libglusterfs/src/gfdb/gfdb_sqlite3.c
3604df
index 04781be..ec7fe39 100644
3604df
--- a/libglusterfs/src/gfdb/gfdb_sqlite3.c
3604df
+++ b/libglusterfs/src/gfdb/gfdb_sqlite3.c
3604df
@@ -631,12 +631,15 @@ gf_get_basic_query_stmt (char **out_stmt)
3604df
  * */
3604df
 int
3604df
 gf_sqlite3_find_all (void *db_conn, gf_query_callback_t query_callback,
3604df
-                        void *query_cbk_args)
3604df
+                        void *query_cbk_args,
3604df
+                        int query_limit)
3604df
 {
3604df
         int ret                                 =       -1;
3604df
         char *query_str                         =       NULL;
3604df
         gf_sql_connection_t *sql_conn           =       db_conn;
3604df
         sqlite3_stmt *prep_stmt                 =       NULL;
3604df
+        char *limit_query                       =       NULL;
3604df
+        char *query                             =       NULL;
3604df
 
3604df
         CHECK_SQL_CONN (sql_conn, out);
3604df
         GF_VALIDATE_OR_GOTO(GFDB_STR_SQLITE3, query_callback, out);
3604df
@@ -646,12 +649,28 @@ gf_sqlite3_find_all (void *db_conn, gf_query_callback_t query_callback,
3604df
                 goto out;
3604df
         }
3604df
 
3604df
-        ret = sqlite3_prepare (sql_conn->sqlite3_db_conn, query_str, -1,
3604df
+        query = query_str;
3604df
+
3604df
+        if (query_limit > 0) {
3604df
+                ret = gf_asprintf (&limit_query, "%s LIMIT %d",
3604df
+                                   query, query_limit);
3604df
+                if (ret < 0) {
3604df
+                        gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0,
3604df
+                                LG_MSG_QUERY_FAILED,
3604df
+                                "Failed creating limit query statement");
3604df
+                        limit_query = NULL;
3604df
+                        goto out;
3604df
+                }
3604df
+
3604df
+                query = limit_query;
3604df
+        }
3604df
+
3604df
+        ret = sqlite3_prepare (sql_conn->sqlite3_db_conn, query, -1,
3604df
                                 &prep_stmt, 0);
3604df
         if (ret != SQLITE_OK) {
3604df
                 gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0,
3604df
-                        LG_MSG_PREPARE_FAILED, "Failed to prepare statement %s :"
3604df
-                        "%s", query_str,
3604df
+                        LG_MSG_PREPARE_FAILED,
3604df
+                        "Failed to prepare statement %s: %s", query,
3604df
                         sqlite3_errmsg (sql_conn->sqlite3_db_conn));
3604df
                 ret = -1;
3604df
                 goto out;
3604df
@@ -660,7 +679,7 @@ gf_sqlite3_find_all (void *db_conn, gf_query_callback_t query_callback,
3604df
         ret = gf_sql_query_function (prep_stmt, query_callback, query_cbk_args);
3604df
         if (ret) {
3604df
                 gf_msg (GFDB_STR_SQLITE3, GF_LOG_ERROR, 0, LG_MSG_QUERY_FAILED,
3604df
-                        "Failed Query %s", query_str);
3604df
+                        "Failed Query %s", query);
3604df
                 goto out;
3604df
         }
3604df
 
3604df
@@ -668,6 +687,10 @@ gf_sqlite3_find_all (void *db_conn, gf_query_callback_t query_callback,
3604df
 out:
3604df
         sqlite3_finalize (prep_stmt);
3604df
         GF_FREE (query_str);
3604df
+
3604df
+        if (limit_query)
3604df
+                GF_FREE (limit_query);
3604df
+
3604df
         return ret;
3604df
 }
3604df
 
3604df
@@ -1069,10 +1092,10 @@ gf_sqlite3_find_unchanged_for_time_freq (void *db_conn,
3604df
                 GF_COL_TB_WMSEC ") >= ? ) ) )"
3604df
                 " AND "
3604df
                 /*Second condition: For Reads
3604df
-                 * Files that have reaASCd wind time smaller than for_time
3604df
+                 * Files that have read wind time smaller than for_time
3604df
                  * OR
3604df
                  * File that have read wind time greater than for_time,
3604df
-                 * but write_frequency less than freq_write_cnt*/
3604df
+                 * but read_frequency less than freq_read_cnt*/
3604df
                 "( ((" GF_COL_TB_RWSEC " * " TOSTRING(GFDB_MICROSEC) " + "
3604df
                 GF_COL_TB_RWMSEC ") < ? )"
3604df
                 " OR "
3604df
diff --git a/libglusterfs/src/gfdb/gfdb_sqlite3.h b/libglusterfs/src/gfdb/gfdb_sqlite3.h
3604df
index e69251c..96301a4 100644
3604df
--- a/libglusterfs/src/gfdb/gfdb_sqlite3.h
3604df
+++ b/libglusterfs/src/gfdb/gfdb_sqlite3.h
3604df
@@ -254,7 +254,8 @@ int gf_sqlite3_delete (void *db_conn, gfdb_db_record_t *);
3604df
 
3604df
 /*querying modules*/
3604df
 int gf_sqlite3_find_all (void *db_conn, gf_query_callback_t,
3604df
-                        void *_query_cbk_args);
3604df
+                        void *_query_cbk_args,
3604df
+                        int query_limit);
3604df
 int gf_sqlite3_find_unchanged_for_time (void *db_conn,
3604df
                                         gf_query_callback_t query_callback,
3604df
                                         void *_query_cbk_args,
3604df
diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h
3604df
index b12b0ad..f6164c6 100644
3604df
--- a/libglusterfs/src/globals.h
3604df
+++ b/libglusterfs/src/globals.h
3604df
@@ -43,7 +43,7 @@
3604df
  */
3604df
 #define GD_OP_VERSION_MIN  1 /* MIN is the fresh start op-version, mostly
3604df
                                 should not change */
3604df
-#define GD_OP_VERSION_MAX  GD_OP_VERSION_3_9_0 /* MAX VERSION is the maximum
3604df
+#define GD_OP_VERSION_MAX  GD_OP_VERSION_3_9_1 /* MAX VERSION is the maximum
3604df
                                                   count in VME table, should
3604df
                                                   keep changing with
3604df
                                                   introduction of newer
3604df
@@ -83,6 +83,8 @@
3604df
 
3604df
 #define GD_OP_VERSION_3_9_0    30900 /* Op-version for GlusterFS 3.9.0 */
3604df
 
3604df
+#define GD_OP_VERSION_3_9_1    30901 /* Op-version for GlusterFS 3.9.1 */
3604df
+
3604df
 #include "xlator.h"
3604df
 
3604df
 /* THIS */
3604df
diff --git a/libglusterfs/src/tier-ctr-interface.h b/libglusterfs/src/tier-ctr-interface.h
3604df
new file mode 100644
3604df
index 0000000..cfd3f8a
3604df
--- /dev/null
3604df
+++ b/libglusterfs/src/tier-ctr-interface.h
3604df
@@ -0,0 +1,44 @@
3604df
+#ifndef _TIER_CTR_INTERFACE_H_
3604df
+#define _TIER_CTR_INTERFACE_H_
3604df
+
3604df
+#include "common-utils.h"
3604df
+#include "gfdb_data_store_types.h"
3604df
+
3604df
+#define GFDB_IPC_CTR_KEY "gfdb.ipc-ctr-op"
3604df
+
3604df
+/*
3604df
+ * CTR IPC OPERATIONS
3604df
+ *
3604df
+ *
3604df
+ */
3604df
+#define GFDB_IPC_CTR_QUERY_OPS "gfdb.ipc-ctr-query-op"
3604df
+#define GFDB_IPC_CTR_CLEAR_OPS "gfdb.ipc-ctr-clear-op"
3604df
+#define GFDB_IPC_CTR_GET_DB_PARAM_OPS "gfdb.ipc-ctr-get-db-parm"
3604df
+#define GFDB_IPC_CTR_GET_DB_VERSION_OPS "gfdb.ipc-ctr-get-db-version"
3604df
+#define GFDB_IPC_CTR_SET_COMPACT_PRAGMA "gfdb.ipc-ctr-set-compact-pragma"
3604df
+/*
3604df
+ * CTR IPC INPUT/OUTPUT
3604df
+ *
3604df
+ *
3604df
+ */
3604df
+#define GFDB_IPC_CTR_GET_QFILE_PATH "gfdb.ipc-ctr-get-qfile-path"
3604df
+#define GFDB_IPC_CTR_GET_QUERY_PARAMS "gfdb.ipc-ctr-get-query-parms"
3604df
+#define GFDB_IPC_CTR_RET_QUERY_COUNT "gfdb.ipc-ctr-ret-rec-count"
3604df
+#define GFDB_IPC_CTR_GET_DB_KEY "gfdb.ipc-ctr-get-params-key"
3604df
+#define GFDB_IPC_CTR_RET_DB_VERSION "gfdb.ipc-ctr-ret-db-version"
3604df
+
3604df
+/*
3604df
+ * gfdb ipc ctr params for query
3604df
+ *
3604df
+ *
3604df
+ */
3604df
+typedef struct gfdb_ipc_ctr_params {
3604df
+        gf_boolean_t is_promote;
3604df
+        int write_freq_threshold;
3604df
+        int read_freq_threshold;
3604df
+        gfdb_time_t time_stamp;
3604df
+        int query_limit;
3604df
+        gf_boolean_t emergency_demote;
3604df
+} gfdb_ipc_ctr_params_t;
3604df
+
3604df
+#endif
3604df
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
3604df
index a4285c5..227dc08 100644
3604df
--- a/xlators/cluster/dht/src/dht-common.h
3604df
+++ b/xlators/cluster/dht/src/dht-common.h
3604df
@@ -384,6 +384,7 @@ typedef struct gf_tier_conf {
3604df
         int                          percent_full;
3604df
         uint64_t                     max_migrate_bytes;
3604df
         int                          max_migrate_files;
3604df
+        int                          query_limit;
3604df
         tier_mode_t                  mode;
3604df
         int                          tier_max_promote_size;
3604df
         int                          tier_promote_frequency;
3604df
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
3604df
index 0fea1d5..48ec9ff 100644
3604df
--- a/xlators/cluster/dht/src/dht-shared.c
3604df
+++ b/xlators/cluster/dht/src/dht-shared.c
3604df
@@ -1050,6 +1050,10 @@ struct volume_options options[] = {
3604df
           .type = GF_OPTION_TYPE_INT,
3604df
           .default_value = "10000",
3604df
         },
3604df
+        { .key         = {"tier-query-limit"},
3604df
+          .type = GF_OPTION_TYPE_INT,
3604df
+          .default_value = "100",
3604df
+        },
3604df
         /* switch option */
3604df
         { .key  = {"pattern.switch.case"},
3604df
           .type = GF_OPTION_TYPE_ANY
3604df
diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
3604df
index 83903e1..abd8925 100644
3604df
--- a/xlators/cluster/dht/src/tier.c
3604df
+++ b/xlators/cluster/dht/src/tier.c
3604df
@@ -15,6 +15,7 @@
3604df
 #include "tier-common.h"
3604df
 #include "syscall.h"
3604df
 #include "events.h"
3604df
+#include "tier-ctr-interface.h"
3604df
 
3604df
 /*Hard coded DB info*/
3604df
 static gfdb_db_type_t dht_tier_db_type = GFDB_SQLITE3;
3604df
@@ -193,6 +194,7 @@ out:
3604df
 
3604df
 /* Check and update the watermark every WM_INTERVAL seconds */
3604df
 #define WM_INTERVAL            5
3604df
+#define WM_INTERVAL_EMERG      1
3604df
 
3604df
 static int
3604df
 tier_check_same_node (xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
3604df
@@ -524,7 +526,7 @@ tier_can_promote_file (xlator_t *this, char const *file_name,
3604df
                                 defrag->tier_conf.blocks_used;
3604df
 
3604df
         /* test if the estimated block usage goes above HI watermark */
3604df
-        if (GF_PERCENTAGE (estimated_usage, defrag->tier_conf.blocks_total) >
3604df
+        if (GF_PERCENTAGE (estimated_usage, defrag->tier_conf.blocks_total) >=
3604df
                         defrag->tier_conf.watermark_hi) {
3604df
                 gf_msg (this->name, GF_LOG_INFO, 0,
3604df
                         DHT_MSG_LOG_TIER_STATUS,
3604df
@@ -576,6 +578,7 @@ tier_migrate_using_query_file (void *_args)
3604df
         gfdb_time_t  current_time               = { 0 };
3604df
         int total_time                          = 0;
3604df
         int max_time                            = 0;
3604df
+        gf_boolean_t emergency_demote_mode      = _gf_false;
3604df
 
3604df
 
3604df
         GF_VALIDATE_OR_GOTO ("tier", query_cbk_args, out);
3604df
@@ -592,6 +595,9 @@ tier_migrate_using_query_file (void *_args)
3604df
         if (!migrate_data)
3604df
                 goto out;
3604df
 
3604df
+        emergency_demote_mode = (!query_cbk_args->is_promotion &&
3604df
+                                 is_hot_tier_full(&defrag->tier_conf));
3604df
+
3604df
         xdata_request = dict_new ();
3604df
         if (!xdata_request) {
3604df
                 gf_msg (this->name, GF_LOG_ERROR, 0,
3604df
@@ -1014,6 +1020,18 @@ per_file_out:
3604df
 
3604df
                 gfdb_methods.gfdb_query_record_free (query_record);
3604df
                 query_record = NULL;
3604df
+
3604df
+                /* If we are demoting and the entry watermark was HI, then
3604df
+                 * we are done with emergency demotions if the current
3604df
+                 * watermark has fallen below hi-watermark level
3604df
+                 */
3604df
+                if (emergency_demote_mode) {
3604df
+                        if (tier_check_watermark (this) == 0) {
3604df
+                                if (!is_hot_tier_full (&defrag->tier_conf)) {
3604df
+                                        break;
3604df
+                                }
3604df
+                        }
3604df
+                }
3604df
         }
3604df
 
3604df
 out:
3604df
@@ -1127,14 +1145,23 @@ tier_process_self_query (tier_brick_list_t *local_brick, void *args)
3604df
                 goto out;
3604df
         }
3604df
         if (!gfdb_brick_info->_gfdb_promote) {
3604df
-                if (query_cbk_args->defrag->write_freq_threshold == 0 &&
3604df
-                        query_cbk_args->defrag->read_freq_threshold == 0) {
3604df
-                                ret = gfdb_methods.find_unchanged_for_time (
3604df
-                                        conn_node,
3604df
-                                        tier_gf_query_callback,
3604df
-                                        (void *)query_cbk_args,
3604df
-                                        gfdb_brick_info->time_stamp);
3604df
+                if (query_cbk_args->defrag->tier_conf.watermark_last ==
3604df
+                        TIER_WM_HI) {
3604df
+                        /* emergency demotion mode */
3604df
+                        ret = gfdb_methods.find_all (conn_node,
3604df
+                                tier_gf_query_callback,
3604df
+                                (void *)query_cbk_args,
3604df
+                                query_cbk_args->defrag->tier_conf.
3604df
+                                        query_limit);
3604df
                 } else {
3604df
+                        if (query_cbk_args->defrag->write_freq_threshold == 0 &&
3604df
+                            query_cbk_args->defrag->read_freq_threshold == 0) {
3604df
+                                ret = gfdb_methods.find_unchanged_for_time (
3604df
+                                                conn_node,
3604df
+                                                tier_gf_query_callback,
3604df
+                                                (void *)query_cbk_args,
3604df
+                                                gfdb_brick_info->time_stamp);
3604df
+                        } else {
3604df
                                 ret = gfdb_methods.find_unchanged_for_time_freq (
3604df
                                         conn_node,
3604df
                                         tier_gf_query_callback,
3604df
@@ -1145,6 +1172,7 @@ tier_process_self_query (tier_brick_list_t *local_brick, void *args)
3604df
                                         query_cbk_args->defrag->
3604df
                                                         read_freq_threshold,
3604df
                                         _gf_false);
3604df
+                        }
3604df
                 }
3604df
         } else {
3604df
                 if (query_cbk_args->defrag->write_freq_threshold == 0 &&
3604df
@@ -1160,8 +1188,7 @@ tier_process_self_query (tier_brick_list_t *local_brick, void *args)
3604df
                                 tier_gf_query_callback,
3604df
                                 (void *)query_cbk_args,
3604df
                                 gfdb_brick_info->time_stamp,
3604df
-                                query_cbk_args->defrag->
3604df
-                                write_freq_threshold,
3604df
+                                query_cbk_args->defrag->write_freq_threshold,
3604df
                                 query_cbk_args->defrag->read_freq_threshold,
3604df
                                 _gf_false);
3604df
                 }
3604df
@@ -1268,10 +1295,21 @@ tier_process_ctr_query (tier_brick_list_t *local_brick, void *args)
3604df
 
3604df
         /* set all the query params*/
3604df
         ipc_ctr_params->is_promote = gfdb_brick_info->_gfdb_promote;
3604df
-        ipc_ctr_params->write_freq_threshold = query_cbk_args->
3604df
-                                                defrag->write_freq_threshold;
3604df
-        ipc_ctr_params->read_freq_threshold = query_cbk_args->
3604df
-                                                defrag->read_freq_threshold;
3604df
+
3604df
+        ipc_ctr_params->write_freq_threshold =
3604df
+                query_cbk_args->defrag->write_freq_threshold;
3604df
+
3604df
+        ipc_ctr_params->read_freq_threshold =
3604df
+                query_cbk_args->defrag->read_freq_threshold;
3604df
+
3604df
+        ipc_ctr_params->query_limit =
3604df
+                query_cbk_args->defrag->tier_conf.query_limit;
3604df
+
3604df
+        ipc_ctr_params->emergency_demote =
3604df
+                (!gfdb_brick_info->_gfdb_promote &&
3604df
+                 query_cbk_args->defrag->tier_conf.watermark_last ==
3604df
+                        TIER_WM_HI);
3604df
+
3604df
         memcpy (&ipc_ctr_params->time_stamp,
3604df
                 gfdb_brick_info->time_stamp,
3604df
                 sizeof (gfdb_time_t));
3604df
@@ -1860,6 +1898,15 @@ out:
3604df
         return;
3604df
 }
3604df
 
3604df
+static int
3604df
+tier_get_wm_interval(tier_mode_t mode, tier_watermark_op_t wm)
3604df
+{
3604df
+        if (mode == TIER_MODE_WM && wm == TIER_WM_HI)
3604df
+                return WM_INTERVAL_EMERG;
3604df
+
3604df
+        return WM_INTERVAL;
3604df
+}
3604df
+
3604df
 /*
3604df
  * Main tiering loop. This is called from the promotion and the
3604df
  * demotion threads spawned in tier_start().
3604df
@@ -1968,7 +2015,10 @@ static void
3604df
 
3604df
                 check_watermark++;
3604df
 
3604df
-                if (check_watermark >= WM_INTERVAL) {
3604df
+                /* emergency demotion requires frequent watermark monitoring */
3604df
+                if (check_watermark >=
3604df
+                        tier_get_wm_interval(tier_conf->mode,
3604df
+                                             tier_conf->watermark_last)) {
3604df
                         check_watermark = 0;
3604df
                         if (tier_conf->mode == TIER_MODE_WM) {
3604df
                                 ret = tier_get_fs_stat (this, &root_loc);
3604df
@@ -2396,6 +2446,15 @@ tier_init (xlator_t *this)
3604df
 
3604df
         defrag->tier_conf.max_migrate_files = freq;
3604df
 
3604df
+
3604df
+        ret = dict_get_int32 (this->options,
3604df
+                              "tier-query-limit",
3604df
+                              &(defrag->tier_conf.query_limit));
3604df
+        if (ret) {
3604df
+                defrag->tier_conf.query_limit =
3604df
+                        DEFAULT_TIER_QUERY_LIMIT;
3604df
+        }
3604df
+
3604df
         ret = dict_get_str (this->options,
3604df
                             "tier-mode", &mode);
3604df
         if (ret) {
3604df
@@ -2564,6 +2623,10 @@ tier_reconfigure (xlator_t *this, dict_t *options)
3604df
                                   defrag->tier_conf.max_migrate_files, options,
3604df
                                   int32, out);
3604df
 
3604df
+                GF_OPTION_RECONF ("tier-query-limit",
3604df
+                                  defrag->tier_conf.query_limit,
3604df
+                                  options, int32, out);
3604df
+
3604df
                 GF_OPTION_RECONF ("tier-pause",
3604df
                                   req_pause, options,
3604df
                                   bool, out);
3604df
diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h
3604df
index 0807608..5745609 100644
3604df
--- a/xlators/cluster/dht/src/tier.h
3604df
+++ b/xlators/cluster/dht/src/tier.h
3604df
@@ -93,7 +93,7 @@ typedef enum tier_watermark_op_ {
3604df
 
3604df
 #define DEFAULT_PROMOTE_FREQ_SEC       120
3604df
 #define DEFAULT_DEMOTE_FREQ_SEC        120
3604df
-#define DEFAULT_DEMOTE_DEGRADED        10
3604df
+#define DEFAULT_DEMOTE_DEGRADED        1
3604df
 #define DEFAULT_WRITE_FREQ_SEC         0
3604df
 #define DEFAULT_READ_FREQ_SEC          0
3604df
 #define DEFAULT_WM_LOW                 75
3604df
@@ -101,5 +101,6 @@ typedef enum tier_watermark_op_ {
3604df
 #define DEFAULT_TIER_MODE              TIER_MODE_TEST
3604df
 #define DEFAULT_TIER_MAX_MIGRATE_MB    1000
3604df
 #define DEFAULT_TIER_MAX_MIGRATE_FILES 5000
3604df
+#define DEFAULT_TIER_QUERY_LIMIT       100
3604df
 
3604df
 #endif
3604df
diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.c b/xlators/features/changetimerecorder/src/changetimerecorder.c
3604df
index 3d2e78a..dba4265 100644
3604df
--- a/xlators/features/changetimerecorder/src/changetimerecorder.c
3604df
+++ b/xlators/features/changetimerecorder/src/changetimerecorder.c
3604df
@@ -15,6 +15,9 @@
3604df
 #include "ctr-messages.h"
3604df
 #include "syscall.h"
3604df
 
3604df
+#include "changetimerecorder.h"
3604df
+#include "tier-ctr-interface.h"
3604df
+
3604df
 /*******************************inode forget***********************************/
3604df
 
3604df
 int
3604df
@@ -1724,14 +1727,21 @@ ctr_db_query (xlator_t *this,
3604df
                 goto out;
3604df
         }
3604df
         if (!ipc_ctr_params->is_promote) {
3604df
-                if (ipc_ctr_params->write_freq_threshold == 0 &&
3604df
-                        ipc_ctr_params->read_freq_threshold == 0) {
3604df
+                if (ipc_ctr_params->emergency_demote) {
3604df
+                        /* emergency demotion mode */
3604df
+                        ret = find_all (conn_node,
3604df
+                                ctr_db_query_callback,
3604df
+                                (void *)&query_cbk_args,
3604df
+                                ipc_ctr_params->query_limit);
3604df
+                } else {
3604df
+                        if (ipc_ctr_params->write_freq_threshold == 0 &&
3604df
+                                ipc_ctr_params->read_freq_threshold == 0) {
3604df
                                 ret = find_unchanged_for_time (
3604df
                                         conn_node,
3604df
                                         ctr_db_query_callback,
3604df
                                         (void *)&query_cbk_args,
3604df
                                         &ipc_ctr_params->time_stamp);
3604df
-                } else {
3604df
+                        } else {
3604df
                                 ret = find_unchanged_for_time_freq (
3604df
                                         conn_node,
3604df
                                         ctr_db_query_callback,
3604df
@@ -1740,6 +1750,7 @@ ctr_db_query (xlator_t *this,
3604df
                                         ipc_ctr_params->write_freq_threshold,
3604df
                                         ipc_ctr_params->read_freq_threshold,
3604df
                                         _gf_false);
3604df
+                        }
3604df
                 }
3604df
         } else {
3604df
                 if (ipc_ctr_params->write_freq_threshold == 0 &&
3604df
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
3604df
index a293021..93ed1c8 100644
3604df
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
3604df
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
3604df
@@ -454,7 +454,8 @@ validate_tier (glusterd_volinfo_t *volinfo, dict_t *dict, char *key,
3604df
                    strstr (key, "tier-max-mb") ||
3604df
                    strstr (key, "tier-max-promote-file-size") ||
3604df
                    strstr (key, "tier-max-files") ||
3604df
-                   strstr (key, "tier-demote-frequency")) {
3604df
+                   strstr (key, "tier-demote-frequency") ||
3604df
+                   strstr (key, "tier-query-limit")) {
3604df
                 if (origin_val < 1) {
3604df
                         snprintf (errstr, sizeof (errstr), "%s is not a "
3604df
                                   " compatible value. %s expects a positive "
3604df
@@ -2736,6 +2737,18 @@ struct volopt_map_entry glusterd_volopt_map[] = {
3604df
           .description = "The maximum number of files that may be migrated"
3604df
           " in any direction in a given cycle by a single node."
3604df
         },
3604df
+        { .key         = "cluster.tier-query-limit",
3604df
+          .voltype     = "cluster/tier",
3604df
+          .option      = "tier-query-limit",
3604df
+          .value       = "100",
3604df
+          .op_version  = GD_OP_VERSION_3_9_1,
3604df
+          .flags       = OPT_FLAG_CLIENT_OPT,
3604df
+          .validate_fn = validate_tier,
3604df
+          .type        = NO_DOC,
3604df
+          .description = "The maximum number of files that may be migrated "
3604df
+                         "during an emergency demote. An emergency condition "
3604df
+                         "is flagged when writes breach the hi-watermark."
3604df
+        },
3604df
         { .key         = "features.ctr-enabled",
3604df
           .voltype     = "features/changetimerecorder",
3604df
           .value       = "off",
3604df
-- 
3604df
2.9.3
3604df