From 8df95679519364d0993572ecbea72ab89e5250a5 Mon Sep 17 00:00:00 2001
From: Simon Pichugin
Date: Thu, 20 May 2021 14:24:25 +0200
Subject: [PATCH 09/12] Issue 4623 - RFE - Monitor the current DB locks (#4762)

Description: DB locks get exhausted because of unindexed internal searches
(under a transaction). Indexing those searches is the way to prevent the
exhaustion. If DB locks get exhausted during a txn, it leads to a DB panic,
and the later recovery can fail. That leads to a full reinit of the instance
where the DB locks were exhausted.

Add three attributes to the global BDB config: "nsslapd-db-locks-monitoring-enabled",
"nsslapd-db-locks-monitoring-threshold" and "nsslapd-db-locks-monitoring-pause".
By default, nsslapd-db-locks-monitoring-enabled is turned on,
nsslapd-db-locks-monitoring-threshold is set to 90%, and
nsslapd-db-locks-monitoring-pause is set to 500 ms.
When the current lock count reaches the threshold (90% of the configured
maximum by default), returning the next search candidate fails until the
maximum number of locks is increased or the current locks are released.
The monitoring thread runs at a configurable interval (500 ms by default).

Add the settings to the UI and CLI tools.

Fixes: https://github.com/389ds/389-ds-base/issues/4623

Reviewed by: @Firstyear, @tbordaz, @jchapma, @mreynolds389 (Thank you!!)
---
 .../suites/monitor/db_locks_monitor_test.py   | 251 ++++++++++++++++++
 ldap/servers/slapd/back-ldbm/back-ldbm.h      |  13 +-
 .../slapd/back-ldbm/db-bdb/bdb_config.c       |  99 +++++++
 .../slapd/back-ldbm/db-bdb/bdb_layer.c        |  85 ++++++
 ldap/servers/slapd/back-ldbm/init.c           |   3 +
 ldap/servers/slapd/back-ldbm/ldbm_config.c    |   3 +
 ldap/servers/slapd/back-ldbm/ldbm_config.h    |   3 +
 ldap/servers/slapd/back-ldbm/ldbm_search.c    |  13 +
 ldap/servers/slapd/libglobs.c                 |   4 +-
 src/cockpit/389-console/src/css/ds.css        |   4 +
 src/cockpit/389-console/src/database.jsx      |   7 +
 src/cockpit/389-console/src/index.html        |   2 +-
 .../src/lib/database/databaseConfig.jsx       |  88 +++++-
 src/lib389/lib389/backend.py                  |   3 +
 src/lib389/lib389/cli_conf/backend.py         |  10 +
 15 files changed, 576 insertions(+), 12 deletions(-)
 create mode 100644 dirsrvtests/tests/suites/monitor/db_locks_monitor_test.py

diff --git a/dirsrvtests/tests/suites/monitor/db_locks_monitor_test.py b/dirsrvtests/tests/suites/monitor/db_locks_monitor_test.py
new file mode 100644
index 000000000..7f9938f30
--- /dev/null
+++ b/dirsrvtests/tests/suites/monitor/db_locks_monitor_test.py
@@ -0,0 +1,251 @@
+# --- BEGIN COPYRIGHT BLOCK ---
+# Copyright (C) 2021 Red Hat, Inc.
+# All rights reserved.
+#
+# License: GPL (version 3 or any later version).
+# See LICENSE for details.
+# --- END COPYRIGHT BLOCK --- +# +import logging +import pytest +import datetime +import subprocess +from multiprocessing import Process, Queue +from lib389 import pid_from_file +from lib389.utils import ldap, os +from lib389._constants import DEFAULT_SUFFIX, ReplicaRole +from lib389.cli_base import LogCapture +from lib389.idm.user import UserAccounts +from lib389.idm.organizationalunit import OrganizationalUnits +from lib389.tasks import AccessLog +from lib389.backend import Backends +from lib389.ldclt import Ldclt +from lib389.dbgen import dbgen_users +from lib389.tasks import ImportTask +from lib389.index import Indexes +from lib389.plugins import AttributeUniquenessPlugin +from lib389.config import BDB_LDBMConfig +from lib389.monitor import MonitorLDBM +from lib389.topologies import create_topology, _remove_ssca_db + +pytestmark = pytest.mark.tier2 +db_locks_monitoring_ack = pytest.mark.skipif(not os.environ.get('DB_LOCKS_MONITORING_ACK', False), + reason="DB locks monitoring tests may take hours if the feature is not present or another failure exists. " + "Also, the feature requires a big amount of space as we set nsslapd-db-locks to 1300000.") + +DEBUGGING = os.getenv('DEBUGGING', default=False) +if DEBUGGING: + logging.getLogger(__name__).setLevel(logging.DEBUG) +else: + logging.getLogger(__name__).setLevel(logging.INFO) +log = logging.getLogger(__name__) + + +def _kill_ns_slapd(inst): + pid = str(pid_from_file(inst.ds_paths.pid_file)) + cmd = ['kill', '-9', pid] + subprocess.Popen(cmd, stdout=subprocess.PIPE) + + +@pytest.fixture(scope="function") +def topology_st_fn(request): + """Create DS standalone instance for each test case""" + + topology = create_topology({ReplicaRole.STANDALONE: 1}) + + def fin(): + # Kill the hanging process at the end of test to prevent failures in the following tests + if DEBUGGING: + [_kill_ns_slapd(inst) for inst in topology] + else: + [_kill_ns_slapd(inst) for inst in topology] + assert _remove_ssca_db(topology) + [inst.stop() for inst in topology if inst.exists()] + [inst.delete() for inst in topology if inst.exists()] + request.addfinalizer(fin) + + topology.logcap = LogCapture() + return topology + + +@pytest.fixture(scope="function") +def setup_attruniq_index_be_import(topology_st_fn): + """Enable Attribute Uniqueness, disable indexes and + import 120000 entries to the default backend + """ + inst = topology_st_fn.standalone + + inst.config.loglevel([AccessLog.DEFAULT, AccessLog.INTERNAL], service='access') + inst.config.set('nsslapd-plugin-logging', 'on') + inst.restart() + + attruniq = AttributeUniquenessPlugin(inst, dn="cn=attruniq,cn=plugins,cn=config") + attruniq.create(properties={'cn': 'attruniq'}) + for cn in ['uid', 'cn', 'sn', 'uidNumber', 'gidNumber', 'homeDirectory', 'givenName', 'description']: + attruniq.add_unique_attribute(cn) + attruniq.add_unique_subtree(DEFAULT_SUFFIX) + attruniq.enable_all_subtrees() + attruniq.enable() + + indexes = Indexes(inst) + for cn in ['uid', 'cn', 'sn', 'uidNumber', 'gidNumber', 'homeDirectory', 'givenName', 'description']: + indexes.ensure_state(properties={ + 'cn': cn, + 'nsSystemIndex': 'false', + 'nsIndexType': 'none'}) + + bdb_config = BDB_LDBMConfig(inst) + bdb_config.replace("nsslapd-db-locks", "130000") + inst.restart() + + ldif_dir = inst.get_ldif_dir() + import_ldif = ldif_dir + '/perf_import.ldif' + + # Valid online import + import_task = ImportTask(inst) + dbgen_users(inst, 120000, import_ldif, DEFAULT_SUFFIX, entry_name="userNew") + 
import_task.import_suffix_from_ldif(ldiffile=import_ldif, suffix=DEFAULT_SUFFIX) + import_task.wait() + assert import_task.is_complete() + + +def create_user_wrapper(q, users): + try: + users.create_test_user() + except Exception as ex: + q.put(ex) + + +def spawn_worker_thread(function, users, log, timeout, info): + log.info(f"Starting the thread - {info}") + q = Queue() + p = Process(target=function, args=(q,users,)) + p.start() + + log.info(f"Waiting for {timeout} seconds for the thread to finish") + p.join(timeout) + + if p.is_alive(): + log.info("Killing the thread as it's still running") + p.terminate() + p.join() + raise RuntimeError(f"Function call was aborted: {info}") + result = q.get() + if isinstance(result, Exception): + raise result + else: + return result + + +@db_locks_monitoring_ack +@pytest.mark.parametrize("lock_threshold", [("70"), ("80"), ("95")]) +def test_exhaust_db_locks_basic(topology_st_fn, setup_attruniq_index_be_import, lock_threshold): + """Test that when all of the locks are exhausted the instance still working + and database is not corrupted + + :id: 299108cc-04d8-4ddc-b58e-99157fccd643 + :setup: Standalone instance with Attr Uniq plugin and user indexes disabled + :steps: 1. Set nsslapd-db-locks to 11000 + 2. Check that we stop acquiring new locks when the threshold is reached + 3. Check that we can regulate a pause interval for DB locks monitoring thread + 4. Make sure the feature works for different backends on the same suffix + :expectedresults: + 1. Success + 2. Success + 3. Success + 4. Success + """ + + inst = topology_st_fn.standalone + ADDITIONAL_SUFFIX = 'ou=newpeople,dc=example,dc=com' + + backends = Backends(inst) + backends.create(properties={'nsslapd-suffix': ADDITIONAL_SUFFIX, + 'name': ADDITIONAL_SUFFIX[-3:]}) + ous = OrganizationalUnits(inst, DEFAULT_SUFFIX) + ous.create(properties={'ou': 'newpeople'}) + + bdb_config = BDB_LDBMConfig(inst) + bdb_config.replace("nsslapd-db-locks", "11000") + + # Restart server + inst.restart() + + for lock_enabled in ["on", "off"]: + for lock_pause in ["100", "500", "1000"]: + bdb_config.replace("nsslapd-db-locks-monitoring-enabled", lock_enabled) + bdb_config.replace("nsslapd-db-locks-monitoring-threshold", lock_threshold) + bdb_config.replace("nsslapd-db-locks-monitoring-pause", lock_pause) + inst.restart() + + if lock_enabled == "off": + raised_exception = (RuntimeError, ldap.SERVER_DOWN) + else: + raised_exception = ldap.OPERATIONS_ERROR + + users = UserAccounts(inst, DEFAULT_SUFFIX) + with pytest.raises(raised_exception): + spawn_worker_thread(create_user_wrapper, users, log, 30, + f"Adding user with monitoring enabled='{lock_enabled}'; " + f"threshold='{lock_threshold}'; pause='{lock_pause}'.") + # Restart because we already run out of locks and the next unindexed searches will fail eventually + if lock_enabled == "off": + _kill_ns_slapd(inst) + inst.restart() + + users = UserAccounts(inst, ADDITIONAL_SUFFIX, rdn=None) + with pytest.raises(raised_exception): + spawn_worker_thread(create_user_wrapper, users, log, 30, + f"Adding user with monitoring enabled='{lock_enabled}'; " + f"threshold='{lock_threshold}'; pause='{lock_pause}'.") + # In case feature is disabled - restart for the clean up + if lock_enabled == "off": + _kill_ns_slapd(inst) + inst.restart() + + +@db_locks_monitoring_ack +def test_exhaust_db_locks_big_pause(topology_st_fn, setup_attruniq_index_be_import): + """Test that DB lock pause setting increases the wait interval value for the monitoring thread + + :id: 
7d5bf838-5d4e-4ad5-8c03-5716afb84ea6 + :setup: Standalone instance with Attr Uniq plugin and user indexes disabled + :steps: 1. Set nsslapd-db-locks to 20000 while using the default threshold value (95%) + 2. Set nsslapd-db-locks-monitoring-pause to 10000 (10 seconds) + 3. Make sure that the pause is successfully increased a few times in a row + :expectedresults: + 1. Success + 2. Success + 3. Success + """ + + inst = topology_st_fn.standalone + + bdb_config = BDB_LDBMConfig(inst) + bdb_config.replace("nsslapd-db-locks", "20000") + lock_pause = bdb_config.get_attr_val_int("nsslapd-db-locks-monitoring-pause") + assert lock_pause == 500 + lock_pause = "10000" + bdb_config.replace("nsslapd-db-locks-monitoring-pause", lock_pause) + + # Restart server + inst.restart() + + lock_enabled = bdb_config.get_attr_val_utf8_l("nsslapd-db-locks-monitoring-enabled") + lock_threshold = bdb_config.get_attr_val_int("nsslapd-db-locks-monitoring-threshold") + assert lock_enabled == "on" + assert lock_threshold == 90 + + users = UserAccounts(inst, DEFAULT_SUFFIX) + start = datetime.datetime.now() + with pytest.raises(ldap.OPERATIONS_ERROR): + spawn_worker_thread(create_user_wrapper, users, log, 30, + f"Adding user with monitoring enabled='{lock_enabled}'; " + f"threshold='{lock_threshold}'; pause='{lock_pause}'. Expect it to 'Work'") + end = datetime.datetime.now() + time_delta = end - start + if time_delta.seconds < 9: + raise RuntimeError("nsslapd-db-locks-monitoring-pause attribute doesn't function correctly. " + f"Finished the execution in {time_delta.seconds} seconds") + # In case something has failed - restart for the clean up + inst.restart() diff --git a/ldap/servers/slapd/back-ldbm/back-ldbm.h b/ldap/servers/slapd/back-ldbm/back-ldbm.h index 571b0a58b..afb831c32 100644 --- a/ldap/servers/slapd/back-ldbm/back-ldbm.h +++ b/ldap/servers/slapd/back-ldbm/back-ldbm.h @@ -155,6 +155,8 @@ typedef unsigned short u_int16_t; #define DEFAULT_DNCACHE_MAXCOUNT -1 /* no limit */ #define DEFAULT_DBCACHE_SIZE 33554432 #define DEFAULT_DBCACHE_SIZE_STR "33554432" +#define DEFAULT_DBLOCK_PAUSE 500 +#define DEFAULT_DBLOCK_PAUSE_STR "500" #define DEFAULT_MODE 0600 #define DEFAULT_ALLIDSTHRESHOLD 4000 #define DEFAULT_IDL_TUNE 1 @@ -575,12 +577,21 @@ struct ldbminfo char *li_backend_implement; /* low layer backend implementation */ int li_noparentcheck; /* check if parent exists on add */ - /* the next 3 fields are for the params that don't get changed until + /* db lock monitoring */ + /* if we decide to move the values to bdb_config, we can use slapi_back_get_info function to retrieve the values */ + int32_t li_dblock_monitoring; /* enables db locks monitoring thread - requires restart */ + uint32_t li_dblock_monitoring_pause; /* an interval for db locks monitoring thread */ + uint32_t li_dblock_threshold; /* when the percentage is reached, abort the search in ldbm_back_next_search_entry - requires restart*/ + uint32_t li_dblock_threshold_reached; + + /* the next 4 fields are for the params that don't get changed until * the server is restarted (used by the admin console) */ char *li_new_directory; uint64_t li_new_dbcachesize; int li_new_dblock; + int32_t li_new_dblock_monitoring; + uint64_t li_new_dblock_threshold; int li_new_dbncache; diff --git a/ldap/servers/slapd/back-ldbm/db-bdb/bdb_config.c b/ldap/servers/slapd/back-ldbm/db-bdb/bdb_config.c index 738b841aa..167644943 100644 --- a/ldap/servers/slapd/back-ldbm/db-bdb/bdb_config.c +++ b/ldap/servers/slapd/back-ldbm/db-bdb/bdb_config.c @@ -190,6 +190,102 @@ 
bdb_config_db_lock_set(void *arg, void *value, char *errorbuf, int phase, int ap return retval; } +static void * +bdb_config_db_lock_monitoring_get(void *arg) +{ + struct ldbminfo *li = (struct ldbminfo *)arg; + + return (void *)((intptr_t)(li->li_new_dblock_monitoring)); +} + +static int +bdb_config_db_lock_monitoring_set(void *arg, void *value, char *errorbuf __attribute__((unused)), int phase __attribute__((unused)), int apply) +{ + struct ldbminfo *li = (struct ldbminfo *)arg; + int retval = LDAP_SUCCESS; + int val = (int32_t)((intptr_t)value); + + if (apply) { + if (CONFIG_PHASE_RUNNING == phase) { + li->li_new_dblock_monitoring = val; + slapi_log_err(SLAPI_LOG_NOTICE, "bdb_config_db_lock_monitoring_set", + "New nsslapd-db-lock-monitoring value will not take affect until the server is restarted\n"); + } else { + li->li_new_dblock_monitoring = val; + li->li_dblock_monitoring = val; + } + } + + return retval; +} + +static void * +bdb_config_db_lock_pause_get(void *arg) +{ + struct ldbminfo *li = (struct ldbminfo *)arg; + + return (void *)((uintptr_t)(slapi_atomic_load_32((int32_t *)&(li->li_dblock_monitoring_pause), __ATOMIC_RELAXED))); +} + +static int +bdb_config_db_lock_pause_set(void *arg, void *value, char *errorbuf, int phase __attribute__((unused)), int apply) +{ + struct ldbminfo *li = (struct ldbminfo *)arg; + int retval = LDAP_SUCCESS; + u_int32_t val = (u_int32_t)((uintptr_t)value); + + if (val == 0) { + slapi_log_err(SLAPI_LOG_NOTICE, "bdb_config_db_lock_pause_set", + "%s was set to '0'. The default value will be used (%s)", + CONFIG_DB_LOCKS_PAUSE, DEFAULT_DBLOCK_PAUSE_STR); + val = DEFAULT_DBLOCK_PAUSE; + } + + if (apply) { + slapi_atomic_store_32((int32_t *)&(li->li_dblock_monitoring_pause), val, __ATOMIC_RELAXED); + } + return retval; +} + +static void * +bdb_config_db_lock_threshold_get(void *arg) +{ + struct ldbminfo *li = (struct ldbminfo *)arg; + + return (void *)((uintptr_t)(li->li_new_dblock_threshold)); +} + +static int +bdb_config_db_lock_threshold_set(void *arg, void *value, char *errorbuf, int phase __attribute__((unused)), int apply) +{ + struct ldbminfo *li = (struct ldbminfo *)arg; + int retval = LDAP_SUCCESS; + u_int32_t val = (u_int32_t)((uintptr_t)value); + + if (val < 70 || val > 95) { + slapi_create_errormsg(errorbuf, SLAPI_DSE_RETURNTEXT_SIZE, + "%s: \"%d\" is invalid, threshold is indicated as a percentage and it must lie in range of 70 and 95", + CONFIG_DB_LOCKS_THRESHOLD, val); + slapi_log_err(SLAPI_LOG_ERR, "bdb_config_db_lock_threshold_set", + "%s: \"%d\" is invalid, threshold is indicated as a percentage and it must lie in range of 70 and 95", + CONFIG_DB_LOCKS_THRESHOLD, val); + retval = LDAP_OPERATIONS_ERROR; + return retval; + } + + if (apply) { + if (CONFIG_PHASE_RUNNING == phase) { + li->li_new_dblock_threshold = val; + slapi_log_err(SLAPI_LOG_NOTICE, "bdb_config_db_lock_threshold_set", + "New nsslapd-db-lock-monitoring-threshold value will not take affect until the server is restarted\n"); + } else { + li->li_new_dblock_threshold = val; + li->li_dblock_threshold = val; + } + } + return retval; +} + static void * bdb_config_dbcachesize_get(void *arg) { @@ -1409,6 +1505,9 @@ static config_info bdb_config_param[] = { {CONFIG_SERIAL_LOCK, CONFIG_TYPE_ONOFF, "on", &bdb_config_serial_lock_get, &bdb_config_serial_lock_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE}, {CONFIG_USE_LEGACY_ERRORCODE, CONFIG_TYPE_ONOFF, "off", &bdb_config_legacy_errcode_get, &bdb_config_legacy_errcode_set, 0}, {CONFIG_DB_DEADLOCK_POLICY, 
CONFIG_TYPE_INT, STRINGIFYDEFINE(DB_LOCK_YOUNGEST), &bdb_config_db_deadlock_policy_get, &bdb_config_db_deadlock_policy_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE}, + {CONFIG_DB_LOCKS_MONITORING, CONFIG_TYPE_ONOFF, "on", &bdb_config_db_lock_monitoring_get, &bdb_config_db_lock_monitoring_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE}, + {CONFIG_DB_LOCKS_THRESHOLD, CONFIG_TYPE_INT, "90", &bdb_config_db_lock_threshold_get, &bdb_config_db_lock_threshold_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE}, + {CONFIG_DB_LOCKS_PAUSE, CONFIG_TYPE_INT, DEFAULT_DBLOCK_PAUSE_STR, &bdb_config_db_lock_pause_get, &bdb_config_db_lock_pause_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE}, {NULL, 0, NULL, NULL, NULL, 0}}; void diff --git a/ldap/servers/slapd/back-ldbm/db-bdb/bdb_layer.c b/ldap/servers/slapd/back-ldbm/db-bdb/bdb_layer.c index 6cccad8e6..2f25f67a2 100644 --- a/ldap/servers/slapd/back-ldbm/db-bdb/bdb_layer.c +++ b/ldap/servers/slapd/back-ldbm/db-bdb/bdb_layer.c @@ -35,6 +35,8 @@ (env)->txn_checkpoint((env), (kbyte), (min), (flags)) #define MEMP_STAT(env, gsp, fsp, flags, malloc) \ (env)->memp_stat((env), (gsp), (fsp), (flags)) +#define LOCK_STAT(env, statp, flags, malloc) \ + (env)->lock_stat((env), (statp), (flags)) #define MEMP_TRICKLE(env, pct, nwrotep) \ (env)->memp_trickle((env), (pct), (nwrotep)) #define LOG_ARCHIVE(env, listp, flags, malloc) \ @@ -66,6 +68,7 @@ #define NEWDIR_MODE 0755 #define DB_REGION_PREFIX "__db." +static int locks_monitoring_threadmain(void *param); static int perf_threadmain(void *param); static int checkpoint_threadmain(void *param); static int trickle_threadmain(void *param); @@ -84,6 +87,7 @@ static int bdb_start_checkpoint_thread(struct ldbminfo *li); static int bdb_start_trickle_thread(struct ldbminfo *li); static int bdb_start_perf_thread(struct ldbminfo *li); static int bdb_start_txn_test_thread(struct ldbminfo *li); +static int bdb_start_locks_monitoring_thread(struct ldbminfo *li); static int trans_batch_count = 0; static int trans_batch_limit = 0; static int trans_batch_txn_min_sleep = 50; /* ms */ @@ -1299,6 +1303,10 @@ bdb_start(struct ldbminfo *li, int dbmode) return return_value; } + if (0 != (return_value = bdb_start_locks_monitoring_thread(li))) { + return return_value; + } + /* We need to free the memory to avoid a leak * Also, we have to evaluate if the performance counter * should be preserved or not for database restore. 
@@ -2885,6 +2893,7 @@ bdb_start_perf_thread(struct ldbminfo *li) return return_value; } + /* Performance thread */ static int perf_threadmain(void *param) @@ -2910,6 +2919,82 @@ perf_threadmain(void *param) return 0; } + +/* + * create a thread for locks_monitoring_threadmain + */ +static int +bdb_start_locks_monitoring_thread(struct ldbminfo *li) +{ + int return_value = 0; + if (li->li_dblock_monitoring) { + if (NULL == PR_CreateThread(PR_USER_THREAD, + (VFP)(void *)locks_monitoring_threadmain, li, + PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD, + PR_UNJOINABLE_THREAD, + SLAPD_DEFAULT_THREAD_STACKSIZE)) { + PRErrorCode prerr = PR_GetError(); + slapi_log_err(SLAPI_LOG_ERR, "bdb_start_locks_monitoring_thread", + "Failed to create database locks monitoring thread, " SLAPI_COMPONENT_NAME_NSPR " error %d (%s)\n", + prerr, slapd_pr_strerror(prerr)); + return_value = -1; + } + } + return return_value; +} + + +/* DB Locks Monitoring thread */ +static int +locks_monitoring_threadmain(void *param) +{ + int ret = 0; + uint64_t current_locks = 0; + uint64_t max_locks = 0; + uint32_t lock_exhaustion = 0; + PRIntervalTime interval; + struct ldbminfo *li = NULL; + + PR_ASSERT(NULL != param); + li = (struct ldbminfo *)param; + + dblayer_private *priv = li->li_dblayer_private; + bdb_db_env *pEnv = (bdb_db_env *)priv->dblayer_env; + PR_ASSERT(NULL != priv); + + INCR_THREAD_COUNT(pEnv); + + while (!BDB_CONFIG(li)->bdb_stop_threads) { + if (dblayer_db_uses_locking(pEnv->bdb_DB_ENV)) { + DB_LOCK_STAT *lockstat = NULL; + ret = LOCK_STAT(pEnv->bdb_DB_ENV, &lockstat, 0, (void *)slapi_ch_malloc); + if (0 == ret) { + current_locks = lockstat->st_nlocks; + max_locks = lockstat->st_maxlocks; + if (max_locks){ + lock_exhaustion = (uint32_t)((double)current_locks / (double)max_locks * 100.0); + } else { + lock_exhaustion = 0; + } + if ((li->li_dblock_threshold) && + (lock_exhaustion >= li->li_dblock_threshold)) { + slapi_atomic_store_32((int32_t *)&(li->li_dblock_threshold_reached), 1, __ATOMIC_RELAXED); + } else { + slapi_atomic_store_32((int32_t *)&(li->li_dblock_threshold_reached), 0, __ATOMIC_RELAXED); + } + } + slapi_ch_free((void **)&lockstat); + } + interval = PR_MillisecondsToInterval(slapi_atomic_load_32((int32_t *)&(li->li_dblock_monitoring_pause), __ATOMIC_RELAXED)); + DS_Sleep(interval); + } + + DECR_THREAD_COUNT(pEnv); + slapi_log_err(SLAPI_LOG_TRACE, "locks_monitoring_threadmain", "Leaving locks_monitoring_threadmain\n"); + return 0; +} + + /* * create a thread for deadlock_threadmain */ diff --git a/ldap/servers/slapd/back-ldbm/init.c b/ldap/servers/slapd/back-ldbm/init.c index 893776699..4165c8fad 100644 --- a/ldap/servers/slapd/back-ldbm/init.c +++ b/ldap/servers/slapd/back-ldbm/init.c @@ -70,6 +70,9 @@ ldbm_back_init(Slapi_PBlock *pb) /* Initialize the set of instances. 
*/ li->li_instance_set = objset_new(&ldbm_back_instance_set_destructor); + /* Init lock threshold value */ + li->li_dblock_threshold_reached = 0; + /* ask the factory to give us space in the Connection object * (only bulk import uses this) */ diff --git a/ldap/servers/slapd/back-ldbm/ldbm_config.c b/ldap/servers/slapd/back-ldbm/ldbm_config.c index 10cef250f..60884cf33 100644 --- a/ldap/servers/slapd/back-ldbm/ldbm_config.c +++ b/ldap/servers/slapd/back-ldbm/ldbm_config.c @@ -87,6 +87,9 @@ static char *ldbm_config_moved_attributes[] = CONFIG_SERIAL_LOCK, CONFIG_USE_LEGACY_ERRORCODE, CONFIG_DB_DEADLOCK_POLICY, + CONFIG_DB_LOCKS_MONITORING, + CONFIG_DB_LOCKS_THRESHOLD, + CONFIG_DB_LOCKS_PAUSE, ""}; /* Used to add an array of entries, like the one above and diff --git a/ldap/servers/slapd/back-ldbm/ldbm_config.h b/ldap/servers/slapd/back-ldbm/ldbm_config.h index 58e64799c..6fa8292eb 100644 --- a/ldap/servers/slapd/back-ldbm/ldbm_config.h +++ b/ldap/servers/slapd/back-ldbm/ldbm_config.h @@ -104,6 +104,9 @@ struct config_info #define CONFIG_DB_VERBOSE "nsslapd-db-verbose" #define CONFIG_DB_DEBUG "nsslapd-db-debug" #define CONFIG_DB_LOCK "nsslapd-db-locks" +#define CONFIG_DB_LOCKS_MONITORING "nsslapd-db-locks-monitoring-enabled" +#define CONFIG_DB_LOCKS_THRESHOLD "nsslapd-db-locks-monitoring-threshold" +#define CONFIG_DB_LOCKS_PAUSE "nsslapd-db-locks-monitoring-pause" #define CONFIG_DB_NAMED_REGIONS "nsslapd-db-named-regions" #define CONFIG_DB_PRIVATE_MEM "nsslapd-db-private-mem" #define CONFIG_DB_PRIVATE_IMPORT_MEM "nsslapd-db-private-import-mem" diff --git a/ldap/servers/slapd/back-ldbm/ldbm_search.c b/ldap/servers/slapd/back-ldbm/ldbm_search.c index 1a7b510d4..6e22debde 100644 --- a/ldap/servers/slapd/back-ldbm/ldbm_search.c +++ b/ldap/servers/slapd/back-ldbm/ldbm_search.c @@ -1472,6 +1472,7 @@ ldbm_back_next_search_entry_ext(Slapi_PBlock *pb, int use_extension) slapi_pblock_get(pb, SLAPI_CONNECTION, &conn); slapi_pblock_get(pb, SLAPI_OPERATION, &op); + if ((reverse_list = operation_is_flag_set(op, OP_FLAG_REVERSE_CANDIDATE_ORDER))) { /* * Start at the end of the list and work our way forward. Since a single @@ -1538,6 +1539,18 @@ ldbm_back_next_search_entry_ext(Slapi_PBlock *pb, int use_extension) /* Find the next candidate entry and return it. */ while (1) { + if (li->li_dblock_monitoring && + slapi_atomic_load_32((int32_t *)&(li->li_dblock_threshold_reached), __ATOMIC_RELAXED)) { + slapi_log_err(SLAPI_LOG_CRIT, "ldbm_back_next_search_entry", + "DB locks threshold is reached (nsslapd-db-locks-monitoring-threshold " + "under cn=bdb,cn=config,cn=ldbm database,cn=plugins,cn=config). 
" + "Please, increase nsslapd-db-locks according to your needs.\n"); + slapi_pblock_set(pb, SLAPI_SEARCH_RESULT_ENTRY, NULL); + delete_search_result_set(pb, &sr); + rc = SLAPI_FAIL_GENERAL; + slapi_send_ldap_result(pb, LDAP_UNWILLING_TO_PERFORM, NULL, "DB locks threshold is reached (nsslapd-db-locks-monitoring-threshold)", 0, NULL); + goto bail; + } /* check for abandon */ if (slapi_op_abandoned(pb) || (NULL == sr)) { diff --git a/ldap/servers/slapd/libglobs.c b/ldap/servers/slapd/libglobs.c index 388616b36..db7d01bbc 100644 --- a/ldap/servers/slapd/libglobs.c +++ b/ldap/servers/slapd/libglobs.c @@ -8171,8 +8171,8 @@ config_set(const char *attr, struct berval **values, char *errorbuf, int apply) #if 0 debugHashTable(attr); #endif - slapi_create_errormsg(errorbuf, SLAPI_DSE_RETURNTEXT_SIZE, "Unknown attribute %s will be ignored", attr); - slapi_log_err(SLAPI_LOG_ERR, "config_set", "Unknown attribute %s will be ignored", attr); + slapi_create_errormsg(errorbuf, SLAPI_DSE_RETURNTEXT_SIZE, "Unknown attribute %s will be ignored\n", attr); + slapi_log_err(SLAPI_LOG_ERR, "config_set", "Unknown attribute %s will be ignored\n", attr); return LDAP_NO_SUCH_ATTRIBUTE; } diff --git a/src/cockpit/389-console/src/css/ds.css b/src/cockpit/389-console/src/css/ds.css index 9248116e7..3cf50b593 100644 --- a/src/cockpit/389-console/src/css/ds.css +++ b/src/cockpit/389-console/src/css/ds.css @@ -639,6 +639,10 @@ option { padding-right: 0 !important; } +.ds-vertical-scroll-auto { + overflow-y: auto !important; +} + .alert { max-width: 750px; } diff --git a/src/cockpit/389-console/src/database.jsx b/src/cockpit/389-console/src/database.jsx index efa3ce6d5..11cae972c 100644 --- a/src/cockpit/389-console/src/database.jsx +++ b/src/cockpit/389-console/src/database.jsx @@ -157,6 +157,7 @@ export class Database extends React.Component { const attrs = config.attrs; let db_cache_auto = false; let import_cache_auto = false; + let dblocksMonitoring = false; let dbhome = ""; if ('nsslapd-db-home-directory' in attrs) { @@ -168,6 +169,9 @@ export class Database extends React.Component { if (attrs['nsslapd-import-cache-autosize'] != "0") { import_cache_auto = true; } + if (attrs['nsslapd-db-locks-monitoring-enabled'][0] == "on") { + dblocksMonitoring = true; + } this.setState(() => ( { @@ -187,6 +191,9 @@ export class Database extends React.Component { txnlogdir: attrs['nsslapd-db-logdirectory'], dbhomedir: dbhome, dblocks: attrs['nsslapd-db-locks'], + dblocksMonitoring: dblocksMonitoring, + dblocksMonitoringThreshold: attrs['nsslapd-db-locks-monitoring-threshold'], + dblocksMonitoringPause: attrs['nsslapd-db-locks-monitoring-pause'], chxpoint: attrs['nsslapd-db-checkpoint-interval'], compactinterval: attrs['nsslapd-db-compactdb-interval'], importcacheauto: attrs['nsslapd-import-cache-autosize'], diff --git a/src/cockpit/389-console/src/index.html b/src/cockpit/389-console/src/index.html index 1278844fc..fd0eeb669 100644 --- a/src/cockpit/389-console/src/index.html +++ b/src/cockpit/389-console/src/index.html @@ -12,7 +12,7 @@ - +
diff --git a/src/cockpit/389-console/src/lib/database/databaseConfig.jsx b/src/cockpit/389-console/src/lib/database/databaseConfig.jsx index f6e662bca..6a71c138d 100644 --- a/src/cockpit/389-console/src/lib/database/databaseConfig.jsx +++ b/src/cockpit/389-console/src/lib/database/databaseConfig.jsx @@ -31,6 +31,9 @@ export class GlobalDatabaseConfig extends React.Component { txnlogdir: this.props.data.txnlogdir, dbhomedir: this.props.data.dbhomedir, dblocks: this.props.data.dblocks, + dblocksMonitoring: this.props.data.dblocksMonitoring, + dblocksMonitoringThreshold: this.props.data.dblocksMonitoringThreshold, + dblocksMonitoringPause: this.props.data.dblocksMonitoringPause, chxpoint: this.props.data.chxpoint, compactinterval: this.props.data.compactinterval, importcachesize: this.props.data.importcachesize, @@ -47,6 +50,9 @@ export class GlobalDatabaseConfig extends React.Component { _txnlogdir: this.props.data.txnlogdir, _dbhomedir: this.props.data.dbhomedir, _dblocks: this.props.data.dblocks, + _dblocksMonitoring: this.props.data.dblocksMonitoring, + _dblocksMonitoringThreshold: this.props.data.dblocksMonitoringThreshold, + _dblocksMonitoringPause: this.props.data.dblocksMonitoringPause, _chxpoint: this.props.data.chxpoint, _compactinterval: this.props.data.compactinterval, _importcachesize: this.props.data.importcachesize, @@ -55,6 +61,7 @@ export class GlobalDatabaseConfig extends React.Component { _import_cache_auto: this.props.data.import_cache_auto, }; this.handleChange = this.handleChange.bind(this); + this.select_db_locks_monitoring = this.select_db_locks_monitoring.bind(this); this.select_auto_cache = this.select_auto_cache.bind(this); this.select_auto_import_cache = this.select_auto_import_cache.bind(this); this.save_db_config = this.save_db_config.bind(this); @@ -76,6 +83,12 @@ export class GlobalDatabaseConfig extends React.Component { }, this.handleChange(e)); } + select_db_locks_monitoring (val, e) { + this.setState({ + dblocksMonitoring: !this.state.dblocksMonitoring + }, this.handleChange(val, e)); + } + handleChange(e) { // Generic const value = e.target.type === 'checkbox' ? e.target.checked : e.target.value; @@ -150,6 +163,21 @@ export class GlobalDatabaseConfig extends React.Component { cmd.push("--locks=" + this.state.dblocks); requireRestart = true; } + if (this.state._dblocksMonitoring != this.state.dblocksMonitoring) { + if (this.state.dblocksMonitoring) { + cmd.push("--locks-monitoring-enabled=on"); + } else { + cmd.push("--locks-monitoring-enabled=off"); + } + requireRestart = true; + } + if (this.state._dblocksMonitoringThreshold != this.state.dblocksMonitoringThreshold) { + cmd.push("--locks-monitoring-threshold=" + this.state.dblocksMonitoringThreshold); + requireRestart = true; + } + if (this.state._dblocksMonitoringPause != this.state.dblocksMonitoringPause) { + cmd.push("--locks-monitoring-pause=" + this.state.dblocksMonitoringPause); + } if (this.state._chxpoint != this.state.chxpoint) { cmd.push("--checkpoint-interval=" + this.state.chxpoint); requireRestart = true; @@ -216,6 +244,28 @@ export class GlobalDatabaseConfig extends React.Component { let import_cache_form; let db_auto_checked = false; let import_auto_checked = false; + let dblocksMonitor = ""; + + if (this.state.dblocksMonitoring) { + dblocksMonitor =
+ + + DB Locks Threshold Percentage + + + + + + + + DB Locks Pause Milliseconds + + + + + +
; + } if (this.state.db_cache_auto) { db_cache_form =
@@ -422,14 +472,6 @@ export class GlobalDatabaseConfig extends React.Component { - - - Database Locks - - - - - Database Checkpoint Interval @@ -446,6 +488,36 @@ export class GlobalDatabaseConfig extends React.Component { + + + Database Locks + + + + + + + +
DB Locks Monitoring
+
+ +
+ + + + Enable Monitoring + + + + + + {dblocksMonitor} + +
diff --git a/src/lib389/lib389/backend.py b/src/lib389/lib389/backend.py
index bcd7b383f..13bb27842 100644
--- a/src/lib389/lib389/backend.py
+++ b/src/lib389/lib389/backend.py
@@ -1011,6 +1011,9 @@ class DatabaseConfig(DSLdapObject):
             'nsslapd-db-transaction-batch-max-wait',
             'nsslapd-db-logbuf-size',
             'nsslapd-db-locks',
+            'nsslapd-db-locks-monitoring-enabled',
+            'nsslapd-db-locks-monitoring-threshold',
+            'nsslapd-db-locks-monitoring-pause',
             'nsslapd-db-private-import-mem',
             'nsslapd-import-cache-autosize',
             'nsslapd-cache-autosize',
diff --git a/src/lib389/lib389/cli_conf/backend.py b/src/lib389/lib389/cli_conf/backend.py
index 6bfbcb036..722764d10 100644
--- a/src/lib389/lib389/cli_conf/backend.py
+++ b/src/lib389/lib389/cli_conf/backend.py
@@ -46,6 +46,9 @@ arg_to_attr = {
         'txn_batch_max': 'nsslapd-db-transaction-batch-max-wait',
         'logbufsize': 'nsslapd-db-logbuf-size',
         'locks': 'nsslapd-db-locks',
+        'locks_monitoring_enabled': 'nsslapd-db-locks-monitoring-enabled',
+        'locks_monitoring_threshold': 'nsslapd-db-locks-monitoring-threshold',
+        'locks_monitoring_pause': 'nsslapd-db-locks-monitoring-pause',
         'import_cache_autosize': 'nsslapd-import-cache-autosize',
         'cache_autosize': 'nsslapd-cache-autosize',
         'cache_autosize_split': 'nsslapd-cache-autosize-split',
@@ -998,6 +1001,13 @@ def create_parser(subparsers):
                                                            'the batch count (only works when txn-batch-val is set)')
     set_db_config_parser.add_argument('--logbufsize', help='Specifies the transaction log information buffer size')
     set_db_config_parser.add_argument('--locks', help='Sets the maximum number of database locks')
+    set_db_config_parser.add_argument('--locks-monitoring-enabled', help='Set to "on" or "off" to enable or disable DB lock monitoring. When lock usage crosses the percentage '
+                                                                         'set with "--locks-monitoring-threshold", searches are aborted ("on" by default)')
+    set_db_config_parser.add_argument('--locks-monitoring-threshold', help='Sets the DB lock exhaustion threshold as a percentage (valid range is 70-95). If too many locks are '
+                                                                           'acquired, the server aborts the searches until the number of locks '
+                                                                           'decreases. This helps to avoid DB corruption and a long recovery.')
+    set_db_config_parser.add_argument('--locks-monitoring-pause', help='Sets the DB lock monitoring pause in milliseconds, i.e. the amount of time '
+                                                                       'that the monitoring thread waits between checks.')
     set_db_config_parser.add_argument('--import-cache-autosize', help='Set to "on" or "off" to automatically set the size of the import '
                                                                       'cache to be used during the the import process of LDIF files')
     set_db_config_parser.add_argument('--cache-autosize', help='Sets the percentage of free memory that is used in total for the database '
-- 
2.26.3
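
With the lib389 and dsconf changes above, the new attributes can also be driven
from Python. A minimal sketch follows, assuming an already connected lib389
DirSrv instance "inst"; the values are examples only, and the same settings map
to the --locks-monitoring-enabled, --locks-monitoring-threshold and
--locks-monitoring-pause options added to the backend config set parser above:

    from lib389.config import BDB_LDBMConfig

    def tune_lock_monitoring(inst, enabled="on", threshold="80", pause="500"):
        """Apply the BDB lock monitoring settings (example values only)."""
        bdb_config = BDB_LDBMConfig(inst)
        bdb_config.replace("nsslapd-db-locks-monitoring-enabled", enabled)
        # Threshold is a percentage; bdb_config_db_lock_threshold_set() accepts 70-95.
        bdb_config.replace("nsslapd-db-locks-monitoring-threshold", threshold)
        # Pause is in milliseconds and is picked up by the monitoring thread at runtime.
        bdb_config.replace("nsslapd-db-locks-monitoring-pause", pause)
        # The enabled and threshold values only take effect after a restart.
        inst.restart()
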