Blob Blame History Raw
From 45f4abd6befa50b129fc49b331b42c280f37199a Mon Sep 17 00:00:00 2001
From: Mark Reynolds <mreynolds@redhat.com>
Date: Thu, 13 Jun 2019 13:01:35 -0400
Subject: [PATCH] Issue 49602 - Revise replication status messages

Bug Description: All agreement status messages start with "Error (##)" followed
                 by a text string.  Even success states start with "Error", and
                 this is confusing.

                 Added new attributes to display the status in a JSON format
                 for easier parsing for applications:

                     replicaLastUpdateStatusJSON
                     replicaLastInitStatusJSON

Design Doc:  https://www.port389.org/docs/389ds/design/repl-agmt-status-design.html

https://pagure.io/389-ds-base/issue/49602

Reviewed by: firstyear(Thanks!)
---
 .../suites/replication/single_master_test.py  | 19 +++--
 ldap/schema/01core389.ldif                    |  2 +
 ldap/servers/plugins/replication/repl5_agmt.c | 84 ++++++++++++++++---
 .../plugins/replication/repl5_protocol_util.c | 13 +--
 4 files changed, 96 insertions(+), 22 deletions(-)

diff --git a/dirsrvtests/tests/suites/replication/single_master_test.py b/dirsrvtests/tests/suites/replication/single_master_test.py
index 5b73e23ae..78f849da7 100644
--- a/dirsrvtests/tests/suites/replication/single_master_test.py
+++ b/dirsrvtests/tests/suites/replication/single_master_test.py
@@ -23,6 +23,7 @@ from lib389._constants import (ReplicaRole, DEFAULT_SUFFIX, REPLICAID_MASTER_1,
                                 REPLICATION_BIND_METHOD, REPLICATION_TRANSPORT, DEFAULT_BACKUPDIR,
                                 RA_NAME, RA_BINDDN, RA_BINDPW, RA_METHOD, RA_TRANSPORT_PROT,
                                 defaultProperties)
+import json
 
 pytestmark = pytest.mark.tier1
 
@@ -95,7 +96,7 @@ def test_mail_attr_repl(topo_r):
     consumer.start()
 
     log.info("Make a search for mail attribute in attempt to crash server")
-    consumer.search_s(DEFAULT_SUFFIX, ldap.SCOPE_SUBTREE, "(mail=testuser@redhat.com)", ["mail"])
+    c_user.get_attr_val("mail")
 
     log.info("Make sure that server hasn't crashed")
     repl.test_replication(master, consumer)
@@ -111,11 +112,13 @@ def test_lastupdate_attr_before_init(topo_nr):
         1. Check nsds5replicaLastUpdateStart value
         2. Check nsds5replicaLastUpdateEnd value
         3. Check nsds5replicaLastUpdateStatus value
+        4. Check nsds5replicaLastUpdateStatusJSON is parsable
     :expectedresults:
         1. nsds5replicaLastUpdateStart should be equal to 0
         2. nsds5replicaLastUpdateEnd should be equal to 0
         3. nsds5replicaLastUpdateStatus should not be equal
-           to "0 Replica acquired successfully: Incremental update started"
+           to "Replica acquired successfully: Incremental update started"
+        4. Success
     """
 
     master = topo_nr.ins["standalone1"]
@@ -139,11 +142,15 @@ def test_lastupdate_attr_before_init(topo_nr):
     with pytest.raises(Exception):
         repl.wait_for_replication(master, consumer, timeout=5)
 
-    assert agmt.get_attr_val_bytes('nsds5replicaLastUpdateStart') == b"19700101000000Z"
-    assert agmt.get_attr_val_bytes("nsds5replicaLastUpdateEnd") == b"19700101000000Z"
-    assert b"Replica acquired successfully" not in agmt.get_attr_val_bytes("nsds5replicaLastUpdateStatus")
-
+    assert agmt.get_attr_val_utf8('nsds5replicaLastUpdateStart') == "19700101000000Z"
+    assert agmt.get_attr_val_utf8("nsds5replicaLastUpdateEnd") == "19700101000000Z"
+    assert "replica acquired successfully" not in agmt.get_attr_val_utf8_l("nsds5replicaLastUpdateStatus")
 
+    # make sure the JSON attribute is parsable
+    json_status = agmt.get_attr_val_utf8("nsds5replicaLastUpdateStatusJSON")
+    if json_status is not None:
+        json_obj = json.loads(json_status)
+        log.debug("JSON status message: {}".format(json_obj))
 
 if __name__ == '__main__':
     # Run isolated
diff --git a/ldap/schema/01core389.ldif b/ldap/schema/01core389.ldif
index 993fa4a6d..7bf4acc5b 100644
--- a/ldap/schema/01core389.ldif
+++ b/ldap/schema/01core389.ldif
@@ -312,6 +312,8 @@ attributeTypes: ( 2.16.840.1.113730.3.1.2341 NAME 'nsslapd-changelogmaxentries'
 attributeTypes: ( 2.16.840.1.113730.3.1.2344 NAME 'nsslapd-tls-check-crl' DESC 'Check CRL when opening outbound TLS connections. Valid options are none, peer, all.' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE X-ORIGIN '389 Directory Server' )
 attributeTypes: ( 2.16.840.1.113730.3.1.2353 NAME 'nsslapd-encryptionalgorithm' DESC 'The encryption algorithm used to encrypt the changelog' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE X-ORIGIN '389 Directory Server' )
 attributeTypes: ( 2.16.840.1.113730.3.1.2084 NAME 'nsSymmetricKey' DESC 'A symmetric key - currently used by attribute encryption' SYNTAX 1.3.6.1.4.1.1466.115.121.1.40 SINGLE-VALUE X-ORIGIN 'attribute encryption' )
+attributeTypes: ( 2.16.840.1.113730.3.1.2364 NAME 'nsds5replicaLastInitStatusJSON' DESC 'Netscape defined attribute type' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE NO-USER-MODIFICATION X-ORIGIN 'Netscape Directory Server' )
+attributeTypes: ( 2.16.840.1.113730.3.1.2365 NAME 'nsds5replicaLastUpdateStatusJSON' DESC 'Netscape defined attribute type' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE NO-USER-MODIFICATION X-ORIGIN 'Netscape Directory Server' )
 #
 # objectclasses
 #
diff --git a/ldap/servers/plugins/replication/repl5_agmt.c b/ldap/servers/plugins/replication/repl5_agmt.c
index 53e6708c8..8e4586d8b 100644
--- a/ldap/servers/plugins/replication/repl5_agmt.c
+++ b/ldap/servers/plugins/replication/repl5_agmt.c
@@ -60,7 +60,11 @@
 #define DEFAULT_TIMEOUT 120             /* (seconds) default outbound LDAP connection */
 #define DEFAULT_FLOWCONTROL_WINDOW 1000 /* #entries sent without acknowledgment */
 #define DEFAULT_FLOWCONTROL_PAUSE 2000  /* msec of pause when #entries sent witout acknowledgment */
-#define STATUS_LEN 1024
+#define STATUS_LEN 2048
+#define STATUS_GOOD "green"
+#define STATUS_WARNING "amber"
+#define STATUS_BAD "red"
+
 
 struct changecounter
 {
@@ -93,11 +97,13 @@ typedef struct repl5agmt
     time_t last_update_start_time;       /* Local start time of last update session */
     time_t last_update_end_time;         /* Local end time of last update session */
     char last_update_status[STATUS_LEN]; /* Status of last update. Format = numeric code <space> textual description */
+    char last_update_status_json[STATUS_LEN];
     PRBool update_in_progress;
     PRBool is_enabled;
     time_t last_init_start_time;       /* Local start time of last total init */
     time_t last_init_end_time;         /* Local end time of last total init */
     char last_init_status[STATUS_LEN]; /* Status of last total init. Format = numeric code <space> textual description */
+    char last_init_status_json[STATUS_LEN];
     PRLock *lock;
     Object *consumerRUV;     /* last RUV received from the consumer - used for changelog purging */
     CSN *consumerSchemaCSN;  /* last schema CSN received from the consumer */
@@ -2443,6 +2449,21 @@ agmt_set_last_init_end(Repl_Agmt *ra, time_t end_time)
     }
 }
 
+static void
+agmt_set_last_update_status_json(Repl_Agmt *ra, char *state, int ldaprc, int replrc)
+{
+    char ts[SLAPI_TIMESTAMP_BUFSIZE];
+    time_t now;
+
+    time(&now);
+    strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now));
+    PR_snprintf(ra->last_update_status_json, STATUS_LEN,
+            "{\"state\": \"%s\", \"ldap_rc\": \"%d\", \"ldap_rc_text\": \"%s\", "
+            "\"repl_rc\": \"%d\", \"repl_rc_text\": \"%s\", \"date\": \"%s\", \"message\": \"%s\"}",
+            state, ldaprc, ldap_err2string(ldaprc), replrc, protocol_response2string(replrc),
+            ts, ra->last_update_status);
+}
+
 void
 agmt_set_last_update_status(Repl_Agmt *ra, int ldaprc, int replrc, const char *message)
 {
@@ -2463,19 +2484,29 @@ agmt_set_last_update_status(Repl_Agmt *ra, int ldaprc, int replrc, const char *m
             PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (%d) %s%s - LDAP error: %s%s%s%s",
                         ldaprc, message ? message : "", message ? "" : " - ",
                         slapi_err2string(ldaprc), replmsg ? " (" : "", replmsg ? replmsg : "", replmsg ? ")" : "");
+            agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc);
         }
         /* ldaprc == LDAP_SUCCESS */
         else if (replrc != 0) {
             if (replrc == NSDS50_REPL_REPLICA_BUSY) {
                 PR_snprintf(ra->last_update_status, STATUS_LEN,
-                            "Error (%d) Can't acquire busy replica", replrc);
+                            "Error (%d) Can't acquire busy replica (%s)",
+                            replrc, message ? message : "");
+                agmt_set_last_update_status_json(ra, STATUS_WARNING, ldaprc, replrc);
+            } else if (replrc == NSDS50_REPL_TRANSIENT_ERROR  || replrc == NSDS50_REPL_BACKOFF) {
+                PR_snprintf(ra->last_update_status, STATUS_LEN,
+                            "Error (%d) Can't acquire replica (%s)",
+                            replrc, message ? message : "");
+                agmt_set_last_update_status_json(ra, STATUS_WARNING, ldaprc, replrc);
             } else if (replrc == NSDS50_REPL_REPLICA_RELEASE_SUCCEEDED) {
                 PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (0) Replication session successful");
+                agmt_set_last_update_status_json(ra, STATUS_GOOD, ldaprc, replrc);
             } else if (replrc == NSDS50_REPL_DISABLED) {
                 PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (%d) Incremental update aborted: "
                                                                 "Replication agreement for %s\n can not be updated while the replica is disabled.\n"
                                                                 "(If the suffix is disabled you must enable it then restart the server for replication to take place).",
                             replrc, ra->long_name ? ra->long_name : "a replica");
+                agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc);
                 /* Log into the errors log, as "ra->long_name" is not accessible from the caller */
                 slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name,
                               "Incremental update aborted: Replication agreement for \"%s\" "
@@ -2487,17 +2518,35 @@ agmt_set_last_update_status(Repl_Agmt *ra, int ldaprc, int replrc, const char *m
                 PR_snprintf(ra->last_update_status, STATUS_LEN,
                             "Error (%d) Replication error acquiring replica: %s%s(%s)",
                             replrc, message ? message : "", message ? " " : "", protocol_response2string(replrc));
+                agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc);
             }
         } else if (message != NULL) /* replrc == NSDS50_REPL_REPLICA_READY == 0 */
         {
             PR_snprintf(ra->last_update_status, STATUS_LEN,
                         "Error (0) Replica acquired successfully: %s", message);
+            agmt_set_last_update_status_json(ra, STATUS_GOOD, ldaprc, replrc);
         } else { /* agmt_set_last_update_status(0,0,NULL) to reset agmt */
             ra->last_update_status[0] = '\0';
+            ra->last_update_status_json[0] = '\0';
         }
     }
 }
 
+static void
+agmt_set_last_init_status_json(Repl_Agmt *ra, char *state, int ldaprc, int replrc, int connrc)
+{
+    char ts[SLAPI_TIMESTAMP_BUFSIZE];
+    time_t now;
+
+    time(&now);
+    strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now));
+    PR_snprintf(ra->last_init_status_json, STATUS_LEN,
+            "{\"state\": \"%s\", \"ldap_rc\": \"%d\", \"ldap_rc_text\": \"%s\", \"repl_rc\": \"%d\", \"repl_rc_text\": \"%s\", "
+            "\"conn_rc\": \"%d\", \"conn_rc_text\": \"%s\", \"date\": \"%s\", \"message\": \"%s\"}",
+            state, ldaprc, ldap_err2string(ldaprc), replrc, protocol_response2string(replrc),
+            connrc, conn_result2string(connrc), ts, ra->last_init_status);
+}
+
 void
 agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, const char *message)
 {
@@ -2523,16 +2572,16 @@ agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, con
                     replmsg = NULL;
                 }
             }
-            PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) %s%sLDAP error: %s%s%s%s%s",
+            PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d)%s%sLDAP error: %s%s%s%s%s",
                         ldaprc, message ? message : "", message ? "" : " - ",
                         slapi_err2string(ldaprc), replmsg ? " - " : "", replmsg ? replmsg : "",
                         connrc ? " - " : "", connrc ? connmsg : "");
+            agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);
         }
         /* ldaprc == LDAP_SUCCESS */
         else if (replrc != 0) {
             if (replrc == NSDS50_REPL_REPLICA_RELEASE_SUCCEEDED) {
-                PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) %s",
-                            ldaprc, "Replication session successful");
+                PR_snprintf(ra->last_init_status, STATUS_LEN, "Replication session successful");
             } else if (replrc == NSDS50_REPL_DISABLED) {
                 if (agmt_is_enabled(ra)) {
                     slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "Total update aborted: "
@@ -2543,6 +2592,7 @@ agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, con
                                                                   "Replication agreement for \"%s\" can not be updated while the suffix is disabled.\n"
                                                                   "You must enable it then restart the server for replication to take place).",
                                 replrc, ra->long_name ? ra->long_name : "a replica");
+                    agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);
                 } else {
                     /* You do not need to restart the server after enabling the agreement */
                     slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "Total update aborted: "
@@ -2551,6 +2601,7 @@ agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, con
                     PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) Total update aborted: "
                                                                   "Replication agreement for \"%s\" can not be updated while the agreement is disabled.",
                                 replrc, ra->long_name ? ra->long_name : "a replica");
+                    agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);
                 }
             } else {
                 PR_snprintf(ra->last_init_status, STATUS_LEN,
@@ -2558,19 +2609,21 @@ agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, con
                             replrc, protocol_response2string(replrc),
                             message ? " - " : "", message ? message : "",
                             connrc ? " - " : "", connrc ? connmsg : "");
+                agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);
             }
         } else if (connrc != CONN_OPERATION_SUCCESS) {
             PR_snprintf(ra->last_init_status, STATUS_LEN,
                         "Error (%d) connection error: %s%s%s",
                         connrc, connmsg,
                         message ? " - " : "", message ? message : "");
-        } else if (message != NULL) /* replrc == NSDS50_REPL_REPLICA_READY == 0 */
-        {
+            agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);
+        } else if (message != NULL) { /* replrc == NSDS50_REPL_REPLICA_READY == 0 */
             PR_snprintf(ra->last_init_status, STATUS_LEN,
-                        "Error (%d) %s",
-                        ldaprc, message);
+                        "Error (%d) %s", ldaprc, message);
+            agmt_set_last_init_status_json(ra, STATUS_GOOD, ldaprc, replrc, connrc);
         } else { /* agmt_set_last_init_status(0,0,NULL) to reset agmt */
-            PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d)", ldaprc);
+            ra->last_init_status[0] = '\0';
+            ra->last_init_status_json[0] = '\0';
         }
     }
 }
@@ -2705,10 +2758,20 @@ get_agmt_status(Slapi_PBlock *pb __attribute__((unused)),
         agmt_get_changecount_string(ra, changecount_string, sizeof(changecount_string));
         slapi_entry_add_string(e, "nsds5replicaChangesSentSinceStartup", changecount_string);
         if (ra->last_update_status[0] == '\0') {
+            char status_msg[STATUS_LEN];
+            char ts[SLAPI_TIMESTAMP_BUFSIZE];
+            time_t now;
+            time(&now);
+            strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now));
             slapi_entry_add_string(e, "nsds5replicaLastUpdateStatus",
                                    "Error (0) No replication sessions started since server startup");
+            PR_snprintf(status_msg, STATUS_LEN,
+                    "{\"state\": \"green\", \"ldap_rc\": \"0\", \"ldap_rc_text\": \"success\", \"repl_rc\": \"0\", \"repl_rc_text\": \"replica acquired\", "
+                    "\"date\": \"%s\", \"message\": \"Error (0) No replication sessions started since server startup\"}", ts);
+            slapi_entry_add_string(e, "nsds5replicaLastUpdateStatusJSON", status_msg);
         } else {
             slapi_entry_add_string(e, "nsds5replicaLastUpdateStatus", ra->last_update_status);
+            slapi_entry_add_string(e, "nsds5replicaLastUpdateStatusJSON", ra->last_update_status_json);
         }
         slapi_entry_add_string(e, "nsds5replicaUpdateInProgress", ra->update_in_progress ? "TRUE" : "FALSE");
 
@@ -2724,6 +2787,7 @@ get_agmt_status(Slapi_PBlock *pb __attribute__((unused)),
 
         if (ra->last_init_status[0] != '\0') {
             slapi_entry_add_string(e, "nsds5replicaLastInitStatus", ra->last_init_status);
+            slapi_entry_add_string(e, "nsds5replicaLastInitStatusJSON", ra->last_init_status_json);
         }
     }
 bail:
diff --git a/ldap/servers/plugins/replication/repl5_protocol_util.c b/ldap/servers/plugins/replication/repl5_protocol_util.c
index a48d4d02a..bb9f9e7e1 100644
--- a/ldap/servers/plugins/replication/repl5_protocol_util.c
+++ b/ldap/servers/plugins/replication/repl5_protocol_util.c
@@ -374,13 +374,13 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV **ruv)
                                       "has the same Replica ID as this one. "
                                       "Replication is aborting.\n",
                                       agmt_get_long_name(prp->agmt));
-                        agmt_set_last_update_status(prp->agmt, 0, 0,
-                                                    "Unable to aquire replica: the replica has the same "
+                        agmt_set_last_update_status(prp->agmt, 0, NSDS50_REPL_REPLICAID_ERROR,
+                                                    "Unable to acquire replica: the replica has the same "
                                                     "Replica ID as this one. Replication is aborting.");
                         return_value = ACQUIRE_FATAL_ERROR;
                         break;
                     case NSDS50_REPL_BACKOFF:
-                        /* A replication sesssion hook on the replica
+                        /* A replication session hook on the replica
                          * wants us to go into backoff mode. */
                         slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name,
                                       "acquire_replica - "
@@ -487,9 +487,8 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV **ruv)
                           "%s: Unable to obtain current CSN. "
                           "Replication is aborting.\n",
                           agmt_get_long_name(prp->agmt));
-            agmt_set_last_update_status(prp->agmt, 0, 0,
-                                        "Unable to obtain current CSN. "
-                                        "Replication is aborting.");
+            agmt_set_last_update_status(prp->agmt, 0, NSDS50_REPL_INTERNAL_ERROR,
+                                        "Unable to obtain current CSN. Replication is aborting.");
             return_value = ACQUIRE_FATAL_ERROR;
         }
     }
@@ -665,6 +664,8 @@ protocol_response2string(int response)
         return "transient warning";
     case NSDS50_REPL_RUV_ERROR:
         return "RUV error";
+    case NSDS50_REPL_REPLICA_NO_RESPONSE:
+        return "no response received";
     default:
         return "unknown error";
     }
-- 
2.21.0