233933
From 584ee2dbb8158ee3d3c3f055f1b06ff3d9177192 Mon Sep 17 00:00:00 2001
233933
From: Kotresh HR <khiremat@redhat.com>
233933
Date: Thu, 13 Jun 2019 16:23:21 +0530
233933
Subject: [PATCH 196/221] posix/ctime: Fix ctime upgrade issue
233933
233933
Problem:
233933
On a EC volume, during upgrade from the older version where
233933
ctime feature is not enabled(or not present) to the newer
233933
version where the ctime feature is available (enabled default),
233933
the self heal hangs and doesn't complete.
233933
233933
Cause:
233933
The ctime feature has both client side code (utime) and
233933
server side code (posix). The feature is driven from client.
233933
Only if the client side sets the time in the frame, should
233933
the server side sets the time attributes in xattr. But posix
233933
setattr/fseattr was not doing that. When one of the server
233933
nodes is updated, since ctime is enabled by default, it
233933
starts setting xattr on setattr/fseattr on the updated node/brick.
233933
233933
On a EC volume the first two updated nodes(bricks) are not a
233933
problem because there are 4 other bricks with consistent data.
233933
However once the third brick is updated, the new attribute(mdata xattr)
233933
will cause an inconsistency on metadata on 3 bricks, which
233933
prevents the file to be repaired.
233933
233933
Fix:
233933
Don't create mdata xattr with utimes/utimensat system call.
233933
Only update if already present.
233933
233933
Backport of:
233933
 > Patch: https://review.gluster.org/22858
233933
 > Change-Id: Ieacedecb8a738bb437283ef3e0f042fd49dc4c8c
233933
 > fixes: bz#1720201
233933
 > Signed-off-by: Kotresh HR <khiremat@redhat.com>
233933
233933
Change-Id: Ieacedecb8a738bb437283ef3e0f042fd49dc4c8c
233933
BUG: 1713664
233933
Signed-off-by: Kotresh HR <khiremat@redhat.com>
233933
Reviewed-on: https://code.engineering.redhat.com/gerrit/174238
233933
Tested-by: RHGS Build Bot <nigelb@redhat.com>
233933
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
233933
---
233933
 tests/basic/afr/split-brain-healing.t      |  36 ++++---
233933
 tests/utils/get-mdata-xattr.c              | 152 +++++++++++++++++++++++++++++
233933
 tests/volume.rc                            |  30 ++++++
233933
 xlators/storage/posix/src/posix-metadata.c |  21 ++++
233933
 4 files changed, 223 insertions(+), 16 deletions(-)
233933
 create mode 100644 tests/utils/get-mdata-xattr.c
233933
233933
diff --git a/tests/basic/afr/split-brain-healing.t b/tests/basic/afr/split-brain-healing.t
233933
index c80f900..78553e6 100644
233933
--- a/tests/basic/afr/split-brain-healing.t
233933
+++ b/tests/basic/afr/split-brain-healing.t
233933
@@ -20,11 +20,14 @@ function get_replicate_subvol_number {
233933
 cleanup;
233933
 
233933
 AREQUAL_PATH=$(dirname $0)/../../utils
233933
+GET_MDATA_PATH=$(dirname $0)/../../utils
233933
 CFLAGS=""
233933
 test "`uname -s`" != "Linux" && {
233933
     CFLAGS="$CFLAGS -lintl";
233933
 }
233933
 build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
233933
+build_tester $GET_MDATA_PATH/get-mdata-xattr.c
233933
+
233933
 TEST glusterd
233933
 TEST pidof glusterd
233933
 TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4}
233933
@@ -152,13 +155,13 @@ EXPECT $SMALLER_FILE_SIZE stat -c %s file4
233933
 subvolume=$(get_replicate_subvol_number file5)
233933
 if [ $subvolume == 0 ]
233933
 then
233933
-        mtime1=$(stat -c %Y $B0/${V0}1/file5)
233933
-        mtime2=$(stat -c %Y $B0/${V0}2/file5)
233933
+        mtime1=$(get_mtime $B0/${V0}1/file5)
233933
+        mtime2=$(get_mtime $B0/${V0}2/file5)
233933
         LATEST_MTIME=$(($mtime1 > $mtime2 ? $mtime1:$mtime2))
233933
 elif [ $subvolume == 1 ]
233933
 then
233933
-        mtime1=$(stat -c %Y $B0/${V0}3/file5)
233933
-        mtime2=$(stat -c %Y $B0/${V0}4/file5)
233933
+        mtime1=$(get_mtime $B0/${V0}3/file5)
233933
+        mtime2=$(get_mtime $B0/${V0}4/file5)
233933
         LATEST_MTIME=$(($mtime1 > $mtime2 ? $mtime1:$mtime2))
233933
 fi
233933
 $CLI volume heal $V0 split-brain latest-mtime /file5
233933
@@ -166,12 +169,12 @@ EXPECT "0" echo $?
233933
 
233933
 if [ $subvolume == 0 ]
233933
 then
233933
-        mtime1_after_heal=$(stat -c %Y $B0/${V0}1/file5)
233933
-        mtime2_after_heal=$(stat -c %Y $B0/${V0}2/file5)
233933
+        mtime1_after_heal=$(get_mtime $B0/${V0}1/file5)
233933
+        mtime2_after_heal=$(get_mtime $B0/${V0}2/file5)
233933
 elif [ $subvolume == 1 ]
233933
 then
233933
-        mtime1_after_heal=$(stat -c %Y $B0/${V0}3/file5)
233933
-        mtime2_after_heal=$(stat -c %Y $B0/${V0}4/file5)
233933
+        mtime1_after_heal=$(get_mtime $B0/${V0}3/file5)
233933
+        mtime2_after_heal=$(get_mtime $B0/${V0}4/file5)
233933
 fi
233933
 
233933
 #TODO: To below comparisons on full sub-second resolution
233933
@@ -188,14 +191,14 @@ subvolume=$(get_replicate_subvol_number file6)
233933
 if [ $subvolume == 0 ]
233933
 then
233933
         GFID=$(gf_get_gfid_xattr $B0/${V0}1/file6)
233933
-        mtime1=$(stat -c %Y $B0/${V0}1/file6)
233933
-        mtime2=$(stat -c %Y $B0/${V0}2/file6)
233933
+        mtime1=$(get_mtime $B0/${V0}1/file6)
233933
+        mtime2=$(get_mtime $B0/${V0}2/file6)
233933
         LATEST_MTIME=$(($mtime1 > $mtime2 ? $mtime1:$mtime2))
233933
 elif [ $subvolume == 1 ]
233933
 then
233933
         GFID=$(gf_get_gfid_xattr $B0/${V0}3/file6)
233933
-        mtime1=$(stat -c %Y $B0/${V0}3/file6)
233933
-        mtime2=$(stat -c %Y $B0/${V0}4/file6)
233933
+        mtime1=$(get_mtime $B0/${V0}3/file6)
233933
+        mtime2=$(get_mtime $B0/${V0}4/file6)
233933
         LATEST_MTIME=$(($mtime1 > $mtime2 ? $mtime1:$mtime2))
233933
 fi
233933
 GFIDSTR="gfid:$(gf_gfid_xattr_to_str $GFID)"
233933
@@ -204,12 +207,12 @@ EXPECT "0" echo $?
233933
 
233933
 if [ $subvolume == 0 ]
233933
 then
233933
-        mtime1_after_heal=$(stat -c %Y $B0/${V0}1/file6)
233933
-        mtime2_after_heal=$(stat -c %Y $B0/${V0}2/file6)
233933
+        mtime1_after_heal=$(get_mtime $B0/${V0}1/file6)
233933
+        mtime2_after_heal=$(get_mtime $B0/${V0}2/file6)
233933
 elif [ $subvolume == 1 ]
233933
 then
233933
-        mtime1_after_heal=$(stat -c %Y $B0/${V0}3/file6)
233933
-        mtime2_after_heal=$(stat -c %Y $B0/${V0}4/file6)
233933
+        mtime1_after_heal=$(get_mtime $B0/${V0}3/file6)
233933
+        mtime2_after_heal=$(get_mtime $B0/${V0}4/file6)
233933
 fi
233933
 
233933
 #TODO: To below comparisons on full sub-second resolution
233933
@@ -253,4 +256,5 @@ EXPECT "1" echo $?
233933
 
233933
 cd -
233933
 TEST rm $AREQUAL_PATH/arequal-checksum
233933
+TEST rm $GET_MDATA_PATH/get-mdata-xattr
233933
 cleanup
233933
diff --git a/tests/utils/get-mdata-xattr.c b/tests/utils/get-mdata-xattr.c
233933
new file mode 100644
233933
index 0000000..e9f5471
233933
--- /dev/null
233933
+++ b/tests/utils/get-mdata-xattr.c
233933
@@ -0,0 +1,152 @@
233933
+/*
233933
+   Copyright (c) 2019 Red Hat, Inc. <http://www.redhat.com>
233933
+   This file is part of GlusterFS.
233933
+
233933
+   This file is licensed to you under your choice of the GNU Lesser
233933
+   General Public License, version 3 or any later version (LGPLv3 or
233933
+   later), or the GNU General Public License, version 2 (GPLv2), in all
233933
+   cases as published by the Free Software Foundation.
233933
+*/
233933
+
233933
+#include <stdlib.h>
233933
+#include <endian.h>
233933
+#include <stdio.h>
233933
+#include <time.h>
233933
+#include <string.h>
233933
+#include <inttypes.h>
233933
+#include <sys/types.h>
233933
+#include <sys/xattr.h>
233933
+#include <errno.h>
233933
+
233933
+typedef struct gf_timespec_disk {
233933
+    uint64_t tv_sec;
233933
+    uint64_t tv_nsec;
233933
+} gf_timespec_disk_t;
233933
+
233933
+/* posix_mdata_t on disk structure */
233933
+typedef struct __attribute__((__packed__)) posix_mdata_disk {
233933
+    /* version of structure, bumped up if any new member is added */
233933
+    uint8_t version;
233933
+    /* flags indicates valid fields in the structure */
233933
+    uint64_t flags;
233933
+    gf_timespec_disk_t ctime;
233933
+    gf_timespec_disk_t mtime;
233933
+    gf_timespec_disk_t atime;
233933
+} posix_mdata_disk_t;
233933
+
233933
+/* In memory representation posix metadata xattr */
233933
+typedef struct {
233933
+    /* version of structure, bumped up if any new member is added */
233933
+    uint8_t version;
233933
+    /* flags indicates valid fields in the structure */
233933
+    uint64_t flags;
233933
+    struct timespec ctime;
233933
+    struct timespec mtime;
233933
+    struct timespec atime;
233933
+} posix_mdata_t;
233933
+
233933
+#define GF_XATTR_MDATA_KEY "trusted.glusterfs.mdata"
233933
+
233933
+/* posix_mdata_from_disk converts posix_mdata_disk_t into host byte order
233933
+ */
233933
+static inline void
233933
+posix_mdata_from_disk(posix_mdata_t *out, posix_mdata_disk_t *in)
233933
+{
233933
+    out->version = in->version;
233933
+    out->flags = be64toh(in->flags);
233933
+
233933
+    out->ctime.tv_sec = be64toh(in->ctime.tv_sec);
233933
+    out->ctime.tv_nsec = be64toh(in->ctime.tv_nsec);
233933
+
233933
+    out->mtime.tv_sec = be64toh(in->mtime.tv_sec);
233933
+    out->mtime.tv_nsec = be64toh(in->mtime.tv_nsec);
233933
+
233933
+    out->atime.tv_sec = be64toh(in->atime.tv_sec);
233933
+    out->atime.tv_nsec = be64toh(in->atime.tv_nsec);
233933
+}
233933
+
233933
+/* posix_fetch_mdata_xattr fetches the posix_mdata_t from disk */
233933
+static int
233933
+posix_fetch_mdata_xattr(const char *real_path, posix_mdata_t *metadata)
233933
+{
233933
+    size_t size = -1;
233933
+    char *value = NULL;
233933
+    char gfid_str[64] = {0};
233933
+
233933
+    char *key = GF_XATTR_MDATA_KEY;
233933
+
233933
+    if (!metadata || !real_path) {
233933
+        goto err;
233933
+    }
233933
+
233933
+    /* Get size */
233933
+    size = lgetxattr(real_path, key, NULL, 0);
233933
+    if (size == -1) {
233933
+        goto err;
233933
+    }
233933
+
233933
+    value = calloc(size + 1, sizeof(char));
233933
+    if (!value) {
233933
+        goto err;
233933
+    }
233933
+
233933
+    /* Get xattr value */
233933
+    size = lgetxattr(real_path, key, value, size);
233933
+    if (size == -1) {
233933
+        goto err;
233933
+    }
233933
+    posix_mdata_from_disk(metadata, (posix_mdata_disk_t *)value);
233933
+
233933
+out:
233933
+    if (value)
233933
+        free(value);
233933
+    return 0;
233933
+err:
233933
+    if (value)
233933
+        free(value);
233933
+    return -1;
233933
+}
233933
+
233933
+int
233933
+main(int argc, char *argv[])
233933
+{
233933
+    posix_mdata_t metadata;
233933
+    uint64_t result;
233933
+
233933
+    if (argc != 3) {
233933
+        /*
233933
+        Usage: get_mdata_xattr -c|-m|-a <file-name>
233933
+                       where -c --> ctime
233933
+                             -m --> mtime
233933
+                             -a --> atime
233933
+        */
233933
+        printf("-1");
233933
+        goto err;
233933
+    }
233933
+
233933
+    if (posix_fetch_mdata_xattr(argv[2], &metadata)) {
233933
+        printf("-1");
233933
+        goto err;
233933
+    }
233933
+
233933
+    switch (argv[1][1]) {
233933
+        case 'c':
233933
+            result = metadata.ctime.tv_sec;
233933
+            break;
233933
+        case 'm':
233933
+            result = metadata.mtime.tv_sec;
233933
+            break;
233933
+        case 'a':
233933
+            result = metadata.atime.tv_sec;
233933
+            break;
233933
+        default:
233933
+            printf("-1");
233933
+            goto err;
233933
+    }
233933
+    printf("%" PRIu64, result);
233933
+    fflush(stdout);
233933
+    return 0;
233933
+err:
233933
+    fflush(stdout);
233933
+    return -1;
233933
+}
233933
diff --git a/tests/volume.rc b/tests/volume.rc
233933
index bb400cc..6a78c37 100644
233933
--- a/tests/volume.rc
233933
+++ b/tests/volume.rc
233933
@@ -927,3 +927,33 @@ function number_healer_threads_shd {
233933
    local pid=$(get_shd_mux_pid $1)
233933
    pstack $pid | grep $2 | wc -l
233933
 }
233933
+
233933
+function get_mtime {
233933
+    local time=$(get-mdata-xattr -m $1)
233933
+    if [ $time == "-1" ];
233933
+    then
233933
+        echo $(stat -c %Y $1)
233933
+    else
233933
+        echo $time
233933
+    fi
233933
+}
233933
+
233933
+function get_ctime {
233933
+    local time=$(get-mdata-xattr -c $1)
233933
+    if [ $time == "-1" ];
233933
+    then
233933
+        echo $(stat -c %Z $2)
233933
+    else
233933
+        echo $time
233933
+    fi
233933
+}
233933
+
233933
+function get_atime {
233933
+    local time=$(get-mdata-xattr -a $1)
233933
+    if [ $time == "-1" ];
233933
+    then
233933
+        echo $(stat -c %X $1)
233933
+    else
233933
+        echo $time
233933
+    fi
233933
+}
233933
diff --git a/xlators/storage/posix/src/posix-metadata.c b/xlators/storage/posix/src/posix-metadata.c
233933
index e96f222..5a5e6cd 100644
233933
--- a/xlators/storage/posix/src/posix-metadata.c
233933
+++ b/xlators/storage/posix/src/posix-metadata.c
233933
@@ -416,6 +416,22 @@ posix_set_mdata_xattr(xlator_t *this, const char *real_path, int fd,
233933
                  * still fine as the times would get eventually
233933
                  * accurate.
233933
                  */
233933
+
233933
+                /* Don't create xattr with utimes/utimensat, only update if
233933
+                 * present. This otherwise causes issues during inservice
233933
+                 * upgrade. It causes inconsistent xattr values with in replica
233933
+                 * set. The scenario happens during upgrade where clients are
233933
+                 * older versions (without the ctime feature) and the server is
233933
+                 * upgraded to the new version (with the ctime feature which
233933
+                 * is enabled by default).
233933
+                 */
233933
+
233933
+                if (update_utime) {
233933
+                    UNLOCK(&inode->lock);
233933
+                    GF_FREE(mdata);
233933
+                    return 0;
233933
+                }
233933
+
233933
                 mdata->version = 1;
233933
                 mdata->flags = 0;
233933
                 mdata->ctime.tv_sec = time->tv_sec;
233933
@@ -527,6 +543,11 @@ posix_update_utime_in_mdata(xlator_t *this, const char *real_path, int fd,
233933
 
233933
     priv = this->private;
233933
 
233933
+    /* NOTE:
233933
+     * This routine (utimes) is intentionally allowed for all internal and
233933
+     * external clients even if ctime is not set. This is because AFR and
233933
+     * WORM uses time attributes for it's internal operations
233933
+     */
233933
     if (inode && priv->ctime) {
233933
         if ((valid & GF_SET_ATTR_ATIME) == GF_SET_ATTR_ATIME) {
233933
             tv.tv_sec = stbuf->ia_atime;
233933
-- 
233933
1.8.3.1
233933