From 2a397f0ca64505438edf96a15de901bb640cd871 Mon Sep 17 00:00:00 2001
From: Sunil Kumar Acharya <sheggodu@redhat.com>
Date: Wed, 17 May 2017 14:35:41 +0530
Subject: [PATCH 467/473] cluster/ec: Implement FALLOCATE FOP for EC

The FALLOCATE file operation is not implemented in the
existing EC code. This change set implements it
for EC.

>BUG: 1448293
>Change-Id: Id9ed914db984c327c16878a5b2304a0ea461b623
>Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
>Reviewed-on: https://review.gluster.org/15200
>Smoke: Gluster Build System <jenkins@build.gluster.org>
>NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
>Reviewed-by: Pranith Kumar Karampuri <pkarampu@redhat.com>
>CentOS-regression: Gluster Build System <jenkins@build.gluster.org>

BUG: 1447559
Change-Id: Ibeaf1fc8d564e2bef602653894672812c68397fe
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/107051
Reviewed-by: Ashish Pandey <aspandey@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 libglusterfs/src/compat.h               |   8 ++
 tests/basic/ec/ec-fallocate.t           |  72 +++++++++++
 tests/basic/ec/ec-rebalance.t           |  60 ++++++++++
 xlators/cluster/ec/src/ec-fops.h        |   4 +
 xlators/cluster/ec/src/ec-inode-write.c | 203 +++++++++++++++++++++++++++++++-
 xlators/cluster/ec/src/ec.c             |   5 +-
 xlators/storage/posix/src/posix.c       |  10 +-
 7 files changed, 354 insertions(+), 8 deletions(-)
 create mode 100644 tests/basic/ec/ec-fallocate.t
 create mode 100644 tests/basic/ec/ec-rebalance.t

diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h
index ea72202..2738657 100644
--- a/libglusterfs/src/compat.h
+++ b/libglusterfs/src/compat.h
@@ -59,6 +59,12 @@
 #ifndef FALLOC_FL_ZERO_RANGE
 #define FALLOC_FL_ZERO_RANGE    0x10 /* zeroes out range */
 #endif
+#ifndef FALLOC_FL_COLLAPSE_RANGE
+#define FALLOC_FL_COLLAPSE_RANGE  0x08 /* reduces the size */
+#endif
+#ifndef FALLOC_FL_INSERT_RANGE
+#define FALLOC_FL_INSERT_RANGE  0x20 /* expands the size */
+#endif
 
 #ifndef HAVE_LLISTXATTR
 
@@ -177,6 +183,8 @@ enum {
 #define FALLOC_FL_KEEP_SIZE     0x01 /* default is extend size */
 #define FALLOC_FL_PUNCH_HOLE    0x02 /* de-allocates range */
 #define FALLOC_FL_ZERO_RANGE    0x10 /* zeroes out range */
+#define FALLOC_FL_INSERT_RANGE  0x20 /* Expands the size */
+#define FALLOC_FL_COLLAPSE_RANGE 0x08 /* Reduces the size */
 
 #ifndef _PATH_UMOUNT
   #define _PATH_UMOUNT "/sbin/umount"
diff --git a/tests/basic/ec/ec-fallocate.t b/tests/basic/ec/ec-fallocate.t
new file mode 100644
index 0000000..1b827ee
--- /dev/null
+++ b/tests/basic/ec/ec-fallocate.t
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Run several commands to verify basic fallocate functionality. We verify that
+# fallocate creates and allocates blocks to a file. We also verify that the keep
+# size option does not modify the file size.
+###
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+#create and start volume
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume start $V0
+
+#Mount the volume
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+
+# check for fallocate support before continuing the test
+require_fallocate -l 1m -n $M0/file && rm -f $M0/file
+
+# fallocate a file and verify blocks are allocated
+TEST fallocate -l 1m $M0/file
+blksz=`stat -c %b $M0/file`
+nblks=`stat -c %B $M0/file`
+TEST [ $(($blksz * $nblks)) -eq 1048576 ]
+
+TEST unlink $M0/file
+
+# truncate a file to a fixed size, fallocate and verify that the size does not
+# change
+TEST truncate -s 1M $M0/file
+TEST fallocate -l 2m -n $M0/file
+blksz=`stat -c %b $M0/file`
+nblks=`stat -c %B $M0/file`
+sz=`stat -c %s $M0/file`
+TEST [ $sz -eq 1048576 ]
+# Note that gluster currently incorporates a hack to limit the number of blocks
+# reported as allocated to the file by the file size. We have allocated beyond the
+# file size here, so just check for a non-zero allocation; that avoids planting a
+# land mine if/when that behavior changes.
+TEST [ ! $(($blksz * $nblks)) -eq 0 ]
+TEST unlink $M0/file
+
+# write some data, fallocate within and outside the range
+# and check for data corruption.
+TEST dd if=/dev/urandom of=$M0/file bs=1024k count=1
+TEST cp $M0/file $M0/file.copy.pre
+TEST fallocate -o 512k -l 128k $M0/file
+TEST cp $M0/file $M0/file.copy.post
+TEST cmp $M0/file.copy.pre $M0/file.copy.post
+TEST fallocate -o 1000k -l 128k $M0/file
+TEST cp $M0/file $M0/file.copy.post2
+TEST ! cmp $M0/file.copy.pre $M0/file.copy.post2
+TEST truncate -s 1M $M0/file.copy.post2
+TEST cmp $M0/file.copy.pre $M0/file.copy.post2
+TEST unlink $M0/file
+
+#Make sure offset/size are adjusted so that 3 blocks (1536 bytes) are allocated on the brick
+TEST touch $M0/f1
+TEST fallocate -o 1280 -l 1024 $M0/f1
+EXPECT "^2304$" stat -c "%s" $M0/f1
+EXPECT "^1536$" stat -c "%s" $B0/${V0}0/f1
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+
+cleanup;
diff --git a/tests/basic/ec/ec-rebalance.t b/tests/basic/ec/ec-rebalance.t
new file mode 100644
index 0000000..b5c3072
--- /dev/null
+++ b/tests/basic/ec/ec-rebalance.t
@@ -0,0 +1,60 @@
+#!/bin/bash
+#
+# This tests the rebalance failure reported in BUG 1447559
+#
+###
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+#create and start volume
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2}
+TEST $CLI volume start $V0
+
+#Mount the volume
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
+
+# Create files
+for i in {1..10}
+do
+    dd if=/dev/urandom of=$M0/file$i bs=1024k count=1
+done
+
+md5_1=$(md5sum $M0/file1 | awk '{print $1}')
+md5_2=$(md5sum $M0/file2 | awk '{print $1}')
+md5_3=$(md5sum $M0/file3 | awk '{print $1}')
+md5_4=$(md5sum $M0/file4 | awk '{print $1}')
+md5_5=$(md5sum $M0/file5 | awk '{print $1}')
+md5_6=$(md5sum $M0/file6 | awk '{print $1}')
+md5_7=$(md5sum $M0/file7 | awk '{print $1}')
+md5_8=$(md5sum $M0/file8 | awk '{print $1}')
+md5_9=$(md5sum $M0/file9 | awk '{print $1}')
+md5_10=$(md5sum $M0/file10 | awk '{print $1}')
+# Add brick
+TEST $CLI volume add-brick $V0 $H0:$B0/${V0}{3..5}
+
+#Trigger rebalance
+TEST $CLI volume rebalance $V0 start force
+EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0
+
+#Remount to avoid any caches
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
+EXPECT "$md5_1" echo $(md5sum $M0/file1 | awk '{print $1}')
+EXPECT "$md5_2" echo $(md5sum $M0/file2 | awk '{print $1}')
+EXPECT "$md5_3" echo $(md5sum $M0/file3 | awk '{print $1}')
+EXPECT "$md5_4" echo $(md5sum $M0/file4 | awk '{print $1}')
+EXPECT "$md5_5" echo $(md5sum $M0/file5 | awk '{print $1}')
+EXPECT "$md5_6" echo $(md5sum $M0/file6 | awk '{print $1}')
+EXPECT "$md5_7" echo $(md5sum $M0/file7 | awk '{print $1}')
+EXPECT "$md5_8" echo $(md5sum $M0/file8 | awk '{print $1}')
+EXPECT "$md5_9" echo $(md5sum $M0/file9 | awk '{print $1}')
+EXPECT "$md5_10" echo $(md5sum $M0/file10 | awk '{print $1}')
+
+cleanup;
diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h
index 053c6b4..9842f6a 100644
--- a/xlators/cluster/ec/src/ec-fops.h
+++ b/xlators/cluster/ec/src/ec-fops.h
@@ -168,6 +168,10 @@ void ec_symlink(call_frame_t * frame, xlator_t * this, uintptr_t target,
                 const char * linkname, loc_t * loc, mode_t umask,
                 dict_t * xdata);
 
+void ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+              int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd,
+              int32_t mode, off_t offset, size_t len, dict_t *xdata);
+
 void ec_truncate(call_frame_t * frame, xlator_t * this, uintptr_t target,
                  int32_t minimum, fop_truncate_cbk_t func, void *data,
                  loc_t * loc, off_t offset, dict_t * xdata);
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
index 92c6b8a..8b72373 100644
--- a/xlators/cluster/ec/src/ec-inode-write.c
+++ b/xlators/cluster/ec/src/ec-inode-write.c
@@ -827,7 +827,208 @@ out:
     }
 }
 
-/* FOP: truncate */
+/*********************************************************************
+ *
+ * File Operation : fallocate
+ *
+ *********************************************************************/
+
+int32_t ec_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                         int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+                         struct iatt *postbuf, dict_t *xdata)
+{
+    return ec_inode_write_cbk (frame, this, cookie, op_ret, op_errno,
+                                   prebuf, postbuf, xdata);
+}
+
+void ec_wind_fallocate(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
+{
+    ec_trace("WIND", fop, "idx=%d", idx);
+
+    STACK_WIND_COOKIE(fop->frame, ec_fallocate_cbk, (void *)(uintptr_t)idx,
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->fallocate,
+                      fop->fd, fop->int32, fop->offset,
+                      fop->size, fop->xdata);
+}
+
+int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
+{
+    ec_cbk_data_t *cbk = NULL;
+
+    switch (state) {
+    case EC_STATE_INIT:
+        if (fop->size == 0) {
+                ec_fop_set_error(fop, EINVAL);
+                return EC_STATE_REPORT;
+        }
+        if (fop->int32 & (FALLOC_FL_COLLAPSE_RANGE
+                         |FALLOC_FL_INSERT_RANGE
+                         |FALLOC_FL_ZERO_RANGE
+                         |FALLOC_FL_PUNCH_HOLE)) {
+                ec_fop_set_error(fop, ENOTSUP);
+                return EC_STATE_REPORT;
+        }
+        fop->user_size = fop->offset + fop->size;
+        fop->head = ec_adjust_offset (fop->xl->private, &fop->offset, 1);
+        fop->size = ec_adjust_size (fop->xl->private, fop->head + fop->size, 1);
+
+        /* Fall through */
+
+    case EC_STATE_LOCK:
+        ec_lock_prepare_fd(fop, fop->fd,
+                           EC_UPDATE_DATA | EC_UPDATE_META |
+                           EC_QUERY_INFO);
+        ec_lock(fop);
+
+        return EC_STATE_DISPATCH;
+
+    case EC_STATE_DISPATCH:
+
+        ec_dispatch_all(fop);
+
+        return EC_STATE_PREPARE_ANSWER;
+
+    case EC_STATE_PREPARE_ANSWER:
+        cbk = ec_fop_prepare_answer(fop, _gf_false);
+        if (cbk != NULL) {
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
+                            cbk->count);
+
+                /* This shouldn't fail because we have the inode locked. */
+                GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
+                                        &cbk->iatt[0].ia_size));
+
+                /* If mode has FALLOC_FL_KEEP_SIZE, keep the size. */
+                if (fop->int32 & FALLOC_FL_KEEP_SIZE) {
+                        cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+                } else if (fop->user_size > cbk->iatt[0].ia_size) {
+                        cbk->iatt[1].ia_size = fop->user_size;
+
+                        /* This shouldn't fail because we have the inode
+                         * locked. */
+                        GF_ASSERT(ec_set_inode_size(fop,
+                                  fop->locks[0].lock->loc.inode,
+                                            cbk->iatt[1].ia_size));
+                } else {
+                        cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
+                }
+
+        }
+
+        return EC_STATE_REPORT;
+
+    case EC_STATE_REPORT:
+        cbk = fop->answer;
+
+        GF_ASSERT(cbk != NULL);
+
+        if (fop->cbks.fallocate != NULL) {
+                fop->cbks.fallocate(fop->req_frame, fop, fop->xl, cbk->op_ret,
+                                    cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
+                                    cbk->xdata);
+        }
+
+        return EC_STATE_LOCK_REUSE;
+
+    case -EC_STATE_INIT:
+    case -EC_STATE_LOCK:
+    case -EC_STATE_DISPATCH:
+    case -EC_STATE_PREPARE_ANSWER:
+    case -EC_STATE_REPORT:
+        GF_ASSERT(fop->error != 0);
+
+        if (fop->cbks.fallocate != NULL) {
+                fop->cbks.fallocate(fop->req_frame, fop, fop->xl, -1,
+                                    fop->error, NULL, NULL, NULL);
+        }
+
+        return EC_STATE_LOCK_REUSE;
+
+    case -EC_STATE_LOCK_REUSE:
+    case EC_STATE_LOCK_REUSE:
+        ec_lock_reuse(fop);
+
+        return EC_STATE_UNLOCK;
+
+    case -EC_STATE_UNLOCK:
+    case EC_STATE_UNLOCK:
+        ec_unlock(fop);
+
+        return EC_STATE_END;
+
+    default:
+        gf_msg (fop->xl->name, GF_LOG_ERROR, EINVAL,
+                EC_MSG_UNHANDLED_STATE,
+                "Unhandled state %d for %s",
+                state, ec_fop_name(fop->id));
+
+        return EC_STATE_END;
+    }
+}
+
+void ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
+              int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd,
+              int32_t mode, off_t offset, size_t len, dict_t *xdata)
+{
+    ec_cbk_t callback = { .fallocate = func };
+    ec_fop_data_t *fop = NULL;
+    int32_t error = ENOMEM;
+
+    gf_msg_trace ("ec", 0, "EC(FALLOCATE) %p", frame);
+
+    VALIDATE_OR_GOTO(this, out);
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_FALLOCATE, 0, target,
+                               minimum, ec_wind_fallocate, ec_manager_fallocate,
+                               callback, data);
+    if (fop == NULL) {
+        goto out;
+    }
+
+    fop->use_fd = 1;
+    fop->int32 = mode;
+    fop->offset = offset;
+    fop->size = len;
+
+    if (fd != NULL) {
+        fop->fd = fd_ref(fd);
+        if (fop->fd == NULL) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        EC_MSG_FILE_DESC_REF_FAIL,
+                        "Failed to reference a "
+                        "file descriptor.");
+                goto out;
+        }
+    }
+
+    if (xdata != NULL) {
+        fop->xdata = dict_ref(xdata);
+        if (fop->xdata == NULL) {
+                gf_msg (this->name, GF_LOG_ERROR, 0,
+                        EC_MSG_DICT_REF_FAIL,
+                        "Failed to reference a "
+                        "dictionary.");
+                goto out;
+        }
+    }
+
+    error = 0;
+
+out:
+    if (fop != NULL) {
+        ec_manager(fop, error);
+    } else {
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL);
+    }
+}
+
+/*********************************************************************
+ *
+ * File Operation : truncate
+ *
+ *********************************************************************/
 
 int32_t ec_truncate_write(ec_fop_data_t * fop, uintptr_t mask)
 {
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index f687050..bad5578 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -734,10 +734,11 @@ int32_t ec_gf_fentrylk(call_frame_t * frame, xlator_t * this,
 }
 
 int32_t ec_gf_fallocate(call_frame_t * frame, xlator_t * this, fd_t * fd,
-                        int32_t keep_size, off_t offset, size_t len,
+                        int32_t mode, off_t offset, size_t len,
                         dict_t * xdata)
 {
-    default_fallocate_failure_cbk(frame, ENOTSUP);
+    ec_fallocate(frame, this, -1, EC_MINIMUM_MIN, default_fallocate_cbk,
+                 NULL, fd, mode, offset, len, xdata);
 
     return 0;
 }
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
index ed70782..afd0ff8 100644
--- a/xlators/storage/posix/src/posix.c
+++ b/xlators/storage/posix/src/posix.c
@@ -696,11 +696,11 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd,
                 goto out;
         }
 
-	ret = sys_fallocate (pfd->fd, flags, offset, len);
-	if (ret == -1) {
-		ret = -errno;
-		goto out;
-	}
+        ret = sys_fallocate (pfd->fd, flags, offset, len);
+        if (ret == -1) {
+                ret = -errno;
+                goto out;
+        }
 
         ret = posix_fdstat (this, pfd->fd, statpost);
         if (ret == -1) {
-- 
1.8.3.1