From 2a397f0ca64505438edf96a15de901bb640cd871 Mon Sep 17 00:00:00 2001 From: Sunil Kumar Acharya Date: Wed, 17 May 2017 14:35:41 +0530 Subject: [PATCH 467/473] cluster/ec: Implement FALLOCATE FOP for EC The FALLOCATE file operation is not implemented in the existing EC code. This change set implements it for EC. >BUG: 1448293 >Change-Id: Id9ed914db984c327c16878a5b2304a0ea461b623 >Signed-off-by: Sunil Kumar Acharya >Reviewed-on: https://review.gluster.org/15200 >Smoke: Gluster Build System >NetBSD-regression: NetBSD Build System >Reviewed-by: Pranith Kumar Karampuri >CentOS-regression: Gluster Build System BUG: 1447559 Change-Id: Ibeaf1fc8d564e2bef602653894672812c68397fe Signed-off-by: Sunil Kumar Acharya Reviewed-on: https://code.engineering.redhat.com/gerrit/107051 Reviewed-by: Ashish Pandey Reviewed-by: Atin Mukherjee --- libglusterfs/src/compat.h | 8 ++ tests/basic/ec/ec-fallocate.t | 72 +++++++++++ tests/basic/ec/ec-rebalance.t | 60 ++++++++++ xlators/cluster/ec/src/ec-fops.h | 4 + xlators/cluster/ec/src/ec-inode-write.c | 203 +++++++++++++++++++++++++++++++- xlators/cluster/ec/src/ec.c | 5 +- xlators/storage/posix/src/posix.c | 10 +- 7 files changed, 354 insertions(+), 8 deletions(-) create mode 100644 tests/basic/ec/ec-fallocate.t create mode 100644 tests/basic/ec/ec-rebalance.t diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h index ea72202..2738657 100644 --- a/libglusterfs/src/compat.h +++ b/libglusterfs/src/compat.h @@ -59,6 +59,12 @@ #ifndef FALLOC_FL_ZERO_RANGE #define FALLOC_FL_ZERO_RANGE 0x10 /* zeroes out range */ #endif +#ifndef FALLOC_FL_COLLAPSE_RANGE +#define FALLOC_FL_COLLAPSE_RANGE 0x08 /* reduces the size */ +#endif +#ifndef FALLOC_FL_INSERT_RANGE +#define FALLOC_FL_INSERT_RANGE 0x20 /* expands the size */ +#endif #ifndef HAVE_LLISTXATTR @@ -177,6 +183,8 @@ enum { #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ #define FALLOC_FL_ZERO_RANGE 0x10 /* zeroes out
range */ +#define FALLOC_FL_INSERT_RANGE 0x20 /* Expands the size */ +#define FALLOC_FL_COLLAPSE_RANGE 0x08 /* Reduces the size */ #ifndef _PATH_UMOUNT #define _PATH_UMOUNT "/sbin/umount" diff --git a/tests/basic/ec/ec-fallocate.t b/tests/basic/ec/ec-fallocate.t new file mode 100644 index 0000000..1b827ee --- /dev/null +++ b/tests/basic/ec/ec-fallocate.t @@ -0,0 +1,72 @@ +#!/bin/bash +# +# Run several commands to verify basic fallocate functionality. We verify that +# fallocate creates and allocates blocks to a file. We also verify that the keep +# size option does not modify the file size. +### + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +. $(dirname $0)/../../fallocate.rc + +cleanup + +#create and start volume +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2} +TEST $CLI volume start $V0 + +#Mount the volume +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 + +# check for fallocate support before continuing the test +require_fallocate -l 1m -n $M0/file && rm -f $M0/file + +# fallocate a file and verify blocks are allocated (block size * block count) +TEST fallocate -l 1m $M0/file +blksz=`stat -c %B $M0/file` +nblks=`stat -c %b $M0/file` +TEST [ $(($blksz * $nblks)) -eq 1048576 ] + +TEST unlink $M0/file + +# truncate a file to a fixed size, fallocate and verify that the size does not +# change +TEST truncate -s 1M $M0/file +TEST fallocate -l 2m -n $M0/file +blksz=`stat -c %B $M0/file` +nblks=`stat -c %b $M0/file` +sz=`stat -c %s $M0/file` +TEST [ $sz -eq 1048576 ] +# Note that gluster currently incorporates a hack to limit the number of blocks +# reported as allocated to the file by the file size. We have allocated beyond the +# file size here. Just check for non-zero allocation to avoid setting a land mine +# for if/when that behavior might change. +TEST [ !
$(($blksz * $nblks)) -eq 0 ] +TEST unlink $M0/file + +# write some data, fallocate within and outside the range +# and check for data corruption. +TEST dd if=/dev/urandom of=$M0/file bs=1024k count=1 +TEST cp $M0/file $M0/file.copy.pre +TEST fallocate -o 512k -l 128k $M0/file +TEST cp $M0/file $M0/file.copy.post +TEST cmp $M0/file.copy.pre $M0/file.copy.post +TEST fallocate -o 1000k -l 128k $M0/file +TEST cp $M0/file $M0/file.copy.post2 +TEST ! cmp $M0/file.copy.pre $M0/file.copy.post2 +TEST truncate -s 1M $M0/file.copy.post2 +TEST cmp $M0/file.copy.pre $M0/file.copy.post2 +TEST unlink $M0/file + +#Make sure offset/size are modified so that 3 blocks are allocated +TEST touch $M0/f1 +TEST fallocate -o 1280 -l 1024 $M0/f1 +EXPECT "^2304$" stat -c "%s" $M0/f1 +EXPECT "^1536$" stat -c "%s" $B0/${V0}0/f1 + +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + +cleanup; diff --git a/tests/basic/ec/ec-rebalance.t b/tests/basic/ec/ec-rebalance.t new file mode 100644 index 0000000..b5c3072 --- /dev/null +++ b/tests/basic/ec/ec-rebalance.t @@ -0,0 +1,60 @@ +#!/bin/bash +# +# This will test the rebalance failure reported in 1447559 +# +### + +. $(dirname $0)/../../include.rc +. $(dirname $0)/../../volume.rc +.
$(dirname $0)/../../fallocate.rc + +cleanup + +#create and start volume +TEST glusterd +TEST pidof glusterd +TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..2} +TEST $CLI volume start $V0 + +#Mount the volume +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0 + +# Create files +for i in {1..10} +do + dd if=/dev/urandom of=$M0/file$i bs=1024k count=1 +done + +md5_1=$(md5sum $M0/file1 | awk '{print $1}') +md5_2=$(md5sum $M0/file2 | awk '{print $1}') +md5_3=$(md5sum $M0/file3 | awk '{print $1}') +md5_4=$(md5sum $M0/file4 | awk '{print $1}') +md5_5=$(md5sum $M0/file5 | awk '{print $1}') +md5_6=$(md5sum $M0/file6 | awk '{print $1}') +md5_7=$(md5sum $M0/file7 | awk '{print $1}') +md5_8=$(md5sum $M0/file8 | awk '{print $1}') +md5_9=$(md5sum $M0/file9 | awk '{print $1}') +md5_10=$(md5sum $M0/file10 | awk '{print $1}') +# Add brick +TEST $CLI volume add-brick $V0 $H0:$B0/${V0}{3..5} + +#Trigger rebalance +TEST $CLI volume rebalance $V0 start force +EXPECT_WITHIN $REBALANCE_TIMEOUT "completed" rebalance_status_field $V0 + +#Remount to avoid any caches +EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +EXPECT "$md5_1" echo $(md5sum $M0/file1 | awk '{print $1}') +EXPECT "$md5_2" echo $(md5sum $M0/file2 | awk '{print $1}') +EXPECT "$md5_3" echo $(md5sum $M0/file3 | awk '{print $1}') +EXPECT "$md5_4" echo $(md5sum $M0/file4 | awk '{print $1}') +EXPECT "$md5_5" echo $(md5sum $M0/file5 | awk '{print $1}') +EXPECT "$md5_6" echo $(md5sum $M0/file6 | awk '{print $1}') +EXPECT "$md5_7" echo $(md5sum $M0/file7 | awk '{print $1}') +EXPECT "$md5_8" echo $(md5sum $M0/file8 | awk '{print $1}') +EXPECT "$md5_9" echo $(md5sum $M0/file9 | awk '{print $1}') +EXPECT "$md5_10" echo $(md5sum $M0/file10 | awk '{print $1}') + +cleanup; diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h index 053c6b4..9842f6a 100644 ---
a/xlators/cluster/ec/src/ec-fops.h +++ b/xlators/cluster/ec/src/ec-fops.h @@ -168,6 +168,10 @@ void ec_symlink(call_frame_t * frame, xlator_t * this, uintptr_t target, const char * linkname, loc_t * loc, mode_t umask, dict_t * xdata); +void ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target, + int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata); + void ec_truncate(call_frame_t * frame, xlator_t * this, uintptr_t target, int32_t minimum, fop_truncate_cbk_t func, void *data, loc_t * loc, off_t offset, dict_t * xdata); diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c index 92c6b8a..8b72373 100644 --- a/xlators/cluster/ec/src/ec-inode-write.c +++ b/xlators/cluster/ec/src/ec-inode-write.c @@ -827,7 +827,208 @@ out: } } -/* FOP: truncate */ +/********************************************************************* + * + * File Operation : fallocate + * + *********************************************************************/ + +int32_t ec_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + return ec_inode_write_cbk (frame, this, cookie, op_ret, op_errno, + prebuf, postbuf, xdata); +} + +void ec_wind_fallocate(ec_t *ec, ec_fop_data_t *fop, int32_t idx) +{ + ec_trace("WIND", fop, "idx=%d", idx); + + STACK_WIND_COOKIE(fop->frame, ec_fallocate_cbk, (void *)(uintptr_t)idx, + ec->xl_list[idx], ec->xl_list[idx]->fops->fallocate, + fop->fd, fop->int32, fop->offset, + fop->size, fop->xdata); +} + +int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state) +{ + ec_cbk_data_t *cbk = NULL; + + switch (state) { + case EC_STATE_INIT: + if (fop->size == 0) { /* a zero-length request fails with EINVAL */ + ec_fop_set_error(fop, EINVAL); + return EC_STATE_REPORT; + } + if (fop->int32 & (FALLOC_FL_COLLAPSE_RANGE + |FALLOC_FL_INSERT_RANGE + |FALLOC_FL_ZERO_RANGE + |FALLOC_FL_PUNCH_HOLE)) { /* these modes fail with ENOTSUP */ +
ec_fop_set_error(fop, ENOTSUP); + return EC_STATE_REPORT; + } + fop->user_size = fop->offset + fop->size; /* requested end offset, saved before offset/size are adjusted below */ + fop->head = ec_adjust_offset (fop->xl->private, &fop->offset, 1); + fop->size = ec_adjust_size (fop->xl->private, fop->head + fop->size, 1); + + /* Fall through */ + + case EC_STATE_LOCK: + ec_lock_prepare_fd(fop, fop->fd, + EC_UPDATE_DATA | EC_UPDATE_META | + EC_QUERY_INFO); + ec_lock(fop); + + return EC_STATE_DISPATCH; + + case EC_STATE_DISPATCH: + + ec_dispatch_all(fop); + + return EC_STATE_PREPARE_ANSWER; + + case EC_STATE_PREPARE_ANSWER: + cbk = ec_fop_prepare_answer(fop, _gf_false); + if (cbk != NULL) { + ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2, + cbk->count); + + /* This shouldn't fail because we have the inode locked. NOTE(review): the call sits inside GF_ASSERT; confirm GF_ASSERT always evaluates its argument. */ + GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode, + &cbk->iatt[0].ia_size)); + + /* If mode has FALLOC_FL_KEEP_SIZE, keep the size. */ + if (fop->int32 & FALLOC_FL_KEEP_SIZE) { + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; + } else if (fop->user_size > cbk->iatt[0].ia_size) { + cbk->iatt[1].ia_size = fop->user_size; + + /* This shouldn't fail because we have the inode + * locked.
*/ + GF_ASSERT(ec_set_inode_size(fop, + fop->locks[0].lock->loc.inode, + cbk->iatt[1].ia_size)); + } else { + cbk->iatt[1].ia_size = cbk->iatt[0].ia_size; + } + + } + + return EC_STATE_REPORT; + + case EC_STATE_REPORT: + cbk = fop->answer; + + GF_ASSERT(cbk != NULL); + + if (fop->cbks.fallocate != NULL) { + fop->cbks.fallocate(fop->req_frame, fop, fop->xl, cbk->op_ret, + cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1], + cbk->xdata); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_INIT: + case -EC_STATE_LOCK: + case -EC_STATE_DISPATCH: + case -EC_STATE_PREPARE_ANSWER: + case -EC_STATE_REPORT: /* error path: fop->error is expected to be set */ + GF_ASSERT(fop->error != 0); + + if (fop->cbks.fallocate != NULL) { + fop->cbks.fallocate(fop->req_frame, fop, fop->xl, -1, + fop->error, NULL, NULL, NULL); + } + + return EC_STATE_LOCK_REUSE; + + case -EC_STATE_LOCK_REUSE: + case EC_STATE_LOCK_REUSE: + ec_lock_reuse(fop); + + return EC_STATE_UNLOCK; + + case -EC_STATE_UNLOCK: + case EC_STATE_UNLOCK: + ec_unlock(fop); + + return EC_STATE_END; + + default: + gf_msg (fop->xl->name, GF_LOG_ERROR, EINVAL, + EC_MSG_UNHANDLED_STATE, + "Unhandled state %d for %s", + state, ec_fop_name(fop->id)); + + return EC_STATE_END; + } +} + +void ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target, + int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd, + int32_t mode, off_t offset, size_t len, dict_t *xdata) +{ + ec_cbk_t callback = { .fallocate = func }; + ec_fop_data_t *fop = NULL; + int32_t error = ENOMEM; /* reported unless every setup step below succeeds */ + + gf_msg_trace ("ec", 0, "EC(FALLOCATE) %p", frame); + + VALIDATE_OR_GOTO(this, out); + GF_VALIDATE_OR_GOTO(this->name, frame, out); + GF_VALIDATE_OR_GOTO(this->name, this->private, out); + + fop = ec_fop_data_allocate(frame, this, GF_FOP_FALLOCATE, 0, target, + minimum, ec_wind_fallocate, ec_manager_fallocate, + callback, data); + if (fop == NULL) { + goto out; + } + + fop->use_fd = 1; + fop->int32 = mode; + fop->offset = offset; + fop->size = len; + + if (fd != NULL) { + fop->fd = fd_ref(fd); + if (fop->fd
== NULL) { + gf_msg (this->name, GF_LOG_ERROR, 0, + EC_MSG_FILE_DESC_REF_FAIL, + "Failed to reference a " + "file descriptor."); + goto out; + } + } + + if (xdata != NULL) { + fop->xdata = dict_ref(xdata); + if (fop->xdata == NULL) { + gf_msg (this->name, GF_LOG_ERROR, 0, + EC_MSG_DICT_REF_FAIL, + "Failed to reference a " + "dictionary."); + goto out; + } + } + + error = 0; + +out: + if (fop != NULL) { + ec_manager(fop, error); + } else { + func(frame, NULL, this, -1, error, NULL, NULL, NULL); + } +} + +/********************************************************************* + * + * File Operation : truncate + * + *********************************************************************/ int32_t ec_truncate_write(ec_fop_data_t * fop, uintptr_t mask) { diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c index f687050..bad5578 100644 --- a/xlators/cluster/ec/src/ec.c +++ b/xlators/cluster/ec/src/ec.c @@ -734,10 +734,11 @@ int32_t ec_gf_fentrylk(call_frame_t * frame, xlator_t * this, } int32_t ec_gf_fallocate(call_frame_t * frame, xlator_t * this, fd_t * fd, - int32_t keep_size, off_t offset, size_t len, + int32_t mode, off_t offset, size_t len, dict_t * xdata) { - default_fallocate_failure_cbk(frame, ENOTSUP); + ec_fallocate(frame, this, -1, EC_MINIMUM_MIN, default_fallocate_cbk, + NULL, fd, mode, offset, len, xdata); return 0; } diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c index ed70782..afd0ff8 100644 --- a/xlators/storage/posix/src/posix.c +++ b/xlators/storage/posix/src/posix.c @@ -696,11 +696,11 @@ posix_do_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, goto out; } - ret = sys_fallocate (pfd->fd, flags, offset, len); - if (ret == -1) { - ret = -errno; - goto out; - } + ret = sys_fallocate (pfd->fd, flags, offset, len); + if (ret == -1) { + ret = -errno; + goto out; + } ret = posix_fdstat (this, pfd->fd, statpost); if (ret == -1) { -- 1.8.3.1