a3470f
From 8b596eacd72527b55ccff6a26a44014b6cf76b48 Mon Sep 17 00:00:00 2001
a3470f
From: Sunil Kumar Acharya <sheggodu@redhat.com>
a3470f
Date: Wed, 14 Jun 2017 16:28:40 +0530
a3470f
Subject: [PATCH 088/128] cluster/ec: Implement DISCARD FOP for EC
a3470f
a3470f
Updates #254
a3470f
a3470f
This code change implements DISCARD FOP support for
a3470f
EC.
a3470f
a3470f
>BUG: 1461018
a3470f
>Change-Id: I09a9cb2aa9d91ec27add4f422dc9074af5b8b2db
a3470f
>Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
a3470f
a3470f
Upstream Patch: https://review.gluster.org/#/c/17777/
a3470f
a3470f
BUG: 1499865
a3470f
Change-Id: I09a9cb2aa9d91ec27add4f422dc9074af5b8b2db
a3470f
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
a3470f
Reviewed-on: https://code.engineering.redhat.com/gerrit/123694
a3470f
Tested-by: RHGS Build Bot <nigelb@redhat.com>
a3470f
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
a3470f
---
a3470f
 tests/basic/ec/ec-discard.t             | 197 +++++++++++++++++
a3470f
 tests/include.rc                        |   7 +
a3470f
 xlators/cluster/ec/src/ec-common.h      |   3 +-
a3470f
 xlators/cluster/ec/src/ec-fops.h        |   4 +
a3470f
 xlators/cluster/ec/src/ec-helpers.h     |   5 +-
a3470f
 xlators/cluster/ec/src/ec-inode-write.c | 365 ++++++++++++++++++++++++++++----
a3470f
 xlators/cluster/ec/src/ec.c             |   3 +-
a3470f
 7 files changed, 536 insertions(+), 48 deletions(-)
a3470f
 create mode 100644 tests/basic/ec/ec-discard.t
a3470f
a3470f
diff --git a/tests/basic/ec/ec-discard.t b/tests/basic/ec/ec-discard.t
a3470f
new file mode 100644
a3470f
index 0000000..4a44cec
a3470f
--- /dev/null
a3470f
+++ b/tests/basic/ec/ec-discard.t
a3470f
@@ -0,0 +1,197 @@
a3470f
+#!/bin/bash
a3470f
+#
a3470f
+# Test discard functionality
a3470f
+#
a3470f
+# Test that basic discard (hole punch) functionality works via the fallocate
a3470f
+# command line tool. Hole punch deallocates a region of a file, creating a hole
a3470f
+# and a zero-filled data region. We verify that hole punch works, frees blocks
a3470f
+# and that subsequent reads do not read stale data (caches are invalidated).
a3470f
+#
a3470f
+# NOTE: fuse fallocate is known to be broken with regard to cache invalidation
a3470f
+# 	up to 3.9.0 kernels. Therefore, FOPEN_KEEP_CACHE is not used in this
a3470f
+#	test (opens will invalidate the fuse cache).
a3470f
+###
a3470f
+
a3470f
+. $(dirname $0)/../../include.rc
a3470f
+. $(dirname $0)/../../fallocate.rc
a3470f
+. $(dirname $0)/../../volume.rc
a3470f
+
a3470f
+cleanup
a3470f
+
a3470f
+#create and start volume
a3470f
+TEST glusterd
a3470f
+TEST pidof glusterd
a3470f
+TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5}
a3470f
+TEST $CLI volume set $V0 disperse.optimistic-change-log on
a3470f
+TEST $CLI volume start $V0
a3470f
+
a3470f
+#Mount the volume
a3470f
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
a3470f
+
a3470f
+#Check for fallocate and hole punch support
a3470f
+require_fallocate -l 1m $M0/file
a3470f
+require_fallocate -p -l 512k $M0/file && rm -f $M0/file
a3470f
+
a3470f
+#Write some data, punch a hole and verify the file content changes
a3470f
+TEST dd if=/dev/urandom of=$M0/file bs=1024k count=1
a3470f
+TEST cp $M0/file $M0/file.copy.pre
a3470f
+TEST fallocate -p -o 512k -l 128k $M0/file
a3470f
+TEST ! cmp $M0/file.copy.pre $M0/file
a3470f
+TEST rm -f $M0/file $M0/file.copy.pre
a3470f
+
a3470f
+#Allocate some blocks, punch a hole and verify block allocation
a3470f
+TEST fallocate -l 1m $M0/file
a3470f
+blksz=`stat -c %B $M0/file`
a3470f
+nblks=`stat -c %b $M0/file`
a3470f
+TEST [ $(($blksz * $nblks)) -ge 1048576 ]
a3470f
+TEST fallocate -p -o 512k -l 128k $M0/file
a3470f
+nblks=`stat -c %b $M0/file`
a3470f
+TEST [ $(($blksz * $nblks)) -lt $((933889)) ]
a3470f
+TEST unlink $M0/file
a3470f
+
a3470f
+###Punch hole test cases without fallocate
a3470f
+##With write
a3470f
+#Touching starting boundary
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 0 -l 500 $B0/test_file
a3470f
+TEST fallocate -p -o 0 -l 500 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+#Touching boundary
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 500 -l 1548 $B0/test_file
a3470f
+TEST fallocate -p -o 500 -l 1548 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+#Not touching boundary
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 500 -l 1000 $B0/test_file
a3470f
+TEST fallocate -p -o 500 -l 1000 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+#Over boundary
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 1500 -l 1000 $B0/test_file
a3470f
+TEST fallocate -p -o 1500 -l 1000 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+###Punch hole test cases with fallocate
a3470f
+##Without write
a3470f
+
a3470f
+#Zero size
a3470f
+TEST dd if=/dev/urandom of=$M0/test_file bs=1024 count=8
a3470f
+TEST ! fallocate -p -o 1500 -l 0 $M0/test_file
a3470f
+
a3470f
+#Negative size
a3470f
+TEST ! fallocate -p -o 1500 -l -100 $M0/test_file
a3470f
+TEST rm -f $M0/test_file
a3470f
+
a3470f
+#Touching boundary
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 2048 -l 2048 $B0/test_file
a3470f
+TEST fallocate -p -o 2048 -l 2048 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+#Touching boundary, multiple stripes
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 2048 -l 4096 $B0/test_file
a3470f
+TEST fallocate -p -o 2048 -l 4096 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+##With write
a3470f
+
a3470f
+#Size ends in boundary
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 600 -l 3496 $B0/test_file
a3470f
+TEST fallocate -p -o 600 -l 3496 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+#Offset at boundary
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 2048 -l 3072 $B0/test_file
a3470f
+TEST fallocate -p -o 2048 -l 3072 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+#Offset and Size not at boundary
a3470f
+TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8
a3470f
+TEST cp $B0/test_file $M0/test_file
a3470f
+TEST fallocate -p -o 1000 -l 3072 $B0/test_file
a3470f
+TEST fallocate -p -o 1000 -l 3072 $M0/test_file
a3470f
+TEST md5_sum=`get_md5_sum $B0/test_file`
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+#TEST rm -f $B0/test_file $M0/test_file
a3470f
+
a3470f
+#Data Corruption Tests
a3470f
+#Kill brick1 and brick2
a3470f
+TEST kill_brick $V0 $H0 $B0/${V0}0
a3470f
+TEST kill_brick $V0 $H0 $B0/${V0}1
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
a3470f
+
a3470f
+#Unmount and mount
a3470f
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
a3470f
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
a3470f
+
a3470f
+#verify md5 sum
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+
a3470f
+#Bring up the bricks
a3470f
+TEST $CLI volume start $V0 force
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
a3470f
+
a3470f
+#Kill brick3 and brick4
a3470f
+TEST kill_brick $V0 $H0 $B0/${V0}2
a3470f
+TEST kill_brick $V0 $H0 $B0/${V0}3
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
a3470f
+
a3470f
+#Unmount and mount
a3470f
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
a3470f
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
a3470f
+
a3470f
+#verify md5 sum
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+
a3470f
+#Bring up the bricks
a3470f
+TEST $CLI volume start $V0 force
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
a3470f
+
a3470f
+#Kill brick5 and brick6
a3470f
+TEST kill_brick $V0 $H0 $B0/${V0}4
a3470f
+TEST kill_brick $V0 $H0 $B0/${V0}5
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
a3470f
+
a3470f
+#Unmount and mount
a3470f
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0;
a3470f
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
a3470f
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "4" ec_child_up_count $V0 0
a3470f
+
a3470f
+#verify md5 sum
a3470f
+EXPECT $md5_sum get_md5_sum $M0/test_file
a3470f
+
a3470f
+cleanup
a3470f
diff --git a/tests/include.rc b/tests/include.rc
a3470f
index 7470ea1..45392e0 100644
a3470f
--- a/tests/include.rc
a3470f
+++ b/tests/include.rc
a3470f
@@ -1229,3 +1229,10 @@ function STAT_INO()
a3470f
                 echo 0
a3470f
         fi
a3470f
 }
a3470f
+
a3470f
+function get_md5_sum()
a3470f
+{
a3470f
+    local file=$1;
a3470f
+    md5_sum=$(md5sum $file | awk '{print $1}');
a3470f
+    echo $md5_sum
a3470f
+}
a3470f
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
a3470f
index 1a947cc..0f7a252 100644
a3470f
--- a/xlators/cluster/ec/src/ec-common.h
a3470f
+++ b/xlators/cluster/ec/src/ec-common.h
a3470f
@@ -85,6 +85,8 @@ void ec_update_good(ec_fop_data_t *fop, uintptr_t good);
a3470f
 
a3470f
 void ec_fop_set_error(ec_fop_data_t *fop, int32_t error);
a3470f
 
a3470f
+void __ec_fop_set_error(ec_fop_data_t *fop, int32_t error);
a3470f
+
a3470f
 ec_cbk_data_t *
a3470f
 ec_fop_prepare_answer(ec_fop_data_t *fop, gf_boolean_t ro);
a3470f
 
a3470f
@@ -133,5 +135,4 @@ ec_heal_inspect (call_frame_t *frame, ec_t *ec,
a3470f
                  gf_boolean_t *need_heal);
a3470f
 int32_t
a3470f
 ec_get_heal_info (xlator_t *this, loc_t *loc, dict_t **dict);
a3470f
-
a3470f
 #endif /* __EC_COMMON_H__ */
a3470f
diff --git a/xlators/cluster/ec/src/ec-fops.h b/xlators/cluster/ec/src/ec-fops.h
a3470f
index fab22d8..4a926cf 100644
a3470f
--- a/xlators/cluster/ec/src/ec-fops.h
a3470f
+++ b/xlators/cluster/ec/src/ec-fops.h
a3470f
@@ -172,6 +172,10 @@ void ec_fallocate(call_frame_t *frame, xlator_t *this, uintptr_t target,
a3470f
               int32_t minimum, fop_fallocate_cbk_t func, void *data, fd_t *fd,
a3470f
               int32_t mode, off_t offset, size_t len, dict_t *xdata);
a3470f
 
a3470f
+void ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
a3470f
+                int32_t minimum, fop_discard_cbk_t func, void *data, fd_t *fd,
a3470f
+                off_t offset, size_t len, dict_t *xdata);
a3470f
+
a3470f
 void ec_truncate(call_frame_t * frame, xlator_t * this, uintptr_t target,
a3470f
                  int32_t minimum, fop_truncate_cbk_t func, void *data,
a3470f
                  loc_t * loc, off_t offset, dict_t * xdata);
a3470f
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
a3470f
index cfd7daa..a8f153a 100644
a3470f
--- a/xlators/cluster/ec/src/ec-helpers.h
a3470f
+++ b/xlators/cluster/ec/src/ec-helpers.h
a3470f
@@ -178,8 +178,5 @@ ec_is_data_fop (glusterfs_fop_t fop);
a3470f
 
a3470f
 int32_t
a3470f
 ec_launch_replace_heal (ec_t *ec);
a3470f
-/*
a3470f
-gf_boolean_t
a3470f
-ec_is_metadata_fop (glusterfs_fop_t fop);
a3470f
-*/
a3470f
+
a3470f
 #endif /* __EC_HELPERS_H__ */
a3470f
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
a3470f
index e6a67cf..ae51202 100644
a3470f
--- a/xlators/cluster/ec/src/ec-inode-write.c
a3470f
+++ b/xlators/cluster/ec/src/ec-inode-write.c
a3470f
@@ -19,6 +19,97 @@
a3470f
 #include "ec-method.h"
a3470f
 #include "ec-fops.h"
a3470f
 
a3470f
+int32_t
a3470f
+ec_update_writev_cbk (call_frame_t *frame, void *cookie,
a3470f
+                      xlator_t *this, int32_t op_ret, int32_t op_errno,
a3470f
+                      struct iatt *prebuf, struct iatt *postbuf,
a3470f
+                      dict_t *xdata)
a3470f
+{
a3470f
+    ec_fop_data_t *fop    = cookie;
a3470f
+    ec_cbk_data_t *cbk    = NULL;
a3470f
+    ec_fop_data_t *parent = fop->parent;
a3470f
+    int           i       = 0;
a3470f
+
a3470f
+    ec_trace("UPDATE_WRITEV_CBK", cookie, "ret=%d, errno=%d, parent-fop=%s",
a3470f
+             op_ret, op_errno, ec_fop_name (parent->id));
a3470f
+
a3470f
+    if (op_ret < 0) {
a3470f
+            ec_fop_set_error (parent, op_errno);
a3470f
+            goto out;
a3470f
+    }
a3470f
+    cbk = ec_cbk_data_allocate (parent->frame, this, parent,
a3470f
+                                parent->id, 0, op_ret, op_errno);
a3470f
+    if (!cbk) {
a3470f
+            ec_fop_set_error (parent, ENOMEM);
a3470f
+            goto out;
a3470f
+    }
a3470f
+
a3470f
+    if (xdata)
a3470f
+            cbk->xdata = dict_ref (xdata);
a3470f
+
a3470f
+    if (prebuf)
a3470f
+            cbk->iatt[i++] = *prebuf;
a3470f
+
a3470f
+    if (postbuf)
a3470f
+            cbk->iatt[i++] = *postbuf;
a3470f
+
a3470f
+    LOCK (&parent->lock);
a3470f
+    {
a3470f
+            parent->good &= fop->good;
a3470f
+
a3470f
+            if (gf_bits_count (parent->good) < parent->minimum) {
a3470f
+                    __ec_fop_set_error (parent, EIO);
a3470f
+            } else if (fop->error == 0 && parent->answer == NULL) {
a3470f
+                    parent->answer = cbk;
a3470f
+            }
a3470f
+    }
a3470f
+    UNLOCK (&parent->lock);
a3470f
+out:
a3470f
+    return 0;
a3470f
+}
a3470f
+
a3470f
+int32_t ec_update_write(ec_fop_data_t *fop, uintptr_t mask, off_t offset,
a3470f
+                        size_t size)
a3470f
+{
a3470f
+    struct iobref *iobref = NULL;
a3470f
+    struct iobuf *iobuf = NULL;
a3470f
+    struct iovec vector;
a3470f
+    int32_t err = -ENOMEM;
a3470f
+
a3470f
+    iobref = iobref_new();
a3470f
+    if (iobref == NULL) {
a3470f
+        goto out;
a3470f
+    }
a3470f
+    iobuf = iobuf_get(fop->xl->ctx->iobuf_pool);
a3470f
+    if (iobuf == NULL) {
a3470f
+        goto out;
a3470f
+    }
a3470f
+    err = iobref_add(iobref, iobuf);
a3470f
+    if (err != 0) {
a3470f
+        goto out;
a3470f
+    }
a3470f
+
a3470f
+    vector.iov_base = iobuf->ptr;
a3470f
+    vector.iov_len = size;
a3470f
+    memset(vector.iov_base, 0, vector.iov_len);
a3470f
+
a3470f
+    ec_writev(fop->frame, fop->xl, mask, fop->minimum,
a3470f
+              ec_update_writev_cbk, NULL, fop->fd, &vector, 1,
a3470f
+              offset, 0, iobref, NULL);
a3470f
+
a3470f
+    err = 0;
a3470f
+
a3470f
+out:
a3470f
+    if (iobuf != NULL) {
a3470f
+        iobuf_unref(iobuf);
a3470f
+    }
a3470f
+    if (iobref != NULL) {
a3470f
+        iobref_unref(iobref);
a3470f
+    }
a3470f
+
a3470f
+    return err;
a3470f
+}
a3470f
+
a3470f
 int
a3470f
 ec_inode_write_cbk (call_frame_t *frame, xlator_t *this, void *cookie,
a3470f
                     int op_ret, int op_errno, struct iatt *prestat,
a3470f
@@ -1034,62 +1125,252 @@ out:
a3470f
     }
a3470f
 }
a3470f
 
a3470f
-int32_t
a3470f
-ec_truncate_writev_cbk (call_frame_t *frame, void *cookie,
a3470f
-                        xlator_t *this, int32_t op_ret, int32_t op_errno,
a3470f
-                        struct iatt *prebuf, struct iatt *postbuf,
a3470f
-                        dict_t *xdata)
a3470f
+/*********************************************************************
a3470f
+ *
a3470f
+ * File Operation : Discard
a3470f
+ *
a3470f
+ *********************************************************************/
a3470f
+void ec_update_discard_write(ec_fop_data_t *fop, uintptr_t mask)
a3470f
 {
a3470f
-    ec_fop_data_t *fop = cookie;
a3470f
+    ec_t   *ec       = fop->xl->private;
a3470f
+    off_t  off_head  = 0;
a3470f
+    off_t  off_tail  = 0;
a3470f
+    size_t size_head = 0;
a3470f
+    size_t size_tail = 0;
a3470f
+    int    error     = 0;
a3470f
+
a3470f
+    off_head = fop->offset * ec->fragments - fop->int32;
a3470f
+    if (fop->size == 0) {
a3470f
+            error = ec_update_write (fop, mask, off_head, fop->user_size);
a3470f
+    } else {
a3470f
+            size_head = fop->int32;
a3470f
+            size_tail = (fop->user_size - fop->int32) % ec->stripe_size;
a3470f
+            off_tail = off_head + fop->user_size - size_tail;
a3470f
+            if (size_head) {
a3470f
+                    error = ec_update_write (fop, mask, off_head, size_head);
a3470f
+            }
a3470f
+            /* Head and tail are separate partial stripes: zero both. */
a3470f
+            if (!error && size_tail) {
a3470f
+                    error = ec_update_write (fop, mask, off_tail, size_tail);
a3470f
+            }
a3470f
+    }
a3470f
+    /* error is negative on failure (e.g. -ENOMEM); record it negated. */
a3470f
+    if (error)
a3470f
+            ec_fop_set_error (fop, -error);
a3470f
+}
a3470f
 
a3470f
-    fop->parent->good &= fop->good;
a3470f
-    ec_trace("TRUNCATE_WRITEV_CBK", cookie, "ret=%d, errno=%d",
a3470f
-             op_ret, op_errno);
a3470f
-    return 0;
a3470f
+void ec_discard_adjust_offset_size(ec_fop_data_t *fop)
a3470f
+{
a3470f
+        ec_t *ec = fop->xl->private;
a3470f
+
a3470f
+        fop->user_size = fop->size;
a3470f
+        /* If discard length covers at least a fragment on brick, we will
a3470f
+         * perform discard operation(when fop->size is non-zero) else we just
a3470f
+         * write zeros.
a3470f
+         */
a3470f
+        fop->int32 = ec_adjust_offset_up(ec, &fop->offset, _gf_true);
a3470f
+        if (fop->size < fop->int32) {
a3470f
+                fop->size = 0;
a3470f
+        } else {
a3470f
+                fop->size -= fop->int32;
a3470f
+                ec_adjust_size_down(ec, &fop->size, _gf_true);
a3470f
+        }
a3470f
 }
a3470f
 
a3470f
-int32_t ec_truncate_write(ec_fop_data_t * fop, uintptr_t mask)
a3470f
+int32_t ec_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
a3470f
+                       int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
a3470f
+                       struct iatt *postbuf, dict_t *xdata)
a3470f
 {
a3470f
-    ec_t * ec = fop->xl->private;
a3470f
-    struct iobref * iobref = NULL;
a3470f
-    struct iobuf * iobuf = NULL;
a3470f
-    struct iovec vector;
a3470f
-    int32_t err = -ENOMEM;
a3470f
+    return ec_inode_write_cbk (frame, this, cookie, op_ret, op_errno,
a3470f
+                               prebuf, postbuf, xdata);
a3470f
+}
a3470f
 
a3470f
-    iobref = iobref_new();
a3470f
-    if (iobref == NULL) {
a3470f
-        goto out;
a3470f
-    }
a3470f
-    iobuf = iobuf_get(fop->xl->ctx->iobuf_pool);
a3470f
-    if (iobuf == NULL) {
a3470f
-        goto out;
a3470f
+void ec_wind_discard(ec_t *ec, ec_fop_data_t *fop, int32_t idx)
a3470f
+{
a3470f
+    ec_trace("WIND", fop, "idx=%d", idx);
a3470f
+
a3470f
+    STACK_WIND_COOKIE(fop->frame, ec_discard_cbk, (void *)(uintptr_t)idx,
a3470f
+                      ec->xl_list[idx], ec->xl_list[idx]->fops->discard,
a3470f
+                      fop->fd, fop->offset, fop->size, fop->xdata);
a3470f
+}
a3470f
+
a3470f
+int32_t ec_manager_discard(ec_fop_data_t *fop, int32_t state)
a3470f
+{
a3470f
+    ec_cbk_data_t *cbk     = NULL;
a3470f
+    off_t         fl_start = 0;
a3470f
+    size_t        fl_size  = 0;
a3470f
+
a3470f
+
a3470f
+    switch (state) {
a3470f
+    case EC_STATE_INIT:
a3470f
+        if ((fop->size <= 0) || (fop->offset < 0)) {
a3470f
+                ec_fop_set_error(fop, EINVAL);
a3470f
+                return EC_STATE_REPORT;
a3470f
+        }
a3470f
+        /* Because of the head/tail writes, "discard" happens on the remaining
a3470f
+         * regions, but we need to compute region including head/tail writes
a3470f
+         * so compute them separately*/
a3470f
+        fl_start = fop->offset;
a3470f
+        fl_size = fop->size;
a3470f
+        fl_size += ec_adjust_offset_down (fop->xl->private, &fl_start,
a3470f
+                                          _gf_true);
a3470f
+        ec_adjust_size_up (fop->xl->private, &fl_size, _gf_true);
a3470f
+
a3470f
+        ec_discard_adjust_offset_size(fop);
a3470f
+
a3470f
+    /* Fall through */
a3470f
+
a3470f
+    case EC_STATE_LOCK:
a3470f
+        ec_lock_prepare_fd(fop, fop->fd,
a3470f
+                           EC_UPDATE_DATA | EC_UPDATE_META |
a3470f
+                           EC_QUERY_INFO, fl_start, fl_size);
a3470f
+        ec_lock(fop);
a3470f
+
a3470f
+        return EC_STATE_DISPATCH;
a3470f
+
a3470f
+    case EC_STATE_DISPATCH:
a3470f
+
a3470f
+        /* Dispatch discard fop only if we have whole fragment
a3470f
+         * to deallocate */
a3470f
+        if (fop->size) {
a3470f
+                ec_dispatch_all(fop);
a3470f
+                return EC_STATE_DELAYED_START;
a3470f
+        } else {
a3470f
+                /*Assume discard to have succeeded on mask*/
a3470f
+                fop->good = fop->mask;
a3470f
+        }
a3470f
+
a3470f
+        /* Fall through */
a3470f
+
a3470f
+    case EC_STATE_DELAYED_START:
a3470f
+
a3470f
+        if (fop->size) {
a3470f
+                if (fop->answer && fop->answer->op_ret == 0)
a3470f
+                        ec_update_discard_write (fop, fop->answer->mask);
a3470f
+        } else {
a3470f
+                ec_update_discard_write (fop, fop->mask);
a3470f
+        }
a3470f
+
a3470f
+        return EC_STATE_PREPARE_ANSWER;
a3470f
+
a3470f
+    case EC_STATE_PREPARE_ANSWER:
a3470f
+        cbk = ec_fop_prepare_answer(fop, _gf_false);
a3470f
+        if (cbk != NULL) {
a3470f
+                ec_iatt_rebuild(fop->xl->private, cbk->iatt, 2,
a3470f
+                                cbk->count);
a3470f
+
a3470f
+                /* This shouldn't fail because we have the inode locked. */
a3470f
+                GF_ASSERT(ec_get_inode_size(fop, fop->locks[0].lock->loc.inode,
a3470f
+                                            &cbk->iatt[0].ia_size));
a3470f
+
a3470f
+                cbk->iatt[1].ia_size = cbk->iatt[0].ia_size;
a3470f
+        }
a3470f
+        return EC_STATE_REPORT;
a3470f
+
a3470f
+    case EC_STATE_REPORT:
a3470f
+        cbk = fop->answer;
a3470f
+
a3470f
+        GF_ASSERT(cbk != NULL);
a3470f
+
a3470f
+        if (fop->cbks.discard != NULL) {
a3470f
+                fop->cbks.discard(fop->req_frame, fop, fop->xl, cbk->op_ret,
a3470f
+                                  cbk->op_errno, &cbk->iatt[0], &cbk->iatt[1],
a3470f
+                                  cbk->xdata);
a3470f
+        }
a3470f
+
a3470f
+        return EC_STATE_LOCK_REUSE;
a3470f
+
a3470f
+    case -EC_STATE_INIT:
a3470f
+    case -EC_STATE_LOCK:
a3470f
+    case -EC_STATE_DISPATCH:
a3470f
+    case -EC_STATE_DELAYED_START:
a3470f
+    case -EC_STATE_PREPARE_ANSWER:
a3470f
+    case -EC_STATE_REPORT:
a3470f
+        GF_ASSERT(fop->error != 0);
a3470f
+
a3470f
+        if (fop->cbks.discard != NULL) {
a3470f
+                fop->cbks.discard(fop->req_frame, fop, fop->xl, -1,
a3470f
+                                  fop->error, NULL, NULL, NULL);
a3470f
+        }
a3470f
+
a3470f
+        return EC_STATE_LOCK_REUSE;
a3470f
+
a3470f
+    case -EC_STATE_LOCK_REUSE:
a3470f
+    case EC_STATE_LOCK_REUSE:
a3470f
+        ec_lock_reuse(fop);
a3470f
+
a3470f
+        return EC_STATE_UNLOCK;
a3470f
+
a3470f
+    case -EC_STATE_UNLOCK:
a3470f
+    case EC_STATE_UNLOCK:
a3470f
+        ec_unlock(fop);
a3470f
+
a3470f
+        return EC_STATE_END;
a3470f
+
a3470f
+    default:
a3470f
+        gf_msg (fop->xl->name, GF_LOG_ERROR, EINVAL,
a3470f
+                EC_MSG_UNHANDLED_STATE,
a3470f
+                "Unhandled state %d for %s",
a3470f
+                state, ec_fop_name(fop->id));
a3470f
+
a3470f
+        return EC_STATE_END;
a3470f
     }
a3470f
-    err = iobref_add(iobref, iobuf);
a3470f
-    if (err != 0) {
a3470f
+}
a3470f
+
a3470f
+void ec_discard(call_frame_t *frame, xlator_t *this, uintptr_t target,
a3470f
+                int32_t minimum, fop_discard_cbk_t func, void *data, fd_t *fd,
a3470f
+                off_t offset, size_t len, dict_t *xdata)
a3470f
+{
a3470f
+    ec_cbk_t callback = { .discard = func };
a3470f
+    ec_fop_data_t *fop = NULL;
a3470f
+    int32_t error = ENOMEM;
a3470f
+
a3470f
+    gf_msg_trace ("ec", 0, "EC(DISCARD) %p", frame);
a3470f
+
a3470f
+    VALIDATE_OR_GOTO(this, out);
a3470f
+    GF_VALIDATE_OR_GOTO(this->name, frame, out);
a3470f
+    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
a3470f
+
a3470f
+    fop = ec_fop_data_allocate(frame, this, GF_FOP_DISCARD, 0, target,
a3470f
+                               minimum, ec_wind_discard, ec_manager_discard,
a3470f
+                               callback, data);
a3470f
+    if (fop == NULL) {
a3470f
         goto out;
a3470f
     }
a3470f
 
a3470f
-    vector.iov_base = iobuf->ptr;
a3470f
-    vector.iov_len = fop->offset * ec->fragments - fop->user_size;
a3470f
-    memset(vector.iov_base, 0, vector.iov_len);
a3470f
+    fop->use_fd = 1;
a3470f
+    fop->offset = offset;
a3470f
+    fop->size = len;
a3470f
 
a3470f
-    iobuf_unref (iobuf);
a3470f
-    iobuf = NULL;
a3470f
+    if (fd != NULL) {
a3470f
+        fop->fd = fd_ref(fd);
a3470f
+    }
a3470f
 
a3470f
-    ec_writev(fop->frame, fop->xl, mask, fop->minimum, ec_truncate_writev_cbk,
a3470f
-              NULL, fop->fd, &vector, 1, fop->user_size, 0, iobref, NULL);
a3470f
+    if (xdata != NULL) {
a3470f
+        fop->xdata = dict_ref(xdata);
a3470f
+    }
a3470f
 
a3470f
-    err = 0;
a3470f
+    error = 0;
a3470f
 
a3470f
 out:
a3470f
-    if (iobuf != NULL) {
a3470f
-        iobuf_unref(iobuf);
a3470f
-    }
a3470f
-    if (iobref != NULL) {
a3470f
-        iobref_unref(iobref);
a3470f
+    if (fop != NULL) {
a3470f
+        ec_manager(fop, error);
a3470f
+    } else {
a3470f
+        func(frame, NULL, this, -1, error, NULL, NULL, NULL);
a3470f
     }
a3470f
+}
a3470f
 
a3470f
-    return err;
a3470f
+/*********************************************************************
a3470f
+ *
a3470f
+ * File Operation : truncate
a3470f
+ *
a3470f
+ *********************************************************************/
a3470f
+
a3470f
+int32_t ec_update_truncate_write (ec_fop_data_t *fop, uintptr_t mask)
a3470f
+{
a3470f
+        ec_t *ec = fop->xl->private;
a3470f
+        size_t size = fop->offset * ec->fragments - fop->user_size;
a3470f
+        return ec_update_write (fop, mask, fop->user_size, size);
a3470f
 }
a3470f
 
a3470f
 int32_t ec_truncate_open_cbk(call_frame_t * frame, void * cookie,
a3470f
@@ -1102,9 +1383,9 @@ int32_t ec_truncate_open_cbk(call_frame_t * frame, void * cookie,
a3470f
     fop->parent->good &= fop->good;
a3470f
     if (op_ret >= 0) {
a3470f
         fd_bind (fd);
a3470f
-        err = ec_truncate_write(fop->parent, fop->answer->mask);
a3470f
+        err = ec_update_truncate_write (fop->parent, fop->answer->mask);
a3470f
         if (err != 0) {
a3470f
-            fop->error = -err;
a3470f
+            ec_fop_set_error (fop->parent, -err);
a3470f
         }
a3470f
     }
a3470f
 
a3470f
@@ -1125,7 +1406,7 @@ int32_t ec_truncate_clean(ec_fop_data_t * fop)
a3470f
 
a3470f
         return 0;
a3470f
     } else {
a3470f
-        return ec_truncate_write(fop, fop->answer->mask);
a3470f
+        return ec_update_truncate_write (fop, fop->answer->mask);
a3470f
     }
a3470f
 }
a3470f
 
a3470f
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
a3470f
index 856d60c..09c5fa8 100644
a3470f
--- a/xlators/cluster/ec/src/ec.c
a3470f
+++ b/xlators/cluster/ec/src/ec.c
a3470f
@@ -729,7 +729,8 @@ int32_t ec_gf_create(call_frame_t * frame, xlator_t * this, loc_t * loc,
a3470f
 int32_t ec_gf_discard(call_frame_t * frame, xlator_t * this, fd_t * fd,
a3470f
                       off_t offset, size_t len, dict_t * xdata)
a3470f
 {
a3470f
-    default_discard_failure_cbk(frame, ENOTSUP);
a3470f
+    ec_discard(frame, this, -1, EC_MINIMUM_MIN, default_discard_cbk,
a3470f
+               NULL, fd, offset, len, xdata);
a3470f
 
a3470f
     return 0;
a3470f
 }
a3470f
-- 
a3470f
1.8.3.1
a3470f