cb8e9e
From 1c94a3d3bd4ac1237d497f03a899ba29590fc37f Mon Sep 17 00:00:00 2001
cb8e9e
From: Pranith Kumar K <pkarampu@redhat.com>
cb8e9e
Date: Mon, 15 Jun 2015 16:32:06 +0530
cb8e9e
Subject: [PATCH 139/190] cluster/ec: wind fops on good subvols for access/readdir[p]
cb8e9e
cb8e9e
        Backport of http://review.gluster.org/11246
cb8e9e
cb8e9e
Change-Id: I1e629a6adc803c4b7164a5a7a81ee5cb1d0e139c
cb8e9e
BUG: 1228525
cb8e9e
Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
cb8e9e
Reviewed-on: https://code.engineering.redhat.com/gerrit/51322
cb8e9e
---
cb8e9e
 tests/basic/ec/ec-readdir.t            |   30 +++++-
cb8e9e
 xlators/cluster/ec/src/ec-combine.c    |   12 +--
cb8e9e
 xlators/cluster/ec/src/ec-common.c     |   28 ++++-
cb8e9e
 xlators/cluster/ec/src/ec-common.h     |    3 +-
cb8e9e
 xlators/cluster/ec/src/ec-data.c       |    2 +
cb8e9e
 xlators/cluster/ec/src/ec-data.h       |    1 +
cb8e9e
 xlators/cluster/ec/src/ec-dir-read.c   |  207 +++++++++++++++++---------------
cb8e9e
 xlators/cluster/ec/src/ec-inode-read.c |   89 +++++++++++----
cb8e9e
 8 files changed, 232 insertions(+), 140 deletions(-)
cb8e9e
cb8e9e
diff --git a/tests/basic/ec/ec-readdir.t b/tests/basic/ec/ec-readdir.t
cb8e9e
index bbfa2dd..fad101b 100644
cb8e9e
--- a/tests/basic/ec/ec-readdir.t
cb8e9e
+++ b/tests/basic/ec/ec-readdir.t
cb8e9e
@@ -9,12 +9,38 @@ cleanup
cb8e9e
 TEST glusterd
cb8e9e
 TEST pidof glusterd
cb8e9e
 TEST $CLI volume create $V0 disperse 3 redundancy 1 $H0:$B0/${V0}{0..5}
cb8e9e
+TEST $CLI volume heal $V0 disable
cb8e9e
 TEST $CLI volume start $V0
cb8e9e
 
cb8e9e
 TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0;
cb8e9e
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
cb8e9e
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 1
cb8e9e
 
cb8e9e
-TEST touch $M0/{1..100}
cb8e9e
-EXPECT "100" echo $(ls $M0/* | wc -l)
cb8e9e
+TEST mkdir $M0/d
cb8e9e
+TEST touch $M0/d/{1..100}
cb8e9e
+EXPECT "100" echo $(ls $M0/d/* | wc -l)
cb8e9e
+TEST kill_brick $V0 $H0 $B0/${V0}0
cb8e9e
+TEST kill_brick $V0 $H0 $B0/${V0}3
cb8e9e
+TEST rm -rf $M0/d/{1..100}
cb8e9e
+TEST $CLI volume start $V0 force
cb8e9e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
cb8e9e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 1
cb8e9e
+#Do it 3 times so that even with all load balancing, readdir never falls
cb8e9e
+#on stale bricks
cb8e9e
+EXPECT "0" echo $(ls $M0/d/ | wc -l)
cb8e9e
+EXPECT "0" echo $(ls $M0/d/ | wc -l)
cb8e9e
+EXPECT "0" echo $(ls $M0/d/ | wc -l)
cb8e9e
+#Do the same test above for creation of entries
cb8e9e
+TEST mkdir $M0/d1
cb8e9e
+TEST kill_brick $V0 $H0 $B0/${V0}0
cb8e9e
+TEST kill_brick $V0 $H0 $B0/${V0}3
cb8e9e
+TEST touch $M0/d1/{1..100}
cb8e9e
+TEST $CLI volume start $V0 force
cb8e9e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 0
cb8e9e
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "3" ec_child_up_count $V0 1
cb8e9e
+#Do it 3 times so that even with all load balancing, readdir never falls
cb8e9e
+#on stale bricks
cb8e9e
+EXPECT "100" echo $(ls $M0/d1/ | wc -l)
cb8e9e
+EXPECT "100" echo $(ls $M0/d1/ | wc -l)
cb8e9e
+EXPECT "100" echo $(ls $M0/d1/ | wc -l)
cb8e9e
 cleanup
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-combine.c b/xlators/cluster/ec/src/ec-combine.c
cb8e9e
index a8cfabc..bf698d9 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-combine.c
cb8e9e
+++ b/xlators/cluster/ec/src/ec-combine.c
cb8e9e
@@ -875,7 +875,7 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine)
cb8e9e
     ec_fop_data_t *fop = newcbk->fop;
cb8e9e
     ec_cbk_data_t *cbk = NULL, *tmp = NULL;
cb8e9e
     struct list_head *item = NULL;
cb8e9e
-    int32_t needed = 0, resume = 0;
cb8e9e
+    int32_t needed = 0;
cb8e9e
     char str[32];
cb8e9e
 
cb8e9e
     LOCK(&fop->lock);
cb8e9e
@@ -912,12 +912,6 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine)
cb8e9e
     ec_trace("ANSWER", fop, "combine=%s[%d]",
cb8e9e
              ec_bin(str, sizeof(str), newcbk->mask, 0), newcbk->count);
cb8e9e
 
cb8e9e
-    if ((newcbk->count == fop->expected) && (fop->answer == NULL)) {
cb8e9e
-        fop->answer = newcbk;
cb8e9e
-
cb8e9e
-        resume = 1;
cb8e9e
-    }
cb8e9e
-
cb8e9e
     cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
cb8e9e
     if ((fop->mask ^ fop->remaining) == fop->received) {
cb8e9e
         needed = fop->minimum - cbk->count;
cb8e9e
@@ -927,9 +921,5 @@ void ec_combine (ec_cbk_data_t *newcbk, ec_combine_f combine)
cb8e9e
 
cb8e9e
     if (needed > 0) {
cb8e9e
         ec_dispatch_next(fop, newcbk->idx);
cb8e9e
-    } else if (resume) {
cb8e9e
-        ec_update_bad(fop, newcbk->mask);
cb8e9e
-
cb8e9e
-        ec_resume(fop, 0);
cb8e9e
     }
cb8e9e
 }
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
cb8e9e
index f69234e..4dbbe28 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-common.c
cb8e9e
+++ b/xlators/cluster/ec/src/ec-common.c
cb8e9e
@@ -186,6 +186,10 @@ void ec_update_bad(ec_fop_data_t * fop, uintptr_t good)
cb8e9e
     ec_t *ec = fop->xl->private;
cb8e9e
     uintptr_t bad;
cb8e9e
 
cb8e9e
+    /*Don't let fops that do dispatch_one() to update bad*/
cb8e9e
+    if (fop->expected == 1)
cb8e9e
+            return;
cb8e9e
+
cb8e9e
     bad = ec->xl_up & ~(fop->remaining | good);
cb8e9e
     fop->bad |= bad;
cb8e9e
     fop->good |= good;
cb8e9e
@@ -314,6 +318,20 @@ void ec_resume_parent(ec_fop_data_t * fop, int32_t error)
cb8e9e
     }
cb8e9e
 }
cb8e9e
 
cb8e9e
+gf_boolean_t
cb8e9e
+ec_is_recoverable_error (int32_t op_errno)
cb8e9e
+{
cb8e9e
+        switch (op_errno) {
cb8e9e
+        case ENOTCONN:
cb8e9e
+        case ESTALE:
cb8e9e
+        case ENOENT:
cb8e9e
+        case EBADFD:/*Opened fd but brick is disconnected*/
cb8e9e
+        case EIO:/*Backend-fs crash like XFS/ext4 etc*/
cb8e9e
+                return _gf_true;
cb8e9e
+        }
cb8e9e
+        return _gf_false;
cb8e9e
+}
cb8e9e
+
cb8e9e
 void ec_complete(ec_fop_data_t * fop)
cb8e9e
 {
cb8e9e
     ec_cbk_data_t * cbk = NULL;
cb8e9e
@@ -329,14 +347,12 @@ void ec_complete(ec_fop_data_t * fop)
cb8e9e
             if (!list_empty(&fop->cbk_list)) {
cb8e9e
                 cbk = list_entry(fop->cbk_list.next, ec_cbk_data_t, list);
cb8e9e
                 healing_count = ec_bits_count (cbk->mask & fop->healing);
cb8e9e
+                    /* fop shouldn't be treated as success if it is not
cb8e9e
+                     * successful on at least fop->minimum good copies*/
cb8e9e
                 if ((cbk->count - healing_count) >= fop->minimum) {
cb8e9e
-                        /* fop shouldn't be treated as success if it is not
cb8e9e
-                         * successful on at least fop->minimum good copies*/
cb8e9e
-                    if ((cbk->op_ret >= 0) || (cbk->op_errno != ENOTCONN)) {
cb8e9e
-                        fop->answer = cbk;
cb8e9e
+                    fop->answer = cbk;
cb8e9e
 
cb8e9e
-                        update = 1;
cb8e9e
-                    }
cb8e9e
+                    update = 1;
cb8e9e
                 }
cb8e9e
             }
cb8e9e
 
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
cb8e9e
index e3f01ca..3334a7b 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-common.h
cb8e9e
+++ b/xlators/cluster/ec/src/ec-common.h
cb8e9e
@@ -75,7 +75,7 @@ typedef enum {
cb8e9e
 #define EC_STATE_HEAL_POST_INODELK_UNLOCK   217
cb8e9e
 #define EC_STATE_HEAL_DISPATCH              218
cb8e9e
 
cb8e9e
-int32_t ec_dispatch_one_retry(ec_fop_data_t *fop, int32_t idx, int32_t op_ret);
cb8e9e
+int32_t ec_dispatch_one_retry (ec_fop_data_t *fop, int32_t idx, int32_t op_ret);
cb8e9e
 int32_t ec_dispatch_next(ec_fop_data_t * fop, int32_t idx);
cb8e9e
 
cb8e9e
 void ec_complete(ec_fop_data_t * fop);
cb8e9e
@@ -111,5 +111,6 @@ void ec_resume(ec_fop_data_t * fop, int32_t error);
cb8e9e
 void ec_resume_parent(ec_fop_data_t * fop, int32_t error);
cb8e9e
 
cb8e9e
 void ec_manager(ec_fop_data_t * fop, int32_t error);
cb8e9e
+gf_boolean_t ec_is_recoverable_error (int32_t op_errno);
cb8e9e
 
cb8e9e
 #endif /* __EC_COMMON_H__ */
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
cb8e9e
index 047ccd5..72f3b0b 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-data.c
cb8e9e
+++ b/xlators/cluster/ec/src/ec-data.c
cb8e9e
@@ -59,6 +59,7 @@ ec_cbk_data_t * ec_cbk_data_allocate(call_frame_t * frame, xlator_t * this,
cb8e9e
     cbk->count = 1;
cb8e9e
     cbk->op_ret = op_ret;
cb8e9e
     cbk->op_errno = op_errno;
cb8e9e
+    INIT_LIST_HEAD (&cbk->entries.list);
cb8e9e
 
cb8e9e
     LOCK(&fop->lock);
cb8e9e
 
cb8e9e
@@ -92,6 +93,7 @@ void ec_cbk_data_destroy(ec_cbk_data_t * cbk)
cb8e9e
         iobref_unref(cbk->buffers);
cb8e9e
     }
cb8e9e
     GF_FREE(cbk->vector);
cb8e9e
+    gf_dirent_free (&cbk->entries);
cb8e9e
 
cb8e9e
     mem_put(cbk);
cb8e9e
 }
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-data.h b/xlators/cluster/ec/src/ec-data.h
cb8e9e
index 8f6d5de..670b3b8 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-data.h
cb8e9e
+++ b/xlators/cluster/ec/src/ec-data.h
cb8e9e
@@ -266,6 +266,7 @@ struct _ec_cbk_data
cb8e9e
     struct iovec *   vector;
cb8e9e
     struct iobref *  buffers;
cb8e9e
     uint64_t         dirty[2];
cb8e9e
+    gf_dirent_t      entries;
cb8e9e
 };
cb8e9e
 
cb8e9e
 struct _ec_heal
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-dir-read.c b/xlators/cluster/ec/src/ec-dir-read.c
cb8e9e
index 7821878..69df4d2 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-dir-read.c
cb8e9e
+++ b/xlators/cluster/ec/src/ec-dir-read.c
cb8e9e
@@ -316,34 +316,36 @@ out:
cb8e9e
 
cb8e9e
 /* FOP: readdir */
cb8e9e
 
cb8e9e
-void ec_adjust_readdir(ec_t * ec, int32_t idx, gf_dirent_t * entries)
cb8e9e
+void ec_adjust_readdirp (ec_t *ec, int32_t idx, gf_dirent_t *entries)
cb8e9e
 {
cb8e9e
     gf_dirent_t * entry;
cb8e9e
 
cb8e9e
     list_for_each_entry(entry, &entries->list, list)
cb8e9e
     {
cb8e9e
+        if (!entry->inode)
cb8e9e
+                continue;
cb8e9e
+
cb8e9e
         if (entry->d_stat.ia_type == IA_IFREG)
cb8e9e
         {
cb8e9e
             if ((entry->dict == NULL) ||
cb8e9e
                 (ec_dict_del_number(entry->dict, EC_XATTR_SIZE,
cb8e9e
-                                    &entry->d_stat.ia_size) != 0))
cb8e9e
-            {
cb8e9e
-                gf_log(ec->xl->name, GF_LOG_WARNING, "Unable to get exact "
cb8e9e
-                                                     "file size.");
cb8e9e
-
cb8e9e
-                entry->d_stat.ia_size *= ec->fragments;
cb8e9e
+                                    &entry->d_stat.ia_size) != 0)) {
cb8e9e
+                    inode_unref (entry->inode);
cb8e9e
+                    entry->inode = NULL;
cb8e9e
+            } else {
cb8e9e
+                ec_iatt_rebuild(ec, &entry->d_stat, 1, 1);
cb8e9e
             }
cb8e9e
-
cb8e9e
-            ec_iatt_rebuild(ec, &entry->d_stat, 1, 1);
cb8e9e
         }
cb8e9e
     }
cb8e9e
 }
cb8e9e
 
cb8e9e
-int32_t ec_readdir_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
cb8e9e
-                       int32_t op_ret, int32_t op_errno, gf_dirent_t * entries,
cb8e9e
-                       dict_t * xdata)
cb8e9e
+int32_t
cb8e9e
+ec_common_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
cb8e9e
+                       int32_t op_ret, int32_t op_errno,
cb8e9e
+                       gf_dirent_t *entries, dict_t *xdata)
cb8e9e
 {
cb8e9e
-    ec_fop_data_t * fop = NULL;
cb8e9e
+    ec_fop_data_t *fop = NULL;
cb8e9e
+    ec_cbk_data_t *cbk = NULL;
cb8e9e
     int32_t idx = (int32_t)(uintptr_t)cookie;
cb8e9e
 
cb8e9e
     VALIDATE_OR_GOTO(this, out);
cb8e9e
@@ -356,18 +358,15 @@ int32_t ec_readdir_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
cb8e9e
     ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
cb8e9e
              frame, op_ret, op_errno);
cb8e9e
 
cb8e9e
-    if (op_ret > 0)
cb8e9e
-    {
cb8e9e
-        ec_adjust_readdir(fop->xl->private, idx, entries);
cb8e9e
-    }
cb8e9e
-
cb8e9e
-    if (!ec_dispatch_one_retry(fop, idx, op_ret))
cb8e9e
-    {
cb8e9e
-        if (fop->cbks.readdir != NULL)
cb8e9e
-        {
cb8e9e
-            fop->cbks.readdir(fop->req_frame, fop, this, op_ret, op_errno,
cb8e9e
-                              entries, xdata);
cb8e9e
-        }
cb8e9e
+    cbk = ec_cbk_data_allocate (frame, this, fop, fop->id,
cb8e9e
+                                idx, op_ret, op_errno);
cb8e9e
+    if (cbk) {
cb8e9e
+        if (xdata)
cb8e9e
+                cbk->xdata = dict_ref (xdata);
cb8e9e
+        if (cbk->op_ret >= 0)
cb8e9e
+                list_splice_init (&entries->list,
cb8e9e
+                                  &cbk->entries.list);
cb8e9e
+        ec_combine (cbk, NULL);
cb8e9e
     }
cb8e9e
 
cb8e9e
 out:
cb8e9e
@@ -383,14 +382,15 @@ void ec_wind_readdir(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
cb8e9e
 {
cb8e9e
     ec_trace("WIND", fop, "idx=%d", idx);
cb8e9e
 
cb8e9e
-    STACK_WIND_COOKIE(fop->frame, ec_readdir_cbk, (void *)(uintptr_t)idx,
cb8e9e
+    STACK_WIND_COOKIE(fop->frame, ec_common_readdir_cbk, (void *)(uintptr_t)idx,
cb8e9e
                       ec->xl_list[idx], ec->xl_list[idx]->fops->readdir,
cb8e9e
                       fop->fd, fop->size, fop->offset, fop->xdata);
cb8e9e
 }
cb8e9e
 
cb8e9e
 int32_t ec_manager_readdir(ec_fop_data_t * fop, int32_t state)
cb8e9e
 {
cb8e9e
-    ec_fd_t *ctx;
cb8e9e
+    ec_fd_t *ctx = NULL;
cb8e9e
+    ec_cbk_data_t *cbk = NULL;
cb8e9e
 
cb8e9e
     switch (state)
cb8e9e
     {
cb8e9e
@@ -404,27 +404,21 @@ int32_t ec_manager_readdir(ec_fop_data_t * fop, int32_t state)
cb8e9e
                 return EC_STATE_REPORT;
cb8e9e
             }
cb8e9e
 
cb8e9e
-            if (fop->xdata == NULL)
cb8e9e
-            {
cb8e9e
-                fop->xdata = dict_new();
cb8e9e
-                if (fop->xdata == NULL)
cb8e9e
-                {
cb8e9e
-                    gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to prepare "
cb8e9e
-                                                        "readdirp request");
cb8e9e
-
cb8e9e
-                    fop->error = EIO;
cb8e9e
+            if (fop->id == GF_FOP_READDIRP) {
cb8e9e
+                    if (fop->xdata == NULL) {
cb8e9e
+                        fop->xdata = dict_new();
cb8e9e
+                        if (fop->xdata == NULL) {
cb8e9e
+                            fop->error = EIO;
cb8e9e
 
cb8e9e
-                    return EC_STATE_REPORT;
cb8e9e
-                }
cb8e9e
-            }
cb8e9e
-            if (dict_set_uint64(fop->xdata, EC_XATTR_SIZE, 0) != 0)
cb8e9e
-            {
cb8e9e
-                gf_log(fop->xl->name, GF_LOG_ERROR, "Unable to prepare "
cb8e9e
-                                                    "readdirp request");
cb8e9e
+                            return EC_STATE_REPORT;
cb8e9e
+                        }
cb8e9e
+                    }
cb8e9e
 
cb8e9e
-                fop->error = EIO;
cb8e9e
+                    if (dict_set_uint64(fop->xdata, EC_XATTR_SIZE, 0)) {
cb8e9e
+                        fop->error = EIO;
cb8e9e
 
cb8e9e
-                return EC_STATE_REPORT;
cb8e9e
+                        return EC_STATE_REPORT;
cb8e9e
+                    }
cb8e9e
             }
cb8e9e
 
cb8e9e
             if (fop->offset != 0)
cb8e9e
@@ -440,37 +434,92 @@ int32_t ec_manager_readdir(ec_fop_data_t * fop, int32_t state)
cb8e9e
                         return EC_STATE_REPORT;
cb8e9e
                 }
cb8e9e
                 fop->mask &= 1ULL << idx;
cb8e9e
+            } else {
cb8e9e
+                    ec_lock_prepare_fd(fop, fop->fd, EC_QUERY_INFO);
cb8e9e
+                    ec_lock(fop);
cb8e9e
             }
cb8e9e
 
cb8e9e
-        /* Fall through */
cb8e9e
+            return EC_STATE_DISPATCH;
cb8e9e
 
cb8e9e
         case EC_STATE_DISPATCH:
cb8e9e
             ec_dispatch_one(fop);
cb8e9e
 
cb8e9e
+            return EC_STATE_PREPARE_ANSWER;
cb8e9e
+
cb8e9e
+        case EC_STATE_PREPARE_ANSWER:
cb8e9e
+            cbk = fop->answer;
cb8e9e
+            if (cbk) {
cb8e9e
+                if ((cbk->op_ret < 0) &&
cb8e9e
+                    ec_is_recoverable_error (cbk->op_errno)) {
cb8e9e
+                    GF_ASSERT (fop->mask & (1ULL<<cbk->idx));
cb8e9e
+                    fop->mask ^= (1ULL << cbk->idx);
cb8e9e
+                    if (fop->mask == 0)
cb8e9e
+                            return EC_STATE_REPORT;
cb8e9e
+                    return EC_STATE_DISPATCH;
cb8e9e
+                }
cb8e9e
+                if ((cbk->op_ret > 0) && (fop->id == GF_FOP_READDIRP)) {
cb8e9e
+                    ec_adjust_readdirp (fop->xl->private, cbk->idx,
cb8e9e
+                                        &cbk->entries);
cb8e9e
+                }
cb8e9e
+            } else {
cb8e9e
+                ec_fop_set_error(fop, EIO);
cb8e9e
+            }
cb8e9e
             return EC_STATE_REPORT;
cb8e9e
 
cb8e9e
+        case EC_STATE_REPORT:
cb8e9e
+            cbk = fop->answer;
cb8e9e
+            GF_ASSERT (cbk);
cb8e9e
+            if (fop->id == GF_FOP_READDIR) {
cb8e9e
+                if (fop->cbks.readdir != NULL) {
cb8e9e
+                    fop->cbks.readdir(fop->req_frame, fop, fop->xl, cbk->op_ret,
cb8e9e
+                                      cbk->op_errno, &cbk->entries, cbk->xdata);
cb8e9e
+                }
cb8e9e
+            } else {
cb8e9e
+                if (fop->cbks.readdirp != NULL) {
cb8e9e
+                    fop->cbks.readdirp(fop->req_frame, fop, fop->xl,
cb8e9e
+                                       cbk->op_ret, cbk->op_errno,
cb8e9e
+                                       &cbk->entries, cbk->xdata);
cb8e9e
+                }
cb8e9e
+            }
cb8e9e
+            if (fop->offset == 0)
cb8e9e
+                    return EC_STATE_LOCK_REUSE;
cb8e9e
+            else
cb8e9e
+                    return EC_STATE_END;
cb8e9e
+
cb8e9e
         case -EC_STATE_INIT:
cb8e9e
-            if (fop->id == GF_FOP_READDIR)
cb8e9e
-            {
cb8e9e
-                if (fop->cbks.readdir != NULL)
cb8e9e
-                {
cb8e9e
+        case -EC_STATE_LOCK:
cb8e9e
+        case -EC_STATE_DISPATCH:
cb8e9e
+        case -EC_STATE_PREPARE_ANSWER:
cb8e9e
+        case -EC_STATE_REPORT:
cb8e9e
+            if (fop->id == GF_FOP_READDIR) {
cb8e9e
+                if (fop->cbks.readdir != NULL) {
cb8e9e
                     fop->cbks.readdir(fop->req_frame, fop, fop->xl, -1,
cb8e9e
                                       fop->error, NULL, NULL);
cb8e9e
                 }
cb8e9e
-            }
cb8e9e
-            else
cb8e9e
-            {
cb8e9e
-                if (fop->cbks.readdirp != NULL)
cb8e9e
-                {
cb8e9e
+            } else {
cb8e9e
+                if (fop->cbks.readdirp != NULL) {
cb8e9e
                     fop->cbks.readdirp(fop->req_frame, fop, fop->xl, -1,
cb8e9e
                                        fop->error, NULL, NULL);
cb8e9e
                 }
cb8e9e
             }
cb8e9e
+            if (fop->offset == 0)
cb8e9e
+                    return EC_STATE_LOCK_REUSE;
cb8e9e
+            else
cb8e9e
+                    return EC_STATE_END;
cb8e9e
 
cb8e9e
-        case EC_STATE_REPORT:
cb8e9e
-        case -EC_STATE_REPORT:
cb8e9e
-            return EC_STATE_END;
cb8e9e
+        case -EC_STATE_LOCK_REUSE:
cb8e9e
+        case EC_STATE_LOCK_REUSE:
cb8e9e
+            GF_ASSERT (fop->offset == 0);
cb8e9e
+            ec_lock_reuse(fop);
cb8e9e
+
cb8e9e
+            return EC_STATE_UNLOCK;
cb8e9e
 
cb8e9e
+        case -EC_STATE_UNLOCK:
cb8e9e
+        case EC_STATE_UNLOCK:
cb8e9e
+            GF_ASSERT (fop->offset == 0);
cb8e9e
+            ec_unlock(fop);
cb8e9e
+
cb8e9e
+            return EC_STATE_END;
cb8e9e
         default:
cb8e9e
             gf_log(fop->xl->name, GF_LOG_ERROR, "Unhandled state %d for %s",
cb8e9e
                    state, ec_fop_name(fop->id));
cb8e9e
@@ -544,51 +593,11 @@ out:
cb8e9e
 
cb8e9e
 /* FOP: readdirp */
cb8e9e
 
cb8e9e
-int32_t ec_readdirp_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
cb8e9e
-                        int32_t op_ret, int32_t op_errno,
cb8e9e
-                        gf_dirent_t * entries, dict_t * xdata)
cb8e9e
-{
cb8e9e
-    ec_fop_data_t * fop = NULL;
cb8e9e
-    int32_t idx = (int32_t)(uintptr_t)cookie;
cb8e9e
-
cb8e9e
-    VALIDATE_OR_GOTO(this, out);
cb8e9e
-    GF_VALIDATE_OR_GOTO(this->name, frame, out);
cb8e9e
-    GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
cb8e9e
-    GF_VALIDATE_OR_GOTO(this->name, this->private, out);
cb8e9e
-
cb8e9e
-    fop = frame->local;
cb8e9e
-
cb8e9e
-    ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
cb8e9e
-             frame, op_ret, op_errno);
cb8e9e
-
cb8e9e
-    if (op_ret > 0)
cb8e9e
-    {
cb8e9e
-        ec_adjust_readdir(fop->xl->private, idx, entries);
cb8e9e
-    }
cb8e9e
-
cb8e9e
-    if (!ec_dispatch_one_retry(fop, idx, op_ret))
cb8e9e
-    {
cb8e9e
-        if (fop->cbks.readdirp != NULL)
cb8e9e
-        {
cb8e9e
-            fop->cbks.readdirp(fop->req_frame, fop, this, op_ret, op_errno,
cb8e9e
-                               entries, xdata);
cb8e9e
-        }
cb8e9e
-    }
cb8e9e
-
cb8e9e
-out:
cb8e9e
-    if (fop != NULL)
cb8e9e
-    {
cb8e9e
-        ec_complete(fop);
cb8e9e
-    }
cb8e9e
-
cb8e9e
-    return 0;
cb8e9e
-}
cb8e9e
-
cb8e9e
 void ec_wind_readdirp(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
cb8e9e
 {
cb8e9e
     ec_trace("WIND", fop, "idx=%d", idx);
cb8e9e
 
cb8e9e
-    STACK_WIND_COOKIE(fop->frame, ec_readdirp_cbk, (void *)(uintptr_t)idx,
cb8e9e
+    STACK_WIND_COOKIE(fop->frame, ec_common_readdir_cbk, (void *)(uintptr_t)idx,
cb8e9e
                       ec->xl_list[idx], ec->xl_list[idx]->fops->readdirp,
cb8e9e
                       fop->fd, fop->size, fop->offset, fop->xdata);
cb8e9e
 }
cb8e9e
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
cb8e9e
index ef2170f..1f91391 100644
cb8e9e
--- a/xlators/cluster/ec/src/ec-inode-read.c
cb8e9e
+++ b/xlators/cluster/ec/src/ec-inode-read.c
cb8e9e
@@ -22,7 +22,8 @@
cb8e9e
 int32_t ec_access_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
cb8e9e
                       int32_t op_ret, int32_t op_errno, dict_t * xdata)
cb8e9e
 {
cb8e9e
-    ec_fop_data_t * fop = NULL;
cb8e9e
+    ec_fop_data_t *fop = NULL;
cb8e9e
+    ec_cbk_data_t *cbk = NULL;
cb8e9e
     int32_t idx = (int32_t)(uintptr_t)cookie;
cb8e9e
 
cb8e9e
     VALIDATE_OR_GOTO(this, out);
cb8e9e
@@ -35,19 +36,18 @@ int32_t ec_access_cbk(call_frame_t * frame, void * cookie, xlator_t * this,
cb8e9e
     ec_trace("CBK", fop, "idx=%d, frame=%p, op_ret=%d, op_errno=%d", idx,
cb8e9e
              frame, op_ret, op_errno);
cb8e9e
 
cb8e9e
-    if (!ec_dispatch_one_retry(fop, idx, op_ret))
cb8e9e
-    {
cb8e9e
-        if (fop->cbks.access != NULL)
cb8e9e
-        {
cb8e9e
-            fop->cbks.access(fop->req_frame, fop, this, op_ret, op_errno,
cb8e9e
-                             xdata);
cb8e9e
-        }
cb8e9e
+    cbk = ec_cbk_data_allocate (frame, this, fop, GF_FOP_ACCESS,
cb8e9e
+                                idx, op_ret, op_errno);
cb8e9e
+    if (cbk) {
cb8e9e
+        if (xdata)
cb8e9e
+               cbk->xdata = dict_ref (xdata);
cb8e9e
+        ec_combine (cbk, NULL);
cb8e9e
     }
cb8e9e
 
cb8e9e
 out:
cb8e9e
     if (fop != NULL)
cb8e9e
     {
cb8e9e
-        ec_complete(fop);
cb8e9e
+        ec_complete (fop);
cb8e9e
     }
cb8e9e
 
cb8e9e
     return 0;
cb8e9e
@@ -62,25 +62,72 @@ void ec_wind_access(ec_t * ec, ec_fop_data_t * fop, int32_t idx)
cb8e9e
                       &fop->loc[0], fop->int32, fop->xdata);
cb8e9e
 }
cb8e9e
 
cb8e9e
-int32_t ec_manager_access(ec_fop_data_t * fop, int32_t state)
cb8e9e
+int32_t
cb8e9e
+ec_manager_access(ec_fop_data_t *fop, int32_t state)
cb8e9e
 {
cb8e9e
-    switch (state)
cb8e9e
-    {
cb8e9e
+        ec_cbk_data_t *cbk = NULL;
cb8e9e
+
cb8e9e
+        switch (state) {
cb8e9e
         case EC_STATE_INIT:
cb8e9e
+        case EC_STATE_LOCK:
cb8e9e
+            ec_lock_prepare_inode (fop, &fop->loc[0], EC_QUERY_INFO);
cb8e9e
+            ec_lock (fop);
cb8e9e
+
cb8e9e
+            return EC_STATE_DISPATCH;
cb8e9e
+
cb8e9e
         case EC_STATE_DISPATCH:
cb8e9e
-            ec_dispatch_one(fop);
cb8e9e
+            ec_dispatch_one (fop);
cb8e9e
 
cb8e9e
+            return EC_STATE_PREPARE_ANSWER;
cb8e9e
+
cb8e9e
+        case EC_STATE_PREPARE_ANSWER:
cb8e9e
+            cbk = fop->answer;
cb8e9e
+            if (cbk) {
cb8e9e
+                if ((cbk->op_ret < 0) && ec_is_recoverable_error (cbk->op_errno)) {
cb8e9e
+                    GF_ASSERT (fop->mask & (1ULL<<cbk->idx));
cb8e9e
+                    fop->mask ^= (1ULL << cbk->idx);
cb8e9e
+                    if (fop->mask == 0)
cb8e9e
+                            return EC_STATE_REPORT;
cb8e9e
+                    return EC_STATE_DISPATCH;
cb8e9e
+                }
cb8e9e
+            } else {
cb8e9e
+                ec_fop_set_error(fop, EIO);
cb8e9e
+            }
cb8e9e
             return EC_STATE_REPORT;
cb8e9e
 
cb8e9e
-        case -EC_STATE_INIT:
cb8e9e
-            if (fop->cbks.access != NULL)
cb8e9e
-            {
cb8e9e
-                fop->cbks.access(fop->req_frame, fop, fop->xl, -1, fop->error,
cb8e9e
-                                 NULL);
cb8e9e
+        case EC_STATE_REPORT:
cb8e9e
+            cbk = fop->answer;
cb8e9e
+            GF_ASSERT (cbk);
cb8e9e
+            if (fop->cbks.access != NULL) {
cb8e9e
+                if (cbk) {
cb8e9e
+                    fop->cbks.access(fop->req_frame, fop, fop->xl,
cb8e9e
+                                     cbk->op_ret, cbk->op_errno,
cb8e9e
+                                     cbk->xdata);
cb8e9e
+                }
cb8e9e
             }
cb8e9e
+            return EC_STATE_LOCK_REUSE;
cb8e9e
 
cb8e9e
+        case -EC_STATE_INIT:
cb8e9e
+        case -EC_STATE_LOCK:
cb8e9e
+        case -EC_STATE_DISPATCH:
cb8e9e
+        case -EC_STATE_PREPARE_ANSWER:
cb8e9e
         case -EC_STATE_REPORT:
cb8e9e
-        case EC_STATE_REPORT:
cb8e9e
+            if (fop->cbks.access != NULL) {
cb8e9e
+                fop->cbks.access(fop->req_frame, fop, fop->xl, -1,
cb8e9e
+                                 fop->error, NULL);
cb8e9e
+            }
cb8e9e
+            return -EC_STATE_LOCK_REUSE;
cb8e9e
+
cb8e9e
+        case -EC_STATE_LOCK_REUSE:
cb8e9e
+        case EC_STATE_LOCK_REUSE:
cb8e9e
+            ec_lock_reuse(fop);
cb8e9e
+
cb8e9e
+            return EC_STATE_UNLOCK;
cb8e9e
+
cb8e9e
+        case -EC_STATE_UNLOCK:
cb8e9e
+        case EC_STATE_UNLOCK:
cb8e9e
+            ec_unlock(fop);
cb8e9e
+
cb8e9e
             return EC_STATE_END;
cb8e9e
 
cb8e9e
         default:
cb8e9e
@@ -88,7 +135,7 @@ int32_t ec_manager_access(ec_fop_data_t * fop, int32_t state)
cb8e9e
                    state, ec_fop_name(fop->id));
cb8e9e
 
cb8e9e
             return EC_STATE_END;
cb8e9e
-    }
cb8e9e
+        }
cb8e9e
 }
cb8e9e
 
cb8e9e
 void ec_access(call_frame_t * frame, xlator_t * this, uintptr_t target,
cb8e9e
-- 
cb8e9e
1.7.1
cb8e9e