d1681e
From c3161248afdb42d1bf5e06a32041180cc4be457d Mon Sep 17 00:00:00 2001
d1681e
From: Xavier Hernandez <jahernan@redhat.com>
d1681e
Date: Fri, 6 Oct 2017 10:39:58 +0200
d1681e
Subject: [PATCH 077/128] cluster/ec: add functions for stripe alignment
d1681e
d1681e
This patch removes old functions to align offsets and sizes
d1681e
to stripe size boundaries and adds new ones to offer more
d1681e
possibilities.
d1681e
d1681e
The new functions are:
d1681e
d1681e
 * ec_adjust_offset_down()
d1681e
     Aligns a given offset to a multiple of the stripe size
d1681e
     equal or smaller than the initial one. It returns the
d1681e
     size of the gap between the aligned offset and the given
d1681e
     one.
d1681e
d1681e
 * ec_adjust_offset_up()
d1681e
     Aligns a given offset to a multiple of the stripe size
d1681e
     equal or greater than the initial one. It returns the
d1681e
     size of the skipped region between the given offset and
d1681e
     the aligned one. If an overflow happens, the returned
d1681e
     value has negative sign (but correct value) and the
d1681e
     offset is set to the maximum value (not aligned).
d1681e
d1681e
 * ec_adjust_size_down()
d1681e
     Aligns the given size to a multiple of the stripe size
d1681e
     equal or smaller than the initial one. It returns the
d1681e
     size of the missed region between the aligned size and
d1681e
     the given one.
d1681e
d1681e
 * ec_adjust_size_up()
d1681e
     Aligns the given size to a multiple of the stripe size
d1681e
     equal or greater than the initial one. It returns the
d1681e
     size of the gap between the given size and the aligned
d1681e
     one. If an overflow happens, the returned value has
d1681e
     negative sign (but correct value) and the size is set
d1681e
     to the maximum value (not aligned).
d1681e
d1681e
These functions have been defined in ec-helpers.h as static
d1681e
inline since they are very small and compilers can optimize
d1681e
them (especially the 'scale' argument).
d1681e
d1681e
upstream patch: https://review.gluster.org/#/c/18440/
d1681e
>Change-Id: I4c91009ad02f76c73772034dfde27ee1c78a80d7
d1681e
>Signed-off-by: Xavier Hernandez <jahernan@redhat.com>
d1681e
d1681e
BUG: 1499865
d1681e
Change-Id: I4c91009ad02f76c73772034dfde27ee1c78a80d7
d1681e
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/123556
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
Reviewed-by: Javier Hernandez Juan <jahernan@redhat.com>
d1681e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
d1681e
---
d1681e
 xlators/cluster/ec/src/ec-heal.c        |   8 ++-
d1681e
 xlators/cluster/ec/src/ec-helpers.c     |  29 ---------
d1681e
 xlators/cluster/ec/src/ec-helpers.h     | 108 +++++++++++++++++++++++++++++++-
d1681e
 xlators/cluster/ec/src/ec-inode-read.c  |  10 +--
d1681e
 xlators/cluster/ec/src/ec-inode-write.c |  13 ++--
d1681e
 xlators/cluster/ec/src/ec-locks.c       |   8 +--
d1681e
 6 files changed, 129 insertions(+), 47 deletions(-)
d1681e
d1681e
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
d1681e
index a6de3ee..bc25015 100644
d1681e
--- a/xlators/cluster/ec/src/ec-heal.c
d1681e
+++ b/xlators/cluster/ec/src/ec-heal.c
d1681e
@@ -1670,7 +1670,8 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
d1681e
          * well*/
d1681e
 
d1681e
         if (check_ondisksize) {
d1681e
-                source_size = ec_adjust_size (ec, size[source], 1);
d1681e
+                source_size = size[source];
d1681e
+                ec_adjust_size_up (ec, &source_size, _gf_true);
d1681e
 
d1681e
                 for (i = 0; i < ec->nodes; i++) {
d1681e
                         if (sources[i]) {
d1681e
@@ -1983,7 +1984,7 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
d1681e
         heal->fd = fd_ref (fd);
d1681e
         heal->xl = ec->xl;
d1681e
         heal->data = &barrier;
d1681e
-        size = ec_adjust_size (ec, size, 0);
d1681e
+        ec_adjust_size_up (ec, &size, _gf_false);
d1681e
         heal->total_size = size;
d1681e
         heal->size = (128 * GF_UNIT_KB * (ec->self_heal_window_size));
d1681e
         /* We need to adjust the size to a multiple of the stripe size of the
d1681e
@@ -2038,7 +2039,8 @@ __ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec,
d1681e
                 ret = 0;
d1681e
                 goto out;
d1681e
         }
d1681e
-        trim_offset = ec_adjust_size (ec, size, 1);
d1681e
+        trim_offset = size;
d1681e
+        ec_adjust_offset_up (ec, &trim_offset, _gf_true);
d1681e
         ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
d1681e
                                  frame, ec->xl, fd, trim_offset, NULL);
d1681e
         for (i = 0; i < ec->nodes; i++) {
d1681e
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
d1681e
index 64b010f..0c66948 100644
d1681e
--- a/xlators/cluster/ec/src/ec-helpers.c
d1681e
+++ b/xlators/cluster/ec/src/ec-helpers.c
d1681e
@@ -799,35 +799,6 @@ ec_fd_t * ec_fd_get(fd_t * fd, xlator_t * xl)
d1681e
     return ctx;
d1681e
 }
d1681e
 
d1681e
-uint32_t ec_adjust_offset(ec_t * ec, off_t * offset, int32_t scale)
d1681e
-{
d1681e
-    off_t head, tmp;
d1681e
-
d1681e
-    tmp = *offset;
d1681e
-    head = tmp % ec->stripe_size;
d1681e
-    tmp -= head;
d1681e
-    if (scale)
d1681e
-    {
d1681e
-        tmp /= ec->fragments;
d1681e
-    }
d1681e
-
d1681e
-    *offset = tmp;
d1681e
-
d1681e
-    return head;
d1681e
-}
d1681e
-
d1681e
-uint64_t ec_adjust_size(ec_t * ec, uint64_t size, int32_t scale)
d1681e
-{
d1681e
-    size += ec->stripe_size - 1;
d1681e
-    size -= size % ec->stripe_size;
d1681e
-    if (scale)
d1681e
-    {
d1681e
-        size /= ec->fragments;
d1681e
-    }
d1681e
-
d1681e
-    return size;
d1681e
-}
d1681e
-
d1681e
 gf_boolean_t
d1681e
 ec_is_internal_xattr (dict_t *dict, char *key, data_t *value, void *data)
d1681e
 {
d1681e
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
d1681e
index 4d2145c..cfd7daa 100644
d1681e
--- a/xlators/cluster/ec/src/ec-helpers.h
d1681e
+++ b/xlators/cluster/ec/src/ec-helpers.h
d1681e
@@ -55,8 +55,112 @@ ec_inode_t * ec_inode_get(inode_t * inode, xlator_t * xl);
d1681e
 ec_fd_t * __ec_fd_get(fd_t * fd, xlator_t * xl);
d1681e
 ec_fd_t * ec_fd_get(fd_t * fd, xlator_t * xl);
d1681e
 
d1681e
-uint32_t ec_adjust_offset(ec_t * ec, off_t * offset, int32_t scale);
d1681e
-uint64_t ec_adjust_size(ec_t * ec, uint64_t size, int32_t scale);
d1681e
+static inline uint32_t
d1681e
+ec_adjust_size_down(ec_t *ec, uint64_t *value, gf_boolean_t scale)
d1681e
+{
d1681e
+        uint64_t head, tmp;
d1681e
+
d1681e
+        tmp = *value;
d1681e
+        head = tmp % ec->stripe_size;
d1681e
+        tmp -= head;
d1681e
+
d1681e
+        if (scale) {
d1681e
+                tmp /= ec->fragments;
d1681e
+        }
d1681e
+
d1681e
+        *value = tmp;
d1681e
+
d1681e
+        return (uint32_t)head;
d1681e
+}
d1681e
+
d1681e
+/* This function can cause an overflow if the passed value is too near to the
d1681e
+ * uint64_t limit. If this happens, it returns the tail in negative form and
d1681e
+ * the value is set to UINT64_MAX. */
d1681e
+static inline int32_t
d1681e
+ec_adjust_size_up(ec_t *ec, uint64_t *value, gf_boolean_t scale)
d1681e
+{
d1681e
+        uint64_t tmp;
d1681e
+        int32_t tail;
d1681e
+
d1681e
+        tmp = *value;
d1681e
+        /* We first adjust the value down. This never causes overflow. */
d1681e
+        tail = ec_adjust_size_down(ec, &tmp, scale);
d1681e
+
d1681e
+        /* If the value was already aligned, tail will be 0 and nothing else
d1681e
+         * needs to be done. */
d1681e
+        if (tail != 0) {
d1681e
+                /* Otherwise, we need to compute the real tail and adjust the
d1681e
+                 * returned value to the next stripe. */
d1681e
+                tail = ec->stripe_size - tail;
d1681e
+                if (scale) {
d1681e
+                        tmp += ec->fragment_size;
d1681e
+                } else {
d1681e
+                        tmp += ec->stripe_size;
d1681e
+                        /* If no scaling is requested there's a possibility of
d1681e
+                         * overflow. */
d1681e
+                        if (tmp < ec->stripe_size) {
d1681e
+                                tmp = UINT64_MAX;
d1681e
+                                tail = -tail;
d1681e
+                        }
d1681e
+                }
d1681e
+        }
d1681e
+
d1681e
+        *value = tmp;
d1681e
+
d1681e
+        return tail;
d1681e
+}
d1681e
+
d1681e
+/* This function is equivalent to ec_adjust_size_down() but with a potentially
d1681e
+ * different parameter size (off_t vs uint64_t). */
d1681e
+static inline uint32_t
d1681e
+ec_adjust_offset_down(ec_t *ec, off_t *value, gf_boolean_t scale)
d1681e
+{
d1681e
+        off_t head, tmp;
d1681e
+
d1681e
+        tmp = *value;
d1681e
+        head = tmp % ec->stripe_size;
d1681e
+        tmp -= head;
d1681e
+
d1681e
+        if (scale) {
d1681e
+                tmp /= ec->fragments;
d1681e
+        }
d1681e
+
d1681e
+        *value = tmp;
d1681e
+
d1681e
+        return (uint32_t)head;
d1681e
+}
d1681e
+
d1681e
+/* This function is equivalent to ec_adjust_size_up() but with a potentially
d1681e
+ * different parameter size (off_t vs uint64_t). */
d1681e
+static inline int32_t
d1681e
+ec_adjust_offset_up(ec_t *ec, off_t *value, gf_boolean_t scale)
d1681e
+{
d1681e
+        uint64_t tail, tmp;
d1681e
+
d1681e
+        /* An offset is a signed type that can only have positive values, so
d1681e
+         * we take advantage of this to avoid overflows. We simply convert it
d1681e
+         * to an unsigned integer and operate normally. This won't cause an
d1681e
+         * overflow. Overflow is only checked when converting back to an
d1681e
+         * off_t. */
d1681e
+        tmp = *value;
d1681e
+        tail = ec->stripe_size;
d1681e
+        tail -= (tmp + tail - 1) % tail + 1;
d1681e
+        tmp += tail;
d1681e
+        if (scale) {
d1681e
+                /* If we are scaling, we'll never get an overflow. */
d1681e
+                tmp /= ec->fragments;
d1681e
+        } else {
d1681e
+                /* Check if there has been an overflow. */
d1681e
+                if ((off_t)tmp < 0) {
d1681e
+                        tmp = (1ULL << (sizeof(off_t) * 8 - 1)) - 1ULL;
d1681e
+                        tail = -tail;
d1681e
+                }
d1681e
+        }
d1681e
+
d1681e
+        *value = (off_t)tmp;
d1681e
+
d1681e
+        return (int32_t)tail;
d1681e
+}
d1681e
 
d1681e
 static inline int32_t ec_is_power_of_2(uint32_t value)
d1681e
 {
d1681e
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
d1681e
index d925e82..829f47f 100644
d1681e
--- a/xlators/cluster/ec/src/ec-inode-read.c
d1681e
+++ b/xlators/cluster/ec/src/ec-inode-read.c
d1681e
@@ -1356,9 +1356,10 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)
d1681e
     {
d1681e
         case EC_STATE_INIT:
d1681e
             fop->user_size = fop->size;
d1681e
-            fop->head = ec_adjust_offset(fop->xl->private, &fop->offset, 1);
d1681e
-            fop->size = ec_adjust_size(fop->xl->private, fop->size + fop->head,
d1681e
-                                       1);
d1681e
+            fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
d1681e
+                                              _gf_true);
d1681e
+            fop->size += fop->head;
d1681e
+            ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true);
d1681e
 
d1681e
         /* Fall through */
d1681e
 
d1681e
@@ -1561,7 +1562,8 @@ int32_t ec_manager_seek(ec_fop_data_t *fop, int32_t state)
d1681e
     switch (state) {
d1681e
     case EC_STATE_INIT:
d1681e
         fop->user_size = fop->offset;
d1681e
-        fop->head = ec_adjust_offset(fop->xl->private, &fop->offset, 1);
d1681e
+        fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
d1681e
+                                          _gf_true);
d1681e
 
d1681e
     /* Fall through */
d1681e
 
d1681e
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
d1681e
index 68bea1a..3ed9b2a 100644
d1681e
--- a/xlators/cluster/ec/src/ec-inode-write.c
d1681e
+++ b/xlators/cluster/ec/src/ec-inode-write.c
d1681e
@@ -870,8 +870,10 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
d1681e
                 return EC_STATE_REPORT;
d1681e
         }
d1681e
         fop->user_size = fop->offset + fop->size;
d1681e
-        fop->head = ec_adjust_offset (fop->xl->private, &fop->offset, 1);
d1681e
-        fop->size = ec_adjust_size (fop->xl->private, fop->head + fop->size, 1);
d1681e
+        fop->head = ec_adjust_offset_down (fop->xl->private, &fop->offset,
d1681e
+                                           _gf_true);
d1681e
+        fop->size += fop->head;
d1681e
+        ec_adjust_size_up (fop->xl->private, &fop->size, _gf_true);
d1681e
 
d1681e
         /* Fall through */
d1681e
 
d1681e
@@ -1145,7 +1147,7 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
d1681e
     {
d1681e
         case EC_STATE_INIT:
d1681e
             fop->user_size = fop->offset;
d1681e
-            fop->offset = ec_adjust_size(fop->xl->private, fop->offset, 1);
d1681e
+            ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true);
d1681e
 
d1681e
         /* Fall through */
d1681e
 
d1681e
@@ -1508,8 +1510,9 @@ ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop)
d1681e
     int32_t err;
d1681e
 
d1681e
     fop->user_size = iov_length(fop->vector, fop->int32);
d1681e
-    fop->head = ec_adjust_offset(ec, &fop->offset, 0);
d1681e
-    fop->size = ec_adjust_size(ec, fop->user_size + fop->head, 0);
d1681e
+    fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false);
d1681e
+    fop->size = fop->user_size + fop->head;
d1681e
+    ec_adjust_size_up(ec, &fop->size, _gf_false);
d1681e
 
d1681e
     if ((fop->int32 != 1) || (fop->head != 0) ||
d1681e
         (fop->size > fop->user_size) ||
d1681e
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
d1681e
index ff09852..996035d 100644
d1681e
--- a/xlators/cluster/ec/src/ec-locks.c
d1681e
+++ b/xlators/cluster/ec/src/ec-locks.c
d1681e
@@ -572,10 +572,10 @@ int32_t ec_manager_inodelk(ec_fop_data_t * fop, int32_t state)
d1681e
     switch (state)
d1681e
     {
d1681e
         case EC_STATE_INIT:
d1681e
-            fop->flock.l_len += ec_adjust_offset(fop->xl->private,
d1681e
-                                                 &fop->flock.l_start, 1);
d1681e
-            fop->flock.l_len = ec_adjust_size(fop->xl->private,
d1681e
-                                              fop->flock.l_len, 1);
d1681e
+            fop->flock.l_len += ec_adjust_offset_down(fop->xl->private,
d1681e
+                                                      &fop->flock.l_start,
d1681e
+                                                      _gf_true);
d1681e
+            ec_adjust_offset_up(fop->xl->private, &fop->flock.l_len, _gf_true);
d1681e
             if ((fop->int32 == F_SETLKW) && (fop->flock.l_type != F_UNLCK))
d1681e
             {
d1681e
                 fop->uint32 = EC_LOCK_MODE_ALL;
d1681e
-- 
d1681e
1.8.3.1
d1681e