a3470f
From c3161248afdb42d1bf5e06a32041180cc4be457d Mon Sep 17 00:00:00 2001
a3470f
From: Xavier Hernandez <jahernan@redhat.com>
a3470f
Date: Fri, 6 Oct 2017 10:39:58 +0200
a3470f
Subject: [PATCH 077/128] cluster/ec: add functions for stripe alignment
a3470f
a3470f
This patch removes old functions to align offsets and sizes
a3470f
to stripe size boundaries and adds new ones to offer more
a3470f
possibilities.
a3470f
a3470f
The new functions are:
a3470f
a3470f
 * ec_adjust_offset_down()
a3470f
     Aligns a given offset to a multiple of the stripe size
a3470f
     equal or smaller than the initial one. It returns the
a3470f
     size of the gap between the aligned offset and the given
a3470f
     one.
a3470f
a3470f
 * ec_adjust_offset_up()
a3470f
     Aligns a given offset to a multiple of the stripe size
a3470f
     equal or greater than the initial one. It returns the
a3470f
     size of the skipped region between the given offset and
a3470f
     the aligned one. If an overflow happens, the returned
a3470f
     value has negative sign (but correct value) and the
a3470f
     offset is set to the maximum value (not aligned).
a3470f
a3470f
 * ec_adjust_size_down()
a3470f
     Aligns the given size to a multiple of the stripe size
a3470f
     equal or smaller than the initial one. It returns the
a3470f
     size of the missed region between the aligned size and
a3470f
     the given one.
a3470f
a3470f
 * ec_adjust_size_up()
a3470f
     Aligns the given size to a multiple of the stripe size
a3470f
     equal or greater than the initial one. It returns the
a3470f
     size of the gap between the given size and the aligned
a3470f
     one. If an overflow happens, the returned value has
a3470f
     negative sign (but correct value) and the size is set
a3470f
     to the maximum value (not aligned).
a3470f
a3470f
These functions have been defined in ec-helpers.h as static
a3470f
inline since they are very small and compilers can optimize
a3470f
them (especially the 'scale' argument).
a3470f
a3470f
upstream patch: https://review.gluster.org/#/c/18440/
a3470f
>Change-Id: I4c91009ad02f76c73772034dfde27ee1c78a80d7
a3470f
>Signed-off-by: Xavier Hernandez <jahernan@redhat.com>
a3470f
a3470f
BUG: 1499865
a3470f
Change-Id: I4c91009ad02f76c73772034dfde27ee1c78a80d7
a3470f
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
a3470f
Reviewed-on: https://code.engineering.redhat.com/gerrit/123556
a3470f
Tested-by: RHGS Build Bot <nigelb@redhat.com>
a3470f
Reviewed-by: Javier Hernandez Juan <jahernan@redhat.com>
a3470f
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
a3470f
---
a3470f
 xlators/cluster/ec/src/ec-heal.c        |   8 ++-
a3470f
 xlators/cluster/ec/src/ec-helpers.c     |  29 ---------
a3470f
 xlators/cluster/ec/src/ec-helpers.h     | 108 +++++++++++++++++++++++++++++++-
a3470f
 xlators/cluster/ec/src/ec-inode-read.c  |  10 +--
a3470f
 xlators/cluster/ec/src/ec-inode-write.c |  13 ++--
a3470f
 xlators/cluster/ec/src/ec-locks.c       |   8 +--
a3470f
 6 files changed, 129 insertions(+), 47 deletions(-)
a3470f
a3470f
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
a3470f
index a6de3ee..bc25015 100644
a3470f
--- a/xlators/cluster/ec/src/ec-heal.c
a3470f
+++ b/xlators/cluster/ec/src/ec-heal.c
a3470f
@@ -1670,7 +1670,8 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
a3470f
          * well*/
a3470f
 
a3470f
         if (check_ondisksize) {
a3470f
-                source_size = ec_adjust_size (ec, size[source], 1);
a3470f
+                source_size = size[source];
a3470f
+                ec_adjust_size_up (ec, &source_size, _gf_true);
a3470f
 
a3470f
                 for (i = 0; i < ec->nodes; i++) {
a3470f
                         if (sources[i]) {
a3470f
@@ -1983,7 +1984,7 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
a3470f
         heal->fd = fd_ref (fd);
a3470f
         heal->xl = ec->xl;
a3470f
         heal->data = &barrier;
a3470f
-        size = ec_adjust_size (ec, size, 0);
a3470f
+        ec_adjust_size_up (ec, &size, _gf_false);
a3470f
         heal->total_size = size;
a3470f
         heal->size = (128 * GF_UNIT_KB * (ec->self_heal_window_size));
a3470f
         /* We need to adjust the size to a multiple of the stripe size of the
a3470f
@@ -2038,7 +2039,8 @@ __ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec,
a3470f
                 ret = 0;
a3470f
                 goto out;
a3470f
         }
a3470f
-        trim_offset = ec_adjust_size (ec, size, 1);
a3470f
+        trim_offset = size;
a3470f
+        ec_adjust_offset_up (ec, &trim_offset, _gf_true);
a3470f
         ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
a3470f
                                  frame, ec->xl, fd, trim_offset, NULL);
a3470f
         for (i = 0; i < ec->nodes; i++) {
a3470f
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
a3470f
index 64b010f..0c66948 100644
a3470f
--- a/xlators/cluster/ec/src/ec-helpers.c
a3470f
+++ b/xlators/cluster/ec/src/ec-helpers.c
a3470f
@@ -799,35 +799,6 @@ ec_fd_t * ec_fd_get(fd_t * fd, xlator_t * xl)
a3470f
     return ctx;
a3470f
 }
a3470f
 
a3470f
-uint32_t ec_adjust_offset(ec_t * ec, off_t * offset, int32_t scale)
a3470f
-{
a3470f
-    off_t head, tmp;
a3470f
-
a3470f
-    tmp = *offset;
a3470f
-    head = tmp % ec->stripe_size;
a3470f
-    tmp -= head;
a3470f
-    if (scale)
a3470f
-    {
a3470f
-        tmp /= ec->fragments;
a3470f
-    }
a3470f
-
a3470f
-    *offset = tmp;
a3470f
-
a3470f
-    return head;
a3470f
-}
a3470f
-
a3470f
-uint64_t ec_adjust_size(ec_t * ec, uint64_t size, int32_t scale)
a3470f
-{
a3470f
-    size += ec->stripe_size - 1;
a3470f
-    size -= size % ec->stripe_size;
a3470f
-    if (scale)
a3470f
-    {
a3470f
-        size /= ec->fragments;
a3470f
-    }
a3470f
-
a3470f
-    return size;
a3470f
-}
a3470f
-
a3470f
 gf_boolean_t
a3470f
 ec_is_internal_xattr (dict_t *dict, char *key, data_t *value, void *data)
a3470f
 {
a3470f
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
a3470f
index 4d2145c..cfd7daa 100644
a3470f
--- a/xlators/cluster/ec/src/ec-helpers.h
a3470f
+++ b/xlators/cluster/ec/src/ec-helpers.h
a3470f
@@ -55,8 +55,112 @@ ec_inode_t * ec_inode_get(inode_t * inode, xlator_t * xl);
a3470f
 ec_fd_t * __ec_fd_get(fd_t * fd, xlator_t * xl);
a3470f
 ec_fd_t * ec_fd_get(fd_t * fd, xlator_t * xl);
a3470f
 
a3470f
-uint32_t ec_adjust_offset(ec_t * ec, off_t * offset, int32_t scale);
a3470f
-uint64_t ec_adjust_size(ec_t * ec, uint64_t size, int32_t scale);
a3470f
+static inline uint32_t
a3470f
+ec_adjust_size_down(ec_t *ec, uint64_t *value, gf_boolean_t scale)
a3470f
+{
a3470f
+        uint64_t head, tmp;
a3470f
+
a3470f
+        tmp = *value;
a3470f
+        head = tmp % ec->stripe_size;
a3470f
+        tmp -= head;
a3470f
+
a3470f
+        if (scale) {
a3470f
+                tmp /= ec->fragments;
a3470f
+        }
a3470f
+
a3470f
+        *value = tmp;
a3470f
+
a3470f
+        return (uint32_t)head;
a3470f
+}
a3470f
+
a3470f
+/* This function can cause an overflow if the passed value is too near to the
a3470f
+ * uint64_t limit. If this happens, it returns the tail in negative form and
a3470f
+ * the value is set to UINT64_MAX. */
a3470f
+static inline int32_t
a3470f
+ec_adjust_size_up(ec_t *ec, uint64_t *value, gf_boolean_t scale)
a3470f
+{
a3470f
+        uint64_t tmp;
a3470f
+        int32_t tail;
a3470f
+
a3470f
+        tmp = *value;
a3470f
+        /* We first adjust the value down. This never causes overflow. */
a3470f
+        tail = ec_adjust_size_down(ec, &tmp, scale);
a3470f
+
a3470f
+        /* If the value was already aligned, tail will be 0 and nothing else
a3470f
+         * needs to be done. */
a3470f
+        if (tail != 0) {
a3470f
+                /* Otherwise, we need to compute the real tail and adjust the
a3470f
+                 * returned value to the next stripe. */
a3470f
+                tail = ec->stripe_size - tail;
a3470f
+                if (scale) {
a3470f
+                        tmp += ec->fragment_size;
a3470f
+                } else {
a3470f
+                        tmp += ec->stripe_size;
a3470f
+                        /* If no scaling is requested there's a possibility of
a3470f
+                         * overflow. */
a3470f
+                        if (tmp < ec->stripe_size) {
a3470f
+                                tmp = UINT64_MAX;
a3470f
+                                tail = -tail;
a3470f
+                        }
a3470f
+                }
a3470f
+        }
a3470f
+
a3470f
+        *value = tmp;
a3470f
+
a3470f
+        return tail;
a3470f
+}
a3470f
+
a3470f
+/* This function is equivalent to ec_adjust_size_down() but with a potentially
a3470f
+ * different parameter size (off_t vs uint64_t). */
a3470f
+static inline uint32_t
a3470f
+ec_adjust_offset_down(ec_t *ec, off_t *value, gf_boolean_t scale)
a3470f
+{
a3470f
+        off_t head, tmp;
a3470f
+
a3470f
+        tmp = *value;
a3470f
+        head = tmp % ec->stripe_size;
a3470f
+        tmp -= head;
a3470f
+
a3470f
+        if (scale) {
a3470f
+                tmp /= ec->fragments;
a3470f
+        }
a3470f
+
a3470f
+        *value = tmp;
a3470f
+
a3470f
+        return (uint32_t)head;
a3470f
+}
a3470f
+
a3470f
+/* This function is equivalent to ec_adjust_size_up() but with a potentially
a3470f
+ * different parameter size (off_t vs uint64_t). */
a3470f
+static inline int32_t
a3470f
+ec_adjust_offset_up(ec_t *ec, off_t *value, gf_boolean_t scale)
a3470f
+{
a3470f
+        uint64_t tail, tmp;
a3470f
+
a3470f
+        /* An offset is a signed type that can only have positive values, so
a3470f
+         * we take advantage of this to avoid overflows. We simply convert it
a3470f
+         * to an unsigned integer and operate normally. This won't cause an
a3470f
+         * overflow. Overflow is only checked when converting back to an
a3470f
+         * off_t. */
a3470f
+        tmp = *value;
a3470f
+        tail = ec->stripe_size;
a3470f
+        tail -= (tmp + tail - 1) % tail + 1;
a3470f
+        tmp += tail;
a3470f
+        if (scale) {
a3470f
+                /* If we are scaling, we'll never get an overflow. */
a3470f
+                tmp /= ec->fragments;
a3470f
+        } else {
a3470f
+                /* Check if there has been an overflow. */
a3470f
+                if ((off_t)tmp < 0) {
a3470f
+                        tmp = (1ULL << (sizeof(off_t) * 8 - 1)) - 1ULL;
a3470f
+                        tail = -tail;
a3470f
+                }
a3470f
+        }
a3470f
+
a3470f
+        *value = (off_t)tmp;
a3470f
+
a3470f
+        return (int32_t)tail;
a3470f
+}
a3470f
 
a3470f
 static inline int32_t ec_is_power_of_2(uint32_t value)
a3470f
 {
a3470f
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
a3470f
index d925e82..829f47f 100644
a3470f
--- a/xlators/cluster/ec/src/ec-inode-read.c
a3470f
+++ b/xlators/cluster/ec/src/ec-inode-read.c
a3470f
@@ -1356,9 +1356,10 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)
a3470f
     {
a3470f
         case EC_STATE_INIT:
a3470f
             fop->user_size = fop->size;
a3470f
-            fop->head = ec_adjust_offset(fop->xl->private, &fop->offset, 1);
a3470f
-            fop->size = ec_adjust_size(fop->xl->private, fop->size + fop->head,
a3470f
-                                       1);
a3470f
+            fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
a3470f
+                                              _gf_true);
a3470f
+            fop->size += fop->head;
a3470f
+            ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true);
a3470f
 
a3470f
         /* Fall through */
a3470f
 
a3470f
@@ -1561,7 +1562,8 @@ int32_t ec_manager_seek(ec_fop_data_t *fop, int32_t state)
a3470f
     switch (state) {
a3470f
     case EC_STATE_INIT:
a3470f
         fop->user_size = fop->offset;
a3470f
-        fop->head = ec_adjust_offset(fop->xl->private, &fop->offset, 1);
a3470f
+        fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
a3470f
+                                          _gf_true);
a3470f
 
a3470f
     /* Fall through */
a3470f
 
a3470f
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
a3470f
index 68bea1a..3ed9b2a 100644
a3470f
--- a/xlators/cluster/ec/src/ec-inode-write.c
a3470f
+++ b/xlators/cluster/ec/src/ec-inode-write.c
a3470f
@@ -870,8 +870,10 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
a3470f
                 return EC_STATE_REPORT;
a3470f
         }
a3470f
         fop->user_size = fop->offset + fop->size;
a3470f
-        fop->head = ec_adjust_offset (fop->xl->private, &fop->offset, 1);
a3470f
-        fop->size = ec_adjust_size (fop->xl->private, fop->head + fop->size, 1);
a3470f
+        fop->head = ec_adjust_offset_down (fop->xl->private, &fop->offset,
a3470f
+                                           _gf_true);
a3470f
+        fop->size += fop->head;
a3470f
+        ec_adjust_size_up (fop->xl->private, &fop->size, _gf_true);
a3470f
 
a3470f
         /* Fall through */
a3470f
 
a3470f
@@ -1145,7 +1147,7 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
a3470f
     {
a3470f
         case EC_STATE_INIT:
a3470f
             fop->user_size = fop->offset;
a3470f
-            fop->offset = ec_adjust_size(fop->xl->private, fop->offset, 1);
a3470f
+            ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true);
a3470f
 
a3470f
         /* Fall through */
a3470f
 
a3470f
@@ -1508,8 +1510,9 @@ ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop)
a3470f
     int32_t err;
a3470f
 
a3470f
     fop->user_size = iov_length(fop->vector, fop->int32);
a3470f
-    fop->head = ec_adjust_offset(ec, &fop->offset, 0);
a3470f
-    fop->size = ec_adjust_size(ec, fop->user_size + fop->head, 0);
a3470f
+    fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false);
a3470f
+    fop->size = fop->user_size + fop->head;
a3470f
+    ec_adjust_size_up(ec, &fop->size, _gf_false);
a3470f
 
a3470f
     if ((fop->int32 != 1) || (fop->head != 0) ||
a3470f
         (fop->size > fop->user_size) ||
a3470f
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
a3470f
index ff09852..996035d 100644
a3470f
--- a/xlators/cluster/ec/src/ec-locks.c
a3470f
+++ b/xlators/cluster/ec/src/ec-locks.c
a3470f
@@ -572,10 +572,10 @@ int32_t ec_manager_inodelk(ec_fop_data_t * fop, int32_t state)
a3470f
     switch (state)
a3470f
     {
a3470f
         case EC_STATE_INIT:
a3470f
-            fop->flock.l_len += ec_adjust_offset(fop->xl->private,
a3470f
-                                                 &fop->flock.l_start, 1);
a3470f
-            fop->flock.l_len = ec_adjust_size(fop->xl->private,
a3470f
-                                              fop->flock.l_len, 1);
a3470f
+            fop->flock.l_len += ec_adjust_offset_down(fop->xl->private,
a3470f
+                                                      &fop->flock.l_start,
a3470f
+                                                      _gf_true);
a3470f
+            ec_adjust_offset_up(fop->xl->private, &fop->flock.l_len, _gf_true);
a3470f
             if ((fop->int32 == F_SETLKW) && (fop->flock.l_type != F_UNLCK))
a3470f
             {
a3470f
                 fop->uint32 = EC_LOCK_MODE_ALL;
a3470f
-- 
a3470f
1.8.3.1
a3470f