|
|
d1681e |
From c3161248afdb42d1bf5e06a32041180cc4be457d Mon Sep 17 00:00:00 2001
|
|
|
d1681e |
From: Xavier Hernandez <jahernan@redhat.com>
|
|
|
d1681e |
Date: Fri, 6 Oct 2017 10:39:58 +0200
|
|
|
d1681e |
Subject: [PATCH 077/128] cluster/ec: add functions for stripe alignment
|
|
|
d1681e |
|
|
|
d1681e |
This patch removes old functions to align offsets and sizes
|
|
|
d1681e |
to stripe size boundaries and adds new ones to offer more
|
|
|
d1681e |
possibilities.
|
|
|
d1681e |
|
|
|
d1681e |
The new functions are:
|
|
|
d1681e |
|
|
|
d1681e |
* ec_adjust_offset_down()
|
|
|
d1681e |
Aligns a given offset to a multiple of the stripe size
|
|
|
d1681e |
equal to or smaller than the initial one. It returns the
|
|
|
d1681e |
size of the gap between the aligned offset and the given
|
|
|
d1681e |
one.
|
|
|
d1681e |
|
|
|
d1681e |
* ec_adjust_offset_up()
|
|
|
d1681e |
Aligns a given offset to a multiple of the stripe size
|
|
|
d1681e |
equal to or greater than the initial one. It returns the
|
|
|
d1681e |
size of the skipped region between the given offset and
|
|
|
d1681e |
the aligned one. If an overflow happens, the returned
|
|
|
d1681e |
value has negative sign (but correct value) and the
|
|
|
d1681e |
offset is set to the maximum value (not aligned).
|
|
|
d1681e |
|
|
|
d1681e |
* ec_adjust_size_down()
|
|
|
d1681e |
Aligns the given size to a multiple of the stripe size
|
|
|
d1681e |
equal to or smaller than the initial one. It returns the
|
|
|
d1681e |
size of the missed region between the aligned size and
|
|
|
d1681e |
the given one.
|
|
|
d1681e |
|
|
|
d1681e |
* ec_adjust_size_up()
|
|
|
d1681e |
Aligns the given size to a multiple of the stripe size
|
|
|
d1681e |
equal to or greater than the initial one. It returns the
|
|
|
d1681e |
size of the gap between the given size and the aligned
|
|
|
d1681e |
one. If an overflow happens, the returned value has
|
|
|
d1681e |
negative sign (but correct value) and the size is set
|
|
|
d1681e |
to the maximum value (not aligned).
|
|
|
d1681e |
|
|
|
d1681e |
These functions have been defined in ec-helpers.h as static
|
|
|
d1681e |
inline since they are very small and compilers can optimize
|
|
|
d1681e |
them (especially the 'scale' argument).
|
|
|
d1681e |
|
|
|
d1681e |
upstream patch: https://review.gluster.org/#/c/18440/
|
|
|
d1681e |
>Change-Id: I4c91009ad02f76c73772034dfde27ee1c78a80d7
|
|
|
d1681e |
>Signed-off-by: Xavier Hernandez <jahernan@redhat.com>
|
|
|
d1681e |
|
|
|
d1681e |
BUG: 1499865
|
|
|
d1681e |
Change-Id: I4c91009ad02f76c73772034dfde27ee1c78a80d7
|
|
|
d1681e |
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
|
|
|
d1681e |
Reviewed-on: https://code.engineering.redhat.com/gerrit/123556
|
|
|
d1681e |
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
|
d1681e |
Reviewed-by: Javier Hernandez Juan <jahernan@redhat.com>
|
|
|
d1681e |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
d1681e |
---
|
|
|
d1681e |
xlators/cluster/ec/src/ec-heal.c | 8 ++-
|
|
|
d1681e |
xlators/cluster/ec/src/ec-helpers.c | 29 ---------
|
|
|
d1681e |
xlators/cluster/ec/src/ec-helpers.h | 108 +++++++++++++++++++++++++++++++-
|
|
|
d1681e |
xlators/cluster/ec/src/ec-inode-read.c | 10 +--
|
|
|
d1681e |
xlators/cluster/ec/src/ec-inode-write.c | 13 ++--
|
|
|
d1681e |
xlators/cluster/ec/src/ec-locks.c | 8 +--
|
|
|
d1681e |
6 files changed, 129 insertions(+), 47 deletions(-)
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
|
|
|
d1681e |
index a6de3ee..bc25015 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-heal.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-heal.c
|
|
|
d1681e |
@@ -1670,7 +1670,8 @@ ec_heal_data_find_direction (ec_t *ec, default_args_cbk_t *replies,
|
|
|
d1681e |
* well*/
|
|
|
d1681e |
|
|
|
d1681e |
if (check_ondisksize) {
|
|
|
d1681e |
- source_size = ec_adjust_size (ec, size[source], 1);
|
|
|
d1681e |
+ source_size = size[source];
|
|
|
d1681e |
+ ec_adjust_size_up (ec, &source_size, _gf_true);
|
|
|
d1681e |
|
|
|
d1681e |
for (i = 0; i < ec->nodes; i++) {
|
|
|
d1681e |
if (sources[i]) {
|
|
|
d1681e |
@@ -1983,7 +1984,7 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
|
|
|
d1681e |
heal->fd = fd_ref (fd);
|
|
|
d1681e |
heal->xl = ec->xl;
|
|
|
d1681e |
heal->data = &barrier;
|
|
|
d1681e |
- size = ec_adjust_size (ec, size, 0);
|
|
|
d1681e |
+ ec_adjust_size_up (ec, &size, _gf_false);
|
|
|
d1681e |
heal->total_size = size;
|
|
|
d1681e |
heal->size = (128 * GF_UNIT_KB * (ec->self_heal_window_size));
|
|
|
d1681e |
/* We need to adjust the size to a multiple of the stripe size of the
|
|
|
d1681e |
@@ -2038,7 +2039,8 @@ __ec_heal_trim_sinks (call_frame_t *frame, ec_t *ec,
|
|
|
d1681e |
ret = 0;
|
|
|
d1681e |
goto out;
|
|
|
d1681e |
}
|
|
|
d1681e |
- trim_offset = ec_adjust_size (ec, size, 1);
|
|
|
d1681e |
+ trim_offset = size;
|
|
|
d1681e |
+ ec_adjust_offset_up (ec, &trim_offset, _gf_true);
|
|
|
d1681e |
ret = cluster_ftruncate (ec->xl_list, trim, ec->nodes, replies, output,
|
|
|
d1681e |
frame, ec->xl, fd, trim_offset, NULL);
|
|
|
d1681e |
for (i = 0; i < ec->nodes; i++) {
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
|
|
|
d1681e |
index 64b010f..0c66948 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-helpers.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-helpers.c
|
|
|
d1681e |
@@ -799,35 +799,6 @@ ec_fd_t * ec_fd_get(fd_t * fd, xlator_t * xl)
|
|
|
d1681e |
return ctx;
|
|
|
d1681e |
}
|
|
|
d1681e |
|
|
|
d1681e |
-uint32_t ec_adjust_offset(ec_t * ec, off_t * offset, int32_t scale)
|
|
|
d1681e |
-{
|
|
|
d1681e |
- off_t head, tmp;
|
|
|
d1681e |
-
|
|
|
d1681e |
- tmp = *offset;
|
|
|
d1681e |
- head = tmp % ec->stripe_size;
|
|
|
d1681e |
- tmp -= head;
|
|
|
d1681e |
- if (scale)
|
|
|
d1681e |
- {
|
|
|
d1681e |
- tmp /= ec->fragments;
|
|
|
d1681e |
- }
|
|
|
d1681e |
-
|
|
|
d1681e |
- *offset = tmp;
|
|
|
d1681e |
-
|
|
|
d1681e |
- return head;
|
|
|
d1681e |
-}
|
|
|
d1681e |
-
|
|
|
d1681e |
-uint64_t ec_adjust_size(ec_t * ec, uint64_t size, int32_t scale)
|
|
|
d1681e |
-{
|
|
|
d1681e |
- size += ec->stripe_size - 1;
|
|
|
d1681e |
- size -= size % ec->stripe_size;
|
|
|
d1681e |
- if (scale)
|
|
|
d1681e |
- {
|
|
|
d1681e |
- size /= ec->fragments;
|
|
|
d1681e |
- }
|
|
|
d1681e |
-
|
|
|
d1681e |
- return size;
|
|
|
d1681e |
-}
|
|
|
d1681e |
-
|
|
|
d1681e |
gf_boolean_t
|
|
|
d1681e |
ec_is_internal_xattr (dict_t *dict, char *key, data_t *value, void *data)
|
|
|
d1681e |
{
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-helpers.h b/xlators/cluster/ec/src/ec-helpers.h
|
|
|
d1681e |
index 4d2145c..cfd7daa 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-helpers.h
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-helpers.h
|
|
|
d1681e |
@@ -55,8 +55,112 @@ ec_inode_t * ec_inode_get(inode_t * inode, xlator_t * xl);
|
|
|
d1681e |
ec_fd_t * __ec_fd_get(fd_t * fd, xlator_t * xl);
|
|
|
d1681e |
ec_fd_t * ec_fd_get(fd_t * fd, xlator_t * xl);
|
|
|
d1681e |
|
|
|
d1681e |
-uint32_t ec_adjust_offset(ec_t * ec, off_t * offset, int32_t scale);
|
|
|
d1681e |
-uint64_t ec_adjust_size(ec_t * ec, uint64_t size, int32_t scale);
|
|
|
d1681e |
+static inline uint32_t
|
|
|
d1681e |
+ec_adjust_size_down(ec_t *ec, uint64_t *value, gf_boolean_t scale)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ uint64_t head, tmp;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ tmp = *value;
|
|
|
d1681e |
+ head = tmp % ec->stripe_size;
|
|
|
d1681e |
+ tmp -= head;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ if (scale) {
|
|
|
d1681e |
+ tmp /= ec->fragments;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+
|
|
|
d1681e |
+ *value = tmp;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ return (uint32_t)head;
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
+/* This function can cause an overflow if the passed value is too near to the
|
|
|
d1681e |
+ * uint64_t limit. If this happens, it returns the tail in negative form and
|
|
|
d1681e |
+ * the value is set to UINT64_MAX. */
|
|
|
d1681e |
+static inline int32_t
|
|
|
d1681e |
+ec_adjust_size_up(ec_t *ec, uint64_t *value, gf_boolean_t scale)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ uint64_t tmp;
|
|
|
d1681e |
+ int32_t tail;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ tmp = *value;
|
|
|
d1681e |
+ /* We first adjust the value down. This never causes overflow. */
|
|
|
d1681e |
+ tail = ec_adjust_size_down(ec, &tmp, scale);
|
|
|
d1681e |
+
|
|
|
d1681e |
+ /* If the value was already aligned, tail will be 0 and nothing else
|
|
|
d1681e |
+ * needs to be done. */
|
|
|
d1681e |
+ if (tail != 0) {
|
|
|
d1681e |
+ /* Otherwise, we need to compute the real tail and adjust the
|
|
|
d1681e |
+ * returned value to the next stripe. */
|
|
|
d1681e |
+ tail = ec->stripe_size - tail;
|
|
|
d1681e |
+ if (scale) {
|
|
|
d1681e |
+ tmp += ec->fragment_size;
|
|
|
d1681e |
+ } else {
|
|
|
d1681e |
+ tmp += ec->stripe_size;
|
|
|
d1681e |
+ /* If no scaling is requested there's a posibility of
|
|
|
d1681e |
+ * overflow. */
|
|
|
d1681e |
+ if (tmp < ec->stripe_size) {
|
|
|
d1681e |
+ tmp = UINT64_MAX;
|
|
|
d1681e |
+ tail = -tail;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+
|
|
|
d1681e |
+ *value = tmp;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ return tail;
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
+/* This function is equivalent to ec_adjust_size_down() but with a potentially
|
|
|
d1681e |
+ * different parameter size (off_t vs uint64_t). */
|
|
|
d1681e |
+static inline uint32_t
|
|
|
d1681e |
+ec_adjust_offset_down(ec_t *ec, off_t *value, gf_boolean_t scale)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ off_t head, tmp;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ tmp = *value;
|
|
|
d1681e |
+ head = tmp % ec->stripe_size;
|
|
|
d1681e |
+ tmp -= head;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ if (scale) {
|
|
|
d1681e |
+ tmp /= ec->fragments;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+
|
|
|
d1681e |
+ *value = tmp;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ return (uint32_t)head;
|
|
|
d1681e |
+}
|
|
|
d1681e |
+
|
|
|
d1681e |
+/* This function is equivalent to ec_adjust_size_up() but with a potentially
|
|
|
d1681e |
+ * different parameter size (off_t vs uint64_t). */
|
|
|
d1681e |
+static inline int32_t
|
|
|
d1681e |
+ec_adjust_offset_up(ec_t *ec, off_t *value, gf_boolean_t scale)
|
|
|
d1681e |
+{
|
|
|
d1681e |
+ uint64_t tail, tmp;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ /* An offset is a signed type that can only have positive values, so
|
|
|
d1681e |
+ * we take advantage of this to avoid overflows. We simply convert it
|
|
|
d1681e |
+ * to an unsigned integer and operate normally. This won't cause an
|
|
|
d1681e |
+ * overflow. Overflow is only checked when converting back to an
|
|
|
d1681e |
+ * off_t. */
|
|
|
d1681e |
+ tmp = *value;
|
|
|
d1681e |
+ tail = ec->stripe_size;
|
|
|
d1681e |
+ tail -= (tmp + tail - 1) % tail + 1;
|
|
|
d1681e |
+ tmp += tail;
|
|
|
d1681e |
+ if (scale) {
|
|
|
d1681e |
+ /* If we are scaling, we'll never get an overflow. */
|
|
|
d1681e |
+ tmp /= ec->fragments;
|
|
|
d1681e |
+ } else {
|
|
|
d1681e |
+ /* Check if there has been an overflow. */
|
|
|
d1681e |
+ if ((off_t)tmp < 0) {
|
|
|
d1681e |
+ tmp = (1ULL << (sizeof(off_t) * 8 - 1)) - 1ULL;
|
|
|
d1681e |
+ tail = -tail;
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+ }
|
|
|
d1681e |
+
|
|
|
d1681e |
+ *value = (off_t)tmp;
|
|
|
d1681e |
+
|
|
|
d1681e |
+ return (int32_t)tail;
|
|
|
d1681e |
+}
|
|
|
d1681e |
|
|
|
d1681e |
static inline int32_t ec_is_power_of_2(uint32_t value)
|
|
|
d1681e |
{
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
|
|
|
d1681e |
index d925e82..829f47f 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-inode-read.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-inode-read.c
|
|
|
d1681e |
@@ -1356,9 +1356,10 @@ int32_t ec_manager_readv(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
{
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
fop->user_size = fop->size;
|
|
|
d1681e |
- fop->head = ec_adjust_offset(fop->xl->private, &fop->offset, 1);
|
|
|
d1681e |
- fop->size = ec_adjust_size(fop->xl->private, fop->size + fop->head,
|
|
|
d1681e |
- 1);
|
|
|
d1681e |
+ fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
|
|
|
d1681e |
+ _gf_true);
|
|
|
d1681e |
+ fop->size += fop->head;
|
|
|
d1681e |
+ ec_adjust_size_up(fop->xl->private, &fop->size, _gf_true);
|
|
|
d1681e |
|
|
|
d1681e |
/* Fall through */
|
|
|
d1681e |
|
|
|
d1681e |
@@ -1561,7 +1562,8 @@ int32_t ec_manager_seek(ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
switch (state) {
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
fop->user_size = fop->offset;
|
|
|
d1681e |
- fop->head = ec_adjust_offset(fop->xl->private, &fop->offset, 1);
|
|
|
d1681e |
+ fop->head = ec_adjust_offset_down(fop->xl->private, &fop->offset,
|
|
|
d1681e |
+ _gf_true);
|
|
|
d1681e |
|
|
|
d1681e |
/* Fall through */
|
|
|
d1681e |
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
|
|
|
d1681e |
index 68bea1a..3ed9b2a 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-inode-write.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-inode-write.c
|
|
|
d1681e |
@@ -870,8 +870,10 @@ int32_t ec_manager_fallocate(ec_fop_data_t *fop, int32_t state)
|
|
|
d1681e |
return EC_STATE_REPORT;
|
|
|
d1681e |
}
|
|
|
d1681e |
fop->user_size = fop->offset + fop->size;
|
|
|
d1681e |
- fop->head = ec_adjust_offset (fop->xl->private, &fop->offset, 1);
|
|
|
d1681e |
- fop->size = ec_adjust_size (fop->xl->private, fop->head + fop->size, 1);
|
|
|
d1681e |
+ fop->head = ec_adjust_offset_down (fop->xl->private, &fop->offset,
|
|
|
d1681e |
+ _gf_true);
|
|
|
d1681e |
+ fop->size += fop->head;
|
|
|
d1681e |
+ ec_adjust_size_up (fop->xl->private, &fop->size, _gf_true);
|
|
|
d1681e |
|
|
|
d1681e |
/* Fall through */
|
|
|
d1681e |
|
|
|
d1681e |
@@ -1145,7 +1147,7 @@ int32_t ec_manager_truncate(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
{
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
fop->user_size = fop->offset;
|
|
|
d1681e |
- fop->offset = ec_adjust_size(fop->xl->private, fop->offset, 1);
|
|
|
d1681e |
+ ec_adjust_offset_up(fop->xl->private, &fop->offset, _gf_true);
|
|
|
d1681e |
|
|
|
d1681e |
/* Fall through */
|
|
|
d1681e |
|
|
|
d1681e |
@@ -1508,8 +1510,9 @@ ec_writev_prepare_buffers(ec_t *ec, ec_fop_data_t *fop)
|
|
|
d1681e |
int32_t err;
|
|
|
d1681e |
|
|
|
d1681e |
fop->user_size = iov_length(fop->vector, fop->int32);
|
|
|
d1681e |
- fop->head = ec_adjust_offset(ec, &fop->offset, 0);
|
|
|
d1681e |
- fop->size = ec_adjust_size(ec, fop->user_size + fop->head, 0);
|
|
|
d1681e |
+ fop->head = ec_adjust_offset_down(ec, &fop->offset, _gf_false);
|
|
|
d1681e |
+ fop->size = fop->user_size + fop->head;
|
|
|
d1681e |
+ ec_adjust_size_up(ec, &fop->size, _gf_false);
|
|
|
d1681e |
|
|
|
d1681e |
if ((fop->int32 != 1) || (fop->head != 0) ||
|
|
|
d1681e |
(fop->size > fop->user_size) ||
|
|
|
d1681e |
diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
|
|
|
d1681e |
index ff09852..996035d 100644
|
|
|
d1681e |
--- a/xlators/cluster/ec/src/ec-locks.c
|
|
|
d1681e |
+++ b/xlators/cluster/ec/src/ec-locks.c
|
|
|
d1681e |
@@ -572,10 +572,10 @@ int32_t ec_manager_inodelk(ec_fop_data_t * fop, int32_t state)
|
|
|
d1681e |
switch (state)
|
|
|
d1681e |
{
|
|
|
d1681e |
case EC_STATE_INIT:
|
|
|
d1681e |
- fop->flock.l_len += ec_adjust_offset(fop->xl->private,
|
|
|
d1681e |
- &fop->flock.l_start, 1);
|
|
|
d1681e |
- fop->flock.l_len = ec_adjust_size(fop->xl->private,
|
|
|
d1681e |
- fop->flock.l_len, 1);
|
|
|
d1681e |
+ fop->flock.l_len += ec_adjust_offset_down(fop->xl->private,
|
|
|
d1681e |
+ &fop->flock.l_start,
|
|
|
d1681e |
+ _gf_true);
|
|
|
d1681e |
+ ec_adjust_offset_up(fop->xl->private, &fop->flock.l_len, _gf_true);
|
|
|
d1681e |
if ((fop->int32 == F_SETLKW) && (fop->flock.l_type != F_UNLCK))
|
|
|
d1681e |
{
|
|
|
d1681e |
fop->uint32 = EC_LOCK_MODE_ALL;
|
|
|
d1681e |
--
|
|
|
d1681e |
1.8.3.1
|
|
|
d1681e |
|