21ab4e
From a587b4c7e84c18257bc3ba724bb9aae81bb1f3b9 Mon Sep 17 00:00:00 2001
21ab4e
From: Susant Palai <spalai@redhat.com>
21ab4e
Date: Tue, 25 Apr 2017 18:32:45 +0530
21ab4e
Subject: [PATCH 477/486] cluster/dht: fix on demand migration files from
21ab4e
 client
21ab4e
21ab4e
    On demand migration of files i.e. migration done by clients
21ab4e
    triggered by a setfattr was broken.
21ab4e
21ab4e
    Dependency on defrag led to crash when migration was triggered from
21ab4e
    client.
21ab4e
21ab4e
    Note: This functionality is not available for tiered volumes. Migration
21ab4e
    from tier served client will fail with ENOTSUP.
21ab4e
21ab4e
    usage (But refer to the steps mentioned below to avoid any issues) :
21ab4e
    setfattr -n "trusted.distribute.migrate-data" -v "1" <filename>
21ab4e
21ab4e
    The purpose of fixing the on-demand client migration was to give a
21ab4e
    workaround where the user has lots of empty directories compared to
21ab4e
    files and wants to do a remove-brick process.
21ab4e
21ab4e
    Here are the steps to trigger file migration for remove-brick process from
21ab4e
    client. (It is highly recommended to follow the steps below as is.)
21ab4e
21ab4e
    Let's say it is a replica volume and the user wants to remove a replica pair
21ab4e
    named brick1 and brick2. (Make sure healing is completed before you run
21ab4e
    these steps)
21ab4e
21ab4e
    Step-1: Start remove-brick process
21ab4e
     - gluster v remove-brick <volname> brick1 brick2 start
21ab4e
    Step-2: Kill the rebalance daemon
21ab4e
     - ps aux | grep glusterfs | grep rebalance\/ | awk '{print $2}' | xargs kill
21ab4e
    Step-3: Do a fresh mount as mentioned here
21ab4e
     -  glusterfs -s ${localhostname} --volfile-id rebalance/$volume-name /tmp/mount/point
21ab4e
    Step-4: Go to one of the bricks (among brick1 and brick2)
21ab4e
     - cd <brick1 path>
21ab4e
    Step-5: Run the following command.
21ab4e
     - find . -not \( -path ./.glusterfs -prune \) -type f -not -perm 01000 -exec bash -c 'setfattr -n "distribute.fix.layout" -v "1" ${mountpoint}/$(dirname '{}')' \; -exec  setfattr -n "trusted.distribute.migrate-data" -v "1" ${mountpoint}/'{}' \;
21ab4e
21ab4e
    This command will ignore the linkto files and empty directories, do a fix-layout of
21ab4e
    the parent directory, and trigger a migration operation on the files.
21ab4e
21ab4e
    Step-6: Once this process is completed do "remove-brick force"
21ab4e
     - gluster v remove-brick <volname> brick1 brick2 force
21ab4e
21ab4e
    Note: Use the above script only when there are large number of empty directories.
21ab4e
    Since the script does a crawl on the brick side directly and avoids directories that
21ab4e
    are empty, the time spent on fixing layout on those directories is eliminated (even if the script
21ab4e
    does not do fix-layout on empty directories, post remove-brick a fresh layout will be built
21ab4e
    for the directory, hence not affecting application continuity).
21ab4e
21ab4e
    Detailing the expectation for hardlink migration with this patch:
21ab4e
        Hardlink is migrated only for remove-brick process. It is highly essential
21ab4e
    to have a new mount(step-3) for the hardlink migration to happen. Why?:
21ab4e
    setfattr operation is an inode based operation. Since, we are doing setfattr from
21ab4e
    fuse mount here, inode_path will try to build path from the linked dentries to the inode.
21ab4e
    For a file without hardlinks the path construction will be correct. But for hardlinks,
21ab4e
    the inode will have multiple dentries linked.
21ab4e
21ab4e
            Without fresh mount, inode_path will always get the most recently linked dentry.
21ab4e
    e.g. if there are three hardlinks named dir1/link1, dir2/link2, dir3/link3, on a client
21ab4e
    where these hardlinks are looked up, inode_path will always return the path dir3/link3
21ab4e
    if dir3/link3 was looked up most recently. Hence, we won't be able to create linkto
21ab4e
    files for all other hardlinks on destination (read gf_defrag_handle_hardlink for more details
21ab4e
    on hardlink migration).
21ab4e
21ab4e
            With a fresh mount, the lookup and setfattr become serialized. e.g. link2 won't be
21ab4e
    looked up until link1 is looked up and migrated. Hence, inode_path will always have the correct
21ab4e
    path, in this case link1 dentry is picked up(as this is the most recently looked up inode) and
21ab4e
    the path is built right.
21ab4e
21ab4e
    Note: If you run the above script on an existing mount(all entries looked up), hard links may
21ab4e
    not be migrated, but there should not be any other issue. Please raise a bug, if you find any
21ab4e
    issue.
21ab4e
21ab4e
    Tests: Manual
21ab4e
21ab4e
> Change-Id: I9854cdd4955d9e24494f348fb29ba856ea7ac50a
21ab4e
> BUG: 1450975
21ab4e
> Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
> Reviewed-on: https://review.gluster.org/17115
21ab4e
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
21ab4e
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> Smoke: Gluster Build System <jenkins@build.gluster.org>
21ab4e
> Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
21ab4e
> Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
21ab4e
Change-Id: I9854cdd4955d9e24494f348fb29ba856ea7ac50a
21ab4e
BUG: 1428936
21ab4e
Signed-off-by: Susant Palai <spalai@redhat.com>
21ab4e
Reviewed-on: https://code.engineering.redhat.com/gerrit/107633
21ab4e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
21ab4e
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
21ab4e
---
21ab4e
 xlators/cluster/dht/src/dht-common.c    |  3 +++
21ab4e
 xlators/cluster/dht/src/dht-common.h    |  7 +++---
21ab4e
 xlators/cluster/dht/src/dht-rebalance.c | 41 ++++++++++++++++++++-------------
21ab4e
 xlators/cluster/dht/src/dht-shared.c    |  4 +++-
21ab4e
 4 files changed, 35 insertions(+), 20 deletions(-)
21ab4e
21ab4e
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
21ab4e
index 9286125..052ac7f 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-common.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-common.c
21ab4e
@@ -4159,6 +4159,9 @@ dht_setxattr (call_frame_t *frame, xlator_t *this,
21ab4e
                         goto err;
21ab4e
                 }
21ab4e
 
21ab4e
+                if (gf_uuid_is_null (local->loc.pargfid))
21ab4e
+                        gf_uuid_copy (local->loc.pargfid, local->loc.parent->gfid);
21ab4e
+
21ab4e
                 methods->migration_get_dst_subvol(this, local);
21ab4e
 
21ab4e
                 if (!local->rebalance.target_node) {
21ab4e
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
21ab4e
index 184bd22..eb1f2bd 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-common.h
21ab4e
+++ b/xlators/cluster/dht/src/dht-common.h
21ab4e
@@ -463,9 +463,6 @@ struct gf_defrag_info_ {
21ab4e
         int32_t                      current_thread_count;
21ab4e
         pthread_cond_t               df_wakeup_thread;
21ab4e
 
21ab4e
-        /* Hard link handle requirement */
21ab4e
-        synclock_t                   link_lock;
21ab4e
-
21ab4e
         /* lock migration flag */
21ab4e
         gf_boolean_t                 lock_migration_enabled;
21ab4e
 };
21ab4e
@@ -565,6 +562,10 @@ struct dht_conf {
21ab4e
 
21ab4e
         gf_boolean_t    lock_migration_enabled;
21ab4e
         gf_lock_t       lock;
21ab4e
+
21ab4e
+        /* Hard link handle requirement for migration triggered from client*/
21ab4e
+        synclock_t      link_lock;
21ab4e
+
21ab4e
 };
21ab4e
 typedef struct dht_conf dht_conf_t;
21ab4e
 
21ab4e
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
index 0bbe952..1ee76fc 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-rebalance.c
21ab4e
@@ -494,7 +494,7 @@ out:
21ab4e
 static int
21ab4e
 __check_file_has_hardlink (xlator_t *this, loc_t *loc,
21ab4e
                            struct iatt *stbuf, dict_t *xattrs, int flags,
21ab4e
-                           gf_defrag_info_t *defrag, int *fop_errno)
21ab4e
+                           gf_defrag_info_t *defrag, dht_conf_t *conf, int *fop_errno)
21ab4e
 {
21ab4e
        int ret = 0;
21ab4e
 
21ab4e
@@ -505,10 +505,10 @@ __check_file_has_hardlink (xlator_t *this, loc_t *loc,
21ab4e
        if (stbuf->ia_nlink > 1) {
21ab4e
                 /* support for decomission */
21ab4e
                 if (flags == GF_DHT_MIGRATE_HARDLINK) {
21ab4e
-                        synclock_lock (&defrag->link_lock);
21ab4e
+                        synclock_lock (&conf->link_lock);
21ab4e
                         ret = gf_defrag_handle_hardlink
21ab4e
                                 (this, loc, xattrs, stbuf, fop_errno);
21ab4e
-                        synclock_unlock (&defrag->link_lock);
21ab4e
+                        synclock_unlock (&conf->link_lock);
21ab4e
                         /*
21ab4e
                           Returning zero will force the file to be remigrated.
21ab4e
                           Checkout gf_defrag_handle_hardlink for more
21ab4e
@@ -546,7 +546,8 @@ __check_file_has_hardlink (xlator_t *this, loc_t *loc,
21ab4e
 static int
21ab4e
 __is_file_migratable (xlator_t *this, loc_t *loc,
21ab4e
                       struct iatt *stbuf, dict_t *xattrs, int flags,
21ab4e
-                                gf_defrag_info_t *defrag, int *fop_errno)
21ab4e
+                      gf_defrag_info_t *defrag, dht_conf_t *conf,
21ab4e
+                      int *fop_errno)
21ab4e
 {
21ab4e
         int ret = -1;
21ab4e
         int lock_count = 0;
21ab4e
@@ -561,7 +562,7 @@ __is_file_migratable (xlator_t *this, loc_t *loc,
21ab4e
                 goto out;
21ab4e
         }
21ab4e
 
21ab4e
-        if (!defrag->lock_migration_enabled) {
21ab4e
+        if (!conf->lock_migration_enabled) {
21ab4e
                 ret = dict_get_int32 (xattrs, GLUSTERFS_POSIXLK_COUNT,
21ab4e
                                       &lock_count);
21ab4e
                 if (ret) {
21ab4e
@@ -588,7 +589,7 @@ __is_file_migratable (xlator_t *this, loc_t *loc,
21ab4e
 
21ab4e
         /* Check if file has hardlink*/
21ab4e
         ret = __check_file_has_hardlink (this, loc, stbuf, xattrs,
21ab4e
-                                         flags, defrag, fop_errno);
21ab4e
+                                         flags, defrag, conf, fop_errno);
21ab4e
 out:
21ab4e
         return ret;
21ab4e
 }
21ab4e
@@ -1493,11 +1494,19 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
         fd_t                    *linkto_fd              = NULL;
21ab4e
         gf_boolean_t            ignore_failure          = _gf_false;
21ab4e
 
21ab4e
+
21ab4e
+        /* If defrag is NULL, it should be assumed that migration is triggered
21ab4e
+         * from client */
21ab4e
         defrag = conf->defrag;
21ab4e
-        if (!defrag)
21ab4e
+
21ab4e
+        /* migration of files from clients is restricted to non-tiered clients
21ab4e
+         * for now */
21ab4e
+        if (!defrag && dht_is_tier_xlator (this)) {
21ab4e
+                ret = ENOTSUP;
21ab4e
                 goto out;
21ab4e
+        }
21ab4e
 
21ab4e
-        if (defrag->tier_conf.is_tier)
21ab4e
+        if (defrag && defrag->tier_conf.is_tier)
21ab4e
                 log_level = GF_LOG_TRACE;
21ab4e
 
21ab4e
         gf_log (this->name,
21ab4e
@@ -1526,7 +1535,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
 
21ab4e
         /* Do not migrate file in case lock migration is not enabled on the
21ab4e
          * volume*/
21ab4e
-        if (!defrag->lock_migration_enabled) {
21ab4e
+        if (!conf->lock_migration_enabled) {
21ab4e
                 ret = dict_set_int32 (dict,
21ab4e
                                  GLUSTERFS_POSIXLK_COUNT, sizeof(int32_t));
21ab4e
                 if (ret) {
21ab4e
@@ -1582,7 +1591,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
         src_ia_prot = stbuf.ia_prot;
21ab4e
 
21ab4e
         /* Check if file can be migrated */
21ab4e
-        ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag,
21ab4e
+        ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag, conf,
21ab4e
                                     fop_errno);
21ab4e
         if (ret) {
21ab4e
                 if (ret == -2)
21ab4e
@@ -1702,7 +1711,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
 
21ab4e
         /* Check again if file has hardlink */
21ab4e
         ret = __check_file_has_hardlink (this, loc, &stbuf, xattr_rsp,
21ab4e
-                                         flag, defrag, fop_errno);
21ab4e
+                                         flag, defrag, conf, fop_errno);
21ab4e
         if (ret) {
21ab4e
                 if (ret == -2)
21ab4e
                         ret = 0;
21ab4e
@@ -1715,7 +1724,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
 
21ab4e
 
21ab4e
         /* All I/O happens in this function */
21ab4e
-        if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
21ab4e
+        if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
21ab4e
                 ret = __tier_migrate_data (defrag, from, to, src_fd, dst_fd,
21ab4e
                                                     stbuf.ia_size,
21ab4e
                                                     file_has_holes, fop_errno);
21ab4e
@@ -1774,7 +1783,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
 
21ab4e
         /* Take meta lock  */
21ab4e
 
21ab4e
-        if (defrag->lock_migration_enabled) {
21ab4e
+        if (conf->lock_migration_enabled) {
21ab4e
                 meta_dict = dict_new ();
21ab4e
                 if (!meta_dict) {
21ab4e
                         gf_msg (this->name, GF_LOG_ERROR, 0,
21ab4e
@@ -1822,7 +1831,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
                 }
21ab4e
         }
21ab4e
 
21ab4e
-        if (!defrag->lock_migration_enabled) {
21ab4e
+        if (!conf->lock_migration_enabled) {
21ab4e
                 plock.l_type = F_WRLCK;
21ab4e
                 plock.l_start = 0;
21ab4e
                 plock.l_len = 0;
21ab4e
@@ -2020,7 +2029,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
         }
21ab4e
 
21ab4e
         /* store size of previous migrated file  */
21ab4e
-        if (defrag->tier_conf.is_tier) {
21ab4e
+        if (defrag && defrag->tier_conf.is_tier) {
21ab4e
                 if (from != TIER_HASHED_SUBVOL) {
21ab4e
                         defrag->tier_conf.st_last_promoted_size = stbuf.ia_size;
21ab4e
                 } else {
21ab4e
@@ -2130,7 +2139,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
21ab4e
 
21ab4e
 metaunlock:
21ab4e
 
21ab4e
-        if (defrag->lock_migration_enabled && meta_locked) {
21ab4e
+        if (conf->lock_migration_enabled && meta_locked) {
21ab4e
 
21ab4e
                 dict_del (meta_dict, GF_META_LOCK_KEY);
21ab4e
 
21ab4e
diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c
21ab4e
index 86b19e3..1128cfe 100644
21ab4e
--- a/xlators/cluster/dht/src/dht-shared.c
21ab4e
+++ b/xlators/cluster/dht/src/dht-shared.c
21ab4e
@@ -241,6 +241,8 @@ dht_fini (xlator_t *this)
21ab4e
 
21ab4e
                 GF_FREE (conf->subvolume_status);
21ab4e
 
21ab4e
+                synclock_destroy (&conf->link_lock);
21ab4e
+
21ab4e
                 if (conf->lock_pool)
21ab4e
                         mem_pool_destroy (conf->lock_pool);
21ab4e
 
21ab4e
@@ -678,6 +680,7 @@ dht_init (xlator_t *this)
21ab4e
         LOCK_INIT (&conf->subvolume_lock);
21ab4e
         LOCK_INIT (&conf->layout_lock);
21ab4e
         LOCK_INIT (&conf->lock);
21ab4e
+        synclock_init (&conf->link_lock, SYNC_LOCK_DEFAULT);
21ab4e
 
21ab4e
         /* We get the commit-hash to set only for rebalance process */
21ab4e
         if (dict_get_uint32 (this->options,
21ab4e
@@ -733,7 +736,6 @@ dht_init (xlator_t *this)
21ab4e
 
21ab4e
                 defrag->wakeup_crawler = 0;
21ab4e
 
21ab4e
-                synclock_init (&defrag->link_lock, SYNC_LOCK_DEFAULT);
21ab4e
                 pthread_mutex_init (&defrag->dfq_mutex, 0);
21ab4e
                 pthread_cond_init  (&defrag->parallel_migration_cond, 0);
21ab4e
                 pthread_cond_init  (&defrag->rebalance_crawler_alarm, 0);
21ab4e
-- 
21ab4e
1.8.3.1
21ab4e