From a587b4c7e84c18257bc3ba724bb9aae81bb1f3b9 Mon Sep 17 00:00:00 2001 From: Susant Palai Date: Tue, 25 Apr 2017 18:32:45 +0530 Subject: [PATCH 477/486] cluster/dht: fix on demand migration files from client On-demand migration of files, i.e. migration done by clients triggered by a setfattr, was broken. A dependency on defrag led to a crash when migration was triggered from a client. Note: This functionality is not available for tiered volumes. Migration from a tier-served client will fail with ENOTSUP. Usage (but refer to the steps mentioned below to avoid any issues): setfattr -n "trusted.distribute.migrate-data" -v "1" The purpose of fixing the on-demand client migration was to give a workaround for the case where the user has lots of empty directories compared to files and wants to run a remove-brick process. Here are the steps to trigger file migration for the remove-brick process from a client. (It is highly recommended to follow the steps below as is.) Let's say it is a replica volume and the user wants to remove a replica pair named brick1 and brick2. (Make sure healing is completed before you run these steps.) Step-1: Start remove-brick process - gluster v remove-brick brick1 brick2 start Step-2: Kill the rebalance daemon - ps aux | grep glusterfs | grep rebalance\/ | awk '{print $2}' | xargs kill Step-3: Do a fresh mount as mentioned here - glusterfs -s ${localhostname} --volfile-id rebalance/$volume-name /tmp/mount/point Step-4: Go to one of the bricks (among brick1 and brick2) - cd Step-5: Run the following command. - find . -not \( -path ./.glusterfs -prune \) -type f -not -perm 01000 -exec bash -c 'setfattr -n "distribute.fix.layout" -v "1" ${mountpoint}/$(dirname '{}')' \; -exec setfattr -n "trusted.distribute.migrate-data" -v "1" ${mountpoint}/'{}' \; This command ignores the linkto files and empty directories, does a fix-layout of the parent directory, and triggers a migration operation on the files. 
Step-6: Once this process is completed, do "remove-brick force" - gluster v remove-brick brick1 brick2 force Note: Use the above script only when there is a large number of empty directories. Since the script crawls the brick side directly and skips directories that are empty, the time spent on fixing the layout of those directories is eliminated (even if the script does not do fix-layout on empty directories, post remove-brick a fresh layout will be built for the directory, hence not affecting application continuity). Detailing the expectation for hardlink migration with this patch: Hardlinks are migrated only for the remove-brick process. It is essential to have a new mount (step-3) for the hardlink migration to happen. Why?: setfattr is an inode-based operation. Since we are doing setfattr from a fuse mount here, inode_path will try to build the path from the dentries linked to the inode. For a file without hardlinks the path construction will be correct. But for hardlinks, the inode will have multiple dentries linked. Without a fresh mount, inode_path will always get the most recently linked dentry. E.g. if there are three hardlinks named dir1/link1, dir2/link2, dir3/link3, on a client where these hardlinks are looked up, inode_path will always return the path dir3/link3 if dir3/link3 was looked up most recently. Hence, we won't be able to create linkto files for all other hardlinks on the destination (read gf_defrag_handle_hardlink for more details on hardlink migration). With a fresh mount, the lookup and setfattr become serialized, e.g. link2 won't be looked up until link1 is looked up and migrated. Hence, inode_path will always have the correct path; in this case the link1 dentry is picked up (as this is the most recently looked up inode) and the path is built correctly. Note: If you run the above script on an existing mount (all entries looked up), hard links may not be migrated, but there should not be any other issue. Please raise a bug if you find any issue. 
Tests: Manual > Change-Id: I9854cdd4955d9e24494f348fb29ba856ea7ac50a > UG: 1450975 > Signed-off-by: Susant Palai > Reviewed-on: https://review.gluster.org/17115 > NetBSD-regression: NetBSD Build System > CentOS-regression: Gluster Build System > Smoke: Gluster Build System > Reviewed-by: Raghavendra G > Signed-off-by: Susant Palai Change-Id: I9854cdd4955d9e24494f348fb29ba856ea7ac50a BUG: 1428936 Signed-off-by: Susant Palai Reviewed-on: https://code.engineering.redhat.com/gerrit/107633 Reviewed-by: Atin Mukherjee Reviewed-by: Nithya Balachandran --- xlators/cluster/dht/src/dht-common.c | 3 +++ xlators/cluster/dht/src/dht-common.h | 7 +++--- xlators/cluster/dht/src/dht-rebalance.c | 41 ++++++++++++++++++++------------- xlators/cluster/dht/src/dht-shared.c | 4 +++- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c index 9286125..052ac7f 100644 --- a/xlators/cluster/dht/src/dht-common.c +++ b/xlators/cluster/dht/src/dht-common.c @@ -4159,6 +4159,9 @@ dht_setxattr (call_frame_t *frame, xlator_t *this, goto err; } + if (gf_uuid_is_null (local->loc.pargfid)) + gf_uuid_copy (local->loc.pargfid, local->loc.parent->gfid); + methods->migration_get_dst_subvol(this, local); if (!local->rebalance.target_node) { diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h index 184bd22..eb1f2bd 100644 --- a/xlators/cluster/dht/src/dht-common.h +++ b/xlators/cluster/dht/src/dht-common.h @@ -463,9 +463,6 @@ struct gf_defrag_info_ { int32_t current_thread_count; pthread_cond_t df_wakeup_thread; - /* Hard link handle requirement */ - synclock_t link_lock; - /* lock migration flag */ gf_boolean_t lock_migration_enabled; }; @@ -565,6 +562,10 @@ struct dht_conf { gf_boolean_t lock_migration_enabled; gf_lock_t lock; + + /* Hard link handle requirement for migration triggered from client*/ + synclock_t link_lock; + }; typedef struct dht_conf dht_conf_t; diff --git 
a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c index 0bbe952..1ee76fc 100644 --- a/xlators/cluster/dht/src/dht-rebalance.c +++ b/xlators/cluster/dht/src/dht-rebalance.c @@ -494,7 +494,7 @@ out: static int __check_file_has_hardlink (xlator_t *this, loc_t *loc, struct iatt *stbuf, dict_t *xattrs, int flags, - gf_defrag_info_t *defrag, int *fop_errno) + gf_defrag_info_t *defrag, dht_conf_t *conf, int *fop_errno) { int ret = 0; @@ -505,10 +505,10 @@ __check_file_has_hardlink (xlator_t *this, loc_t *loc, if (stbuf->ia_nlink > 1) { /* support for decomission */ if (flags == GF_DHT_MIGRATE_HARDLINK) { - synclock_lock (&defrag->link_lock); + synclock_lock (&conf->link_lock); ret = gf_defrag_handle_hardlink (this, loc, xattrs, stbuf, fop_errno); - synclock_unlock (&defrag->link_lock); + synclock_unlock (&conf->link_lock); /* Returning zero will force the file to be remigrated. Checkout gf_defrag_handle_hardlink for more @@ -546,7 +546,8 @@ __check_file_has_hardlink (xlator_t *this, loc_t *loc, static int __is_file_migratable (xlator_t *this, loc_t *loc, struct iatt *stbuf, dict_t *xattrs, int flags, - gf_defrag_info_t *defrag, int *fop_errno) + gf_defrag_info_t *defrag, dht_conf_t *conf, + int *fop_errno) { int ret = -1; int lock_count = 0; @@ -561,7 +562,7 @@ __is_file_migratable (xlator_t *this, loc_t *loc, goto out; } - if (!defrag->lock_migration_enabled) { + if (!conf->lock_migration_enabled) { ret = dict_get_int32 (xattrs, GLUSTERFS_POSIXLK_COUNT, &lock_count); if (ret) { @@ -588,7 +589,7 @@ __is_file_migratable (xlator_t *this, loc_t *loc, /* Check if file has hardlink*/ ret = __check_file_has_hardlink (this, loc, stbuf, xattrs, - flags, defrag, fop_errno); + flags, defrag, conf, fop_errno); out: return ret; } @@ -1493,11 +1494,19 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, fd_t *linkto_fd = NULL; gf_boolean_t ignore_failure = _gf_false; + + /* If defrag is NULL, it should be assumed that 
migration is triggered + * from client */ defrag = conf->defrag; - if (!defrag) + + /* migration of files from clients is restricted to non-tiered clients + * for now */ + if (!defrag && dht_is_tier_xlator (this)) { + ret = ENOTSUP; goto out; + } - if (defrag->tier_conf.is_tier) + if (defrag && defrag->tier_conf.is_tier) log_level = GF_LOG_TRACE; gf_log (this->name, @@ -1526,7 +1535,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, /* Do not migrate file in case lock migration is not enabled on the * volume*/ - if (!defrag->lock_migration_enabled) { + if (!conf->lock_migration_enabled) { ret = dict_set_int32 (dict, GLUSTERFS_POSIXLK_COUNT, sizeof(int32_t)); if (ret) { @@ -1582,7 +1591,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, src_ia_prot = stbuf.ia_prot; /* Check if file can be migrated */ - ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag, + ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag, conf, fop_errno); if (ret) { if (ret == -2) @@ -1702,7 +1711,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, /* Check again if file has hardlink */ ret = __check_file_has_hardlink (this, loc, &stbuf, xattr_rsp, - flag, defrag, fop_errno); + flag, defrag, conf, fop_errno); if (ret) { if (ret == -2) ret = 0; @@ -1715,7 +1724,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, /* All I/O happens in this function */ - if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) { + if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) { ret = __tier_migrate_data (defrag, from, to, src_fd, dst_fd, stbuf.ia_size, file_has_holes, fop_errno); @@ -1774,7 +1783,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, /* Take meta lock */ - if (defrag->lock_migration_enabled) { + if (conf->lock_migration_enabled) { meta_dict = dict_new (); if (!meta_dict) { gf_msg (this->name, GF_LOG_ERROR, 0, @@ -1822,7 
+1831,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } } - if (!defrag->lock_migration_enabled) { + if (!conf->lock_migration_enabled) { plock.l_type = F_WRLCK; plock.l_start = 0; plock.l_len = 0; @@ -2020,7 +2029,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, } /* store size of previous migrated file */ - if (defrag->tier_conf.is_tier) { + if (defrag && defrag->tier_conf.is_tier) { if (from != TIER_HASHED_SUBVOL) { defrag->tier_conf.st_last_promoted_size = stbuf.ia_size; } else { @@ -2130,7 +2139,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, metaunlock: - if (defrag->lock_migration_enabled && meta_locked) { + if (conf->lock_migration_enabled && meta_locked) { dict_del (meta_dict, GF_META_LOCK_KEY); diff --git a/xlators/cluster/dht/src/dht-shared.c b/xlators/cluster/dht/src/dht-shared.c index 86b19e3..1128cfe 100644 --- a/xlators/cluster/dht/src/dht-shared.c +++ b/xlators/cluster/dht/src/dht-shared.c @@ -241,6 +241,8 @@ dht_fini (xlator_t *this) GF_FREE (conf->subvolume_status); + synclock_destroy (&conf->link_lock); + if (conf->lock_pool) mem_pool_destroy (conf->lock_pool); @@ -678,6 +680,7 @@ dht_init (xlator_t *this) LOCK_INIT (&conf->subvolume_lock); LOCK_INIT (&conf->layout_lock); LOCK_INIT (&conf->lock); + synclock_init (&conf->link_lock, SYNC_LOCK_DEFAULT); /* We get the commit-hash to set only for rebalance process */ if (dict_get_uint32 (this->options, @@ -733,7 +736,6 @@ dht_init (xlator_t *this) defrag->wakeup_crawler = 0; - synclock_init (&defrag->link_lock, SYNC_LOCK_DEFAULT); pthread_mutex_init (&defrag->dfq_mutex, 0); pthread_cond_init (&defrag->parallel_migration_cond, 0); pthread_cond_init (&defrag->rebalance_crawler_alarm, 0); -- 1.8.3.1