17b94a
From 29ec87484b1ee3ad6417c37726db8aa9296f3a83 Mon Sep 17 00:00:00 2001
17b94a
From: Kotresh HR <khiremat@redhat.com>
17b94a
Date: Wed, 8 May 2019 11:26:06 +0530
17b94a
Subject: [PATCH 163/169] geo-rep: Fix sync hang with tarssh
17b94a
17b94a
Problem:
17b94a
Geo-rep sync hangs when tarssh is used as sync
17b94a
engine at heavy workload.
17b94a
17b94a
Analysis and Root cause:
17b94a
It's found out that the tar process was hung.
17b94a
When debugged further, it's found out that stderr
17b94a
buffer of tar process on master was full i.e., 64k.
17b94a
When the buffer was copied to a file from /proc/pid/fd/2,
17b94a
the hang is resolved.
17b94a
17b94a
This can happen when files picked by tar process
17b94a
to sync doesn't exist on master anymore. If this count
17b94a
increases around 1k, the stderr buffer is filled up.
17b94a
17b94a
Fix:
17b94a
The tar process is executed using Popen with stderr as PIPE.
17b94a
The final execution is something like below.
17b94a
17b94a
tar | ssh <args> root@slave tar --overwrite -xf - -C <path>
17b94a
17b94a
It was waiting on ssh process first using communicate() and then tar.
17b94a
Note that communicate() reads stdout and stderr. So when stderr of tar
17b94a
process is filled up, there is no one to read until untar via ssh is
17b94a
completed. This can't happen and leads to deadlock.
17b94a
Hence we should be waiting on both process parallely, so that stderr is
17b94a
read on both processes.
17b94a
17b94a
Backport of:
17b94a
 > Patch: https://review.gluster.org/22684
17b94a
 > Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
17b94a
 > BUG: 1707728
17b94a
 > Signed-off-by: Kotresh HR <khiremat@redhat.com>
17b94a
17b94a
Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
17b94a
fixes: bz#1708116
17b94a
Signed-off-by: Kotresh HR <khiremat@redhat.com>
17b94a
Reviewed-on: https://code.engineering.redhat.com/gerrit/172395
17b94a
Tested-by: RHGS Build Bot <nigelb@redhat.com>
17b94a
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
17b94a
---
17b94a
 geo-replication/syncdaemon/resource.py |  22 ++++--
17b94a
 tests/00-geo-rep/georep-stderr-hang.t  | 128 +++++++++++++++++++++++++++++++++
17b94a
 tests/geo-rep.rc                       |  17 +++++
17b94a
 3 files changed, 163 insertions(+), 4 deletions(-)
17b94a
 create mode 100644 tests/00-geo-rep/georep-stderr-hang.t
17b94a
17b94a
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
17b94a
index 522279b..b16db60 100644
17b94a
--- a/geo-replication/syncdaemon/resource.py
17b94a
+++ b/geo-replication/syncdaemon/resource.py
17b94a
@@ -1540,15 +1540,29 @@ class SSH(object):
17b94a
 
17b94a
         p0.stdin.close()
17b94a
         p0.stdout.close()  # Allow p0 to receive a SIGPIPE if p1 exits.
17b94a
-        # wait for tar to terminate, collecting any errors, further
17b94a
-        # waiting for transfer to complete
17b94a
-        _, stderr1 = p1.communicate()
17b94a
 
17b94a
         # stdin and stdout of p0 is already closed, Reset to None and
17b94a
         # wait for child process to complete
17b94a
         p0.stdin = None
17b94a
         p0.stdout = None
17b94a
-        p0.communicate()
17b94a
+
17b94a
+        def wait_for_tar(p0):
17b94a
+            _, stderr = p0.communicate()
17b94a
+            if log_err:
17b94a
+                for errline in stderr.strip().split("\n")[:-1]:
17b94a
+                    if "No such file or directory" not in errline:
17b94a
+                        logging.error(lf("SYNC Error",
17b94a
+                                         sync_engine="Tarssh",
17b94a
+                                         error=errline))
17b94a
+
17b94a
+        t = syncdutils.Thread(target=wait_for_tar, args=(p0, ))
17b94a
+        # wait for tar to terminate, collecting any errors, further
17b94a
+        # waiting for transfer to complete
17b94a
+        t.start()
17b94a
+
17b94a
+        # wait for ssh process
17b94a
+        _, stderr1 = p1.communicate()
17b94a
+        t.join()
17b94a
 
17b94a
         if log_err:
17b94a
             for errline in stderr1.strip().split("\n")[:-1]:
17b94a
diff --git a/tests/00-geo-rep/georep-stderr-hang.t b/tests/00-geo-rep/georep-stderr-hang.t
17b94a
new file mode 100644
17b94a
index 0000000..496f0e6
17b94a
--- /dev/null
17b94a
+++ b/tests/00-geo-rep/georep-stderr-hang.t
17b94a
@@ -0,0 +1,128 @@
17b94a
+#!/bin/bash
17b94a
+
17b94a
+. $(dirname $0)/../include.rc
17b94a
+. $(dirname $0)/../volume.rc
17b94a
+. $(dirname $0)/../geo-rep.rc
17b94a
+. $(dirname $0)/../env.rc
17b94a
+
17b94a
+SCRIPT_TIMEOUT=500
17b94a
+
17b94a
+AREQUAL_PATH=$(dirname $0)/../utils
17b94a
+test "`uname -s`" != "Linux" && {
17b94a
+    CFLAGS="$CFLAGS -lintl";
17b94a
+}
17b94a
+build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
17b94a
+
17b94a
+### Basic Tests with Distribute Replicate volumes
17b94a
+
17b94a
+##Cleanup and start glusterd
17b94a
+cleanup;
17b94a
+TEST glusterd;
17b94a
+TEST pidof glusterd
17b94a
+
17b94a
+
17b94a
+##Variables
17b94a
+GEOREP_CLI="$CLI volume geo-replication"
17b94a
+master=$GMV0
17b94a
+SH0="127.0.0.1"
17b94a
+slave=${SH0}::${GSV0}
17b94a
+num_active=2
17b94a
+num_passive=2
17b94a
+master_mnt=$M0
17b94a
+slave_mnt=$M1
17b94a
+
17b94a
+############################################################
17b94a
+#SETUP VOLUMES AND GEO-REPLICATION
17b94a
+############################################################
17b94a
+
17b94a
+##create_and_start_master_volume
17b94a
+TEST $CLI volume create $GMV0 $H0:$B0/${GMV0}1;
17b94a
+TEST $CLI volume start $GMV0
17b94a
+
17b94a
+##create_and_start_slave_volume
17b94a
+TEST $CLI volume create $GSV0 $H0:$B0/${GSV0}1;
17b94a
+TEST $CLI volume start $GSV0
17b94a
+TEST $CLI volume set $GSV0 performance.stat-prefetch off
17b94a
+TEST $CLI volume set $GSV0 performance.quick-read off
17b94a
+TEST $CLI volume set $GSV0 performance.readdir-ahead off
17b94a
+TEST $CLI volume set $GSV0 performance.read-ahead off
17b94a
+
17b94a
+##Mount master
17b94a
+TEST glusterfs -s $H0 --volfile-id $GMV0 $M0
17b94a
+
17b94a
+##Mount slave
17b94a
+TEST glusterfs -s $H0 --volfile-id $GSV0 $M1
17b94a
+
17b94a
+############################################################
17b94a
+#BASIC GEO-REPLICATION TESTS
17b94a
+############################################################
17b94a
+
17b94a
+TEST create_georep_session $master $slave
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Created"
17b94a
+
17b94a
+#Config gluster-command-dir
17b94a
+TEST $GEOREP_CLI $master $slave config gluster-command-dir ${GLUSTER_CMD_DIR}
17b94a
+
17b94a
+#Config gluster-command-dir
17b94a
+TEST $GEOREP_CLI $master $slave config slave-gluster-command-dir ${GLUSTER_CMD_DIR}
17b94a
+
17b94a
+#Set changelog roll-over time to 45 secs
17b94a
+TEST $CLI volume set $GMV0 changelog.rollover-time 45
17b94a
+
17b94a
+#Wait for common secret pem file to be created
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT  0 check_common_secret_file
17b94a
+
17b94a
+#Verify the keys are distributed
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT  0 check_keys_distributed
17b94a
+
17b94a
+#Set sync-jobs to 1
17b94a
+TEST $GEOREP_CLI $master $slave config sync-jobs 1
17b94a
+
17b94a
+#Start_georep
17b94a
+TEST $GEOREP_CLI $master $slave start
17b94a
+
17b94a
+touch $M0
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Changelog Crawl"
17b94a
+
17b94a
+#Check History Crawl.
17b94a
+TEST $GEOREP_CLI $master $slave stop
17b94a
+TEST create_data_hang "rsync_hang"
17b94a
+TEST create_data "history_rsync"
17b94a
+TEST $GEOREP_CLI $master $slave start
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
17b94a
+
17b94a
+#Verify arequal for whole volume
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
17b94a
+
17b94a
+#Stop Geo-rep
17b94a
+TEST $GEOREP_CLI $master $slave stop
17b94a
+
17b94a
+#Config tarssh as sync-engine
17b94a
+TEST $GEOREP_CLI $master $slave config sync-method tarssh
17b94a
+
17b94a
+#Create tarssh hang data
17b94a
+TEST create_data_hang "tarssh_hang"
17b94a
+TEST create_data "history_tar"
17b94a
+
17b94a
+TEST $GEOREP_CLI $master $slave start
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
17b94a
+
17b94a
+#Verify arequal for whole volume
17b94a
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
17b94a
+
17b94a
+#Stop Geo-rep
17b94a
+TEST $GEOREP_CLI $master $slave stop
17b94a
+
17b94a
+#Delete Geo-rep
17b94a
+TEST $GEOREP_CLI $master $slave delete
17b94a
+
17b94a
+#Cleanup are-equal binary
17b94a
+TEST rm $AREQUAL_PATH/arequal-checksum
17b94a
+
17b94a
+#Cleanup authorized keys
17b94a
+sed -i '/^command=.*SSH_ORIGINAL_COMMAND#.*/d' ~/.ssh/authorized_keys
17b94a
+sed -i '/^command=.*gsyncd.*/d' ~/.ssh/authorized_keys
17b94a
+
17b94a
+cleanup;
17b94a
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
17b94a
diff --git a/tests/geo-rep.rc b/tests/geo-rep.rc
17b94a
index 2035b9f..e4f014e 100644
17b94a
--- a/tests/geo-rep.rc
17b94a
+++ b/tests/geo-rep.rc
17b94a
@@ -101,6 +101,23 @@ function create_data()
17b94a
     chown 1000:1000 ${master_mnt}/${prefix}_chown_f1_ಸಂತಸ
17b94a
 }
17b94a
 
17b94a
+function create_data_hang()
17b94a
+{
17b94a
+    prefix=$1
17b94a
+    mkdir ${master_mnt}/${prefix}
17b94a
+    cd ${master_mnt}/${prefix}
17b94a
+    # ~1k files is required with 1 sync-job and hang happens if
17b94a
+    # stderr buffer of tar/ssh executed with Popen is full (i.e., 64k).
17b94a
+    # 64k is hit when ~800 files were  not found while syncing data
17b94a
+    # from master. So around 1k files is required to hit the condition.
17b94a
+    for i in {1..1000}
17b94a
+    do
17b94a
+        echo "test data" > file$i
17b94a
+        mv -f file$i file
17b94a
+    done
17b94a
+    cd -
17b94a
+}
17b94a
+
17b94a
 function chown_file_ok()
17b94a
 {
17b94a
     local file_owner=$(stat --format "%u:%g" "$1")
17b94a
-- 
17b94a
1.8.3.1
17b94a