14f8ab
From 29ec87484b1ee3ad6417c37726db8aa9296f3a83 Mon Sep 17 00:00:00 2001
14f8ab
From: Kotresh HR <khiremat@redhat.com>
14f8ab
Date: Wed, 8 May 2019 11:26:06 +0530
14f8ab
Subject: [PATCH 163/169] geo-rep: Fix sync hang with tarssh
14f8ab
14f8ab
Problem:
14f8ab
Geo-rep sync hangs when tarssh is used as sync
14f8ab
engine at heavy workload.
14f8ab
14f8ab
Analysis and Root cause:
14f8ab
It's found out that the tar process was hung.
14f8ab
When debugged further, it's found out that stderr
14f8ab
buffer of tar process on master was full i.e., 64k.
14f8ab
When the buffer was copied to a file from /proc/pid/fd/2,
14f8ab
the hang is resolved.
14f8ab
14f8ab
This can happen when files picked by tar process
14f8ab
to sync doesn't exist on master anymore. If this count
14f8ab
increases around 1k, the stderr buffer is filled up.
14f8ab
14f8ab
Fix:
14f8ab
The tar process is executed using Popen with stderr as PIPE.
14f8ab
The final execution is something like below.
14f8ab
14f8ab
tar | ssh <args> root@slave tar --overwrite -xf - -C <path>
14f8ab
14f8ab
It was waiting on ssh process first using communicate() and then tar.
14f8ab
Note that communicate() reads stdout and stderr. So when stderr of tar
14f8ab
process is filled up, there is no one to read until untar via ssh is
14f8ab
completed. This can't happen and leads to deadlock.
14f8ab
Hence we should be waiting on both process parallely, so that stderr is
14f8ab
read on both processes.
14f8ab
14f8ab
Backport of:
14f8ab
 > Patch: https://review.gluster.org/22684
14f8ab
 > Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
14f8ab
 > BUG: 1707728
14f8ab
 > Signed-off-by: Kotresh HR <khiremat@redhat.com>
14f8ab
14f8ab
Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
14f8ab
fixes: bz#1708116
14f8ab
Signed-off-by: Kotresh HR <khiremat@redhat.com>
14f8ab
Reviewed-on: https://code.engineering.redhat.com/gerrit/172395
14f8ab
Tested-by: RHGS Build Bot <nigelb@redhat.com>
14f8ab
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
14f8ab
---
14f8ab
 geo-replication/syncdaemon/resource.py |  22 ++++--
14f8ab
 tests/00-geo-rep/georep-stderr-hang.t  | 128 +++++++++++++++++++++++++++++++++
14f8ab
 tests/geo-rep.rc                       |  17 +++++
14f8ab
 3 files changed, 163 insertions(+), 4 deletions(-)
14f8ab
 create mode 100644 tests/00-geo-rep/georep-stderr-hang.t
14f8ab
14f8ab
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
14f8ab
index 522279b..b16db60 100644
14f8ab
--- a/geo-replication/syncdaemon/resource.py
14f8ab
+++ b/geo-replication/syncdaemon/resource.py
14f8ab
@@ -1540,15 +1540,29 @@ class SSH(object):
14f8ab
 
14f8ab
         p0.stdin.close()
14f8ab
         p0.stdout.close()  # Allow p0 to receive a SIGPIPE if p1 exits.
14f8ab
-        # wait for tar to terminate, collecting any errors, further
14f8ab
-        # waiting for transfer to complete
14f8ab
-        _, stderr1 = p1.communicate()
14f8ab
 
14f8ab
         # stdin and stdout of p0 is already closed, Reset to None and
14f8ab
         # wait for child process to complete
14f8ab
         p0.stdin = None
14f8ab
         p0.stdout = None
14f8ab
-        p0.communicate()
14f8ab
+
14f8ab
+        def wait_for_tar(p0):
14f8ab
+            _, stderr = p0.communicate()
14f8ab
+            if log_err:
14f8ab
+                for errline in stderr.strip().split("\n")[:-1]:
14f8ab
+                    if "No such file or directory" not in errline:
14f8ab
+                        logging.error(lf("SYNC Error",
14f8ab
+                                         sync_engine="Tarssh",
14f8ab
+                                         error=errline))
14f8ab
+
14f8ab
+        t = syncdutils.Thread(target=wait_for_tar, args=(p0, ))
14f8ab
+        # wait for tar to terminate, collecting any errors, further
14f8ab
+        # waiting for transfer to complete
14f8ab
+        t.start()
14f8ab
+
14f8ab
+        # wait for ssh process
14f8ab
+        _, stderr1 = p1.communicate()
14f8ab
+        t.join()
14f8ab
 
14f8ab
         if log_err:
14f8ab
             for errline in stderr1.strip().split("\n")[:-1]:
14f8ab
diff --git a/tests/00-geo-rep/georep-stderr-hang.t b/tests/00-geo-rep/georep-stderr-hang.t
14f8ab
new file mode 100644
14f8ab
index 0000000..496f0e6
14f8ab
--- /dev/null
14f8ab
+++ b/tests/00-geo-rep/georep-stderr-hang.t
14f8ab
@@ -0,0 +1,128 @@
14f8ab
+#!/bin/bash
14f8ab
+
14f8ab
+. $(dirname $0)/../include.rc
14f8ab
+. $(dirname $0)/../volume.rc
14f8ab
+. $(dirname $0)/../geo-rep.rc
14f8ab
+. $(dirname $0)/../env.rc
14f8ab
+
14f8ab
+SCRIPT_TIMEOUT=500
14f8ab
+
14f8ab
+AREQUAL_PATH=$(dirname $0)/../utils
14f8ab
+test "`uname -s`" != "Linux" && {
14f8ab
+    CFLAGS="$CFLAGS -lintl";
14f8ab
+}
14f8ab
+build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
14f8ab
+
14f8ab
+### Basic Tests with Distribute Replicate volumes
14f8ab
+
14f8ab
+##Cleanup and start glusterd
14f8ab
+cleanup;
14f8ab
+TEST glusterd;
14f8ab
+TEST pidof glusterd
14f8ab
+
14f8ab
+
14f8ab
+##Variables
14f8ab
+GEOREP_CLI="$CLI volume geo-replication"
14f8ab
+master=$GMV0
14f8ab
+SH0="127.0.0.1"
14f8ab
+slave=${SH0}::${GSV0}
14f8ab
+num_active=2
14f8ab
+num_passive=2
14f8ab
+master_mnt=$M0
14f8ab
+slave_mnt=$M1
14f8ab
+
14f8ab
+############################################################
14f8ab
+#SETUP VOLUMES AND GEO-REPLICATION
14f8ab
+############################################################
14f8ab
+
14f8ab
+##create_and_start_master_volume
14f8ab
+TEST $CLI volume create $GMV0 $H0:$B0/${GMV0}1;
14f8ab
+TEST $CLI volume start $GMV0
14f8ab
+
14f8ab
+##create_and_start_slave_volume
14f8ab
+TEST $CLI volume create $GSV0 $H0:$B0/${GSV0}1;
14f8ab
+TEST $CLI volume start $GSV0
14f8ab
+TEST $CLI volume set $GSV0 performance.stat-prefetch off
14f8ab
+TEST $CLI volume set $GSV0 performance.quick-read off
14f8ab
+TEST $CLI volume set $GSV0 performance.readdir-ahead off
14f8ab
+TEST $CLI volume set $GSV0 performance.read-ahead off
14f8ab
+
14f8ab
+##Mount master
14f8ab
+TEST glusterfs -s $H0 --volfile-id $GMV0 $M0
14f8ab
+
14f8ab
+##Mount slave
14f8ab
+TEST glusterfs -s $H0 --volfile-id $GSV0 $M1
14f8ab
+
14f8ab
+############################################################
14f8ab
+#BASIC GEO-REPLICATION TESTS
14f8ab
+############################################################
14f8ab
+
14f8ab
+TEST create_georep_session $master $slave
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Created"
14f8ab
+
14f8ab
+#Config gluster-command-dir
14f8ab
+TEST $GEOREP_CLI $master $slave config gluster-command-dir ${GLUSTER_CMD_DIR}
14f8ab
+
14f8ab
+#Config gluster-command-dir
14f8ab
+TEST $GEOREP_CLI $master $slave config slave-gluster-command-dir ${GLUSTER_CMD_DIR}
14f8ab
+
14f8ab
+#Set changelog roll-over time to 45 secs
14f8ab
+TEST $CLI volume set $GMV0 changelog.rollover-time 45
14f8ab
+
14f8ab
+#Wait for common secret pem file to be created
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT  0 check_common_secret_file
14f8ab
+
14f8ab
+#Verify the keys are distributed
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT  0 check_keys_distributed
14f8ab
+
14f8ab
+#Set sync-jobs to 1
14f8ab
+TEST $GEOREP_CLI $master $slave config sync-jobs 1
14f8ab
+
14f8ab
+#Start_georep
14f8ab
+TEST $GEOREP_CLI $master $slave start
14f8ab
+
14f8ab
+touch $M0
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Changelog Crawl"
14f8ab
+
14f8ab
+#Check History Crawl.
14f8ab
+TEST $GEOREP_CLI $master $slave stop
14f8ab
+TEST create_data_hang "rsync_hang"
14f8ab
+TEST create_data "history_rsync"
14f8ab
+TEST $GEOREP_CLI $master $slave start
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
14f8ab
+
14f8ab
+#Verify arequal for whole volume
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
14f8ab
+
14f8ab
+#Stop Geo-rep
14f8ab
+TEST $GEOREP_CLI $master $slave stop
14f8ab
+
14f8ab
+#Config tarssh as sync-engine
14f8ab
+TEST $GEOREP_CLI $master $slave config sync-method tarssh
14f8ab
+
14f8ab
+#Create tarssh hang data
14f8ab
+TEST create_data_hang "tarssh_hang"
14f8ab
+TEST create_data "history_tar"
14f8ab
+
14f8ab
+TEST $GEOREP_CLI $master $slave start
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
14f8ab
+
14f8ab
+#Verify arequal for whole volume
14f8ab
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
14f8ab
+
14f8ab
+#Stop Geo-rep
14f8ab
+TEST $GEOREP_CLI $master $slave stop
14f8ab
+
14f8ab
+#Delete Geo-rep
14f8ab
+TEST $GEOREP_CLI $master $slave delete
14f8ab
+
14f8ab
+#Cleanup are-equal binary
14f8ab
+TEST rm $AREQUAL_PATH/arequal-checksum
14f8ab
+
14f8ab
+#Cleanup authorized keys
14f8ab
+sed -i '/^command=.*SSH_ORIGINAL_COMMAND#.*/d' ~/.ssh/authorized_keys
14f8ab
+sed -i '/^command=.*gsyncd.*/d' ~/.ssh/authorized_keys
14f8ab
+
14f8ab
+cleanup;
14f8ab
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
14f8ab
diff --git a/tests/geo-rep.rc b/tests/geo-rep.rc
14f8ab
index 2035b9f..e4f014e 100644
14f8ab
--- a/tests/geo-rep.rc
14f8ab
+++ b/tests/geo-rep.rc
14f8ab
@@ -101,6 +101,23 @@ function create_data()
14f8ab
     chown 1000:1000 ${master_mnt}/${prefix}_chown_f1_ಸಂತಸ
14f8ab
 }
14f8ab
 
14f8ab
+function create_data_hang()
14f8ab
+{
14f8ab
+    prefix=$1
14f8ab
+    mkdir ${master_mnt}/${prefix}
14f8ab
+    cd ${master_mnt}/${prefix}
14f8ab
+    # ~1k files is required with 1 sync-job and hang happens if
14f8ab
+    # stderr buffer of tar/ssh executed with Popen is full (i.e., 64k).
14f8ab
+    # 64k is hit when ~800 files were  not found while syncing data
14f8ab
+    # from master. So around 1k files is required to hit the condition.
14f8ab
+    for i in {1..1000}
14f8ab
+    do
14f8ab
+        echo "test data" > file$i
14f8ab
+        mv -f file$i file
14f8ab
+    done
14f8ab
+    cd -
14f8ab
+}
14f8ab
+
14f8ab
 function chown_file_ok()
14f8ab
 {
14f8ab
     local file_owner=$(stat --format "%u:%g" "$1")
14f8ab
-- 
14f8ab
1.8.3.1
14f8ab