74b1de
From 29ec87484b1ee3ad6417c37726db8aa9296f3a83 Mon Sep 17 00:00:00 2001
74b1de
From: Kotresh HR <khiremat@redhat.com>
74b1de
Date: Wed, 8 May 2019 11:26:06 +0530
74b1de
Subject: [PATCH 163/169] geo-rep: Fix sync hang with tarssh
74b1de
74b1de
Problem:
74b1de
Geo-rep sync hangs when tarssh is used as sync
74b1de
engine at heavy workload.
74b1de
74b1de
Analysis and Root cause:
74b1de
It's found out that the tar process was hung.
74b1de
When debugged further, it's found out that stderr
74b1de
buffer of tar process on master was full i.e., 64k.
74b1de
When the buffer was copied to a file from /proc/pid/fd/2,
74b1de
the hang is resolved.
74b1de
74b1de
This can happen when files picked by tar process
74b1de
to sync doesn't exist on master anymore. If this count
74b1de
increases around 1k, the stderr buffer is filled up.
74b1de
74b1de
Fix:
74b1de
The tar process is executed using Popen with stderr as PIPE.
74b1de
The final execution is something like below.
74b1de
74b1de
tar | ssh <args> root@slave tar --overwrite -xf - -C <path>
74b1de
74b1de
It was waiting on ssh process first using communicate() and then tar.
74b1de
Note that communicate() reads stdout and stderr. So when stderr of tar
74b1de
process is filled up, there is no one to read until untar via ssh is
74b1de
completed. This can't happen and leads to deadlock.
74b1de
Hence we should be waiting on both process parallely, so that stderr is
74b1de
read on both processes.
74b1de
74b1de
Backport of:
74b1de
 > Patch: https://review.gluster.org/22684
74b1de
 > Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
74b1de
 > BUG: 1707728
74b1de
 > Signed-off-by: Kotresh HR <khiremat@redhat.com>
74b1de
74b1de
Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
74b1de
fixes: bz#1708116
74b1de
Signed-off-by: Kotresh HR <khiremat@redhat.com>
74b1de
Reviewed-on: https://code.engineering.redhat.com/gerrit/172395
74b1de
Tested-by: RHGS Build Bot <nigelb@redhat.com>
74b1de
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
74b1de
---
74b1de
 geo-replication/syncdaemon/resource.py |  22 ++++--
74b1de
 tests/00-geo-rep/georep-stderr-hang.t  | 128 +++++++++++++++++++++++++++++++++
74b1de
 tests/geo-rep.rc                       |  17 +++++
74b1de
 3 files changed, 163 insertions(+), 4 deletions(-)
74b1de
 create mode 100644 tests/00-geo-rep/georep-stderr-hang.t
74b1de
74b1de
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
74b1de
index 522279b..b16db60 100644
74b1de
--- a/geo-replication/syncdaemon/resource.py
74b1de
+++ b/geo-replication/syncdaemon/resource.py
74b1de
@@ -1540,15 +1540,29 @@ class SSH(object):
74b1de
 
74b1de
         p0.stdin.close()
74b1de
         p0.stdout.close()  # Allow p0 to receive a SIGPIPE if p1 exits.
74b1de
-        # wait for tar to terminate, collecting any errors, further
74b1de
-        # waiting for transfer to complete
74b1de
-        _, stderr1 = p1.communicate()
74b1de
 
74b1de
         # stdin and stdout of p0 is already closed, Reset to None and
74b1de
         # wait for child process to complete
74b1de
         p0.stdin = None
74b1de
         p0.stdout = None
74b1de
-        p0.communicate()
74b1de
+
74b1de
+        def wait_for_tar(p0):
74b1de
+            _, stderr = p0.communicate()
74b1de
+            if log_err:
74b1de
+                for errline in stderr.strip().split("\n")[:-1]:
74b1de
+                    if "No such file or directory" not in errline:
74b1de
+                        logging.error(lf("SYNC Error",
74b1de
+                                         sync_engine="Tarssh",
74b1de
+                                         error=errline))
74b1de
+
74b1de
+        t = syncdutils.Thread(target=wait_for_tar, args=(p0, ))
74b1de
+        # wait for tar to terminate, collecting any errors, further
74b1de
+        # waiting for transfer to complete
74b1de
+        t.start()
74b1de
+
74b1de
+        # wait for ssh process
74b1de
+        _, stderr1 = p1.communicate()
74b1de
+        t.join()
74b1de
 
74b1de
         if log_err:
74b1de
             for errline in stderr1.strip().split("\n")[:-1]:
74b1de
diff --git a/tests/00-geo-rep/georep-stderr-hang.t b/tests/00-geo-rep/georep-stderr-hang.t
74b1de
new file mode 100644
74b1de
index 0000000..496f0e6
74b1de
--- /dev/null
74b1de
+++ b/tests/00-geo-rep/georep-stderr-hang.t
74b1de
@@ -0,0 +1,128 @@
74b1de
+#!/bin/bash
74b1de
+
74b1de
+. $(dirname $0)/../include.rc
74b1de
+. $(dirname $0)/../volume.rc
74b1de
+. $(dirname $0)/../geo-rep.rc
74b1de
+. $(dirname $0)/../env.rc
74b1de
+
74b1de
+SCRIPT_TIMEOUT=500
74b1de
+
74b1de
+AREQUAL_PATH=$(dirname $0)/../utils
74b1de
+test "`uname -s`" != "Linux" && {
74b1de
+    CFLAGS="$CFLAGS -lintl";
74b1de
+}
74b1de
+build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
74b1de
+
74b1de
+### Basic Tests with Distribute Replicate volumes
74b1de
+
74b1de
+##Cleanup and start glusterd
74b1de
+cleanup;
74b1de
+TEST glusterd;
74b1de
+TEST pidof glusterd
74b1de
+
74b1de
+
74b1de
+##Variables
74b1de
+GEOREP_CLI="$CLI volume geo-replication"
74b1de
+master=$GMV0
74b1de
+SH0="127.0.0.1"
74b1de
+slave=${SH0}::${GSV0}
74b1de
+num_active=2
74b1de
+num_passive=2
74b1de
+master_mnt=$M0
74b1de
+slave_mnt=$M1
74b1de
+
74b1de
+############################################################
74b1de
+#SETUP VOLUMES AND GEO-REPLICATION
74b1de
+############################################################
74b1de
+
74b1de
+##create_and_start_master_volume
74b1de
+TEST $CLI volume create $GMV0 $H0:$B0/${GMV0}1;
74b1de
+TEST $CLI volume start $GMV0
74b1de
+
74b1de
+##create_and_start_slave_volume
74b1de
+TEST $CLI volume create $GSV0 $H0:$B0/${GSV0}1;
74b1de
+TEST $CLI volume start $GSV0
74b1de
+TEST $CLI volume set $GSV0 performance.stat-prefetch off
74b1de
+TEST $CLI volume set $GSV0 performance.quick-read off
74b1de
+TEST $CLI volume set $GSV0 performance.readdir-ahead off
74b1de
+TEST $CLI volume set $GSV0 performance.read-ahead off
74b1de
+
74b1de
+##Mount master
74b1de
+TEST glusterfs -s $H0 --volfile-id $GMV0 $M0
74b1de
+
74b1de
+##Mount slave
74b1de
+TEST glusterfs -s $H0 --volfile-id $GSV0 $M1
74b1de
+
74b1de
+############################################################
74b1de
+#BASIC GEO-REPLICATION TESTS
74b1de
+############################################################
74b1de
+
74b1de
+TEST create_georep_session $master $slave
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Created"
74b1de
+
74b1de
+#Config gluster-command-dir
74b1de
+TEST $GEOREP_CLI $master $slave config gluster-command-dir ${GLUSTER_CMD_DIR}
74b1de
+
74b1de
+#Config gluster-command-dir
74b1de
+TEST $GEOREP_CLI $master $slave config slave-gluster-command-dir ${GLUSTER_CMD_DIR}
74b1de
+
74b1de
+#Set changelog roll-over time to 45 secs
74b1de
+TEST $CLI volume set $GMV0 changelog.rollover-time 45
74b1de
+
74b1de
+#Wait for common secret pem file to be created
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT  0 check_common_secret_file
74b1de
+
74b1de
+#Verify the keys are distributed
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT  0 check_keys_distributed
74b1de
+
74b1de
+#Set sync-jobs to 1
74b1de
+TEST $GEOREP_CLI $master $slave config sync-jobs 1
74b1de
+
74b1de
+#Start_georep
74b1de
+TEST $GEOREP_CLI $master $slave start
74b1de
+
74b1de
+touch $M0
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Changelog Crawl"
74b1de
+
74b1de
+#Check History Crawl.
74b1de
+TEST $GEOREP_CLI $master $slave stop
74b1de
+TEST create_data_hang "rsync_hang"
74b1de
+TEST create_data "history_rsync"
74b1de
+TEST $GEOREP_CLI $master $slave start
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
74b1de
+
74b1de
+#Verify arequal for whole volume
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
74b1de
+
74b1de
+#Stop Geo-rep
74b1de
+TEST $GEOREP_CLI $master $slave stop
74b1de
+
74b1de
+#Config tarssh as sync-engine
74b1de
+TEST $GEOREP_CLI $master $slave config sync-method tarssh
74b1de
+
74b1de
+#Create tarssh hang data
74b1de
+TEST create_data_hang "tarssh_hang"
74b1de
+TEST create_data "history_tar"
74b1de
+
74b1de
+TEST $GEOREP_CLI $master $slave start
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT  1 check_status_num_rows "Active"
74b1de
+
74b1de
+#Verify arequal for whole volume
74b1de
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
74b1de
+
74b1de
+#Stop Geo-rep
74b1de
+TEST $GEOREP_CLI $master $slave stop
74b1de
+
74b1de
+#Delete Geo-rep
74b1de
+TEST $GEOREP_CLI $master $slave delete
74b1de
+
74b1de
+#Cleanup are-equal binary
74b1de
+TEST rm $AREQUAL_PATH/arequal-checksum
74b1de
+
74b1de
+#Cleanup authorized keys
74b1de
+sed -i '/^command=.*SSH_ORIGINAL_COMMAND#.*/d' ~/.ssh/authorized_keys
74b1de
+sed -i '/^command=.*gsyncd.*/d' ~/.ssh/authorized_keys
74b1de
+
74b1de
+cleanup;
74b1de
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
74b1de
diff --git a/tests/geo-rep.rc b/tests/geo-rep.rc
74b1de
index 2035b9f..e4f014e 100644
74b1de
--- a/tests/geo-rep.rc
74b1de
+++ b/tests/geo-rep.rc
74b1de
@@ -101,6 +101,23 @@ function create_data()
74b1de
     chown 1000:1000 ${master_mnt}/${prefix}_chown_f1_ಸಂತಸ
74b1de
 }
74b1de
 
74b1de
+function create_data_hang()
74b1de
+{
74b1de
+    prefix=$1
74b1de
+    mkdir ${master_mnt}/${prefix}
74b1de
+    cd ${master_mnt}/${prefix}
74b1de
+    # ~1k files is required with 1 sync-job and hang happens if
74b1de
+    # stderr buffer of tar/ssh executed with Popen is full (i.e., 64k).
74b1de
+    # 64k is hit when ~800 files were  not found while syncing data
74b1de
+    # from master. So around 1k files is required to hit the condition.
74b1de
+    for i in {1..1000}
74b1de
+    do
74b1de
+        echo "test data" > file$i
74b1de
+        mv -f file$i file
74b1de
+    done
74b1de
+    cd -
74b1de
+}
74b1de
+
74b1de
 function chown_file_ok()
74b1de
 {
74b1de
     local file_owner=$(stat --format "%u:%g" "$1")
74b1de
-- 
74b1de
1.8.3.1
74b1de