d1681e
From ea20e0a38c9f150d9e96076e04f4b77109e41663 Mon Sep 17 00:00:00 2001
d1681e
From: Mohit Agrawal <moagrawa@redhat.com>
d1681e
Date: Wed, 27 Sep 2017 11:37:28 +0530
d1681e
Subject: [PATCH 091/128] extras: scripts to control CPU/MEMORY for any gluster
d1681e
 daemon during runtime
d1681e
d1681e
Problem: Sometimes gluster daemons like glustershd can consume a lot of CPU and/
d1681e
or memory if there is a large amount of data/ entries to be healed.
d1681e
d1681e
Solution: Until we have some form of throttling/ QoS mechanisms built into
d1681e
gluster, we can use control groups for regulating cpu and memory of any gluster
d1681e
daemon using control-cpu-load.sh and control-mem.sh scripts respectively.
d1681e
d1681e
Test:    To test the control-cpu-load.sh script follow below procedure:
d1681e
         1) Setup distribute replica environment
d1681e
         2) Turn the self-heal daemon off
d1681e
         3) Bring down one node of the replica set
d1681e
         4) Create millions of files from mount point
d1681e
         5) Start the node that was brought down
d1681e
         6) Check cpu usage for shd process in top command
d1681e
         7) Run the script, providing the shd pid and a CPU quota value
d1681e
         8) Check again cpu usage for shd process in top command
d1681e
d1681e
Note: control-mem.sh script can cap the memory usage of the process to the set
d1681e
limit, beyond which the process gets blocked. It resumes either when the memory
d1681e
usage comes down or if the limit is increased.
d1681e
d1681e
> BUG: 1496335
d1681e
> Change-Id: Id73c36b73ca600fa9f7905d84053d1e8633c996f
d1681e
> Reviewed on https://review.gluster.org/#/c/18404
d1681e
> (cherry picked from commit 2c066c4c365e77421d1009851144efae0b028628)
d1681e
d1681e
BUG: 1484446
d1681e
Change-Id: Id73c36b73ca600fa9f7905d84053d1e8633c996f
d1681e
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
d1681e
Reviewed-on: https://code.engineering.redhat.com/gerrit/124875
d1681e
Tested-by: RHGS Build Bot <nigelb@redhat.com>
d1681e
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
d1681e
---
d1681e
 extras/Makefile.am         |   6 ++-
d1681e
 extras/control-cpu-load.sh | 116 ++++++++++++++++++++++++++++++++++++++++
d1681e
 extras/control-mem.sh      | 128 +++++++++++++++++++++++++++++++++++++++++++++
d1681e
 glusterfs.spec.in          |   5 ++
d1681e
 4 files changed, 254 insertions(+), 1 deletion(-)
d1681e
 create mode 100755 extras/control-cpu-load.sh
d1681e
 create mode 100755 extras/control-mem.sh
d1681e
d1681e
diff --git a/extras/Makefile.am b/extras/Makefile.am
d1681e
index 2812a4c..d9572ac 100644
d1681e
--- a/extras/Makefile.am
d1681e
+++ b/extras/Makefile.am
d1681e
@@ -19,6 +19,10 @@ vol_DATA = glusterd.vol
d1681e
 scriptsdir = $(datadir)/glusterfs/scripts
d1681e
 scripts_SCRIPTS = post-upgrade-script-for-quota.sh \
d1681e
 	pre-upgrade-script-for-quota.sh stop-all-gluster-processes.sh
d1681e
+if USE_SYSTEMD
d1681e
+scripts_SCRIPTS += control-cpu-load.sh
d1681e
+scripts_SCRIPTS += control-mem.sh
d1681e
+endif
d1681e
 
d1681e
 EXTRA_DIST = $(conf_DATA) specgen.scm glusterfs-mode.el glusterfs.vim \
d1681e
 	migrate-unify-to-distribute.sh backend-xattr-sanitize.sh backend-cleanup.sh \
d1681e
@@ -26,7 +30,7 @@ EXTRA_DIST = $(conf_DATA) specgen.scm glusterfs-mode.el glusterfs.vim \
d1681e
 	post-upgrade-script-for-quota.sh pre-upgrade-script-for-quota.sh \
d1681e
 	command-completion/gluster.bash command-completion/Makefile \
d1681e
 	command-completion/README stop-all-gluster-processes.sh clang-checker.sh \
d1681e
-	mount-shared-storage.sh
d1681e
+	mount-shared-storage.sh control-cpu-load.sh control-mem.sh
d1681e
 
d1681e
 install-data-local:
d1681e
 	if [ -n "$(tmpfilesdir)" ]; then \
d1681e
diff --git a/extras/control-cpu-load.sh b/extras/control-cpu-load.sh
d1681e
new file mode 100755
d1681e
index 0000000..b739c82
d1681e
--- /dev/null
d1681e
+++ b/extras/control-cpu-load.sh
d1681e
@@ -0,0 +1,116 @@
d1681e
+#!/bin/bash
d1681e
+
d1681e
+USAGE="This script provides a utility to control CPU utilization for any
d1681e
+gluster daemon.In this, we use cgroup framework to configure CPU quota
d1681e
+for a process(like selfheal daemon). Before running this script, make
d1681e
+sure that daemon is running.Every time daemon restarts, it is required
d1681e
+to rerun this command to set CPU quota on new daemon process id.
d1681e
+User can enter any value between 10 to 100 for CPU quota.
d1681e
+Recommended value of quota period is 25. 25 means, kernel will allocate
d1681e
+25 ms period to this group of tasks in every 100 ms period. This 25ms
d1681e
+could be considered as the maximum percentage of CPU quota daemon can take.
d1681e
+This value will be reflected on CPU usage of "top" command.If provided pid
d1681e
+is the only process and no other process is in competition to get CPU, more
d1681e
+ than 25% could be allocated to daemon to speed up the process."
d1681e
+
d1681e
+if [  $# -ge 1 ]; then
d1681e
+  case $1 in
d1681e
+    -h|--help) echo " " "$USAGE" | sed -r -e 's/^[ ]+//g'
d1681e
+               exit 0;
d1681e
+               ;;
d1681e
+  *) echo "Please Provide correct input for script."
d1681e
+     echo "For help correct options are -h or --help."
d1681e
+     exit 1;
d1681e
+               ;;
d1681e
+  esac
d1681e
+fi
d1681e
+
d1681e
+DIR_EXIST=0
d1681e
+LOC="/sys/fs/cgroup/cpu,cpuacct/system.slice/glusterd.service"
d1681e
+echo "Enter gluster daemon pid for which you want to control CPU."
d1681e
+read daemon_pid
d1681e
+
d1681e
+if expr "${daemon_pid}" + 0 > /dev/null 2>&1 ;then  # quoted: input such as "5 + 3" must not evaluate as a number
d1681e
+  CHECK_PID=$(pgrep -f gluster | grep -x -- "${daemon_pid}")  # -x: whole-line match, so pid 12 does not match 123
d1681e
+  if [ -z "${CHECK_PID}" ]; then
d1681e
+    echo "No daemon is running or pid ${daemon_pid} does not match."
d1681e
+    echo "with running gluster processes."
d1681e
+    exit 1
d1681e
+  fi
d1681e
+else
d1681e
+  echo "Entered daemon_pid is not numeric so Rerun the script."
d1681e
+  exit 1
d1681e
+fi
d1681e
+
d1681e
+
d1681e
+if [ -f ${LOC}/tasks ];then  # informational only: report whether the pid sits in the glusterd.service cgroup
d1681e
+  CHECK_CGROUP=$(grep -x "${daemon_pid}" ${LOC}/tasks)  # -x: exact id, not a substring of another task id
d1681e
+  if [ -n "${CHECK_CGROUP}" ]; then  # quoted -n test: safe for empty or multi-line values
d1681e
+    echo "pid ${daemon_pid} is attached with glusterd.service cgroup."
d1681e
+  fi
d1681e
+fi
d1681e
+
d1681e
+cgroup_name=cgroup_gluster_${daemon_pid}  # one child cgroup per daemon pid
d1681e
+if [ -f ${LOC}/${cgroup_name}/tasks ]; then
d1681e
+  CHECK_CGROUP=$(grep -x "${daemon_pid}" ${LOC}/${cgroup_name}/tasks)  # -x: exact id, not a substring match
d1681e
+  if [ -n "${CHECK_CGROUP}" ]; then  # quoted -n test: safe for empty or multi-line values
d1681e
+    val=`cat ${LOC}/${cgroup_name}/cpu.cfs_quota_us`
d1681e
+    qval=$((val / 1000))  # stored value is the user quota multiplied by 1000
d1681e
+    echo "pid ${daemon_pid} is already attached ${cgroup_name} with quota value ${qval}."
d1681e
+    echo "Press n if you don't want to reassign ${daemon_pid} with new quota value."
d1681e
+    DIR_EXIST=1  # skip the mkdir step later
d1681e
+  else
d1681e
+    echo "pid ${daemon_pid} is not attached with ${cgroup_name}."
d1681e
+  fi
d1681e
+fi
d1681e
+
d1681e
+read -p "If you want to continue the script to attach ${daemon_pid} with new ${cgroup_name} cgroup Press (y/n)?" choice
d1681e
+case "$choice" in
d1681e
+  y|Y ) echo "yes";;
d1681e
+  n|N ) echo "no";exit;;
d1681e
+  * ) echo "invalid";exit 1;;  # an unrecognised answer is an error, not a clean exit
d1681e
+esac
d1681e
+
d1681e
+systemctl set-property glusterd.service CPUShares=1024
d1681e
+
d1681e
+if [ ${DIR_EXIST} -eq 0 ];then
d1681e
+  echo "Creating child cgroup directory '${cgroup_name} cgroup' for glusterd.service."
d1681e
+  mkdir -p ${LOC}/${cgroup_name}
d1681e
+  if [ ! -f ${LOC}/${cgroup_name}/tasks ];then
d1681e
+    echo "Not able to create ${cgroup_name} directory so exit."
d1681e
+    exit 1
d1681e
+  fi
d1681e
+fi
d1681e
+
d1681e
+echo "Enter quota value in range [10,100]:  "
d1681e
+
d1681e
+read quota_value
d1681e
+if expr "${quota_value}" + 0 > /dev/null 2>&1 ;then  # quoted: input such as "5 + 20" must not evaluate as a number
d1681e
+  if [ ${quota_value} -lt 10 ] || [ ${quota_value} -gt 100 ]; then
d1681e
+    echo "Entered quota value is not correct,it should be in the range ."
d1681e
+    echo "10-100. Ideal value is 25."
d1681e
+    echo "Rerun the script with correct value."
d1681e
+    exit 1
d1681e
+  else
d1681e
+    echo "Entered quota value is $quota_value"
d1681e
+  fi
d1681e
+else
d1681e
+  echo "Entered quota value is not numeric so Rerun the script."
d1681e
+  exit 1
d1681e
+fi
d1681e
+
d1681e
+quota_value=$((10#${quota_value} * 1000))  # force base 10: "025" would otherwise be read as octal 21
d1681e
+echo "Setting $quota_value to cpu.cfs_quota_us for gluster_cgroup."
d1681e
+echo ${quota_value} > ${LOC}/${cgroup_name}/cpu.cfs_quota_us
d1681e
+
d1681e
+if ps -T -p ${daemon_pid} | grep gluster > /dev/null; then
d1681e
+  for thid in `ps -T -p ${daemon_pid} | grep gluster | awk -F " " '{print $2}'`;
d1681e
+    do
d1681e
+      echo ${thid} > ${LOC}/${cgroup_name}/tasks ;
d1681e
+    done
d1681e
+  if cat /proc/${daemon_pid}/cgroup | grep -w ${cgroup_name} > /dev/null; then
d1681e
+    echo "Tasks are attached successfully specific to ${daemon_pid} to ${cgroup_name}."
d1681e
+  else
d1681e
+    echo "Tasks are not attached successfully."
d1681e
+  fi
d1681e
+fi
d1681e
diff --git a/extras/control-mem.sh b/extras/control-mem.sh
d1681e
new file mode 100755
d1681e
index 0000000..38aa2a0
d1681e
--- /dev/null
d1681e
+++ b/extras/control-mem.sh
d1681e
@@ -0,0 +1,128 @@
d1681e
+#!/bin/bash
d1681e
+
d1681e
+USAGE="This commands provides a utility to control MEMORY utilization for any
d1681e
+gluster daemon.In this, we use cgroup framework to configure MEMORY limit for
d1681e
+a process. Before running this script, make sure that daemon is running.Every
d1681e
+time daemon restarts, it is required to rerun this command to set memory limit
d1681e
+(in bytes) on new daemon process id.User can enter any value between 100
d1681e
+(in Mega bytes) to 8000000000000 for Memory limit in Mega bytes.
d1681e
+Memory limit value is depends on how much maximum memory user wants to restrict
d1681e
+for specific daemon process.If a process will try to consume memore more than
d1681e
+configured value then cgroup will hang/sleep this task and to resume the task
d1681e
+rerun the script with new increase memory limit value ."
d1681e
+
d1681e
+if [  $# -ge 1 ]; then  # any argument: either print help or reject it
d1681e
+  case $1 in
d1681e
+    -h|--help) echo " " "$USAGE" | sed -r -e 's/^[ ]+//g'
d1681e
+               exit 0;
d1681e
+               ;;
d1681e
+    *) echo "Please Provide correct input for script."
d1681e
+       echo "For help correct options are -h or --help."
d1681e
+       exit 1;
d1681e
+               ;;
d1681e
+  esac
d1681e
+fi
d1681e
+
d1681e
+DIR_EXIST=0
d1681e
+LOC="/sys/fs/cgroup/memory/system.slice/glusterd.service"
d1681e
+echo "Enter Any gluster daemon pid for that you want to control MEMORY."
d1681e
+read daemon_pid
d1681e
+
d1681e
+if expr "${daemon_pid}" + 0 > /dev/null 2>&1 ;then  # quoted: input such as "5 + 3" must not evaluate as a number
d1681e
+  CHECK_PID=$(pgrep -f gluster | grep -x -- "${daemon_pid}")  # -x: whole-line match, so pid 12 does not match 123
d1681e
+  if [ -z "${CHECK_PID}" ]; then
d1681e
+    echo "No daemon is running or pid ${daemon_pid} does not match."
d1681e
+    echo "with running gluster processes."
d1681e
+    exit 1
d1681e
+  fi
d1681e
+else
d1681e
+  echo "Entered daemon_pid is not numeric so Rerun the script."
d1681e
+  exit 1
d1681e
+fi
d1681e
+
d1681e
+
d1681e
+if [ -f ${LOC}/tasks ]; then  # informational only: report whether the pid sits in the glusterd.service cgroup
d1681e
+  CHECK_CGROUP=$(grep -x "${daemon_pid}" ${LOC}/tasks)  # -x: exact id, not a substring of another task id
d1681e
+  if [ -n "${CHECK_CGROUP}" ] ;then  # quoted -n test: safe for empty or multi-line values
d1681e
+    echo "pid ${daemon_pid} is attached with default glusterd.service cgroup."
d1681e
+  fi
d1681e
+fi
d1681e
+
d1681e
+cgroup_name=cgroup_gluster_${daemon_pid}  # one child cgroup per daemon pid
d1681e
+if [ -f ${LOC}/${cgroup_name}/tasks ];then
d1681e
+  CHECK_CGROUP=$(grep -x "${daemon_pid}" ${LOC}/${cgroup_name}/tasks)  # -x: exact id, not a substring match
d1681e
+  if [ -n "${CHECK_CGROUP}" ]; then  # quoted -n test: safe for empty or multi-line values
d1681e
+    val=`cat ${LOC}/${cgroup_name}/memory.limit_in_bytes`
d1681e
+    mval=$((val / 1024 / 1024))  # bytes -> megabytes for display
d1681e
+    echo "pid ${daemon_pid} is already attached ${cgroup_name} with mem value ${mval}."
d1681e
+    echo "Press n if you don't want to reassign ${daemon_pid} with new mem value."
d1681e
+    DIR_EXIST=1  # skip the mkdir step later
d1681e
+  else
d1681e
+    echo "pid ${daemon_pid} is not attached with ${cgroup_name}."
d1681e
+  fi
d1681e
+fi
d1681e
+
d1681e
+read -p "If you want to continue the script to attach daemon with new cgroup. Press (y/n)?" choice
d1681e
+case "$choice" in
d1681e
+  y|Y ) echo "yes";;
d1681e
+  n|N ) echo "no";exit;;
d1681e
+  * ) echo "invalid";exit 1;;  # an unrecognised answer is an error, not a clean exit
d1681e
+esac
d1681e
+
d1681e
+systemctl set-property glusterd.service CPUShares=1024
d1681e
+
d1681e
+if [ ${DIR_EXIST} -eq 0 ];then
d1681e
+  echo "Creating child cgroup directory '${cgroup_name} cgroup' for glusterd.service."
d1681e
+  mkdir -p ${LOC}/${cgroup_name}
d1681e
+  if [ ! -f ${LOC}/${cgroup_name}/tasks ];then
d1681e
+    echo "Not able to create ${LOC}/${cgroup_name} directory so exit."
d1681e
+    exit 1
d1681e
+  fi
d1681e
+fi
d1681e
+
d1681e
+echo "Enter Memory value in Mega bytes [100,8000000000000]:  "
d1681e
+
d1681e
+read mem_value
d1681e
+if expr "${mem_value}" + 0 > /dev/null 2>&1 ;then  # quoted: input such as "50 + 50" must not evaluate as a number
d1681e
+  if [ ${mem_value} -lt 100 ] || [ ${mem_value} -gt 8000000000000 ]; then
d1681e
+    echo "Entered memory value is not correct,it should be in the range ."
d1681e
+    echo "100-8000000000000, Rerun the script with correct value ."
d1681e
+    exit 1
d1681e
+  else
d1681e
+    echo "Entered memory limit value is ${mem_value}."
d1681e
+  fi
d1681e
+else
d1681e
+  echo "Entered memory value is not numeric so Rerun the script."
d1681e
+  exit 1
d1681e
+fi
d1681e
+
d1681e
+mem_value=$((10#${mem_value} * 1024 * 1024))  # megabytes -> bytes; 10# stops "0200" being read as octal
d1681e
+if [ ${DIR_EXIST} -eq 0 ];then
d1681e
+  echo "Setting ${mem_value} to memory.limit_in_bytes for ${LOC}/${cgroup_name}."
d1681e
+  echo ${mem_value} > ${LOC}/${cgroup_name}/memory.limit_in_bytes
d1681e
+  #Set memory value to memory.memsw.limit_in_bytes
d1681e
+  echo ${mem_value} > ${LOC}/${cgroup_name}/memory.memsw.limit_in_bytes
d1681e
+  # disable oom_control so that kernel will not send kill signal to the
d1681e
+  # task once limit has reached
d1681e
+  echo 1 > ${LOC}/${cgroup_name}/memory.oom_control
d1681e
+else
d1681e
+  #Increase mem_value to memory.memsw.limit_in_bytes
d1681e
+  echo ${mem_value} > ${LOC}/${cgroup_name}/memory.memsw.limit_in_bytes
d1681e
+  echo "Increase ${mem_value} to memory.limit_in_bytes for ${LOC}/${cgroup_name}."
d1681e
+  echo ${mem_value} > ${LOC}/${cgroup_name}/memory.limit_in_bytes
d1681e
+  # disable oom_control so that kernel will not send kill signal to the
d1681e
+  # task once limit has reached
d1681e
+  echo 1 > ${LOC}/${cgroup_name}/memory.oom_control
d1681e
+fi
d1681e
+
d1681e
+if ps -T -p ${daemon_pid} | grep gluster > /dev/null; then
d1681e
+  for thid in `ps -T -p ${daemon_pid} | grep gluster | awk -F " " '{print $2}'`;
d1681e
+    do
d1681e
+      echo ${thid} > ${LOC}/${cgroup_name}/tasks ;
d1681e
+    done
d1681e
+  if cat /proc/${daemon_pid}/cgroup | grep -iw ${cgroup_name} > /dev/null; then
d1681e
+    echo "Tasks are attached successfully specific to ${daemon_pid} to ${cgroup_name}."
d1681e
+  else
d1681e
+    echo "Tasks are not attached successfully."
d1681e
+  fi
d1681e
+fi
d1681e
diff --git a/glusterfs.spec.in b/glusterfs.spec.in
d1681e
index da8a3e5..56a62a9 100644
d1681e
--- a/glusterfs.spec.in
d1681e
+++ b/glusterfs.spec.in
d1681e
@@ -1553,6 +1553,8 @@ exit 0
d1681e
      %{_datadir}/glusterfs/scripts/stop-all-gluster-processes.sh
d1681e
 %if ( 0%{?_with_systemd:1} )
d1681e
      %{_libexecdir}/glusterfs/mount-shared-storage.sh
d1681e
+     %{_datadir}/glusterfs/scripts/control-cpu-load.sh
d1681e
+     %{_datadir}/glusterfs/scripts/control-mem.sh
d1681e
 %endif
d1681e
 
d1681e
 # Incrementalapi
d1681e
@@ -2178,6 +2180,9 @@ fi
d1681e
 %endif
d1681e
 
d1681e
 %changelog
d1681e
+* Fri Dec 01 2017 Mohit Agrawal <moagrawa@redhat.com>
d1681e
+- Added control-cpu-load.sh and control-mem.sh scripts to glusterfs-server section(#1484446)
d1681e
+
d1681e
 * Mon Nov 13 2017 Jiffin Tony Thottan <jthottan@redhat.com>
d1681e
 - Adding ganesha bits back in gluster repository #1499784
d1681e
 
d1681e
-- 
d1681e
1.8.3.1
d1681e