diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index 93604e8..4e1b6c5 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -1,9 +1,19 @@
-name: Pylint
+name: Linters
on: [push]
jobs:
- build:
+
+ docker-lint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - uses: hadolint/hadolint-action@v2.1.0
+ with:
+ recursive: true
+ ignore: DL3041
+
+ python-lint:
runs-on: ubuntu-latest
strategy:
diff --git a/Dockerfile b/Dockerfile
index ad6742e..0ab5138 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,12 +2,12 @@ FROM fedora:36
WORKDIR /root
-# for nvme-stas
-RUN dnf install -y python3-dasbus python3-pyudev python3-systemd python3-gobject meson
-# for libnvme
-RUN dnf install -y git gcc g++ cmake openssl-devel libuuid-devel json-c-devel swig python-devel meson
+# first line for nvme-stas
+# second line for libnvme
+RUN dnf install -y python3-dasbus python3-pyudev python3-systemd python3-gobject meson \
+ git gcc g++ cmake openssl-devel libuuid-devel json-c-devel swig python-devel meson && dnf clean all
COPY . .
-RUN meson .build && ninja -C .build && cd .build && meson install
+RUN meson .build && ninja -C .build && meson install -C .build
ENTRYPOINT ["python3"]
diff --git a/NEWS.md b/NEWS.md
index d1515cd..f56a7c9 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,6 +5,7 @@
- Fix issues with I/O controller connection audits
- Eliminate pcie devices from list of I/O controller connections to audit
- Add soaking timer to workaround race condition between kernel and user-space applications on "add" uevents. When the kernel adds a new nvme device (e.g. `/dev/nvme7`) and sends a "add" uevent to notify user-space applications, the attributes associated with that device (e.g. `/sys/class/nvme/nvme7/cntrltype`) may not be fully initialized which can lead `stacd` to dismiss a device that should get audited.
+- Make `sticky-connections=enabled` the default (see `stacd.conf`)
## Changes with release 1.1.5
@@ -32,7 +33,7 @@ stacd: Bug fix. Check that self._cfg_soak_tmr is not None before dereferencing i
## Changes with release 1.1.1
-Make `sticky-connections-disabled` by default
+Make `sticky-connections=disabled` the default (see `stacd.conf`)
## Changes with release 1.1
diff --git a/coverage.sh.in b/coverage.sh.in
index 96b8c53..5ba2ebe 100755
--- a/coverage.sh.in
+++ b/coverage.sh.in
@@ -38,14 +38,24 @@ PRIMARY_GRP=$( id -ng )
PRIMARY_USR=$( id -nu )
PYTHON_PATH=.:./subprojects/libnvme
+log() {
+ msg="$1"
+ printf "%b[1;36m%s%b[0m\n" "\0033" "${msg}" "\0033"
+ sudo logger -i "@@@@@ COVERAGE -" -p 4 "${msg}"
+}
+
sd_stop() {
- unit="$1"-cov.service
+ app="$1"
+ unit="${app}"-cov.service
+ log "Stop ${app}"
sudo systemctl stop "${unit}" >/dev/null 2>&1
sudo systemctl reset-failed "${unit}" >/dev/null 2>&1
}
sd_restart() {
- unit="$1"-cov.service
+ app="$1"
+ unit="${app}"-cov.service
+ log "Restart ${app}"
sudo systemctl restart "${unit}" >/dev/null 2>&1
}
@@ -61,7 +71,7 @@ sd_start() {
cmd="${app} --syslog -f ${conf}"
fi
- printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start ${app}" "\0033"
+ log "Start ${app}"
RUNTIME_DIRECTORY=/tmp/${app}
rm -rf ${RUNTIME_DIRECTORY}
@@ -75,7 +85,7 @@ reload_cfg() {
app="$1"
unit="${app}"-cov.service
pid=$( systemctl show --property MainPID --value "${unit}" )
- printf "%b[1;36m%s%b[0m\n" "\0033" "Reload config ${app}" "\0033"
+ log "Reload config ${app}"
sudo kill -HUP "${pid}"
}
@@ -83,15 +93,24 @@ if [ ! -d coverage ]; then
mkdir coverage
fi
+
+log "START-START-START-START-START-START-START-START-START-START-START-START"
+
+
+
################################################################################
# Load nvme kernel module
+log "modprobe nvme-tcp"
sudo /usr/sbin/modprobe nvme-tcp
+log "nvme disconnect-all"
sudo nvme disconnect-all
################################################################################
# Create a dummy config file for @STAFD_PROCNAME@
-stafd_conf_fname=$(mktemp /tmp/@STAFD_PROCNAME@.conf.XXXXXX)
+file=/tmp/@STAFD_PROCNAME@.conf.XXXXXX
+log "Create dummy config file $file"
+stafd_conf_fname=$(mktemp $file)
cat > "${stafd_conf_fname}" <<'EOF'
[Global]
tron=true
@@ -102,7 +121,9 @@ EOF
################################################################################
# Create a dummy config file for @STACD_PROCNAME@
-stacd_conf_fname=$(mktemp /tmp/@STACD_PROCNAME@.conf.XXXXXX)
+file=/tmp/@STACD_PROCNAME@.conf.XXXXXX
+log "Create dummy config file $file"
+stacd_conf_fname=$(mktemp $file)
cat > "${stacd_conf_fname}" <<'EOF'
[Global]
tron=true
@@ -111,6 +132,7 @@ udev-rule=disabled
sticky-connections=enabled
EOF
+log "Stop & Mask Avahi daemon"
sudo systemctl stop avahi-daemon.service
sudo systemctl stop avahi-daemon.socket
sudo systemctl mask avahi-daemon.service
@@ -118,11 +140,11 @@ sudo systemctl mask avahi-daemon.socket
sleep 1
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status while @STAFD_PROCNAME@ is not running" "\0033"
+log "Invoking @STAFD_CTLNAME@ status while @STAFD_PROCNAME@ is not running"
coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ ls >/dev/null 2>&1
coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ invalid-command >/dev/null 2>&1
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ status while @STACD_PROCNAME@ is not running" "\0033"
+log "Invoking @STACD_CTLNAME@ status while @STACD_PROCNAME@ is not running"
coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls >/dev/null 2>&1
coverage run --rcfile=.coveragerc @STACD_CTLNAME@ invalid-command >/dev/null 2>&1
@@ -132,30 +154,33 @@ sd_start "@STAFD_PROCNAME@" "@STAFD_DBUS_NAME@" "${stafd_conf_fname}"
sd_start "@STACD_PROCNAME@" "@STACD_DBUS_NAME@" "${stacd_conf_fname}"
sleep 3
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033"
+log "Invoking @STAFD_CTLNAME@ status"
coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1
reload_cfg "@STAFD_PROCNAME@"
sleep 1
+log "Restart Avahi daemon"
sudo systemctl unmask avahi-daemon.socket
sudo systemctl unmask avahi-daemon.service
sudo systemctl start avahi-daemon.socket
sudo systemctl start avahi-daemon.service
sleep 2
+log "Change stafd config: tron=true, persistent-connections=false, zeroconf=enable"
cat > "${stafd_conf_fname}" <<'EOF'
[Global]
tron=true
persistent-connections=false
[Service Discovery]
-zeroconf=disabled
+zeroconf=enabled
EOF
reload_cfg "@STAFD_PROCNAME@"
sleep 1
+log "Change stafd config: ip-family=ipv4, kato=10, adding multiple controllers"
cat > "${stafd_conf_fname}" <<'EOF'
[Global]
tron=true
@@ -172,11 +197,15 @@ controller=transport=tcp;traddr=abracadabra
controller=
controller=trsvcid
controller=transport=rdma;traddr=!@#$
+controller=transport=fc;traddr=21:00:00:00:00:00:00:00;host-traddr=20:00:00:00:00:00:00:00
+controller=transport=XM;traddr=2.2.2.2
blacklist=transport=tcp;traddr=1.1.1.1
blacklist=transport=tcp;traddr=1000.1000.1000.1000
EOF
reload_cfg "@STAFD_PROCNAME@"
+
+log "Change stacd config: tron=true, udev-rule=disabled, sticky-connections=disabled"
cat > "${stacd_conf_fname}" <<'EOF'
[Global]
tron=true
@@ -186,12 +215,12 @@ EOF
reload_cfg "@STACD_PROCNAME@"
sleep 3
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033"
+log "Invoking @STAFD_CTLNAME@ status"
coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1
################################################################################
# Fake mDNS packets from a CDC
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start Avahi publisher" "\0033"
+log "Start Avahi publisher"
AVAHI_PUBLISHER=mdns_publisher.service
sudo systemctl stop ${AVAHI_PUBLISHER} >/dev/null 2>&1
sudo systemctl reset-failed ${AVAHI_PUBLISHER} >/dev/null 2>&1
@@ -200,7 +229,7 @@ sleep 1
################################################################################
# Start nvme target simulator
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start nvmet" "\0033"
+log "Start nvmet"
sudo ../utils/nvmet/nvmet.py clean
sudo ../utils/nvmet/nvmet.py create -f ../utils/nvmet/nvmet.conf
sleep 2
@@ -210,76 +239,76 @@ reload_cfg "@STACD_PROCNAME@"
sleep 3
################################################################################
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_PROCNAME@ --version" "\0033"
+log "Invoking @STAFD_PROCNAME@ --version"
coverage run --rcfile=.coveragerc @STAFD_PROCNAME@ --version
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_PROCNAME@ --idl" "\0033"
+log "Invoking @STAFD_PROCNAME@ --idl"
coverage run --rcfile=.coveragerc @STAFD_PROCNAME@ --idl /tmp/@STAFD_PROCNAME@.idl
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_PROCNAME@ --version" "\0033"
+log "Invoking @STACD_PROCNAME@ --version"
coverage run --rcfile=.coveragerc @STACD_PROCNAME@ --version
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_PROCNAME@ --idl" "\0033"
+log "Invoking @STACD_PROCNAME@ --idl"
coverage run --rcfile=.coveragerc @STACD_PROCNAME@ --idl /tmp/@STACD_PROCNAME@.idl
################################################################################
# Stimulate D-Bus activity
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ --version" "\0033"
+log "Invoking @STAFD_CTLNAME@ --version"
sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ --version
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ with a bad command" "\0033"
+log "Invoking @STAFD_CTLNAME@ with a bad command"
sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ blah
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ troff" "\0033"
+log "Invoking @STAFD_CTLNAME@ troff"
sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ troff
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033"
+log "Invoking @STAFD_CTLNAME@ status"
coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ tron" "\0033"
+log "Invoking @STAFD_CTLNAME@ tron"
sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ tron
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ ls" "\0033"
+log "Invoking @STAFD_CTLNAME@ ls"
coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ ls -d >/dev/null 2>&1
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ adlp" "\0033"
+log "Invoking @STAFD_CTLNAME@ adlp"
coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ adlp -d >/dev/null 2>&1
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ dlp" "\0033"
+log "Invoking @STAFD_CTLNAME@ dlp"
coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ dlp -t tcp -a ::1 -s 8009 >/dev/null 2>&1
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ --version" "\0033"
+log "Invoking @STACD_CTLNAME@ --version"
sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ --version
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ with a bad command" "\0033"
+log "Invoking @STACD_CTLNAME@ with a bad command"
sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ blah
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ troff" "\0033"
+log "Invoking @STACD_CTLNAME@ troff"
sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ troff
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ status" "\0033"
+log "Invoking @STACD_CTLNAME@ status"
coverage run --rcfile=.coveragerc @STACD_CTLNAME@ status >/dev/null 2>&1
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ tron" "\0033"
+log "Invoking @STACD_CTLNAME@ tron"
sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ tron
-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033"
+log "Invoking @STACD_CTLNAME@ ls"
coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1
################################################################################
# Stimulate AENs activity by removing/restoring namespaces
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Remove namespace: klingons" "\0033"
+log "Remove namespace: klingons"
sudo ../utils/nvmet/nvmet.py unlink -p 1 -s klingons
sleep 2
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033"
+log "Invoking @STACD_CTLNAME@ ls"
coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Restore namespace: klingons" "\0033"
+log "Restore namespace: klingons"
sudo ../utils/nvmet/nvmet.py link -p 1 -s klingons
sleep 2
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033"
+log "Invoking @STACD_CTLNAME@ ls"
coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1
################################################################################
# Stop Avahi Publisher
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop Avahi publisher" "\0033"
+log "Stop Avahi publisher"
sudo systemctl stop ${AVAHI_PUBLISHER}
sleep 1
################################################################################
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Restart Avahi publisher" "\0033"
+log "Restart Avahi publisher"
sudo systemd-run --unit=${AVAHI_PUBLISHER} --working-directory=. avahi-publish -s SFSS _nvme-disc._tcp 8009 "p=tcp"
sleep 2
################################################################################
# Make config changes for @STAFD_PROCNAME@
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Empty configuration and disable zeroconf for @STAFD_PROCNAME@" "\0033"
+log "Empty configuration and disable zeroconf for @STAFD_PROCNAME@"
cat > "${stafd_conf_fname}" <<'EOF'
[Global]
tron=true
@@ -293,7 +322,7 @@ sleep 1
################################################################################
# Make more config changes for @STAFD_PROCNAME@
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Add single controller (::1) and re-enable zeroconf for @STAFD_PROCNAME@" "\0033"
+log "Add single controller (::1) and re-enable zeroconf for @STAFD_PROCNAME@"
cat > "${stafd_conf_fname}" <<'EOF'
[Global]
tron=true
@@ -307,24 +336,23 @@ sleep 2
################################################################################
# Stop Avahi Publisher
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop Avahi publisher" "\0033"
+log "Stop Avahi publisher"
sudo systemctl stop ${AVAHI_PUBLISHER}
sleep 2
################################################################################
# Remove one of the NVMe device's
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Remove (disconnect) nvme1" "\0033"
+log "Remove (disconnect) nvme1"
sudo nvme disconnect -d nvme1
sleep 2
################################################################################
-printf "%b[1;36m%s%b[0m\n" "\0033" "Restart @STAFD_PROCNAME@ and @STACD_PROCNAME@" "\0033"
sd_restart "@STAFD_PROCNAME@"
sd_restart "@STACD_PROCNAME@"
sleep 1
-printf "%b[1;36m%s%b[0m\n" "\0033" "Create invalid conditions for saving/loading @STAFD_PROCNAME@'s last known config" "\0033"
+log "Create invalid conditions for saving/loading @STAFD_PROCNAME@'s last known config"
rm -rf "/tmp/@STAFD_PROCNAME@"
sd_stop "@STAFD_PROCNAME@"
sd_restart "@STACD_PROCNAME@"
@@ -334,7 +362,7 @@ sleep 2
################################################################################
# Stop everything and collect coverage stats
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop @STAFD_PROCNAME@ and @STACD_PROCNAME@" "\0033"
+log "Stop @STAFD_PROCNAME@ and @STACD_PROCNAME@"
sd_stop "@STAFD_PROCNAME@"
sd_stop "@STACD_PROCNAME@"
sleep 1
@@ -345,33 +373,49 @@ sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" coverage >/dev/null 2>&1
sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" staslib/__pycache__ >/dev/null 2>&1
sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" subprojects/libnvme/libnvme/__pycache__ >/dev/null 2>&1
+log "nvme disconnect-all"
sudo nvme disconnect-all
+log "Remove ${stafd_conf_fname} and ${stacd_conf_fname}"
rm "${stafd_conf_fname}"
rm "${stacd_conf_fname}"
+log "Run unit test: test-udev"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-udev.py
+log "Run unit test: test-avahi"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-avahi.py
+log "Run unit test: test-gtimer"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-gtimer.py
+log "Run unit test: test-version"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-version.py
+log "Run unit test: test-transport_id"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-transport_id.py
+log "Run unit test: test-config"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-config.py
+log "Run unit test: test-controller"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-controller.py
+log "Run unit test: test-service"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-service.py
+log "Run unit test: test-log"
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-log.py
+log "Run unit test: test-nvme_options"
sudo PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-nvme_options.py
################################################################################
# Stop nvme target simulator
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop nvmet" "\0033"
+log "Stop nvmet"
sudo ../utils/nvmet/nvmet.py clean
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Collect all coverage data" "\0033"
+log "Collect all coverage data"
coverage combine --rcfile=.coveragerc
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Generating coverage report" "\0033"
+log "Generating coverage report"
coverage report -i --rcfile=.coveragerc
-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Generating coverage report (HTML)" "\0033"
+log "Generating coverage report (HTML)"
coverage html -i --rcfile=.coveragerc
+
+log "All done!!!"
+
+log "FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED"
diff --git a/doc/man/stacd.conf.xml b/doc/man/stacd.conf.xml
index 60622f6..65ee71a 100644
--- a/doc/man/stacd.conf.xml
+++ b/doc/man/stacd.conf.xml
@@ -378,7 +378,7 @@
entries in <filename>stacd.conf</filename> have been removed.
</para>
- <formalpara><title>With <code>sticky-connections=disabled</code> (default)</title>
+ <formalpara><title>With <code>sticky-connections=disabled</code></title>
<para>
<code>stacd</code> immediately disconnects from
a previously connected IOC if the response to a
@@ -411,7 +411,7 @@
</formalpara>
</formalpara>
- <formalpara><title>With <code>sticky-connections=enabled</code></title>
+ <formalpara><title>With <code>sticky-connections=enabled (default)</code></title>
<para>
<code>stacd</code> does not disconnect from IOCs
when a DPLE is removed or a <literal>controller=</literal>
diff --git a/etc/stas/stacd.conf b/etc/stas/stacd.conf
index 02e7b3e..0434671 100644
--- a/etc/stas/stacd.conf
+++ b/etc/stas/stacd.conf
@@ -202,8 +202,8 @@
#
# Type: String
# Range: [disabled, enabled]
-# Default: disabled
-#sticky-connections=disabled
+# Default: enabled
+#sticky-connections=enabled
[Controllers]
# controller: I/O Controllers (IOC) are specified with this keyword.
diff --git a/stacd.py b/stacd.py
index 708e372..28cefac 100755
--- a/stacd.py
+++ b/stacd.py
@@ -10,14 +10,12 @@
''' STorage Appliance Connector Daemon
'''
import sys
-import logging
from argparse import ArgumentParser
from staslib import defs
-# pylint: disable=consider-using-f-string
-DBUS_IDL = '''
+DBUS_IDL = f'''
<node>
- <interface name="%s.debug">
+ <interface name="{defs.STACD_DBUS_NAME}.debug">
<property name="tron" type="b" access="readwrite"/>
<property name="log_level" type="s" access="read"/>
<method name="process_info">
@@ -34,19 +32,16 @@ DBUS_IDL = '''
</method>
</interface>
- <interface name="%s">
+ <interface name="{defs.STACD_DBUS_NAME}">
<method name="list_controllers">
<arg direction="in" type="b" name="detailed"/>
- <arg direction="out" type="aa{ss}" name="controller_list"/>
+ <arg direction="out" type="aa{{ss}}" name="controller_list"/>
</method>
</interface>
</node>
-''' % (
- defs.STACD_DBUS_NAME,
- defs.STACD_DBUS_NAME,
-)
-
+'''
+# ******************************************************************************
def parse_args(conf_file: str): # pylint: disable=missing-function-docstring
parser = ArgumentParser(
description=f'{defs.STAC_DESCRIPTION} ({defs.STAC_ACRONYM}). Must be root to run this program.'
@@ -77,6 +72,12 @@ ARGS = parse_args(defs.STACD_CONFIG_FILE)
if ARGS.version:
print(f'{defs.PROJECT_NAME} {defs.VERSION}')
+ try:
+ import libnvme
+
+ print(f'libnvme {libnvme.__version__}')
+ except (AttributeError, ModuleNotFoundError):
+ pass
sys.exit(0)
if ARGS.idl:
@@ -85,78 +86,14 @@ if ARGS.idl:
sys.exit(0)
-# There is a reason for having this import here and not at the top of the file.
-# We want to allow running stafd with the --version and --idl options and exit
-# without having to import stas.
-from staslib import stas # pylint: disable=wrong-import-position
-
-# Before going any further, make sure the script is allowed to run.
-stas.check_if_allowed_to_continue()
-
-
-################################################################################
-# Preliminary checks have passed. Let her rip!
-# pylint: disable=wrong-import-position
-# pylint: disable=wrong-import-order
-import json
-import pathlib
-import systemd.daemon
-import dasbus.error
-import dasbus.client.observer
-import dasbus.client.proxy
-from gi.repository import GLib
-from staslib import conf, log, gutil, trid, udev, ctrl, service # pylint: disable=ungrouped-imports
-
-log.init(ARGS.syslog)
-
-UDEV_RULE_SUPPRESS = pathlib.Path('/run/udev/rules.d', '70-nvmf-autoconnect.rules')
-
-
-def udev_rule_ctrl(enable):
- '''@brief We add an empty udev rule to /run/udev/rules.d to suppress
- nvme-cli's udev rule that is used to tell udevd to automatically
- connect to I/O controller. This is to avoid race conditions between
- stacd and udevd. This is configurable. See "udev-rule" in stacd.conf
- for details.
- '''
- if enable:
- try:
- UDEV_RULE_SUPPRESS.unlink()
- except FileNotFoundError:
- pass
- else:
- if not UDEV_RULE_SUPPRESS.exists():
- pathlib.Path('/run/udev/rules.d').mkdir(parents=True, exist_ok=True)
- UDEV_RULE_SUPPRESS.symlink_to('/dev/null')
-
-
# ******************************************************************************
-class Ioc(ctrl.Controller):
- '''@brief This object establishes a connection to one I/O Controller.'''
-
- def __init__(self, root, host, tid: trid.TID):
- super().__init__(root, host, tid)
-
- def _on_udev_remove(self, udev_obj):
- '''Called when the associated nvme device (/dev/nvmeX) is removed
- from the system.
- '''
- super()._on_udev_remove(udev_obj)
-
- # Defer removal of this object to the next main loop's idle period.
- GLib.idle_add(STAC.remove_controller, self)
-
- def _find_existing_connection(self):
- return self._udev.find_nvme_ioc_device(self.tid)
-
-
-# ******************************************************************************
-class Stac(service.Service):
- '''STorage Appliance Connector (STAC)'''
+if __name__ == '__main__':
+ import json
+ import logging
+ from staslib import log, service, stas, udev # pylint: disable=ungrouped-imports
- CONF_STABILITY_SOAK_TIME_SEC = 1.5
- CONF_STABILITY_LONG_SOAK_TIME_SEC = 10 # pylint: disable=invalid-name
- ADD_EVENT_SOAK_TIME_SEC = 1
+ # Before going any further, make sure the script is allowed to run.
+ stas.check_if_allowed_to_continue()
class Dbus:
'''This is the DBus interface that external programs can use to
@@ -205,229 +142,8 @@ class Stac(service.Service):
for controller in STAC.get_controllers()
]
- # ==========================================================================
- def __init__(self, args):
- super().__init__(args, self._reload_hdlr)
-
- # We don't want to apply configuration changes to nvme-cli right away.
- # Often, multiple changes will occur in a short amount of time (sub-second).
- # We want to wait until there are no more changes before applying them
- # to the system. The following timer acts as a "soak period". Changes
- # will be applied by calling self._on_config_ctrls() at the end of
- # the soak period.
- self._cfg_soak_tmr = gutil.GTimer(Stac.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls)
- self._cfg_soak_tmr.start()
-
- self._add_event_soak_tmr = gutil.GTimer(Stac.ADD_EVENT_SOAK_TIME_SEC, self._on_add_event_soaked)
-
- self._config_connections_audit()
-
- # Create the D-Bus instance.
- self._config_dbus(Stac.Dbus(), defs.STACD_DBUS_NAME, defs.STACD_DBUS_PATH)
-
- # Connect to STAF D-Bus interface
- self._staf = None
- self._staf_watcher = dasbus.client.observer.DBusObserver(self._sysbus, defs.STAFD_DBUS_NAME)
- self._staf_watcher.service_available.connect(self._connect_to_staf)
- self._staf_watcher.service_unavailable.connect(self._disconnect_from_staf)
- self._staf_watcher.connect_once_available()
-
- # Suppress udev rule to auto-connect when AEN is received.
- udev_rule_ctrl(conf.SvcConf().udev_rule_enabled)
-
- def _release_resources(self):
- logging.debug('Stac._release_resources()')
-
- if self._add_event_soak_tmr:
- self._add_event_soak_tmr.kill()
-
- udev_rule_ctrl(True)
-
- if self._udev:
- self._udev.unregister_for_action_events('add')
-
- self._destroy_staf_comlink(self._staf_watcher)
- if self._staf_watcher is not None:
- self._staf_watcher.disconnect()
-
- super()._release_resources()
-
- self._staf = None
- self._staf_watcher = None
- self._add_event_soak_tmr = None
-
- def _audit_connections(self, tids):
- '''A host should only connect to I/O controllers that have been zoned
- for that host or a manual "controller" entry exists in stcd.conf.
- A host should disconnect from an I/O controller when that I/O controller
- is removed from the zone or a manual "controller" entry is removed from
- stacd.conf. stacd will audit connections if "sticky-connections=disabled".
- stacd will delete any connection that is not supposed to exist.
- '''
- logging.debug('Stac._audit_connections() - tids = %s', tids)
- num_controllers = len(self._controllers)
- for tid in tids:
- if tid not in self._controllers:
- self._controllers[tid] = Ioc(self._root, self._host, tid)
-
- if num_controllers != len(self._controllers):
- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC)
-
- def _on_add_event(self, udev_obj): # pylint: disable=unused-argument
- '''@brief This function is called when a "add" event is received from
- the kernel for an NVMe device. This is used to trigger an audit and make
- sure that the connection to an I/O controller is allowed.
-
- WARNING: There is a race condition with the "add" event from the kernel.
- The kernel sends the "add" event a bit early and the sysfs attributes
- associated with the nvme object are not always fully initialized.
- To workaround this problem we use a soaking timer to give time for the
- sysfs attributes to stabilize.
- '''
- self._add_event_soak_tmr.start()
-
- def _on_add_event_soaked(self):
- '''@brief After the add event has been soaking for ADD_EVENT_SOAK_TIME_SEC
- seconds, we can audit the connections.
- '''
- if not conf.SvcConf().sticky_connections:
- self._audit_connections(self._udev.get_nvme_ioc_tids())
- return GLib.SOURCE_REMOVE
-
- def _config_connections_audit(self):
- '''This function checks the "sticky_connections" parameter to determine
- whether audits should be performed. Audits are enabled when
- "sticky_connections" is disabled.
- '''
- if not conf.SvcConf().sticky_connections:
- if self._udev.get_registered_action_cback('add') is None:
- self._udev.register_for_action_events('add', self._on_add_event)
- self._audit_connections(self._udev.get_nvme_ioc_tids())
- else:
- self._udev.unregister_for_action_events('add')
-
- def _keep_connections_on_exit(self):
- '''@brief Determine whether connections should remain when the
- process exits.
- '''
- return True
-
- def _reload_hdlr(self):
- '''@brief Reload configuration file. This is triggered by the SIGHUP
- signal, which can be sent with "systemctl reload stacd".
- '''
- systemd.daemon.notify('RELOADING=1')
- service_cnf = conf.SvcConf()
- service_cnf.reload()
- self.tron = service_cnf.tron
- self._config_connections_audit()
- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC)
- udev_rule_ctrl(service_cnf.udev_rule_enabled)
- systemd.daemon.notify('READY=1')
- return GLib.SOURCE_CONTINUE
-
- def _get_log_pages_from_stafd(self):
- if self._staf:
- try:
- return json.loads(self._staf.get_all_log_pages(True))
- except dasbus.error.DBusError:
- pass
-
- return list()
-
- def _config_ctrls_finish(self, configured_ctrl_list):
- configured_ctrl_list = [
- ctrl_dict for ctrl_dict in configured_ctrl_list if 'traddr' in ctrl_dict and 'subsysnqn' in ctrl_dict
- ]
- logging.debug('Stac._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list)
-
- discovered_ctrl_list = list()
- for staf_data in self._get_log_pages_from_stafd():
- host_traddr = staf_data['discovery-controller']['host-traddr']
- host_iface = staf_data['discovery-controller']['host-iface']
- for dlpe in staf_data['log-pages']:
- if dlpe.get('subtype') == 'nvme': # eliminate discovery controllers
- discovered_ctrl_list.append(stas.cid_from_dlpe(dlpe, host_traddr, host_iface))
-
- logging.debug('Stac._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list)
-
- controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list)
- controllers = stas.remove_invalid_addresses(controllers)
-
- new_controller_ids = {trid.TID(controller) for controller in controllers}
- cur_controller_ids = set(self._controllers.keys())
- controllers_to_add = new_controller_ids - cur_controller_ids
- controllers_to_del = cur_controller_ids - new_controller_ids
-
- logging.debug('Stac._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add))
- logging.debug('Stac._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del))
-
- for tid in controllers_to_del:
- controller = self._controllers.pop(tid, None)
- if controller is not None:
- controller.disconnect(self.remove_controller, conf.SvcConf().sticky_connections)
-
- for tid in controllers_to_add:
- self._controllers[tid] = Ioc(self._root, self._host, tid)
-
- def _connect_to_staf(self, _):
- '''@brief Hook up DBus signal handlers for signals from stafd.'''
- try:
- self._staf = self._sysbus.get_proxy(defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH)
- self._staf.log_pages_changed.connect(self._log_pages_changed)
- self._cfg_soak_tmr.start()
-
- # Make sure timer is set back to its normal value.
- self._cfg_soak_tmr.set_timeout(Stac.CONF_STABILITY_SOAK_TIME_SEC)
- logging.debug('Stac._connect_to_staf() - Connected to staf')
- except dasbus.error.DBusError:
- logging.error('Failed to connect to staf')
-
- def _destroy_staf_comlink(self, watcher): # pylint: disable=unused-argument
- if self._staf:
- self._staf.log_pages_changed.disconnect(self._log_pages_changed)
- dasbus.client.proxy.disconnect_proxy(self._staf)
- self._staf = None
-
- def _disconnect_from_staf(self, watcher):
- self._destroy_staf_comlink(watcher)
-
- # When we lose connectivity with stafd, the most logical explanation
- # is that stafd restarted. In that case, it may take some time for stafd
- # to re-populate its log pages cache. So let's give stafd plenty of time
- # to update its log pages cache and send log pages change notifications
- # before triggering a stacd re-config. We do this by momentarily
- # increasing the config soak timer to a longer period.
- if self._cfg_soak_tmr:
- self._cfg_soak_tmr.set_timeout(Stac.CONF_STABILITY_LONG_SOAK_TIME_SEC)
-
- logging.debug('Stac._disconnect_from_staf() - Disconnected from staf')
-
- def _log_pages_changed( # pylint: disable=too-many-arguments
- self, transport, traddr, trsvcid, host_traddr, host_iface, subsysnqn, device
- ):
- logging.debug(
- 'Stac._log_pages_changed() - transport=%s, traddr=%s, trsvcid=%s, host_traddr=%s, host_iface=%s, subsysnqn=%s, device=%s',
- transport,
- traddr,
- trsvcid,
- host_traddr,
- host_iface,
- subsysnqn,
- device,
- )
- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC)
-
- def _load_last_known_config(self):
- return dict()
-
- def _dump_last_known_config(self, controllers):
- pass
-
-
-# ******************************************************************************
-if __name__ == '__main__':
- STAC = Stac(ARGS)
+ log.init(ARGS.syslog)
+ STAC = service.Stac(ARGS, Dbus())
STAC.run()
STAC = None
diff --git a/stafd.py b/stafd.py
index aff64fd..8a77c51 100755
--- a/stafd.py
+++ b/stafd.py
@@ -10,14 +10,12 @@
''' STorage Appliance Finder Daemon
'''
import sys
-import logging
from argparse import ArgumentParser
from staslib import defs
-# pylint: disable=consider-using-f-string
-DBUS_IDL = '''
+DBUS_IDL = f'''
<node>
- <interface name="%s.debug">
+ <interface name="{defs.STAFD_DBUS_NAME}.debug">
<property name="tron" type="b" access="readwrite"/>
<property name="log_level" type="s" access="read"/>
<method name="process_info">
@@ -34,10 +32,10 @@ DBUS_IDL = '''
</method>
</interface>
- <interface name="%s">
+ <interface name="{defs.STAFD_DBUS_NAME}">
<method name="list_controllers">
<arg direction="in" type="b" name="detailed"/>
- <arg direction="out" type="aa{ss}" name="controller_list"/>
+ <arg direction="out" type="aa{{ss}}" name="controller_list"/>
</method>
<method name="get_log_pages">
<arg direction="in" type="s" name="transport"/>
@@ -46,7 +44,7 @@ DBUS_IDL = '''
<arg direction="in" type="s" name="host_traddr"/>
<arg direction="in" type="s" name="host_iface"/>
<arg direction="in" type="s" name="subsysnqn"/>
- <arg direction="out" type="aa{ss}" name="log_pages"/>
+ <arg direction="out" type="aa{{ss}}" name="log_pages"/>
</method>
<method name="get_all_log_pages">
<arg direction="in" type="b" name="detailed"/>
@@ -63,12 +61,10 @@ DBUS_IDL = '''
</signal>
</interface>
</node>
-''' % (
- defs.STAFD_DBUS_NAME,
- defs.STAFD_DBUS_NAME,
-)
+'''
+# ******************************************************************************
def parse_args(conf_file: str): # pylint: disable=missing-function-docstring
parser = ArgumentParser(
description=f'{defs.STAF_DESCRIPTION} ({defs.STAF_ACRONYM}). Must be root to run this program.'
@@ -99,6 +95,12 @@ ARGS = parse_args(defs.STAFD_CONFIG_FILE)
if ARGS.version:
print(f'{defs.PROJECT_NAME} {defs.VERSION}')
+ try:
+ import libnvme
+
+ print(f'libnvme {libnvme.__version__}')
+ except (AttributeError, ModuleNotFoundError):
+ pass
sys.exit(0)
if ARGS.idl:
@@ -107,250 +109,15 @@ if ARGS.idl:
sys.exit(0)
-# There is a reason for having this import here and not at the top of the file.
-# We want to allow running stafd with the --version and --idl options and exit
-# without having to import stas and avahi.
-from staslib import stas, avahi # pylint: disable=wrong-import-position
-
-# Before going any further, make sure the script is allowed to run.
-stas.check_if_allowed_to_continue()
-
-
-################################################################################
-# Preliminary checks have passed. Let her rip!
-# pylint: disable=wrong-import-position
-# pylint: disable=wrong-import-order
-import json
-import pickle
-import dasbus.server.interface
-import systemd.daemon
-from libnvme import nvme
-from gi.repository import GLib
-from staslib import conf, log, gutil, trid, udev, ctrl, service # pylint: disable=ungrouped-imports
-
-log.init(ARGS.syslog)
-
-DLP_CHANGED = (
- (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE
-) # 0x70f002
-
-
# ******************************************************************************
-class Dc(ctrl.Controller):
- '''@brief This object establishes a connection to one Discover Controller (DC).
- It retrieves the discovery log pages and caches them.
- It also monitors udev events associated with that DC and updates
- the cached discovery log pages accordingly.
- '''
-
- GET_LOG_PAGE_RETRY_RERIOD_SEC = 20
- REGISTRATION_RETRY_RERIOD_SEC = 10
-
- def __init__(self, root, host, tid: trid.TID, log_pages=None):
- super().__init__(root, host, tid, discovery_ctrl=True)
- self._register_op = None
- self._get_log_op = None
- self._log_pages = log_pages if log_pages else list() # Log pages cache
-
- def _release_resources(self):
- logging.debug('Dc._release_resources() - %s | %s', self.id, self.device)
- super()._release_resources()
- self._log_pages = list()
-
- def _kill_ops(self):
- super()._kill_ops()
- if self._get_log_op:
- self._get_log_op.kill()
- self._get_log_op = None
- if self._register_op:
- self._register_op.kill()
- self._register_op = None
-
- def info(self) -> dict:
- '''@brief Get the controller info for this object'''
- info = super().info()
- if self._get_log_op:
- info['get log page operation'] = self._get_log_op.as_dict()
- if self._register_op:
- info['register operation'] = self._register_op.as_dict()
- return info
-
- def cancel(self):
- '''@brief Used to cancel pending operations.'''
- super().cancel()
- if self._get_log_op:
- self._get_log_op.cancel()
- if self._register_op:
- self._register_op.cancel()
-
- def log_pages(self) -> list:
- '''@brief Get the cached log pages for this object'''
- return self._log_pages
-
- def referrals(self) -> list:
- '''@brief Return the list of referrals'''
- return [page for page in self._log_pages if page['subtype'] == 'referral']
-
- def _on_aen(self, aen: int):
- super()._on_aen(aen)
- if aen == DLP_CHANGED and self._get_log_op:
- self._get_log_op.run_async()
-
- def _on_nvme_event(self, nvme_event: str):
- super()._on_nvme_event(nvme_event)
- if nvme_event == 'connected' and self._register_op:
- self._register_op.run_async()
-
- def _on_udev_remove(self, udev_obj):
- super()._on_udev_remove(udev_obj)
- if self._try_to_connect_deferred:
- self._try_to_connect_deferred.schedule()
-
- def _find_existing_connection(self):
- return self._udev.find_nvme_dc_device(self.tid)
-
- # --------------------------------------------------------------------------
- def _on_connect_success(self, op_obj, data):
- '''@brief Function called when we successfully connect to the
- Discovery Controller.
- '''
- super()._on_connect_success(op_obj, data)
-
- if self._alive():
- if self._ctrl.is_registration_supported():
- self._register_op = gutil.AsyncOperationWithRetry(
- self._on_registration_success,
- self._on_registration_fail,
- self._ctrl.registration_ctlr,
- nvme.NVMF_DIM_TAS_REGISTER,
- )
- self._register_op.run_async()
- else:
- self._get_log_op = gutil.AsyncOperationWithRetry(
- self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover
- )
- self._get_log_op.run_async()
-
- # --------------------------------------------------------------------------
- def _on_registration_success(self, op_obj, data): # pylint: disable=unused-argument
- '''@brief Function called when we successfully register with the
- Discovery Controller. See self._register_op object
- for details.
- '''
- if self._alive():
- if data is not None:
- logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data)
- else:
- logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device)
- self._get_log_op = gutil.AsyncOperationWithRetry(
- self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover
- )
- self._get_log_op.run_async()
- else:
- logging.debug(
- 'Dc._on_registration_success() - %s | %s Received event on dead object.', self.id, self.device
- )
-
- def _on_registration_fail(self, op_obj, err, fail_cnt):
- '''@brief Function called when we fail to register with the
- Discovery Controller. See self._register_op object
- for details.
- '''
- if self._alive():
- logging.debug(
- 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec',
- self.id,
- self.device,
- err,
- Dc.REGISTRATION_RETRY_RERIOD_SEC,
- )
- if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect
- logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err)
- # op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC)
- else:
- logging.debug(
- 'Dc._on_registration_fail() - %s | %s Received event on dead object. %s',
- self.id,
- self.device,
- err,
- )
- op_obj.kill()
-
- # --------------------------------------------------------------------------
- def _on_get_log_success(self, op_obj, data): # pylint: disable=unused-argument
- '''@brief Function called when we successfully retrieve the log pages
- from the Discovery Controller. See self._get_log_op object
- for details.
- '''
- if self._alive():
- # Note that for historical reasons too long to explain, the CDC may
- # return invalid addresses ("0.0.0.0", "::", or ""). Those need to be
- # filtered out.
- referrals_before = self.referrals()
- self._log_pages = (
- [
- {k: str(v) for k, v in dictionary.items()}
- for dictionary in data
- if dictionary.get('traddr') not in ('0.0.0.0', '::', '')
- ]
- if data
- else list()
- )
- logging.info(
- '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages)
- )
- referrals_after = self.referrals()
- STAF.log_pages_changed(self, self.device)
- if referrals_after != referrals_before:
- logging.debug(
- 'Dc._on_get_log_success() - %s | %s Referrals before = %s',
- self.id,
- self.device,
- referrals_before,
- )
- logging.debug(
- 'Dc._on_get_log_success() - %s | %s Referrals after = %s',
- self.id,
- self.device,
- referrals_after,
- )
- STAF.referrals_changed()
- else:
- logging.debug(
- 'Dc._on_get_log_success() - %s | %s Received event on dead object.', self.id, self.device
- )
-
- def _on_get_log_fail(self, op_obj, err, fail_cnt):
- '''@brief Function called when we fail to retrieve the log pages
- from the Discovery Controller. See self._get_log_op object
- for details.
- '''
- if self._alive():
- logging.debug(
- 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec',
- self.id,
- self.device,
- err,
- Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC,
- )
- if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect
- logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err)
- op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC)
- else:
- logging.debug(
- 'Dc._on_get_log_fail() - %s | %s Received event on dead object. %s',
- self.id,
- self.device,
- err,
- )
- op_obj.kill()
-
-
-# ******************************************************************************
-class Staf(service.Service):
- '''STorage Appliance Finder (STAF)'''
+if __name__ == '__main__':
+ import json
+ import logging
+ import dasbus.server.interface
+ from staslib import log, service, stas, udev # pylint: disable=ungrouped-imports
- CONF_STABILITY_SOAK_TIME_SEC = 1.5
+ # Before going any further, make sure the script is allowed to run.
+ stas.check_if_allowed_to_continue()
class Dbus:
'''This is the DBus interface that external programs can use to
@@ -431,148 +198,8 @@ class Staf(service.Service):
for controller in STAF.get_controllers()
]
- # ==========================================================================
- def __init__(self, args):
- super().__init__(args, self._reload_hdlr)
-
- self._avahi = avahi.Avahi(self._sysbus, self._avahi_change)
- self._avahi.config_stypes(conf.SvcConf().get_stypes())
-
- # We don't want to apply configuration changes to nvme-cli right away.
- # Often, multiple changes will occur in a short amount of time (sub-second).
- # We want to wait until there are no more changes before applying them
- # to the system. The following timer acts as a "soak period". Changes
- # will be applied by calling self._on_config_ctrls() at the end of
- # the soak period.
- self._cfg_soak_tmr = gutil.GTimer(Staf.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls)
- self._cfg_soak_tmr.start()
-
- # Create the D-Bus instance.
- self._config_dbus(Staf.Dbus(), defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH)
-
- def info(self) -> dict:
- '''@brief Get the status info for this object (used for debug)'''
- info = super().info()
- info['avahi'] = self._avahi.info()
- return info
-
- def _release_resources(self):
- logging.debug('Staf._release_resources()')
- super()._release_resources()
- if self._avahi:
- self._avahi.kill()
- self._avahi = None
-
- def _load_last_known_config(self):
- try:
- with open(self._lkc_file, 'rb') as file:
- config = pickle.load(file)
- except (FileNotFoundError, AttributeError):
- return dict()
-
- logging.debug('Staf._load_last_known_config() - DC count = %s', len(config))
- return {tid: Dc(self._root, self._host, tid, log_pages) for tid, log_pages in config.items()}
-
- def _dump_last_known_config(self, controllers):
- try:
- with open(self._lkc_file, 'wb') as file:
- config = {tid: dc.log_pages() for tid, dc in controllers.items()}
- logging.debug('Staf._dump_last_known_config() - DC count = %s', len(config))
- pickle.dump(config, file)
- except FileNotFoundError as ex:
- logging.error('Unable to save last known config: %s', ex)
-
- def _keep_connections_on_exit(self):
- '''@brief Determine whether connections should remain when the
- process exits.
- '''
- return conf.SvcConf().persistent_connections
-
- def _reload_hdlr(self):
- '''@brief Reload configuration file. This is triggered by the SIGHUP
- signal, which can be sent with "systemctl reload stafd".
- '''
- systemd.daemon.notify('RELOADING=1')
- service_cnf = conf.SvcConf()
- service_cnf.reload()
- self.tron = service_cnf.tron
- self._avahi.kick_start() # Make sure Avahi is running
- self._avahi.config_stypes(service_cnf.get_stypes())
- self._cfg_soak_tmr.start()
- systemd.daemon.notify('READY=1')
- return GLib.SOURCE_CONTINUE
-
- def log_pages_changed(self, controller, device):
- '''@brief Function invoked when a controller's cached log pages
- have changed. This will emit a D-Bus signal to inform
- other applications that the cached log pages have changed.
- '''
- self._dbus_iface.log_pages_changed.emit(
- controller.tid.transport,
- controller.tid.traddr,
- controller.tid.trsvcid,
- controller.tid.host_traddr,
- controller.tid.host_iface,
- controller.tid.subsysnqn,
- device,
- )
-
- def referrals_changed(self):
- '''@brief Function invoked when a controller's cached referrals
- have changed.
- '''
- logging.debug('Staf.referrals_changed()')
- self._cfg_soak_tmr.start()
-
- def _referrals(self) -> list:
- return [
- stas.cid_from_dlpe(dlpe, controller.tid.host_traddr, controller.tid.host_iface)
- for controller in self.get_controllers()
- for dlpe in controller.referrals()
- ]
-
- def _config_ctrls_finish(self, configured_ctrl_list):
- '''@brief Finish discovery controllers configuration after
- hostnames (if any) have been resolved.
- '''
- configured_ctrl_list = [
- ctrl_dict
- for ctrl_dict in configured_ctrl_list
- if 'traddr' in ctrl_dict and ctrl_dict.setdefault('subsysnqn', defs.WELL_KNOWN_DISC_NQN)
- ]
-
- discovered_ctrl_list = self._avahi.get_controllers()
- referral_ctrl_list = self._referrals()
- logging.debug('Staf._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list)
- logging.debug('Staf._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list)
- logging.debug('Staf._config_ctrls_finish() - referral_ctrl_list = %s', referral_ctrl_list)
-
- controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list + referral_ctrl_list)
- controllers = stas.remove_invalid_addresses(controllers)
-
- new_controller_ids = {trid.TID(controller) for controller in controllers}
- cur_controller_ids = set(self._controllers.keys())
- controllers_to_add = new_controller_ids - cur_controller_ids
- controllers_to_del = cur_controller_ids - new_controller_ids
-
- logging.debug('Staf._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add))
- logging.debug('Staf._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del))
-
- for tid in controllers_to_del:
- controller = self._controllers.pop(tid, None)
- if controller is not None:
- controller.disconnect(self.remove_controller, conf.SvcConf().persistent_connections)
-
- for tid in controllers_to_add:
- self._controllers[tid] = Dc(self._root, self._host, tid)
-
- def _avahi_change(self):
- self._cfg_soak_tmr.start()
-
-
-# ******************************************************************************
-if __name__ == '__main__':
- STAF = Staf(ARGS)
+ log.init(ARGS.syslog)
+ STAF = service.Staf(ARGS, Dbus())
STAF.run()
STAF = None
diff --git a/staslib/avahi.py b/staslib/avahi.py
index 768bbf4..90a67c8 100644
--- a/staslib/avahi.py
+++ b/staslib/avahi.py
@@ -172,9 +172,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes
services = dict()
for service, obj in self._services.items():
interface, protocol, name, stype, domain = service
- key = '({}, {}, {}.{}, {})'.format( # pylint: disable=consider-using-f-string
- socket.if_indextoname(interface), Avahi.protos.get(protocol, 'unknown'), name, domain, stype
- )
+ key = f'({socket.if_indextoname(interface)}, {Avahi.protos.get(protocol, "unknown")}, {name}.{domain}, {stype})'
services[key] = obj.get('data', {})
info = {
@@ -316,7 +314,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes
_interface_name: str,
_signal_name: str,
args: typing.Tuple[int, int, str, str, str, int],
- *_user_data
+ *_user_data,
):
(interface, protocol, name, stype, domain, flags) = args
logging.debug(
@@ -352,7 +350,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes
_interface_name: str,
_signal_name: str,
args: typing.Tuple[int, int, str, str, str, int],
- *_user_data
+ *_user_data,
):
(interface, protocol, name, stype, domain, flags) = args
logging.debug(
@@ -386,7 +384,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes
_interface_name: str,
_signal_name: str,
args: typing.Tuple[int, int, str, str, str, str, int, str, int, list, int],
- *_user_data
+ *_user_data,
):
(interface, protocol, name, stype, domain, host, aprotocol, address, port, txt, flags) = args
txt = _txt2dict(txt)
@@ -428,7 +426,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes
interface_name: str,
_signal_name: str,
args: typing.Tuple[str],
- *_user_data
+ *_user_data,
):
(error,) = args
if 'ServiceResolver' not in interface_name or 'TimeoutError' not in error:
diff --git a/staslib/conf.py b/staslib/conf.py
index 3f52e4f..c314a9e 100644
--- a/staslib/conf.py
+++ b/staslib/conf.py
@@ -74,7 +74,7 @@ class SvcConf(metaclass=singleton.Singleton):
('Global', 'ignore-iface'): 'false',
('Global', 'ip-family'): 'ipv4+ipv6',
('Global', 'udev-rule'): 'enabled',
- ('Global', 'sticky-connections'): 'disabled',
+ ('Global', 'sticky-connections'): 'enabled',
('Service Discovery', 'zeroconf'): 'enabled',
('Controllers', 'controller'): list(),
('Controllers', 'blacklist'): list(),
diff --git a/staslib/ctrl.py b/staslib/ctrl.py
index 5504baa..dbc1973 100644
--- a/staslib/ctrl.py
+++ b/staslib/ctrl.py
@@ -10,69 +10,76 @@
Dc (Discovery Controller) and Ioc (I/O Controller) objects are derived.'''
import logging
-from gi.repository import Gio, GLib
+from gi.repository import GLib
from libnvme import nvme
-from staslib import conf, gutil, trid, udev
+from staslib import conf, gutil, trid, udev, stas
DC_KATO_DEFAULT = 30 # seconds
# ******************************************************************************
-class Controller: # pylint: disable=too-many-instance-attributes
+class Controller(stas.ControllerABC):
'''@brief Base class used to manage the connection to a controller.'''
- CONNECT_RETRY_PERIOD_SEC = 60
- FAST_CONNECT_RETRY_PERIOD_SEC = 3
-
def __init__(self, root, host, tid: trid.TID, discovery_ctrl=False):
- self._root = root
- self._host = host
- self._udev = udev.UDEV
- self._tid = tid
- self._cancellable = Gio.Cancellable()
- self._connect_op = None
- self._connect_attempts = 0
- self._retry_connect_tmr = gutil.GTimer(Controller.CONNECT_RETRY_PERIOD_SEC, self._on_try_to_connect)
- self._device = None
- self._ctrl = None
- self._discovery_ctrl = discovery_ctrl
- self._try_to_connect_deferred = gutil.Deferred(self._try_to_connect)
- self._try_to_connect_deferred.schedule()
+ self._udev = udev.UDEV
+ self._device = None # Refers to the nvme device (e.g. /dev/nvme[n])
+ self._ctrl = None # libnvme's nvme.ctrl object
+ self._connect_op = None
+
+ super().__init__(root, host, tid, discovery_ctrl)
def _release_resources(self):
logging.debug('Controller._release_resources() - %s', self.id)
- # Remove pending deferred from main loop
- if self._try_to_connect_deferred:
- self._try_to_connect_deferred.cancel()
- self._try_to_connect_deferred = None
-
if self._udev:
self._udev.unregister_for_device_events(self._on_udev_notification)
- if self._retry_connect_tmr is not None:
- self._retry_connect_tmr.kill()
-
- if self._cancellable and not self._cancellable.is_cancelled():
- self._cancellable.cancel()
-
self._kill_ops()
- self._tid = None
+ super()._release_resources()
+
self._ctrl = None
- self._device = None
- self._retry_connect_tmr = None
- self._cancellable = None
self._udev = None
- def _alive(self):
- '''There may be race condition where a queued event gets processed
- after the object is no longer configured (i.e. alive). This method
- can be used by callback functions to make sure the object is still
- alive before processing further.
- '''
- return self._cancellable and not self._cancellable.is_cancelled()
+ @property
+ def device(self) -> str:
+ '''@brief return the Linux nvme device id (e.g. nvme3) or empty
+ string if no device is associated with this controller'''
+ if not self._device and self._ctrl and self._ctrl.name:
+ self._device = self._ctrl.name
+
+ return self._device or 'nvme?'
+
+ def controller_id_dict(self) -> dict:
+ '''@brief return the controller ID as a dict.'''
+ cid = super().controller_id_dict()
+ cid['device'] = self.device
+ return cid
+
+ def details(self) -> dict:
+ '''@brief return detailed debug info about this controller'''
+ details = super().details()
+ details.update(
+ self._udev.get_attributes(self.device,
+ ('hostid', 'hostnqn', 'model',
+ 'serial', 'dctype', 'cntrltype'))
+ )
+ return details
+
+ def info(self) -> dict:
+ '''@brief Get the controller info for this object'''
+ info = super().info()
+ if self._connect_op:
+ info['connect operation'] = self._connect_op.as_dict()
+ return info
+
+ def cancel(self):
+ '''@brief Used to cancel pending operations.'''
+ super().cancel()
+ if self._connect_op:
+ self._connect_op.cancel()
def _kill_ops(self):
if self._connect_op:
@@ -91,7 +98,7 @@ class Controller: # pylint: disable=too-many-instance-attributes
self._on_nvme_event(nvme_event)
elif udev_obj.action == 'remove':
logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name)
- self._on_udev_remove(udev_obj)
+ self._on_ctrl_removed(udev_obj)
else:
logging.debug(
'Controller._on_udev_notification() - %s | %s - Received "%s" notification.',
@@ -108,33 +115,12 @@ class Controller: # pylint: disable=too-many-instance-attributes
udev_obj.sys_name,
)
- def _on_aen(self, aen: int):
- pass
-
- def _on_nvme_event(self, nvme_event):
- pass
-
- def _on_udev_remove(self, udev_obj): # pylint: disable=unused-argument
+ def _on_ctrl_removed(self, obj): # pylint: disable=unused-argument
self._udev.unregister_for_device_events(self._on_udev_notification)
self._kill_ops() # Kill all pending operations
self._ctrl = None
- def _find_existing_connection(self):
- raise NotImplementedError()
-
- def _on_try_to_connect(self):
- self._try_to_connect_deferred.schedule()
- return GLib.SOURCE_REMOVE
-
- def _try_to_connect(self):
- # This is a deferred function call. Make sure
- # the source of the deferred is still good.
- source = GLib.main_current_source()
- if source and source.is_destroyed():
- return
-
- self._connect_attempts += 1
-
+ def _do_connect(self):
host_iface = (
self.tid.host_iface
if (self.tid.host_iface and not conf.SvcConf().ignore_iface and conf.NvmeOptions().host_iface_supp)
@@ -164,7 +150,6 @@ class Controller: # pylint: disable=too-many-instance-attributes
self._on_connect_success, self._on_connect_fail, self._ctrl.init, self._host, int(udev_obj.sys_number)
)
else:
- self._device = None
service_conf = conf.SvcConf()
cfg = { 'hdr_digest': service_conf.hdr_digest,
'data_digest': service_conf.data_digest }
@@ -198,11 +183,10 @@ class Controller: # pylint: disable=too-many-instance-attributes
self._connect_op = None
if self._alive():
- if not self._device:
- self._device = self._ctrl.name
+ self._device = self._ctrl.name
logging.info('%s | %s - Connection established!', self.id, self.device)
self._connect_attempts = 0
- self._udev.register_for_device_events(self.device, self._on_udev_notification)
+ self._udev.register_for_device_events(self._device, self._on_udev_notification)
else:
logging.debug(
'Controller._on_connect_success() - %s | %s Received event on dead object. data=%s',
@@ -227,11 +211,11 @@ class Controller: # pylint: disable=too-many-instance-attributes
# the same time. This is perfectly fine, except that we may get a bogus
# failed to connect error. By doing a fast re-try, stacd can quickly
# verify that the connection was actually successful.
- self._retry_connect_tmr.set_timeout(Controller.FAST_CONNECT_RETRY_PERIOD_SEC)
+ self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC)
elif self._connect_attempts == 2:
# If the fast connect re-try fails, then we can print a message to
# indicate the failure, and start a slow re-try period.
- self._retry_connect_tmr.set_timeout(Controller.CONNECT_RETRY_PERIOD_SEC)
+ self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC)
logging.error('%s Failed to connect to controller. %s', self.id, getattr(err, 'message', err))
logging.debug(
@@ -248,53 +232,6 @@ class Controller: # pylint: disable=too-many-instance-attributes
getattr(err, 'message', err),
)
- @property
- def id(self) -> str: # pylint: disable=missing-function-docstring
- return str(self.tid)
-
- @property
- def tid(self): # pylint: disable=missing-function-docstring
- return self._tid
-
- @property
- def device(self) -> str: # pylint: disable=missing-function-docstring
- return self._device if self._device else ''
-
- def controller_id_dict(self) -> dict:
- '''@brief return the controller ID as a dict.'''
- cid = self.tid.as_dict()
- cid['device'] = self.device
- return cid
-
- def details(self) -> dict:
- '''@brief return detailed debug info about this controller'''
- details = self.controller_id_dict()
- details.update(self._udev.get_attributes(self.device, ('hostid', 'hostnqn', 'model', 'serial')))
- details['connect attempts'] = str(self._connect_attempts)
- details['retry connect timer'] = str(self._retry_connect_tmr)
- return details
-
- def info(self) -> dict:
- '''@brief Get the controller info for this object'''
- info = self.details()
- if self._connect_op:
- info['connect operation'] = self._connect_op.as_dict()
- return info
-
- def cancel(self):
- '''@brief Used to cancel pending operations.'''
- if self._cancellable and not self._cancellable.is_cancelled():
- logging.debug('Controller.cancel() - %s', self.id)
- self._cancellable.cancel()
-
- if self._connect_op:
- self._connect_op.cancel()
-
- def kill(self):
- '''@brief Used to release all resources associated with this object.'''
- logging.debug('Controller.kill() - %s', self.id)
- self._release_resources()
-
def disconnect(self, disconnected_cb, keep_connection):
'''@brief Issue an asynchronous disconnect command to a Controller.
Once the async command has completed, the callback 'disconnected_cb'
@@ -313,7 +250,7 @@ class Controller: # pylint: disable=too-many-instance-attributes
# cannot be called directly as the current Controller object is in the
# process of being disconnected and the callback will in fact delete
# the object. This would invariably lead to unpredictable outcome.
- GLib.idle_add(disconnected_cb, self)
+ GLib.idle_add(disconnected_cb, self, True)
def _on_disconn_success(self, op_obj, data, disconnected_cb): # pylint: disable=unused-argument
logging.debug('Controller._on_disconn_success() - %s | %s', self.id, self.device)
@@ -322,7 +259,7 @@ class Controller: # pylint: disable=too-many-instance-attributes
# cannot be called directly as the current Controller object is in the
# process of being disconnected and the callback will in fact delete
# the object. This would invariably lead to unpredictable outcome.
- GLib.idle_add(disconnected_cb, self)
+ GLib.idle_add(disconnected_cb, self, True)
def _on_disconn_fail(self, op_obj, err, fail_cnt, disconnected_cb): # pylint: disable=unused-argument
logging.debug('Controller._on_disconn_fail() - %s | %s: %s', self.id, self.device, err)
@@ -331,4 +268,249 @@ class Controller: # pylint: disable=too-many-instance-attributes
# cannot be called directly as the current Controller object is in the
# process of being disconnected and the callback will in fact delete
# the object. This would invariably lead to unpredictable outcome.
- GLib.idle_add(disconnected_cb, self)
+ GLib.idle_add(disconnected_cb, self, False)
+
+
+# ******************************************************************************
+class Dc(Controller):
+ '''@brief This object establishes a connection to one Discover Controller (DC).
+ It retrieves the discovery log pages and caches them.
+ It also monitors udev events associated with that DC and updates
+ the cached discovery log pages accordingly.
+ '''
+
+ DLP_CHANGED = (
+ (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE
+ ) # 0x70f002
+ GET_LOG_PAGE_RETRY_RERIOD_SEC = 20
+ REGISTRATION_RETRY_RERIOD_SEC = 10
+
+ def __init__(self, staf, root, host, tid: trid.TID, log_pages=None): # pylint: disable=too-many-arguments
+ super().__init__(root, host, tid, discovery_ctrl=True)
+ self._staf = staf
+ self._register_op = None
+ self._get_log_op = None
+ self._log_pages = log_pages if log_pages else list() # Log pages cache
+
+ def _release_resources(self):
+ logging.debug('Dc._release_resources() - %s | %s', self.id, self.device)
+ super()._release_resources()
+ self._log_pages = list()
+ self._staf = None
+
+ def _kill_ops(self):
+ super()._kill_ops()
+ if self._get_log_op:
+ self._get_log_op.kill()
+ self._get_log_op = None
+ if self._register_op:
+ self._register_op.kill()
+ self._register_op = None
+
+ def info(self) -> dict:
+ '''@brief Get the controller info for this object'''
+ info = super().info()
+ if self._get_log_op:
+ info['get log page operation'] = self._get_log_op.as_dict()
+ if self._register_op:
+ info['register operation'] = self._register_op.as_dict()
+ return info
+
+ def cancel(self):
+ '''@brief Used to cancel pending operations.'''
+ super().cancel()
+ if self._get_log_op:
+ self._get_log_op.cancel()
+ if self._register_op:
+ self._register_op.cancel()
+
+ def log_pages(self) -> list:
+ '''@brief Get the cached log pages for this object'''
+ return self._log_pages
+
+ def referrals(self) -> list:
+ '''@brief Return the list of referrals'''
+ return [page for page in self._log_pages if page['subtype'] == 'referral']
+
+ def _on_aen(self, aen: int):
+ if aen == self.DLP_CHANGED and self._get_log_op:
+ self._get_log_op.run_async()
+
+ def _on_nvme_event(self, nvme_event: str):
+ if nvme_event == 'connected' and self._register_op:
+ self._register_op.run_async()
+
+ def _on_ctrl_removed(self, obj):
+ super()._on_ctrl_removed(obj)
+ if self._try_to_connect_deferred:
+ self._try_to_connect_deferred.schedule()
+
+ def _find_existing_connection(self):
+ return self._udev.find_nvme_dc_device(self.tid)
+
+ # --------------------------------------------------------------------------
+ def _on_connect_success(self, op_obj, data):
+ '''@brief Function called when we successfully connect to the
+ Discovery Controller.
+ '''
+ super()._on_connect_success(op_obj, data)
+
+ if self._alive():
+ if self._ctrl.is_registration_supported():
+ self._register_op = gutil.AsyncOperationWithRetry(
+ self._on_registration_success,
+ self._on_registration_fail,
+ self._ctrl.registration_ctlr,
+ nvme.NVMF_DIM_TAS_REGISTER,
+ )
+ self._register_op.run_async()
+ else:
+ self._get_log_op = gutil.AsyncOperationWithRetry(
+ self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover
+ )
+ self._get_log_op.run_async()
+
+ # --------------------------------------------------------------------------
+ def _on_registration_success(self, op_obj, data): # pylint: disable=unused-argument
+ '''@brief Function called when we successfully register with the
+ Discovery Controller. See self._register_op object
+ for details.
+ '''
+ if self._alive():
+ if data is not None:
+ logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data)
+ else:
+ logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device)
+ self._get_log_op = gutil.AsyncOperationWithRetry(
+ self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover
+ )
+ self._get_log_op.run_async()
+ else:
+ logging.debug(
+ 'Dc._on_registration_success() - %s | %s Received event on dead object.', self.id, self.device
+ )
+
+ def _on_registration_fail(self, op_obj, err, fail_cnt):
+ '''@brief Function called when we fail to register with the
+ Discovery Controller. See self._register_op object
+ for details.
+ '''
+ if self._alive():
+ logging.debug(
+ 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec',
+ self.id,
+ self.device,
+ err,
+ Dc.REGISTRATION_RETRY_RERIOD_SEC,
+ )
+ if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect
+ logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err)
+ # op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC)
+ else:
+ logging.debug(
+ 'Dc._on_registration_fail() - %s | %s Received event on dead object. %s',
+ self.id,
+ self.device,
+ err,
+ )
+ op_obj.kill()
+
+ # --------------------------------------------------------------------------
+ def _on_get_log_success(self, op_obj, data): # pylint: disable=unused-argument
+ '''@brief Function called when we successfully retrieve the log pages
+ from the Discovery Controller. See self._get_log_op object
+ for details.
+ '''
+ if self._alive():
+ # Note that for historical reasons too long to explain, the CDC may
+ # return invalid addresses ("0.0.0.0", "::", or ""). Those need to be
+ # filtered out.
+ referrals_before = self.referrals()
+ self._log_pages = (
+ [
+ {k: str(v) for k, v in dictionary.items()}
+ for dictionary in data
+ if dictionary.get('traddr') not in ('0.0.0.0', '::', '')
+ ]
+ if data
+ else list()
+ )
+ logging.info(
+ '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages)
+ )
+ referrals_after = self.referrals()
+ self._staf.log_pages_changed(self, self.device)
+ if referrals_after != referrals_before:
+ logging.debug(
+ 'Dc._on_get_log_success() - %s | %s Referrals before = %s',
+ self.id,
+ self.device,
+ referrals_before,
+ )
+ logging.debug(
+ 'Dc._on_get_log_success() - %s | %s Referrals after = %s',
+ self.id,
+ self.device,
+ referrals_after,
+ )
+ self._staf.referrals_changed()
+ else:
+ logging.debug(
+ 'Dc._on_get_log_success() - %s | %s Received event on dead object.', self.id, self.device
+ )
+
+ def _on_get_log_fail(self, op_obj, err, fail_cnt):
+ '''@brief Function called when we fail to retrieve the log pages
+ from the Discovery Controller. See self._get_log_op object
+ for details.
+ '''
+ if self._alive():
+ logging.debug(
+ 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec',
+ self.id,
+ self.device,
+ err,
+ Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC,
+ )
+ if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect
+ logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err)
+ op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC)
+ else:
+ logging.debug(
+ 'Dc._on_get_log_fail() - %s | %s Received event on dead object. %s',
+ self.id,
+ self.device,
+ err,
+ )
+ op_obj.kill()
+
+
+# ******************************************************************************
+class Ioc(Controller):
+ '''@brief This object establishes a connection to one I/O Controller.'''
+
+ def __init__(self, stac, root, host, tid: trid.TID):
+ self._stac = stac
+ super().__init__(root, host, tid)
+
+ def _release_resources(self):
+ super()._release_resources()
+ self._stac = None
+
+ def _on_ctrl_removed(self, obj):
+ '''Called when the associated nvme device (/dev/nvmeX) is removed
+ from the system.
+ '''
+ super()._on_ctrl_removed(obj)
+
+ # Defer removal of this object to the next main loop's idle period.
+ GLib.idle_add(self._stac.remove_controller, self, True)
+
+ def _find_existing_connection(self):
+ return self._udev.find_nvme_ioc_device(self.tid)
+
+ def _on_aen(self, aen: int):
+ pass
+
+ def _on_nvme_event(self, nvme_event):
+ pass
diff --git a/staslib/gutil.py b/staslib/gutil.py
index b302f3a..36ce2c7 100644
--- a/staslib/gutil.py
+++ b/staslib/gutil.py
@@ -104,8 +104,7 @@ class GTimer:
# ******************************************************************************
-class NameResolver:
- # pylint: disable=too-few-public-methods
+class NameResolver: # pylint: disable=too-few-public-methods
'''@brief DNS resolver to convert host names to IP addresses.'''
def __init__(self):
@@ -133,8 +132,10 @@ class NameResolver:
else:
logging.error('Cannot resolve traddr: %s', hostname)
- except GLib.GError:
- logging.error('Cannot resolve traddr: %s', hostname)
+ except GLib.GError as err:
+ # We don't need to report "cancellation" errors.
+ if not err.matches(Gio.io_error_quark(), Gio.IOErrorEnum.CANCELLED):
+ logging.error('Cannot resolve traddr: %s. %s', hostname, err.message) # pylint: disable=no-member
logging.debug('NameResolver.resolve_ctrl_async() - resolved \'%s\' -> %s', hostname, traddr)
controllers[indx]['traddr'] = traddr
diff --git a/staslib/log.py b/staslib/log.py
index c624978..9622e98 100644
--- a/staslib/log.py
+++ b/staslib/log.py
@@ -24,7 +24,7 @@ def init(syslog: bool):
if syslog:
try:
# Try journal logger first
- import systemd.journal # pylint: disable=redefined-outer-name,import-outside-toplevel
+ import systemd.journal # pylint: disable=import-outside-toplevel
handler = systemd.journal.JournalHandler(SYSLOG_IDENTIFIER=defs.PROG_NAME)
except ModuleNotFoundError:
@@ -32,9 +32,7 @@ def init(syslog: bool):
from logging.handlers import SysLogHandler # pylint: disable=import-outside-toplevel
handler = SysLogHandler(address="/dev/log")
- handler.setFormatter(
- logging.Formatter('{}: %(message)s'.format(defs.PROG_NAME)) # pylint: disable=consider-using-f-string
- )
+ handler.setFormatter(logging.Formatter(f'{defs.PROG_NAME}: %(message)s'))
else:
# Log to stdout
handler = logging.StreamHandler(stream=sys.stdout)
diff --git a/staslib/service.py b/staslib/service.py
index 556a9f9..a48e66d 100644
--- a/staslib/service.py
+++ b/staslib/service.py
@@ -9,248 +9,416 @@
'''This module defines the base Service object from
which the Staf and the Stac objects are derived.'''
-import os
-import signal
+import json
+import pickle
import logging
+import pathlib
import systemd.daemon
-import dasbus.connection
+import dasbus.error
+import dasbus.client.observer
+import dasbus.client.proxy
-from gi.repository import Gio, GLib
+from gi.repository import GLib
from libnvme import nvme
-from staslib import conf, ctrl, defs, gutil, log, stas, trid, udev
+from staslib import avahi, conf, ctrl, defs, gutil, stas, trid, udev
# ******************************************************************************
-class Service: # pylint: disable=too-many-instance-attributes
+class Service(stas.ServiceABC):
'''@brief Base class used to manage a STorage Appliance Service'''
def __init__(self, args, reload_hdlr):
-
sysconf = conf.SysConf()
self._root = nvme.root()
self._host = nvme.host(self._root, sysconf.hostnqn, sysconf.hostid, sysconf.hostsymname)
- service_conf = conf.SvcConf()
- service_conf.set_conf_file(args.conf_file) # reload configuration
- self._tron = args.tron or service_conf.tron
- log.set_level_from_tron(self._tron)
- self._root.log_level("debug" if self._tron else "err")
+ super().__init__(args, reload_hdlr)
- self._lkc_file = os.path.join(os.environ.get('RUNTIME_DIRECTORY', os.path.join('/run', defs.PROG_NAME)), 'last-known-config.pickle')
- self._loop = GLib.MainLoop()
- self._udev = udev.UDEV
- self._cancellable = Gio.Cancellable()
- self._resolver = gutil.NameResolver()
- self._controllers = self._load_last_known_config()
- self._dbus_iface = None
- self._cfg_soak_tmr = None
- self._sysbus = dasbus.connection.SystemMessageBus()
-
- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGINT, self._stop_hdlr) # CTRL-C
- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGTERM, self._stop_hdlr) # systemctl stop stafd
- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGHUP, reload_hdlr) # systemctl reload stafd
-
- nvme_options = conf.NvmeOptions()
- if not nvme_options.host_iface_supp or not nvme_options.discovery_supp:
- logging.warning(
- 'Kernel does not appear to support all the options needed to run this program. Consider updating to a later kernel version.'
- )
+ self._root.log_level("debug" if self._tron else "err")
def _release_resources(self):
logging.debug('Service._release_resources()')
+ super()._release_resources()
- if self._cancellable and not self._cancellable.is_cancelled():
- self._cancellable.cancel()
+ self._host = None
+ self._root = None
- if self._cfg_soak_tmr is not None:
- self._cfg_soak_tmr.kill()
+ @stas.ServiceABC.tron.setter
+ def tron(self, value):
+ '''@brief Set Trace ON property'''
+ super(__class__, self.__class__).tron.__set__(self, value)
+ self._root.log_level("debug" if self._tron else "err")
- self._controllers.clear()
- if self._sysbus:
- self._sysbus.disconnect()
+# ******************************************************************************
+def udev_rule_ctrl(enable):
+ '''@brief We add an empty udev rule to /run/udev/rules.d to suppress
+ nvme-cli's udev rule that is used to tell udevd to automatically
+ connect to I/O controller. This is to avoid race conditions between
+ stacd and udevd. This is configurable. See "udev-rule" in stacd.conf
+ for details.
+ '''
+ udev_rule_suppress = pathlib.Path('/run/udev/rules.d', '70-nvmf-autoconnect.rules')
+ if enable:
+ try:
+ udev_rule_suppress.unlink()
+ except FileNotFoundError:
+ pass
+ else:
+ if not udev_rule_suppress.exists():
+ pathlib.Path('/run/udev/rules.d').mkdir(parents=True, exist_ok=True)
+ udev_rule_suppress.symlink_to('/dev/null')
- self._cfg_soak_tmr = None
- self._cancellable = None
- self._resolver = None
- self._lkc_file = None
- self._sysbus = None
- self._udev = None
- def _config_dbus(self, iface_obj, bus_name: str, obj_name: str):
- self._dbus_iface = iface_obj
- self._sysbus.publish_object(obj_name, iface_obj)
- self._sysbus.register_service(bus_name)
+# ******************************************************************************
+class Stac(Service):
+ '''STorage Appliance Connector (STAC)'''
- @property
- def tron(self):
- '''@brief Get Trace ON property'''
- return self._tron
+ CONF_STABILITY_LONG_SOAK_TIME_SEC = 10 # pylint: disable=invalid-name
+ ADD_EVENT_SOAK_TIME_SEC = 1
- @tron.setter
- def tron(self, value): # pylint: disable=no-self-use
- '''@brief Set Trace ON property'''
- self._tron = value
- log.set_level_from_tron(self._tron)
- self._root.log_level("debug" if self._tron else "err")
+ def __init__(self, args, dbus):
+ super().__init__(args, self._reload_hdlr)
- def run(self):
- '''@brief Start the main loop execution'''
- try:
- self._loop.run()
- except Exception as ex: # pylint: disable=broad-except
- logging.critical('exception: %s', ex)
+ self._udev = udev.UDEV
- self._loop = None
+ self._add_event_soak_tmr = gutil.GTimer(self.ADD_EVENT_SOAK_TIME_SEC, self._on_add_event_soaked)
- def info(self) -> dict:
- '''@brief Get the status info for this object (used for debug)'''
- nvme_options = conf.NvmeOptions()
- return {
- 'last known config file': self._lkc_file,
- 'config soak timer': str(self._cfg_soak_tmr),
- 'kernel support': {
- 'TP8013': nvme_options.discovery_supp,
- 'host_iface': nvme_options.host_iface_supp,
- },
- 'system config': conf.SysConf().as_dict(),
- }
-
- def get_controllers(self):
- '''@brief return the list of controller objects'''
- return self._controllers.values()
-
- def get_controller(
- self, transport: str, traddr: str, trsvcid: str, host_traddr: str, host_iface: str, subsysnqn: str
- ): # pylint: disable=too-many-arguments
- '''@brief get the specified controller object from the list of controllers'''
- cid = {
- 'transport': transport,
- 'traddr': traddr,
- 'trsvcid': trsvcid,
- 'host-traddr': host_traddr,
- 'host-iface': host_iface,
- 'subsysnqn': subsysnqn,
- }
- return self._controllers.get(trid.TID(cid))
-
- def _remove_ctrl_from_dict(self, controller):
- tid_to_pop = controller.tid
- if not tid_to_pop:
- # Being paranoid. This should not happen, but let's say the
- # controller object has been purged, but it is somehow still
- # listed in self._controllers.
- for tid, _controller in self._controllers.items():
- if _controller is controller:
- tid_to_pop = tid
- break
-
- if tid_to_pop:
- logging.debug('Service._remove_ctrl_from_dict() - %s | %s', tid_to_pop, controller.device)
- self._controllers.pop(tid_to_pop, None)
- else:
- logging.debug('Service._remove_ctrl_from_dict() - already removed')
+ self._config_connections_audit()
- def remove_controller(self, controller):
- '''@brief remove the specified controller object from the list of controllers'''
- logging.debug('Service.remove_controller()')
- if isinstance(controller, ctrl.Controller):
- self._remove_ctrl_from_dict(controller)
+ # Create the D-Bus instance.
+ self._config_dbus(dbus, defs.STACD_DBUS_NAME, defs.STACD_DBUS_PATH)
- controller.kill()
+ # Connect to STAF D-Bus interface
+ self._staf = None
+ self._staf_watcher = dasbus.client.observer.DBusObserver(self._sysbus, defs.STAFD_DBUS_NAME)
+ self._staf_watcher.service_available.connect(self._connect_to_staf)
+ self._staf_watcher.service_unavailable.connect(self._disconnect_from_staf)
+ self._staf_watcher.connect_once_available()
- if self._cfg_soak_tmr:
- self._cfg_soak_tmr.start()
+ # Suppress udev rule to auto-connect when AEN is received.
+ udev_rule_ctrl(conf.SvcConf().udev_rule_enabled)
- def _cancel(self):
- logging.debug('Service._cancel()')
- if not self._cancellable.is_cancelled():
- self._cancellable.cancel()
+ def _release_resources(self):
+ logging.debug('Stac._release_resources()')
+
+ if self._add_event_soak_tmr:
+ self._add_event_soak_tmr.kill()
+
+ udev_rule_ctrl(True)
+
+ if self._udev:
+ self._udev.unregister_for_action_events('add')
+
+ self._destroy_staf_comlink(self._staf_watcher)
+ if self._staf_watcher is not None:
+ self._staf_watcher.disconnect()
- for controller in self._controllers.values():
- controller.cancel()
+ super()._release_resources()
+
+ self._udev = None
+ self._staf = None
+ self._staf_watcher = None
+ self._add_event_soak_tmr = None
+
+ def _audit_connections(self, tids):
+ '''A host should only connect to I/O controllers that have been zoned
+ for that host or a manual "controller" entry exists in stcd.conf.
+ A host should disconnect from an I/O controller when that I/O controller
+ is removed from the zone or a manual "controller" entry is removed from
+ stacd.conf. stacd will audit connections if "sticky-connections=disabled".
+ stacd will delete any connection that is not supposed to exist.
+ '''
+ logging.debug('Stac._audit_connections() - tids = %s', tids)
+ num_controllers = len(self._controllers)
+ for tid in tids:
+ if tid not in self._controllers:
+ self._controllers[tid] = ctrl.Ioc(self, self._root, self._host, tid)
+
+ if num_controllers != len(self._controllers):
+ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC)
+
+ def _on_add_event(self, udev_obj): # pylint: disable=unused-argument
+ '''@brief This function is called when a "add" event is received from
+ the kernel for an NVMe device. This is used to trigger an audit and make
+ sure that the connection to an I/O controller is allowed.
+
+ WARNING: There is a race condition with the "add" event from the kernel.
+ The kernel sends the "add" event a bit early and the sysfs attributes
+ associated with the nvme object are not always fully initialized.
+ To workaround this problem we use a soaking timer to give time for the
+ sysfs attributes to stabilize.
+ '''
+ self._add_event_soak_tmr.start()
+
+ def _on_add_event_soaked(self):
+ '''@brief After the add event has been soaking for ADD_EVENT_SOAK_TIME_SEC
+ seconds, we can audit the connections.
+ '''
+ if not conf.SvcConf().sticky_connections:
+ self._audit_connections(self._udev.get_nvme_ioc_tids())
+ return GLib.SOURCE_REMOVE
+
+ def _config_connections_audit(self):
+ '''This function checks the "sticky_connections" parameter to determine
+ whether audits should be performed. Audits are enabled when
+ "sticky_connections" is disabled.
+ '''
+ if not conf.SvcConf().sticky_connections:
+ if self._udev.get_registered_action_cback('add') is None:
+ self._udev.register_for_action_events('add', self._on_add_event)
+ self._audit_connections(self._udev.get_nvme_ioc_tids())
+ else:
+ self._udev.unregister_for_action_events('add')
def _keep_connections_on_exit(self):
'''@brief Determine whether connections should remain when the
process exits.
-
- NOTE) This is the base class method used to define the interface.
- It must be overloaded by a child class.
'''
- raise NotImplementedError()
+ return True
- def _stop_hdlr(self):
- systemd.daemon.notify('STOPPING=1')
+ def _reload_hdlr(self):
+ '''@brief Reload configuration file. This is triggered by the SIGHUP
+ signal, which can be sent with "systemctl reload stacd".
+ '''
+ systemd.daemon.notify('RELOADING=1')
+ service_cnf = conf.SvcConf()
+ service_cnf.reload()
+ self.tron = service_cnf.tron
+ self._config_connections_audit()
+ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC)
+ udev_rule_ctrl(service_cnf.udev_rule_enabled)
+ systemd.daemon.notify('READY=1')
+ return GLib.SOURCE_CONTINUE
+
+ def _get_log_pages_from_stafd(self):
+ if self._staf:
+ try:
+ return json.loads(self._staf.get_all_log_pages(True))
+ except dasbus.error.DBusError:
+ pass
+
+ return list()
- self._cancel() # Cancel pending operations
+ def _config_ctrls_finish(self, configured_ctrl_list):
+ configured_ctrl_list = [
+ ctrl_dict for ctrl_dict in configured_ctrl_list if 'traddr' in ctrl_dict and 'subsysnqn' in ctrl_dict
+ ]
+ logging.debug('Stac._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list)
+
+ discovered_ctrl_list = list()
+ for staf_data in self._get_log_pages_from_stafd():
+ host_traddr = staf_data['discovery-controller']['host-traddr']
+ host_iface = staf_data['discovery-controller']['host-iface']
+ for dlpe in staf_data['log-pages']:
+ if dlpe.get('subtype') == 'nvme': # eliminate discovery controllers
+ discovered_ctrl_list.append(stas.cid_from_dlpe(dlpe, host_traddr, host_iface))
+
+ logging.debug('Stac._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list)
+
+ controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list)
+ controllers = stas.remove_invalid_addresses(controllers)
+
+ new_controller_ids = {trid.TID(controller) for controller in controllers}
+ cur_controller_ids = set(self._controllers.keys())
+ controllers_to_add = new_controller_ids - cur_controller_ids
+ controllers_to_del = cur_controller_ids - new_controller_ids
+
+ logging.debug('Stac._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add))
+ logging.debug('Stac._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del))
+
+ for tid in controllers_to_del:
+ controller = self._controllers.pop(tid, None)
+ if controller is not None:
+ controller.disconnect(self.remove_controller, conf.SvcConf().sticky_connections)
+
+ for tid in controllers_to_add:
+ self._controllers[tid] = ctrl.Ioc(self, self._root, self._host, tid)
+
+ def _connect_to_staf(self, _):
+ '''@brief Hook up DBus signal handlers for signals from stafd.'''
+ try:
+ self._staf = self._sysbus.get_proxy(defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH)
+ self._staf.log_pages_changed.connect(self._log_pages_changed)
+ self._cfg_soak_tmr.start()
- self._dump_last_known_config(self._controllers)
+ # Make sure timer is set back to its normal value.
+ self._cfg_soak_tmr.set_timeout(self.CONF_STABILITY_SOAK_TIME_SEC)
+ logging.debug('Stac._connect_to_staf() - Connected to staf')
+ except dasbus.error.DBusError:
+ logging.error('Failed to connect to staf')
+
+ def _destroy_staf_comlink(self, watcher): # pylint: disable=unused-argument
+ if self._staf:
+ self._staf.log_pages_changed.disconnect(self._log_pages_changed)
+ dasbus.client.proxy.disconnect_proxy(self._staf)
+ self._staf = None
+
+ def _disconnect_from_staf(self, watcher):
+ self._destroy_staf_comlink(watcher)
+
+ # When we lose connectivity with stafd, the most logical explanation
+ # is that stafd restarted. In that case, it may take some time for stafd
+ # to re-populate its log pages cache. So let's give stafd plenty of time
+ # to update its log pages cache and send log pages change notifications
+ # before triggering a stacd re-config. We do this by momentarily
+ # increasing the config soak timer to a longer period.
+ if self._cfg_soak_tmr:
+ self._cfg_soak_tmr.set_timeout(self.CONF_STABILITY_LONG_SOAK_TIME_SEC)
+
+ logging.debug('Stac._disconnect_from_staf() - Disconnected from staf')
+
+ def _log_pages_changed( # pylint: disable=too-many-arguments
+ self, transport, traddr, trsvcid, host_traddr, host_iface, subsysnqn, device
+ ):
+ logging.debug(
+ 'Stac._log_pages_changed() - transport=%s, traddr=%s, trsvcid=%s, host_traddr=%s, host_iface=%s, subsysnqn=%s, device=%s',
+ transport,
+ traddr,
+ trsvcid,
+ host_traddr,
+ host_iface,
+ subsysnqn,
+ device,
+ )
+ if self._cfg_soak_tmr:
+ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC)
- if len(self._controllers) == 0:
- GLib.idle_add(self._exit)
- else:
- # Tell all controller objects to disconnect
- keep_connections = self._keep_connections_on_exit()
- controllers = self._controllers.values()
- for controller in controllers:
- controller.disconnect(self._on_final_disconnect, keep_connections)
+ def _load_last_known_config(self):
+ return dict()
- return GLib.SOURCE_REMOVE
+ def _dump_last_known_config(self, controllers):
+ pass
- def _on_final_disconnect(self, controller):
- '''Callback invoked after a controller is disconnected.
- THIS IS USED DURING PROCESS SHUTDOWN TO WAIT FOR ALL CONTROLLERS TO BE
- DISCONNECTED BEFORE EXITING THE PROGRAM. ONLY CALL ON SHUTDOWN!
- '''
- logging.debug('Service._on_final_disconnect()')
- self._remove_ctrl_from_dict(controller)
- controller.kill()
+# ******************************************************************************
+class Staf(Service):
+ '''STorage Appliance Finder (STAF)'''
- # When all controllers have disconnected, we can finish the clean up
- if len(self._controllers) == 0:
- # Defer exit to the next main loop's idle period.
- GLib.idle_add(self._exit)
+ def __init__(self, args, dbus):
+ super().__init__(args, self._reload_hdlr)
- def _exit(self):
- logging.debug('Service._exit()')
- self._release_resources()
- self._loop.quit()
+ self._avahi = avahi.Avahi(self._sysbus, self._avahi_change)
+ self._avahi.config_stypes(conf.SvcConf().get_stypes())
- def _on_config_ctrls(self, *_user_data):
- self._config_ctrls()
- return GLib.SOURCE_REMOVE
+ # Create the D-Bus instance.
+ self._config_dbus(dbus, defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH)
- def _config_ctrls(self):
- '''@brief Start controllers configuration.'''
- # The configuration file may contain controllers and/or blacklist
- # elements with traddr specified as hostname instead of IP address.
- # Because of this, we need to remove those blacklisted elements before
- # running name resolution. And we will need to remove blacklisted
- # elements after name resolution is complete (i.e. in the calback
- # function _config_ctrls_finish)
- logging.debug('Service._config_ctrls()')
- configured_controllers = stas.remove_blacklisted(conf.SvcConf().get_controllers())
- self._resolver.resolve_ctrl_async(self._cancellable, configured_controllers, self._config_ctrls_finish)
+ def info(self) -> dict:
+ '''@brief Get the status info for this object (used for debug)'''
+ info = super().info()
+ info['avahi'] = self._avahi.info()
+ return info
- def _config_ctrls_finish(self, configured_ctrl_list):
- '''@brief Finish controllers configuration after hostnames (if any)
- have been resolved.
-
- Configuring controllers must be done asynchronously in 2 steps.
- In the first step, host names get resolved to find their IP addresses.
- Name resolution can take a while, especially when an external name
- resolution server is used. Once that step completed, the callback
- method _config_ctrls_finish() (i.e. this method), gets invoked to
- complete the controller configuration.
-
- NOTE) This is the base class method used to define the interface.
- It must be overloaded by a child class.
- '''
- raise NotImplementedError()
+ def _release_resources(self):
+ logging.debug('Staf._release_resources()')
+ super()._release_resources()
+ if self._avahi:
+ self._avahi.kill()
+ self._avahi = None
def _load_last_known_config(self):
- raise NotImplementedError()
+ try:
+ with open(self._lkc_file, 'rb') as file:
+ config = pickle.load(file)
+ except (FileNotFoundError, AttributeError):
+ return dict()
+
+ logging.debug('Staf._load_last_known_config() - DC count = %s', len(config))
+ return {tid: ctrl.Dc(self, self._root, self._host, tid, log_pages) for tid, log_pages in config.items()}
def _dump_last_known_config(self, controllers):
- raise NotImplementedError()
+ try:
+ with open(self._lkc_file, 'wb') as file:
+ config = {tid: dc.log_pages() for tid, dc in controllers.items()}
+ logging.debug('Staf._dump_last_known_config() - DC count = %s', len(config))
+ pickle.dump(config, file)
+ except FileNotFoundError as ex:
+ logging.error('Unable to save last known config: %s', ex)
+
+ def _keep_connections_on_exit(self):
+ '''@brief Determine whether connections should remain when the
+ process exits.
+ '''
+ return conf.SvcConf().persistent_connections
+
+ def _reload_hdlr(self):
+ '''@brief Reload configuration file. This is triggered by the SIGHUP
+ signal, which can be sent with "systemctl reload stafd".
+ '''
+ systemd.daemon.notify('RELOADING=1')
+ service_cnf = conf.SvcConf()
+ service_cnf.reload()
+ self.tron = service_cnf.tron
+ self._avahi.kick_start() # Make sure Avahi is running
+ self._avahi.config_stypes(service_cnf.get_stypes())
+ self._cfg_soak_tmr.start()
+ systemd.daemon.notify('READY=1')
+ return GLib.SOURCE_CONTINUE
+
+ def log_pages_changed(self, controller, device):
+ '''@brief Function invoked when a controller's cached log pages
+ have changed. This will emit a D-Bus signal to inform
+ other applications that the cached log pages have changed.
+ '''
+ self._dbus_iface.log_pages_changed.emit(
+ controller.tid.transport,
+ controller.tid.traddr,
+ controller.tid.trsvcid,
+ controller.tid.host_traddr,
+ controller.tid.host_iface,
+ controller.tid.subsysnqn,
+ device,
+ )
+
+ def referrals_changed(self):
+ '''@brief Function invoked when a controller's cached referrals
+ have changed.
+ '''
+ logging.debug('Staf.referrals_changed()')
+ self._cfg_soak_tmr.start()
+
+ def _referrals(self) -> list:
+ return [
+ stas.cid_from_dlpe(dlpe, controller.tid.host_traddr, controller.tid.host_iface)
+ for controller in self.get_controllers()
+ for dlpe in controller.referrals()
+ ]
+
+ def _config_ctrls_finish(self, configured_ctrl_list):
+ '''@brief Finish discovery controllers configuration after
+ hostnames (if any) have been resolved.
+ '''
+ configured_ctrl_list = [
+ ctrl_dict
+ for ctrl_dict in configured_ctrl_list
+ if 'traddr' in ctrl_dict and ctrl_dict.setdefault('subsysnqn', defs.WELL_KNOWN_DISC_NQN)
+ ]
+
+ discovered_ctrl_list = self._avahi.get_controllers()
+ referral_ctrl_list = self._referrals()
+ logging.debug('Staf._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list)
+ logging.debug('Staf._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list)
+ logging.debug('Staf._config_ctrls_finish() - referral_ctrl_list = %s', referral_ctrl_list)
+
+ controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list + referral_ctrl_list)
+ controllers = stas.remove_invalid_addresses(controllers)
+
+ new_controller_ids = {trid.TID(controller) for controller in controllers}
+ cur_controller_ids = set(self._controllers.keys())
+ controllers_to_add = new_controller_ids - cur_controller_ids
+ controllers_to_del = cur_controller_ids - new_controller_ids
+
+ logging.debug('Staf._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add))
+ logging.debug('Staf._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del))
+
+ for tid in controllers_to_del:
+ controller = self._controllers.pop(tid, None)
+ if controller is not None:
+ controller.disconnect(self.remove_controller, conf.SvcConf().persistent_connections)
+
+ for tid in controllers_to_add:
+ self._controllers[tid] = ctrl.Dc(self, self._root, self._host, tid)
+
+ def _avahi_change(self):
+ self._cfg_soak_tmr.start()
diff --git a/staslib/stas.py b/staslib/stas.py
index 7bf91e0..496f063 100644
--- a/staslib/stas.py
+++ b/staslib/stas.py
@@ -6,14 +6,19 @@
#
# Authors: Martin Belanger <Martin.Belanger@dell.com>
#
-'''Library for staf/stac'''
+'''Library for staf/stac. You will find here common code for stafd and stacd
+including the Abstract Base Classes (ABC) for Controllers and Services'''
import os
import sys
-import ipaddress
+import abc
+import signal
import logging
-
-from staslib import conf, defs, trid
+import ipaddress
+import systemd.daemon
+import dasbus.connection
+from gi.repository import Gio, GLib
+from staslib import conf, defs, gutil, log, trid
# ******************************************************************************
@@ -108,3 +113,379 @@ def remove_invalid_addresses(controllers: list):
logging.warning('Invalid transport %s', transport)
return valid_controllers
+
+
+# ******************************************************************************
+class ControllerABC(abc.ABC): # pylint: disable=too-many-instance-attributes
+ '''@brief Base class used to manage the connection to a controller.'''
+
+ CONNECT_RETRY_PERIOD_SEC = 60
+ FAST_CONNECT_RETRY_PERIOD_SEC = 3
+
+ def __init__(self, root, host, tid: trid.TID, discovery_ctrl=False):
+ self._root = root
+ self._host = host
+ self._tid = tid
+ self._cancellable = Gio.Cancellable()
+ self._connect_attempts = 0
+ self._retry_connect_tmr = gutil.GTimer(self.CONNECT_RETRY_PERIOD_SEC, self._on_try_to_connect)
+ self._discovery_ctrl = discovery_ctrl
+ self._try_to_connect_deferred = gutil.Deferred(self._try_to_connect)
+ self._try_to_connect_deferred.schedule()
+
+ def _release_resources(self):
+ # Remove pending deferred from main loop
+ if self._try_to_connect_deferred:
+ self._try_to_connect_deferred.cancel()
+
+ if self._retry_connect_tmr is not None:
+ self._retry_connect_tmr.kill()
+
+ if self._cancellable and not self._cancellable.is_cancelled():
+ self._cancellable.cancel()
+
+ self._tid = None
+ self._cancellable = None
+ self._retry_connect_tmr = None
+ self._try_to_connect_deferred = None
+
+ @property
+ def id(self) -> str:
+ '''@brief Return the Transport ID as a printable string'''
+ return str(self.tid)
+
+ @property
+ def tid(self):
+ '''@brief Return the Transport ID object'''
+ return self._tid
+
+ def controller_id_dict(self) -> dict:
+ '''@brief return the controller ID as a dict.'''
+ return self.tid.as_dict()
+
+ def details(self) -> dict:
+ '''@brief return detailed debug info about this controller'''
+ details = self.controller_id_dict()
+ details['connect attempts'] = str(self._connect_attempts)
+ details['retry connect timer'] = str(self._retry_connect_tmr)
+ return details
+
+ def info(self) -> dict:
+ '''@brief Get the controller info for this object'''
+ return self.details()
+
+ def cancel(self):
+ '''@brief Used to cancel pending operations.'''
+ if self._cancellable and not self._cancellable.is_cancelled():
+ logging.debug('ControllerABC.cancel() - %s', self.id)
+ self._cancellable.cancel()
+
+ def kill(self):
+ '''@brief Used to release all resources associated with this object.'''
+ logging.debug('ControllerABC.kill() - %s', self.id)
+ self._release_resources()
+
+ def _alive(self):
+ '''There may be race condition where a queued event gets processed
+ after the object is no longer configured (i.e. alive). This method
+ can be used by callback functions to make sure the object is still
+ alive before processing further.
+ '''
+ return self._cancellable and not self._cancellable.is_cancelled()
+
+ def _on_try_to_connect(self):
+ self._try_to_connect_deferred.schedule()
+ return GLib.SOURCE_REMOVE
+
+ def _try_to_connect(self):
+ # This is a deferred function call. Make sure
+ # the source of the deferred is still good.
+ source = GLib.main_current_source()
+ if source and source.is_destroyed():
+ return
+
+ self._connect_attempts += 1
+
+ self._do_connect()
+
+ @abc.abstractmethod
+ def _do_connect(self):
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def _on_aen(self, aen: int):
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def _on_nvme_event(self, nvme_event):
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def _on_ctrl_removed(self, obj):
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def _find_existing_connection(self):
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def disconnect(self, disconnected_cb, keep_connection):
+ '''@brief Issue an asynchronous disconnect command to a Controller.
+ Once the async command has completed, the callback 'disconnected_cb'
+ will be invoked. If a controller is already disconnected, then the
+ callback will be added to the main loop's next idle slot to be executed
+ ASAP.
+ '''
+ raise NotImplementedError()
+
+
+# ******************************************************************************
+class ServiceABC(abc.ABC): # pylint: disable=too-many-instance-attributes
+ '''@brief Base class used to manage a STorage Appliance Service'''
+
+ CONF_STABILITY_SOAK_TIME_SEC = 1.5
+
+ def __init__(self, args, reload_hdlr):
+
+ service_conf = conf.SvcConf()
+ service_conf.set_conf_file(args.conf_file) # reload configuration
+ self._tron = args.tron or service_conf.tron
+ log.set_level_from_tron(self._tron)
+
+ self._lkc_file = os.path.join(os.environ.get('RUNTIME_DIRECTORY', os.path.join('/run', defs.PROG_NAME)), 'last-known-config.pickle')
+ self._loop = GLib.MainLoop()
+ self._cancellable = Gio.Cancellable()
+ self._resolver = gutil.NameResolver()
+ self._controllers = self._load_last_known_config()
+ self._dbus_iface = None
+ self._cfg_soak_tmr = gutil.GTimer(self.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls)
+ self._sysbus = dasbus.connection.SystemMessageBus()
+
+ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGINT, self._stop_hdlr) # CTRL-C
+ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGTERM, self._stop_hdlr) # systemctl stop stafd
+ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGHUP, reload_hdlr) # systemctl reload stafd
+
+ nvme_options = conf.NvmeOptions()
+ if not nvme_options.host_iface_supp or not nvme_options.discovery_supp:
+ logging.warning(
+ 'Kernel does not appear to support all the options needed to run this program. Consider updating to a later kernel version.'
+ )
+
+ # We don't want to apply configuration changes to nvme-cli right away.
+ # Often, multiple changes will occur in a short amount of time (sub-second).
+ # We want to wait until there are no more changes before applying them
+ # to the system. The following timer acts as a "soak period". Changes
+ # will be applied by calling self._on_config_ctrls() at the end of
+ # the soak period.
+ self._cfg_soak_tmr.start()
+
+ def _release_resources(self):
+ logging.debug('ServiceABC._release_resources()')
+
+ if self._cancellable and not self._cancellable.is_cancelled():
+ self._cancellable.cancel()
+
+ if self._cfg_soak_tmr is not None:
+ self._cfg_soak_tmr.kill()
+
+ self._controllers.clear()
+
+ if self._sysbus:
+ self._sysbus.disconnect()
+
+ self._cfg_soak_tmr = None
+ self._cancellable = None
+ self._resolver = None
+ self._lkc_file = None
+ self._sysbus = None
+
+ def _config_dbus(self, iface_obj, bus_name: str, obj_name: str):
+ self._dbus_iface = iface_obj
+ self._sysbus.publish_object(obj_name, iface_obj)
+ self._sysbus.register_service(bus_name)
+
+ @property
+ def tron(self):
+ '''@brief Get Trace ON property'''
+ return self._tron
+
+ @tron.setter
+ def tron(self, value):
+ '''@brief Set Trace ON property'''
+ self._tron = value
+ log.set_level_from_tron(self._tron)
+
+ def run(self):
+ '''@brief Start the main loop execution'''
+ try:
+ self._loop.run()
+ except Exception as ex: # pylint: disable=broad-except
+ logging.critical('exception: %s', ex)
+
+ self._loop = None
+
+ def info(self) -> dict:
+ '''@brief Get the status info for this object (used for debug)'''
+ nvme_options = conf.NvmeOptions()
+ return {
+ 'last known config file': self._lkc_file,
+ 'config soak timer': str(self._cfg_soak_tmr),
+ 'kernel support': {
+ 'TP8013': nvme_options.discovery_supp,
+ 'host_iface': nvme_options.host_iface_supp,
+ },
+ 'system config': conf.SysConf().as_dict(),
+ }
+
+ def get_controllers(self) -> dict:
+ '''@brief return the list of controller objects'''
+ return self._controllers.values()
+
+ def get_controller(
+ self, transport: str, traddr: str, trsvcid: str, host_traddr: str, host_iface: str, subsysnqn: str
+ ): # pylint: disable=too-many-arguments
+ '''@brief get the specified controller object from the list of controllers'''
+ cid = {
+ 'transport': transport,
+ 'traddr': traddr,
+ 'trsvcid': trsvcid,
+ 'host-traddr': host_traddr,
+ 'host-iface': host_iface,
+ 'subsysnqn': subsysnqn,
+ }
+ return self._controllers.get(trid.TID(cid))
+
+ def _remove_ctrl_from_dict(self, controller):
+ tid_to_pop = controller.tid
+ if not tid_to_pop:
+ # Being paranoid. This should not happen, but let's say the
+ # controller object has been purged, but it is somehow still
+ # listed in self._controllers.
+ for tid, _controller in self._controllers.items():
+ if _controller is controller:
+ tid_to_pop = tid
+ break
+
+ if tid_to_pop:
+ logging.debug('ServiceABC._remove_ctrl_from_dict()- %s | %s', tid_to_pop, controller.device)
+ self._controllers.pop(tid_to_pop, None)
+ else:
+ logging.debug('ServiceABC._remove_ctrl_from_dict()- already removed')
+
+ def remove_controller(self, controller, success): # pylint: disable=unused-argument
+ '''@brief remove the specified controller object from the list of controllers
+ @param controller: the controller object
+ @param success: whether the disconnect was successful'''
+ logging.debug('ServiceABC.remove_controller()')
+ if isinstance(controller, ControllerABC):
+ self._remove_ctrl_from_dict(controller)
+
+ controller.kill()
+
+ if self._cfg_soak_tmr:
+ self._cfg_soak_tmr.start()
+
+ def _cancel(self):
+ logging.debug('ServiceABC._cancel()')
+ if not self._cancellable.is_cancelled():
+ self._cancellable.cancel()
+
+ for controller in self._controllers.values():
+ controller.cancel()
+
+ def _stop_hdlr(self):
+ logging.debug('ServiceABC._stop_hdlr()')
+ systemd.daemon.notify('STOPPING=1')
+
+ self._cancel() # Cancel pending operations
+
+ self._dump_last_known_config(self._controllers)
+
+ if len(self._controllers) == 0:
+ GLib.idle_add(self._exit)
+ else:
+ # Tell all controller objects to disconnect
+ keep_connections = self._keep_connections_on_exit()
+ controllers = self._controllers.values()
+ logging.debug(
+ 'ServiceABC._stop_hdlr() - Controller count = %s, keep_connections = %s',
+ len(controllers), keep_connections
+ )
+ for controller in controllers:
+ controller.disconnect(self._on_final_disconnect, keep_connections)
+
+ return GLib.SOURCE_REMOVE
+
+ def _on_final_disconnect(self, controller, success):
+ '''Callback invoked after a controller is disconnected.
+ THIS IS USED DURING PROCESS SHUTDOWN TO WAIT FOR ALL CONTROLLERS TO BE
+ DISCONNECTED BEFORE EXITING THE PROGRAM. ONLY CALL ON SHUTDOWN!
+ @param controller: the controller object
+ @param success: whether the disconnect operation was successful
+ '''
+ logging.debug('ServiceABC._on_final_disconnect() - %s | %s disconnect %s',
+ controller.id, controller.device, 'succeeded' if success else 'failed')
+ self._remove_ctrl_from_dict(controller)
+
+ controller.kill()
+
+ # When all controllers have disconnected, we can finish the clean up
+ if len(self._controllers) == 0:
+ # Defer exit to the next main loop's idle period.
+ GLib.idle_add(self._exit)
+
+ def _exit(self):
+ logging.debug('ServiceABC._exit()')
+ self._release_resources()
+ self._loop.quit()
+
+ def _on_config_ctrls(self, *_user_data):
+ self._config_ctrls()
+ return GLib.SOURCE_REMOVE
+
+ def _config_ctrls(self):
+ '''@brief Start controllers configuration.'''
+ # The configuration file may contain controllers and/or blacklist
+ # elements with traddr specified as hostname instead of IP address.
+ # Because of this, we need to remove those blacklisted elements before
+ # running name resolution. And we will need to remove blacklisted
+ # elements after name resolution is complete (i.e. in the calback
+ # function _config_ctrls_finish)
+ logging.debug('ServiceABC._config_ctrls()')
+ configured_controllers = remove_blacklisted(conf.SvcConf().get_controllers())
+ self._resolver.resolve_ctrl_async(self._cancellable, configured_controllers, self._config_ctrls_finish)
+
+ @abc.abstractmethod
+ def _keep_connections_on_exit(self):
+ '''@brief Determine whether connections should remain when the
+ process exits.
+
+ NOTE) This is the base class method used to define the interface.
+ It must be overloaded by a child class.
+ '''
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def _config_ctrls_finish(self, configured_ctrl_list):
+ '''@brief Finish controllers configuration after hostnames (if any)
+ have been resolved.
+
+ Configuring controllers must be done asynchronously in 2 steps.
+ In the first step, host names get resolved to find their IP addresses.
+ Name resolution can take a while, especially when an external name
+ resolution server is used. Once that step completed, the callback
+ method _config_ctrls_finish() (i.e. this method), gets invoked to
+ complete the controller configuration.
+
+ NOTE) This is the base class method used to define the interface.
+ It must be overloaded by a child class.
+ '''
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def _load_last_known_config(self):
+ raise NotImplementedError()
+
+ @abc.abstractmethod
+ def _dump_last_known_config(self, controllers):
+ raise NotImplementedError()
diff --git a/staslib/trid.py b/staslib/trid.py
index def6ab2..38619e7 100644
--- a/staslib/trid.py
+++ b/staslib/trid.py
@@ -12,8 +12,7 @@ throughout nvme-stas to uniquely identify a Controller'''
import hashlib
from staslib import conf
-class TID:
- # pylint: disable=too-many-instance-attributes
+class TID: # pylint: disable=too-many-instance-attributes
'''Transport Identifier'''
RDMA_IP_PORT = '4420'
DISC_IP_PORT = '8009'
diff --git a/staslib/udev.py b/staslib/udev.py
index 29370b8..37b63cc 100644
--- a/staslib/udev.py
+++ b/staslib/udev.py
@@ -16,7 +16,7 @@ from staslib import defs, trid
try:
from pyudev.glib import MonitorObserver
except (ModuleNotFoundError, AttributeError):
- from staslib.glibudev import MonitorObserver # pylint: disable=relative-beyond-top-level,ungrouped-imports
+ from staslib.glibudev import MonitorObserver # pylint: disable=ungrouped-imports
# ******************************************************************************
class Udev:
@@ -99,7 +99,7 @@ class Udev:
def get_attributes(self, sys_name: str, attr_ids) -> dict:
'''@brief Get all the attributes associated with device @sys_name'''
attrs = {attr_id: '' for attr_id in attr_ids}
- if sys_name:
+ if sys_name and sys_name != 'nvme?':
udev = self.get_nvme_device(sys_name)
if udev is not None:
for attr_id in attr_ids:
diff --git a/test/test-config.py b/test/test-config.py
index dad0ebd..db58883 100755
--- a/test/test-config.py
+++ b/test/test-config.py
@@ -40,7 +40,7 @@ class StasProcessConfUnitTest(unittest.TestCase):
self.assertFalse(service_conf.data_digest)
self.assertTrue(service_conf.persistent_connections)
self.assertTrue(service_conf.udev_rule_enabled)
- self.assertFalse(service_conf.sticky_connections)
+ self.assertTrue(service_conf.sticky_connections)
self.assertFalse(service_conf.ignore_iface)
self.assertIn(6, service_conf.ip_family)
self.assertNotIn(4, service_conf.ip_family)
diff --git a/test/test-controller.py b/test/test-controller.py
index f23125e..f55781a 100755
--- a/test/test-controller.py
+++ b/test/test-controller.py
@@ -8,24 +8,43 @@ from pyfakefs.fake_filesystem_unittest import TestCase
LOOP = GLib.MainLoop()
+
+class TestController(ctrl.Controller):
+ def _find_existing_connection(self):
+ pass
+
+ def _on_aen(self, aen: int):
+ pass
+
+ def _on_nvme_event(self, nvme_event):
+ pass
+
+
class Test(TestCase):
'''Unit tests for class Controller'''
def setUp(self):
self.setUpPyfakefs()
- self.fs.create_file('/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n')
- self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n')
- self.fs.create_file('/dev/nvme-fabrics', contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n')
+ self.fs.create_file(
+ '/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n'
+ )
+ self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n')
+ self.fs.create_file(
+ '/dev/nvme-fabrics',
+ contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n',
+ )
- self.NVME_TID = trid.TID({
- 'transport': 'tcp',
- 'traddr': '10.10.10.10',
- 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8',
- 'trsvcid': '8009',
- 'host-traddr': '1.2.3.4',
- 'host-iface': 'wlp0s20f3',
- })
+ self.NVME_TID = trid.TID(
+ {
+ 'transport': 'tcp',
+ 'traddr': '10.10.10.10',
+ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8',
+ 'trsvcid': '8009',
+ 'host-traddr': '1.2.3.4',
+ 'host-iface': 'wlp0s20f3',
+ }
+ )
sysconf = conf.SysConf()
self.root = nvme.root()
@@ -34,32 +53,92 @@ class Test(TestCase):
def tearDown(self):
LOOP.quit()
+ def test_cannot_instantiate_concrete_classes_if_abstract_method_are_not_implemented(self):
+ # Make sure we can't instantiate the ABC directly (Abstract Base Class).
+ class Controller(ctrl.Controller):
+ pass
+
+ self.assertRaises(TypeError, lambda: ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID))
+
def test_get_device(self):
- controller = ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID)
+ controller = TestController(root=self.root, host=self.host, tid=self.NVME_TID)
self.assertEqual(controller._connect_attempts, 0)
- self.assertRaises(NotImplementedError, controller._try_to_connect)
+ controller._try_to_connect()
self.assertEqual(controller._connect_attempts, 1)
- self.assertRaises(NotImplementedError, controller._find_existing_connection)
- self.assertEqual(controller.id, "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)")
+ self.assertEqual(
+ controller.id, "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)"
+ )
# raise Exception(controller._connect_op)
- self.assertEqual(str(controller.tid), "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)")
- self.assertEqual(controller.device, '')
- self.assertEqual(str(controller.controller_id_dict()), "{'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': ''}")
- # self.assertEqual(controller.details(), "{'transport': 'tcp', 'traddr': '10.10.10.[265 chars]ff]'}")
- self.assertEqual(controller.info(), {'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': '', 'hostid': '', 'hostnqn': '', 'model': '', 'serial': '', 'connect attempts': '1', 'retry connect timer': '60.0s [off]'})
+ self.assertEqual(
+ str(controller.tid),
+ "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)",
+ )
+ self.assertEqual(controller.device, 'nvme?')
+ self.assertEqual(
+ str(controller.controller_id_dict()),
+ "{'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': 'nvme?'}",
+ )
+ self.assertEqual(
+ controller.details(),
+ {
+ 'dctype': '',
+ 'cntrltype': '',
+ 'transport': 'tcp',
+ 'traddr': '10.10.10.10',
+ 'trsvcid': '8009',
+ 'host-traddr': '1.2.3.4',
+ 'host-iface': 'wlp0s20f3',
+ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8',
+ 'device': 'nvme?',
+ 'connect attempts': '1',
+ 'retry connect timer': '60.0s [off]',
+ 'hostid': '',
+ 'hostnqn': '',
+ 'model': '',
+ 'serial': '',
+ },
+ )
+ self.assertEqual(
+ controller.info(),
+ {
+ 'dctype': '',
+ 'cntrltype': '',
+ 'transport': 'tcp',
+ 'traddr': '10.10.10.10',
+ 'trsvcid': '8009',
+ 'host-traddr': '1.2.3.4',
+ 'host-iface': 'wlp0s20f3',
+ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8',
+ 'device': 'nvme?',
+ 'connect attempts': '1',
+ 'retry connect timer': '60.0s [off]',
+ 'hostid': '',
+ 'hostnqn': '',
+ 'model': '',
+ 'serial': '',
+ 'connect operation': {'fail count': 0},
+ },
+ )
+
# print(controller._connect_op)
self.assertEqual(controller.cancel(), None)
self.assertEqual(controller.kill(), None)
# self.assertEqual(controller.disconnect(), 0)
def test_connect(self):
- controller = ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID)
+ controller = TestController(root=self.root, host=self.host, tid=self.NVME_TID)
self.assertEqual(controller._connect_attempts, 0)
- controller._find_existing_connection = lambda : None
+ controller._find_existing_connection = lambda: None
with self.assertLogs(logger=logging.getLogger(), level='DEBUG') as captured:
controller._try_to_connect()
self.assertEqual(len(captured.records), 1)
- self.assertTrue(captured.records[0].getMessage().startswith("Controller._try_to_connect() - (tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4) Connecting to nvme control with cfg={'hdr_digest': False, 'data_digest': False"))
+ self.assertTrue(
+ captured.records[0]
+ .getMessage()
+ .startswith(
+ "Controller._try_to_connect() - (tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4) Connecting to nvme control with cfg={'hdr_digest': False, 'data_digest': False"
+ )
+ )
self.assertEqual(controller._connect_attempts, 1)
diff --git a/test/test-service.py b/test/test-service.py
index 19f9b0c..4ce37be 100755
--- a/test/test-service.py
+++ b/test/test-service.py
@@ -4,6 +4,7 @@ import unittest
from staslib import service
from pyfakefs.fake_filesystem_unittest import TestCase
+
class Args:
def __init__(self):
self.tron = True
@@ -11,6 +12,20 @@ class Args:
self.conf_file = '/dev/null'
+class TestService(service.Service):
+ def _config_ctrls_finish(self, configured_ctrl_list):
+ pass
+
+ def _dump_last_known_config(self, controllers):
+ pass
+
+ def _keep_connections_on_exit(self):
+ pass
+
+ def _load_last_known_config(self):
+ return dict()
+
+
class Test(TestCase):
'''Unit tests for class Service'''
@@ -18,22 +33,39 @@ class Test(TestCase):
self.setUpPyfakefs()
os.environ['RUNTIME_DIRECTORY'] = "/run"
- self.fs.create_file('/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n')
- self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n')
- self.fs.create_file('/dev/nvme-fabrics', contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n')
+ self.fs.create_file(
+ '/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n'
+ )
+ self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n')
+ self.fs.create_file(
+ '/dev/nvme-fabrics',
+ contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n',
+ )
+
+ def test_cannot_instantiate_concrete_classes_if_abstract_method_are_not_implemented(self):
+ # Make sure we can't instantiate the ABC directly (Abstract Base Class).
+ class Service(service.Service):
+ pass
+
+ self.assertRaises(TypeError, lambda: Service(Args(), reload_hdlr=lambda x: x))
def test_get_controller(self):
- # FIXME: this is hack, fix it later
- service.Service._load_last_known_config = lambda x : dict()
- # start the test
-
- srv = service.Service(Args(), reload_hdlr=lambda x : x)
- self.assertRaises(NotImplementedError, srv._keep_connections_on_exit)
- self.assertRaises(NotImplementedError, srv._dump_last_known_config, [])
- self.assertRaises(NotImplementedError, srv._on_config_ctrls)
- #self.assertEqual(srv.get_controllers(), dict())
- self.assertEqual(srv.get_controller(transport='tcp', traddr='10.10.10.10', trsvcid='8009', host_traddr='1.2.3.4', host_iface='wlp0s20f3', subsysnqn='nqn.1988-11.com.dell:SFSS:2:20220208134025e8'), None)
- self.assertEqual(srv.remove_controller(controller=None), None)
+ srv = TestService(Args(), reload_hdlr=lambda x: x)
+
+ self.assertEqual(list(srv.get_controllers()), list())
+ self.assertEqual(
+ srv.get_controller(
+ transport='tcp',
+ traddr='10.10.10.10',
+ trsvcid='8009',
+ host_traddr='1.2.3.4',
+ host_iface='wlp0s20f3',
+ subsysnqn='nqn.1988-11.com.dell:SFSS:2:20220208134025e8',
+ ),
+ None,
+ )
+ self.assertEqual(srv.remove_controller(controller=None, success=True), None)
+
if __name__ == '__main__':
unittest.main()