diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d0b884 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +SOURCES/nvme-stas-1.1.6.tar.gz diff --git a/.nvme-stas.metadata b/.nvme-stas.metadata new file mode 100644 index 0000000..487f8a9 --- /dev/null +++ b/.nvme-stas.metadata @@ -0,0 +1 @@ +829d844d8ee2f797fdbf557be1815310cd37a1e3 SOURCES/nvme-stas-1.1.6.tar.gz diff --git a/SOURCES/0001-sync-with-1.1.6.patch b/SOURCES/0001-sync-with-1.1.6.patch new file mode 100644 index 0000000..c28df7f --- /dev/null +++ b/SOURCES/0001-sync-with-1.1.6.patch @@ -0,0 +1,3307 @@ +diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml +index 93604e8..4e1b6c5 100644 +--- a/.github/workflows/pylint.yml ++++ b/.github/workflows/pylint.yml +@@ -1,9 +1,19 @@ +-name: Pylint ++name: Linters + + on: [push] + + jobs: +- build: ++ ++ docker-lint: ++ runs-on: ubuntu-latest ++ steps: ++ - uses: actions/checkout@v3 ++ - uses: hadolint/hadolint-action@v2.1.0 ++ with: ++ recursive: true ++ ignore: DL3041 ++ ++ python-lint: + runs-on: ubuntu-latest + + strategy: +diff --git a/Dockerfile b/Dockerfile +index ad6742e..0ab5138 100644 +--- a/Dockerfile ++++ b/Dockerfile +@@ -2,12 +2,12 @@ FROM fedora:36 + + WORKDIR /root + +-# for nvme-stas +-RUN dnf install -y python3-dasbus python3-pyudev python3-systemd python3-gobject meson +-# for libnvme +-RUN dnf install -y git gcc g++ cmake openssl-devel libuuid-devel json-c-devel swig python-devel meson ++# first line for nvme-stas ++# second line for libnvme ++RUN dnf install -y python3-dasbus python3-pyudev python3-systemd python3-gobject meson \ ++ git gcc g++ cmake openssl-devel libuuid-devel json-c-devel swig python-devel meson && dnf clean all + + COPY . . 
+-RUN meson .build && ninja -C .build && cd .build && meson install ++RUN meson .build && ninja -C .build && meson install -C .build + + ENTRYPOINT ["python3"] +diff --git a/NEWS.md b/NEWS.md +index d1515cd..f56a7c9 100644 +--- a/NEWS.md ++++ b/NEWS.md +@@ -5,6 +5,7 @@ + - Fix issues with I/O controller connection audits + - Eliminate pcie devices from list of I/O controller connections to audit + - Add soaking timer to workaround race condition between kernel and user-space applications on "add" uevents. When the kernel adds a new nvme device (e.g. `/dev/nvme7`) and sends a "add" uevent to notify user-space applications, the attributes associated with that device (e.g. `/sys/class/nvme/nvme7/cntrltype`) may not be fully initialized which can lead `stacd` to dismiss a device that should get audited. ++- Make `sticky-connections=enabled` the default (see `stacd.conf`) + + ## Changes with release 1.1.5 + +@@ -32,7 +33,7 @@ stacd: Bug fix. Check that self._cfg_soak_tmr is not None before dereferencing i + + ## Changes with release 1.1.1 + +-Make `sticky-connections-disabled` by default ++Make `sticky-connections=disabled` the default (see `stacd.conf`) + + ## Changes with release 1.1 + +diff --git a/coverage.sh.in b/coverage.sh.in +index 96b8c53..5ba2ebe 100755 +--- a/coverage.sh.in ++++ b/coverage.sh.in +@@ -38,14 +38,24 @@ PRIMARY_GRP=$( id -ng ) + PRIMARY_USR=$( id -nu ) + PYTHON_PATH=.:./subprojects/libnvme + ++log() { ++ msg="$1" ++ printf "%b[1;36m%s%b[0m\n" "\0033" "${msg}" "\0033" ++ sudo logger -i "@@@@@ COVERAGE -" -p 4 "${msg}" ++} ++ + sd_stop() { +- unit="$1"-cov.service ++ app="$1" ++ unit="${app}"-cov.service ++ log "Stop ${app}" + sudo systemctl stop "${unit}" >/dev/null 2>&1 + sudo systemctl reset-failed "${unit}" >/dev/null 2>&1 + } + + sd_restart() { +- unit="$1"-cov.service ++ app="$1" ++ unit="${app}"-cov.service ++ log "Restart ${app}" + sudo systemctl restart "${unit}" >/dev/null 2>&1 + } + +@@ -61,7 +71,7 @@ sd_start() { + cmd="${app} --syslog 
-f ${conf}" + fi + +- printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start ${app}" "\0033" ++ log "Start ${app}" + + RUNTIME_DIRECTORY=/tmp/${app} + rm -rf ${RUNTIME_DIRECTORY} +@@ -75,7 +85,7 @@ reload_cfg() { + app="$1" + unit="${app}"-cov.service + pid=$( systemctl show --property MainPID --value "${unit}" ) +- printf "%b[1;36m%s%b[0m\n" "\0033" "Reload config ${app}" "\0033" ++ log "Reload config ${app}" + sudo kill -HUP "${pid}" + } + +@@ -83,15 +93,24 @@ if [ ! -d coverage ]; then + mkdir coverage + fi + ++ ++log "START-START-START-START-START-START-START-START-START-START-START-START" ++ ++ ++ + ################################################################################ + # Load nvme kernel module ++log "modprobe nvme-tcp" + sudo /usr/sbin/modprobe nvme-tcp + ++log "nvme disconnect-all" + sudo nvme disconnect-all + + ################################################################################ + # Create a dummy config file for @STAFD_PROCNAME@ +-stafd_conf_fname=$(mktemp /tmp/@STAFD_PROCNAME@.conf.XXXXXX) ++file=/tmp/@STAFD_PROCNAME@.conf.XXXXXX ++log "Create dummy config file $file" ++stafd_conf_fname=$(mktemp $file) + cat > "${stafd_conf_fname}" <<'EOF' + [Global] + tron=true +@@ -102,7 +121,9 @@ EOF + + ################################################################################ + # Create a dummy config file for @STACD_PROCNAME@ +-stacd_conf_fname=$(mktemp /tmp/@STACD_PROCNAME@.conf.XXXXXX) ++file=/tmp/@STACD_PROCNAME@.conf.XXXXXX ++log "Create dummy config file $file" ++stacd_conf_fname=$(mktemp $file) + cat > "${stacd_conf_fname}" <<'EOF' + [Global] + tron=true +@@ -111,6 +132,7 @@ udev-rule=disabled + sticky-connections=enabled + EOF + ++log "Stop & Mask Avahi daemon" + sudo systemctl stop avahi-daemon.service + sudo systemctl stop avahi-daemon.socket + sudo systemctl mask avahi-daemon.service +@@ -118,11 +140,11 @@ sudo systemctl mask avahi-daemon.socket + sleep 1 + + +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status while 
@STAFD_PROCNAME@ is not running" "\0033" ++log "Invoking @STAFD_CTLNAME@ status while @STAFD_PROCNAME@ is not running" + coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ ls >/dev/null 2>&1 + coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ invalid-command >/dev/null 2>&1 + +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ status while @STACD_PROCNAME@ is not running" "\0033" ++log "Invoking @STACD_CTLNAME@ status while @STACD_PROCNAME@ is not running" + coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls >/dev/null 2>&1 + coverage run --rcfile=.coveragerc @STACD_CTLNAME@ invalid-command >/dev/null 2>&1 + +@@ -132,30 +154,33 @@ sd_start "@STAFD_PROCNAME@" "@STAFD_DBUS_NAME@" "${stafd_conf_fname}" + sd_start "@STACD_PROCNAME@" "@STACD_DBUS_NAME@" "${stacd_conf_fname}" + sleep 3 + +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" ++log "Invoking @STAFD_CTLNAME@ status" + coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 + + reload_cfg "@STAFD_PROCNAME@" + sleep 1 + ++log "Restart Avahi daemon" + sudo systemctl unmask avahi-daemon.socket + sudo systemctl unmask avahi-daemon.service + sudo systemctl start avahi-daemon.socket + sudo systemctl start avahi-daemon.service + sleep 2 + ++log "Change stafd config: tron=true, persistent-connections=false, zeroconf=enable" + cat > "${stafd_conf_fname}" <<'EOF' + [Global] + tron=true + persistent-connections=false + + [Service Discovery] +-zeroconf=disabled ++zeroconf=enabled + EOF + reload_cfg "@STAFD_PROCNAME@" + + sleep 1 + ++log "Change stafd config: ip-family=ipv4, kato=10, adding multiple controllers" + cat > "${stafd_conf_fname}" <<'EOF' + [Global] + tron=true +@@ -172,11 +197,15 @@ controller=transport=tcp;traddr=abracadabra + controller= + controller=trsvcid + controller=transport=rdma;traddr=!@#$ ++controller=transport=fc;traddr=21:00:00:00:00:00:00:00;host-traddr=20:00:00:00:00:00:00:00 ++controller=transport=XM;traddr=2.2.2.2 + 
blacklist=transport=tcp;traddr=1.1.1.1 + blacklist=transport=tcp;traddr=1000.1000.1000.1000 + EOF + reload_cfg "@STAFD_PROCNAME@" + ++ ++log "Change stacd config: tron=true, udev-rule=disabled, sticky-connections=disabled" + cat > "${stacd_conf_fname}" <<'EOF' + [Global] + tron=true +@@ -186,12 +215,12 @@ EOF + reload_cfg "@STACD_PROCNAME@" + sleep 3 + +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" ++log "Invoking @STAFD_CTLNAME@ status" + coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 + + ################################################################################ + # Fake mDNS packets from a CDC +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start Avahi publisher" "\0033" ++log "Start Avahi publisher" + AVAHI_PUBLISHER=mdns_publisher.service + sudo systemctl stop ${AVAHI_PUBLISHER} >/dev/null 2>&1 + sudo systemctl reset-failed ${AVAHI_PUBLISHER} >/dev/null 2>&1 +@@ -200,7 +229,7 @@ sleep 1 + + ################################################################################ + # Start nvme target simulator +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Start nvmet" "\0033" ++log "Start nvmet" + sudo ../utils/nvmet/nvmet.py clean + sudo ../utils/nvmet/nvmet.py create -f ../utils/nvmet/nvmet.conf + sleep 2 +@@ -210,76 +239,76 @@ reload_cfg "@STACD_PROCNAME@" + sleep 3 + + ################################################################################ +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_PROCNAME@ --version" "\0033" ++log "Invoking @STAFD_PROCNAME@ --version" + coverage run --rcfile=.coveragerc @STAFD_PROCNAME@ --version +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_PROCNAME@ --idl" "\0033" ++log "Invoking @STAFD_PROCNAME@ --idl" + coverage run --rcfile=.coveragerc @STAFD_PROCNAME@ --idl /tmp/@STAFD_PROCNAME@.idl + +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_PROCNAME@ --version" "\0033" ++log "Invoking @STACD_PROCNAME@ --version" + coverage run --rcfile=.coveragerc @STACD_PROCNAME@ 
--version +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_PROCNAME@ --idl" "\0033" ++log "Invoking @STACD_PROCNAME@ --idl" + coverage run --rcfile=.coveragerc @STACD_PROCNAME@ --idl /tmp/@STACD_PROCNAME@.idl + + ################################################################################ + # Stimulate D-Bus activity +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ --version" "\0033" ++log "Invoking @STAFD_CTLNAME@ --version" + sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ --version +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ with a bad command" "\0033" ++log "Invoking @STAFD_CTLNAME@ with a bad command" + sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ blah +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ troff" "\0033" ++log "Invoking @STAFD_CTLNAME@ troff" + sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ troff +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ status" "\0033" ++log "Invoking @STAFD_CTLNAME@ status" + coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ status >/dev/null 2>&1 +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ tron" "\0033" ++log "Invoking @STAFD_CTLNAME@ tron" + sudo coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ tron +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ ls" "\0033" ++log "Invoking @STAFD_CTLNAME@ ls" + coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ ls -d >/dev/null 2>&1 +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ adlp" "\0033" ++log "Invoking @STAFD_CTLNAME@ adlp" + coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ adlp -d >/dev/null 2>&1 +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STAFD_CTLNAME@ dlp" "\0033" ++log "Invoking @STAFD_CTLNAME@ dlp" + coverage run --rcfile=.coveragerc @STAFD_CTLNAME@ dlp -t tcp -a ::1 -s 8009 >/dev/null 2>&1 + +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ --version" "\0033" ++log "Invoking @STACD_CTLNAME@ --version" + sudo 
coverage run --rcfile=.coveragerc @STACD_CTLNAME@ --version +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ with a bad command" "\0033" ++log "Invoking @STACD_CTLNAME@ with a bad command" + sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ blah +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ troff" "\0033" ++log "Invoking @STACD_CTLNAME@ troff" + sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ troff +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ status" "\0033" ++log "Invoking @STACD_CTLNAME@ status" + coverage run --rcfile=.coveragerc @STACD_CTLNAME@ status >/dev/null 2>&1 +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ tron" "\0033" ++log "Invoking @STACD_CTLNAME@ tron" + sudo coverage run --rcfile=.coveragerc @STACD_CTLNAME@ tron +-printf "%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" ++log "Invoking @STACD_CTLNAME@ ls" + coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 + + ################################################################################ + # Stimulate AENs activity by removing/restoring namespaces +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Remove namespace: klingons" "\0033" ++log "Remove namespace: klingons" + sudo ../utils/nvmet/nvmet.py unlink -p 1 -s klingons + sleep 2 +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" ++log "Invoking @STACD_CTLNAME@ ls" + coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 + +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Restore namespace: klingons" "\0033" ++log "Restore namespace: klingons" + sudo ../utils/nvmet/nvmet.py link -p 1 -s klingons + sleep 2 +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Invoking @STACD_CTLNAME@ ls" "\0033" ++log "Invoking @STACD_CTLNAME@ ls" + coverage run --rcfile=.coveragerc @STACD_CTLNAME@ ls -d >/dev/null 2>&1 + + ################################################################################ + # Stop Avahi Publisher +-printf 
"\n%b[1;36m%s%b[0m\n" "\0033" "Stop Avahi publisher" "\0033" ++log "Stop Avahi publisher" + sudo systemctl stop ${AVAHI_PUBLISHER} + sleep 1 + + ################################################################################ +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Restart Avahi publisher" "\0033" ++log "Restart Avahi publisher" + sudo systemd-run --unit=${AVAHI_PUBLISHER} --working-directory=. avahi-publish -s SFSS _nvme-disc._tcp 8009 "p=tcp" + sleep 2 + + ################################################################################ + # Make config changes for @STAFD_PROCNAME@ +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Empty configuration and disable zeroconf for @STAFD_PROCNAME@" "\0033" ++log "Empty configuration and disable zeroconf for @STAFD_PROCNAME@" + cat > "${stafd_conf_fname}" <<'EOF' + [Global] + tron=true +@@ -293,7 +322,7 @@ sleep 1 + + ################################################################################ + # Make more config changes for @STAFD_PROCNAME@ +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Add single controller (::1) and re-enable zeroconf for @STAFD_PROCNAME@" "\0033" ++log "Add single controller (::1) and re-enable zeroconf for @STAFD_PROCNAME@" + cat > "${stafd_conf_fname}" <<'EOF' + [Global] + tron=true +@@ -307,24 +336,23 @@ sleep 2 + + ################################################################################ + # Stop Avahi Publisher +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop Avahi publisher" "\0033" ++log "Stop Avahi publisher" + sudo systemctl stop ${AVAHI_PUBLISHER} + sleep 2 + + ################################################################################ + # Remove one of the NVMe device's +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Remove (disconnect) nvme1" "\0033" ++log "Remove (disconnect) nvme1" + sudo nvme disconnect -d nvme1 + sleep 2 + + + ################################################################################ +-printf "%b[1;36m%s%b[0m\n" "\0033" "Restart @STAFD_PROCNAME@ and @STACD_PROCNAME@" "\0033" 
+ sd_restart "@STAFD_PROCNAME@" + sd_restart "@STACD_PROCNAME@" + sleep 1 + +-printf "%b[1;36m%s%b[0m\n" "\0033" "Create invalid conditions for saving/loading @STAFD_PROCNAME@'s last known config" "\0033" ++log "Create invalid conditions for saving/loading @STAFD_PROCNAME@'s last known config" + rm -rf "/tmp/@STAFD_PROCNAME@" + sd_stop "@STAFD_PROCNAME@" + sd_restart "@STACD_PROCNAME@" +@@ -334,7 +362,7 @@ sleep 2 + + ################################################################################ + # Stop everything and collect coverage stats +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop @STAFD_PROCNAME@ and @STACD_PROCNAME@" "\0033" ++log "Stop @STAFD_PROCNAME@ and @STACD_PROCNAME@" + sd_stop "@STAFD_PROCNAME@" + sd_stop "@STACD_PROCNAME@" + sleep 1 +@@ -345,33 +373,49 @@ sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" coverage >/dev/null 2>&1 + sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" staslib/__pycache__ >/dev/null 2>&1 + sudo chown -R "${PRIMARY_USR}":"${PRIMARY_GRP}" subprojects/libnvme/libnvme/__pycache__ >/dev/null 2>&1 + ++log "nvme disconnect-all" + sudo nvme disconnect-all + ++log "Remove ${stafd_conf_fname} and ${stacd_conf_fname}" + rm "${stafd_conf_fname}" + rm "${stacd_conf_fname}" + ++log "Run unit test: test-udev" + PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-udev.py ++log "Run unit test: test-avahi" + PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-avahi.py ++log "Run unit test: test-gtimer" + PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-gtimer.py ++log "Run unit test: test-version" + PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-version.py ++log "Run unit test: test-transport_id" + PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-transport_id.py ++log "Run unit test: test-config" + PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-config.py ++log "Run unit test: test-controller" + 
PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-controller.py ++log "Run unit test: test-service" + PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-service.py ++log "Run unit test: test-log" + PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-log.py ++log "Run unit test: test-nvme_options" + sudo PYTHONPATH=${PYTHON_PATH} coverage run --rcfile=.coveragerc ../test/test-nvme_options.py + + ################################################################################ + # Stop nvme target simulator +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Stop nvmet" "\0033" ++log "Stop nvmet" + sudo ../utils/nvmet/nvmet.py clean + +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Collect all coverage data" "\0033" ++log "Collect all coverage data" + coverage combine --rcfile=.coveragerc + +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Generating coverage report" "\0033" ++log "Generating coverage report" + coverage report -i --rcfile=.coveragerc + +-printf "\n%b[1;36m%s%b[0m\n" "\0033" "Generating coverage report (HTML)" "\0033" ++log "Generating coverage report (HTML)" + coverage html -i --rcfile=.coveragerc + ++ ++log "All done!!!" ++ ++log "FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED-FINISHED" +diff --git a/doc/man/stacd.conf.xml b/doc/man/stacd.conf.xml +index 60622f6..65ee71a 100644 +--- a/doc/man/stacd.conf.xml ++++ b/doc/man/stacd.conf.xml +@@ -378,7 +378,7 @@ + entries in stacd.conf have been removed. 
+ + +- With <code>sticky-connections=disabled</code> (default) ++ With <code>sticky-connections=disabled</code> + + stacd immediately disconnects from + a previously connected IOC if the response to a +@@ -411,7 +411,7 @@ + + + +- With <code>sticky-connections=enabled</code> ++ With <code>sticky-connections=enabled (default)</code> + + stacd does not disconnect from IOCs + when a DPLE is removed or a controller= +diff --git a/etc/stas/stacd.conf b/etc/stas/stacd.conf +index 02e7b3e..0434671 100644 +--- a/etc/stas/stacd.conf ++++ b/etc/stas/stacd.conf +@@ -202,8 +202,8 @@ + # + # Type: String + # Range: [disabled, enabled] +-# Default: disabled +-#sticky-connections=disabled ++# Default: enabled ++#sticky-connections=enabled + + [Controllers] + # controller: I/O Controllers (IOC) are specified with this keyword. +diff --git a/stacd.py b/stacd.py +index 708e372..28cefac 100755 +--- a/stacd.py ++++ b/stacd.py +@@ -10,14 +10,12 @@ + ''' STorage Appliance Connector Daemon + ''' + import sys +-import logging + from argparse import ArgumentParser + from staslib import defs + +-# pylint: disable=consider-using-f-string +-DBUS_IDL = ''' ++DBUS_IDL = f''' + +- ++ + + + +@@ -34,19 +32,16 @@ DBUS_IDL = ''' + + + +- ++ + + +- ++ + + + +-''' % ( +- defs.STACD_DBUS_NAME, +- defs.STACD_DBUS_NAME, +-) +- ++''' + ++# ****************************************************************************** + def parse_args(conf_file: str): # pylint: disable=missing-function-docstring + parser = ArgumentParser( + description=f'{defs.STAC_DESCRIPTION} ({defs.STAC_ACRONYM}). Must be root to run this program.' 
+@@ -77,6 +72,12 @@ ARGS = parse_args(defs.STACD_CONFIG_FILE) + + if ARGS.version: + print(f'{defs.PROJECT_NAME} {defs.VERSION}') ++ try: ++ import libnvme ++ ++ print(f'libnvme {libnvme.__version__}') ++ except (AttributeError, ModuleNotFoundError): ++ pass + sys.exit(0) + + if ARGS.idl: +@@ -85,78 +86,14 @@ if ARGS.idl: + sys.exit(0) + + +-# There is a reason for having this import here and not at the top of the file. +-# We want to allow running stafd with the --version and --idl options and exit +-# without having to import stas. +-from staslib import stas # pylint: disable=wrong-import-position +- +-# Before going any further, make sure the script is allowed to run. +-stas.check_if_allowed_to_continue() +- +- +-################################################################################ +-# Preliminary checks have passed. Let her rip! +-# pylint: disable=wrong-import-position +-# pylint: disable=wrong-import-order +-import json +-import pathlib +-import systemd.daemon +-import dasbus.error +-import dasbus.client.observer +-import dasbus.client.proxy +-from gi.repository import GLib +-from staslib import conf, log, gutil, trid, udev, ctrl, service # pylint: disable=ungrouped-imports +- +-log.init(ARGS.syslog) +- +-UDEV_RULE_SUPPRESS = pathlib.Path('/run/udev/rules.d', '70-nvmf-autoconnect.rules') +- +- +-def udev_rule_ctrl(enable): +- '''@brief We add an empty udev rule to /run/udev/rules.d to suppress +- nvme-cli's udev rule that is used to tell udevd to automatically +- connect to I/O controller. This is to avoid race conditions between +- stacd and udevd. This is configurable. See "udev-rule" in stacd.conf +- for details. 
+- ''' +- if enable: +- try: +- UDEV_RULE_SUPPRESS.unlink() +- except FileNotFoundError: +- pass +- else: +- if not UDEV_RULE_SUPPRESS.exists(): +- pathlib.Path('/run/udev/rules.d').mkdir(parents=True, exist_ok=True) +- UDEV_RULE_SUPPRESS.symlink_to('/dev/null') +- +- + # ****************************************************************************** +-class Ioc(ctrl.Controller): +- '''@brief This object establishes a connection to one I/O Controller.''' +- +- def __init__(self, root, host, tid: trid.TID): +- super().__init__(root, host, tid) +- +- def _on_udev_remove(self, udev_obj): +- '''Called when the associated nvme device (/dev/nvmeX) is removed +- from the system. +- ''' +- super()._on_udev_remove(udev_obj) +- +- # Defer removal of this object to the next main loop's idle period. +- GLib.idle_add(STAC.remove_controller, self) +- +- def _find_existing_connection(self): +- return self._udev.find_nvme_ioc_device(self.tid) +- +- +-# ****************************************************************************** +-class Stac(service.Service): +- '''STorage Appliance Connector (STAC)''' ++if __name__ == '__main__': ++ import json ++ import logging ++ from staslib import log, service, stas, udev # pylint: disable=ungrouped-imports + +- CONF_STABILITY_SOAK_TIME_SEC = 1.5 +- CONF_STABILITY_LONG_SOAK_TIME_SEC = 10 # pylint: disable=invalid-name +- ADD_EVENT_SOAK_TIME_SEC = 1 ++ # Before going any further, make sure the script is allowed to run. ++ stas.check_if_allowed_to_continue() + + class Dbus: + '''This is the DBus interface that external programs can use to +@@ -205,229 +142,8 @@ class Stac(service.Service): + for controller in STAC.get_controllers() + ] + +- # ========================================================================== +- def __init__(self, args): +- super().__init__(args, self._reload_hdlr) +- +- # We don't want to apply configuration changes to nvme-cli right away. +- # Often, multiple changes will occur in a short amount of time (sub-second). 
+- # We want to wait until there are no more changes before applying them +- # to the system. The following timer acts as a "soak period". Changes +- # will be applied by calling self._on_config_ctrls() at the end of +- # the soak period. +- self._cfg_soak_tmr = gutil.GTimer(Stac.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) +- self._cfg_soak_tmr.start() +- +- self._add_event_soak_tmr = gutil.GTimer(Stac.ADD_EVENT_SOAK_TIME_SEC, self._on_add_event_soaked) +- +- self._config_connections_audit() +- +- # Create the D-Bus instance. +- self._config_dbus(Stac.Dbus(), defs.STACD_DBUS_NAME, defs.STACD_DBUS_PATH) +- +- # Connect to STAF D-Bus interface +- self._staf = None +- self._staf_watcher = dasbus.client.observer.DBusObserver(self._sysbus, defs.STAFD_DBUS_NAME) +- self._staf_watcher.service_available.connect(self._connect_to_staf) +- self._staf_watcher.service_unavailable.connect(self._disconnect_from_staf) +- self._staf_watcher.connect_once_available() +- +- # Suppress udev rule to auto-connect when AEN is received. +- udev_rule_ctrl(conf.SvcConf().udev_rule_enabled) +- +- def _release_resources(self): +- logging.debug('Stac._release_resources()') +- +- if self._add_event_soak_tmr: +- self._add_event_soak_tmr.kill() +- +- udev_rule_ctrl(True) +- +- if self._udev: +- self._udev.unregister_for_action_events('add') +- +- self._destroy_staf_comlink(self._staf_watcher) +- if self._staf_watcher is not None: +- self._staf_watcher.disconnect() +- +- super()._release_resources() +- +- self._staf = None +- self._staf_watcher = None +- self._add_event_soak_tmr = None +- +- def _audit_connections(self, tids): +- '''A host should only connect to I/O controllers that have been zoned +- for that host or a manual "controller" entry exists in stcd.conf. +- A host should disconnect from an I/O controller when that I/O controller +- is removed from the zone or a manual "controller" entry is removed from +- stacd.conf. 
stacd will audit connections if "sticky-connections=disabled". +- stacd will delete any connection that is not supposed to exist. +- ''' +- logging.debug('Stac._audit_connections() - tids = %s', tids) +- num_controllers = len(self._controllers) +- for tid in tids: +- if tid not in self._controllers: +- self._controllers[tid] = Ioc(self._root, self._host, tid) +- +- if num_controllers != len(self._controllers): +- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) +- +- def _on_add_event(self, udev_obj): # pylint: disable=unused-argument +- '''@brief This function is called when a "add" event is received from +- the kernel for an NVMe device. This is used to trigger an audit and make +- sure that the connection to an I/O controller is allowed. +- +- WARNING: There is a race condition with the "add" event from the kernel. +- The kernel sends the "add" event a bit early and the sysfs attributes +- associated with the nvme object are not always fully initialized. +- To workaround this problem we use a soaking timer to give time for the +- sysfs attributes to stabilize. +- ''' +- self._add_event_soak_tmr.start() +- +- def _on_add_event_soaked(self): +- '''@brief After the add event has been soaking for ADD_EVENT_SOAK_TIME_SEC +- seconds, we can audit the connections. +- ''' +- if not conf.SvcConf().sticky_connections: +- self._audit_connections(self._udev.get_nvme_ioc_tids()) +- return GLib.SOURCE_REMOVE +- +- def _config_connections_audit(self): +- '''This function checks the "sticky_connections" parameter to determine +- whether audits should be performed. Audits are enabled when +- "sticky_connections" is disabled. 
+- ''' +- if not conf.SvcConf().sticky_connections: +- if self._udev.get_registered_action_cback('add') is None: +- self._udev.register_for_action_events('add', self._on_add_event) +- self._audit_connections(self._udev.get_nvme_ioc_tids()) +- else: +- self._udev.unregister_for_action_events('add') +- +- def _keep_connections_on_exit(self): +- '''@brief Determine whether connections should remain when the +- process exits. +- ''' +- return True +- +- def _reload_hdlr(self): +- '''@brief Reload configuration file. This is triggered by the SIGHUP +- signal, which can be sent with "systemctl reload stacd". +- ''' +- systemd.daemon.notify('RELOADING=1') +- service_cnf = conf.SvcConf() +- service_cnf.reload() +- self.tron = service_cnf.tron +- self._config_connections_audit() +- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) +- udev_rule_ctrl(service_cnf.udev_rule_enabled) +- systemd.daemon.notify('READY=1') +- return GLib.SOURCE_CONTINUE +- +- def _get_log_pages_from_stafd(self): +- if self._staf: +- try: +- return json.loads(self._staf.get_all_log_pages(True)) +- except dasbus.error.DBusError: +- pass +- +- return list() +- +- def _config_ctrls_finish(self, configured_ctrl_list): +- configured_ctrl_list = [ +- ctrl_dict for ctrl_dict in configured_ctrl_list if 'traddr' in ctrl_dict and 'subsysnqn' in ctrl_dict +- ] +- logging.debug('Stac._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) +- +- discovered_ctrl_list = list() +- for staf_data in self._get_log_pages_from_stafd(): +- host_traddr = staf_data['discovery-controller']['host-traddr'] +- host_iface = staf_data['discovery-controller']['host-iface'] +- for dlpe in staf_data['log-pages']: +- if dlpe.get('subtype') == 'nvme': # eliminate discovery controllers +- discovered_ctrl_list.append(stas.cid_from_dlpe(dlpe, host_traddr, host_iface)) +- +- logging.debug('Stac._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) +- +- controllers = 
stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list) +- controllers = stas.remove_invalid_addresses(controllers) +- +- new_controller_ids = {trid.TID(controller) for controller in controllers} +- cur_controller_ids = set(self._controllers.keys()) +- controllers_to_add = new_controller_ids - cur_controller_ids +- controllers_to_del = cur_controller_ids - new_controller_ids +- +- logging.debug('Stac._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) +- logging.debug('Stac._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) +- +- for tid in controllers_to_del: +- controller = self._controllers.pop(tid, None) +- if controller is not None: +- controller.disconnect(self.remove_controller, conf.SvcConf().sticky_connections) +- +- for tid in controllers_to_add: +- self._controllers[tid] = Ioc(self._root, self._host, tid) +- +- def _connect_to_staf(self, _): +- '''@brief Hook up DBus signal handlers for signals from stafd.''' +- try: +- self._staf = self._sysbus.get_proxy(defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) +- self._staf.log_pages_changed.connect(self._log_pages_changed) +- self._cfg_soak_tmr.start() +- +- # Make sure timer is set back to its normal value. +- self._cfg_soak_tmr.set_timeout(Stac.CONF_STABILITY_SOAK_TIME_SEC) +- logging.debug('Stac._connect_to_staf() - Connected to staf') +- except dasbus.error.DBusError: +- logging.error('Failed to connect to staf') +- +- def _destroy_staf_comlink(self, watcher): # pylint: disable=unused-argument +- if self._staf: +- self._staf.log_pages_changed.disconnect(self._log_pages_changed) +- dasbus.client.proxy.disconnect_proxy(self._staf) +- self._staf = None +- +- def _disconnect_from_staf(self, watcher): +- self._destroy_staf_comlink(watcher) +- +- # When we lose connectivity with stafd, the most logical explanation +- # is that stafd restarted. In that case, it may take some time for stafd +- # to re-populate its log pages cache. 
So let's give stafd plenty of time +- # to update its log pages cache and send log pages change notifications +- # before triggering a stacd re-config. We do this by momentarily +- # increasing the config soak timer to a longer period. +- if self._cfg_soak_tmr: +- self._cfg_soak_tmr.set_timeout(Stac.CONF_STABILITY_LONG_SOAK_TIME_SEC) +- +- logging.debug('Stac._disconnect_from_staf() - Disconnected from staf') +- +- def _log_pages_changed( # pylint: disable=too-many-arguments +- self, transport, traddr, trsvcid, host_traddr, host_iface, subsysnqn, device +- ): +- logging.debug( +- 'Stac._log_pages_changed() - transport=%s, traddr=%s, trsvcid=%s, host_traddr=%s, host_iface=%s, subsysnqn=%s, device=%s', +- transport, +- traddr, +- trsvcid, +- host_traddr, +- host_iface, +- subsysnqn, +- device, +- ) +- self._cfg_soak_tmr.start(Stac.CONF_STABILITY_SOAK_TIME_SEC) +- +- def _load_last_known_config(self): +- return dict() +- +- def _dump_last_known_config(self, controllers): +- pass +- +- +-# ****************************************************************************** +-if __name__ == '__main__': +- STAC = Stac(ARGS) ++ log.init(ARGS.syslog) ++ STAC = service.Stac(ARGS, Dbus()) + STAC.run() + + STAC = None +diff --git a/stafd.py b/stafd.py +index aff64fd..8a77c51 100755 +--- a/stafd.py ++++ b/stafd.py +@@ -10,14 +10,12 @@ + ''' STorage Appliance Finder Daemon + ''' + import sys +-import logging + from argparse import ArgumentParser + from staslib import defs + +-# pylint: disable=consider-using-f-string +-DBUS_IDL = ''' ++DBUS_IDL = f''' + +- ++ + + + +@@ -34,10 +32,10 @@ DBUS_IDL = ''' + + + +- ++ + + +- ++ + + + +@@ -46,7 +44,7 @@ DBUS_IDL = ''' + + + +- ++ + + + +@@ -63,12 +61,10 @@ DBUS_IDL = ''' + + + +-''' % ( +- defs.STAFD_DBUS_NAME, +- defs.STAFD_DBUS_NAME, +-) ++''' + + ++# ****************************************************************************** + def parse_args(conf_file: str): # pylint: disable=missing-function-docstring + parser = ArgumentParser( + 
description=f'{defs.STAF_DESCRIPTION} ({defs.STAF_ACRONYM}). Must be root to run this program.' +@@ -99,6 +95,12 @@ ARGS = parse_args(defs.STAFD_CONFIG_FILE) + + if ARGS.version: + print(f'{defs.PROJECT_NAME} {defs.VERSION}') ++ try: ++ import libnvme ++ ++ print(f'libnvme {libnvme.__version__}') ++ except (AttributeError, ModuleNotFoundError): ++ pass + sys.exit(0) + + if ARGS.idl: +@@ -107,250 +109,15 @@ if ARGS.idl: + sys.exit(0) + + +-# There is a reason for having this import here and not at the top of the file. +-# We want to allow running stafd with the --version and --idl options and exit +-# without having to import stas and avahi. +-from staslib import stas, avahi # pylint: disable=wrong-import-position +- +-# Before going any further, make sure the script is allowed to run. +-stas.check_if_allowed_to_continue() +- +- +-################################################################################ +-# Preliminary checks have passed. Let her rip! +-# pylint: disable=wrong-import-position +-# pylint: disable=wrong-import-order +-import json +-import pickle +-import dasbus.server.interface +-import systemd.daemon +-from libnvme import nvme +-from gi.repository import GLib +-from staslib import conf, log, gutil, trid, udev, ctrl, service # pylint: disable=ungrouped-imports +- +-log.init(ARGS.syslog) +- +-DLP_CHANGED = ( +- (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE +-) # 0x70f002 +- +- + # ****************************************************************************** +-class Dc(ctrl.Controller): +- '''@brief This object establishes a connection to one Discover Controller (DC). +- It retrieves the discovery log pages and caches them. +- It also monitors udev events associated with that DC and updates +- the cached discovery log pages accordingly. 
+- ''' +- +- GET_LOG_PAGE_RETRY_RERIOD_SEC = 20 +- REGISTRATION_RETRY_RERIOD_SEC = 10 +- +- def __init__(self, root, host, tid: trid.TID, log_pages=None): +- super().__init__(root, host, tid, discovery_ctrl=True) +- self._register_op = None +- self._get_log_op = None +- self._log_pages = log_pages if log_pages else list() # Log pages cache +- +- def _release_resources(self): +- logging.debug('Dc._release_resources() - %s | %s', self.id, self.device) +- super()._release_resources() +- self._log_pages = list() +- +- def _kill_ops(self): +- super()._kill_ops() +- if self._get_log_op: +- self._get_log_op.kill() +- self._get_log_op = None +- if self._register_op: +- self._register_op.kill() +- self._register_op = None +- +- def info(self) -> dict: +- '''@brief Get the controller info for this object''' +- info = super().info() +- if self._get_log_op: +- info['get log page operation'] = self._get_log_op.as_dict() +- if self._register_op: +- info['register operation'] = self._register_op.as_dict() +- return info +- +- def cancel(self): +- '''@brief Used to cancel pending operations.''' +- super().cancel() +- if self._get_log_op: +- self._get_log_op.cancel() +- if self._register_op: +- self._register_op.cancel() +- +- def log_pages(self) -> list: +- '''@brief Get the cached log pages for this object''' +- return self._log_pages +- +- def referrals(self) -> list: +- '''@brief Return the list of referrals''' +- return [page for page in self._log_pages if page['subtype'] == 'referral'] +- +- def _on_aen(self, aen: int): +- super()._on_aen(aen) +- if aen == DLP_CHANGED and self._get_log_op: +- self._get_log_op.run_async() +- +- def _on_nvme_event(self, nvme_event: str): +- super()._on_nvme_event(nvme_event) +- if nvme_event == 'connected' and self._register_op: +- self._register_op.run_async() +- +- def _on_udev_remove(self, udev_obj): +- super()._on_udev_remove(udev_obj) +- if self._try_to_connect_deferred: +- self._try_to_connect_deferred.schedule() +- +- def 
_find_existing_connection(self): +- return self._udev.find_nvme_dc_device(self.tid) +- +- # -------------------------------------------------------------------------- +- def _on_connect_success(self, op_obj, data): +- '''@brief Function called when we successfully connect to the +- Discovery Controller. +- ''' +- super()._on_connect_success(op_obj, data) +- +- if self._alive(): +- if self._ctrl.is_registration_supported(): +- self._register_op = gutil.AsyncOperationWithRetry( +- self._on_registration_success, +- self._on_registration_fail, +- self._ctrl.registration_ctlr, +- nvme.NVMF_DIM_TAS_REGISTER, +- ) +- self._register_op.run_async() +- else: +- self._get_log_op = gutil.AsyncOperationWithRetry( +- self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover +- ) +- self._get_log_op.run_async() +- +- # -------------------------------------------------------------------------- +- def _on_registration_success(self, op_obj, data): # pylint: disable=unused-argument +- '''@brief Function called when we successfully register with the +- Discovery Controller. See self._register_op object +- for details. +- ''' +- if self._alive(): +- if data is not None: +- logging.warning('%s | %s - Registration error. %s.', self.id, self.device, data) +- else: +- logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device) +- self._get_log_op = gutil.AsyncOperationWithRetry( +- self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover +- ) +- self._get_log_op.run_async() +- else: +- logging.debug( +- 'Dc._on_registration_success() - %s | %s Received event on dead object.', self.id, self.device +- ) +- +- def _on_registration_fail(self, op_obj, err, fail_cnt): +- '''@brief Function called when we fail to register with the +- Discovery Controller. See self._register_op object +- for details. +- ''' +- if self._alive(): +- logging.debug( +- 'Dc._on_registration_fail() - %s | %s: %s. 
Retry in %s sec', +- self.id, +- self.device, +- err, +- Dc.REGISTRATION_RETRY_RERIOD_SEC, +- ) +- if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect +- logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err) +- # op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC) +- else: +- logging.debug( +- 'Dc._on_registration_fail() - %s | %s Received event on dead object. %s', +- self.id, +- self.device, +- err, +- ) +- op_obj.kill() +- +- # -------------------------------------------------------------------------- +- def _on_get_log_success(self, op_obj, data): # pylint: disable=unused-argument +- '''@brief Function called when we successfully retrieve the log pages +- from the Discovery Controller. See self._get_log_op object +- for details. +- ''' +- if self._alive(): +- # Note that for historical reasons too long to explain, the CDC may +- # return invalid addresses ("0.0.0.0", "::", or ""). Those need to be +- # filtered out. 
+- referrals_before = self.referrals() +- self._log_pages = ( +- [ +- {k: str(v) for k, v in dictionary.items()} +- for dictionary in data +- if dictionary.get('traddr') not in ('0.0.0.0', '::', '') +- ] +- if data +- else list() +- ) +- logging.info( +- '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages) +- ) +- referrals_after = self.referrals() +- STAF.log_pages_changed(self, self.device) +- if referrals_after != referrals_before: +- logging.debug( +- 'Dc._on_get_log_success() - %s | %s Referrals before = %s', +- self.id, +- self.device, +- referrals_before, +- ) +- logging.debug( +- 'Dc._on_get_log_success() - %s | %s Referrals after = %s', +- self.id, +- self.device, +- referrals_after, +- ) +- STAF.referrals_changed() +- else: +- logging.debug( +- 'Dc._on_get_log_success() - %s | %s Received event on dead object.', self.id, self.device +- ) +- +- def _on_get_log_fail(self, op_obj, err, fail_cnt): +- '''@brief Function called when we fail to retrieve the log pages +- from the Discovery Controller. See self._get_log_op object +- for details. +- ''' +- if self._alive(): +- logging.debug( +- 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec', +- self.id, +- self.device, +- err, +- Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC, +- ) +- if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect +- logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err) +- op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC) +- else: +- logging.debug( +- 'Dc._on_get_log_fail() - %s | %s Received event on dead object. 
%s', +- self.id, +- self.device, +- err, +- ) +- op_obj.kill() +- +- +-# ****************************************************************************** +-class Staf(service.Service): +- '''STorage Appliance Finder (STAF)''' ++if __name__ == '__main__': ++ import json ++ import logging ++ import dasbus.server.interface ++ from staslib import log, service, stas, udev # pylint: disable=ungrouped-imports + +- CONF_STABILITY_SOAK_TIME_SEC = 1.5 ++ # Before going any further, make sure the script is allowed to run. ++ stas.check_if_allowed_to_continue() + + class Dbus: + '''This is the DBus interface that external programs can use to +@@ -431,148 +198,8 @@ class Staf(service.Service): + for controller in STAF.get_controllers() + ] + +- # ========================================================================== +- def __init__(self, args): +- super().__init__(args, self._reload_hdlr) +- +- self._avahi = avahi.Avahi(self._sysbus, self._avahi_change) +- self._avahi.config_stypes(conf.SvcConf().get_stypes()) +- +- # We don't want to apply configuration changes to nvme-cli right away. +- # Often, multiple changes will occur in a short amount of time (sub-second). +- # We want to wait until there are no more changes before applying them +- # to the system. The following timer acts as a "soak period". Changes +- # will be applied by calling self._on_config_ctrls() at the end of +- # the soak period. +- self._cfg_soak_tmr = gutil.GTimer(Staf.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) +- self._cfg_soak_tmr.start() +- +- # Create the D-Bus instance. 
+- self._config_dbus(Staf.Dbus(), defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) +- +- def info(self) -> dict: +- '''@brief Get the status info for this object (used for debug)''' +- info = super().info() +- info['avahi'] = self._avahi.info() +- return info +- +- def _release_resources(self): +- logging.debug('Staf._release_resources()') +- super()._release_resources() +- if self._avahi: +- self._avahi.kill() +- self._avahi = None +- +- def _load_last_known_config(self): +- try: +- with open(self._lkc_file, 'rb') as file: +- config = pickle.load(file) +- except (FileNotFoundError, AttributeError): +- return dict() +- +- logging.debug('Staf._load_last_known_config() - DC count = %s', len(config)) +- return {tid: Dc(self._root, self._host, tid, log_pages) for tid, log_pages in config.items()} +- +- def _dump_last_known_config(self, controllers): +- try: +- with open(self._lkc_file, 'wb') as file: +- config = {tid: dc.log_pages() for tid, dc in controllers.items()} +- logging.debug('Staf._dump_last_known_config() - DC count = %s', len(config)) +- pickle.dump(config, file) +- except FileNotFoundError as ex: +- logging.error('Unable to save last known config: %s', ex) +- +- def _keep_connections_on_exit(self): +- '''@brief Determine whether connections should remain when the +- process exits. +- ''' +- return conf.SvcConf().persistent_connections +- +- def _reload_hdlr(self): +- '''@brief Reload configuration file. This is triggered by the SIGHUP +- signal, which can be sent with "systemctl reload stafd". 
+- ''' +- systemd.daemon.notify('RELOADING=1') +- service_cnf = conf.SvcConf() +- service_cnf.reload() +- self.tron = service_cnf.tron +- self._avahi.kick_start() # Make sure Avahi is running +- self._avahi.config_stypes(service_cnf.get_stypes()) +- self._cfg_soak_tmr.start() +- systemd.daemon.notify('READY=1') +- return GLib.SOURCE_CONTINUE +- +- def log_pages_changed(self, controller, device): +- '''@brief Function invoked when a controller's cached log pages +- have changed. This will emit a D-Bus signal to inform +- other applications that the cached log pages have changed. +- ''' +- self._dbus_iface.log_pages_changed.emit( +- controller.tid.transport, +- controller.tid.traddr, +- controller.tid.trsvcid, +- controller.tid.host_traddr, +- controller.tid.host_iface, +- controller.tid.subsysnqn, +- device, +- ) +- +- def referrals_changed(self): +- '''@brief Function invoked when a controller's cached referrals +- have changed. +- ''' +- logging.debug('Staf.referrals_changed()') +- self._cfg_soak_tmr.start() +- +- def _referrals(self) -> list: +- return [ +- stas.cid_from_dlpe(dlpe, controller.tid.host_traddr, controller.tid.host_iface) +- for controller in self.get_controllers() +- for dlpe in controller.referrals() +- ] +- +- def _config_ctrls_finish(self, configured_ctrl_list): +- '''@brief Finish discovery controllers configuration after +- hostnames (if any) have been resolved. 
+- ''' +- configured_ctrl_list = [ +- ctrl_dict +- for ctrl_dict in configured_ctrl_list +- if 'traddr' in ctrl_dict and ctrl_dict.setdefault('subsysnqn', defs.WELL_KNOWN_DISC_NQN) +- ] +- +- discovered_ctrl_list = self._avahi.get_controllers() +- referral_ctrl_list = self._referrals() +- logging.debug('Staf._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) +- logging.debug('Staf._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) +- logging.debug('Staf._config_ctrls_finish() - referral_ctrl_list = %s', referral_ctrl_list) +- +- controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list + referral_ctrl_list) +- controllers = stas.remove_invalid_addresses(controllers) +- +- new_controller_ids = {trid.TID(controller) for controller in controllers} +- cur_controller_ids = set(self._controllers.keys()) +- controllers_to_add = new_controller_ids - cur_controller_ids +- controllers_to_del = cur_controller_ids - new_controller_ids +- +- logging.debug('Staf._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) +- logging.debug('Staf._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) +- +- for tid in controllers_to_del: +- controller = self._controllers.pop(tid, None) +- if controller is not None: +- controller.disconnect(self.remove_controller, conf.SvcConf().persistent_connections) +- +- for tid in controllers_to_add: +- self._controllers[tid] = Dc(self._root, self._host, tid) +- +- def _avahi_change(self): +- self._cfg_soak_tmr.start() +- +- +-# ****************************************************************************** +-if __name__ == '__main__': +- STAF = Staf(ARGS) ++ log.init(ARGS.syslog) ++ STAF = service.Staf(ARGS, Dbus()) + STAF.run() + + STAF = None +diff --git a/staslib/avahi.py b/staslib/avahi.py +index 768bbf4..90a67c8 100644 +--- a/staslib/avahi.py ++++ b/staslib/avahi.py +@@ -172,9 +172,7 @@ class Avahi: # pylint: 
disable=too-many-instance-attributes + services = dict() + for service, obj in self._services.items(): + interface, protocol, name, stype, domain = service +- key = '({}, {}, {}.{}, {})'.format( # pylint: disable=consider-using-f-string +- socket.if_indextoname(interface), Avahi.protos.get(protocol, 'unknown'), name, domain, stype +- ) ++ key = f'({socket.if_indextoname(interface)}, {Avahi.protos.get(protocol, "unknown")}, {name}.{domain}, {stype})' + services[key] = obj.get('data', {}) + + info = { +@@ -316,7 +314,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes + _interface_name: str, + _signal_name: str, + args: typing.Tuple[int, int, str, str, str, int], +- *_user_data ++ *_user_data, + ): + (interface, protocol, name, stype, domain, flags) = args + logging.debug( +@@ -352,7 +350,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes + _interface_name: str, + _signal_name: str, + args: typing.Tuple[int, int, str, str, str, int], +- *_user_data ++ *_user_data, + ): + (interface, protocol, name, stype, domain, flags) = args + logging.debug( +@@ -386,7 +384,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes + _interface_name: str, + _signal_name: str, + args: typing.Tuple[int, int, str, str, str, str, int, str, int, list, int], +- *_user_data ++ *_user_data, + ): + (interface, protocol, name, stype, domain, host, aprotocol, address, port, txt, flags) = args + txt = _txt2dict(txt) +@@ -428,7 +426,7 @@ class Avahi: # pylint: disable=too-many-instance-attributes + interface_name: str, + _signal_name: str, + args: typing.Tuple[str], +- *_user_data ++ *_user_data, + ): + (error,) = args + if 'ServiceResolver' not in interface_name or 'TimeoutError' not in error: +diff --git a/staslib/conf.py b/staslib/conf.py +index 3f52e4f..c314a9e 100644 +--- a/staslib/conf.py ++++ b/staslib/conf.py +@@ -74,7 +74,7 @@ class SvcConf(metaclass=singleton.Singleton): + ('Global', 'ignore-iface'): 'false', + ('Global', 'ip-family'): 'ipv4+ipv6', + 
('Global', 'udev-rule'): 'enabled', +- ('Global', 'sticky-connections'): 'disabled', ++ ('Global', 'sticky-connections'): 'enabled', + ('Service Discovery', 'zeroconf'): 'enabled', + ('Controllers', 'controller'): list(), + ('Controllers', 'blacklist'): list(), +diff --git a/staslib/ctrl.py b/staslib/ctrl.py +index 5504baa..dbc1973 100644 +--- a/staslib/ctrl.py ++++ b/staslib/ctrl.py +@@ -10,69 +10,76 @@ + Dc (Discovery Controller) and Ioc (I/O Controller) objects are derived.''' + + import logging +-from gi.repository import Gio, GLib ++from gi.repository import GLib + from libnvme import nvme +-from staslib import conf, gutil, trid, udev ++from staslib import conf, gutil, trid, udev, stas + + + DC_KATO_DEFAULT = 30 # seconds + + + # ****************************************************************************** +-class Controller: # pylint: disable=too-many-instance-attributes ++class Controller(stas.ControllerABC): + '''@brief Base class used to manage the connection to a controller.''' + +- CONNECT_RETRY_PERIOD_SEC = 60 +- FAST_CONNECT_RETRY_PERIOD_SEC = 3 +- + def __init__(self, root, host, tid: trid.TID, discovery_ctrl=False): +- self._root = root +- self._host = host +- self._udev = udev.UDEV +- self._tid = tid +- self._cancellable = Gio.Cancellable() +- self._connect_op = None +- self._connect_attempts = 0 +- self._retry_connect_tmr = gutil.GTimer(Controller.CONNECT_RETRY_PERIOD_SEC, self._on_try_to_connect) +- self._device = None +- self._ctrl = None +- self._discovery_ctrl = discovery_ctrl +- self._try_to_connect_deferred = gutil.Deferred(self._try_to_connect) +- self._try_to_connect_deferred.schedule() ++ self._udev = udev.UDEV ++ self._device = None # Refers to the nvme device (e.g. 
/dev/nvme[n]) ++ self._ctrl = None # libnvme's nvme.ctrl object ++ self._connect_op = None ++ ++ super().__init__(root, host, tid, discovery_ctrl) + + def _release_resources(self): + logging.debug('Controller._release_resources() - %s', self.id) + +- # Remove pending deferred from main loop +- if self._try_to_connect_deferred: +- self._try_to_connect_deferred.cancel() +- self._try_to_connect_deferred = None +- + if self._udev: + self._udev.unregister_for_device_events(self._on_udev_notification) + +- if self._retry_connect_tmr is not None: +- self._retry_connect_tmr.kill() +- +- if self._cancellable and not self._cancellable.is_cancelled(): +- self._cancellable.cancel() +- + self._kill_ops() + +- self._tid = None ++ super()._release_resources() ++ + self._ctrl = None +- self._device = None +- self._retry_connect_tmr = None +- self._cancellable = None + self._udev = None + +- def _alive(self): +- '''There may be race condition where a queued event gets processed +- after the object is no longer configured (i.e. alive). This method +- can be used by callback functions to make sure the object is still +- alive before processing further. +- ''' +- return self._cancellable and not self._cancellable.is_cancelled() ++ @property ++ def device(self) -> str: ++ '''@brief return the Linux nvme device id (e.g. nvme3) or empty ++ string if no device is associated with this controller''' ++ if not self._device and self._ctrl and self._ctrl.name: ++ self._device = self._ctrl.name ++ ++ return self._device or 'nvme?' 
++ ++ def controller_id_dict(self) -> dict: ++ '''@brief return the controller ID as a dict.''' ++ cid = super().controller_id_dict() ++ cid['device'] = self.device ++ return cid ++ ++ def details(self) -> dict: ++ '''@brief return detailed debug info about this controller''' ++ details = super().details() ++ details.update( ++ self._udev.get_attributes(self.device, ++ ('hostid', 'hostnqn', 'model', ++ 'serial', 'dctype', 'cntrltype')) ++ ) ++ return details ++ ++ def info(self) -> dict: ++ '''@brief Get the controller info for this object''' ++ info = super().info() ++ if self._connect_op: ++ info['connect operation'] = self._connect_op.as_dict() ++ return info ++ ++ def cancel(self): ++ '''@brief Used to cancel pending operations.''' ++ super().cancel() ++ if self._connect_op: ++ self._connect_op.cancel() + + def _kill_ops(self): + if self._connect_op: +@@ -91,7 +98,7 @@ class Controller: # pylint: disable=too-many-instance-attributes + self._on_nvme_event(nvme_event) + elif udev_obj.action == 'remove': + logging.info('%s | %s - Received "remove" event', self.id, udev_obj.sys_name) +- self._on_udev_remove(udev_obj) ++ self._on_ctrl_removed(udev_obj) + else: + logging.debug( + 'Controller._on_udev_notification() - %s | %s - Received "%s" notification.', +@@ -108,33 +115,12 @@ class Controller: # pylint: disable=too-many-instance-attributes + udev_obj.sys_name, + ) + +- def _on_aen(self, aen: int): +- pass +- +- def _on_nvme_event(self, nvme_event): +- pass +- +- def _on_udev_remove(self, udev_obj): # pylint: disable=unused-argument ++ def _on_ctrl_removed(self, obj): # pylint: disable=unused-argument + self._udev.unregister_for_device_events(self._on_udev_notification) + self._kill_ops() # Kill all pending operations + self._ctrl = None + +- def _find_existing_connection(self): +- raise NotImplementedError() +- +- def _on_try_to_connect(self): +- self._try_to_connect_deferred.schedule() +- return GLib.SOURCE_REMOVE +- +- def _try_to_connect(self): +- # This is a 
deferred function call. Make sure +- # the source of the deferred is still good. +- source = GLib.main_current_source() +- if source and source.is_destroyed(): +- return +- +- self._connect_attempts += 1 +- ++ def _do_connect(self): + host_iface = ( + self.tid.host_iface + if (self.tid.host_iface and not conf.SvcConf().ignore_iface and conf.NvmeOptions().host_iface_supp) +@@ -164,7 +150,6 @@ class Controller: # pylint: disable=too-many-instance-attributes + self._on_connect_success, self._on_connect_fail, self._ctrl.init, self._host, int(udev_obj.sys_number) + ) + else: +- self._device = None + service_conf = conf.SvcConf() + cfg = { 'hdr_digest': service_conf.hdr_digest, + 'data_digest': service_conf.data_digest } +@@ -198,11 +183,10 @@ class Controller: # pylint: disable=too-many-instance-attributes + self._connect_op = None + + if self._alive(): +- if not self._device: +- self._device = self._ctrl.name ++ self._device = self._ctrl.name + logging.info('%s | %s - Connection established!', self.id, self.device) + self._connect_attempts = 0 +- self._udev.register_for_device_events(self.device, self._on_udev_notification) ++ self._udev.register_for_device_events(self._device, self._on_udev_notification) + else: + logging.debug( + 'Controller._on_connect_success() - %s | %s Received event on dead object. data=%s', +@@ -227,11 +211,11 @@ class Controller: # pylint: disable=too-many-instance-attributes + # the same time. This is perfectly fine, except that we may get a bogus + # failed to connect error. By doing a fast re-try, stacd can quickly + # verify that the connection was actually successful. +- self._retry_connect_tmr.set_timeout(Controller.FAST_CONNECT_RETRY_PERIOD_SEC) ++ self._retry_connect_tmr.set_timeout(self.FAST_CONNECT_RETRY_PERIOD_SEC) + elif self._connect_attempts == 2: + # If the fast connect re-try fails, then we can print a message to + # indicate the failure, and start a slow re-try period. 
+- self._retry_connect_tmr.set_timeout(Controller.CONNECT_RETRY_PERIOD_SEC) ++ self._retry_connect_tmr.set_timeout(self.CONNECT_RETRY_PERIOD_SEC) + logging.error('%s Failed to connect to controller. %s', self.id, getattr(err, 'message', err)) + + logging.debug( +@@ -248,53 +232,6 @@ class Controller: # pylint: disable=too-many-instance-attributes + getattr(err, 'message', err), + ) + +- @property +- def id(self) -> str: # pylint: disable=missing-function-docstring +- return str(self.tid) +- +- @property +- def tid(self): # pylint: disable=missing-function-docstring +- return self._tid +- +- @property +- def device(self) -> str: # pylint: disable=missing-function-docstring +- return self._device if self._device else '' +- +- def controller_id_dict(self) -> dict: +- '''@brief return the controller ID as a dict.''' +- cid = self.tid.as_dict() +- cid['device'] = self.device +- return cid +- +- def details(self) -> dict: +- '''@brief return detailed debug info about this controller''' +- details = self.controller_id_dict() +- details.update(self._udev.get_attributes(self.device, ('hostid', 'hostnqn', 'model', 'serial'))) +- details['connect attempts'] = str(self._connect_attempts) +- details['retry connect timer'] = str(self._retry_connect_tmr) +- return details +- +- def info(self) -> dict: +- '''@brief Get the controller info for this object''' +- info = self.details() +- if self._connect_op: +- info['connect operation'] = self._connect_op.as_dict() +- return info +- +- def cancel(self): +- '''@brief Used to cancel pending operations.''' +- if self._cancellable and not self._cancellable.is_cancelled(): +- logging.debug('Controller.cancel() - %s', self.id) +- self._cancellable.cancel() +- +- if self._connect_op: +- self._connect_op.cancel() +- +- def kill(self): +- '''@brief Used to release all resources associated with this object.''' +- logging.debug('Controller.kill() - %s', self.id) +- self._release_resources() +- + def disconnect(self, disconnected_cb, 
keep_connection): + '''@brief Issue an asynchronous disconnect command to a Controller. + Once the async command has completed, the callback 'disconnected_cb' +@@ -313,7 +250,7 @@ class Controller: # pylint: disable=too-many-instance-attributes + # cannot be called directly as the current Controller object is in the + # process of being disconnected and the callback will in fact delete + # the object. This would invariably lead to unpredictable outcome. +- GLib.idle_add(disconnected_cb, self) ++ GLib.idle_add(disconnected_cb, self, True) + + def _on_disconn_success(self, op_obj, data, disconnected_cb): # pylint: disable=unused-argument + logging.debug('Controller._on_disconn_success() - %s | %s', self.id, self.device) +@@ -322,7 +259,7 @@ class Controller: # pylint: disable=too-many-instance-attributes + # cannot be called directly as the current Controller object is in the + # process of being disconnected and the callback will in fact delete + # the object. This would invariably lead to unpredictable outcome. +- GLib.idle_add(disconnected_cb, self) ++ GLib.idle_add(disconnected_cb, self, True) + + def _on_disconn_fail(self, op_obj, err, fail_cnt, disconnected_cb): # pylint: disable=unused-argument + logging.debug('Controller._on_disconn_fail() - %s | %s: %s', self.id, self.device, err) +@@ -331,4 +268,249 @@ class Controller: # pylint: disable=too-many-instance-attributes + # cannot be called directly as the current Controller object is in the + # process of being disconnected and the callback will in fact delete + # the object. This would invariably lead to unpredictable outcome. +- GLib.idle_add(disconnected_cb, self) ++ GLib.idle_add(disconnected_cb, self, False) ++ ++ ++# ****************************************************************************** ++class Dc(Controller): ++ '''@brief This object establishes a connection to one Discover Controller (DC). ++ It retrieves the discovery log pages and caches them. 
++ It also monitors udev events associated with that DC and updates ++ the cached discovery log pages accordingly. ++ ''' ++ ++ DLP_CHANGED = ( ++ (nvme.NVME_LOG_LID_DISCOVER << 16) | (nvme.NVME_AER_NOTICE_DISC_CHANGED << 8) | nvme.NVME_AER_NOTICE ++ ) # 0x70f002 ++ GET_LOG_PAGE_RETRY_RERIOD_SEC = 20 ++ REGISTRATION_RETRY_RERIOD_SEC = 10 ++ ++ def __init__(self, staf, root, host, tid: trid.TID, log_pages=None): # pylint: disable=too-many-arguments ++ super().__init__(root, host, tid, discovery_ctrl=True) ++ self._staf = staf ++ self._register_op = None ++ self._get_log_op = None ++ self._log_pages = log_pages if log_pages else list() # Log pages cache ++ ++ def _release_resources(self): ++ logging.debug('Dc._release_resources() - %s | %s', self.id, self.device) ++ super()._release_resources() ++ self._log_pages = list() ++ self._staf = None ++ ++ def _kill_ops(self): ++ super()._kill_ops() ++ if self._get_log_op: ++ self._get_log_op.kill() ++ self._get_log_op = None ++ if self._register_op: ++ self._register_op.kill() ++ self._register_op = None ++ ++ def info(self) -> dict: ++ '''@brief Get the controller info for this object''' ++ info = super().info() ++ if self._get_log_op: ++ info['get log page operation'] = self._get_log_op.as_dict() ++ if self._register_op: ++ info['register operation'] = self._register_op.as_dict() ++ return info ++ ++ def cancel(self): ++ '''@brief Used to cancel pending operations.''' ++ super().cancel() ++ if self._get_log_op: ++ self._get_log_op.cancel() ++ if self._register_op: ++ self._register_op.cancel() ++ ++ def log_pages(self) -> list: ++ '''@brief Get the cached log pages for this object''' ++ return self._log_pages ++ ++ def referrals(self) -> list: ++ '''@brief Return the list of referrals''' ++ return [page for page in self._log_pages if page['subtype'] == 'referral'] ++ ++ def _on_aen(self, aen: int): ++ if aen == self.DLP_CHANGED and self._get_log_op: ++ self._get_log_op.run_async() ++ ++ def _on_nvme_event(self, 
nvme_event: str): ++ if nvme_event == 'connected' and self._register_op: ++ self._register_op.run_async() ++ ++ def _on_ctrl_removed(self, obj): ++ super()._on_ctrl_removed(obj) ++ if self._try_to_connect_deferred: ++ self._try_to_connect_deferred.schedule() ++ ++ def _find_existing_connection(self): ++ return self._udev.find_nvme_dc_device(self.tid) ++ ++ # -------------------------------------------------------------------------- ++ def _on_connect_success(self, op_obj, data): ++ '''@brief Function called when we successfully connect to the ++ Discovery Controller. ++ ''' ++ super()._on_connect_success(op_obj, data) ++ ++ if self._alive(): ++ if self._ctrl.is_registration_supported(): ++ self._register_op = gutil.AsyncOperationWithRetry( ++ self._on_registration_success, ++ self._on_registration_fail, ++ self._ctrl.registration_ctlr, ++ nvme.NVMF_DIM_TAS_REGISTER, ++ ) ++ self._register_op.run_async() ++ else: ++ self._get_log_op = gutil.AsyncOperationWithRetry( ++ self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover ++ ) ++ self._get_log_op.run_async() ++ ++ # -------------------------------------------------------------------------- ++ def _on_registration_success(self, op_obj, data): # pylint: disable=unused-argument ++ '''@brief Function called when we successfully register with the ++ Discovery Controller. See self._register_op object ++ for details. ++ ''' ++ if self._alive(): ++ if data is not None: ++ logging.warning('%s | %s - Registration error. 
%s.', self.id, self.device, data) ++ else: ++ logging.debug('Dc._on_registration_success() - %s | %s', self.id, self.device) ++ self._get_log_op = gutil.AsyncOperationWithRetry( ++ self._on_get_log_success, self._on_get_log_fail, self._ctrl.discover ++ ) ++ self._get_log_op.run_async() ++ else: ++ logging.debug( ++ 'Dc._on_registration_success() - %s | %s Received event on dead object.', self.id, self.device ++ ) ++ ++ def _on_registration_fail(self, op_obj, err, fail_cnt): ++ '''@brief Function called when we fail to register with the ++ Discovery Controller. See self._register_op object ++ for details. ++ ''' ++ if self._alive(): ++ logging.debug( ++ 'Dc._on_registration_fail() - %s | %s: %s. Retry in %s sec', ++ self.id, ++ self.device, ++ err, ++ Dc.REGISTRATION_RETRY_RERIOD_SEC, ++ ) ++ if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect ++ logging.error('%s | %s - Failed to register with Discovery Controller. %s', self.id, self.device, err) ++ # op_obj.retry(Dc.REGISTRATION_RETRY_RERIOD_SEC) ++ else: ++ logging.debug( ++ 'Dc._on_registration_fail() - %s | %s Received event on dead object. %s', ++ self.id, ++ self.device, ++ err, ++ ) ++ op_obj.kill() ++ ++ # -------------------------------------------------------------------------- ++ def _on_get_log_success(self, op_obj, data): # pylint: disable=unused-argument ++ '''@brief Function called when we successfully retrieve the log pages ++ from the Discovery Controller. See self._get_log_op object ++ for details. ++ ''' ++ if self._alive(): ++ # Note that for historical reasons too long to explain, the CDC may ++ # return invalid addresses ("0.0.0.0", "::", or ""). Those need to be ++ # filtered out. 
++ referrals_before = self.referrals() ++ self._log_pages = ( ++ [ ++ {k: str(v) for k, v in dictionary.items()} ++ for dictionary in data ++ if dictionary.get('traddr') not in ('0.0.0.0', '::', '') ++ ] ++ if data ++ else list() ++ ) ++ logging.info( ++ '%s | %s - Received discovery log pages (num records=%s).', self.id, self.device, len(self._log_pages) ++ ) ++ referrals_after = self.referrals() ++ self._staf.log_pages_changed(self, self.device) ++ if referrals_after != referrals_before: ++ logging.debug( ++ 'Dc._on_get_log_success() - %s | %s Referrals before = %s', ++ self.id, ++ self.device, ++ referrals_before, ++ ) ++ logging.debug( ++ 'Dc._on_get_log_success() - %s | %s Referrals after = %s', ++ self.id, ++ self.device, ++ referrals_after, ++ ) ++ self._staf.referrals_changed() ++ else: ++ logging.debug( ++ 'Dc._on_get_log_success() - %s | %s Received event on dead object.', self.id, self.device ++ ) ++ ++ def _on_get_log_fail(self, op_obj, err, fail_cnt): ++ '''@brief Function called when we fail to retrieve the log pages ++ from the Discovery Controller. See self._get_log_op object ++ for details. ++ ''' ++ if self._alive(): ++ logging.debug( ++ 'Dc._on_get_log_fail() - %s | %s: %s. Retry in %s sec', ++ self.id, ++ self.device, ++ err, ++ Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC, ++ ) ++ if fail_cnt == 1: # Throttle the logs. Only print the first time we fail to connect ++ logging.error('%s | %s - Failed to retrieve log pages. %s', self.id, self.device, err) ++ op_obj.retry(Dc.GET_LOG_PAGE_RETRY_RERIOD_SEC) ++ else: ++ logging.debug( ++ 'Dc._on_get_log_fail() - %s | %s Received event on dead object. 
%s', ++ self.id, ++ self.device, ++ err, ++ ) ++ op_obj.kill() ++ ++ ++# ****************************************************************************** ++class Ioc(Controller): ++ '''@brief This object establishes a connection to one I/O Controller.''' ++ ++ def __init__(self, stac, root, host, tid: trid.TID): ++ self._stac = stac ++ super().__init__(root, host, tid) ++ ++ def _release_resources(self): ++ super()._release_resources() ++ self._stac = None ++ ++ def _on_ctrl_removed(self, obj): ++ '''Called when the associated nvme device (/dev/nvmeX) is removed ++ from the system. ++ ''' ++ super()._on_ctrl_removed(obj) ++ ++ # Defer removal of this object to the next main loop's idle period. ++ GLib.idle_add(self._stac.remove_controller, self, True) ++ ++ def _find_existing_connection(self): ++ return self._udev.find_nvme_ioc_device(self.tid) ++ ++ def _on_aen(self, aen: int): ++ pass ++ ++ def _on_nvme_event(self, nvme_event): ++ pass +diff --git a/staslib/gutil.py b/staslib/gutil.py +index b302f3a..36ce2c7 100644 +--- a/staslib/gutil.py ++++ b/staslib/gutil.py +@@ -104,8 +104,7 @@ class GTimer: + + + # ****************************************************************************** +-class NameResolver: +- # pylint: disable=too-few-public-methods ++class NameResolver: # pylint: disable=too-few-public-methods + '''@brief DNS resolver to convert host names to IP addresses.''' + + def __init__(self): +@@ -133,8 +132,10 @@ class NameResolver: + else: + logging.error('Cannot resolve traddr: %s', hostname) + +- except GLib.GError: +- logging.error('Cannot resolve traddr: %s', hostname) ++ except GLib.GError as err: ++ # We don't need to report "cancellation" errors. ++ if not err.matches(Gio.io_error_quark(), Gio.IOErrorEnum.CANCELLED): ++ logging.error('Cannot resolve traddr: %s. 
%s', hostname, err.message) # pylint: disable=no-member + + logging.debug('NameResolver.resolve_ctrl_async() - resolved \'%s\' -> %s', hostname, traddr) + controllers[indx]['traddr'] = traddr +diff --git a/staslib/log.py b/staslib/log.py +index c624978..9622e98 100644 +--- a/staslib/log.py ++++ b/staslib/log.py +@@ -24,7 +24,7 @@ def init(syslog: bool): + if syslog: + try: + # Try journal logger first +- import systemd.journal # pylint: disable=redefined-outer-name,import-outside-toplevel ++ import systemd.journal # pylint: disable=import-outside-toplevel + + handler = systemd.journal.JournalHandler(SYSLOG_IDENTIFIER=defs.PROG_NAME) + except ModuleNotFoundError: +@@ -32,9 +32,7 @@ def init(syslog: bool): + from logging.handlers import SysLogHandler # pylint: disable=import-outside-toplevel + + handler = SysLogHandler(address="/dev/log") +- handler.setFormatter( +- logging.Formatter('{}: %(message)s'.format(defs.PROG_NAME)) # pylint: disable=consider-using-f-string +- ) ++ handler.setFormatter(logging.Formatter(f'{defs.PROG_NAME}: %(message)s')) + else: + # Log to stdout + handler = logging.StreamHandler(stream=sys.stdout) +diff --git a/staslib/service.py b/staslib/service.py +index 556a9f9..a48e66d 100644 +--- a/staslib/service.py ++++ b/staslib/service.py +@@ -9,248 +9,416 @@ + '''This module defines the base Service object from + which the Staf and the Stac objects are derived.''' + +-import os +-import signal ++import json ++import pickle + import logging ++import pathlib + import systemd.daemon +-import dasbus.connection ++import dasbus.error ++import dasbus.client.observer ++import dasbus.client.proxy + +-from gi.repository import Gio, GLib ++from gi.repository import GLib + from libnvme import nvme +-from staslib import conf, ctrl, defs, gutil, log, stas, trid, udev ++from staslib import avahi, conf, ctrl, defs, gutil, stas, trid, udev + + + # ****************************************************************************** +-class Service: # pylint: 
disable=too-many-instance-attributes ++class Service(stas.ServiceABC): + '''@brief Base class used to manage a STorage Appliance Service''' + + def __init__(self, args, reload_hdlr): +- + sysconf = conf.SysConf() + self._root = nvme.root() + self._host = nvme.host(self._root, sysconf.hostnqn, sysconf.hostid, sysconf.hostsymname) + +- service_conf = conf.SvcConf() +- service_conf.set_conf_file(args.conf_file) # reload configuration +- self._tron = args.tron or service_conf.tron +- log.set_level_from_tron(self._tron) +- self._root.log_level("debug" if self._tron else "err") ++ super().__init__(args, reload_hdlr) + +- self._lkc_file = os.path.join(os.environ.get('RUNTIME_DIRECTORY', os.path.join('/run', defs.PROG_NAME)), 'last-known-config.pickle') +- self._loop = GLib.MainLoop() +- self._udev = udev.UDEV +- self._cancellable = Gio.Cancellable() +- self._resolver = gutil.NameResolver() +- self._controllers = self._load_last_known_config() +- self._dbus_iface = None +- self._cfg_soak_tmr = None +- self._sysbus = dasbus.connection.SystemMessageBus() +- +- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGINT, self._stop_hdlr) # CTRL-C +- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGTERM, self._stop_hdlr) # systemctl stop stafd +- GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGHUP, reload_hdlr) # systemctl reload stafd +- +- nvme_options = conf.NvmeOptions() +- if not nvme_options.host_iface_supp or not nvme_options.discovery_supp: +- logging.warning( +- 'Kernel does not appear to support all the options needed to run this program. Consider updating to a later kernel version.' 
+- ) ++ self._root.log_level("debug" if self._tron else "err") + + def _release_resources(self): + logging.debug('Service._release_resources()') ++ super()._release_resources() + +- if self._cancellable and not self._cancellable.is_cancelled(): +- self._cancellable.cancel() ++ self._host = None ++ self._root = None + +- if self._cfg_soak_tmr is not None: +- self._cfg_soak_tmr.kill() ++ @stas.ServiceABC.tron.setter ++ def tron(self, value): ++ '''@brief Set Trace ON property''' ++ super(__class__, self.__class__).tron.__set__(self, value) ++ self._root.log_level("debug" if self._tron else "err") + +- self._controllers.clear() + +- if self._sysbus: +- self._sysbus.disconnect() ++# ****************************************************************************** ++def udev_rule_ctrl(enable): ++ '''@brief We add an empty udev rule to /run/udev/rules.d to suppress ++ nvme-cli's udev rule that is used to tell udevd to automatically ++ connect to I/O controller. This is to avoid race conditions between ++ stacd and udevd. This is configurable. See "udev-rule" in stacd.conf ++ for details. 
++ ''' ++ udev_rule_suppress = pathlib.Path('/run/udev/rules.d', '70-nvmf-autoconnect.rules') ++ if enable: ++ try: ++ udev_rule_suppress.unlink() ++ except FileNotFoundError: ++ pass ++ else: ++ if not udev_rule_suppress.exists(): ++ pathlib.Path('/run/udev/rules.d').mkdir(parents=True, exist_ok=True) ++ udev_rule_suppress.symlink_to('/dev/null') + +- self._cfg_soak_tmr = None +- self._cancellable = None +- self._resolver = None +- self._lkc_file = None +- self._sysbus = None +- self._udev = None + +- def _config_dbus(self, iface_obj, bus_name: str, obj_name: str): +- self._dbus_iface = iface_obj +- self._sysbus.publish_object(obj_name, iface_obj) +- self._sysbus.register_service(bus_name) ++# ****************************************************************************** ++class Stac(Service): ++ '''STorage Appliance Connector (STAC)''' + +- @property +- def tron(self): +- '''@brief Get Trace ON property''' +- return self._tron ++ CONF_STABILITY_LONG_SOAK_TIME_SEC = 10 # pylint: disable=invalid-name ++ ADD_EVENT_SOAK_TIME_SEC = 1 + +- @tron.setter +- def tron(self, value): # pylint: disable=no-self-use +- '''@brief Set Trace ON property''' +- self._tron = value +- log.set_level_from_tron(self._tron) +- self._root.log_level("debug" if self._tron else "err") ++ def __init__(self, args, dbus): ++ super().__init__(args, self._reload_hdlr) + +- def run(self): +- '''@brief Start the main loop execution''' +- try: +- self._loop.run() +- except Exception as ex: # pylint: disable=broad-except +- logging.critical('exception: %s', ex) ++ self._udev = udev.UDEV + +- self._loop = None ++ self._add_event_soak_tmr = gutil.GTimer(self.ADD_EVENT_SOAK_TIME_SEC, self._on_add_event_soaked) + +- def info(self) -> dict: +- '''@brief Get the status info for this object (used for debug)''' +- nvme_options = conf.NvmeOptions() +- return { +- 'last known config file': self._lkc_file, +- 'config soak timer': str(self._cfg_soak_tmr), +- 'kernel support': { +- 'TP8013': 
nvme_options.discovery_supp, +- 'host_iface': nvme_options.host_iface_supp, +- }, +- 'system config': conf.SysConf().as_dict(), +- } +- +- def get_controllers(self): +- '''@brief return the list of controller objects''' +- return self._controllers.values() +- +- def get_controller( +- self, transport: str, traddr: str, trsvcid: str, host_traddr: str, host_iface: str, subsysnqn: str +- ): # pylint: disable=too-many-arguments +- '''@brief get the specified controller object from the list of controllers''' +- cid = { +- 'transport': transport, +- 'traddr': traddr, +- 'trsvcid': trsvcid, +- 'host-traddr': host_traddr, +- 'host-iface': host_iface, +- 'subsysnqn': subsysnqn, +- } +- return self._controllers.get(trid.TID(cid)) +- +- def _remove_ctrl_from_dict(self, controller): +- tid_to_pop = controller.tid +- if not tid_to_pop: +- # Being paranoid. This should not happen, but let's say the +- # controller object has been purged, but it is somehow still +- # listed in self._controllers. +- for tid, _controller in self._controllers.items(): +- if _controller is controller: +- tid_to_pop = tid +- break +- +- if tid_to_pop: +- logging.debug('Service._remove_ctrl_from_dict() - %s | %s', tid_to_pop, controller.device) +- self._controllers.pop(tid_to_pop, None) +- else: +- logging.debug('Service._remove_ctrl_from_dict() - already removed') ++ self._config_connections_audit() + +- def remove_controller(self, controller): +- '''@brief remove the specified controller object from the list of controllers''' +- logging.debug('Service.remove_controller()') +- if isinstance(controller, ctrl.Controller): +- self._remove_ctrl_from_dict(controller) ++ # Create the D-Bus instance. 
++ self._config_dbus(dbus, defs.STACD_DBUS_NAME, defs.STACD_DBUS_PATH) + +- controller.kill() ++ # Connect to STAF D-Bus interface ++ self._staf = None ++ self._staf_watcher = dasbus.client.observer.DBusObserver(self._sysbus, defs.STAFD_DBUS_NAME) ++ self._staf_watcher.service_available.connect(self._connect_to_staf) ++ self._staf_watcher.service_unavailable.connect(self._disconnect_from_staf) ++ self._staf_watcher.connect_once_available() + +- if self._cfg_soak_tmr: +- self._cfg_soak_tmr.start() ++ # Suppress udev rule to auto-connect when AEN is received. ++ udev_rule_ctrl(conf.SvcConf().udev_rule_enabled) + +- def _cancel(self): +- logging.debug('Service._cancel()') +- if not self._cancellable.is_cancelled(): +- self._cancellable.cancel() ++ def _release_resources(self): ++ logging.debug('Stac._release_resources()') ++ ++ if self._add_event_soak_tmr: ++ self._add_event_soak_tmr.kill() ++ ++ udev_rule_ctrl(True) ++ ++ if self._udev: ++ self._udev.unregister_for_action_events('add') ++ ++ self._destroy_staf_comlink(self._staf_watcher) ++ if self._staf_watcher is not None: ++ self._staf_watcher.disconnect() + +- for controller in self._controllers.values(): +- controller.cancel() ++ super()._release_resources() ++ ++ self._udev = None ++ self._staf = None ++ self._staf_watcher = None ++ self._add_event_soak_tmr = None ++ ++ def _audit_connections(self, tids): ++ '''A host should only connect to I/O controllers that have been zoned ++ for that host or a manual "controller" entry exists in stcd.conf. ++ A host should disconnect from an I/O controller when that I/O controller ++ is removed from the zone or a manual "controller" entry is removed from ++ stacd.conf. stacd will audit connections if "sticky-connections=disabled". ++ stacd will delete any connection that is not supposed to exist. 
++ ''' ++ logging.debug('Stac._audit_connections() - tids = %s', tids) ++ num_controllers = len(self._controllers) ++ for tid in tids: ++ if tid not in self._controllers: ++ self._controllers[tid] = ctrl.Ioc(self, self._root, self._host, tid) ++ ++ if num_controllers != len(self._controllers): ++ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) ++ ++ def _on_add_event(self, udev_obj): # pylint: disable=unused-argument ++ '''@brief This function is called when a "add" event is received from ++ the kernel for an NVMe device. This is used to trigger an audit and make ++ sure that the connection to an I/O controller is allowed. ++ ++ WARNING: There is a race condition with the "add" event from the kernel. ++ The kernel sends the "add" event a bit early and the sysfs attributes ++ associated with the nvme object are not always fully initialized. ++ To workaround this problem we use a soaking timer to give time for the ++ sysfs attributes to stabilize. ++ ''' ++ self._add_event_soak_tmr.start() ++ ++ def _on_add_event_soaked(self): ++ '''@brief After the add event has been soaking for ADD_EVENT_SOAK_TIME_SEC ++ seconds, we can audit the connections. ++ ''' ++ if not conf.SvcConf().sticky_connections: ++ self._audit_connections(self._udev.get_nvme_ioc_tids()) ++ return GLib.SOURCE_REMOVE ++ ++ def _config_connections_audit(self): ++ '''This function checks the "sticky_connections" parameter to determine ++ whether audits should be performed. Audits are enabled when ++ "sticky_connections" is disabled. ++ ''' ++ if not conf.SvcConf().sticky_connections: ++ if self._udev.get_registered_action_cback('add') is None: ++ self._udev.register_for_action_events('add', self._on_add_event) ++ self._audit_connections(self._udev.get_nvme_ioc_tids()) ++ else: ++ self._udev.unregister_for_action_events('add') + + def _keep_connections_on_exit(self): + '''@brief Determine whether connections should remain when the + process exits. 
+- +- NOTE) This is the base class method used to define the interface. +- It must be overloaded by a child class. + ''' +- raise NotImplementedError() ++ return True + +- def _stop_hdlr(self): +- systemd.daemon.notify('STOPPING=1') ++ def _reload_hdlr(self): ++ '''@brief Reload configuration file. This is triggered by the SIGHUP ++ signal, which can be sent with "systemctl reload stacd". ++ ''' ++ systemd.daemon.notify('RELOADING=1') ++ service_cnf = conf.SvcConf() ++ service_cnf.reload() ++ self.tron = service_cnf.tron ++ self._config_connections_audit() ++ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) ++ udev_rule_ctrl(service_cnf.udev_rule_enabled) ++ systemd.daemon.notify('READY=1') ++ return GLib.SOURCE_CONTINUE ++ ++ def _get_log_pages_from_stafd(self): ++ if self._staf: ++ try: ++ return json.loads(self._staf.get_all_log_pages(True)) ++ except dasbus.error.DBusError: ++ pass ++ ++ return list() + +- self._cancel() # Cancel pending operations ++ def _config_ctrls_finish(self, configured_ctrl_list): ++ configured_ctrl_list = [ ++ ctrl_dict for ctrl_dict in configured_ctrl_list if 'traddr' in ctrl_dict and 'subsysnqn' in ctrl_dict ++ ] ++ logging.debug('Stac._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) ++ ++ discovered_ctrl_list = list() ++ for staf_data in self._get_log_pages_from_stafd(): ++ host_traddr = staf_data['discovery-controller']['host-traddr'] ++ host_iface = staf_data['discovery-controller']['host-iface'] ++ for dlpe in staf_data['log-pages']: ++ if dlpe.get('subtype') == 'nvme': # eliminate discovery controllers ++ discovered_ctrl_list.append(stas.cid_from_dlpe(dlpe, host_traddr, host_iface)) ++ ++ logging.debug('Stac._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) ++ ++ controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list) ++ controllers = stas.remove_invalid_addresses(controllers) ++ ++ new_controller_ids = {trid.TID(controller) for controller in 
controllers} ++ cur_controller_ids = set(self._controllers.keys()) ++ controllers_to_add = new_controller_ids - cur_controller_ids ++ controllers_to_del = cur_controller_ids - new_controller_ids ++ ++ logging.debug('Stac._config_ctrls_finish() - controllers_to_add = %s', list(controllers_to_add)) ++ logging.debug('Stac._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) ++ ++ for tid in controllers_to_del: ++ controller = self._controllers.pop(tid, None) ++ if controller is not None: ++ controller.disconnect(self.remove_controller, conf.SvcConf().sticky_connections) ++ ++ for tid in controllers_to_add: ++ self._controllers[tid] = ctrl.Ioc(self, self._root, self._host, tid) ++ ++ def _connect_to_staf(self, _): ++ '''@brief Hook up DBus signal handlers for signals from stafd.''' ++ try: ++ self._staf = self._sysbus.get_proxy(defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) ++ self._staf.log_pages_changed.connect(self._log_pages_changed) ++ self._cfg_soak_tmr.start() + +- self._dump_last_known_config(self._controllers) ++ # Make sure timer is set back to its normal value. ++ self._cfg_soak_tmr.set_timeout(self.CONF_STABILITY_SOAK_TIME_SEC) ++ logging.debug('Stac._connect_to_staf() - Connected to staf') ++ except dasbus.error.DBusError: ++ logging.error('Failed to connect to staf') ++ ++ def _destroy_staf_comlink(self, watcher): # pylint: disable=unused-argument ++ if self._staf: ++ self._staf.log_pages_changed.disconnect(self._log_pages_changed) ++ dasbus.client.proxy.disconnect_proxy(self._staf) ++ self._staf = None ++ ++ def _disconnect_from_staf(self, watcher): ++ self._destroy_staf_comlink(watcher) ++ ++ # When we lose connectivity with stafd, the most logical explanation ++ # is that stafd restarted. In that case, it may take some time for stafd ++ # to re-populate its log pages cache. So let's give stafd plenty of time ++ # to update its log pages cache and send log pages change notifications ++ # before triggering a stacd re-config. 
We do this by momentarily ++ # increasing the config soak timer to a longer period. ++ if self._cfg_soak_tmr: ++ self._cfg_soak_tmr.set_timeout(self.CONF_STABILITY_LONG_SOAK_TIME_SEC) ++ ++ logging.debug('Stac._disconnect_from_staf() - Disconnected from staf') ++ ++ def _log_pages_changed( # pylint: disable=too-many-arguments ++ self, transport, traddr, trsvcid, host_traddr, host_iface, subsysnqn, device ++ ): ++ logging.debug( ++ 'Stac._log_pages_changed() - transport=%s, traddr=%s, trsvcid=%s, host_traddr=%s, host_iface=%s, subsysnqn=%s, device=%s', ++ transport, ++ traddr, ++ trsvcid, ++ host_traddr, ++ host_iface, ++ subsysnqn, ++ device, ++ ) ++ if self._cfg_soak_tmr: ++ self._cfg_soak_tmr.start(self.CONF_STABILITY_SOAK_TIME_SEC) + +- if len(self._controllers) == 0: +- GLib.idle_add(self._exit) +- else: +- # Tell all controller objects to disconnect +- keep_connections = self._keep_connections_on_exit() +- controllers = self._controllers.values() +- for controller in controllers: +- controller.disconnect(self._on_final_disconnect, keep_connections) ++ def _load_last_known_config(self): ++ return dict() + +- return GLib.SOURCE_REMOVE ++ def _dump_last_known_config(self, controllers): ++ pass + +- def _on_final_disconnect(self, controller): +- '''Callback invoked after a controller is disconnected. +- THIS IS USED DURING PROCESS SHUTDOWN TO WAIT FOR ALL CONTROLLERS TO BE +- DISCONNECTED BEFORE EXITING THE PROGRAM. ONLY CALL ON SHUTDOWN! +- ''' +- logging.debug('Service._on_final_disconnect()') +- self._remove_ctrl_from_dict(controller) + +- controller.kill() ++# ****************************************************************************** ++class Staf(Service): ++ '''STorage Appliance Finder (STAF)''' + +- # When all controllers have disconnected, we can finish the clean up +- if len(self._controllers) == 0: +- # Defer exit to the next main loop's idle period. 
+- GLib.idle_add(self._exit) ++ def __init__(self, args, dbus): ++ super().__init__(args, self._reload_hdlr) + +- def _exit(self): +- logging.debug('Service._exit()') +- self._release_resources() +- self._loop.quit() ++ self._avahi = avahi.Avahi(self._sysbus, self._avahi_change) ++ self._avahi.config_stypes(conf.SvcConf().get_stypes()) + +- def _on_config_ctrls(self, *_user_data): +- self._config_ctrls() +- return GLib.SOURCE_REMOVE ++ # Create the D-Bus instance. ++ self._config_dbus(dbus, defs.STAFD_DBUS_NAME, defs.STAFD_DBUS_PATH) + +- def _config_ctrls(self): +- '''@brief Start controllers configuration.''' +- # The configuration file may contain controllers and/or blacklist +- # elements with traddr specified as hostname instead of IP address. +- # Because of this, we need to remove those blacklisted elements before +- # running name resolution. And we will need to remove blacklisted +- # elements after name resolution is complete (i.e. in the calback +- # function _config_ctrls_finish) +- logging.debug('Service._config_ctrls()') +- configured_controllers = stas.remove_blacklisted(conf.SvcConf().get_controllers()) +- self._resolver.resolve_ctrl_async(self._cancellable, configured_controllers, self._config_ctrls_finish) ++ def info(self) -> dict: ++ '''@brief Get the status info for this object (used for debug)''' ++ info = super().info() ++ info['avahi'] = self._avahi.info() ++ return info + +- def _config_ctrls_finish(self, configured_ctrl_list): +- '''@brief Finish controllers configuration after hostnames (if any) +- have been resolved. +- +- Configuring controllers must be done asynchronously in 2 steps. +- In the first step, host names get resolved to find their IP addresses. +- Name resolution can take a while, especially when an external name +- resolution server is used. Once that step completed, the callback +- method _config_ctrls_finish() (i.e. this method), gets invoked to +- complete the controller configuration. 
+- +- NOTE) This is the base class method used to define the interface. +- It must be overloaded by a child class. +- ''' +- raise NotImplementedError() ++ def _release_resources(self): ++ logging.debug('Staf._release_resources()') ++ super()._release_resources() ++ if self._avahi: ++ self._avahi.kill() ++ self._avahi = None + + def _load_last_known_config(self): +- raise NotImplementedError() ++ try: ++ with open(self._lkc_file, 'rb') as file: ++ config = pickle.load(file) ++ except (FileNotFoundError, AttributeError): ++ return dict() ++ ++ logging.debug('Staf._load_last_known_config() - DC count = %s', len(config)) ++ return {tid: ctrl.Dc(self, self._root, self._host, tid, log_pages) for tid, log_pages in config.items()} + + def _dump_last_known_config(self, controllers): +- raise NotImplementedError() ++ try: ++ with open(self._lkc_file, 'wb') as file: ++ config = {tid: dc.log_pages() for tid, dc in controllers.items()} ++ logging.debug('Staf._dump_last_known_config() - DC count = %s', len(config)) ++ pickle.dump(config, file) ++ except FileNotFoundError as ex: ++ logging.error('Unable to save last known config: %s', ex) ++ ++ def _keep_connections_on_exit(self): ++ '''@brief Determine whether connections should remain when the ++ process exits. ++ ''' ++ return conf.SvcConf().persistent_connections ++ ++ def _reload_hdlr(self): ++ '''@brief Reload configuration file. This is triggered by the SIGHUP ++ signal, which can be sent with "systemctl reload stafd". ++ ''' ++ systemd.daemon.notify('RELOADING=1') ++ service_cnf = conf.SvcConf() ++ service_cnf.reload() ++ self.tron = service_cnf.tron ++ self._avahi.kick_start() # Make sure Avahi is running ++ self._avahi.config_stypes(service_cnf.get_stypes()) ++ self._cfg_soak_tmr.start() ++ systemd.daemon.notify('READY=1') ++ return GLib.SOURCE_CONTINUE ++ ++ def log_pages_changed(self, controller, device): ++ '''@brief Function invoked when a controller's cached log pages ++ have changed. 
This will emit a D-Bus signal to inform ++ other applications that the cached log pages have changed. ++ ''' ++ self._dbus_iface.log_pages_changed.emit( ++ controller.tid.transport, ++ controller.tid.traddr, ++ controller.tid.trsvcid, ++ controller.tid.host_traddr, ++ controller.tid.host_iface, ++ controller.tid.subsysnqn, ++ device, ++ ) ++ ++ def referrals_changed(self): ++ '''@brief Function invoked when a controller's cached referrals ++ have changed. ++ ''' ++ logging.debug('Staf.referrals_changed()') ++ self._cfg_soak_tmr.start() ++ ++ def _referrals(self) -> list: ++ return [ ++ stas.cid_from_dlpe(dlpe, controller.tid.host_traddr, controller.tid.host_iface) ++ for controller in self.get_controllers() ++ for dlpe in controller.referrals() ++ ] ++ ++ def _config_ctrls_finish(self, configured_ctrl_list): ++ '''@brief Finish discovery controllers configuration after ++ hostnames (if any) have been resolved. ++ ''' ++ configured_ctrl_list = [ ++ ctrl_dict ++ for ctrl_dict in configured_ctrl_list ++ if 'traddr' in ctrl_dict and ctrl_dict.setdefault('subsysnqn', defs.WELL_KNOWN_DISC_NQN) ++ ] ++ ++ discovered_ctrl_list = self._avahi.get_controllers() ++ referral_ctrl_list = self._referrals() ++ logging.debug('Staf._config_ctrls_finish() - configured_ctrl_list = %s', configured_ctrl_list) ++ logging.debug('Staf._config_ctrls_finish() - discovered_ctrl_list = %s', discovered_ctrl_list) ++ logging.debug('Staf._config_ctrls_finish() - referral_ctrl_list = %s', referral_ctrl_list) ++ ++ controllers = stas.remove_blacklisted(configured_ctrl_list + discovered_ctrl_list + referral_ctrl_list) ++ controllers = stas.remove_invalid_addresses(controllers) ++ ++ new_controller_ids = {trid.TID(controller) for controller in controllers} ++ cur_controller_ids = set(self._controllers.keys()) ++ controllers_to_add = new_controller_ids - cur_controller_ids ++ controllers_to_del = cur_controller_ids - new_controller_ids ++ ++ logging.debug('Staf._config_ctrls_finish() - 
controllers_to_add = %s', list(controllers_to_add)) ++ logging.debug('Staf._config_ctrls_finish() - controllers_to_del = %s', list(controllers_to_del)) ++ ++ for tid in controllers_to_del: ++ controller = self._controllers.pop(tid, None) ++ if controller is not None: ++ controller.disconnect(self.remove_controller, conf.SvcConf().persistent_connections) ++ ++ for tid in controllers_to_add: ++ self._controllers[tid] = ctrl.Dc(self, self._root, self._host, tid) ++ ++ def _avahi_change(self): ++ self._cfg_soak_tmr.start() +diff --git a/staslib/stas.py b/staslib/stas.py +index 7bf91e0..496f063 100644 +--- a/staslib/stas.py ++++ b/staslib/stas.py +@@ -6,14 +6,19 @@ + # + # Authors: Martin Belanger + # +-'''Library for staf/stac''' ++'''Library for staf/stac. You will find here common code for stafd and stacd ++including the Abstract Base Classes (ABC) for Controllers and Services''' + + import os + import sys +-import ipaddress ++import abc ++import signal + import logging +- +-from staslib import conf, defs, trid ++import ipaddress ++import systemd.daemon ++import dasbus.connection ++from gi.repository import Gio, GLib ++from staslib import conf, defs, gutil, log, trid + + + # ****************************************************************************** +@@ -108,3 +113,379 @@ def remove_invalid_addresses(controllers: list): + logging.warning('Invalid transport %s', transport) + + return valid_controllers ++ ++ ++# ****************************************************************************** ++class ControllerABC(abc.ABC): # pylint: disable=too-many-instance-attributes ++ '''@brief Base class used to manage the connection to a controller.''' ++ ++ CONNECT_RETRY_PERIOD_SEC = 60 ++ FAST_CONNECT_RETRY_PERIOD_SEC = 3 ++ ++ def __init__(self, root, host, tid: trid.TID, discovery_ctrl=False): ++ self._root = root ++ self._host = host ++ self._tid = tid ++ self._cancellable = Gio.Cancellable() ++ self._connect_attempts = 0 ++ self._retry_connect_tmr = 
gutil.GTimer(self.CONNECT_RETRY_PERIOD_SEC, self._on_try_to_connect) ++ self._discovery_ctrl = discovery_ctrl ++ self._try_to_connect_deferred = gutil.Deferred(self._try_to_connect) ++ self._try_to_connect_deferred.schedule() ++ ++ def _release_resources(self): ++ # Remove pending deferred from main loop ++ if self._try_to_connect_deferred: ++ self._try_to_connect_deferred.cancel() ++ ++ if self._retry_connect_tmr is not None: ++ self._retry_connect_tmr.kill() ++ ++ if self._cancellable and not self._cancellable.is_cancelled(): ++ self._cancellable.cancel() ++ ++ self._tid = None ++ self._cancellable = None ++ self._retry_connect_tmr = None ++ self._try_to_connect_deferred = None ++ ++ @property ++ def id(self) -> str: ++ '''@brief Return the Transport ID as a printable string''' ++ return str(self.tid) ++ ++ @property ++ def tid(self): ++ '''@brief Return the Transport ID object''' ++ return self._tid ++ ++ def controller_id_dict(self) -> dict: ++ '''@brief return the controller ID as a dict.''' ++ return self.tid.as_dict() ++ ++ def details(self) -> dict: ++ '''@brief return detailed debug info about this controller''' ++ details = self.controller_id_dict() ++ details['connect attempts'] = str(self._connect_attempts) ++ details['retry connect timer'] = str(self._retry_connect_tmr) ++ return details ++ ++ def info(self) -> dict: ++ '''@brief Get the controller info for this object''' ++ return self.details() ++ ++ def cancel(self): ++ '''@brief Used to cancel pending operations.''' ++ if self._cancellable and not self._cancellable.is_cancelled(): ++ logging.debug('ControllerABC.cancel() - %s', self.id) ++ self._cancellable.cancel() ++ ++ def kill(self): ++ '''@brief Used to release all resources associated with this object.''' ++ logging.debug('ControllerABC.kill() - %s', self.id) ++ self._release_resources() ++ ++ def _alive(self): ++ '''There may be race condition where a queued event gets processed ++ after the object is no longer configured (i.e. alive). 
This method ++ can be used by callback functions to make sure the object is still ++ alive before processing further. ++ ''' ++ return self._cancellable and not self._cancellable.is_cancelled() ++ ++ def _on_try_to_connect(self): ++ self._try_to_connect_deferred.schedule() ++ return GLib.SOURCE_REMOVE ++ ++ def _try_to_connect(self): ++ # This is a deferred function call. Make sure ++ # the source of the deferred is still good. ++ source = GLib.main_current_source() ++ if source and source.is_destroyed(): ++ return ++ ++ self._connect_attempts += 1 ++ ++ self._do_connect() ++ ++ @abc.abstractmethod ++ def _do_connect(self): ++ raise NotImplementedError() ++ ++ @abc.abstractmethod ++ def _on_aen(self, aen: int): ++ raise NotImplementedError() ++ ++ @abc.abstractmethod ++ def _on_nvme_event(self, nvme_event): ++ raise NotImplementedError() ++ ++ @abc.abstractmethod ++ def _on_ctrl_removed(self, obj): ++ raise NotImplementedError() ++ ++ @abc.abstractmethod ++ def _find_existing_connection(self): ++ raise NotImplementedError() ++ ++ @abc.abstractmethod ++ def disconnect(self, disconnected_cb, keep_connection): ++ '''@brief Issue an asynchronous disconnect command to a Controller. ++ Once the async command has completed, the callback 'disconnected_cb' ++ will be invoked. If a controller is already disconnected, then the ++ callback will be added to the main loop's next idle slot to be executed ++ ASAP. 
++ ''' ++ raise NotImplementedError() ++ ++ ++# ****************************************************************************** ++class ServiceABC(abc.ABC): # pylint: disable=too-many-instance-attributes ++ '''@brief Base class used to manage a STorage Appliance Service''' ++ ++ CONF_STABILITY_SOAK_TIME_SEC = 1.5 ++ ++ def __init__(self, args, reload_hdlr): ++ ++ service_conf = conf.SvcConf() ++ service_conf.set_conf_file(args.conf_file) # reload configuration ++ self._tron = args.tron or service_conf.tron ++ log.set_level_from_tron(self._tron) ++ ++ self._lkc_file = os.path.join(os.environ.get('RUNTIME_DIRECTORY', os.path.join('/run', defs.PROG_NAME)), 'last-known-config.pickle') ++ self._loop = GLib.MainLoop() ++ self._cancellable = Gio.Cancellable() ++ self._resolver = gutil.NameResolver() ++ self._controllers = self._load_last_known_config() ++ self._dbus_iface = None ++ self._cfg_soak_tmr = gutil.GTimer(self.CONF_STABILITY_SOAK_TIME_SEC, self._on_config_ctrls) ++ self._sysbus = dasbus.connection.SystemMessageBus() ++ ++ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGINT, self._stop_hdlr) # CTRL-C ++ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGTERM, self._stop_hdlr) # systemctl stop stafd ++ GLib.unix_signal_add(GLib.PRIORITY_HIGH, signal.SIGHUP, reload_hdlr) # systemctl reload stafd ++ ++ nvme_options = conf.NvmeOptions() ++ if not nvme_options.host_iface_supp or not nvme_options.discovery_supp: ++ logging.warning( ++ 'Kernel does not appear to support all the options needed to run this program. Consider updating to a later kernel version.' ++ ) ++ ++ # We don't want to apply configuration changes to nvme-cli right away. ++ # Often, multiple changes will occur in a short amount of time (sub-second). ++ # We want to wait until there are no more changes before applying them ++ # to the system. The following timer acts as a "soak period". Changes ++ # will be applied by calling self._on_config_ctrls() at the end of ++ # the soak period. 
++ self._cfg_soak_tmr.start() ++ ++ def _release_resources(self): ++ logging.debug('ServiceABC._release_resources()') ++ ++ if self._cancellable and not self._cancellable.is_cancelled(): ++ self._cancellable.cancel() ++ ++ if self._cfg_soak_tmr is not None: ++ self._cfg_soak_tmr.kill() ++ ++ self._controllers.clear() ++ ++ if self._sysbus: ++ self._sysbus.disconnect() ++ ++ self._cfg_soak_tmr = None ++ self._cancellable = None ++ self._resolver = None ++ self._lkc_file = None ++ self._sysbus = None ++ ++ def _config_dbus(self, iface_obj, bus_name: str, obj_name: str): ++ self._dbus_iface = iface_obj ++ self._sysbus.publish_object(obj_name, iface_obj) ++ self._sysbus.register_service(bus_name) ++ ++ @property ++ def tron(self): ++ '''@brief Get Trace ON property''' ++ return self._tron ++ ++ @tron.setter ++ def tron(self, value): ++ '''@brief Set Trace ON property''' ++ self._tron = value ++ log.set_level_from_tron(self._tron) ++ ++ def run(self): ++ '''@brief Start the main loop execution''' ++ try: ++ self._loop.run() ++ except Exception as ex: # pylint: disable=broad-except ++ logging.critical('exception: %s', ex) ++ ++ self._loop = None ++ ++ def info(self) -> dict: ++ '''@brief Get the status info for this object (used for debug)''' ++ nvme_options = conf.NvmeOptions() ++ return { ++ 'last known config file': self._lkc_file, ++ 'config soak timer': str(self._cfg_soak_tmr), ++ 'kernel support': { ++ 'TP8013': nvme_options.discovery_supp, ++ 'host_iface': nvme_options.host_iface_supp, ++ }, ++ 'system config': conf.SysConf().as_dict(), ++ } ++ ++ def get_controllers(self) -> dict: ++ '''@brief return the list of controller objects''' ++ return self._controllers.values() ++ ++ def get_controller( ++ self, transport: str, traddr: str, trsvcid: str, host_traddr: str, host_iface: str, subsysnqn: str ++ ): # pylint: disable=too-many-arguments ++ '''@brief get the specified controller object from the list of controllers''' ++ cid = { ++ 'transport': transport, ++ 
'traddr': traddr, ++ 'trsvcid': trsvcid, ++ 'host-traddr': host_traddr, ++ 'host-iface': host_iface, ++ 'subsysnqn': subsysnqn, ++ } ++ return self._controllers.get(trid.TID(cid)) ++ ++ def _remove_ctrl_from_dict(self, controller): ++ tid_to_pop = controller.tid ++ if not tid_to_pop: ++ # Being paranoid. This should not happen, but let's say the ++ # controller object has been purged, but it is somehow still ++ # listed in self._controllers. ++ for tid, _controller in self._controllers.items(): ++ if _controller is controller: ++ tid_to_pop = tid ++ break ++ ++ if tid_to_pop: ++ logging.debug('ServiceABC._remove_ctrl_from_dict()- %s | %s', tid_to_pop, controller.device) ++ self._controllers.pop(tid_to_pop, None) ++ else: ++ logging.debug('ServiceABC._remove_ctrl_from_dict()- already removed') ++ ++ def remove_controller(self, controller, success): # pylint: disable=unused-argument ++ '''@brief remove the specified controller object from the list of controllers ++ @param controller: the controller object ++ @param success: whether the disconnect was successful''' ++ logging.debug('ServiceABC.remove_controller()') ++ if isinstance(controller, ControllerABC): ++ self._remove_ctrl_from_dict(controller) ++ ++ controller.kill() ++ ++ if self._cfg_soak_tmr: ++ self._cfg_soak_tmr.start() ++ ++ def _cancel(self): ++ logging.debug('ServiceABC._cancel()') ++ if not self._cancellable.is_cancelled(): ++ self._cancellable.cancel() ++ ++ for controller in self._controllers.values(): ++ controller.cancel() ++ ++ def _stop_hdlr(self): ++ logging.debug('ServiceABC._stop_hdlr()') ++ systemd.daemon.notify('STOPPING=1') ++ ++ self._cancel() # Cancel pending operations ++ ++ self._dump_last_known_config(self._controllers) ++ ++ if len(self._controllers) == 0: ++ GLib.idle_add(self._exit) ++ else: ++ # Tell all controller objects to disconnect ++ keep_connections = self._keep_connections_on_exit() ++ controllers = self._controllers.values() ++ logging.debug( ++ 'ServiceABC._stop_hdlr() - 
Controller count = %s, keep_connections = %s', ++ len(controllers), keep_connections ++ ) ++ for controller in controllers: ++ controller.disconnect(self._on_final_disconnect, keep_connections) ++ ++ return GLib.SOURCE_REMOVE ++ ++ def _on_final_disconnect(self, controller, success): ++ '''Callback invoked after a controller is disconnected. ++ THIS IS USED DURING PROCESS SHUTDOWN TO WAIT FOR ALL CONTROLLERS TO BE ++ DISCONNECTED BEFORE EXITING THE PROGRAM. ONLY CALL ON SHUTDOWN! ++ @param controller: the controller object ++ @param success: whether the disconnect operation was successful ++ ''' ++ logging.debug('ServiceABC._on_final_disconnect() - %s | %s disconnect %s', ++ controller.id, controller.device, 'succeeded' if success else 'failed') ++ self._remove_ctrl_from_dict(controller) ++ ++ controller.kill() ++ ++ # When all controllers have disconnected, we can finish the clean up ++ if len(self._controllers) == 0: ++ # Defer exit to the next main loop's idle period. ++ GLib.idle_add(self._exit) ++ ++ def _exit(self): ++ logging.debug('ServiceABC._exit()') ++ self._release_resources() ++ self._loop.quit() ++ ++ def _on_config_ctrls(self, *_user_data): ++ self._config_ctrls() ++ return GLib.SOURCE_REMOVE ++ ++ def _config_ctrls(self): ++ '''@brief Start controllers configuration.''' ++ # The configuration file may contain controllers and/or blacklist ++ # elements with traddr specified as hostname instead of IP address. ++ # Because of this, we need to remove those blacklisted elements before ++ # running name resolution. And we will need to remove blacklisted ++ # elements after name resolution is complete (i.e. 
in the callback ++ # function _config_ctrls_finish) ++ logging.debug('ServiceABC._config_ctrls()') ++ configured_controllers = remove_blacklisted(conf.SvcConf().get_controllers()) ++ self._resolver.resolve_ctrl_async(self._cancellable, configured_controllers, self._config_ctrls_finish) ++ ++ @abc.abstractmethod ++ def _keep_connections_on_exit(self): ++ '''@brief Determine whether connections should remain when the ++ process exits. ++ ++ NOTE) This is the base class method used to define the interface. ++ It must be overloaded by a child class. ++ ''' ++ raise NotImplementedError() ++ ++ @abc.abstractmethod ++ def _config_ctrls_finish(self, configured_ctrl_list): ++ '''@brief Finish controllers configuration after hostnames (if any) ++ have been resolved. ++ ++ Configuring controllers must be done asynchronously in 2 steps. ++ In the first step, host names get resolved to find their IP addresses. ++ Name resolution can take a while, especially when an external name ++ resolution server is used. Once that step is completed, the callback ++ method _config_ctrls_finish() (i.e. this method), gets invoked to ++ complete the controller configuration. ++ ++ NOTE) This is the base class method used to define the interface. ++ It must be overloaded by a child class. 
++ ''' ++ raise NotImplementedError() ++ ++ @abc.abstractmethod ++ def _load_last_known_config(self): ++ raise NotImplementedError() ++ ++ @abc.abstractmethod ++ def _dump_last_known_config(self, controllers): ++ raise NotImplementedError() +diff --git a/staslib/trid.py b/staslib/trid.py +index def6ab2..38619e7 100644 +--- a/staslib/trid.py ++++ b/staslib/trid.py +@@ -12,8 +12,7 @@ throughout nvme-stas to uniquely identify a Controller''' + import hashlib + from staslib import conf + +-class TID: +- # pylint: disable=too-many-instance-attributes ++class TID: # pylint: disable=too-many-instance-attributes + '''Transport Identifier''' + RDMA_IP_PORT = '4420' + DISC_IP_PORT = '8009' +diff --git a/staslib/udev.py b/staslib/udev.py +index 29370b8..37b63cc 100644 +--- a/staslib/udev.py ++++ b/staslib/udev.py +@@ -16,7 +16,7 @@ from staslib import defs, trid + try: + from pyudev.glib import MonitorObserver + except (ModuleNotFoundError, AttributeError): +- from staslib.glibudev import MonitorObserver # pylint: disable=relative-beyond-top-level,ungrouped-imports ++ from staslib.glibudev import MonitorObserver # pylint: disable=ungrouped-imports + + # ****************************************************************************** + class Udev: +@@ -99,7 +99,7 @@ class Udev: + def get_attributes(self, sys_name: str, attr_ids) -> dict: + '''@brief Get all the attributes associated with device @sys_name''' + attrs = {attr_id: '' for attr_id in attr_ids} +- if sys_name: ++ if sys_name and sys_name != 'nvme?': + udev = self.get_nvme_device(sys_name) + if udev is not None: + for attr_id in attr_ids: +diff --git a/test/test-config.py b/test/test-config.py +index dad0ebd..db58883 100755 +--- a/test/test-config.py ++++ b/test/test-config.py +@@ -40,7 +40,7 @@ class StasProcessConfUnitTest(unittest.TestCase): + self.assertFalse(service_conf.data_digest) + self.assertTrue(service_conf.persistent_connections) + self.assertTrue(service_conf.udev_rule_enabled) +- 
self.assertFalse(service_conf.sticky_connections) ++ self.assertTrue(service_conf.sticky_connections) + self.assertFalse(service_conf.ignore_iface) + self.assertIn(6, service_conf.ip_family) + self.assertNotIn(4, service_conf.ip_family) +diff --git a/test/test-controller.py b/test/test-controller.py +index f23125e..f55781a 100755 +--- a/test/test-controller.py ++++ b/test/test-controller.py +@@ -8,24 +8,43 @@ from pyfakefs.fake_filesystem_unittest import TestCase + + LOOP = GLib.MainLoop() + ++ ++class TestController(ctrl.Controller): ++ def _find_existing_connection(self): ++ pass ++ ++ def _on_aen(self, aen: int): ++ pass ++ ++ def _on_nvme_event(self, nvme_event): ++ pass ++ ++ + class Test(TestCase): + '''Unit tests for class Controller''' + + def setUp(self): + self.setUpPyfakefs() + +- self.fs.create_file('/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n') +- self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') +- self.fs.create_file('/dev/nvme-fabrics', contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n') ++ self.fs.create_file( ++ '/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n' ++ ) ++ self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') ++ self.fs.create_file( ++ '/dev/nvme-fabrics', ++ 
contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n', ++ ) + +- self.NVME_TID = trid.TID({ +- 'transport': 'tcp', +- 'traddr': '10.10.10.10', +- 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', +- 'trsvcid': '8009', +- 'host-traddr': '1.2.3.4', +- 'host-iface': 'wlp0s20f3', +- }) ++ self.NVME_TID = trid.TID( ++ { ++ 'transport': 'tcp', ++ 'traddr': '10.10.10.10', ++ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', ++ 'trsvcid': '8009', ++ 'host-traddr': '1.2.3.4', ++ 'host-iface': 'wlp0s20f3', ++ } ++ ) + + sysconf = conf.SysConf() + self.root = nvme.root() +@@ -34,32 +53,92 @@ class Test(TestCase): + def tearDown(self): + LOOP.quit() + ++ def test_cannot_instantiate_concrete_classes_if_abstract_method_are_not_implemented(self): ++ # Make sure we can't instantiate the ABC directly (Abstract Base Class). 
++ class Controller(ctrl.Controller): ++ pass ++ ++ self.assertRaises(TypeError, lambda: ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID)) ++ + def test_get_device(self): +- controller = ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID) ++ controller = TestController(root=self.root, host=self.host, tid=self.NVME_TID) + self.assertEqual(controller._connect_attempts, 0) +- self.assertRaises(NotImplementedError, controller._try_to_connect) ++ controller._try_to_connect() + self.assertEqual(controller._connect_attempts, 1) +- self.assertRaises(NotImplementedError, controller._find_existing_connection) +- self.assertEqual(controller.id, "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)") ++ self.assertEqual( ++ controller.id, "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)" ++ ) + # raise Exception(controller._connect_op) +- self.assertEqual(str(controller.tid), "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)") +- self.assertEqual(controller.device, '') +- self.assertEqual(str(controller.controller_id_dict()), "{'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': ''}") +- # self.assertEqual(controller.details(), "{'transport': 'tcp', 'traddr': '10.10.10.[265 chars]ff]'}") +- self.assertEqual(controller.info(), {'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': '', 'hostid': '', 'hostnqn': '', 'model': '', 'serial': '', 'connect attempts': '1', 'retry connect timer': '60.0s [off]'}) ++ self.assertEqual( ++ str(controller.tid), ++ "(tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4)", ++ ) ++ 
self.assertEqual(controller.device, 'nvme?') ++ self.assertEqual( ++ str(controller.controller_id_dict()), ++ "{'transport': 'tcp', 'traddr': '10.10.10.10', 'trsvcid': '8009', 'host-traddr': '1.2.3.4', 'host-iface': 'wlp0s20f3', 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', 'device': 'nvme?'}", ++ ) ++ self.assertEqual( ++ controller.details(), ++ { ++ 'dctype': '', ++ 'cntrltype': '', ++ 'transport': 'tcp', ++ 'traddr': '10.10.10.10', ++ 'trsvcid': '8009', ++ 'host-traddr': '1.2.3.4', ++ 'host-iface': 'wlp0s20f3', ++ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', ++ 'device': 'nvme?', ++ 'connect attempts': '1', ++ 'retry connect timer': '60.0s [off]', ++ 'hostid': '', ++ 'hostnqn': '', ++ 'model': '', ++ 'serial': '', ++ }, ++ ) ++ self.assertEqual( ++ controller.info(), ++ { ++ 'dctype': '', ++ 'cntrltype': '', ++ 'transport': 'tcp', ++ 'traddr': '10.10.10.10', ++ 'trsvcid': '8009', ++ 'host-traddr': '1.2.3.4', ++ 'host-iface': 'wlp0s20f3', ++ 'subsysnqn': 'nqn.1988-11.com.dell:SFSS:2:20220208134025e8', ++ 'device': 'nvme?', ++ 'connect attempts': '1', ++ 'retry connect timer': '60.0s [off]', ++ 'hostid': '', ++ 'hostnqn': '', ++ 'model': '', ++ 'serial': '', ++ 'connect operation': {'fail count': 0}, ++ }, ++ ) ++ + # print(controller._connect_op) + self.assertEqual(controller.cancel(), None) + self.assertEqual(controller.kill(), None) + # self.assertEqual(controller.disconnect(), 0) + + def test_connect(self): +- controller = ctrl.Controller(root=self.root, host=self.host, tid=self.NVME_TID) ++ controller = TestController(root=self.root, host=self.host, tid=self.NVME_TID) + self.assertEqual(controller._connect_attempts, 0) +- controller._find_existing_connection = lambda : None ++ controller._find_existing_connection = lambda: None + with self.assertLogs(logger=logging.getLogger(), level='DEBUG') as captured: + controller._try_to_connect() + self.assertEqual(len(captured.records), 1) +- 
self.assertTrue(captured.records[0].getMessage().startswith("Controller._try_to_connect() - (tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4) Connecting to nvme control with cfg={'hdr_digest': False, 'data_digest': False")) ++ self.assertTrue( ++ captured.records[0] ++ .getMessage() ++ .startswith( ++ "Controller._try_to_connect() - (tcp, 10.10.10.10, 8009, nqn.1988-11.com.dell:SFSS:2:20220208134025e8, wlp0s20f3, 1.2.3.4) Connecting to nvme control with cfg={'hdr_digest': False, 'data_digest': False" ++ ) ++ ) + self.assertEqual(controller._connect_attempts, 1) + + +diff --git a/test/test-service.py b/test/test-service.py +index 19f9b0c..4ce37be 100755 +--- a/test/test-service.py ++++ b/test/test-service.py +@@ -4,6 +4,7 @@ import unittest + from staslib import service + from pyfakefs.fake_filesystem_unittest import TestCase + ++ + class Args: + def __init__(self): + self.tron = True +@@ -11,6 +12,20 @@ class Args: + self.conf_file = '/dev/null' + + ++class TestService(service.Service): ++ def _config_ctrls_finish(self, configured_ctrl_list): ++ pass ++ ++ def _dump_last_known_config(self, controllers): ++ pass ++ ++ def _keep_connections_on_exit(self): ++ pass ++ ++ def _load_last_known_config(self): ++ return dict() ++ ++ + class Test(TestCase): + '''Unit tests for class Service''' + +@@ -18,22 +33,39 @@ class Test(TestCase): + self.setUpPyfakefs() + + os.environ['RUNTIME_DIRECTORY'] = "/run" +- self.fs.create_file('/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n') +- self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') +- self.fs.create_file('/dev/nvme-fabrics', 
contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n') ++ self.fs.create_file( ++ '/etc/nvme/hostnqn', contents='nqn.2014-08.org.nvmexpress:uuid:01234567-0123-0123-0123-0123456789ab\n' ++ ) ++ self.fs.create_file('/etc/nvme/hostid', contents='01234567-89ab-cdef-0123-456789abcdef\n') ++ self.fs.create_file( ++ '/dev/nvme-fabrics', ++ contents='instance=-1,cntlid=-1,transport=%s,traddr=%s,trsvcid=%s,nqn=%s,queue_size=%d,nr_io_queues=%d,reconnect_delay=%d,ctrl_loss_tmo=%d,keep_alive_tmo=%d,hostnqn=%s,host_traddr=%s,host_iface=%s,hostid=%s,duplicate_connect,disable_sqflow,hdr_digest,data_digest,nr_write_queues=%d,nr_poll_queues=%d,tos=%d,fast_io_fail_tmo=%d,discovery,dhchap_secret=%s,dhchap_ctrl_secret=%s\n', ++ ) ++ ++ def test_cannot_instantiate_concrete_classes_if_abstract_method_are_not_implemented(self): ++ # Make sure we can't instantiate the ABC directly (Abstract Base Class). 
++ class Service(service.Service): ++ pass ++ ++ self.assertRaises(TypeError, lambda: Service(Args(), reload_hdlr=lambda x: x)) + + def test_get_controller(self): +- # FIXME: this is hack, fix it later +- service.Service._load_last_known_config = lambda x : dict() +- # start the test +- +- srv = service.Service(Args(), reload_hdlr=lambda x : x) +- self.assertRaises(NotImplementedError, srv._keep_connections_on_exit) +- self.assertRaises(NotImplementedError, srv._dump_last_known_config, []) +- self.assertRaises(NotImplementedError, srv._on_config_ctrls) +- #self.assertEqual(srv.get_controllers(), dict()) +- self.assertEqual(srv.get_controller(transport='tcp', traddr='10.10.10.10', trsvcid='8009', host_traddr='1.2.3.4', host_iface='wlp0s20f3', subsysnqn='nqn.1988-11.com.dell:SFSS:2:20220208134025e8'), None) +- self.assertEqual(srv.remove_controller(controller=None), None) ++ srv = TestService(Args(), reload_hdlr=lambda x: x) ++ ++ self.assertEqual(list(srv.get_controllers()), list()) ++ self.assertEqual( ++ srv.get_controller( ++ transport='tcp', ++ traddr='10.10.10.10', ++ trsvcid='8009', ++ host_traddr='1.2.3.4', ++ host_iface='wlp0s20f3', ++ subsysnqn='nqn.1988-11.com.dell:SFSS:2:20220208134025e8', ++ ), ++ None, ++ ) ++ self.assertEqual(srv.remove_controller(controller=None, success=True), None) ++ + + if __name__ == '__main__': + unittest.main() diff --git a/SPECS/nvme-stas.spec b/SPECS/nvme-stas.spec new file mode 100644 index 0000000..f8175ea --- /dev/null +++ b/SPECS/nvme-stas.spec @@ -0,0 +1,121 @@ +# RHEL 8 compatibility +%{!?version_no_tilde: %define version_no_tilde %{shrink:%(echo '%{version}' | tr '~' '-')}} + +Name: nvme-stas +Summary: NVMe STorage Appliance Services +Version: 1.1.6 +Release: 3%{?dist} +License: ASL 2.0 +URL: https://github.com/linux-nvme/nvme-stas +Source0: %{url}/archive/v%{version_no_tilde}/%{name}-%{version_no_tilde}.tar.gz +Patch0: 0001-sync-with-1.1.6.patch + +BuildArch: noarch + +BuildRequires: meson >= 0.57.0 +BuildRequires: 
glib2-devel +BuildRequires: libnvme-devel +BuildRequires: libxslt +BuildRequires: docbook-style-xsl +BuildRequires: systemd-devel + +BuildRequires: python3-devel +#BuildRequires: python3-pyflakes +#BuildRequires: python3-pylint +#BuildRequires: pylint + +BuildRequires: python3-libnvme +BuildRequires: python3-dasbus +BuildRequires: python3-pyudev +BuildRequires: python3-systemd +BuildRequires: python3-gobject-devel +BuildRequires: python3-lxml + +Requires: avahi +Requires: python3-libnvme +Requires: python3-dasbus +Requires: python3-pyudev +Requires: python3-systemd +Requires: python3-gobject +Requires: python3-lxml + +%description +nvme-stas is a Central Discovery Controller (CDC) client for Linux. It +handles Asynchronous Event Notifications (AEN), Automated NVMe subsystem +connection controls, Error handling and reporting, and Automatic (zeroconf) +and Manual configuration. nvme-stas is composed of two daemons: +stafd (STorage Appliance Finder) and stacd (STorage Appliance Connector). 
+ +%prep +%autosetup -p1 -n %{name}-%{version_no_tilde} + +%build +%meson -Dman=true -Dhtml=true +%meson_build + +%install +%meson_install +mv %{buildroot}/%{_sysconfdir}/stas/sys.conf.doc %{buildroot}/%{_sysconfdir}/stas/sys.conf + +%post +%systemd_post stacd.service +%systemd_post stafd.service + +%preun +%systemd_preun stacd.service +%systemd_preun stafd.service + +%postun +%systemd_postun_with_restart stacd.service +%systemd_postun_with_restart stafd.service + +%files +%license LICENSE +%doc README.md +%dir %{_sysconfdir}/stas +%config(noreplace) %{_sysconfdir}/stas/stacd.conf +%config(noreplace) %{_sysconfdir}/stas/stafd.conf +%config(noreplace) %{_sysconfdir}/stas/sys.conf +%{_datadir}/dbus-1/system.d/org.nvmexpress.*.conf +%{_bindir}/stacctl +%{_bindir}/stafctl +%{_bindir}/stasadm +%{_sbindir}/stacd +%{_sbindir}/stafd +%{_unitdir}/stacd.service +%{_unitdir}/stafd.service +%{_unitdir}/stas-config.target +%{_unitdir}/stas-config@.service +%dir %{python3_sitelib}/staslib +%{python3_sitelib}/staslib/* +%doc %{_pkgdocdir}/html +%{_mandir}/man1/sta*.1* +%{_mandir}/man5/*.5* +%{_mandir}/man7/nvme*.7* +%{_mandir}/man8/sta*.8* + + +%changelog +* Thu Aug 04 2022 Maurizio Lombardi - 1.1.6-3 +- Sync with the official 1.1.6 version + +* Wed Jul 27 2022 Maurizio Lombardi - 1.1.6-2 +- Rebuild for CentOS Stream + +* Wed Jul 20 2022 Maurizio Lombardi - 1.1.6-1 +- Update to version 1.1.6 + +* Mon Jun 27 2022 Maurizio Lombardi - 1.1.4-1 +- Update to version 1.1.4 + +* Wed Apr 20 2022 Tomas Bzatek - 1.0-1 +- Upstream v1.0 official release + +* Tue Apr 05 2022 Tomas Bzatek - 1.0~rc7-1 +- Upstream v1.0 Release Candidate 7 + +* Fri Mar 25 2022 Tomas Bzatek - 1.0~rc5-1 +- Upstream v1.0 Release Candidate 5 + +* Mon Mar 07 2022 Tomas Bzatek - 1.0~rc3-1 +- Upstream v1.0 Release Candidate 3