diff --git a/.gitignore b/.gitignore index 23a1052..4a72216 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -SOURCES/skiboot-6.6.tar.gz +SOURCES/skiboot-6.6.3.tar.gz diff --git a/.opal-prd.metadata b/.opal-prd.metadata index b815d1c..a19bd40 100644 --- a/.opal-prd.metadata +++ b/.opal-prd.metadata @@ -1 +1 @@ -97a6f924c558a9c8315333b591eae4d4ea3c9f9d SOURCES/skiboot-6.6.tar.gz +7ba62e1904d77dee4d9b38aad0d4ad273cf0a651 SOURCES/skiboot-6.6.3.tar.gz diff --git a/SOURCES/opal-prd-6.6.3-8cbd0de88d162e387f11569eee1bdecef8fad2e3.patch b/SOURCES/opal-prd-6.6.3-8cbd0de88d162e387f11569eee1bdecef8fad2e3.patch new file mode 100644 index 0000000..0337a99 --- /dev/null +++ b/SOURCES/opal-prd-6.6.3-8cbd0de88d162e387f11569eee1bdecef8fad2e3.patch @@ -0,0 +1,147 @@ +commit 8cbd0de88d162e387f11569eee1bdecef8fad2e3 +Author: Oliver O'Halloran +Date: Wed Sep 23 16:12:20 2020 +1000 + + opal-prd: Have a worker process handle page offlining + + The memory_error() hservice interface expects the memory_error() call to + just accept the offline request and return without actually offlining the + memory. Currently we will attempt to offline the marked pages before + returning to HBRT which can result in an excessively long time spent in the + memory_error() hservice call which blocks HBRT from processing other + errors. Fix this by adding a worker process which performs the page + offlining via the sysfs memory error interfaces. 
+ + Reviewed-by: Vasant Hegde + Signed-off-by: Oliver O'Halloran + +diff --git a/external/opal-prd/opal-prd.c b/external/opal-prd/opal-prd.c +index 40e5a984..d74d8039 100644 +--- a/external/opal-prd/opal-prd.c ++++ b/external/opal-prd/opal-prd.c +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -696,13 +697,42 @@ out: + return rc; + } + ++static int memory_error_worker(const char *sysfsfile, const char *type, ++ uint64_t i_start_addr, uint64_t i_endAddr) ++{ ++ int memfd, rc, n, ret = 0; ++ char buf[ADDR_STRING_SZ]; ++ uint64_t addr; ++ ++ memfd = open(sysfsfile, O_WRONLY); ++ if (memfd < 0) { ++ pr_log(LOG_CRIT, "MEM: Failed to offline memory! " ++ "Unable to open sysfs node %s: %m", sysfsfile); ++ return -1; ++ } ++ ++ for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) { ++ n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr); ++ rc = write(memfd, buf, n); ++ if (rc != n) { ++ pr_log(LOG_CRIT, "MEM: Failed to offline memory! " ++ "page addr: %016lx type: %s: %m", ++ addr, type); ++ ret = 1; ++ } ++ } ++ pr_log(LOG_CRIT, "MEM: Offlined %016lx,%016lx, type %s: %m\n", ++ i_start_addr, addr, type); ++ ++ close(memfd); ++ return ret; ++} ++ + int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr, + enum MemoryError_t i_errorType) + { + const char *sysfsfile, *typestr; +- char buf[ADDR_STRING_SZ]; +- int memfd, rc, n, ret = 0; +- uint64_t addr; ++ pid_t pid; + + switch(i_errorType) { + case MEMORY_ERROR_CE: +@@ -722,26 +752,21 @@ int hservice_memory_error(uint64_t i_start_addr, uint64_t i_endAddr, + pr_log(LOG_ERR, "MEM: Memory error: range %016lx-%016lx, type: %s", + i_start_addr, i_endAddr, typestr); + ++ /* ++ * HBRT expects the memory offlining process to happen in the background ++ * after the notification is delivered. 
++ */ ++ pid = fork(); ++ if (pid == 0) /* child: offline the pages in the background, then exit */ ++ exit(memory_error_worker(sysfsfile, typestr, i_start_addr, i_endAddr)); + +- memfd = open(sysfsfile, O_WRONLY); +- if (memfd < 0) { +- pr_log(LOG_CRIT, "MEM: Failed to offline memory! " +- "Unable to open sysfs node %s: %m", sysfsfile); ++ if (pid < 0) { ++ pr_log(LOG_ERR, "MEM: unable to fork worker to offline memory: %m"); + return -1; + } + +- for (addr = i_start_addr; addr <= i_endAddr; addr += ctx->page_size) { +- n = snprintf(buf, ADDR_STRING_SZ, "0x%lx", addr); +- rc = write(memfd, buf, n); +- if (rc != n) { +- pr_log(LOG_CRIT, "MEM: Failed to offline memory! " +- "page addr: %016lx type: %d: %m", +- addr, i_errorType); +- ret = rc; +- } +- } +- +- return ret; ++ pr_log(LOG_INFO, "MEM: forked off %d to handle mem error\n", pid); ++ return 0; + } + + uint64_t hservice_get_interface_capabilities(uint64_t set) +@@ -2112,6 +2137,10 @@ static int init_control_socket(struct opal_prd_ctx *ctx) + return 0; + } + ++static struct sigaction sigchild_action = { ++ .sa_flags = SA_NOCLDWAIT | SA_RESTART, ++ .sa_handler = SIG_DFL, ++}; + + static int run_prd_daemon(struct opal_prd_ctx *ctx) + { +@@ -2243,6 +2272,22 @@ static int run_prd_daemon(struct opal_prd_ctx *ctx) + pr_debug("SCOM: f00f: %lx", be64toh(val)); + } + ++ /* ++ * Setup the SIGCHLD handler to automatically reap the worker processes ++ * we use for memory offlining. We can't do this earlier since the ++ * modprobe helper spawns workers and wants to check their exit status ++ * with waitpid(). Auto-reaping breaks that so enable it just before ++ * entering the attn loop. ++ * ++ * We also setup system call restarting on SIGCHLD since opal-prd ++ * doesn't make any real attempt to handle blocking functions exiting ++ * due to EINTR.
++ */ ++ if (sigaction(SIGCHLD, &sigchild_action, NULL)) { ++ pr_log(LOG_ERR, "CTRL: Failed to register signal handler %m\n"); ++ return -1; ++ } ++ + run_attn_loop(ctx); + rc = 0; + diff --git a/SOURCES/skiboot-6.6.2-ffspart.patch b/SOURCES/skiboot-6.6.2-ffspart.patch new file mode 100644 index 0000000..c39282b --- /dev/null +++ b/SOURCES/skiboot-6.6.2-ffspart.patch @@ -0,0 +1,25 @@ +commit 6278c6df4ff2123725efc10e5e6ea48d02fda55a +Author: Dan Horák +Date: Mon Aug 10 12:59:04 2020 +0200 + + external/ffspart: define $(sbindir) for Makefile + + Right now the $(sbindir) variable isn't defined, so the binary gets installed + directly into $(DESTDIR). + + Signed-off-by: Dan Horák + +diff --git a/external/ffspart/rules.mk b/external/ffspart/rules.mk +index 40972c688..e006dc5b7 100644 +--- a/external/ffspart/rules.mk ++++ b/external/ffspart/rules.mk +@@ -10,6 +10,9 @@ LIBFLASH_SRC := $(addprefix libflash/,$(LIBFLASH_FILES)) + OBJS += $(LIBFLASH_OBJS) + OBJS += common-arch_flash.o + ++prefix = /usr/local/ ++sbindir = $(prefix)/sbin ++ + CC = $(CROSS_COMPILE)gcc + + FFSPART_VERSION ?= $(shell ./make_version.sh $(EXE)) diff --git a/SPECS/opal-prd.spec b/SPECS/opal-prd.spec index 91176f8..189662d 100644 --- a/SPECS/opal-prd.spec +++ b/SPECS/opal-prd.spec @@ -1,8 +1,8 @@ %global project skiboot Name: opal-prd -Version: 6.6 -Release: 1%{?dist} +Version: 6.6.3 +Release: 2%{?dist} Summary: OPAL Processor Recovery Diagnostics Daemon Group: System Environment/Daemons @@ -27,6 +27,10 @@ Source0: https://github.com/open-power/%{project}/archive/v%{version}/%{project} Source1: opal-prd-rsyslog Source2: opal-prd-logrotate +Patch0: skiboot-6.6.2-ffspart.patch +# upstream fix +Patch1: opal-prd-6.6.3-8cbd0de88d162e387f11569eee1bdecef8fad2e3.patch + %description This package provides a daemon to load and run the OpenPower firmware's Processor Recovery Diagnostics binary. This is responsible for run time @@ -56,6 +60,8 @@ services to the OS (Linux) on IBM Power and OpenPower systems. 
%prep %setup -q -n %{project}-%{version} +%patch0 -p1 -b .build +%patch1 -p1 -b .8cbd0de88d162e387f11569eee1bdecef8fad2e3 %build OPAL_PRD_VERSION=%{version} make V=1 CC="gcc" CFLAGS="%{build_cflags}" LDFLAGS="%{build_ldflags}" ASFLAGS="-m64 -Wa,--generate-missing-build-notes=yes" -C external/opal-prd @@ -123,6 +129,12 @@ install -m 644 %{SOURCE2} %{buildroot}/%{_sysconfdir}/logrotate.d/opal-prd %{_datadir}/qemu/ %changelog +* Mon Oct 05 2020 Than Ngo - 6.6.3-2 +- Resolves: #1885134, fix which makes the actual page offlining asynchronous + +* Thu Oct 01 2020 Than Ngo - 6.6.3-1 +- Resolves: #1844427, rebase to 6.6.3 + * Fri Apr 24 2020 Than Ngo - 6.6-1 - Resolves: #1779211, rebase to 6.6