|
David Johansen |
447ac5 |
commit e4d27840e173491ab29c2d97017da9344e2c2526
|
|
David Johansen |
447ac5 |
Author: lvying <lvying6@huawei.com>
|
|
David Johansen |
447ac5 |
Date: Sat Oct 31 17:57:14 2020 +0800
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
ras-page-isolation: do_page_offline always considers page offline was successful
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
do_page_offline always considers the page offline successful even if
|
|
David Johansen |
447ac5 |
kernel soft/hard offline page failed.
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
Calling rasdaemon with:
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
/etc/sysconfig/rasdaemon PAGE_CE_THRESHOLD="1"
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
i.e. when a Corrected Error occurs at a page's address, rasdaemon should
|
|
David Johansen |
447ac5 |
trigger this page soft offline.
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
However, after adding a livepatch into kernel's
|
|
David Johansen |
447ac5 |
store_soft_offline_page to observe this function's return value,
|
|
David Johansen |
447ac5 |
when injecting a CE into address 0x3f7ec30000, the Kernel
|
|
David Johansen |
447ac5 |
log reports:
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
soft_offline: 0x3f7ec30: unknown non LRU page type ffffe0000000000 ()
|
|
David Johansen |
447ac5 |
[store_soft_offline_page]return from soft_offline_page: -5
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
While rasdaemon log reports:
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
rasdaemon[73711]: cpu 00:rasdaemon: Corrected Errors at 0x3f7ec30000 exceed threshold
|
|
David Johansen |
447ac5 |
rasdaemon[73711]: rasdaemon: Result of offlining page at 0x3f7ec30000: offlined
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
using strace to record rasdaemon's system call, it reports:
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
strace -p 73711
|
|
David Johansen |
447ac5 |
openat(AT_FDCWD, "/sys/devices/system/memory/soft_offline_page",
|
|
David Johansen |
447ac5 |
O_WRONLY|O_CREAT|O_TRUNC, 0666) = 28
|
|
David Johansen |
447ac5 |
fstat(28, {st_mode=S_IFREG|0200, st_size=4096, ...}) = 0
|
|
David Johansen |
447ac5 |
write(28, "0x3f7ec30000", 12) = -1 EIO (Input/output error)
|
|
David Johansen |
447ac5 |
close(28) = 0
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
So the kernel's soft offline of pfn 0x3f7ec30 actually failed and
|
|
David Johansen |
447ac5 |
store_soft_offline_page returned -EIO. However, rasdaemon always
|
|
David Johansen |
447ac5 |
considers the page offline to be successful.
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
According to the strace output, ferror was unable to detect the
|
|
David Johansen |
447ac5 |
failure of the write syscall.
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
This patch changes the fopen-fprintf-ferror-fclose sequence to use
|
|
David Johansen |
447ac5 |
lower-level I/O instead, namely open-write-close, which
|
|
David Johansen |
447ac5 |
can detect such syscall failure.
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
Signed-off-by: lvying <lvying6@huawei.com>
|
|
David Johansen |
447ac5 |
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
diff --git a/ras-page-isolation.c b/ras-page-isolation.c
|
|
David Johansen |
447ac5 |
index 50e4406..dc07545 100644
|
|
David Johansen |
447ac5 |
--- a/ras-page-isolation.c
|
|
David Johansen |
447ac5 |
+++ b/ras-page-isolation.c
|
|
David Johansen |
447ac5 |
@@ -17,6 +17,9 @@
|
|
David Johansen |
447ac5 |
#include <stdlib.h>
|
|
David Johansen |
447ac5 |
#include <string.h>
|
|
David Johansen |
447ac5 |
#include <unistd.h>
|
|
David Johansen |
447ac5 |
+#include <sys/stat.h>
|
|
David Johansen |
447ac5 |
+#include <fcntl.h>
|
|
David Johansen |
447ac5 |
+#include <errno.h>
|
|
David Johansen |
447ac5 |
#include "ras-logger.h"
|
|
David Johansen |
447ac5 |
#include "ras-page-isolation.h"
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
@@ -210,18 +213,22 @@ void ras_page_account_init(void)
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
static int do_page_offline(unsigned long long addr, enum otype type)
|
|
David Johansen |
447ac5 |
{
|
|
David Johansen |
447ac5 |
- FILE *offline_file;
|
|
David Johansen |
447ac5 |
- int err;
|
|
David Johansen |
447ac5 |
+ int fd, rc;
|
|
David Johansen |
447ac5 |
+ char buf[20];
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
- offline_file = fopen(kernel_offline[type], "w");
|
|
David Johansen |
447ac5 |
- if (!offline_file)
|
|
David Johansen |
447ac5 |
+ fd = open(kernel_offline[type], O_WRONLY);
|
|
David Johansen |
447ac5 |
+ if (fd == -1) {
|
|
David Johansen |
447ac5 |
+ log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, kernel_offline[type]);
|
|
David Johansen |
447ac5 |
return -1;
|
|
David Johansen |
447ac5 |
+ }
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
- fprintf(offline_file, "%#llx", addr);
|
|
David Johansen |
447ac5 |
- err = ferror(offline_file) ? -1 : 0;
|
|
David Johansen |
447ac5 |
- fclose(offline_file);
|
|
David Johansen |
447ac5 |
-
|
|
David Johansen |
447ac5 |
- return err;
|
|
David Johansen |
447ac5 |
+ sprintf(buf, "%#llx", addr);
|
|
David Johansen |
447ac5 |
+ rc = write(fd, buf, strlen(buf));
|
|
David Johansen |
447ac5 |
+ if (rc < 0) {
|
|
David Johansen |
447ac5 |
+ log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno);
|
|
David Johansen |
447ac5 |
+ }
|
|
David Johansen |
447ac5 |
+ close(fd);
|
|
David Johansen |
447ac5 |
+ return rc;
|
|
David Johansen |
447ac5 |
}
|
|
David Johansen |
447ac5 |
|
|
David Johansen |
447ac5 |
static void page_offline(struct page_record *pr)
|