Blame SOURCES/e4d27840e173491ab29c2d97017da9344e2c2526.patch

David Johansen 447ac5
commit e4d27840e173491ab29c2d97017da9344e2c2526
David Johansen 447ac5
Author: lvying <lvying6@huawei.com>
David Johansen 447ac5
Date:   Sat Oct 31 17:57:14 2020 +0800
David Johansen 447ac5
David Johansen 447ac5
    ras-page-isolation: do_page_offline always considers page offline was successful
David Johansen 447ac5
    
David Johansen 447ac5
    do_page_offline always considers the page offline successful even if
David Johansen 447ac5
    the kernel's soft/hard page offline failed.
David Johansen 447ac5
    
David Johansen 447ac5
    Calling rasdaemon with:
David Johansen 447ac5
    
David Johansen 447ac5
    	/etc/sysconfig/rasdaemon PAGE_CE_THRESHOLD="1"
David Johansen 447ac5
    
David Johansen 447ac5
    i.e. when a Corrected Error occurs at a page's address, rasdaemon should
David Johansen 447ac5
    trigger a soft offline of this page.
David Johansen 447ac5
    
David Johansen 447ac5
    However, after adding a livepatch into kernel's
David Johansen 447ac5
    store_soft_offline_page to observe this function's return value,
David Johansen 447ac5
    when injecting a CE into address 0x3f7ec30000, the Kernel
David Johansen 447ac5
    log reports:
David Johansen 447ac5
    
David Johansen 447ac5
    	soft_offline: 0x3f7ec30: unknown non LRU page type ffffe0000000000 ()
David Johansen 447ac5
    	[store_soft_offline_page]return from soft_offline_page: -5
David Johansen 447ac5
    
David Johansen 447ac5
    While rasdaemon log reports:
David Johansen 447ac5
    
David Johansen 447ac5
    	rasdaemon[73711]: cpu 00:rasdaemon: Corrected Errors at 0x3f7ec30000 exceed threshold
David Johansen 447ac5
    	rasdaemon[73711]: rasdaemon: Result of offlining page at 0x3f7ec30000: offlined
David Johansen 447ac5
    
David Johansen 447ac5
    using strace to record rasdaemon's system call, it reports:
David Johansen 447ac5
    
David Johansen 447ac5
    	strace -p 73711
David Johansen 447ac5
    	openat(AT_FDCWD, "/sys/devices/system/memory/soft_offline_page",
David Johansen 447ac5
    	       O_WRONLY|O_CREAT|O_TRUNC, 0666) = 28
David Johansen 447ac5
    	fstat(28, {st_mode=S_IFREG|0200, st_size=4096, ...}) = 0
David Johansen 447ac5
    	write(28, "0x3f7ec30000", 12)           = -1 EIO (Input/output error)
David Johansen 447ac5
    	close(28)                               = 0
David Johansen 447ac5
    
David Johansen 447ac5
    So, the kernel's soft offline of pfn 0x3f7ec30 actually failed and
David Johansen 447ac5
    store_soft_offline_page returned -EIO. However, rasdaemon always
David Johansen 447ac5
    considers the page offline to be successful.
David Johansen 447ac5
    
David Johansen 447ac5
    According to the strace output, ferror was unable to detect the
David Johansen 447ac5
    failure of the write syscall.
David Johansen 447ac5
    
David Johansen 447ac5
    This patch changes the fopen-fprintf-ferror-fclose process to use
David Johansen 447ac5
    the lower I/O level, using open-write-close instead, which
David Johansen 447ac5
    can detect such syscall failure.
David Johansen 447ac5
    
David Johansen 447ac5
    Signed-off-by: lvying <lvying6@huawei.com>
David Johansen 447ac5
    Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
David Johansen 447ac5
David Johansen 447ac5
diff --git a/ras-page-isolation.c b/ras-page-isolation.c
David Johansen 447ac5
index 50e4406..dc07545 100644
David Johansen 447ac5
--- a/ras-page-isolation.c
David Johansen 447ac5
+++ b/ras-page-isolation.c
David Johansen 447ac5
@@ -17,6 +17,9 @@
David Johansen 447ac5
 #include <stdlib.h>
David Johansen 447ac5
 #include <string.h>
David Johansen 447ac5
 #include <unistd.h>
David Johansen 447ac5
+#include <sys/stat.h>
David Johansen 447ac5
+#include <fcntl.h>
David Johansen 447ac5
+#include <errno.h>
David Johansen 447ac5
 #include "ras-logger.h"
David Johansen 447ac5
 #include "ras-page-isolation.h"
David Johansen 447ac5
 
David Johansen 447ac5
@@ -210,18 +213,22 @@ void ras_page_account_init(void)
David Johansen 447ac5
 
David Johansen 447ac5
 static int do_page_offline(unsigned long long addr, enum otype type)
David Johansen 447ac5
 {
David Johansen 447ac5
-	FILE *offline_file;
David Johansen 447ac5
-	int err;
David Johansen 447ac5
+	int fd, rc;
David Johansen 447ac5
+	char buf[20];
David Johansen 447ac5
 
David Johansen 447ac5
-	offline_file = fopen(kernel_offline[type], "w");
David Johansen 447ac5
-	if (!offline_file)
David Johansen 447ac5
+	fd = open(kernel_offline[type], O_WRONLY);
David Johansen 447ac5
+	if (fd == -1) {
David Johansen 447ac5
+		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, kernel_offline[type]);
David Johansen 447ac5
 		return -1;
David Johansen 447ac5
+	}
David Johansen 447ac5
 
David Johansen 447ac5
-	fprintf(offline_file, "%#llx", addr);
David Johansen 447ac5
-	err = ferror(offline_file) ? -1 : 0;
David Johansen 447ac5
-	fclose(offline_file);
David Johansen 447ac5
-
David Johansen 447ac5
-	return err;
David Johansen 447ac5
+	sprintf(buf, "%#llx", addr);
David Johansen 447ac5
+	rc = write(fd, buf, strlen(buf));
David Johansen 447ac5
+	if (rc < 0) {
David Johansen 447ac5
+		log(TERM, LOG_ERR, "page offline addr(%s) by %s failed, errno:%d\n", buf, kernel_offline[type], errno);
David Johansen 447ac5
+	}
David Johansen 447ac5
+	close(fd);
David Johansen 447ac5
+	return rc;
David Johansen 447ac5
 }
David Johansen 447ac5
 
David Johansen 447ac5
 static void page_offline(struct page_record *pr)