Blame SOURCES/0315-ieee1275-support-runtime-memory-claiming.patch

b35c50
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
b35c50
From: Daniel Axtens <dja@axtens.net>
b35c50
Date: Mon, 6 Feb 2023 10:03:22 -0500
b35c50
Subject: [PATCH] ieee1275: support runtime memory claiming
b35c50
b35c50
On powerpc-ieee1275, we are running out of memory trying to verify
b35c50
anything. This is because:
b35c50
b35c50
 - we have to load an entire file into memory to verify it. This is
b35c50
   difficult to change with appended signatures.
b35c50
 - We only have 32MB of heap.
b35c50
 - Distro kernels are now often around 30MB.
b35c50
b35c50
So we want to be able to claim more memory from OpenFirmware for our heap
b35c50
at runtime.
b35c50
b35c50
There are some complications:
b35c50
b35c50
 - The grub mm code isn't the only thing that will make claims on
b35c50
   memory from OpenFirmware:
b35c50
b35c50
    * PFW/SLOF will have claimed some for their own use.
b35c50
b35c50
    * The ieee1275 loader will try to find other bits of memory that we
b35c50
      haven't claimed to place the kernel and initrd when we go to boot.
b35c50
b35c50
    * Once we load Linux, it will also try to claim memory. It claims
b35c50
      memory without any reference to /memory/available, it just starts
b35c50
      at min(top of RMO, 768MB) and works down. So we need to avoid this
b35c50
      area. See arch/powerpc/kernel/prom_init.c as of v5.11.
b35c50
b35c50
 - The smallest amount of memory a ppc64 KVM guest can have is 256MB.
b35c50
   It doesn't work with distro kernels but can work with custom kernels.
b35c50
   We should maintain support for that. (ppc32 can boot with even less,
b35c50
   and we shouldn't break that either.)
b35c50
b35c50
 - Even if a VM has more memory, the memory OpenFirmware makes available
b35c50
   as Real Memory Area can be restricted. Even with our CAS work, an LPAR
b35c50
   on a PowerVM box is likely to have only 512MB available to OpenFirmware
b35c50
   even if it has many gigabytes of memory allocated.
b35c50
b35c50
What should we do?
b35c50
b35c50
We don't know in advance how big the kernel and initrd are going to be,
b35c50
which makes figuring out how much memory we can take a bit tricky.
b35c50
b35c50
To figure out how much memory we should leave unused, I looked at:
b35c50
b35c50
 - an Ubuntu 20.04.1 ppc64le pseries KVM guest:
b35c50
    vmlinux: ~30MB
b35c50
    initrd:  ~50MB
b35c50
b35c50
 - a RHEL8.2 ppc64le pseries KVM guest:
b35c50
    vmlinux: ~30MB
b35c50
    initrd:  ~30MB
b35c50
b35c50
So to give us a little wriggle room, I think we want to leave at least
b35c50
128MB for the loader to put vmlinux and initrd in memory and leave Linux
b35c50
with space to satisfy its early allocations.
b35c50
b35c50
Allow other space to be allocated at runtime.
b35c50
b35c50
Tested-by: Stefan Berger <stefanb@linux.ibm.com>
b35c50
Signed-off-by: Daniel Axtens <dja@axtens.net>
b35c50
(cherry picked from commit a5c710789ccdd27a84ae4a34c7d453bd585e2b66)
b35c50
[rharwood: _start?]
b35c50
---
b35c50
 grub-core/kern/ieee1275/init.c | 270 ++++++++++++++++++++++++++++++++++++++---
b35c50
 docs/grub-dev.texi             |   7 +-
b35c50
 2 files changed, 257 insertions(+), 20 deletions(-)
b35c50
b35c50
diff --git a/grub-core/kern/ieee1275/init.c b/grub-core/kern/ieee1275/init.c
b35c50
index c8d551759d..85af8fa97b 100644
b35c50
--- a/grub-core/kern/ieee1275/init.c
b35c50
+++ b/grub-core/kern/ieee1275/init.c
b35c50
@@ -46,13 +46,26 @@
b35c50
 #endif
b35c50
 #include <grub/lockdown.h>
b35c50
 
b35c50
-/* The maximum heap size we're going to claim */
b35c50
+/* The maximum heap size we're going to claim at boot. Not used by sparc. */
b35c50
 #ifdef __i386__
b35c50
 #define HEAP_MAX_SIZE		(unsigned long) (64 * 1024 * 1024)
b35c50
-#else
b35c50
+#else /* __powerpc__ */
b35c50
 #define HEAP_MAX_SIZE		(unsigned long) (32 * 1024 * 1024)
b35c50
 #endif
b35c50
 
b35c50
+/* RMO max. address at 768 MB */
b35c50
+#define RMO_ADDR_MAX		(grub_uint64_t) (768 * 1024 * 1024)
b35c50
+
b35c50
+/*
b35c50
+ * The amount of OF space we will not claim here so as to leave space for
b35c50
+ * the loader and linux to service early allocations.
b35c50
+ *
b35c50
+ * In 2021, Daniel Axtens claims that we should leave at least 128MB to
b35c50
+ * ensure we can load a stock kernel and initrd on a pseries guest with
b35c50
+ * a 512MB real memory area under PowerVM.
b35c50
+ */
b35c50
+#define RUNTIME_MIN_SPACE (128UL * 1024 * 1024)
b35c50
+
b35c50
 extern char _end[];
b35c50
 
b35c50
 #ifdef __sparc__
b35c50
@@ -147,16 +160,52 @@ grub_claim_heap (void)
b35c50
 				 + GRUB_KERNEL_MACHINE_STACK_SIZE), 0x200000);
b35c50
 }
b35c50
 #else
b35c50
-/* Helper for grub_claim_heap.  */
b35c50
+/* Helpers for mm on powerpc. */
b35c50
+
b35c50
+/*
b35c50
+ * How much memory does OF believe exists in total?
b35c50
+ *
b35c50
+ * This isn't necessarily the true total. It can be the total memory
b35c50
+ * accessible in real mode for a pseries guest, for example.
b35c50
+ */
b35c50
+static grub_uint64_t rmo_top;
b35c50
+
b35c50
 static int
b35c50
-heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
b35c50
-	   void *data)
b35c50
+count_free (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
b35c50
+	    void *data)
b35c50
 {
b35c50
-  unsigned long *total = data;
b35c50
+  if (type != GRUB_MEMORY_AVAILABLE)
b35c50
+    return 0;
b35c50
+
b35c50
+  /* Do not consider memory beyond 4GB */
b35c50
+  if (addr > 0xffffffffULL)
b35c50
+    return 0;
b35c50
+
b35c50
+  if (addr + len > 0xffffffffULL)
b35c50
+    len = 0xffffffffULL - addr;
b35c50
+
b35c50
+  *(grub_uint32_t *) data += len;
b35c50
+
b35c50
+  return 0;
b35c50
+}
b35c50
+
b35c50
+static int
b35c50
+regions_claim (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
b35c50
+	      unsigned int flags, void *data)
b35c50
+{
b35c50
+  grub_uint32_t total = *(grub_uint32_t *) data;
b35c50
+  grub_uint64_t linux_rmo_save;
b35c50
 
b35c50
   if (type != GRUB_MEMORY_AVAILABLE)
b35c50
     return 0;
b35c50
 
b35c50
+  /* Do not consider memory beyond 4GB */
b35c50
+  if (addr > 0xffffffffULL)
b35c50
+    return 0;
b35c50
+
b35c50
+  if (addr + len > 0xffffffffULL)
b35c50
+    len = 0xffffffffULL - addr;
b35c50
+
b35c50
   if (grub_ieee1275_test_flag (GRUB_IEEE1275_FLAG_NO_PRE1_5M_CLAIM))
b35c50
     {
b35c50
       if (addr + len <= 0x180000)
b35c50
@@ -169,10 +218,6 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
b35c50
 	}
b35c50
     }
b35c50
 
b35c50
-  /* Never exceed HEAP_MAX_SIZE  */
b35c50
-  if (*total + len > HEAP_MAX_SIZE)
b35c50
-    len = HEAP_MAX_SIZE - *total;
b35c50
-
b35c50
   /* In theory, firmware should already prevent this from happening by not
b35c50
      listing our own image in /memory/available.  The check below is intended
b35c50
      as a safeguard in case that doesn't happen.  However, it doesn't protect
b35c50
@@ -184,6 +229,108 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
b35c50
       len = 0;
b35c50
     }
b35c50
 
b35c50
+  /*
b35c50
+   * Linux likes to claim memory at min(RMO top, 768MB) and works down
b35c50
+   * without reference to /memory/available. (See prom_init.c::alloc_down)
b35c50
+   *
b35c50
+   * If this block contains min(RMO top, 768MB), do not claim below that for
b35c50
+   * at least a few MB (this is where RTAS, SML and potentially TCEs live).
b35c50
+   *
b35c50
+   * We also need to leave enough space for the DT in the RMA. (See
b35c50
+   * prom_init.c::alloc_up)
b35c50
+   *
b35c50
+   * Finally, we also want to make sure that when grub loads the kernel,
b35c50
+   * it isn't going to use up all the memory we're trying to reserve! So
b35c50
+   * enforce our entire RUNTIME_MIN_SPACE here:
b35c50
+   *
b35c50
+   * |---------- Top of memory ----------|
b35c50
+   * |                                   |
b35c50
+   * |             available             |
b35c50
+   * |                                   |
b35c50
+   * |----------     768 MB    ----------|
b35c50
+   * |                                   |
b35c50
+   * |              reserved             |
b35c50
+   * |                                   |
b35c50
+   * |--- 768 MB - runtime min space  ---|
b35c50
+   * |                                   |
b35c50
+   * |             available             |
b35c50
+   * |                                   |
b35c50
+   * |----------      0 MB     ----------|
b35c50
+   *
b35c50
+   * Edge cases:
b35c50
+   *
b35c50
+   * - Total memory less than RUNTIME_MIN_SPACE: only claim up to HEAP_MAX_SIZE.
b35c50
+   *   (enforced elsewhere)
b35c50
+   *
b35c50
+   * - Total memory between RUNTIME_MIN_SPACE and 768MB:
b35c50
+   *
b35c50
+   * |---------- Top of memory ----------|
b35c50
+   * |                                   |
b35c50
+   * |              reserved             |
b35c50
+   * |                                   |
b35c50
+   * |----  top - runtime min space  ----|
b35c50
+   * |                                   |
b35c50
+   * |             available             |
b35c50
+   * |                                   |
b35c50
+   * |----------      0 MB     ----------|
b35c50
+   *
b35c50
+   * This by itself would not leave us with RUNTIME_MIN_SPACE of free bytes: if
b35c50
+   * rmo_top < 768MB, we will almost certainly have FW claims in the reserved
b35c50
+   * region. We try to address that elsewhere: grub_ieee1275_mm_add_region will
b35c50
+   * not call us if the resulting free space would be less than RUNTIME_MIN_SPACE.
b35c50
+   */
b35c50
+  linux_rmo_save = grub_min (RMO_ADDR_MAX, rmo_top) - RUNTIME_MIN_SPACE;
b35c50
+  if (rmo_top > RUNTIME_MIN_SPACE)
b35c50
+    {
b35c50
+      if (rmo_top <= RMO_ADDR_MAX)
b35c50
+        {
b35c50
+          if (addr > linux_rmo_save)
b35c50
+            {
b35c50
+              grub_dprintf ("ieee1275", "rejecting region in RUNTIME_MIN_SPACE reservation (%llx)\n",
b35c50
+                            addr);
b35c50
+              return 0;
b35c50
+            }
b35c50
+          else if (addr + len > linux_rmo_save)
b35c50
+            {
b35c50
+              grub_dprintf ("ieee1275", "capping region: (%llx -> %llx) -> (%llx -> %llx)\n",
b35c50
+                            addr, addr + len, addr, rmo_top - RUNTIME_MIN_SPACE);
b35c50
+              len = linux_rmo_save - addr;
b35c50
+            }
b35c50
+        }
b35c50
+      else
b35c50
+        {
b35c50
+          /*
b35c50
+           * we order these cases to prefer higher addresses and avoid some
b35c50
+           * splitting issues
b35c50
+           */
b35c50
+          if (addr < RMO_ADDR_MAX && (addr + len) > RMO_ADDR_MAX)
b35c50
+            {
b35c50
+              grub_dprintf ("ieee1275",
b35c50
+                            "adjusting region for RUNTIME_MIN_SPACE: (%llx -> %llx) -> (%llx -> %llx)\n",
b35c50
+                            addr, addr + len, RMO_ADDR_MAX, addr + len);
b35c50
+              len = (addr + len) - RMO_ADDR_MAX;
b35c50
+              addr = RMO_ADDR_MAX;
b35c50
+            }
b35c50
+          else if ((addr < linux_rmo_save) && ((addr + len) > linux_rmo_save))
b35c50
+            {
b35c50
+              grub_dprintf ("ieee1275", "capping region: (%llx -> %llx) -> (%llx -> %llx)\n",
b35c50
+                            addr, addr + len, addr, linux_rmo_save);
b35c50
+              len = linux_rmo_save - addr;
b35c50
+            }
b35c50
+          else if (addr >= linux_rmo_save && (addr + len) <= RMO_ADDR_MAX)
b35c50
+            {
b35c50
+              grub_dprintf ("ieee1275", "rejecting region in RUNTIME_MIN_SPACE reservation (%llx)\n",
b35c50
+                            addr);
b35c50
+              return 0;
b35c50
+            }
b35c50
+        }
b35c50
+    }
b35c50
+  if (flags & GRUB_MM_ADD_REGION_CONSECUTIVE && len < total)
b35c50
+    return 0;
b35c50
+
b35c50
+  if (len > total)
b35c50
+    len = total;
b35c50
+
b35c50
   if (len)
b35c50
     {
b35c50
       grub_err_t err;
b35c50
@@ -192,15 +339,95 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
b35c50
       if (err)
b35c50
 	return err;
b35c50
       grub_mm_init_region ((void *) (grub_addr_t) addr, len);
b35c50
+      total -= len;
b35c50
     }
b35c50
 
b35c50
-  *total += len;
b35c50
-  if (*total >= HEAP_MAX_SIZE)
b35c50
+  *(grub_uint32_t *) data = total;
b35c50
+
b35c50
+  if (total == 0)
b35c50
     return 1;
b35c50
 
b35c50
   return 0;
b35c50
 }
b35c50
 
b35c50
+static int
b35c50
+heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
b35c50
+	   void *data)
b35c50
+{
b35c50
+  return regions_claim (addr, len, type, GRUB_MM_ADD_REGION_NONE, data);
b35c50
+}
b35c50
+
b35c50
+static int
b35c50
+region_claim (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
b35c50
+	   void *data)
b35c50
+{
b35c50
+  return regions_claim (addr, len, type, GRUB_MM_ADD_REGION_CONSECUTIVE, data);
b35c50
+}
b35c50
+
b35c50
+static grub_err_t
b35c50
+grub_ieee1275_mm_add_region (grub_size_t size, unsigned int flags)
b35c50
+{
b35c50
+  grub_uint32_t free_memory = 0;
b35c50
+  grub_uint32_t avail = 0;
b35c50
+  grub_uint32_t total;
b35c50
+
b35c50
+  grub_dprintf ("ieee1275", "mm requested region of size %x, flags %x\n",
b35c50
+               size, flags);
b35c50
+
b35c50
+  /*
b35c50
+   * Update free memory each time, which is a bit inefficient but guards us
b35c50
+   * against a situation where some OF driver goes out to firmware for
b35c50
+   * memory and we don't realise.
b35c50
+   */
b35c50
+  grub_machine_mmap_iterate (count_free, &free_memory);
b35c50
+
b35c50
+  /* Ensure we leave enough space to boot. */
b35c50
+  if (free_memory <= RUNTIME_MIN_SPACE + size)
b35c50
+    {
b35c50
+      grub_dprintf ("ieee1275", "Cannot satisfy allocation and retain minimum runtime space\n");
b35c50
+      return GRUB_ERR_OUT_OF_MEMORY;
b35c50
+    }
b35c50
+
b35c50
+  if (free_memory > RUNTIME_MIN_SPACE)
b35c50
+      avail = free_memory - RUNTIME_MIN_SPACE;
b35c50
+
b35c50
+  grub_dprintf ("ieee1275", "free = 0x%x available = 0x%x\n", free_memory, avail);
b35c50
+
b35c50
+  if (flags & GRUB_MM_ADD_REGION_CONSECUTIVE)
b35c50
+    {
b35c50
+      /* first try rounding up hard for the sake of speed */
b35c50
+      total = grub_max (ALIGN_UP (size, 1024 * 1024) + 1024 * 1024, 32 * 1024 * 1024);
b35c50
+      total = grub_min (avail, total);
b35c50
+
b35c50
+      grub_dprintf ("ieee1275", "looking for %x bytes of memory (%x requested)\n", total, size);
b35c50
+
b35c50
+      grub_machine_mmap_iterate (region_claim, &total);
b35c50
+      grub_dprintf ("ieee1275", "get memory from fw %s\n", total == 0 ? "succeeded" : "failed");
b35c50
+
b35c50
+      if (total != 0)
b35c50
+        {
b35c50
+          total = grub_min (avail, size);
b35c50
+
b35c50
+          grub_dprintf ("ieee1275", "fallback for %x bytes of memory (%x requested)\n", total, size);
b35c50
+
b35c50
+          grub_machine_mmap_iterate (region_claim, &total);
b35c50
+          grub_dprintf ("ieee1275", "fallback from fw %s\n", total == 0 ? "succeeded" : "failed");
b35c50
+        }
b35c50
+    }
b35c50
+  else
b35c50
+    {
b35c50
+      /* claim the requested size as-is (capped at avail); no extra padding for a grub_mm_header_t or region is added here */
b35c50
+      total = grub_min (avail, size);
b35c50
+      grub_machine_mmap_iterate (heap_init, &total);
b35c50
+      grub_dprintf ("ieee1275", "get noncontig memory from fw %s\n", total == 0 ? "succeeded" : "failed");
b35c50
+    }
b35c50
+
b35c50
+  if (total == 0)
b35c50
+    return GRUB_ERR_NONE;
b35c50
+  else
b35c50
+    return GRUB_ERR_OUT_OF_MEMORY;
b35c50
+}
b35c50
+
b35c50
 /*
b35c50
  * How much memory does OF believe it has? (regardless of whether
b35c50
  * it's accessible or not)
b35c50
@@ -356,17 +583,24 @@ grub_ieee1275_ibm_cas (void)
b35c50
 static void
b35c50
 grub_claim_heap (void)
b35c50
 {
b35c50
-  unsigned long total = 0;
b35c50
+  grub_err_t err;
b35c50
+  grub_uint32_t total = HEAP_MAX_SIZE;
b35c50
+
b35c50
+  err = grub_ieee1275_total_mem (&rmo_top);
b35c50
+
b35c50
+  /*
b35c50
+   * If we cannot size the available memory, we can't be sure we're leaving
b35c50
+   * space for the kernel, initrd and things Linux loads early in boot. So only
b35c50
+   * allow further allocations from firmware on success.
b35c50
+   */
b35c50
+  if (err == GRUB_ERR_NONE)
b35c50
+    grub_mm_add_region_fn = grub_ieee1275_mm_add_region;
b35c50
 
b35c50
 #if defined(__powerpc__)
b35c50
   if (grub_ieee1275_test_flag (GRUB_IEEE1275_FLAG_CAN_TRY_CAS_FOR_MORE_MEMORY))
b35c50
     {
b35c50
-      grub_uint64_t rma_size;
b35c50
-      grub_err_t err;
b35c50
-
b35c50
-      err = grub_ieee1275_total_mem (&rma_size);
b35c50
       /* if we have an error, don't call CAS, just hope for the best */
b35c50
-      if (err == GRUB_ERR_NONE && rma_size < (512 * 1024 * 1024))
b35c50
+      if (err == GRUB_ERR_NONE && rmo_top < (512 * 1024 * 1024))
b35c50
 	grub_ieee1275_ibm_cas ();
b35c50
     }
b35c50
 #endif
b35c50
diff --git a/docs/grub-dev.texi b/docs/grub-dev.texi
b35c50
index 7b2455a8fe..7edc5b7e2b 100644
b35c50
--- a/docs/grub-dev.texi
b35c50
+++ b/docs/grub-dev.texi
b35c50
@@ -1047,7 +1047,10 @@ space is limited to 4GiB. GRUB allocates pages from EFI for its heap, at most
b35c50
 1.6 GiB.
b35c50
 
b35c50
 On i386-ieee1275 and powerpc-ieee1275 GRUB uses same stack as IEEE1275.
b35c50
-It allocates at most 32MiB for its heap.
b35c50
+
b35c50
+On i386-ieee1275 and powerpc-ieee1275, GRUB will allocate 32MiB for its heap on
b35c50
+startup. It may allocate more at runtime, as long as at least 128MiB remain free
b35c50
+in OpenFirmware.
b35c50
 
b35c50
 On sparc64-ieee1275 stack is 256KiB and heap is 2MiB.
b35c50
 
b35c50
@@ -1075,7 +1078,7 @@ In short:
b35c50
 @item i386-qemu               @tab 60 KiB  @tab < 4 GiB
b35c50
 @item *-efi                   @tab ?       @tab < 1.6 GiB
b35c50
 @item i386-ieee1275           @tab ?       @tab < 32 MiB
b35c50
-@item powerpc-ieee1275        @tab ?       @tab < 32 MiB
b35c50
+@item powerpc-ieee1275        @tab ?       @tab available memory - 128MiB
b35c50
 @item sparc64-ieee1275        @tab 256KiB  @tab 2 MiB
b35c50
 @item arm-uboot               @tab 256KiB  @tab 2 MiB
b35c50
 @item mips(el)-qemu_mips      @tab 2MiB    @tab 253 MiB