Boot With Small Pages on RISC-V

问题

在 RISC-V 上启动 Linux Kernel 时,内核本身占用的内存页面使用的是大页模式(其中,32 位 CPU 用的是 4MB 模式,64 位 CPU 用的是 2MB 模式,这是由硬件决定的),从内核实现来看,这有助于代码的简洁和效率。

仔细分析后发现,第一个 2MB 物理内存页里面放的是 SBI 程序,实际只使用了一百多 KB,其余空间几乎完全闲置;Linux Kernel 本身则放在从第二个大页开始的若干个物理内存页里,实际数量取决于内核大小。

所以目前的问题是,因为项目里所配置的内存容量有限,第一个物理内存页的空闲空间必须充分使用起来。

内核本身改用 4KB 分页

考虑到第一个物理页面的使用率不足,主要还是 2MB 分页引起的,所以解决办法是切换到 4KB 分页模式,提高使用率。

目前 Linux Kernel 里这部分代码实现的是固定的大页模式,没有可供切换的配置选项,所以只能修改内核来实现(基于 5.14 版本的内核)。

1. 以下是修改步骤

这是所用到函数之间的调用关系:

_start @ arch/riscv/kernel/head.S
    _start_kernel
        setup_vm()  <-------------- (*)
        start_kernel()
          setup_arch()
            paging_init()
              setup_vm_final()  <-- (*)

(1) 为了与原代码兼容,添加了一个编译选项 CONFIG_BOOT_WITH_SMALL_PAGES,只有打开这个选项时,才会使用到 4KB 分页;

(2) 预分配了 pmd_t、pte_t 页表内存,其尺寸取决于内核本身的大小(目前暂定最多 4 个大页,即补丁中的 MAX_KERNEL_MEGA_PAGES = 4,按 64 位的 2MB 大页计算为 8MB),这是考虑到了项目的实际情况;另外还有 dtb 的占用;

(3) setup_vm() 函数里,因为 4KB 分页的实际情况,跟大页对比下来,需要多建立一级映射:

  • 除了原有的 trampoline_pmd,添加了 trampoline_pte
  • 除了原有的 early_pg_dir,添加了 early_kernel_pmd、early_kernel_pte
  • 除了原有的 early_dtb_pmd,添加了 early_dtb_pte

(4) setup_vm_final() 函数里,根据 CONFIG_BOOT_WITH_SMALL_PAGES 选项选择相应的映射页面尺寸(打开时为 PAGE_SIZE,否则为 PMD_SIZE),再以此调用 create_kernel_page_table() 映射内核;

2. 补丁:

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -249,6 +249,13 @@ static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;

 pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);

+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+#define MAX_KERNEL_MEGA_PAGES   4
+#define MAX_EARLY_MAPPING_SIZE  (MAX_KERNEL_MEGA_PAGES << PMD_SHIFT)
+static pmd_t early_kernel_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
+static pte_t early_kernel_pte[PTRS_PER_PTE * MAX_KERNEL_MEGA_PAGES] __initdata __aligned(PAGE_SIZE);
+#endif
+
 #ifdef CONFIG_XIP_KERNEL
 #define trampoline_pg_dir      ((pgd_t *)XIP_FIXUP(trampoline_pg_dir))
 #define fixmap_pte             ((pte_t *)XIP_FIXUP(fixmap_pte))
@@ -330,6 +337,11 @@ static pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
 static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
 static pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);

+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+pte_t trampoline_pte[PTRS_PER_PTE * MAX_KERNEL_MEGA_PAGES] __page_aligned_bss;
+static pte_t early_dtb_pte[PTRS_PER_PTE * 2] __initdata __aligned(PAGE_SIZE);
+#endif
+
 #ifdef CONFIG_XIP_KERNEL
 #define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd))
 #define fixmap_pmd     ((pmd_t *)XIP_FIXUP(fixmap_pmd))
@@ -563,6 +575,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 #ifndef __PAGETABLE_PMD_FOLDED
    pmd_t fix_bmap_spmd, fix_bmap_epmd;
 #endif
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+   uintptr_t va, end_va;
+   uintptr_t pmd_offset;
+   pte_t *pte;
+   uintptr_t load_sz;
+#endif

    kernel_map.virt_addr = KERNEL_LINK_ADDR;

@@ -587,16 +605,25 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)

    pfn_base = PFN_DOWN(kernel_map.phys_addr);

+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+   map_size = PAGE_SIZE;
+#else
    /*
     * Enforce boot alignment requirements of RV32 and
     * RV64 by only allowing PMD or PGD mappings.
     */
    map_size = PMD_SIZE;
+#endif

    /* Sanity check alignment and size */
    BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
    BUG_ON((kernel_map.phys_addr % map_size) != 0);

+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+   load_sz = ALIGN(kernel_map.size, PMD_SIZE);
+   BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);
+#endif
+
 #ifdef CONFIG_64BIT
    /*
     * The last 4K bytes of the addressable memory can not be mapped because
@@ -622,6 +649,20 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
    /* Setup trampoline PGD and PMD */
    create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
               (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
+
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+   end_va = kernel_map.virt_addr + load_sz;
+   for (va = kernel_map.virt_addr; va < end_va; va += map_size) {
+       pmd_offset = (va - kernel_map.virt_addr) >> PMD_SHIFT;
+       pte = &trampoline_pte[PTRS_PER_PTE * pmd_offset];
+
+       create_pmd_mapping(trampoline_pmd, va,
+                  (uintptr_t)pte, PMD_SIZE, PAGE_TABLE);
+       create_pte_mapping(pte, va,
+                  kernel_map.phys_addr + (va - kernel_map.virt_addr),
+                  map_size, PAGE_KERNEL_EXEC);
+   }
+#else
 #ifdef CONFIG_XIP_KERNEL
    create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
               kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
@@ -629,6 +670,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
    create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
               kernel_map.phys_addr, PMD_SIZE, PAGE_KERNEL_EXEC);
 #endif
+#endif
 #else
    /* Setup trampoline PGD */
    create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
@@ -640,7 +682,23 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
     * us to reach paging_init(). We map all memory banks later
     * in setup_vm_final() below.
     */
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+   end_va = kernel_map.virt_addr + load_sz;
+   for (va = kernel_map.virt_addr; va < end_va; va += map_size) {
+       pmd_offset = (va - kernel_map.virt_addr) >> PMD_SHIFT;
+       pte = &early_kernel_pte[PTRS_PER_PTE * pmd_offset];
+
+       create_pgd_mapping(early_pg_dir, va,
+                  (uintptr_t)early_kernel_pmd, PGDIR_SIZE, PAGE_TABLE);
+       create_pmd_mapping(early_kernel_pmd, va,
+                  (uintptr_t)pte, PMD_SIZE, PAGE_TABLE);
+       create_pte_mapping(pte, va,
+                  kernel_map.phys_addr + (va - kernel_map.virt_addr),
+                  map_size, PAGE_KERNEL_EXEC);
+   }
+#else
    create_kernel_page_table(early_pg_dir, map_size, true);
+#endif

 #ifndef __PAGETABLE_PMD_FOLDED
    /* Setup early PMD for DTB */
@@ -649,10 +707,24 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 #ifndef CONFIG_BUILTIN_DTB
    /* Create two consecutive PMD mappings for FDT early scan */
    pa = dtb_pa & ~(PMD_SIZE - 1);
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+   end_va = DTB_EARLY_BASE_VA + PMD_SIZE * 2;
+   for (va = DTB_EARLY_BASE_VA; va < end_va; va += map_size) {
+       pmd_offset = (va - DTB_EARLY_BASE_VA) >> PMD_SHIFT;
+       pte = &early_dtb_pte[PTRS_PER_PTE * pmd_offset];
+
+       create_pmd_mapping(early_dtb_pmd, va,
+                  (uintptr_t)pte, PMD_SIZE, PAGE_TABLE);
+       create_pte_mapping(pte, va,
+                  pa + (va - DTB_EARLY_BASE_VA),
+                  map_size, PAGE_KERNEL);
+   }
+#else
    create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA,
               pa, PMD_SIZE, PAGE_KERNEL);
    create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE,
               pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL);
+#endif
    dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1));
 #else /* CONFIG_BUILTIN_DTB */
 #ifdef CONFIG_64BIT
@@ -722,6 +794,12 @@ static void __init setup_vm_final(void)
    phys_addr_t pa, start, end;
    u64 i;

+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+   pr_notice("%s: boot with small pages option on\n", __func__);
+#else
+   pr_notice("%s: boot with small pages option off\n", __func__);
+#endif
+
    /**
     * MMU is enabled at this point. But page table setup is not complete yet.
     * fixmap page table alloc functions should be used at this point
@@ -757,8 +835,13 @@ static void __init setup_vm_final(void)
    }

 #ifdef CONFIG_64BIT
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+   map_size = PAGE_SIZE;
+#else
+   map_size = PMD_SIZE;
+#endif
    /* Map the kernel */
-   create_kernel_page_table(swapper_pg_dir, PMD_SIZE, false);
+   create_kernel_page_table(swapper_pg_dir, map_size, false);
 #endif

    /* Clear fixmap PTE and PMD mappings */
diff --git a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2021,6 +2021,11 @@ config PROFILING
 config TRACEPOINTS
    bool

+config BOOT_WITH_SMALL_PAGES
+   bool "Allow booting with small pages"
+   default n
+   depends on RISCV
+
 endmenu        # General setup

 source "arch/Kconfig"

Read More: