Boot With Small Pages on RISC-V
问题
在 RISC-V 上启动 Linux Kernel 时,内核本身占用的内存页面使用的是大页模式(其中,32 位 CPU 用的是 4MB 模式,64 位 CPU 用的是 2MB 模式,这是由硬件决定的),从内核实现来看,这有助于代码的简洁和效率。
仔细分析后发现,第一个 2MB 物理内存页里面放的是 SBI 程序,实际只使用了一百多 KB,其余空间几乎完全闲置;Linux Kernel 内核本身则放在从第二个物理内存页开始的若干个页面里,实际数量取决于内核大小。
所以目前的问题是,因为项目里所配置的内存容量有限,第一个物理内存页的空闲空间必须充分使用起来。
内核本身改用 4KB 分页
考虑到第一个物理页面的使用率不足,主要还是 2MB 分页引起的,所以解决办法是切换到 4KB 分页模式,提高使用率。
目前 Linux Kernel 里这部分代码实现的是固定的大页模式,没有可供切换的配置选项,所以只能修改内核来实现(基于 5.14 版本的内核)。
1. 以下是修改步骤
这是所用到函数之间的调用关系:
_start @ arch/riscv/kernel/head.S
  _start_kernel
    setup_vm()                <-------------- (*)
    start_kernel()
      setup_arch()
        paging_init()
          setup_vm_final()    <-- (*)
(1) 为了与原代码兼容,添加了一个编译选项 CONFIG_BOOT_WITH_SMALL_PAGES,只有打开这个选项时,才会使用到 4KB 分页;
(2) 预分配了 pmd_t 和 pte_t 内存,其尺寸取决于内核本身的大小(目前暂定 4MB),这是考虑到了项目的实际情况;另外还有 dtb 的占用;
(3) setup_vm() 函数里,因为 4KB 分页的实际情况,跟大页对比下来,需要多建立一级映射:
- 除了原有的 trampoline_pmd,添加了 trampoline_pte;
- 除了原有的 early_pg_dir,添加了 early_kernel_pmd 和 early_kernel_pte;
- 除了原有的 early_dtb_pmd,添加了 early_dtb_pte;
(4) setup_vm_final() 函数里,根据 CONFIG_BOOT_WITH_SMALL_PAGES 的选项,选择相应的物理页面尺寸,去调用后续的函数;
2. 补丁:
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -249,6 +249,13 @@ static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+#define MAX_KERNEL_MEGA_PAGES 4
+#define MAX_EARLY_MAPPING_SIZE (MAX_KERNEL_MEGA_PAGES << PMD_SHIFT)
+static pmd_t early_kernel_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
+static pte_t early_kernel_pte[PTRS_PER_PTE * MAX_KERNEL_MEGA_PAGES] __initdata __aligned(PAGE_SIZE);
+#endif
+
#ifdef CONFIG_XIP_KERNEL
#define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir))
#define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte))
@@ -330,6 +337,11 @@ static pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
static pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+pte_t trampoline_pte[PTRS_PER_PTE * MAX_KERNEL_MEGA_PAGES] __page_aligned_bss;
+static pte_t early_dtb_pte[PTRS_PER_PTE * 2] __initdata __aligned(PAGE_SIZE);
+#endif
+
#ifdef CONFIG_XIP_KERNEL
#define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd))
#define fixmap_pmd ((pmd_t *)XIP_FIXUP(fixmap_pmd))
@@ -563,6 +575,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
#ifndef __PAGETABLE_PMD_FOLDED
pmd_t fix_bmap_spmd, fix_bmap_epmd;
#endif
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+ uintptr_t va, end_va;
+ uintptr_t pmd_offset;
+ pte_t *pte;
+ uintptr_t load_sz;
+#endif
kernel_map.virt_addr = KERNEL_LINK_ADDR;
@@ -587,16 +605,25 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
pfn_base = PFN_DOWN(kernel_map.phys_addr);
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+ map_size = PAGE_SIZE;
+#else
/*
* Enforce boot alignment requirements of RV32 and
* RV64 by only allowing PMD or PGD mappings.
*/
map_size = PMD_SIZE;
+#endif
/* Sanity check alignment and size */
BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
BUG_ON((kernel_map.phys_addr % map_size) != 0);
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+ load_sz = ALIGN(kernel_map.size, PMD_SIZE);
+ BUG_ON(load_sz > MAX_EARLY_MAPPING_SIZE);
+#endif
+
#ifdef CONFIG_64BIT
/*
* The last 4K bytes of the addressable memory can not be mapped because
@@ -622,6 +649,20 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
/* Setup trampoline PGD and PMD */
create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
(uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
+
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+ end_va = kernel_map.virt_addr + load_sz;
+ for (va = kernel_map.virt_addr; va < end_va; va += map_size) {
+ pmd_offset = (va - kernel_map.virt_addr) >> PMD_SHIFT;
+ pte = &trampoline_pte[PTRS_PER_PTE * pmd_offset];
+
+ create_pmd_mapping(trampoline_pmd, va,
+ (uintptr_t)pte, PMD_SIZE, PAGE_TABLE);
+ create_pte_mapping(pte, va,
+ kernel_map.phys_addr + (va - kernel_map.virt_addr),
+ map_size, PAGE_KERNEL_EXEC);
+ }
+#else
#ifdef CONFIG_XIP_KERNEL
create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
@@ -629,6 +670,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
kernel_map.phys_addr, PMD_SIZE, PAGE_KERNEL_EXEC);
#endif
+#endif
#else
/* Setup trampoline PGD */
create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
@@ -640,7 +682,23 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
* us to reach paging_init(). We map all memory banks later
* in setup_vm_final() below.
*/
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+ end_va = kernel_map.virt_addr + load_sz;
+ for (va = kernel_map.virt_addr; va < end_va; va += map_size) {
+ pmd_offset = (va - kernel_map.virt_addr) >> PMD_SHIFT;
+ pte = &early_kernel_pte[PTRS_PER_PTE * pmd_offset];
+
+ create_pgd_mapping(early_pg_dir, va,
+ (uintptr_t)early_kernel_pmd, PGDIR_SIZE, PAGE_TABLE);
+ create_pmd_mapping(early_kernel_pmd, va,
+ (uintptr_t)pte, PMD_SIZE, PAGE_TABLE);
+ create_pte_mapping(pte, va,
+ kernel_map.phys_addr + (va - kernel_map.virt_addr),
+ map_size, PAGE_KERNEL_EXEC);
+ }
+#else
create_kernel_page_table(early_pg_dir, map_size, true);
+#endif
#ifndef __PAGETABLE_PMD_FOLDED
/* Setup early PMD for DTB */
@@ -649,10 +707,24 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
#ifndef CONFIG_BUILTIN_DTB
/* Create two consecutive PMD mappings for FDT early scan */
pa = dtb_pa & ~(PMD_SIZE - 1);
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+ end_va = DTB_EARLY_BASE_VA + PMD_SIZE * 2;
+ for (va = DTB_EARLY_BASE_VA; va < end_va; va += map_size) {
+ pmd_offset = (va - DTB_EARLY_BASE_VA) >> PMD_SHIFT;
+ pte = &early_dtb_pte[PTRS_PER_PTE * pmd_offset];
+
+ create_pmd_mapping(early_dtb_pmd, va,
+ (uintptr_t)pte, PMD_SIZE, PAGE_TABLE);
+ create_pte_mapping(pte, va,
+ pa + (va - DTB_EARLY_BASE_VA),
+ map_size, PAGE_KERNEL);
+ }
+#else
create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA,
pa, PMD_SIZE, PAGE_KERNEL);
create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE,
pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL);
+#endif
dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1));
#else /* CONFIG_BUILTIN_DTB */
#ifdef CONFIG_64BIT
@@ -722,6 +794,12 @@ static void __init setup_vm_final(void)
phys_addr_t pa, start, end;
u64 i;
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+ pr_notice("%s: boot with small pages option on\n", __func__);
+#else
+ pr_notice("%s: boot with small pages option off\n", __func__);
+#endif
+
/**
* MMU is enabled at this point. But page table setup is not complete yet.
* fixmap page table alloc functions should be used at this point
@@ -757,8 +835,13 @@ static void __init setup_vm_final(void)
}
#ifdef CONFIG_64BIT
+#ifdef CONFIG_BOOT_WITH_SMALL_PAGES
+ map_size = PAGE_SIZE;
+#else
+ map_size = PMD_SIZE;
+#endif
/* Map the kernel */
- create_kernel_page_table(swapper_pg_dir, PMD_SIZE, false);
+ create_kernel_page_table(swapper_pg_dir, map_size, false);
#endif
/* Clear fixmap PTE and PMD mappings */
diff --git a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2021,6 +2021,11 @@ config PROFILING
config TRACEPOINTS
bool
+config BOOT_WITH_SMALL_PAGES
+ bool "Allow booting with small pages"
+ default n
+ depends on RISCV
+
endmenu # General setup
source "arch/Kconfig"