Kernel Pwn 学习之路（五）

admin

0
文章

0
评论

2023-11-30 17:41:42 AnQuanKeInfo 来源：ZONE.CI 全球网 0 阅读模式

0x01 前言

由于关于Kernel安全的文章实在过于繁杂，本文有部分内容大篇幅或全文引用了参考文献，若出现此情况的，将在相关内容的开头予以说明，部分引用参考文献的将在文件结尾的参考链接中注明。

Kernel的相关知识以及一些实例在Kernel中的利用已经在Kernel Pwn 学习之路(一)(二)给予了说明

Kernel中内存管理的相关知识已经在Kernel Pwn 学习之路(三)给予了说明

本文主要接续Kernel Pwn 学习之路(四)，继续研究内核中断的相关机制。本文涉及到的所有Linux Kernel相关代码均基于5.6.2版本。

限于篇幅的原因，本文仅介绍了异常中断前处理，下一篇文章将深入中断服务函数，介绍其内部实现~

【传送门】：Kernel Pwn 学习之路(一)

【传送门】：Kernel Pwn 学习之路(二)

【传送门】：Kernel Pwn 学习之路(三)

【传送门】：Kernel Pwn 学习之路(四)

0x02 通用内核代码中的IDT相关处理

在上一篇文章的分析中，处理机进入了保护模式以及长模式，在平台相关代码中完成了IDT的初始化。在那之后流程将转移到通用内核代码，接下来我们进行分析通用内核代码中的IDT相关处理代码。

入口函数start_kernel()在/source/init/main.c中实现(这里省略了不分析的函数调用)，这个函数将完成内核以pid = 1运行第一个init进程之前的所有初始化工作。

asmlinkage __visible void __init start_kernel(void)
{
    char *command_line;
    char *after_dashes;

    ......

    local_irq_disable(); // Line 12
    early_boot_irqs_disabled = true;

    /*
     * Interrupts are still disabled. Do necessary setups, then
     * enable them.
     */

    ......

    setup_arch(&command_line);  // Line 23

    ......

    boot_init_stack_canary();  // Line 123

    ......

    early_boot_irqs_disabled = false;
    local_irq_enable();   // Line 133

    ......

}

为中断栈设置`Stack Canary`

在start_kernel()的line 123调用了boot_init_stack_canary()来设置canary值来缓解中断栈溢出。

此函数在/source/arch/x86/include/asm/stackprotector.h#L61处实现

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * GCC stack protector support.
 *
 * Stack protector works by putting predefined pattern at the start of
 * the stack frame and verifying that it hasn't been overwritten when
 * returning from the function.  The pattern is called stack canary
 * and unfortunately gcc requires it to be at a fixed offset from %gs.
 * On x86_64, the offset is 40 bytes and on x86_32 20 bytes.  x86_64
 * and x86_32 use segment registers differently and thus handles this
 * requirement differently.
 *
 * On x86_64, %gs is shared by percpu area and stack canary.  All
 * percpu symbols are zero based and %gs points to the base of percpu
 * area.  The first occupant of the percpu area is always
 * fixed_percpu_data which contains stack_canary at offset 40.  Userland
 * %gs is always saved and restored on kernel entry and exit using
 * swapgs, so stack protector doesn't add any complexity there.
 *
 * On x86_32, it's slightly more complicated.  As in x86_64, %gs is
 * used for userland TLS.  Unfortunately, some processors are much
 * slower at loading segment registers with different value when
 * entering and leaving the kernel, so the kernel uses %fs for percpu
 * area and manages %gs lazily so that %gs is switched only when
 * necessary, usually during task switch.
 *
 * As gcc requires the stack canary at %gs:20, %gs can't be managed
 * lazily if stack protector is enabled, so the kernel saves and
 * restores userland %gs on kernel entry and exit.  This behavior is
 * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
 * system.h to hide the details.
 */

#ifndef _ASM_STACKPROTECTOR_H
#define _ASM_STACKPROTECTOR_H 1

#ifdef CONFIG_STACKPROTECTOR

#include <asm/tsc.h>
#include <asm/processor.h>
#include <asm/percpu.h>
#include <asm/desc.h>

#include <linux/random.h>
#include <linux/sched.h>

/*
 * 24 byte read-only segment initializer for stack canary.  Linker
 * can't handle the address bit shifting.  Address will be set in
 * head_32 for boot CPU and setup_per_cpu_areas() for others.
 *
 * The macro expands to a single designated initializer (including the
 * trailing comma) for use inside an array initializer list, so the
 * definition must continue onto the next line with a backslash.
 */
#define GDT_STACK_CANARY_INIT \
    [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18),

/*
 * Initialize the stackprotector canary value.
 *
 * NOTE: this must only be called from functions that never return,
 * and it must always be inlined.
 */
static __always_inline void boot_init_stack_canary(void)
{
    u64 canary;
    u64 tsc;

/* 
 * 如果设置了内核配置选项 CONFIG_X86_64 ，那么一开始将检查结构体 fixed_percpu_data 的状态
 * 这个结构体代表了 per-cpu 中断栈，其与 stack_canary 值中间有 40 个字节的 offset
 */
#ifdef CONFIG_X86_64
    BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
#endif
    /*
     * We both use the random pool and the current TSC as a source
     * of randomness. The TSC only matters for very early init,
     * there it already has some randomness on most systems. Later
     * on during the bootup the random pool has true entropy too.
     * 使用随机数和时戳计数器计算新的 canary 值
     */
    get_random_bytes(&canary, sizeof(canary));
    tsc = rdtsc();
    canary += tsc + (tsc << 32UL);
    canary &= CANARY_MASK;

    current->stack_canary = canary;
#ifdef CONFIG_X86_64
    // 通过 this_cpu_write 宏将 canary 值写入了 fixed_percpu_data 中:
    this_cpu_write(fixed_percpu_data.stack_canary, canary);
#else
    this_cpu_write(stack_canary.canary, canary);
#endif
}
......
#else    /* STACKPROTECTOR */
......
#endif    /* _ASM_STACKPROTECTOR_H */

它的实现取决于 CONFIG_STACKPROTECTOR 这个内核配置选项。如果该选项没有置位，那该函数将是一个空函数。

禁用/启用本地中断

在start_kernel()的line 12调用了local_irq_disable()来禁用本地中断。

在start_kernel()的line 133调用了local_irq_enable()来启用本地中断。

local_irq_enable()是一个宏定义，它定义在/source/include/linux/irqflags.h#L109

local_irq_disable()是一个宏定义，它定义在/source/include/linux/irqflags.h#L111

/*
 * The local_irq_*() APIs are equal to the raw_local_irq*()
 * if !TRACE_IRQFLAGS.
 */
#ifdef CONFIG_TRACE_IRQFLAGS
/*
 * Tracing variants: enabling reports the state change to the tracer
 * before interrupts are actually switched on, disabling switches them
 * off first and reports afterwards.  Each body is a do { } while (0)
 * statement, so the definition must continue onto the next line with
 * a backslash.
 */
#define local_irq_enable() \
    do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
#define local_irq_disable() \
    do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)

......

#else /* !CONFIG_TRACE_IRQFLAGS */

#define local_irq_enable()    do { raw_local_irq_enable(); } while (0)
#define local_irq_disable()    do { raw_local_irq_disable(); } while (0)

......

#endif /* CONFIG_TRACE_IRQFLAGS */

当 CONFIG_TRACE_IRQFLAGS 选项置位时， local_irq_* 宏将同时调用 trace_hardirqs_* 函数。在Linux死锁检测模块lockdep中有一项功能 irq-flags tracing，它可以追踪 hardirq 和 softirq 的状态。在这种情况下， lockdep 死锁检测模块可以提供系统中关于硬/软中断的开/关事件的相关信息。

函数 trace_hardirqs_* 的定义位于/source/kernel/trace/trace_preemptirq.c#L22

void trace_hardirqs_on(void)
{
    if (this_cpu_read(tracing_irq_cpu)) {
        if (!in_nmi())
            trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
        tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
        this_cpu_write(tracing_irq_cpu, 0);
    }

    lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);

void trace_hardirqs_off(void)
{
    if (!this_cpu_read(tracing_irq_cpu)) {
        this_cpu_write(tracing_irq_cpu, 1);
        tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
        if (!in_nmi())
            trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
    }

    lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);

可见它们除了在 tracing_irq_cpu 状态发生变化时触发相应的跟踪点之外，最后都会调用 lockdep_hardirqs_* 函数。以 lockdep_hardirqs_off 为例，该函数检查了当前进程的 hardirqs_enabled 域，如果本次 local_irq_disable 调用是冗余的话，便使 redundant_hardirqs_off 域的值增加，否则便使 hardirqs_off_events 域的值增加。这两个域以及其它与死锁检测模块 lockdep 统计相关的域定义在/source/kernel/locking/lockdep_internals.h#L168处的 lockdep_stats 结构体中:

/*
 * Various lockdep statistics.
 * We want them per cpu as they are often accessed in fast path
 * and we want to avoid too much cache bouncing.
 */
struct lockdep_stats {
    unsigned long  chain_lookup_hits;
    unsigned int   chain_lookup_misses;
    unsigned long  hardirqs_on_events;
    unsigned long  hardirqs_off_events;
    unsigned long  redundant_hardirqs_on;
    unsigned long  redundant_hardirqs_off;
    unsigned long  softirqs_on_events;
    unsigned long  softirqs_off_events;
    unsigned long  redundant_softirqs_on;
    unsigned long  redundant_softirqs_off;
    int            nr_unused_locks;
    unsigned int   nr_redundant_checks;
    unsigned int   nr_redundant;
    unsigned int   nr_cyclic_checks;
    unsigned int   nr_find_usage_forwards_checks;
    unsigned int   nr_find_usage_backwards_checks;

    /*
     * Per lock class locking operation stat counts
     */
    unsigned long lock_class_ops[MAX_LOCKDEP_KEYS];
};

如果开启了 CONFIG_DEBUG_LOCKDEP 内核配置选项，lockdep_stats_debug_show函数会将所有的调试信息写入 /proc/lockdep 文件中。

接下来来分析 raw_local_irq_disable ，这个宏定义在/source/include/linux/irqflags.h#L79处实现，其展开后的样子是:

/*
 * Wrap the arch provided IRQ routines to provide appropriate checks.
 */
#define raw_local_irq_disable()        arch_local_irq_disable()
#define raw_local_irq_enable()        arch_local_irq_enable()

// In /source/arch/x86/include/asm/irqflags.h#L87

static inline notrace void arch_local_irq_disable(void)
{
    native_irq_disable();
}

static inline notrace void arch_local_irq_enable(void)
{
    native_irq_enable();
}

// In /source/arch/x86/include/asm/irqflags.h#L47

static inline void native_irq_disable(void)
{
    asm volatile("cli": : :"memory");
}

static inline void native_irq_enable(void)
{
    asm volatile("sti": : :"memory");
}

cli/sti 指令将清除/设置IF标志位，这个标志位控制着处理器是否响应可屏蔽的硬件中断(异常和NMI不受IF标志位影响)。

早期版本的内核中提供了一个叫做 cli 的函数来禁用所有处理器的中断，该函数已经被移除，替代它的是 local_irq_{enable,disable} 宏，用于启用或禁用当前处理器的中断。我们在调用 local_irq_disable 宏禁用中断以后，接着设置了变量值:

early_boot_irqs_disabled = true;

变量 early_boot_irqs_disabled 定义在文件/source/include/linux/kernel.h中:

extern bool early_boot_irqs_disabled;

并在另外的地方使用。例如在/source/kernel/smp.c中的 smp_call_function_many 函数中，通过这个变量来检查当前是否由于中断禁用而处于死锁状态:

WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
                     && !oops_in_progress && !early_boot_irqs_disabled);

早期 `trap gate` 初始化

在start_kernel()的line 23调用了setup_arch()来完成很多架构相关的初始化工作。

在 setup_arch 函数中与中断相关的第一个函数是 idt_setup_early_traps函数，其对IDT进行了中断服务函数入口的填充。

`idt_setup_early_traps`函数分析

idt_setup_early_traps函数于/source/arch/x86/kernel/idt.c#L253处实现

/**
 * idt_setup_early_traps - Initialize the idt table with early traps
 *
 * On X8664 these traps do not use interrupt stacks as they can't work
 * before cpu_init() is invoked and sets up TSS. The IST variants are
 * installed after that.
 */
void __init idt_setup_early_traps(void)
{
    idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), true);
    load_idt(&idt_descr);
}

// In /source/arch/x86/kernel/idt.c#L58

/*
 * Early traps running on the DEFAULT_STACK because the other interrupt
 * stacks work only after cpu_init().
 */
static const __initconst struct idt_data early_idts[] = {
    INTG(X86_TRAP_DB,        debug),
    SYSG(X86_TRAP_BP,        int3),
#ifdef CONFIG_X86_32
    INTG(X86_TRAP_PF,        page_fault),
#endif
};

// In /source/arch/x86/kernel/idt.c#L218

static void idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
{
    gate_desc desc;

    for (; size > 0; t++, size--) {
        // 初始化 desc 的各个成员变量
        idt_init_desc(&desc, t);
        // 将 desc 填入 idt
        write_idt_entry(idt, t->vector, &desc);
        if (sys)
            set_bit(t->vector, system_vectors);
    }
}

// In /source/arch/x86/kernel/idt.c#L203

/*
 * Fill the gate descriptor @gate from the template entry @d.
 *
 * The handler address taken from d->addr is split across the
 * descriptor's offset fields; the segment and attribute bits are
 * copied from the template as they are.
 */
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
{
    unsigned long addr = (unsigned long) d->addr;

    /* Bits 0-15 of the handler address. */
    gate->offset_low    = (u16) addr;
    gate->segment        = (u16) d->segment;
    gate->bits        = d->bits;
    /* Bits 16-31 of the handler address. */
    gate->offset_middle    = (u16) (addr >> 16);
#ifdef CONFIG_X86_64
    /* Bits 32-63 of the handler address; the reserved field is cleared. */
    gate->offset_high    = (u32) (addr >> 32);
    gate->reserved        = 0;
#endif
}

在idt_setup_from_table中，首先调用了idt_init_desc初始化了一个表示 IDT 入口项的 gate_desc 类型的结构体。

然后把这个中断门通过 write_idt_entry 宏填入了 IDT 中。这个宏展开后是 native_write_idt_entry ，其将中断门信息通过索引拷贝到了 idt_table 之中

// In /source/arch/x86/include/asm/desc.h#L128

#define write_idt_entry(dt, entry, g)        native_write_idt_entry(dt, entry, g)

// In /source/arch/x86/include/asm/desc.h#L141

/*
 * Copy the descriptor @gate into slot @entry of the table @idt.
 * The whole descriptor (sizeof(*gate) bytes) is copied with memcpy().
 */
static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
{
    memcpy(&idt[entry], gate, sizeof(*gate));
}

关于 `gate_desc` 结构体

gate_desc 结构体是一个在 x86 中被称为门的 16 字节数组。它拥有下面的结构：

gate_desc在/source/arch/x86/include/asm/desc_defs.h#L88中定义

typedef struct gate_struct gate_desc;

gate_struct在/source/arch/x86/include/asm/desc_defs.h#L77中定义

struct gate_struct {
    u16        offset_low;
    u16        segment;
    struct idt_bits    bits;
    u16        offset_middle;
#ifdef CONFIG_X86_64
    u32        offset_high;
    u32        reserved;
#endif
} __attribute__((packed));

struct idt_bits {
    u16        ist    : 3,
            zero    : 5,
            type    : 5,
            dpl    : 2,
            p    : 1;
} __attribute__((packed));

为了能从中断号定位到对应的IDT条目，处理器会把异常或中断的向量号乘以 16 (即单个IDT条目的字节数)得到其在IDT中的偏移。处理器处理异常和中断的发生就像它看到 call 指令时处理一个程序调用一样。处理器使用中断或异常的唯一的识别码(即中断号)作为索引来寻找对应的 IDT 的条目。

在IDT中的 IDT 条目由下面的域组成：

0-15 bits – 中断处理程序入口地址(段内偏移)的第一部分，即低 16 位，对应 offset_low。
16-31 bits – 段选择子，用于选出中断处理程序入口所在的代码段，对应 segment。
IST – 在 x86_64 上的一个新的机制。
Type – 描述了 IDT 条目的类型。(即：中断门、任务门、陷阱门)
DPL – 描述符特权等级。
P – 段存在标志。
48-63 bits – 中断处理程序基址的第二部分。
64-95 bits – 中断处理程序基址的第三部分。
96-127 bits – CPU 保留位。

0x03 异常处理前处理

我们在之前讨论了IDT的初始化过程，现在我们来详细的看一看异常处理究竟是如何执行的。

首先我们注意到给idt_setup_from_table传入的参数有一项为early_idts数组，其中定义了DEBUG、INT3(、page_fault)两种异常(32位架构时，额外定义page_fault异常)。也就是说，在cpu_init()执行前，内核就已经能够处理这两种异常，那么我们就以这两种异常为例进行分析。

调试异常和断点异常

第一个异常 —— debug异常(助记符为#DB)，通常在调试事件发生时报告。

例如：尝试更改调试寄存器的内容。(调试寄存器是x86从英特尔80386处理器开始出现在处理器中的特殊寄存器，从此它的名称可以确定这些寄存器的主要用途是调试)这些寄存器允许在代码上设置断点，并读取或写入数据以对其进行跟踪。调试寄存器只能在特权模式下访问，以任何其他特权级别执行时尝试读取或写入调试寄存器都会导致一般保护错误异常(General_protection_fault)。因此使用set_intr_gate_ist初始化#DB异常，而不是set_system_intr_gate_ist。

#DB异常的Vector编号为1(也称为X86_TRAP_DB)，并且正如我们在规范中看到的那样，该异常没有错误代码

Vector 编号	异常助记符	异常描述	异常类型	错误代码
1	#DB	Reserved	F/T	NO

第二个异常 —— breakpoint异常(助记符为#BP)，当处理器执行int 3指令时发生异常。与#DB异常不同，该#BP异常可能发生在用户空间中。我们可以将其添加到代码中的任何位置，例如，让我们看一下简单的程序：

// breakpoint.c
#include <stdio.h>

/*
 * Print a counter and execute the `int3` instruction on every
 * iteration, which raises the breakpoint exception (#BP).
 * Outside a debugger the first `int3` terminates the process with
 * SIGTRAP; under gdb execution can be continued after each trap.
 */
int main(void) {
    /* Must start at 0: reading an uninitialised local is undefined behaviour. */
    int i = 0;

    while (i < 6) {
        printf("i equal to: %d\n", i);
        __asm__("int3");
        ++i;
    }
    return 0;
}

如果我们编译并运行该程序，我们将看到以下输出：

$ gcc breakpoint.c -o breakpoint
$ ./breakpoint
i equal to: 0
Trace/breakpoint trap

但是，如果将其与gdb一起运行，我们将看到断点并可以继续执行程序：

$ gdb breakpoint
...
...
...
(gdb) run
Starting program: /home/alex/breakpoints 
i equal to: 0

Program received signal SIGTRAP, Trace/breakpoint trap.
0x0000000000400585 in main ()
=> 0x0000000000400585 <main+31>:    83 45 fc 01    add    DWORD PTR [rbp-0x4],0x1
(gdb) c
Continuing.
i equal to: 1

Program received signal SIGTRAP, Trace/breakpoint trap.
0x0000000000400585 in main ()
=> 0x0000000000400585 <main+31>:    83 45 fc 01    add    DWORD PTR [rbp-0x4],0x1
(gdb) c
Continuing.
i equal to: 2

Program received signal SIGTRAP, Trace/breakpoint trap.
0x0000000000400585 in main ()
=> 0x0000000000400585 <main+31>:    83 45 fc 01    add    DWORD PTR [rbp-0x4],0x1
...
...
...

异常处理程序调用前准备

#DB和#BP的异常处理程序位于/source/arch/x86/include/asm/traps.h#L13

asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
asmlinkage void int3(void);
asmlinkage void overflow(void);
asmlinkage void bounds(void);
asmlinkage void invalid_op(void);
asmlinkage void device_not_available(void);

asmlinkage是gcc的特殊说明符。实际上，对于从汇编代码中调用的C函数，我们需要显式声明函数调用约定。如果函数使用asmlinkage描述符创建，gcc将从堆栈中检索参数以编译该函数。

因此，两个处理程序都在带有idtentry宏的/arch/x86/entry/entry_64.S中定义：

idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
idtentry int3 do_int3 has_error_code=0 create_gap=1

每个异常处理程序可以由两部分组成：

第一部分是通用部分，所有异常处理程序都相同。异常处理程序应将通用寄存器保存在堆栈上，如果异常来自用户空间，则应切换到内核堆栈，并将控制权转移到异常处理程序的第二部分。
异常处理程序的第二部分完成的工作取决于具体的异常。例如，页面错误异常处理程序应找到给定地址的虚拟页面，无效的操作码异常处理程序应发送SIGILL 信号等。

现在来分析idtentry宏的实现。如我们所见，该宏一共接受八个参数，其中前三个是必须给出的：

sym – 定义全局符号，该符号(.globl sym)将作为异常处理程序的入口。
do_sym – 符号名称，表示入口代码准备完毕后将调用的C函数(即辅助异常处理程序)。
has_error_code – 异常是否存在错误代码。

其余五个参数是可选的：

paranoid – 非零表示可以使用用户GSBASE和/或用户CR3从内核模式调用此中断向量。
shift_ist – 如果从内核模式进入时需要递减IST栈指针，以便使得嵌套的中断条目获得新的中断栈，则设置为对应的IST索引。 (这是针对#DB的，它有递归触发的坏习惯。)
ist_offset – 与shift_ist配合使用：调用do_sym之前从对应的IST栈指针中减去、调用返回之后再加回的偏移量。
create_gap – 从内核模式进入此中断处理程序时，创建一个6字大小的堆栈间隙。
read_cr2 – 在调用任何C代码之前读取CR2，并将其作为第3个参数传入。

.idtentry宏的定义：(实现在/source/arch/x86/entry/entry_64.S#L970)

/**
 * idtentry - Generate an IDT entry stub
 * @sym:        Name of the generated entry point
 * @do_sym:        C function to be called
 * @has_error_code:    True if this IDT vector has an error code on the stack
 * @paranoid:        non-zero means that this vector may be invoked from
 *            kernel mode with user GSBASE and/or user CR3.
 *            2 is special -- see below.
 * @shift_ist:        Set to an IST index if entries from kernel mode should
 *            decrement the IST stack so that nested entries get a
 *            fresh stack.  (This is for #DB, which has a nasty habit
 *            of recursing.)
 * @create_gap:        create a 6-word stack gap when coming from kernel mode.
 * @read_cr2:        load CR2 into the 3rd argument; done before calling any C code
 *
 * idtentry generates an IDT stub that sets up a usable kernel context,
 * creates struct pt_regs, and calls @do_sym.  The stub has the following
 * special behaviors:
 *
 * On an entry from user mode, the stub switches from the trampoline or
 * IST stack to the normal thread stack.  On an exit to user mode, the
 * normal exit-to-usermode path is invoked.
 *
 * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
 * whereas we omit the preemption check if @paranoid != 0.  This is purely
 * because the implementation is simpler this way.  The kernel only needs
 * to check for asynchronous kernel preemption when IRQ handlers return.
 *
 * If @paranoid == 0, then the stub will handle IRET faults by pretending
 * that the fault came from user mode.  It will handle gs_change faults by
 * pretending that the fault happened with kernel GSBASE.  Since this handling
 * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
 * @paranoid == 0.  This special handling will do the wrong thing for
 * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
 *
 * @paranoid == 2 is special: the stub will never switch stacks.  This is for
 * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
 */
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
SYM_CODE_START(sym)
    UNWIND_HINT_IRET_REGS offset=has_error_code*8

    /* Sanity check */
    .if shift_ist != -1 && paranoid != 1
    .error "using shift_ist requires paranoid=1"
    .endif

    .if create_gap && paranoid
    .error "using create_gap requires paranoid=0"
    .endif

    ASM_CLAC

    .if has_error_code == 0
    pushq    $-1                /* ORIG_RAX: no syscall to restart */
    .endif

    .if paranoid == 1
    testb    $3, CS-ORIG_RAX(%rsp)        /* If coming from userspace, switch stacks */
    jnz    .Lfrom_usermode_switch_stack_@
    .endif

    .if create_gap == 1
    /*
     * If coming from kernel space, create a 6-word gap to allow the
     * int3 handler to emulate a call instruction.
     */
    testb    $3, CS-ORIG_RAX(%rsp)
    jnz    .Lfrom_usermode_no_gap_@
    .rept    6
    pushq    5*8(%rsp)
    .endr
    UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_@:
    .endif

    idtentry_part do_sym, has_error_code, read_cr2, paranoid, shift_ist, ist_offset

    .if paranoid == 1
    /*
     * Entry from userspace.  Switch stacks and treat it
     * as a normal entry.  This means that paranoid handlers
     * run in real process context if user_mode(regs).
     */
.Lfrom_usermode_switch_stack_@:
    idtentry_part do_sym, has_error_code, read_cr2, paranoid=0
    .endif

_ASM_NOKPROBE(sym)
SYM_CODE_END(sym)
.endm

在分析idtentry宏的内部实现之前，首先明确，这是发生异常时的堆栈状态：

    +------------+
+40 | %SS        |
+32 | %RSP       |
+24 | %RFLAGS    |
+16 | %CS        |
 +8 | %RIP       |
  0 | ERROR CODE | <-- %RSP
    +------------+

然后结合#DB和#BP的异常处理程序定义来看idtentry宏的内部实现：

idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
idtentry int3 do_int3 has_error_code=0 create_gap=1

汇编器将生成带有debug和int3名称的两个例程，并且经过一些准备后，这两个异常处理程序将分别调用do_debug和do_int3辅助处理程序。第三个参数定义了错误代码是否存在，此处的两个异常都没有错误代码。如上面的堆栈结构所示，如果异常带有错误代码，处理器会将错误代码压入堆栈。那么我们可以很直观的看出，对于提供错误代码的异常和未提供错误代码的异常，堆栈的外观会有所不同。这就是为什么idtentry宏的实现中，在异常未提供错误代码的情况下将会把“伪造”的错误代码放入堆栈：
```
.if has_error_code == 0
pushq    $-1                /* ORIG_RAX: no syscall to restart */
.endif
```
但这不仅仅是一个”伪造”的错误代码，-1还会代表无效的系统调用号，因此这不会触发系统调用的重新启动逻辑。
接下来的第一个可选参数 – shift_ist参数将表征异常处理程序是否使用了IST栈。系统中的每个内核线程都有自己的堆栈。除了这些堆栈外，还有一些专用堆栈与系统中的每个处理器相关联，异常栈就是这类专用堆栈之一。x86_64架构提供了一个新机制，它被称为Interrupt Stack Table(IST机制)。此机制允许在发生指定事件时(例如double fault之类的原子异常等)切换到新堆栈。shift_ist参数就用来标识是否需要使用IST机制为异常处理程序创建一个新的堆栈。
第二个可选参数 – paranoid定义了一种方法，可以帮助我们知道服务程序的调用是来自用户空间还是来自内核空间。确定这一点的最简单方法是检查保存在栈上的CS段选择子中的特权级(CPL，Current Privilege Level)。如果等于3，则来自用户空间，如果为零，则来自内核空间。
```
   .if paranoid == 1
     testb    $3, CS-ORIG_RAX(%rsp)        /* If coming from userspace, switch stacks */
     jnz    .Lfrom_usermode_switch_stack_@
   .endif
```
但是不幸的是，这种方法不能提供100％的保证。如内核文档中所述：

如果我们处于 NMI/MCE/DEBUG 以及其他任何 super-atomic 入口上下文中，那么在正常入口将CS写入堆栈之后，执行SWAPGS之前可能已经触发异常，那么检查GS的唯一安全方法是一种速度较慢的方法：RDMSR。

换言之，例如NMI(不可屏蔽中断)发生在swapgs指令的内部。这样的话，我们应该检查MSR_GS_BASE的值，该寄存器存储指向每个cpu区域开始的指针。因此，要检查我们是否来自用户空间，我们应该检查MSR_GS_BASE，如果它是负数，则我们来自内核空间，否则我们来自用户空间：
```
  movl $MSR_GS_BASE,%ecx
  rdmsr
  testl %edx,%edx
  js 1f
```
在前两行代码中，我们将MSR_GS_BASE的值读取到edx:eax寄存器对中。用户空间无法为gs基址设置负值；而另一方面，我们知道物理内存的直接映射是从虚拟地址0xffff880000000000开始的，因此内核态下MSR_GS_BASE将包含从0xffff880000000000到0xffffc7ffffffffff之间的地址。这样，rdmsr指令执行后，%edx寄存器中可能的最小值将会是0xffff8800，按有符号32位整数解释即-30720。这就是gs指向per-cpu区域起始处时(即处于内核空间时)该值为负的原因。
在为通用寄存器分配空间之后，我们进行一些检查以了解异常是否来自用户空间，如果是，则应移回中断的进程堆栈或保留在异常堆栈上：

.if paranoid
    .if paranoid == 1
        testb    $3, CS(%rsp)
        jnz    1f
    .endif
    call    paranoid_entry
.else
    call    error_entry
.endif

让我们考虑一下所有这些情况。

当用户空间中发生异常时

可以看到，当用户空间中发生异常时，内核会执行如下处理逻辑：

.if paranoid == 1
    testb    $3, CS-ORIG_RAX(%rsp)        /* If coming from userspace, switch stacks */
    jnz    .Lfrom_usermode_switch_stack_@
.endif
.if paranoid == 1
    /*
     * Entry from userspace.  Switch stacks and treat it
     * as a normal entry.  This means that paranoid handlers
     * run in real process context if user_mode(regs).
     */
.Lfrom_usermode_switch_stack_@:
    idtentry_part do_sym, has_error_code, read_cr2, paranoid=0
.endif

也就是核心是执行idtentry_part do_sym, has_error_code, read_cr2, paranoid=0

那么关于idtentry_part在/source/arch/x86/entry/entry_64.S#L868处实现

/*
 * Exception entry points.
 */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)

.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0

    .if paranoid
        call    paranoid_entry
    /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
    .else
        call    error_entry
    .endif
    UNWIND_HINT_REGS

    .if read_cr2
    /*
     * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
     * intermediate storage as RDX can be clobbered in enter_from_user_mode().
     * GET_CR2_INTO can clobber RAX.
     */
    GET_CR2_INTO(%r12);
    .endif

    .if shift_ist != -1
    TRACE_IRQS_OFF_DEBUG            /* reload IDT in case of recursion */
    .else
    TRACE_IRQS_OFF
    .endif

    .if paranoid == 0
    testb    $3, CS(%rsp)
    jz    .Lfrom_kernel_no_context_tracking_@
    CALL_enter_from_user_mode
.Lfrom_kernel_no_context_tracking_@:
    .endif

    movq    %rsp, %rdi            /* pt_regs pointer */

    .if has_error_code
    movq    ORIG_RAX(%rsp), %rsi        /* get error code */
    movq    $-1, ORIG_RAX(%rsp)        /* no syscall to restart */
    .else
    xorl    %esi, %esi            /* no error code */
    .endif

    .if shift_ist != -1
    subq    $ist_offset, CPU_TSS_IST(shift_ist)
    .endif

    .if read_cr2
    movq    %r12, %rdx            /* Move CR2 into 3rd argument */
    .endif

    call    do_sym

    .if shift_ist != -1
    addq    $ist_offset, CPU_TSS_IST(shift_ist)
    .endif

    .if paranoid
    /* this procedure expect "no swapgs" flag in ebx */
    jmp    paranoid_exit
    .else
    jmp    error_exit
    .endif

.endm

error_entry处理分析

假设我们此时进入了error_entry的处理逻辑，它在/source/arch/x86/entry/entry_64.S#L1287处实现：

/*
 * Save all registers in pt_regs, and switch GS if needed.
 */
SYM_CODE_START_LOCAL(error_entry)
    UNWIND_HINT_FUNC
    cld
    PUSH_AND_CLEAR_REGS save_ret=1
    ENCODE_FRAME_POINTER 8
    testb    $3, CS+8(%rsp)
    jz    .Lerror_kernelspace

    /*
     * We entered from user mode or we're pretending to have entered
     * from user mode due to an IRET fault.
     */
    SWAPGS
    FENCE_SWAPGS_USER_ENTRY
    /* We have user CR3.  Change to kernel CR3. */
    SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

.Lerror_entry_from_usermode_after_swapgs:
    /* Put us onto the real thread stack. */
    popq    %r12                /* save return addr in %12 */
    movq    %rsp, %rdi            /* arg0 = pt_regs pointer */
    call    sync_regs
    movq    %rax, %rsp            /* switch stack */
    ENCODE_FRAME_POINTER
    pushq    %r12
    ret

.Lerror_entry_done_lfence:
    FENCE_SWAPGS_KERNEL_ENTRY
.Lerror_entry_done:
    ret

    /*
     * There are two places in the kernel that can potentially fault with
     * usergs. Handle them here.  B stepping K8s sometimes report a
     * truncated RIP for IRET exceptions returning to compat mode. Check
     * for these here too.
     */
.Lerror_kernelspace:
    leaq    native_irq_return_iret(%rip), %rcx
    cmpq    %rcx, RIP+8(%rsp)
    je    .Lerror_bad_iret
    movl    %ecx, %eax            /* zero extend */
    cmpq    %rax, RIP+8(%rsp)
    je    .Lbstep_iret
    cmpq    $.Lgs_change, RIP+8(%rsp)
    jne    .Lerror_entry_done_lfence

    /*
     * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
     * gsbase and proceed.  We'll fix up the exception and land in
     * .Lgs_change's error handler with kernel gsbase.
     */
    SWAPGS
    FENCE_SWAPGS_USER_ENTRY
    SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
    jmp .Lerror_entry_done

.Lbstep_iret:
    /* Fix truncated RIP */
    movq    %rcx, RIP+8(%rsp)
    /* fall through */

.Lerror_bad_iret:
    /*
     * We came from an IRET to user mode, so we have user
     * gsbase and CR3.  Switch to kernel gsbase and CR3:
     */
    SWAPGS
    FENCE_SWAPGS_USER_ENTRY
    SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

    /*
     * Pretend that the exception came from user mode: set up pt_regs
     * as if we faulted immediately after IRET.
     */
    mov    %rsp, %rdi
    call    fixup_bad_iret
    mov    %rax, %rsp
    jmp    .Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)

保存现场(储存所有通用寄存器)

首先内核会调用PUSH_AND_CLEAR_REGS save_ret=1将通用寄存器的值存储在中断栈上。由于error_entry是通过call指令进入的，此时栈顶是返回地址，宏会先把它暂存到%rsi中，待所有寄存器入栈之后再把它重新压回栈顶：

.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
    /*
     * Push registers and sanitize registers of values that a
     * speculation attack might otherwise want to exploit. The
     * lower registers are likely clobbered well before they
     * could be put to use in a speculative execution gadget.
     * Interleave XOR with PUSH for better uop scheduling:
     */
    .if save_ret
    pushq    %rsi        /* pt_regs->si */
    movq    8(%rsp), %rsi    /* temporarily store the return address in %rsi */
    movq    %rdi, 8(%rsp)    /* pt_regs->di (overwriting original return address) */
    .else
    pushq   %rdi        /* pt_regs->di */
    pushq   %rsi        /* pt_regs->si */
    .endif
    pushq    rdx        /* pt_regs->dx */
    xorl    %edx, %edx    /* nospec   dx */
    pushq   %rcx        /* pt_regs->cx */
    xorl    %ecx, %ecx    /* nospec   cx */
    pushq   rax        /* pt_regs->ax */
    pushq   %r8        /* pt_regs->r8 */
    xorl    %r8d, %r8d    /* nospec   r8 */
    pushq   %r9        /* pt_regs->r9 */
    xorl    %r9d, %r9d    /* nospec   r9 */
    pushq   %r10        /* pt_regs->r10 */
    xorl    %r10d, %r10d    /* nospec   r10 */
    pushq   %r11        /* pt_regs->r11 */
    xorl    %r11d, %r11d    /* nospec   r11*/
    pushq    %rbx        /* pt_regs->rbx */
    xorl    %ebx, %ebx    /* nospec   rbx*/
    pushq    %rbp        /* pt_regs->rbp */
    xorl    %ebp, %ebp    /* nospec   rbp*/
    pushq    %r12        /* pt_regs->r12 */
    xorl    %r12d, %r12d    /* nospec   r12*/
    pushq    %r13        /* pt_regs->r13 */
    xorl    %r13d, %r13d    /* nospec   r13*/
    pushq    %r14        /* pt_regs->r14 */
    xorl    %r14d, %r14d    /* nospec   r14*/
    pushq    %r15        /* pt_regs->r15 */
    xorl    %r15d, %r15d    /* nospec   r15*/
    UNWIND_HINT_REGS
    .if save_ret
    pushq    %rsi        /* return address on top of stack */
    .endif
.endm

执行后(不计save_ret=1时重新压回栈顶的返回地址)，堆栈将如下所示：

     +------------+
+160 | %SS        |
+152 | %RSP       |
+144 | %RFLAGS    |
+136 | %CS        |
+128 | %RIP       |
+120 | ERROR CODE |
     |------------|
+112 | %RDI       |
+104 | %RSI       |
 +96 | %RDX       |
 +88 | %RCX       |
 +80 | %RAX       |
 +72 | %R8        |
 +64 | %R9        |
 +56 | %R10       |
 +48 | %R11       |
 +40 | %RBX       |
 +32 | %RBP       |
 +24 | %R12       |
 +16 | %R13       |
  +8 | %R14       |
  +0 | %R15       | <- %RSP
     +------------+

再次检查`CPL`

内核将通用寄存器保存在堆栈中之后，正如官方文档中描述的那样，如果报告的%RIP被截断，则有可能出现问题，因此我们应该使用以下指令再次检查异常是否来自用户空间(此时栈顶还有error_entry的返回地址，所以偏移需要+8)：

testb  $3, CS+8(%rsp)
jz  .Lerror_kernelspace

初始化`GS`寄存器

接下来将执行SWAPGS指令，这将会交换MSR_KERNEL_GS_BASE和MSR_GS_BASE中的值。从这一刻起，%gs寄存器将指向内核结构的基址。

获取运行栈的栈指针(`sync_regs`函数分析)

接下来将会进入.Lerror_entry_from_usermode_after_swapgs:中：

movq    %rsp, %rdi
call    sync_regs

在这里，我们将当前栈指针(指向刚刚保存好的pt_regs)置入%rdi寄存器，这将作为sync_regs函数的参数。

接下来我们来分析sync_regs函数：(在/source/arch/x86/kernel/traps.c#L613中实现)

/*
 * Help handler running on a per-cpu (IST or entry trampoline) stack
 * to switch to the normal thread stack if the interrupted code was in
 * user mode. The actual stack switch is done in entry_64.S
 */
/*
 * Copy the saved register frame @eregs to the pt_regs slot located
 * directly below the address read from the per-cpu variable
 * cpu_current_top_of_stack, and return a pointer to that slot.
 * The copy is skipped when @eregs already is that slot.
 */
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
    /* The cast applies before the subtraction: step back one struct pt_regs. */
    struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
    if (regs != eregs)
        *regs = *eregs;
    return regs;
}
NOKPROBE_SYMBOL(sync_regs);

// In /source/include/linux/percpu-defs.h#L507

#define this_cpu_read(pcp)        __pcpu_size_call_return(this_cpu_read_, pcp)

这将会根据cpu_current_top_of_stack算出当前线程内核栈栈顶处pt_regs的位置，若它与传入的pt_regs不是同一处，就把保存的寄存器整体拷贝过去，并返回这个新位置的指针，这意味着异常处理程序将在实际进程上下文中运行。

栈切换

接下来我们进行栈切换操作

由于异常来自用户空间，异常处理程序将在实际进程上下文中运行。sync_regs通过%rax返回新栈上的pt_regs指针，我们据此切换堆栈：

movq    %rax, %rsp

然后内核从R12中取出返回地址，返回上级函数

可选参数逻辑分析

在用户空间发生异常的处理逻辑下，接下来只需要处理以下三个选项has_error_code, read_cr2, paranoid=0

    .if read_cr2
    /*
     * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
     * intermediate storage as RDX can be clobbered in enter_from_user_mode().
     * GET_CR2_INTO can clobber RAX.
     */
    GET_CR2_INTO(%r12);
    .endif

    .if shift_ist != -1
        ......(代码省略)
    .endif

    .if paranoid == 0
    testb    $3, CS(%rsp)
    jz    .Lfrom_kernel_no_context_tracking_@
    CALL_enter_from_user_mode
.Lfrom_kernel_no_context_tracking_@:
    .endif

    movq    %rsp, %rdi            /* pt_regs pointer */

    .if has_error_code
    movq    ORIG_RAX(%rsp), %rsi        /* get error code */
    movq    $-1, ORIG_RAX(%rsp)        /* no syscall to restart */
    .else
    xorl    %esi, %esi            /* no error code */
    .endif

    .if shift_ist != -1
        ......(代码省略)
    .endif

    .if read_cr2
    movq    %r12, %rdx            /* Move CR2 into 3rd argument */
    .endif

    call    do_sym

    .if shift_ist != -1
    addq    $ist_offset, CPU_TSS_IST(shift_ist)
    .endif

    .if paranoid
    /* this procedure expect "no swapgs" flag in ebx */
    jmp    paranoid_exit
    .else
    jmp    error_exit
    .endif

若`read_cr2`被设置

read_cr2相关的逻辑有两处，第一处是

.if read_cr2
    /*
     * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
     * intermediate storage as RDX can be clobbered in enter_from_user_mode().
     * GET_CR2_INTO can clobber RAX.
     */
    GET_CR2_INTO(%r12);
.endif

# In /source/arch/x86/entry/calling.h#L365

#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg

作用是存储CR2寄存器的值到R12寄存器。

第二处逻辑是

.if read_cr2
    movq    %r12, %rdx            /* Move CR2 into 3rd argument */
.endif

作用是存储R12寄存器的值到RDX寄存器，也就是把CR2寄存器的值存储到RDX寄存器作为接下来调用函数的第三个参数。

若`has_error_code`被设置

.if has_error_code
    movq    ORIG_RAX(%rsp), %rsi
    movq    $-1, ORIG_RAX(%rsp)
.else
    xorl    %esi, %esi
.endif

作用是将错误代码传递给RSI寄存器，作为异常处理程序的第二个参数，随后将栈上的ORIG_RAX设置为-1以防止重新启动系统调用；另外，如果异常不提供错误代码，则会清空ESI寄存器。

收尾逻辑分析

最后一定会执行的逻辑是：

.if paranoid == 0
    testb    $3, CS(%rsp)
    jz    .Lfrom_kernel_no_context_tracking_@
    CALL_enter_from_user_mode
.Lfrom_kernel_no_context_tracking_@:
.endif

movq    %rsp, %rdi            /* pt_regs pointer */

首先再次检查CPL以确认异常是否来自用户空间，然后将pt_regs(存储了保存的“现场”)的指针赋值给RDI，这将作为中断服务程序的第一个参数，最后调用辅助异常处理程序

call  do_sym

若是debug异常，则调用：

dotraplinkage void do_debug(struct pt_regs *regs, long error_code);

若是int3异常，则调用：

dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code);

当内核空间中发生异常时

当内核空间中发生异常且paranoid > 0时，内核将进入paranoid_entry进行处理

`paranoid_entry`处理分析

paranoid_entry的处理逻辑在/source/arch/x86/entry/entry_64.S#L1218处实现：

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Use slow, but surefire "are we in kernel?" check.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
SYM_CODE_START_LOCAL(paranoid_entry)
    UNWIND_HINT_FUNC
    cld
    /* Save the register set into pt_regs (see the header comment). */
    PUSH_AND_CLEAR_REGS save_ret=1
    ENCODE_FRAME_POINTER 8
    /* Start with ebx=1, i.e. "no swapgs needed on exit". */
    movl    $1, %ebx
    /*
     * The slow check: rdmsr reads the MSR selected by ECX into EDX:EAX,
     * so EDX receives the upper 32 bits of the current GS base.
     */
    movl    $MSR_GS_BASE, %ecx
    rdmsr
    /* The sign of the upper half decides whether GS must be switched. */
    testl    %edx, %edx
    js    1f                /* negative -> in kernel */
    /* Not a kernel GS base: switch GS and report ebx=0 to the caller. */
    SWAPGS
    xorl    %ebx, %ebx

1:
    /*
     * Always stash CR3 in %r14.  This value will be restored,
     * verbatim, at exit.  Needed if paranoid_entry interrupted
     * another entry that already switched to the user CR3 value
     * but has not yet returned to userspace.
     *
     * This is also why CS (stashed in the "iret frame" by the
     * hardware at entry) can not be used: this may be a return
     * to kernel code, but with a user CR3 value.
     */
    SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

    /*
     * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
     * unconditional CR3 write, even in the PTI case.  So do an lfence
     * to prevent GS speculation, regardless of whether PTI is enabled.
     */
    FENCE_SWAPGS_KERNEL_ENTRY

    /* Return to the caller with the swapgs flag in ebx. */
    ret
SYM_CODE_END(paranoid_entry)

正如之前所说明的那样，这个入口将会以较慢的方式来获取有关被中断任务的先前状态以检查异常是否真的来自内核空间，可以看到我们首先执行的操作和error_entry逻辑相同，首先保存现场，然后使用较慢的方式检查异常的来源，随即返回到上级函数。

0x04 一个简单内核模块的编写

事实上，本篇文章的内容到0x03就已经结束了，我们将在下一篇文章介绍具体的中断服务函数的实现。

但是在这里我想添加一点内容，就是如何去编译一个简易的内核模块并运行。

编译Linux Kernel

这个部分已经在Kernel Pwn 学习之路(一)给予了说明故此处不再赘述

这里需要注意一点，因为我们想要在使用QEMU启动时使其支持9p协议，因此我们需要修改.config文件，需要将文件里的

CONFIG_NET_9P=m
CONFIG_NET_9P_VIRTIO=m
CONFIG_NET_9P_XEN=m
CONFIG_NET_9P_RDMA=m
# CONFIG_NET_9P_DEBUG is not set
......
CONFIG_9P_FS=m
CONFIG_9P_FSCACHE=y
CONFIG_9P_FS_POSIX_ACL=y
CONFIG_9P_FS_SECURITY=y

替换为

CONFIG_NET_9P=y
CONFIG_NET_9P_VIRTIO=y
CONFIG_NET_9P_XEN=m
CONFIG_NET_9P_RDMA=m
CONFIG_NET_9P_DEBUG=y (Optional)
......
CONFIG_9P_FS=y
CONFIG_9P_FSCACHE=y
CONFIG_9P_FS_POSIX_ACL=y
CONFIG_9P_FS_SECURITY=y

⚠：如果执行make编译后无法在/arch/x86/boot中找到bzImage，请尝试执行make -jx bzImage(x是你期望使用的核数)直至看到类似以下的提示：

Kernel: arch/x86/boot/bzImage is ready  (#1)

构建文件系统

首先找一个已经构建好的文件系统解包(可以直接利用Busybox生成)，重点是bin、sbin、usr这三个文件夹以及根目录下的linuxrc文件，其他文件夹均可暂时置空，然后在/etc下建立passwd文件以建立用户，内容如下：

root:x:0:0:root:/root:/bin/sh
error404:x:1000:1000:error404:/home/error404:/bin/sh

然后可以继续建立profile文件，内容如下：

# Shell profile: aliases, start directory and a coloured prompt.

# Convenience aliases for directory listings.
alias ll='ls -al '
alias l='ls '

# Prompt colour: 31 for uid 0, 34 for every other user.
# POSIX test uses a single '=' and the substitution is quoted.
if [ "$(id -u)" = 0 ]; then
    COLOR="31"
else
    COLOR="34"
    # Start in the home directory given to error404 in /etc/passwd.
    cd /home/error404
fi

# The backslashes are required: \e and \033 start the colour escape
# sequences, \[ and \] bracket non-printing characters, and \w is the
# working directory.
export PS1="\e[01;${COLOR}m $(whoami)@my-kernel \[\033[00m\]:\[\033[36m\]\w\[\033[00m\]\$ "

最后在根目录下建立最重要的init文件：

#!/bin/sh
# Init script of the root file system: mounts the pseudo file systems,
# loads the test module, starts an unprivileged shell and powers the
# machine off once that shell exits.

# Mount devtmpfs, procfs and sysfs at their usual locations.
mount -t devtmpfs none /dev
mount -t proc proc /proc
mount -t sysfs sysfs /sys

#
# module
#
# Load every .ko found under /lib/modules/<any version>/error404/.
insmod /lib/modules/*/error404/*.ko
# Open /dev/Test to all users.
# NOTE(review): assumes the loaded module creates /dev/Test; confirm,
# otherwise this chmod just fails.
chmod 666 /dev/Test
# mmap_min_addr to 0 for the challenge to be simpler for now ;)
echo 0 > /proc/sys/vm/mmap_min_addr

#
# shell
#
echo "Hello!"
# Presumably makes the interactive sh read /etc/profile on start-up;
# confirm for the shell in use.
export ENV=/etc/profile
# Run sh as uid/gid 1000 in a new session; cttyhack is presumably the
# BusyBox helper that attaches a controlling tty.
setsid cttyhack setuidgid 1000 sh

# Reached once the shell above exits: unmount and power off.
umount /proc
umount /sys
umount /dev

poweroff -f

内核模块代码

这是一个相当简单的内核模块代码

#include <linux/init.h>
#include <linux/module.h>
#include <linux/cred.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>

MODULE_LICENSE("Dual BSD/GPL");

/*
 * Module load handler: logs a greeting followed by the sizes of
 * struct cred, struct tty_struct and struct tty_operations.
 *
 * Returns 0 to report a successful load.
 */
static int hello_init(void)
{
    /* Every message ends with '\n' so it is emitted as a complete line. */
    printk(KERN_ALERT "[ERROR404]My First Module！\n");
    /* sizeof yields a size_t, which is printed with %zu, not %d. */
    printk(KERN_ALERT "[ERROR404]sizeof cred   : %zu\n", sizeof(struct cred));
    printk(KERN_ALERT "[ERROR404]sizeof tty    : %zu\n", sizeof(struct tty_struct));
    printk(KERN_ALERT "[ERROR404]sizeof tty_op : %zu\n", sizeof(struct tty_operations));
    return 0;
}

/*
 * Module unload handler: logs a farewell message.
 */
static void hello_exit(void)
{
    /* Terminate with '\n' so the line is not left open for continuation. */
    printk(KERN_ALERT "[ERROR404]Bye!\n");
}

module_init(hello_init);
module_exit(hello_exit);

我们首先需要在代码的同目录下写一个makefile，内容如下：

# Out-of-tree build of the Test module against a kernel source tree.
obj-m := Test.o
# Kernel source tree to build against.
KERNELBUILD := /home/error404/Desktop/Mac_desktop/Linux-Kernel/SourceCode/linux-5.5.6
# Directory passed to kbuild as M=, i.e. where the module source lives.
CURDIR := /home/error404/Desktop/Mac_desktop/Linux-Kernel/build/Test

# Neither target produces a file of the same name.
.PHONY: modules clean

# Recipe lines must begin with a TAB character, not spaces.
# $(MAKE) is used instead of a literal "make" so that command-line
# flags such as -j are passed on to the kernel build.
modules:
	$(MAKE) -C $(KERNELBUILD) M=$(CURDIR) modules
clean:
	$(MAKE) -C $(KERNELBUILD) M=$(CURDIR) clean

执行make进行编译，将编译出的文件放在/lib/modules/5.5.6/error404/下即可

启动QEMU

建立Start_Kernel.sh文件，内容如下：

#!/bin/sh
# Boot ./bzImage in QEMU with ./rootfs.cpio as the initrd, using the
# serial console and no graphical window.

# Each trailing backslash continues the command onto the next line;
# without them every option would be run as a separate command.
qemu-system-x86_64 \
        -kernel ./bzImage \
        -initrd ./rootfs.cpio \
        -append 'console=ttyS0 loglevel=0 pti=off oops=panic panic=1 nokaslr' \
        -nographic

直接运行Start_Kernel.sh即可，紧接着运行dmesg即可看到结果