当前位置：首页 > news >正文

Linux Perf Swevent软件事件计数与Hrtimer触发

news 2026/6/14 8:09:04

Linux perf_swevent 软件事件计数与 hrtimer 触发

一、软件事件的分类与注册

perf 软件事件（swevent）是指不由硬件 PMU 计数，而由内核代码路径直接触发的事件。注册方式为 PERF_TYPE_SOFTWARE 类型，支持的预定义事件：

/* Software event IDs, selected through attr.config of a PERF_TYPE_SOFTWARE event. */
enum perf_sw_ids {
    PERF_COUNT_SW_CPU_CLOCK         = 0,    /* CPU clock (high-resolution timer) */
    PERF_COUNT_SW_TASK_CLOCK        = 1,    /* task run-time clock */
    PERF_COUNT_SW_PAGE_FAULTS       = 2,    /* page faults */
    PERF_COUNT_SW_CONTEXT_SWITCHES  = 3,    /* context switches */
    PERF_COUNT_SW_CPU_MIGRATIONS    = 4,    /* task migrations between CPUs */
    PERF_COUNT_SW_PAGE_FAULTS_MIN   = 5,    /* minor page faults */
    PERF_COUNT_SW_PAGE_FAULTS_MAJ   = 6,    /* major page faults */
    PERF_COUNT_SW_ALIGNMENT_FAULTS  = 7,    /* alignment faults */
    PERF_COUNT_SW_EMULATION_FAULTS  = 8,    /* instruction emulation */
    PERF_COUNT_SW_DUMMY             = 9,    /* dummy event */
    PERF_COUNT_SW_BPF_OUTPUT        = 10,   /* BPF output */
    PERF_COUNT_SW_CGROUP_SWITCHES   = 11,   /* cgroup switches */
    PERF_COUNT_SW_MAX,                      /* number of IDs; used as an upper bound */
};

二、perf_swevent_init 初始化

软件事件的初始化函数为 perf_swevent_init，事件链表则由 swevent_hlist_get 和 swevent_hlist_put 管理：

/*
 * perf_swevent_init - validate and set up a PERF_TYPE_SOFTWARE event.
 * @event: event being created; attr.config carries the software event ID.
 *
 * Returns 0 on success, or -ENOENT when attr.config does not name a known
 * software event.
 */
static int perf_swevent_init(struct perf_event *event)
{
    int event_id;

    /*
     * Range-check attr.config before narrowing it to int. It is a 64-bit
     * field in the perf ABI, so converting first could truncate a large
     * value or turn it negative and let it slip past the bound.
     */
    if (event->attr.config >= PERF_COUNT_SW_MAX)
        return -ENOENT;
    event_id = (int)event->attr.config;

    /* Bind the event to the software PMU callbacks. */
    event->pmu = &perf_swevent;

    if (event_id == PERF_COUNT_SW_DUMMY) {
        /* Placeholder: special handling for this ID goes here. */
    }

    /* Sampling events are driven periodically by an hrtimer. */
    if (is_sampling_event(event)) {
        /*
         * hw.timer is embedded in the event (start, stop and the timer
         * callback all take its address), so initialise it in place
         * rather than assigning an allocated timer to it.
         */
        hrtimer_init(&event->hw.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        event->hw.timer.function = perf_swevent_hrtimer;
    }

    return 0;
}

三、hrtimer 驱动的周期性采样

对于 CPU_CLOCK 和 TASK_CLOCK 等需要周期性计数的 swevent，perf_swevent 使用 hrtimer 定时触发。事件启用时启动定时器：

/*
 * perf_swevent_start - activate a software event.
 * @event: event to start.
 * @flags: not used by this implementation.
 *
 * For a sampling event, copies attr.sample_period into the hw state and, when
 * it is non-zero, arms the event's hrtimer to expire one period from now.
 * The event is marked active in every case.
 */
static void perf_swevent_start(struct perf_event *event, int flags)
{
    struct hw_perf_event *hw = &event->hw;

    if (is_sampling_event(event)) {
        hw->sample_period = event->attr.sample_period;

        /* Relative expiry: one sample period, interpreted as nanoseconds. */
        if (hw->sample_period) {
            u64 interval_ns = hw->sample_period;

            hrtimer_start(&hw->timer, ns_to_ktime(interval_ns),
                          HRTIMER_MODE_REL_PINNED);
        }
    }

    event->state = PERF_EVENT_STATE_ACTIVE;
}

/*
 * perf_swevent_stop - deactivate a software event.
 * @event: event to stop.
 * @flags: not used by this implementation.
 *
 * Cancels the sampling hrtimer, if the event has one, and marks the event
 * inactive.
 */
static void perf_swevent_stop(struct perf_event *event, int flags)
{
    /*
     * Only sampling events get their timer set up (perf_swevent_init) and
     * armed (perf_swevent_start). Cancelling a timer that was never
     * initialised is not safe, so apply the same guard here.
     */
    if (is_sampling_event(event))
        hrtimer_cancel(&event->hw.timer);

    event->state = PERF_EVENT_STATE_INACTIVE;
}

定时器回调函数 perf_swevent_hrtimer：

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
struct hw_perf_event *hwc;
struct perf_event *event;
struct perf_sample_data data;
struct pt_regs *regs;

/* 从 hrtimer 反推 event */
hwc = container_of(hrtimer, struct hw_perf_event, timer);
event = container_of(hwc, struct perf_event, hw);

/* 如果 event 已关闭则不再重启 */
if (event->state != PERF_EVENT_STATE_ACTIVE)
return HRTIMER_NORESTART;

/* 递增计数器 */
local64_add(event->attr.sample_period, &event->count);

/* 构造采样数据 */
perf_sample_data_init(&data, 0, event->hw.last_period);
data.period = event->attr.sample_period;
data.time = perf_event_time(event);

/* 获取 pt_regs，如果可能 */
regs = get_irq_regs();

/* 调用溢出处理（写入 ring buffer） */
if (perf_event_overflow(event, &data, regs))
/* 如果 throttle 了，不再重启定时器 */
return HRTIMER_NORESTART;

/* 重置定时器 */
hrtimer_forward_now(hrtimer,
ns_to_ktime(event->attr.sample_period));

return HRTIMER_RESTART;
}

四、perf_swevent_event 内核调用点

内核代码通过 perf_sw_event 函数在关键路径埋点（无监听者时仅有一次 static key 判断的开销）：

/*
 * perf_sw_event - report an occurrence of a software event.
 * @event_id: software event ID; indexes perf_swevent_enabled[].
 * @nr:       count passed on to perf_swevent_event().
 * @regs:     register state passed on to perf_swevent_event().
 * @addr:     address stored in the sample data.
 *
 * Does nothing unless the static key for @event_id is enabled.
 */
void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
    /* Fast path: only build sample data when someone is listening. */
    if (static_key_false(&perf_swevent_enabled[event_id])) {
        struct perf_sample_data sample;

        perf_sample_data_init(&sample, addr, 0);
        sample.raw = NULL;

        perf_swevent_event(event_id, nr, &sample, regs);
    }
}

perf_swevent_event 的查找与分发逻辑：

static void perf_swevent_event(struct perf_event *swevent, u64 nr,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hlist_head *head;
struct perf_event *event;

/* 通过 percpu hlist 查找所有注册的 event */
head = this_cpu_ptr(&swevent_htable.recursion[event_id].head);

rcu_read_lock();
hlist_for_each_entry_rcu(event, head, hlist_entry) {
/* 检查 event 是否被过滤 */
if (perf_swevent_match(event, data, regs))
/* 写入 perf ring buffer */
perf_swevent_add(event, nr, data, regs);
}
rcu_read_unlock();
}

五、perf_swevent_add 与 ring buffer 写入

perf_swevent_add 负责计数递增和采样数据输出：

/*
 * perf_swevent_add - account @nr occurrences to @event and, for sampling
 * events, emit a sample record.
 * @event: event being updated.
 * @nr:    number of occurrences to add to the counter.
 * @data:  sample data for the record.
 * @regs:  register state for the record.
 *
 * Returns 0 on success, or the non-zero value returned by
 * perf_output_begin() when the record could not be started.
 */
static int perf_swevent_add(struct perf_event *event, u64 nr,
                            struct perf_sample_data *data,
                            struct pt_regs *regs)
{
    /*
     * Always count. PERF_SAMPLE_READ only selects whether counter values
     * are included in a sample; gating the increment on it would leave
     * such events stuck at zero. The sample period is not an occurrence
     * count, so it is not added here.
     */
    local64_add(nr, &event->count);

    if (event->attr.sample_period) {
        struct perf_output_handle handle;
        int ret;

        /*
         * NOTE(review): the upstream perf_output_begin() takes a record
         * size as its last argument; confirm this simplified call
         * against the target kernel.
         */
        ret = perf_output_begin(&handle, data, event, regs);
        if (ret)
            return ret;

        perf_output_sample(&handle, event, data, regs);
        perf_output_end(&handle);
    }

    return 0;
}

perf_output_sample 按固定次序写入标准化字段，某个字段是否写入取决于 sample_type 中对应位的设置：

/*
 * perf_output_sample - write the fields of one sample record.
 * @handle: output handle the record is written through.
 * @event:  event whose attr.sample_type selects the fields.
 * @data:   source of the field values.
 * @regs:   not used in the part shown here.
 *
 * Fields are emitted in the fixed order below; each one is present only
 * when its PERF_SAMPLE_* bit is set in attr.sample_type.
 */
void perf_output_sample(struct perf_output_handle *handle,
                        struct perf_event *event,
                        struct perf_sample_data *data,
                        struct pt_regs *regs)
{
    if (event->attr.sample_type & PERF_SAMPLE_IP) {
        perf_output_put(handle, data->ip);
    }
    if (event->attr.sample_type & PERF_SAMPLE_TID) {
        perf_output_put(handle, data->tid_entry);
    }
    if (event->attr.sample_type & PERF_SAMPLE_TIME) {
        perf_output_put(handle, data->time);
    }
    if (event->attr.sample_type & PERF_SAMPLE_ADDR) {
        perf_output_put(handle, data->addr);
    }
    if (event->attr.sample_type & PERF_SAMPLE_ID) {
        perf_output_put(handle, data->id);
    }
    if (event->attr.sample_type & PERF_SAMPLE_STREAM_ID) {
        perf_output_put(handle, data->stream_id);
    }
    if (event->attr.sample_type & PERF_SAMPLE_CPU) {
        perf_output_put(handle, data->cpu_entry);
    }
    if (event->attr.sample_type & PERF_SAMPLE_PERIOD) {
        perf_output_put(handle, data->period);
    }
    if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
        perf_output_sample_callchain(handle, event, data);
    }
    /* ... PERF_SAMPLE_RAW, PERF_SAMPLE_BRANCH_STACK etc */
}

六、上下文切换事件的触发

PERF_COUNT_SW_CONTEXT_SWITCHES 事件在调度器切换处触发：

// kernel/sched/core.c
/*
 * Excerpt of the scheduler core; elided code is marked with "...".
 * Shows the point where the outgoing task (prev) is handed to
 * perf_event_task_sched_out().
 *
 * NOTE(review): rq and next are not assigned in the visible lines;
 * presumably the elided code sets them — confirm.
 * NOTE(review): in mainline this hook is reached through
 * context_switch() -> prepare_task_switch(); confirm the condition shown
 * here against the target kernel.
 */
static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq_flags rf;
struct rq *rq;

/* ... */
prev = rq->curr;

if (prev->on_rq || sched_mode == SM_NONE) {
/* Notify perf that prev is being switched out (context-switch swevent). */
perf_event_task_sched_out(prev, next);
}
/* ... */
}

void perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
struct perf_event_context *ctx;
int ctxn;

for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (likely(!ctx))
continue;

perf_ctx_sched_out(ctx, EVENT_ALL);
}

/* 触发软件事件 */
perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
}

page_fault 事件在缺页异常处理路径触发：

// mm/memory.c
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;

/* ... */
/* 计数 major/minor fault */
if (flags & FAULT_FLAG_TRIED) {
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
} else {
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/* ... */
}

七、perf_swevent 的 per-CPU hlist 结构

swevent_htable 是 percpu 变量，将 event_id 映射到 hlist：

/*
 * Per-CPU bookkeeping for software events (accessed via per_cpu() /
 * this_cpu_ptr()).
 */
struct swevent_htable {
    /* Bucket table; allocated on first registration and published via RCU. */
    struct swevent_hlist    *swevent_hlist;
    /* Presumably serialises updates to swevent_hlist — no user is visible here. */
    struct mutex            hlist_mutex;
    /*
     * NOTE(review): sized by event ID here; mainline sizes its recursion
     * guard by PERF_NR_CONTEXTS — confirm the intended dimension.
     */
    int                     recursion[PERF_COUNT_SW_MAX];
};

/* Bucket table: one list of registered events per software event ID. */
struct swevent_hlist {
    struct hlist_head       heads[PERF_COUNT_SW_MAX];   /* indexed by event ID */
};

每个 CPU 有一个 swevent_htable，其 swevent_hlist 指针指向的数组 heads[event_id] 即对应事件 ID 的链表头。

事件插入与删除：

static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
struct swevent_hlist *hlist;
int event_id = event->attr.config;

/* 分配 hlist（只分配一次） */
if (!swhash->swevent_hlist) {
hlist = kzalloc(sizeof(struct swevent_hlist), GFP_KERNEL);
/* ... */
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}

/* 加入链表 */
hlist_add_head_rcu(&event->hlist_entry,
&swhash->swevent_hlist->heads[event_id]);
return 0;
}

八、与硬件事件的对比总结

| 特性 | 硬件事件 (PERF_TYPE_HARDWARE) | 软件事件 (PERF_TYPE_SOFTWARE) |
|-------------------|----------------------------------|---------------------------------|
| 计数来源 | PMU 寄存器 | 内核代码路径埋点（由 static key 开关） |
| 溢出触发 | PMU NMI 中断 | hrtimer 定时器 |
| 精度 | CPU 周期级（由 PMU 硬件计数） | 微秒至纳秒级（取决于 hrtimer） |
| 上下文感知 | 通过 exclude_user/exclude_kernel 由 PMU 按特权级过滤 | 显式通过 perf_sw_event 传递 pt_regs |
| 资源消耗 | 极低（硬件计数） | 函数调用 + 链表遍历 |
| 并发性 | 硬件原子操作 | local64_add + RCU 保护的链表遍历 |

软件事件的计数精度受限于内核代码中触发点的密度，而硬件事件可以精确到每个 CPU 周期。但软件事件的灵活性和可扩展性远超硬件事件。