Linux Perf Swevent软件事件计数与Hrtimer触发
Linux perf_swevent 软件事件计数与 hrtimer 触发
一、软件事件的分类与注册
perf 软件事件(swevent)是指不由硬件 PMU 计数,而由内核代码路径直接触发的事件。使用时将 perf_event_attr.type 设为 PERF_TYPE_SOFTWARE,并在 attr.config 中填入下列预定义事件 ID 之一:
/*
 * Identifiers of the predefined software events. The event initialisation
 * code reads the id from attr.config and range-checks it against
 * PERF_COUNT_SW_MAX. Every value is spelled out explicitly so that inserting
 * a new id can never silently renumber the existing ones.
 */
enum perf_sw_ids {
	PERF_COUNT_SW_CPU_CLOCK		= 0,	/* CPU clock, driven by a high-resolution timer */
	PERF_COUNT_SW_TASK_CLOCK	= 1,	/* task run-time clock */
	PERF_COUNT_SW_PAGE_FAULTS	= 2,	/* page faults (all) */
	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,	/* context switches */
	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,	/* task migrations between CPUs */
	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,	/* minor page faults */
	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,	/* major page faults */
	PERF_COUNT_SW_ALIGNMENT_FAULTS	= 7,	/* alignment faults */
	PERF_COUNT_SW_EMULATION_FAULTS	= 8,	/* emulated instructions */
	PERF_COUNT_SW_DUMMY		= 9,	/* placeholder (dummy) event */
	PERF_COUNT_SW_BPF_OUTPUT	= 10,	/* BPF output */
	PERF_COUNT_SW_CGROUP_SWITCHES	= 11,	/* cgroup switches */

	PERF_COUNT_SW_MAX,			/* number of ids above; not an event */
};
二、perf_swevent_init 初始化
软件事件的初始化函数为 perf_swevent_init,事件链表则由 swevent_hlist_get 和 swevent_hlist_put 负责管理:
static int perf_swevent_init(struct perf_event *event)
{
int event_id = event->attr.config;
/* 检查 event_id 范围 */
if (event_id >= PERF_COUNT_SW_MAX)
return -ENOENT;
/* 设置 PMU 回调 */
event->pmu = &perf_swevent;
/* 如果是跟踪点事件,初始化 hlist 表 */
if (event_id == PERF_COUNT_SW_DUMMY ||
event_id >= PERF_COUNT_SW_MAX) {
/* 动态事件需要特殊处理 */
}
/* 初始化定时器相关字段(用于采样的周期性触发) */
if (is_sampling_event(event)) {
/* 分配 hrtimer */
event->hw.timer = hrtimer_alloc(perf_swevent_hrtimer,
CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
}
return 0;
}
三、hrtimer 驱动的周期性采样
对于 CPU_CLOCK 和 TASK_CLOCK 等需要周期性计数的 swevent,perf_swevent 使用 hrtimer 定时触发。事件启用时启动定时器:
/*
 * perf_swevent_start - enable a software event.
 * @event: event to start
 * @flags: not consulted by this function
 *
 * For a sampling event the period is copied from attr into the hw state and,
 * when it is non-zero, the hrtimer is armed to expire one period from now.
 * The event is marked active in every case.
 */
static void perf_swevent_start(struct perf_event *event, int flags)
{
	if (is_sampling_event(event)) {
		struct hw_perf_event *hw = &event->hw;
		u64 interval_ns;

		hw->sample_period = event->attr.sample_period;
		interval_ns = hw->sample_period;

		/* A zero period means there is nothing to schedule. */
		if (interval_ns != 0)
			hrtimer_start(&hw->timer, ns_to_ktime(interval_ns),
				      HRTIMER_MODE_REL_PINNED);
	}

	event->state = PERF_EVENT_STATE_ACTIVE;
}
/*
 * perf_swevent_stop - disable a software event.
 * @event: event to stop
 * @flags: not consulted by this function
 *
 * Cancels the sampling hrtimer, if the event has one, and marks the event
 * inactive.
 */
static void perf_swevent_stop(struct perf_event *event, int flags)
{
	/*
	 * The timer is only set up for sampling events; cancelling a timer
	 * that was never initialised is not safe, so mirror the same check
	 * that guards its setup and start.
	 */
	if (is_sampling_event(event))
		hrtimer_cancel(&event->hw.timer);

	event->state = PERF_EVENT_STATE_INACTIVE;
}
定时器回调函数 perf_swevent_hrtimer:
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
struct hw_perf_event *hwc;
struct perf_event *event;
struct perf_sample_data data;
struct pt_regs *regs;
/* 从 hrtimer 反推 event */
hwc = container_of(hrtimer, struct hw_perf_event, timer);
event = container_of(hwc, struct perf_event, hw);
/* 如果 event 已关闭则不再重启 */
if (event->state != PERF_EVENT_STATE_ACTIVE)
return HRTIMER_NORESTART;
/* 递增计数器 */
local64_add(event->attr.sample_period, &event->count);
/* 构造采样数据 */
perf_sample_data_init(&data, 0, event->hw.last_period);
data.period = event->attr.sample_period;
data.time = perf_event_time(event);
/* 获取 pt_regs,如果可能 */
regs = get_irq_regs();
/* 调用溢出处理(写入 ring buffer) */
if (perf_event_overflow(event, &data, regs))
/* 如果 throttle 了,不再重启定时器 */
return HRTIMER_NORESTART;
/* 重置定时器 */
hrtimer_forward_now(hrtimer,
ns_to_ktime(event->attr.sample_period));
return HRTIMER_RESTART;
}
四、perf_swevent_event 内核调用点
内核代码通过 perf_sw_event 函数在关键路径埋点:
/*
 * perf_sw_event - report @nr occurrences of software event @event_id.
 * @event_id: index into perf_swevent_enabled[] identifying the event
 * @nr:       number of occurrences to report
 * @regs:     register state forwarded to the dispatcher
 * @addr:     address stored in the sample data
 *
 * Does nothing unless the static key for @event_id is enabled, so call sites
 * without listeners take only the fast path.
 */
void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
	if (static_key_false(&perf_swevent_enabled[event_id])) {
		struct perf_sample_data sample;

		perf_sample_data_init(&sample, addr, 0);
		sample.raw = NULL;

		perf_swevent_event(event_id, nr, &sample, regs);
	}
}
perf_swevent_event 的查找与分发逻辑:
static void perf_swevent_event(struct perf_event *swevent, u64 nr,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hlist_head *head;
struct perf_event *event;
/* 通过 percpu hlist 查找所有注册的 event */
head = this_cpu_ptr(&swevent_htable.recursion[event_id].head);
rcu_read_lock();
hlist_for_each_entry_rcu(event, head, hlist_entry) {
/* 检查 event 是否被过滤 */
if (perf_swevent_match(event, data, regs))
/* 写入 perf ring buffer */
perf_swevent_add(event, nr, data, regs);
}
rcu_read_unlock();
}
五、perf_swevent_add 与 ring buffer 写入
perf_swevent_add 负责计数递增和采样数据输出:
/*
 * perf_swevent_add - account @nr occurrences on @event and emit a sample.
 * @event: target event
 * @nr:    number of occurrences to add to the counter
 * @data:  sample data for the output record
 * @regs:  register state forwarded to the output helpers
 *
 * Return: 0 on success or when no sample is wanted, otherwise the non-zero
 * result of perf_output_begin().
 */
static int perf_swevent_add(struct perf_event *event, u64 nr,
			    struct perf_sample_data *data,
			    struct pt_regs *regs)
{
	struct perf_output_handle handle;
	int ret;

	/*
	 * Counting is unconditional. PERF_SAMPLE_READ only selects whether
	 * counter values are embedded in a sample record, so it must not gate
	 * the increment; and the sample period is not an occurrence count, so
	 * only @nr is added.
	 */
	local64_add(nr, &event->count);

	/* Pure counting events produce no ring-buffer output. */
	if (!event->attr.sample_period)
		return 0;

	/*
	 * NOTE(review): a record is written for every call while sample_period
	 * is non-zero; no period countdown is performed here.
	 */
	ret = perf_output_begin(&handle, data, event, regs);
	if (ret)
		return ret;

	perf_output_sample(&handle, event, data, regs);
	perf_output_end(&handle);

	return 0;
}
perf_output_sample 按固定次序写入各个标准字段,sample_type 中的各个位只决定对应字段是否出现:
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
/* 固定顺序写入 */
if (event->attr.sample_type & PERF_SAMPLE_IP)
perf_output_put(handle, data->ip);
if (event->attr.sample_type & PERF_SAMPLE_TID)
perf_output_put(handle, data->tid_entry);
if (event->attr.sample_type & PERF_SAMPLE_TIME)
perf_output_put(handle, data->time);
if (event->attr.sample_type & PERF_SAMPLE_ADDR)
perf_output_put(handle, data->addr);
if (event->attr.sample_type & PERF_SAMPLE_ID)
perf_output_put(handle, data->id);
if (event->attr.sample_type & PERF_SAMPLE_STREAM_ID)
perf_output_put(handle, data->stream_id);
if (event->attr.sample_type & PERF_SAMPLE_CPU)
perf_output_put(handle, data->cpu_entry);
if (event->attr.sample_type & PERF_SAMPLE_PERIOD)
perf_output_put(handle, data->period);
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
perf_output_sample_callchain(handle, event, data);
/* ... PERF_SAMPLE_RAW, PERF_SAMPLE_BRANCH_STACK etc */
}
六、上下文切换事件的触发
PERF_COUNT_SW_CONTEXT_SWITCHES 事件在调度器切换处触发:
// kernel/sched/core.c
/*
 * Excerpt of the scheduler entry point, cut down to the place where the perf
 * sched-out hook is invoked. The placeholder comments mark code that has been
 * left out.
 *
 * @sched_mode: scheduling mode; compared against SM_NONE below.
 *
 * NOTE(review): in this excerpt rq and next are never assigned before they
 * are used, and switch_count and rf are unused; presumably the elided code
 * covers them - confirm against the full source.
 */
static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq_flags rf;
struct rq *rq;
/* ... */
prev = rq->curr;
if (prev->on_rq || sched_mode == SM_NONE) {
/* Hand the outgoing and incoming tasks to the perf sched-out hook. */
perf_event_task_sched_out(prev, next);
}
/* ... */
}
/*
 * perf_event_task_sched_out - perf hook run when @task is switched out.
 * @task: task leaving the CPU
 * @next: task about to run; not consulted by this function
 *
 * Schedules out every perf context attached to @task, then reports one
 * PERF_COUNT_SW_CONTEXT_SWITCHES software event.
 */
void perf_event_task_sched_out(struct task_struct *task,
			       struct task_struct *next)
{
	int nr;

	for_each_task_context_nr(nr) {
		struct perf_event_context *task_ctx = task->perf_event_ctxp[nr];

		/* Most tasks carry no perf context in a given slot. */
		if (likely(!task_ctx))
			continue;

		perf_ctx_sched_out(task_ctx, EVENT_ALL);
	}

	perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
}
page_fault 事件在缺页异常处理路径触发:
// mm/memory.c
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
/* ... */
/* 计数 major/minor fault */
if (flags & FAULT_FLAG_TRIED) {
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
} else {
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/* ... */
}
七、perf_swevent 的 per-CPU hlist 结构
swevent_htable 是 percpu 变量,将 event_id 映射到 hlist:
/*
 * Software-event lookup state. It is accessed through per_cpu(), i.e. there
 * is one instance per CPU.
 */
struct swevent_htable {
/* Table of list heads; allocated on first use and published with rcu_assign_pointer(). */
struct swevent_hlist *swevent_hlist;
/* NOTE(review): presumably serialises allocation and teardown of swevent_hlist - confirm against the users of this lock. */
struct mutex hlist_mutex;
/* One int per software event id. NOTE(review): presumably recursion guards - confirm. */
int recursion[PERF_COUNT_SW_MAX];
};
/* Table of event lists: one list head per software event id, indexed by event_id. */
struct swevent_hlist {
struct hlist_head heads[PERF_COUNT_SW_MAX];
};
每个 CPU 有一个 swevent_htable,其 swevent_hlist 成员指向的表中,heads[event_id] 即对应事件链表的表头。
事件插入与删除:
static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
struct swevent_hlist *hlist;
int event_id = event->attr.config;
/* 分配 hlist(只分配一次) */
if (!swhash->swevent_hlist) {
hlist = kzalloc(sizeof(struct swevent_hlist), GFP_KERNEL);
/* ... */
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
/* 加入链表 */
hlist_add_head_rcu(&event->hlist_entry,
&swhash->swevent_hlist->heads[event_id]);
return 0;
}
八、与硬件事件的对比总结
| 特性 | 硬件事件 (PERF_TYPE_HARDWARE) | 软件事件 (PERF_TYPE_SOFTWARE) |
|-------------------|----------------------------------|---------------------------------|
| 计数来源 | PMU 寄存器 | 内核代码路径埋点(static key 开关) |
| 溢出触发 | PMU NMI 中断 | hrtimer 定时器 |
| 精度 | 纳秒级(受外频限制) | 微秒至纳秒级(取决于 hrtimer) |
| 上下文感知 | 由 PMU 按特权级过滤(exclude_user/exclude_kernel) | 显式通过 perf_sw_event 传递 pt_regs |
| 资源消耗 | 极低(硬件计数) | 函数调用 + 链表遍历 |
| 并发性 | 硬件原子操作 | local64_add + RCU 保护的链表遍历 |
软件事件的计数精度受限于内核代码中触发点的密度,而硬件事件可以精确到每个 CPU 周期。但软件事件的灵活性和可扩展性远超硬件事件。
