Linux Perf Event Open 硬件事件采样与 Ring Buffer
Linux perf_event_open 硬件事件采样与 ring_buffer

一、系统调用入口与 perf_event 分配

perf_event_open 是 Linux 性能监控的核心系统调用,定义在 kernel/events/core.c:

```c
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *event;
	struct perf_event_attr attr;

	/* 从用户态拷贝属性 */
	if (copy_from_user(&attr, attr_uptr, sizeof(attr)))
		return -EFAULT;

	/* 权限校验:需要 CAP_SYS_ADMIN 或 perf_event_paranoid 许可 */
	err = perf_event_paranoid_check(&attr);

	/* 核心分配函数 */
	event = perf_event_alloc(&attr, cpu, task, NULL, NULL, NULL);
	if (IS_ERR(event))
		return PTR_ERR(event);

	/* 分配 event 文件描述符 */
	event_fd = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
	if (event_fd < 0) {
		perf_event_release_kernel(event);
		return event_fd;
	}

	/* 安装 fd 到当前进程的 fdtable */
	fd_install(event_fd, event_file);

	return event_fd;
}
```

二、perf_event_alloc 与 PMU 初始化

perf_event_alloc 分配 struct perf_event 结构体,初始化硬件事件上下文:

```c
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
		 struct task_struct *task,
		 struct perf_event *group_leader,
		 struct perf_event *parent_event,
		 perf_overflow_handler_t overflow_handler,
		 void *context)
{
	struct perf_event *event;
	int node;

	event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL, node);
	if (!event)
		return ERR_PTR(-ENOMEM);

	/* 初始化事件核心字段 */
	event->attr = *attr;
	event->cpu = cpu;
	event->state = PERF_EVENT_STATE_INACTIVE;

	/* 初始化采样相关结构 */
	if (attr->sample_period) {
		/* 硬件计数器溢出采样 */
		event->hw.sample_period = attr->sample_period;
		event->hw.last_period = attr->sample_period;
		local64_set(&event->hw.period_left, attr->sample_period);
	}

	/* 初始化 ring buffer(采样缓冲区) */
	if (attr->sample_type & PERF_SAMPLE_RAW ||
	    attr->watermark || attr->wakeup_events) {
		/* 在 mmap 时实际分配 ring buffer */
		event->rb = NULL;
		atomic_set(&event->rb_refcount, 0);
	}

	/* 分配 PMU 特定的 hw 结构 */
	event->pmu = perf_init_event(event);
	if (IS_ERR(event->pmu)) {
		err = PTR_ERR(event->pmu);
		goto err_free;
	}

	return event;
}
```

三、Ring Buffer 的数据结构

perf ring buffer 是通过 mmap 映射到用户态的数据区域,结构体为 struct perf_buffer:

```c
struct perf_buffer {
	int page;                               /* 当前写入页索引 */
	int nr_pages;                           /* 总页面数 */
	int overwritable;                       /* 是否可覆盖 */
	struct perf_event_mmap_page *user_page; /* 用户态头 */
	unsigned long *data_pages[];            /* 数据页指针数组 */
};
```

mmap 时分配 ring buffer 的路径:

```c
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct perf_event *event = file->private_data;
	unsigned long nr_pages;
	struct perf_buffer *rb;

	/* 检查权限和模式 */
	if (vma->vm_flags & VM_WRITE)
		return -EINVAL; /* 用户态只读 */

	/* 计算页数 */
	nr_pages = (vma->vm_end - vma->vm_start - PAGE_SIZE) >> PAGE_SHIFT;

	/* 分配 ring buffer */
	rb = rb_alloc(nr_pages,
		      event->attr.watermark ? PERF_RB_WATERMARK : 0,
		      event->attr.write_backward ? PERF_RB_WRITE_BACKWARD : 0);

	/* 初始化用户态头部区域 */
	rb->user_page->data_head = 0;
	rb->user_page->data_tail = 0;
	rb->user_page->data_offset = PAGE_SIZE; /* 数据区在头部页之后 */
	rb->user_page->data_size = nr_pages * PAGE_SIZE;

	event->rb = rb;

	/* 将 ring buffer 映射到用户空间 */
	vm_insert_page(vma, vma->vm_start, virt_to_page(rb->user_page));
	for (i = 0; i < nr_pages; i++)
		vm_insert_page(vma, vma->vm_start + (i + 1) * PAGE_SIZE,
			       virt_to_page(rb->data_pages[i]));

	return 0;
}
```

四、硬件采样路径:PMU 中断到 ring buffer 写入

硬件性能计数器溢出时触发 NMI 或中断,调用 perf_event_overflow:

```c
void perf_event_overflow(struct perf_event *event,
			 struct perf_sample_data *data,
			 struct pt_regs *regs)
{
	/* 1. 取当前 ring buffer */
	struct perf_buffer *rb = rcu_dereference(event->rb);
	if (!rb)
		return;

	/* 2. 计算下一次采样周期 */
	perf_event_update_userpage(event);

	/* 3. 将采样数据写入 ring buffer */
	int ret = perf_output_begin(&handle, event,
				    perf_sample_size(data));
	if (ret)
		return; /* buffer 满且不可覆盖 */

	/* 4. 写入 event type */
	perf_output_put(&handle, data->type);

	/* 5. 根据 sample_type 写入各字段 */
	if (event->attr.sample_type & PERF_SAMPLE_IP)
		perf_output_put(&handle, data->ip);
	if (event->attr.sample_type & PERF_SAMPLE_TID)
		perf_output_put(&handle, data->tid_entry);
	if (event->attr.sample_type & PERF_SAMPLE_TIME)
		perf_output_put(&handle, data->time);
	if (event->attr.sample_type & PERF_SAMPLE_CPU)
		perf_output_put(&handle, data->cpu_entry);
	if (event->attr.sample_type & PERF_SAMPLE_RAW)
		perf_output_put(&handle, data->raw);

	/* 6. 提交,刷新 data_head */
	perf_output_end(&handle);
}
```

perf_output_begin 的核心逻辑:

```c
int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size)
{
	struct perf_buffer *rb = rcu_dereference(event->rb);
	int wakeup_events = event->attr.wakeup_events;
	unsigned long head;

	/* 获取当前的 data_head */
	head = local_read(&rb->user_page->data_head);

	/* 检查是否有足够空间 */
	if (rb->overwritable) {
		/* 可覆盖模式:直接写,覆盖旧数据 */
	} else {
		unsigned long tail = READ_ONCE(rb->user_page->data_tail);

		/* 检查剩余空间 */
		if (head - tail > rb->nr_pages * PAGE_SIZE - size)
			return -ENOSPC;
	}

	/* 分配 handle 中的偏移 */
	handle->rb = rb;
	handle->event = event;
	handle->size = size;
	handle->offset = head;
	handle->wakeup = wakeup_events;

	return 0;
}
```

五、用户态读取采样数据

用户态通过 mmap 的 ring buffer 读取采样数据:

```c
struct perf_event_mmap_page *header = mmap(...);
u64 data_tail = header->data_tail;
u64 data_head = READ_ONCE(header->data_head);

/* 确保 data_tail 不跨 cache line */
smp_rmb(); /* 读屏障,保证 data_head 之前的数据可见 */

while (data_tail != data_head) {
	struct perf_event_header *ehdr;

	/* 取当前事件头部 */
	ehdr = (struct perf_event_header *)(data + (data_tail & mask));

	/* 处理事件 */
	process_sample_event(ehdr);

	/* 前进 data_tail */
	data_tail += ehdr->size;

	/* 绕过 buffer 尾部环回 */
	if (data_tail >= rb->data_size)
		data_tail -= rb->data_size;
}

/* 更新用户态的 data_tail */
header->data_tail = data_tail;
```

内核写入 data_head 时使用 smp_store_release 保证正确排序:

```c
static void perf_output_end(struct perf_output_handle *handle)
{
	struct perf_buffer *rb = handle->rb;

	/* 写入最后一条记录后更新 data_head */
	smp_store_release(&rb->user_page->data_head, handle->offset);

	/* 根据 watermark 和 wakeup_events 触发信号 */
	if (handle->wakeup) {
		handle->wakeup--;
		if (!handle->wakeup) {
			/* 唤醒用户态等待进程 */
			wake_up(&event->waitq);
			if (event->pending_kill)
				kill_fasync(&event->fasync, SIGIO, POLL_IN);
		}
	}
}
```

六、硬件 PMU 驱动的计数器配置

x86 架构的 PMU 初始化在 perf_event_intel.c 中:

```c
static int intel_pmu_hw_config(struct perf_event *event)
{
	/* 根据 attr->config 选择 PMC 编号 */
	if (event->attr.type == PERF_TYPE_HARDWARE) {
		switch (event->attr.config) {
		case PERF_COUNT_HW_CPU_CYCLES:
			event->hw.config = ARCH_PERFMON_EVENTSEL_OS |
					   ARCH_PERFMON_EVENTSEL_INT |
					   x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES);
			break;
		case PERF_COUNT_HW_INSTRUCTIONS:
			event->hw.config = ARCH_PERFMON_EVENTSEL_OS |
					   ARCH_PERFMON_EVENTSEL_INT |
					   x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS);
			break;
		}
	}

	/* 写入 MSR 寄存器 */
	wrmsrl(event->hw.event_base, event->hw.config);

	return 0;
}
```

PMU 溢出时,x86 的 NMI handler 查找对应的 perf_event:

```c
static int intel_pmu_handle_irq(struct pt_regs *regs)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int handled = 0;
	u64 status;

	/* 读取 IA32_PERF_GLOBAL_STATUS 寄存器 */
	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);

	/* 遍历所有 PMC,找到溢出的计数器 */
	for_each_set_bit(bit, (unsigned long *)&status, x86_pmu.num_events) {
		struct perf_event *event = cpuc->events[bit];

		/* 读取新计数器值,计算溢出次数 */
		u64 new_count = x86_pmu_event_read(event);

		/* 调用 perf_event_overflow 写入 ring buffer */
		perf_event_overflow(event, &data, regs);

		/* 重新加载计数器 */
		wrmsrl(event->hw.event_base + 1,
		       -event->hw.sample_period);

		handled = 1;
	}

	/* 确认中断 */
	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, status);

	return handled;
}
```

七、采样频率控制与节流

perf 的 interrupt throttle 机制防止采样中断过多:

```c
static void perf_sample_event_took(struct perf_event *event, u64 sample_len)
{
	/* 计算采样消耗的时钟数 */
	u64 throttle = 1000000; /* 1ms 硬限制 */

	if (sample_len > throttle) {
		/* 采样本身耗时过长,暂停事件 */
		event->pmu->stop(event, PERF_EF_UPDATE);
		event->state = PERF_EVENT_STATE_OFF;
		event->pending_disable = 1;

		/* 调度 timer 在 1 tick 后重新启用 */
		hrtimer_start(&event->hw.timer, ns_to_ktime(1), HRTIMER_MODE_REL);
	}
}
```

八、mmap 页面布局总结

perf ring buffer mmap 的完整布局:

```
Offset   Content
------   -------
0x0000   struct perf_event_mmap_page (1 page)
0x1000   数据页 0
0x2000   数据页 1
...
0xN000   数据页 N-1 (nr_pages)
```

用户态通过 data_tail/data_head 协议实现生产者-消费者模型:内核写入 data_head,用户态读取后推进 data_tail。