perf_event source code analysis
foreword
In simple terms, perf is a performance monitoring tool. It first programs the performance counters provided by the general-purpose processor, setting the counter thresholds and the events to monitor; the performance counter then increments each time the configured event occurs. When the count value reaches the threshold, different architectures extract the counter value in different ways. For example, on MIPS a hardware interrupt is registered so that an interrupt is triggered when the counter overflows, and the value is recorded in the interrupt handler. perf uses the notification-chain mechanism to register its overflow handler on the die_chain notification chain, so that any hardware interrupt becomes an opportunity to check whether the performance counter has overflowed; if it has, the value is recorded. This implementation avoids having to register a dedicated hardware interrupt just for performance counter overflow.
The perf source code is divided into a user layer and a kernel layer. The user-layer code lets the user specify events and sampling methods on the command line. A major strength of perf lies in its rich user-layer tools; it can be said that the kernel part of the code only provides a sampling engine, while the user layer is the essence of perf. The user-layer code is located in the src/tools/perf directory; the C code is about 13,000 lines, plus a large number of scripts. The kernel-layer code is divided into architecture-independent code (located in the src/kernel/core/ directory) and architecture-dependent code (located in src/arch/x86/cpu/**).
Here is the framework of this analysis: first, start from system startup and the initialization work related to perf; then introduce how user-level-specified events are passed to the kernel through system calls, how sampling is performed, and how the sampled data is returned to user level through memory mapping, where the user-level tools perform higher-level analysis and display.
perf_event source code analysis (1) - cmd_record
perf's main entry
tools/perf/perf.c
/* Sub-command dispatch table for the perf binary: maps each command
 * name (e.g. "record", "stat") to its cmd_*() handler. Entries such
 * as "probe" and "trace" are compiled in only when the corresponding
 * optional library support (libelf, libaudit) is available. */
static struct cmd_struct commands[] = {
{ "buildid-cache", cmd_buildid_cache, 0 },
{ "buildid-list", cmd_buildid_list, 0 },
{ "diff", cmd_diff, 0 },
{ "evlist", cmd_evlist, 0 },
{ "help", cmd_help, 0 },
{ "list", cmd_list, 0 },
{ "record", cmd_record, 0 },
{ "report", cmd_report, 0 },
{ "bench", cmd_bench, 0 },
{ "stat", cmd_stat, 0 },
{ "timechart", cmd_timechart, 0 },
{ "top", cmd_top, 0 },
{ "annotate", cmd_annotate, 0 },
{ "version", cmd_version, 0 },
{ "script", cmd_script, 0 },
{ "sched", cmd_sched, 0 },
#ifdef HAVE_LIBELF_SUPPORT
{ "probe", cmd_probe, 0 },
#endif
{ "kmem", cmd_kmem, 0 },
{ "lock", cmd_lock, 0 },
{ "kvm", cmd_kvm, 0 },
{ "test", cmd_test, 0 },
#ifdef HAVE_LIBAUDIT_SUPPORT
{ "trace", cmd_trace, 0 },
#endif
{ "inject", cmd_inject, 0 },
{ "mem", cmd_mem, 0 },
{ "data", cmd_data, 0 },
};
perf record's CALL CHAIN:
cmd_record
;; new a struct "record" rec, and a struct "evlist" in rec->evlist;
perf_evlist__new
perf_config
__cmd_record(&record, argc, argv); // fill out "struct record"
perf_session__new(file, false, tool); // New a session for this rec, rec->session; attention: file is "struct perf_data_file *file", &rec->file;
machines__init(&session->machines);
ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
perf_data_file__open(file)
check_pipe(file)
file->path = "perf.data" // If not specified name, fill out file->path
open_file(file);
fd = perf_data_file__is_read(file) ? open_file_read(file) : open_file_write(file);
file->fd = fd;
perf_session__create_kernel_maps(session) //
fd = perf_data_file__fd(file); // Get rec's fd, rec->file->fd
record__init_features(rec);
perf_header__set_feat // Fill out session's header of this rec, rec->session->header
record__open(rec)
perf_evlist__config(evlist, opts); // perf_evlist
perf_evsel__config(evsel, opts); // perf_evsel
perf_header__clear_feat
perf_header__write_pipe / perf_session__write_header
perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, machine);
perf_event__synthesize_modules(tool, process_synthesized_event, machine);
machines__process_guests(&session->machines,perf_event__synthesize_guest_os, tool);
__machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,process_synthesized_event, opts->sample_address);
tools/perf/builtin-record.c
/* Entry point for "perf record": allocates the event list (evlist),
 * applies config-file settings, parses command-line options, and
 * validates the target/cgroup combination before recording starts.
 * NOTE(review): this is a truncated excerpt — err is set but never
 * used here, and the rest of the function body (which goes on to
 * call __cmd_record) is elided from this view. */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
int err = -ENOMEM;
struct record *rec = &record;
char errbuf[BUFSIZ];
/* One perf_evsel per requested event will live on this list. */
rec->evlist = perf_evlist__new();
if (rec->evlist == NULL)
return -ENOMEM;
perf_config(perf_record_config, rec); // parse config, tools/perf/util/config.c
argc = parse_options(argc, argv, record_options, record_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
/* No workload to run and no target specified: print usage. */
if (!argc && target__none(&rec->opts.target))
usage_with_options(record_usage, record_options);
/* cgroup monitoring is only valid together with system-wide mode. */
if (nr_cgroups && !rec->opts.target.system_wide) {
ui__error("cgroup monitoring only available in"
" system-wide mode\n");
usage_with_options(record_usage, record_options);
}
}
tools/perf/util/parse-events.c
setup_events // tools/perf/builtin-stat.c
parse_events // tools/perf/util/parse-events.c
parse_events // tools/perf/util/parse-events.c
/* Parse the event specification string str (e.g. "cycles,instructions")
 * and, on success, splice the resulting evsels onto the tail of evlist.
 * Returns 0 on success or a negative value on parse error. */
int parse_events(struct perf_evlist *evlist, const char *str)
{
struct parse_events_evlist data = {
.list = LIST_HEAD_INIT(data.list),
.idx = evlist->nr_entries,
};
int ret;
ret = parse_events__scanner(str, &data, PE_START_EVENTS);
perf_pmu__parse_cleanup();
if (!ret) {
/* Number of new evsels produced by this parse. */
int entries = data.idx - evlist->nr_entries;
perf_evlist__splice_list_tail(evlist, &data.list, entries);
evlist->nr_groups += data.nr_groups;
return 0;
}
/*
 * There are 2 users - builtin-record and builtin-test objects.
 * Both call perf_evlist__delete in case of error, so we don't
 * need to bother.
 */
return ret;
}
struct introduction
tools/perf/util/target.h
/* Describes what perf should monitor: specific pids/tids, a cpu list,
 * a uid, or the whole system (system_wide). */
struct target {
const char *pid;
const char *tid;
const char *cpu_list;
const char *uid_str;
uid_t uid;
bool system_wide;
bool uses_mmap;
bool default_per_cpu;
bool per_thread;
};
===
tools/perf/util/data.h
/* Handle for the perf.data file (or pipe) that record writes and
 * report reads. */
struct perf_data_file {
const char *path; // defaults to "perf.data" when not specified
int fd; // set by open_file_read()/open_file_write()
bool is_pipe;
bool force;
unsigned long size;
enum perf_data_mode mode; // read vs. write
};
===
tools/perf/util/session.h
/* One recording/reporting session: ties together the file header,
 * the machines being profiled, the event list, ordered-event delivery
 * and the perf.data file. Created by perf_session__new(). */
struct perf_session {
struct perf_header header;
struct machines machines; // initialized by machines__init()
struct perf_evlist *evlist;
struct trace_event tevent;
bool repipe;
bool one_mmap;
void *one_mmap_addr;
u64 one_mmap_offset;
struct ordered_events ordered_events;
struct perf_data_file *file;
struct perf_tool *tool;
};
===
tools/perf/util/evlist.h
/* The list of all perf_evsel event selectors for a run, plus the
 * thread/cpu maps and mmap state they share. */
struct perf_evlist {
struct list_head entries; // list of perf_evsel nodes
struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
int nr_entries;
int nr_groups;
int nr_mmaps;
size_t mmap_len;
int id_pos;
int is_pos;
u64 combined_sample_type;
struct {
int cork_fd;
pid_t pid;
} workload; // child started by perf_evlist__prepare_workload()
bool overwrite;
struct fdarray pollfd;
struct perf_mmap *mmap;
struct thread_map *threads; // threads
struct cpu_map *cpus; // cpus
struct perf_evsel *selected;
struct events_stats stats;
};
===
/** struct perf_evsel - event selector **/
Each event passed from user space maps to one perf_evsel struct.
/* Event selector: one instance per user-specified event.
 * NOTE(review): truncated excerpt — several fields are elided ("..."). */
struct perf_evsel {
struct list_head node; // linkage on perf_evlist->entries
struct perf_event_attr attr; // passed to sys_perf_event_open()
char *filter;
struct xyarray *fd; // per-(cpu, thread) perf fds, see perf_evsel__alloc_fd()
struct xyarray *sample_id;
u64 *id;
struct perf_counts *counts; // allocated by perf_evsel__alloc_counts()
struct perf_counts *prev_raw_counts;
int idx;
u32 ids;
char *name;
double scale;
const char *unit;
bool snapshot;
struct event_format *tp_format;
...
...
struct perf_evsel *leader; // group leader, see perf_evlist__set_leader()
}
===
tools/perf/builtin-record.c
/* Top-level state for the "perf record" command: the tool callbacks,
 * options, output file, event list and session, all in one place. */
struct record {
struct perf_tool tool;
struct record_opts opts;
u64 bytes_written;
struct perf_data_file file; // note: embedded, &rec->file is passed around
struct perf_evlist *evlist;
struct perf_session *session;
const char *progname;
int realtime_prio;
bool no_buildid;
bool no_buildid_cache;
long samples;
};
===
An important point: "struct perf_stat" contains an array of three "struct stats" elements,
and it is initialized as follows:
for (i = 0; i < 3; i++)
init_stats(&ps->res_stats[i]);
/* Per-evsel private stats (stored in evsel->priv): three running
 * statistics — presumably one per counter value (val/ena/run),
 * matching perf_counts_values — TODO confirm. */
struct perf_stat {
struct stats res_stats[3];
};
tools/perf/util/stat.h
/* Running-statistics accumulator updated via update_stats().
 * NOTE(review): the n/mean/M2 fields suggest a Welford-style
 * mean/variance computation — confirm against update_stats(). */
struct stats
{
double n, mean, M2;
u64 max, min;
};
====
tools/perf/util/evsel.h
/* Raw counter read-out as returned by read(2) on a perf fd:
 * the count value plus two timing fields (ena/run — presumably
 * time-enabled and time-running), accessible either by name or
 * uniformly as values[3]. */
struct perf_counts_values {
union {
struct {
u64 val;
u64 ena;
u64 run;
};
u64 values[3];
};
};
/* Per-evsel counts: the aggregated total plus a flexible array of
 * per-cpu values (sized by perf_evsel__nr_cpus() at allocation). */
struct perf_counts {
s8 scaled;
struct perf_counts_values aggr;
struct perf_counts_values cpu[];
};
perf stat's CALL CHAIN
CALL CHAIN:
commands // tools/perf/perf.c
cmd_stat // tools/perf/builtin-stat.c
parse_events_option // If perf stat -e xxx, specified event name, will check this event name
parse_events
parse_events__scanner // check events
parse_events_lex_init_extra
parse_events__scan_string
parse_events_parse
parse_events__flush_buffer
parse_events__delete_buffer
parse_events_lex_destroy
perf_pmu__parse_cleanup:
perf_evlist__new();
perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, struct thread_map *threads) // evlist->cpus, evlist->threads
perf_evlist__set_maps ///
parse_options
parse_options_usage
add_default_attributes()
target__validate(&target);
perf_evlist__create_maps(evsel_list, &target) // fill out evlist->threads(thread_map)
evlist->threads = thread_map__new_str(target->pid, target->tid,target->uid); // evlist->threads
evlist->threads(thread_map) = [tid,tid,tid,tid,...]
target__uses_dummy_map(target)
evlist->cpus = cpu_map__dummy_new() // evlist->cpus
evlist->cpus = cpu_map__new(target->cpu_list)
perf_evlist__alloc_stats(evsel_list, interval) // Traverse all evsel
evlist__for_each(evlist, evsel) {
perf_evsel__alloc_stat_priv(evsel) // Alloc memory for each evsel->priv = zalloc(sizeof(struct perf_stat));
perf_evsel__reset_stat_priv(evsel)
init_stats // Fill out "struct perf_stat", perf_stat include 3 elements of "struct stats{}"
perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) // Alloc evsel->counts
alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) // Alloc evsel->prev_raw_counts = addr;
}
perf_stat_init_aggr_mode()
cpu_map__build_socket_map
cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
cpu_map__get_socket
cpu_map__build_core_map
cpu_map__build_map(cpus, corep, cpu_map__get_core);
cpu_map__get_core
cpu_map__get_socket
run_perf_stat(argc, argv);
__run_perf_stat(argc, argv);
perf_evlist__prepare_workload(evsel_list, &target, argv, false, workload_exec_failed_signal)
perf_evlist__set_leader(evsel_list); // evlist->nr_groups = 1 or 0 ? decide by evlist->nr_entries > 1 or not
__perf_evlist__set_leader(&evlist->entries);
evlist__for_each(evsel_list, evsel) { // Traverse all evsel
create_perf_stat_counter(evsel)
struct perf_event_attr *attr = &evsel->attr;
attr->xxx = xxx
perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)
perf_evsel__is_group_leader(evsel)
perf_evsel__open_per_thread(evsel, evsel_list->threads)
// important: __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads)
__perf_evsel__open(evsel, &empty_cpu_map.map, threads)
// perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads), if system_wide: nthreads = 1
perf_evsel__alloc_fd(evsel, cpus->nr, nthreads)
evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
for (cpu = 0; cpu < cpus->nr; cpu++) {
for (thread = 0; thread < nthreads; thread++) {
group_fd = get_group_fd(evsel, cpu, thread);
sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu], group_fd, flags);
}
}
}
perf_evlist__apply_filters(evsel_list, &counter)
evlist__for_each(evlist, evsel) {
perf_evsel__set_filter(evsel, ncpus, nthreads, evsel->filter);
}
t0 = rdclock();
clock_gettime(CLOCK_MONOTONIC, &ref_time);
if (forks) {
perf_evlist__start_workload(evsel_list);
handle_initial_delay();
if (interval) {
print_interval();
}
} else {
handle_initial_delay();
print_interval();
}
t1 = rdclock();
update_stats(&walltime_nsecs_stats, t1 - t0);
// Start reading the counter for each evsel
if (aggr_mode == AGGR_GLOBAL) {
evlist__for_each(evsel_list, counter) {
// Read into "struct perf_counts_values", saved in the evsel's &counter->counts->aggr (here evsel is counter)
// There is also a "struct perf_stat" hanging off counter->priv
read_counter_aggr(counter);
aggr->val = aggr->ena = aggr->run = 0; // Here, initialize all of perf_counts_values aggr to zero
read_counter(counter) // How is this event read? Traverse every thread and cpu
int nthreads = thread_map__nr(evsel_list->threads);
int ncpus = perf_evsel__nr_cpus(counter);
int cpu, thread;
for (thread = 0; thread < nthreads; thread++) {
for (cpu = 0; cpu < ncpus; cpu++) {
// Read as a process + cpu two-dimensional array, into "struct perf_counts_values count"
process_per_cpu(struct perf_evsel *evsel, int cpu, int thread))
perf_evsel__read_cb(evsel, cpu, thread, &count)
memset(count, 0, sizeof(*count));
FD(evsel, cpu, thread)
readn(FD(evsel, cpu, thread), count, sizeof(*count))
ion(true, fd, buf, n);
read(fd, buf, left)
read_cb(evsel, cpu, thread, tmp);
switch (aggr_mode) {
case AGGR_CORE:
case AGGR_SOCKET:
case AGGR_NONE:
perf_evsel__compute_deltas(evsel, cpu, count);
perf_counts_values__scale(count, scale, NULL);
update_shadow_stats(evsel, count->values, cpu);
}
}
}
perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), thread_map__nr(evsel_list->threads));
}
} else {
evlist__for_each(evsel_list, counter) {
read_counter(counter);
perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
}
}
print_stat
print_aggr // AGGR_CORE AGGR_SOCKET
print_counter_aggr(evsel, NULL); // AGGR_GLOBAL
print_counter(evsel, NULL) // AGGR_NONE
tools/perf/util/evsel.h
/* NOTE(review): empty stub — the full struct perf_evsel excerpt
 * already appears earlier in these notes; this trailing duplicate
 * adds nothing and could be removed. */
struct perf_evsel {
}