perf — perf stat user-layer code analysis

perf_event source code analysis

foreword

In simple terms, perf is a performance monitoring tool. It first programs the performance counters provided by the general-purpose processor, setting the counter thresholds and the events to count; the counter then increments each time the configured event occurs. When the count reaches the threshold, the counter value is extracted in an architecture-specific way. On MIPS, for example, the overflow handler is registered on the die_chain notification chain, so the check for performance-counter overflow piggybacks on whichever hardware interrupt happens to fire next; if a counter has overflowed, its value is recorded in that handler. This implementation avoids having to register a dedicated hardware interrupt for performance-counter overflow.

The perf source code is divided into the user layer and the kernel layer. The user layer code provides the user with command-line specified events and sampling methods. A major feature of perf is reflected in the rich user layer tools. It can be said that the kernel part of the code only provides a sampling engine for perf , the user layer is the essence of perf. The user layer code is located in the src/tools/perf directory, the c code has about 13000 lines, and there are a large number of script programs. The kernel layer code is divided into structure-independent code (located in the src/kernel/core/ directory) and structure-related code (located in src/arch/x86/cpu/**).

Here is the framework of this analysis: start from system startup and initialization (the perf-init related work); then cover how events specified at user level are passed to the kernel through system calls and sampled; and finally how the sampled data is returned to user level through memory mapping, where the user-level tools perform the higher-level analysis and display.

perf_event source code analysis (1) - cmd_record

perf's main entry

tools/perf/perf.c

/*
 * Sub-command dispatch table for the top-level "perf" binary.
 * Each entry maps a sub-command name (argv[0] after "perf") to its
 * cmd_* handler; the third field holds per-command option flags
 * (0 = none here).  Commands guarded by #ifdef are only built when
 * the corresponding library support was detected at build time.
 */
static struct cmd_struct commands[] = {
    { "buildid-cache", cmd_buildid_cache, 0 },
    { "buildid-list", cmd_buildid_list, 0 },
    { "diff",   cmd_diff,   0 },
    { "evlist", cmd_evlist, 0 },
    { "help",   cmd_help,   0 },
    { "list",   cmd_list,   0 },
    { "record", cmd_record, 0 },
    { "report", cmd_report, 0 },
    { "bench",  cmd_bench,  0 },
    { "stat",   cmd_stat,   0 },
    { "timechart",  cmd_timechart,  0 },
    { "top",    cmd_top,    0 },
    { "annotate",   cmd_annotate,   0 },
    { "version",    cmd_version,    0 },
    { "script", cmd_script, 0 },
    { "sched",  cmd_sched,  0 },
#ifdef HAVE_LIBELF_SUPPORT
    { "probe",  cmd_probe,  0 },
#endif
    { "kmem",   cmd_kmem,   0 },
    { "lock",   cmd_lock,   0 },
    { "kvm",    cmd_kvm,    0 },
    { "test",   cmd_test,   0 },
#ifdef HAVE_LIBAUDIT_SUPPORT
    { "trace",  cmd_trace,  0 },
#endif
    { "inject", cmd_inject, 0 },
    { "mem",    cmd_mem,    0 },
    { "data",   cmd_data,   0 },
};

perf record's CALL CHAIN:

cmd_record
    ;; new a struct "record" rec, and a struct "evlist" in rec->evlist;
    perf_evlist__new
    perf_config
    __cmd_record(&record, argc, argv); // fill out "struct record" 
        perf_session__new(file, false, tool); // Create a new session for this rec (rec->session); note: file is a "struct perf_data_file *", i.e. &rec->file
            machines__init(&session->machines);
            ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
            perf_data_file__open(file) 
                check_pipe(file)
                file->path = "perf.data" // If not specified name, fill out file->path
                open_file(file);
                    fd = perf_data_file__is_read(file) ? open_file_read(file) : open_file_write(file);
                    file->fd = fd;
            perf_session__create_kernel_maps(session) // 
        fd = perf_data_file__fd(file); // Get rec's fd, rec->file->fd
        record__init_features(rec); 
            perf_header__set_feat // Fill out session's header of this rec, rec->session->header
        record__open(rec)
            perf_evlist__config(evlist, opts); // perf_evlist
                perf_evsel__config(evsel, opts); // perf_evsel
        perf_header__clear_feat
        perf_header__write_pipe / perf_session__write_header
        perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, machine);
        perf_event__synthesize_modules(tool, process_synthesized_event, machine);
        machines__process_guests(&session->machines,perf_event__synthesize_guest_os, tool);
        __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,process_synthesized_event, opts->sample_address);
        
        
tools/perf/builtin-record.c

/*
 * cmd_record - entry point for "perf record".
 *
 * Allocates the event list for the global &record, reads the perf
 * config file, parses the record-specific command-line options, and
 * validates option combinations before handing off to __cmd_record()
 * (see the call chain above) which performs the actual recording.
 *
 * Returns 0 on success, a negative error code on failure
 * (-ENOMEM if the evlist cannot be allocated).
 *
 * Note: the original excerpt declared an unused "char errbuf[BUFSIZ]"
 * and fell off the end of this non-void function (undefined
 * behavior); both are fixed here.
 */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
    int err = -ENOMEM;
    struct record *rec = &record;

    rec->evlist = perf_evlist__new();
    if (rec->evlist == NULL)
        return -ENOMEM;

    /* Parse the perf config file; see tools/perf/util/config.c. */
    perf_config(perf_record_config, rec);

    argc = parse_options(argc, argv, record_options, record_usage,
                PARSE_OPT_STOP_AT_NON_OPTION);
    /* Without a workload command or an explicit target there is
     * nothing to record: print usage and exit. */
    if (!argc && target__none(&rec->opts.target))
        usage_with_options(record_usage, record_options);

    /* cgroup monitoring requires counting on all CPUs. */
    if (nr_cgroups && !rec->opts.target.system_wide) {
        ui__error("cgroup monitoring only available in"
              " system-wide mode\n");
        usage_with_options(record_usage, record_options);
    }

    /* Hand off to the recording engine (fills out "struct record"). */
    err = __cmd_record(&record, argc, argv);
    return err;
}
tools/perf/util/parse-events.c

setup_events // tools/perf/builtin-stat.c
    parse_events // tools/perf/util/parse-events.c
    
parse_events  // tools/perf/util/parse-events.c

int parse_events(struct perf_evlist *evlist, const char *str)
{
    struct parse_events_evlist data = {
        .list = LIST_HEAD_INIT(data.list),
        .idx  = evlist->nr_entries,
    };
    int ret;

    ret = parse_events__scanner(str, &data, PE_START_EVENTS);
    perf_pmu__parse_cleanup();
    if (!ret) {
        int entries = data.idx - evlist->nr_entries;
        perf_evlist__splice_list_tail(evlist, &data.list, entries);
        evlist->nr_groups += data.nr_groups;
        return 0;
    }

    /*
     * There are 2 users - builtin-record and builtin-test objects.
     * Both call perf_evlist__delete in case of error, so we dont
     * need to bother.
     */
    return ret;
}

struct introduction

tools/perf/util/target.h

/*
 * Describes what perf should monitor, as parsed from the command
 * line and validated by target__validate().  pid/tid/uid_str are
 * consumed by thread_map__new_str() and cpu_list by cpu_map__new()
 * when the evlist maps are created (see the call chain below).
 */
struct target {
    const char   *pid;       /* comma-separated pid list (-p) */
    const char   *tid;       /* comma-separated tid list (-t) */
    const char   *cpu_list;  /* cpus to monitor (-C) */
    const char   *uid_str;   /* uid as given on the command line */
    uid_t        uid;        /* resolved numeric uid */
    bool         system_wide;     /* monitor all cpus (-a) */
    bool         uses_mmap;
    bool         default_per_cpu;
    bool         per_thread;
};
===

tools/perf/util/data.h

/*
 * Handle for the perf data file (output of "perf record", input of
 * "perf report").  path defaults to "perf.data" when not specified;
 * fd is filled in by open_file() via perf_data_file__open().
 */
struct perf_data_file {
    const char      *path;       /* file path; default "perf.data" */
    int          fd;             /* open fd, read or write mode */
    bool             is_pipe;    /* data goes through a pipe instead of a file */
    bool             force;
    unsigned long        size;
    enum perf_data_mode  mode;   /* read vs write (perf_data_file__is_read) */
};

=== 

tools/perf/util/session.h

/*
 * One perf session, created by perf_session__new(); ties together
 * the data file, the event list, the machines table and the
 * ordered-event queue (see machines__init()/ordered_events__init()
 * in the call chain above).
 */
struct perf_session {
    struct perf_header  header;        /* file header, filled via perf_header__set_feat */
    struct machines     machines;      /* host/guest machines, machines__init() */
    struct perf_evlist  *evlist;       /* events of this session */
    struct trace_event  tevent;
    bool            repipe;
    bool            one_mmap;
    void            *one_mmap_addr;
    u64         one_mmap_offset;
    struct ordered_events   ordered_events; /* time-ordered event delivery */
    struct perf_data_file   *file;     /* backing perf.data file (&rec->file) */
    struct perf_tool    *tool;
};

===

tools/perf/util/evlist.h 

/*
 * List of all event selectors (perf_evsel) for one run, plus the
 * thread/cpu maps that say where the events are counted and the
 * mmap rings used to read sample data back from the kernel.
 */
struct perf_evlist {
    struct list_head entries;      /* list of perf_evsel nodes */
    struct hlist_head heads[PERF_EVLIST__HLIST_SIZE];
    int      nr_entries;           /* number of evsels on "entries" */
    int      nr_groups;            /* number of event groups (see perf_evlist__set_leader) */
    int      nr_mmaps;
    size_t       mmap_len;
    int      id_pos;
    int      is_pos;
    u64      combined_sample_type;
    struct {                       /* forked workload being measured */
        int cork_fd;
        pid_t   pid;
    } workload;
    bool         overwrite;
    struct fdarray   pollfd;
    struct perf_mmap *mmap;
    struct thread_map *threads; // target threads, filled by thread_map__new_str()
    struct cpu_map    *cpus;   // target cpus, filled by cpu_map__new()/cpu_map__dummy_new()
    struct perf_evsel *selected;
    struct events_stats stats;
};

=== 

/** struct perf_evsel - event selector **/

Each event passed in by the user maps to one perf_evsel struct.

/* NOTE(review): abridged excerpt — the "..." below elides fields of
 * the real definition in tools/perf/util/evsel.h. */
struct perf_evsel {
    struct list_head    node;      /* linkage on perf_evlist->entries */
    struct perf_event_attr  attr;  /* passed to sys_perf_event_open() */
    char            *filter;       /* event filter, see perf_evsel__set_filter */
    struct xyarray      *fd;       /* fds, one per (cpu, thread) — perf_evsel__alloc_fd */
    struct xyarray      *sample_id;
    u64         *id;
    struct perf_counts  *counts;   /* per-cpu + aggregated counts (perf_evsel__alloc_counts) */
    struct perf_counts  *prev_raw_counts; /* previous raw counts, for deltas */
    int         idx;
    u32         ids;
    char            *name;         /* event name as given by the user */
    double          scale;         /* scaling factor applied to raw counts */
    const char      *unit;
    bool            snapshot;
    struct event_format *tp_format;
    ...
    ...
    struct perf_evsel   *leader;   /* group leader (perf_evsel__is_group_leader) */
}

=== 

tools/perf/builtin-record.c

/*
 * Top-level state of one "perf record" invocation; a single static
 * instance ("record") is used, reached via rec = &record in
 * cmd_record().
 */
struct record {
    struct perf_tool    tool;
    struct record_opts  opts;           /* parsed command-line options */
    u64         bytes_written;          /* bytes written to the data file so far */
    struct perf_data_file   file;       /* output perf.data file */
    struct perf_evlist  *evlist;        /* events to record, from perf_evlist__new() */
    struct perf_session *session;       /* session from perf_session__new() */
    const char      *progname;
    int         realtime_prio;
    bool            no_buildid;
    bool            no_buildid_cache;
    long            samples;
};

===
This part is important: "struct perf_stat" contains an array of three "struct stats" entries,
which are initialized as follows:
    for (i = 0; i < 3; i++)
        init_stats(&ps->res_stats[i]);


/* Per-evsel private stats, hung off evsel->priv by
 * perf_evsel__alloc_stat_priv(); the three slots are each set up
 * with init_stats() (see the loop above). */
struct perf_stat {
    struct stats      res_stats[3];
};

tools/perf/util/stat.h

/* Running statistics accumulator, fed by update_stats() (e.g.
 * walltime_nsecs_stats below).  NOTE(review): the mean/M2 pair looks
 * like Welford-style online mean/variance tracking — confirm against
 * tools/perf/util/stat.c. */
struct stats
{
    double n, mean, M2;
    u64 max, min;
};

==== 
tools/perf/util/evsel.h

/*
 * One counter reading, filled by readn() straight from the event fd
 * (see perf_evsel__read_cb in the call chain below).  The union lets
 * the same bytes be viewed as named fields or as values[3].
 * NOTE(review): val/ena/run presumably map to the kernel read format
 * (count value, time enabled, time running) — confirm against
 * perf_event_open(2).
 */
struct perf_counts_values {
    union {
        struct {
            u64 val;
            u64 ena;
            u64 run;
        };
        u64 values[3];
    };
};

/* Counts for one evsel: "aggr" accumulates across all cpus (used in
 * AGGR_GLOBAL mode, see read_counter_aggr), while cpu[] is a
 * flexible array of per-cpu readings. */
struct perf_counts {
    s8            scaled;
    struct perf_counts_values aggr;   /* aggregated over all cpus */
    struct perf_counts_values cpu[];  /* one entry per cpu */
};


perf stat's CALL CHAIN

CALL CHAIN: 
commands // tools/perf/perf.c
    cmd_stat // tools/perf/builtin-stat.c
        parse_events_option // If perf stat -e xxx, specified event name, will check this event name 
            parse_events
                parse_events__scanner // check events 
                    parse_events_lex_init_extra
                    parse_events__scan_string
                    parse_events_parse
                    parse_events__flush_buffer
                    parse_events__delete_buffer
                    parse_events_lex_destroy
                perf_pmu__parse_cleanup:
        perf_evlist__new();
            perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, struct thread_map *threads) // evlist->cpus, evlist->threads
                perf_evlist__set_maps /// 
        parse_options
        parse_options_usage
        add_default_attributes()
        target__validate(&target);
        perf_evlist__create_maps(evsel_list, &target) // fill out evlist->threads(thread_map)
            evlist->threads = thread_map__new_str(target->pid, target->tid,target->uid); // evlist->threads
            evlist->threads(thread_map) = [tid,tid,tid,tid,...]
            target__uses_dummy_map(target) 
                evlist->cpus = cpu_map__dummy_new() // evlist->cpus
                evlist->cpus = cpu_map__new(target->cpu_list)
        perf_evlist__alloc_stats(evsel_list, interval)  // Traverse all evsel
            evlist__for_each(evlist, evsel) {
                perf_evsel__alloc_stat_priv(evsel) // Alloc memory for each evsel->priv = zalloc(sizeof(struct perf_stat));
                    perf_evsel__reset_stat_priv(evsel)
                        init_stats // Fill out "struct perf_stat", perf_stat include 3 elements of "struct stats{}"
                perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) //  Alloc evsel->counts
                alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) // Alloc evsel->prev_raw_counts =  addr;
            }
        perf_stat_init_aggr_mode()
            cpu_map__build_socket_map
                cpu_map__build_map(cpus, sockp, cpu_map__get_socket);
                cpu_map__get_socket
            cpu_map__build_core_map
                cpu_map__build_map(cpus, corep, cpu_map__get_core);
                cpu_map__get_core
                    cpu_map__get_socket
        
        run_perf_stat(argc, argv);
            __run_perf_stat(argc, argv);
                perf_evlist__prepare_workload(evsel_list, &target, argv, false, workload_exec_failed_signal)
                perf_evlist__set_leader(evsel_list); // evlist->nr_groups is set to 1 or 0, depending on whether evlist->nr_entries > 1
                    __perf_evlist__set_leader(&evlist->entries);
                    evlist__for_each(evsel_list, evsel) {  // Traverse all evsel
                        create_perf_stat_counter(evsel)
                            struct perf_event_attr *attr = &evsel->attr;
                            attr->xxx  = xxx
                            perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel)
                            perf_evsel__is_group_leader(evsel)
                            perf_evsel__open_per_thread(evsel, evsel_list->threads)
                                // important: __perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus, struct thread_map *threads)
                                __perf_evsel__open(evsel, &empty_cpu_map.map, threads) 
                                    // perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads), if system_wide: nthreads = 1
                                    perf_evsel__alloc_fd(evsel, cpus->nr, nthreads)
                                        evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
                                    for (cpu = 0; cpu < cpus->nr; cpu++) {
                                         for (thread = 0; thread < nthreads; thread++) { 
                                            group_fd = get_group_fd(evsel, cpu, thread);
                                            sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu], group_fd, flags);
                                         }
                                    }           
                    }
                    perf_evlist__apply_filters(evsel_list, &counter)
                    evlist__for_each(evlist, evsel) {
                        perf_evsel__set_filter(evsel, ncpus, nthreads, evsel->filter);
                    }
                    t0 = rdclock();
                    clock_gettime(CLOCK_MONOTONIC, &ref_time);
                    if (forks) { 
                        perf_evlist__start_workload(evsel_list);
                        handle_initial_delay();
                        if (interval) {
                            print_interval();
                        }
                    } else {
                        handle_initial_delay();
                        print_interval();
                    }
                    t1 = rdclock();

                    update_stats(&walltime_nsecs_stats, t1 - t0);

                    // Start reading the counters for each evsel
                    if (aggr_mode == AGGR_GLOBAL) {
                        evlist__for_each(evsel_list, counter) {
                            // Reads a "struct perf_counts_values", saved in the evsel's &counter->counts->aggr (here the evsel is "counter");
                            // there is also a "struct perf_stat" at counter->priv
                            read_counter_aggr(counter); 
                                aggr->val = aggr->ena = aggr->run = 0; // here, initialize all of perf_counts_values aggr to 0
                                read_counter(counter)  // how is this event read? iterate over every thread and cpu
                                    int nthreads = thread_map__nr(evsel_list->threads);
                                    int ncpus = perf_evsel__nr_cpus(counter);
                                    int cpu, thread;
                                    for (thread = 0; thread < nthreads; thread++) {
                                        for (cpu = 0; cpu < ncpus; cpu++) {
                                            // read as a process x cpu two-dimensional array, into "struct perf_counts_values count"
                                            process_per_cpu(struct perf_evsel *evsel, int cpu, int thread))
                                                perf_evsel__read_cb(evsel, cpu, thread, &count)
                                                    memset(count, 0, sizeof(*count));
                                                    FD(evsel, cpu, thread)
                                                    readn(FD(evsel, cpu, thread), count, sizeof(*count))
                                                        ion(true, fd, buf, n);
                                                            read(fd, buf, left)
                                                            
                                                read_cb(evsel, cpu, thread, tmp);
                                                    switch (aggr_mode) {
                                                        case AGGR_CORE:
                                                        case AGGR_SOCKET:
                                                        case AGGR_NONE:
                                                        perf_evsel__compute_deltas(evsel, cpu, count);
                                                        perf_counts_values__scale(count, scale, NULL);
                                                        update_shadow_stats(evsel, count->values, cpu);
                                                    
                                                    }
                                        }
                                    }   
                            perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), thread_map__nr(evsel_list->threads));
                        }
                    } else {
                        evlist__for_each(evsel_list, counter) {
                            read_counter(counter);
                            perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
                        }
                    }
            
        print_stat 
            print_aggr // AGGR_CORE AGGR_SOCKET
            print_counter_aggr(evsel, NULL); // AGGR_GLOBAL
            print_counter(evsel, NULL) // AGGR_NONE
tools/perf/util/evsel.h

/* Empty placeholder from the original article; the annotated
 * perf_evsel layout appears earlier in the "struct introduction"
 * section. */
struct perf_evsel {

}

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324796418&siteId=291194637