Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
[linux] / tools / perf / builtin-trace.c
index dc8a6c4..ebde59e 100644 (file)
 
 #include <traceevent/event-parse.h>
 #include <api/fs/tracing_path.h>
+#include <bpf/bpf.h>
 #include "builtin.h"
 #include "util/cgroup.h"
 #include "util/color.h"
+#include "util/config.h"
 #include "util/debug.h"
 #include "util/env.h"
 #include "util/event.h"
@@ -75,6 +77,7 @@ struct trace {
        struct {
                int             max;
                struct syscall  *table;
+               struct bpf_map  *map;
                struct {
                        struct perf_evsel *sys_enter,
                                          *sys_exit,
@@ -99,6 +102,7 @@ struct trace {
        struct {
                size_t          nr;
                pid_t           *entries;
+               struct bpf_map  *map;
        }                       filter_pids;
        double                  duration_filter;
        double                  runtime_ms;
@@ -108,6 +112,8 @@ struct trace {
        } stats;
        unsigned int            max_stack;
        unsigned int            min_stack;
+       bool                    sort_events;
+       bool                    raw_augmented_syscalls;
        bool                    not_ev_qualifier;
        bool                    live;
        bool                    full_time;
@@ -121,9 +127,19 @@ struct trace {
        bool                    show_tool_stats;
        bool                    trace_syscalls;
        bool                    kernel_syscallchains;
+       s16                     args_alignment;
+       bool                    show_tstamp;
+       bool                    show_duration;
+       bool                    show_zeros;
+       bool                    show_arg_names;
+       bool                    show_string_prefix;
        bool                    force;
        bool                    vfs_getname;
        int                     trace_pgfaults;
+       struct {
+               struct ordered_events   data;
+               u64                     last;
+       } oe;
 };
 
 struct tp_field {
@@ -255,7 +271,8 @@ static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
        struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
 
        if (evsel->priv != NULL) {
-               if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
+               if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
+                   perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
                        goto out_delete;
                return 0;
        }
@@ -344,21 +361,25 @@ out_delete:
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
 
-size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
+size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
 {
        int idx = val - sa->offset;
 
-       if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
-               return scnprintf(bf, size, intfmt, val);
+       if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
+               size_t printed = scnprintf(bf, size, intfmt, val);
+               if (show_prefix)
+                       printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
+               return printed;
+       }
 
-       return scnprintf(bf, size, "%s", sa->entries[idx]);
+       return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
 }
 
 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                                const char *intfmt,
                                                struct syscall_arg *arg)
 {
-       return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
+       return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
 }
 
 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
@@ -369,34 +390,32 @@ static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 
 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
 
-struct strarrays {
-       int             nr_entries;
-       struct strarray **entries;
-};
-
-#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
-       .nr_entries = ARRAY_SIZE(array), \
-       .entries = array, \
-}
-
-size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
-                                       struct syscall_arg *arg)
+size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
 {
-       struct strarrays *sas = arg->parm;
+       size_t printed;
        int i;
 
        for (i = 0; i < sas->nr_entries; ++i) {
                struct strarray *sa = sas->entries[i];
-               int idx = arg->val - sa->offset;
+               int idx = val - sa->offset;
 
                if (idx >= 0 && idx < sa->nr_entries) {
                        if (sa->entries[idx] == NULL)
                                break;
-                       return scnprintf(bf, size, "%s", sa->entries[idx]);
+                       return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
                }
        }
 
-       return scnprintf(bf, size, "%d", arg->val);
+       printed = scnprintf(bf, size, intfmt, val);
+       if (show_prefix)
+               printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
+       return printed;
+}
+
+size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
+                                       struct syscall_arg *arg)
+{
+       return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
 }
 
 #ifndef AT_FDCWD
@@ -407,9 +426,10 @@ static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
                                           struct syscall_arg *arg)
 {
        int fd = arg->val;
+       const char *prefix = "AT_FD";
 
        if (fd == AT_FDCWD)
-               return scnprintf(bf, size, "CWD");
+               return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
 
        return syscall_arg__scnprintf_fd(bf, size, arg);
 }
@@ -426,6 +446,13 @@ size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg
        return scnprintf(bf, size, "%#lx", arg->val);
 }
 
+size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
+{
+       if (arg->val == 0)
+               return scnprintf(bf, size, "NULL");
+       return syscall_arg__scnprintf_hex(bf, size, arg);
+}
+
 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
 {
        return scnprintf(bf, size, "%d", arg->val);
@@ -440,13 +467,13 @@ static const char *bpf_cmd[] = {
        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
        "MAP_GET_NEXT_KEY", "PROG_LOAD",
 };
-static DEFINE_STRARRAY(bpf_cmd);
+static DEFINE_STRARRAY(bpf_cmd, "BPF_");
 
 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
-static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
+static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
 
 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
-static DEFINE_STRARRAY(itimers);
+static DEFINE_STRARRAY(itimers, "ITIMER_");
 
 static const char *keyctl_options[] = {
        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
@@ -455,7 +482,7 @@ static const char *keyctl_options[] = {
        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
 };
-static DEFINE_STRARRAY(keyctl_options);
+static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
 
 static const char *whences[] = { "SET", "CUR", "END",
 #ifdef SEEK_DATA
@@ -465,7 +492,7 @@ static const char *whences[] = { "SET", "CUR", "END",
 "HOLE",
 #endif
 };
-static DEFINE_STRARRAY(whences);
+static DEFINE_STRARRAY(whences, "SEEK_");
 
 static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
@@ -473,7 +500,7 @@ static const char *fcntl_cmds[] = {
        "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
        "GETOWNER_UIDS",
 };
-static DEFINE_STRARRAY(fcntl_cmds);
+static DEFINE_STRARRAY(fcntl_cmds, "F_");
 
 static const char *fcntl_linux_specific_cmds[] = {
        "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
@@ -481,7 +508,7 @@ static const char *fcntl_linux_specific_cmds[] = {
        "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
 };
 
-static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
+static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
 
 static struct strarray *fcntl_cmds_arrays[] = {
        &strarray__fcntl_cmds,
@@ -495,29 +522,31 @@ static const char *rlimit_resources[] = {
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
        "RTTIME",
 };
-static DEFINE_STRARRAY(rlimit_resources);
+static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
 
 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
-static DEFINE_STRARRAY(sighow);
+static DEFINE_STRARRAY(sighow, "SIG_");
 
 static const char *clockid[] = {
        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
 };
-static DEFINE_STRARRAY(clockid);
+static DEFINE_STRARRAY(clockid, "CLOCK_");
 
 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
                                                 struct syscall_arg *arg)
 {
+       bool show_prefix = arg->show_string_prefix;
+       const char *suffix = "_OK";
        size_t printed = 0;
        int mode = arg->val;
 
        if (mode == F_OK) /* 0 */
-               return scnprintf(bf, size, "F");
+               return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
 #define        P_MODE(n) \
        if (mode & n##_OK) { \
-               printed += scnprintf(bf + printed, size - printed, "%s", #n); \
+               printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
                mode &= ~n##_OK; \
        }
 
@@ -542,11 +571,13 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
                                                struct syscall_arg *arg)
 {
+       bool show_prefix = arg->show_string_prefix;
+       const char *prefix = "O_";
        int printed = 0, flags = arg->val;
 
 #define        P_FLAG(n) \
        if (flags & O_##n) { \
-               printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+               printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
                flags &= ~O_##n; \
        }
 
@@ -572,11 +603,13 @@ static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
                                                   struct syscall_arg *arg)
 {
+       bool show_prefix = arg->show_string_prefix;
+       const char *prefix = "GRND_";
        int printed = 0, flags = arg->val;
 
 #define        P_FLAG(n) \
        if (flags & GRND_##n) { \
-               printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+               printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
                flags &= ~GRND_##n; \
        }
 
@@ -631,12 +664,15 @@ static struct syscall_fmt {
 } syscall_fmts[] = {
        { .name     = "access",
          .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
+       { .name     = "arch_prctl",
+         .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
+                  [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
        { .name     = "bind",
          .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, },
        { .name     = "bpf",
          .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
        { .name     = "brk",        .hexret = true,
-         .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
+         .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
        { .name     = "clock_gettime",
          .arg = { [0] = STRARRAY(clk_id, clockid), }, },
        { .name     = "clone",      .errpid = true, .nr_args = 5,
@@ -714,18 +750,14 @@ static struct syscall_fmt {
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
        { .name     = "mknodat",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
-       { .name     = "mlock",
-         .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
-       { .name     = "mlockall",
-         .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
        { .name     = "mmap",       .hexret = true,
 /* The standard mmap maps to old_mmap on s390x */
 #if defined(__s390x__)
        .alias = "old_mmap",
 #endif
-         .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
-                  [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
-                  [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
+         .arg = { [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
+                  [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ },
+                  [5] = { .scnprintf = SCA_HEX,        /* offset */ }, }, },
        { .name     = "mount",
          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
                   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
@@ -736,13 +768,7 @@ static struct syscall_fmt {
        { .name     = "mq_unlink",
          .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
        { .name     = "mremap",     .hexret = true,
-         .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
-                  [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
-                  [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
-       { .name     = "munlock",
-         .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
-       { .name     = "munmap",
-         .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
+         .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
        { .name     = "name_to_handle_at",
          .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
        { .name     = "newfstatat",
@@ -771,7 +797,7 @@ static struct syscall_fmt {
                   [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
        { .name     = "poll", .timeout = true, },
        { .name     = "ppoll", .timeout = true, },
-       { .name     = "prctl", .alias = "arch_prctl",
+       { .name     = "prctl",
          .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
                   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
                   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
@@ -789,7 +815,12 @@ static struct syscall_fmt {
        { .name     = "recvmsg",
          .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
        { .name     = "renameat",
-         .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
+         .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
+                  [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
+       { .name     = "renameat2",
+         .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
+                  [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
+                  [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
        { .name     = "rt_sigaction",
          .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
        { .name     = "rt_sigprocmask",
@@ -882,7 +913,7 @@ static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
  * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
  */
 struct syscall {
-       struct tep_event_format *tp_format;
+       struct tep_event    *tp_format;
        int                 nr_args;
        int                 args_size;
        bool                is_exit;
@@ -893,6 +924,10 @@ struct syscall {
        struct syscall_arg_fmt *arg_fmt;
 };
 
+struct bpf_map_syscall_entry {
+       bool    enabled;
+};
+
 /*
  * We need to have this 'calculated' boolean because in some cases we really
  * don't know what is the duration of a syscall, for instance, when we start
@@ -1124,7 +1159,7 @@ static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, c
 {
        struct augmented_arg *augmented_arg = arg->augmented.args;
 
-       return scnprintf(bf, size, "%.*s", augmented_arg->size, augmented_arg->value);
+       return scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
 }
 
 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
@@ -1193,8 +1228,12 @@ static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread
 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
                                        u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
 {
-       size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
-       printed += fprintf_duration(duration, duration_calculated, fp);
+       size_t printed = 0;
+
+       if (trace->show_tstamp)
+               printed = trace__fprintf_tstamp(trace, tstamp, fp);
+       if (trace->show_duration)
+               printed += fprintf_duration(duration, duration_calculated, fp);
        return printed + trace__fprintf_comm_tid(trace, thread, fp);
 }
 
@@ -1261,7 +1300,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
 
        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
                                            evlist->threads, trace__tool_process, false,
-                                           trace->opts.proc_map_timeout, 1);
+                                           1);
 out:
        if (err)
                symbol__exit();
@@ -1313,8 +1352,8 @@ static int syscall__set_arg_fmts(struct syscall *sc)
                          strcmp(field->name, "path") == 0 ||
                          strcmp(field->name, "pathname") == 0))
                        sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
-               else if (field->flags & TEP_FIELD_IS_POINTER)
-                       sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
+               else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
+                       sc->arg_fmt[idx].scnprintf = SCA_PTR;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_fmt[idx].scnprintf = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
@@ -1547,6 +1586,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                .mask   = 0,
                .trace  = trace,
                .thread = thread,
+               .show_string_prefix = trace->show_string_prefix,
        };
        struct thread_trace *ttrace = thread__priv(thread);
 
@@ -1578,6 +1618,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                         * strarray for it.
                         */
                        if (val == 0 &&
+                           !trace->show_zeros &&
                            !(sc->arg_fmt &&
                              (sc->arg_fmt[arg.idx].show_zero ||
                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
@@ -1585,8 +1626,11 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                              sc->arg_fmt[arg.idx].parm))
                                continue;
 
-                       printed += scnprintf(bf + printed, size - printed,
-                                            "%s%s: ", printed ? ", " : "", field->name);
+                       printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
+
+                       if (trace->show_arg_names)
+                               printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
+
                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
                }
        } else if (IS_ERR(sc->tp_format)) {
@@ -1699,7 +1743,7 @@ static int trace__printf_interrupted_entry(struct trace *trace)
                return 0;
 
        printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
-       printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
+       printed += fprintf(trace->output, ")%-*s ...\n", trace->args_alignment, ttrace->entry_str);
        ttrace->entry_pending = false;
 
        ++trace->nr_events_printed;
@@ -1724,13 +1768,28 @@ static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
        return printed;
 }
 
-static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size)
+static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, bool raw_augmented)
 {
        void *augmented_args = NULL;
+       /*
+        * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
+        * and there we get all 6 syscall args plus the tracepoint common
+        * fields (sizeof(long)) and the syscall_nr (another long). So we check
+        * if that is the case and if so don't look after the sc->args_size,
+        * but always after the full raw_syscalls:sys_enter payload, which is
+        * fixed.
+        *
+        * We'll revisit this later to pass s->args_size to the BPF augmenter
+        * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
+        * copies only what we need for each syscall, like what happens when we
+        * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
+        * traffic to just what is needed for each syscall.
+        */
+       int args_size = raw_augmented ? (8 * (int)sizeof(long)) : sc->args_size;
 
-       *augmented_args_size = sample->raw_size - sc->args_size;
+       *augmented_args_size = sample->raw_size - args_size;
        if (*augmented_args_size > 0)
-               augmented_args = sample->raw_data + sc->args_size;
+               augmented_args = sample->raw_data + args_size;
 
        return augmented_args;
 }
@@ -1741,7 +1800,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 {
        char *msg;
        void *args;
-       size_t printed = 0;
+       int printed = 0;
        struct thread *thread;
        int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
        int augmented_args_size = 0;
@@ -1780,7 +1839,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
         * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
         */
        if (evsel != trace->syscalls.events.sys_enter)
-               augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size);
+               augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls);
        ttrace->entry_time = sample->time;
        msg = ttrace->entry_str;
        printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
@@ -1790,8 +1849,13 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 
        if (sc->is_exit) {
                if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
+                       int alignment = 0;
+
                        trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
-                       fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
+                       printed = fprintf(trace->output, "%s)", ttrace->entry_str);
+                       if (trace->args_alignment > printed)
+                               alignment = trace->args_alignment - printed;
+                       fprintf(trace->output, "%*s= ?\n", alignment, " ");
                }
        } else {
                ttrace->entry_pending = true;
@@ -1833,7 +1897,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evse
                goto out_put;
 
        args = perf_evsel__sc_tp_ptr(evsel, args, sample);
-       augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size);
+       augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls);
        syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
        fprintf(trace->output, "%s", msg);
        err = 0;
@@ -1886,7 +1950,8 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
        u64 duration = 0;
        bool duration_calculated = false;
        struct thread *thread;
-       int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
+       int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
+       int alignment = trace->args_alignment;
        struct syscall *sc = trace__syscall_info(trace, evsel, id);
        struct thread_trace *ttrace;
 
@@ -1934,28 +1999,37 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
        trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
 
        if (ttrace->entry_pending) {
-               fprintf(trace->output, "%-70s", ttrace->entry_str);
+               printed = fprintf(trace->output, "%s", ttrace->entry_str);
        } else {
                fprintf(trace->output, " ... [");
                color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
                fprintf(trace->output, "]: %s()", sc->name);
        }
 
+       printed++; /* the closing ')' */
+
+       if (alignment > printed)
+               alignment -= printed;
+       else
+               alignment = 0;
+
+       fprintf(trace->output, ")%*s= ", alignment, " ");
+
        if (sc->fmt == NULL) {
                if (ret < 0)
                        goto errno_print;
 signed_print:
-               fprintf(trace->output, ") = %ld", ret);
+               fprintf(trace->output, "%ld", ret);
        } else if (ret < 0) {
 errno_print: {
                char bf[STRERR_BUFSIZE];
                const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
                           *e = errno_to_name(evsel, -ret);
 
-               fprintf(trace->output, ") = -1 %s %s", e, emsg);
+               fprintf(trace->output, "-1 %s (%s)", e, emsg);
        }
        } else if (ret == 0 && sc->fmt->timeout)
-               fprintf(trace->output, ") = 0 Timeout");
+               fprintf(trace->output, "0 (Timeout)");
        else if (ttrace->ret_scnprintf) {
                char bf[1024];
                struct syscall_arg arg = {
@@ -1965,14 +2039,14 @@ errno_print: {
                };
                ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
                ttrace->ret_scnprintf = NULL;
-               fprintf(trace->output, ") = %s", bf);
+               fprintf(trace->output, "%s", bf);
        } else if (sc->fmt->hexret)
-               fprintf(trace->output, ") = %#lx", ret);
+               fprintf(trace->output, "%#lx", ret);
        else if (sc->fmt->errpid) {
                struct thread *child = machine__find_thread(trace->host, ret, ret);
 
                if (child != NULL) {
-                       fprintf(trace->output, ") = %ld", ret);
+                       fprintf(trace->output, "%ld", ret);
                        if (child->comm_set)
                                fprintf(trace->output, " (%s)", thread__comm_str(child));
                        thread__put(child);
@@ -2153,7 +2227,7 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
        trace__printf_interrupted_entry(trace);
        trace__fprintf_tstamp(trace, sample->time, trace->output);
 
-       if (trace->trace_syscalls)
+       if (trace->trace_syscalls && trace->show_duration)
                fprintf(trace->output, "(         ): ");
 
        if (thread)
@@ -2524,7 +2598,7 @@ out_delete_sys_enter:
        goto out;
 }
 
-static int trace__set_ev_qualifier_filter(struct trace *trace)
+static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
 {
        int err = -1;
        struct perf_evsel *sys_exit;
@@ -2549,9 +2623,93 @@ out_enomem:
        goto out;
 }
 
+#ifdef HAVE_LIBBPF_SUPPORT
+static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
+{
+       int fd = bpf_map__fd(trace->syscalls.map);
+       struct bpf_map_syscall_entry value = {
+               .enabled = !trace->not_ev_qualifier,
+       };
+       int err = 0;
+       size_t i;
+
+       for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
+               int key = trace->ev_qualifier_ids.entries[i];
+
+               err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
+               if (err)
+                       break;
+       }
+
+       return err;
+}
+
+static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
+{
+       int fd = bpf_map__fd(trace->syscalls.map);
+       struct bpf_map_syscall_entry value = {
+               .enabled = enabled,
+       };
+       int err = 0, key;
+
+       for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
+               err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
+               if (err)
+                       break;
+       }
+
+       return err;
+}
+
+static int trace__init_syscalls_bpf_map(struct trace *trace)
+{
+       bool enabled = true;
+
+       if (trace->ev_qualifier_ids.nr)
+               enabled = trace->not_ev_qualifier;
+
+       return __trace__init_syscalls_bpf_map(trace, enabled);
+}
+#else
+static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
+{
+       return 0;
+}
+
+static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
+{
+       return 0;
+}
+#endif // HAVE_LIBBPF_SUPPORT
+
+static int trace__set_ev_qualifier_filter(struct trace *trace)
+{
+       if (trace->syscalls.map)
+               return trace__set_ev_qualifier_bpf_filter(trace);
+       return trace__set_ev_qualifier_tp_filter(trace);
+}
+
+static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
+                                   size_t npids __maybe_unused, pid_t *pids __maybe_unused)
+{
+       int err = 0;
+#ifdef HAVE_LIBBPF_SUPPORT
+       bool value = true;
+       int map_fd = bpf_map__fd(map);
+       size_t i;
+
+       for (i = 0; i < npids; ++i) {
+               err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
+               if (err)
+                       break;
+       }
+#endif
+       return err;
+}
+
 static int trace__set_filter_loop_pids(struct trace *trace)
 {
-       unsigned int nr = 1;
+       unsigned int nr = 1, err;
        pid_t pids[32] = {
                getpid(),
        };
@@ -2570,7 +2728,92 @@ static int trace__set_filter_loop_pids(struct trace *trace)
                thread = parent;
        }
 
-       return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
+       err = perf_evlist__set_tp_filter_pids(trace->evlist, nr, pids);
+       if (!err && trace->filter_pids.map)
+               err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
+
+       return err;
+}
+
+static int trace__set_filter_pids(struct trace *trace)
+{
+       int err = 0;
+       /*
+        * Better not use !target__has_task() here because we need to cover the
+        * case where no threads were specified in the command line, but a
+        * workload was, and in that case we will fill in the thread_map when
+        * we fork the workload in perf_evlist__prepare_workload.
+        */
+       if (trace->filter_pids.nr > 0) {
+               err = perf_evlist__set_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
+                                                     trace->filter_pids.entries);
+               if (!err && trace->filter_pids.map) {
+                       err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
+                                                      trace->filter_pids.entries);
+               }
+       } else if (thread_map__pid(trace->evlist->threads, 0) == -1) {
+               err = trace__set_filter_loop_pids(trace);
+       }
+
+       return err;
+}
+
+static int __trace__deliver_event(struct trace *trace, union perf_event *event)
+{
+       struct perf_evlist *evlist = trace->evlist;
+       struct perf_sample sample;
+       int err;
+
+       err = perf_evlist__parse_sample(evlist, event, &sample);
+       if (err)
+               fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
+       else
+               trace__handle_event(trace, event, &sample);
+
+       return 0;
+}
+
+static int __trace__flush_events(struct trace *trace)
+{
+       u64 first = ordered_events__first_time(&trace->oe.data);
+       u64 flush = trace->oe.last - NSEC_PER_SEC;
+
+       /* Is there some thing to flush.. */
+       if (first && first < flush)
+               return ordered_events__flush_time(&trace->oe.data, flush);
+
+       return 0;
+}
+
+static int trace__flush_events(struct trace *trace)
+{
+       return !trace->sort_events ? 0 : __trace__flush_events(trace);
+}
+
+static int trace__deliver_event(struct trace *trace, union perf_event *event)
+{
+       int err;
+
+       if (!trace->sort_events)
+               return __trace__deliver_event(trace, event);
+
+       err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
+       if (err && err != -1)
+               return err;
+
+       err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0);
+       if (err)
+               return err;
+
+       return trace__flush_events(trace);
+}
+
+static int ordered_events__deliver_event(struct ordered_events *oe,
+                                        struct ordered_event *event)
+{
+       struct trace *trace = container_of(oe, struct trace, oe.data);
+
+       return __trace__deliver_event(trace, event->event);
 }
 
 static int trace__run(struct trace *trace, int argc, const char **argv)
@@ -2584,11 +2827,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 
        trace->live = true;
 
-       if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
-               goto out_error_raw_syscalls;
+       if (!trace->raw_augmented_syscalls) {
+               if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
+                       goto out_error_raw_syscalls;
 
-       if (trace->trace_syscalls)
-               trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
+               if (trace->trace_syscalls)
+                       trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
+       }
 
        if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
                pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
@@ -2679,27 +2924,22 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
                goto out_error_open;
        }
 
-       /*
-        * Better not use !target__has_task() here because we need to cover the
-        * case where no threads were specified in the command line, but a
-        * workload was, and in that case we will fill in the thread_map when
-        * we fork the workload in perf_evlist__prepare_workload.
-        */
-       if (trace->filter_pids.nr > 0)
-               err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
-       else if (thread_map__pid(evlist->threads, 0) == -1)
-               err = trace__set_filter_loop_pids(trace);
-
+       err = trace__set_filter_pids(trace);
        if (err < 0)
                goto out_error_mem;
 
+       if (trace->syscalls.map)
+               trace__init_syscalls_bpf_map(trace);
+
        if (trace->ev_qualifier_ids.nr > 0) {
                err = trace__set_ev_qualifier_filter(trace);
                if (err < 0)
                        goto out_errno;
 
-               pr_debug("event qualifier tracepoint filter: %s\n",
-                        trace->syscalls.events.sys_exit->filter);
+               if (trace->syscalls.events.sys_exit) {
+                       pr_debug("event qualifier tracepoint filter: %s\n",
+                                trace->syscalls.events.sys_exit->filter);
+               }
        }
 
        err = perf_evlist__apply_filters(evlist, &evsel);
@@ -2729,7 +2969,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
         * Now that we already used evsel->attr to ask the kernel to setup the
         * events, lets reuse evsel->attr.sample_max_stack as the limit in
         * trace__resolve_callchain(), allowing per-event max-stack settings
-        * to override an explicitely set --max-stack global setting.
+        * to override an explicitly set --max-stack global setting.
         */
        evlist__for_each_entry(evlist, evsel) {
                if (evsel__has_callchain(evsel) &&
@@ -2748,18 +2988,12 @@ again:
                        continue;
 
                while ((event = perf_mmap__read_event(md)) != NULL) {
-                       struct perf_sample sample;
-
                        ++trace->nr_events;
 
-                       err = perf_evlist__parse_sample(evlist, event, &sample);
-                       if (err) {
-                               fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
-                               goto next_event;
-                       }
+                       err = trace__deliver_event(trace, event);
+                       if (err)
+                               goto out_disable;
 
-                       trace__handle_event(trace, event, &sample);
-next_event:
                        perf_mmap__consume(md);
 
                        if (interrupted)
@@ -2781,6 +3015,9 @@ next_event:
                                draining = true;
 
                        goto again;
+               } else {
+                       if (trace__flush_events(trace))
+                               goto out_disable;
                }
        } else {
                goto again;
@@ -2791,6 +3028,9 @@ out_disable:
 
        perf_evlist__disable(evlist);
 
+       if (trace->sort_events)
+               ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
+
        if (!err) {
                if (trace->summary)
                        trace__fprintf_thread_summary(trace, trace->output);
@@ -3088,8 +3328,8 @@ static int trace__set_duration(const struct option *opt, const char *str,
        return 0;
 }
 
-static int trace__set_filter_pids(const struct option *opt, const char *str,
-                                 int unset __maybe_unused)
+static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
+                                             int unset __maybe_unused)
 {
        int ret = -1;
        size_t i;
@@ -3299,6 +3539,68 @@ static int trace__parse_cgroups(const struct option *opt, const char *str, int u
        return 0;
 }
 
+static struct bpf_map *bpf__find_map_by_name(const char *name)
+{
+       struct bpf_object *obj, *tmp;
+
+       bpf_object__for_each_safe(obj, tmp) {
+               struct bpf_map *map = bpf_object__find_map_by_name(obj, name);
+               if (map)
+                       return map;
+
+       }
+
+       return NULL;
+}
+
+static void trace__set_bpf_map_filtered_pids(struct trace *trace)
+{
+       trace->filter_pids.map = bpf__find_map_by_name("pids_filtered");
+}
+
+static void trace__set_bpf_map_syscalls(struct trace *trace)
+{
+       trace->syscalls.map = bpf__find_map_by_name("syscalls");
+}
+
+static int trace__config(const char *var, const char *value, void *arg)
+{
+       struct trace *trace = arg;
+       int err = 0;
+
+       if (!strcmp(var, "trace.add_events")) {
+               struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
+                                              "event selector. use 'perf list' to list available events",
+                                              parse_events_option);
+               err = parse_events_option(&o, value, 0);
+       } else if (!strcmp(var, "trace.show_timestamp")) {
+               trace->show_tstamp = perf_config_bool(var, value);
+       } else if (!strcmp(var, "trace.show_duration")) {
+               trace->show_duration = perf_config_bool(var, value);
+       } else if (!strcmp(var, "trace.show_arg_names")) {
+               trace->show_arg_names = perf_config_bool(var, value);
+               if (!trace->show_arg_names)
+                       trace->show_zeros = true;
+       } else if (!strcmp(var, "trace.show_zeros")) {
+               bool new_show_zeros = perf_config_bool(var, value);
+               if (!trace->show_arg_names && !new_show_zeros) {
+                       pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
+                       goto out;
+               }
+               trace->show_zeros = new_show_zeros;
+       } else if (!strcmp(var, "trace.show_prefix")) {
+               trace->show_string_prefix = perf_config_bool(var, value);
+       } else if (!strcmp(var, "trace.no_inherit")) {
+               trace->opts.no_inherit = perf_config_bool(var, value);
+       } else if (!strcmp(var, "trace.args_alignment")) {
+               int args_alignment = 0;
+               if (perf_config_int(&args_alignment, var, value) == 0)
+                       trace->args_alignment = args_alignment;
+       }
+out:
+       return err;
+}
+
 int cmd_trace(int argc, const char **argv)
 {
        const char *trace_usage[] = {
@@ -3321,10 +3623,13 @@ int cmd_trace(int argc, const char **argv)
                        .user_interval = ULLONG_MAX,
                        .no_buffering  = true,
                        .mmap_pages    = UINT_MAX,
-                       .proc_map_timeout  = 500,
                },
                .output = stderr,
                .show_comm = true,
+               .show_tstamp = true,
+               .show_duration = true,
+               .show_arg_names = true,
+               .args_alignment = 70,
                .trace_syscalls = false,
                .kernel_syscallchains = false,
                .max_stack = UINT_MAX,
@@ -3347,7 +3652,7 @@ int cmd_trace(int argc, const char **argv)
        OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
                    "trace events on existing thread id"),
        OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
-                    "pids to filter (by the kernel)", trace__set_filter_pids),
+                    "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
        OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
                    "system-wide collection from all CPUs"),
        OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
@@ -3390,9 +3695,11 @@ int cmd_trace(int argc, const char **argv)
                     "Set the maximum stack depth when parsing the callchain, "
                     "anything beyond the specified depth will be ignored. "
                     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
+       OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
+                       "Sort batch of events before processing, use if getting out of order events"),
        OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
                        "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
-       OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
+       OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
                        "per thread proc mmap processing timeout in ms"),
        OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
                     trace__parse_cgroups),
@@ -3420,6 +3727,10 @@ int cmd_trace(int argc, const char **argv)
                goto out;
        }
 
+       err = perf_config(trace__config, &trace);
+       if (err)
+               goto out;
+
        argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
                                 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
 
@@ -3435,8 +3746,11 @@ int cmd_trace(int argc, const char **argv)
                goto out;
        }
 
-       if (evsel)
+       if (evsel) {
                trace.syscalls.events.augmented = evsel;
+               trace__set_bpf_map_filtered_pids(&trace);
+               trace__set_bpf_map_syscalls(&trace);
+       }
 
        err = bpf__setup_stdout(trace.evlist);
        if (err) {
@@ -3481,6 +3795,11 @@ int cmd_trace(int argc, const char **argv)
                }
        }
 
+       if (trace.sort_events) {
+               ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
+               ordered_events__set_copy_on_queue(&trace.oe.data, true);
+       }
+
        /*
         * If we are augmenting syscalls, then combine what we put in the
         * __augmented_syscalls__ BPF map with what is in the
@@ -3501,7 +3820,15 @@ int cmd_trace(int argc, const char **argv)
                evsel->handler = trace__sys_enter;
 
                evlist__for_each_entry(trace.evlist, evsel) {
+                       bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
+
+                       if (raw_syscalls_sys_exit) {
+                               trace.raw_augmented_syscalls = true;
+                               goto init_augmented_syscall_tp;
+                       }
+
                        if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
+init_augmented_syscall_tp:
                                perf_evsel__init_augmented_syscall_tp(evsel);
                                perf_evsel__init_augmented_syscall_tp_ret(evsel);
                                evsel->handler = trace__sys_exit;