
Commit fa28dcb
liu-song-6 authored and Alexei Starovoitov committed
bpf: Introduce helper bpf_get_task_stack()
Introduce helper bpf_get_task_stack(), which dumps the stack trace of a given task. This differs from bpf_get_stack(), which gets the stack trace of the current task. One potential use case of bpf_get_task_stack() is to call it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.

bpf_get_task_stack() uses stack_trace_save_tsk() instead of get_perf_callchain() for the kernel stack. The benefit of this choice is that stack_trace_save_tsk() doesn't require changes in arch/. The downside is that stack_trace_save_tsk() dumps the stack trace to an unsigned long array, so for 32-bit systems we need to translate it to a u64 array.

Signed-off-by: Song Liu <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Acked-by: Andrii Nakryiko <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
1 parent d141b8b commit fa28dcb
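
To make the iterator use case above concrete, a BPF-side sketch (not part of this commit) might look as follows. It assumes the bpf_iter__task context of task iterators and the usual libbpf pieces (vmlinux.h, bpf_helpers.h, and BPF_SEQ_PRINTF from bpf_tracing.h); the program name, buffer name, and depth are illustrative.

/* Hedged sketch, not from this commit: dump the kernel stack of every
 * task from an iter/task program, similar to /proc/<pid>/stack.
 */
#include "vmlinux.h"			/* kernel types via BTF */
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>		/* assumed source of BPF_SEQ_PRINTF */

char _license[] SEC("license") = "GPL";

#define MAX_STACK_DEPTH 64
__u64 entries[MAX_STACK_DEPTH];		/* one instruction pointer per frame */

SEC("iter/task")
int dump_task_stack(struct bpf_iter__task *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;
	long retlen;

	if (!task)
		return 0;

	/* flags == 0: kernel stack, no frames skipped */
	retlen = bpf_get_task_stack(task, entries, sizeof(entries), 0);
	if (retlen < 0)
		return 0;

	BPF_SEQ_PRINTF(seq, "pid %d: %ld frames\n",
		       task->pid, retlen / (long)sizeof(__u64));
	return 0;
}

Attached through a task iterator link, reading the resulting iterator fd (or a pinned file backed by it) would then produce one line per task, analogous to walking /proc/<pid>/stack.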

File tree

7 files changed: +153 −7 lines changed

Diff for: include/linux/bpf.h (+1)

@@ -1627,6 +1627,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;

Diff for: include/uapi/linux/bpf.h (+36 −1)

@@ -3285,6 +3285,39 @@ union bpf_attr {
 *		Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
 *	Return
 *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
+ *	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *task*, which is a valid
+ *		pointer to struct task_struct. To store the stacktrace, the
+ *		bpf program provides *buf* with a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_task_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *		::
+ *
+ *			# sysctl kernel.perf_event_max_stack=<new value>
+ *	Return
+ *		A non-negative value equal to or less than *size* on success,
+ *		or a negative error in case of failure.
+ *
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3427,7 +3460,9 @@ union bpf_attr {
 	FN(skc_to_tcp_sock),		\
 	FN(skc_to_tcp_timewait_sock),	\
 	FN(skc_to_tcp_request_sock),	\
-	FN(skc_to_udp6_sock),
+	FN(skc_to_udp6_sock),		\
+	FN(get_task_stack),		\
 	/* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
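
As a usage note, the flags layout documented above composes as shown below. This is a fragment, not a complete program; it assumes a task pointer such as the iterator context in the earlier sketch provides, and SKIP_FRAMES and buf are illustrative names.

/* Fragment following the documented flags layout: the low 8 bits
 * (BPF_F_SKIP_FIELD_MASK) hold the number of frames to skip, and the
 * option flags live in the higher bits.
 */
#define SKIP_FRAMES 2

__u64 buf[128];
long ret;

/* user-space stack of @task, skipping the two innermost frames */
ret = bpf_get_task_stack(task, buf, sizeof(buf),
			 (SKIP_FRAMES & BPF_F_SKIP_FIELD_MASK) |
			 BPF_F_USER_STACK);
/* on success, ret is the number of bytes written: ret / 8 frames */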

Diff for: kernel/bpf/stackmap.c (+73 −4)

@@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	}
 }

+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+{
+	struct perf_callchain_entry *entry;
+	int rctx;
+
+	entry = get_callchain_entry(&rctx);
+
+	if (!entry)
+		return NULL;
+
+	entry->nr = init_nr +
+		stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
+				     sysctl_perf_event_max_stack - init_nr, 0);
+
+	/* stack_trace_save_tsk() works on unsigned long array, while
+	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+	 * necessary to fix this mismatch.
+	 */
+	if (__BITS_PER_LONG != 64) {
+		unsigned long *from = (unsigned long *) entry->ip;
+		u64 *to = entry->ip;
+		int i;
+
+		/* copy data from the end to avoid using extra buffer */
+		for (i = entry->nr - 1; i >= (int)init_nr; i--)
+			to[i] = (u64)(from[i]);
+	}
+
+	put_callchain_entry(rctx);
+
+	return entry;
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
@@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };

-BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
-	   u64, flags)
+static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+			    void *buf, u32 size, u64 flags)
 {
 	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
 	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
 	if (unlikely(size % elem_size))
 		goto clear;

+	/* cannot get valid user stack for task without user_mode regs */
+	if (task && user && !user_mode(regs))
+		goto err_fault;
+
 	num_elem = size / elem_size;
 	if (sysctl_perf_event_max_stack < num_elem)
 		init_nr = 0;
 	else
 		init_nr = sysctl_perf_event_max_stack - num_elem;
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
+
+	if (kernel && task)
+		trace = get_callchain_entry_for_task(task, init_nr);
+	else
+		trace = get_perf_callchain(regs, init_nr, kernel, user,
+					   sysctl_perf_event_max_stack,
+					   false, false);
 	if (unlikely(!trace))
 		goto err_fault;

@@ -505,6 +548,12 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
 	return err;
 }

+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	return __bpf_get_stack(regs, NULL, buf, size, flags);
+}
+
 const struct bpf_func_proto bpf_get_stack_proto = {
 	.func		= bpf_get_stack,
 	.gpl_only	= true,
@@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };

+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+	   u32, size, u64, flags)
+{
+	struct pt_regs *regs = task_pt_regs(task);
+
+	return __bpf_get_stack(regs, task, buf, size, flags);
+}
+
+static int bpf_get_task_stack_btf_ids[5];
+const struct bpf_func_proto bpf_get_task_stack_proto = {
+	.func		= bpf_get_task_stack,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+	.btf_id		= bpf_get_task_stack_btf_ids,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
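
One subtlety in get_callchain_entry_for_task() above deserves a note: on a 32-bit kernel, stack_trace_save_tsk() packs unsigned long entries into the front of the u64 array, and copying from the end widens them in place because each 64-bit store only touches slots whose packed 32-bit values have already been read. A standalone user-space sketch of the same trick (illustrative only, not kernel code):

/* Standalone demonstration of the in-place unsigned long -> u64
 * widening used above. N packed 32-bit values occupy the first half
 * of an array of N u64 slots; copying from the end means every 64-bit
 * store lands on bytes whose 32-bit contents are no longer needed.
 */
#include <stdint.h>
#include <stdio.h>

#define N 4

int main(void)
{
	uint64_t slots[N];
	/* pretend a 32-bit stack_trace_save_tsk() packed ips here */
	uint32_t *packed = (uint32_t *)slots;
	int i;

	for (i = 0; i < N; i++)
		packed[i] = 0x1000 + i;	/* fake instruction pointers */

	/* widen in place, from the end, to avoid an extra buffer */
	for (i = N - 1; i >= 0; i--)
		slots[i] = (uint64_t)packed[i];

	for (i = 0; i < N; i++)
		printf("slots[%d] = 0x%llx\n", i,
		       (unsigned long long)slots[i]);
	return 0;
}

A forward copy would clobber packed[1] while writing slots[0]; iterating from the last element is exactly what avoids that without a temporary buffer.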

Diff for: kernel/bpf/verifier.c (+3 −1)

@@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;

-	if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+	if ((func_id == BPF_FUNC_get_stack ||
+	     func_id == BPF_FUNC_get_task_stack) &&
+	    !env->prog->has_callchain_buf) {
 		const char *err_str;

 #ifdef CONFIG_PERF_EVENTS

Diff for: kernel/trace/bpf_trace.c (+2)

@@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_ringbuf_query_proto;
 	case BPF_FUNC_jiffies64:
 		return &bpf_jiffies64_proto;
+	case BPF_FUNC_get_task_stack:
+		return &bpf_get_task_stack_proto;
 	default:
 		return NULL;
 	}

Diff for: scripts/bpf_helpers_doc.py (+2)

@@ -426,6 +426,7 @@ class PrinterHelpers(Printer):
             'struct tcp_timewait_sock',
             'struct tcp_request_sock',
             'struct udp6_sock',
+            'struct task_struct',

             'struct __sk_buff',
             'struct sk_msg_md',
@@ -468,6 +469,7 @@ class PrinterHelpers(Printer):
             'struct tcp_timewait_sock',
             'struct tcp_request_sock',
             'struct udp6_sock',
+            'struct task_struct',
     }
     mapped_types = {
             'u8': '__u8',
Diff for: tools/include/uapi/linux/bpf.h (+36 −1)

@@ -3285,6 +3285,39 @@ union bpf_attr {
 *		Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
 *	Return
 *		*sk* if casting is valid, or NULL otherwise.
+ *
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
+ *	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *task*, which is a valid
+ *		pointer to struct task_struct. To store the stacktrace, the
+ *		bpf program provides *buf* with a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_task_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *		::
+ *
+ *			# sysctl kernel.perf_event_max_stack=<new value>
+ *	Return
+ *		A non-negative value equal to or less than *size* on success,
+ *		or a negative error in case of failure.
+ *
 */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3427,7 +3460,9 @@ union bpf_attr {
 	FN(skc_to_tcp_sock),		\
 	FN(skc_to_tcp_timewait_sock),	\
 	FN(skc_to_tcp_request_sock),	\
-	FN(skc_to_udp6_sock),
+	FN(skc_to_udp6_sock),		\
+	FN(get_task_stack),		\
 	/* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
