Skip to content

Commit 6f7092f

Browse files
committed
implement pidfd_open
1 parent 4dbba85 commit 6f7092f

File tree

12 files changed

+572
-2
lines changed

12 files changed

+572
-2
lines changed

Diff for: pkg/abi/linux/wait.go

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ const (
3737
P_ALL = 0x0
3838
P_PID = 0x1
3939
P_PGID = 0x2
40+
P_PIDFD = 0x3
4041
)
4142

4243
// WaitStatus represents a thread status, as returned by the wait* family of

Diff for: pkg/sentry/fsimpl/pidfd/BUILD

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
load("//tools:defs.bzl", "go_library")

package(default_applicable_licenses = ["//:license"])

licenses(["notice"])

# Library implementing the pidfd file description.
#
# deps lists only the packages that pidfd.go imports directly.
go_library(
    name = "pidfd",
    srcs = ["pidfd.go"],
    visibility = ["//pkg/sentry:internal"],
    deps = [
        "//pkg/abi/linux",
        "//pkg/context",
        "//pkg/sentry/kernel",
        "//pkg/sentry/vfs",
        "//pkg/waiter",
    ],
)

Diff for: pkg/sentry/fsimpl/pidfd/pidfd.go

+116
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// Copyright 2024 The gVisor Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Package pidfd implements process fds.
16+
package pidfd
17+
18+
import (
19+
"gvisor.dev/gvisor/pkg/abi/linux"
20+
"gvisor.dev/gvisor/pkg/context"
21+
"gvisor.dev/gvisor/pkg/sentry/kernel"
22+
"gvisor.dev/gvisor/pkg/sentry/vfs"
23+
"gvisor.dev/gvisor/pkg/waiter"
24+
)
25+
26+
// ProcessFileDescription implements vfs.FileDescriptionImpl for pidfds.
27+
//
28+
// +stateify savable
29+
type ProcessFileDescription struct {
30+
vfsfd vfs.FileDescription
31+
vfs.FileDescriptionDefaultImpl
32+
vfs.DentryMetadataFileDescriptionImpl
33+
vfs.NoLockFD
34+
35+
tid kernel.ThreadID
36+
k *kernel.Kernel
37+
pidns *kernel.PIDNamespace
38+
nonblock bool
39+
}
40+
41+
// New creates a new process fd.
42+
func New(task *kernel.Task, flags uint32) (*vfs.FileDescription, error) {
43+
fd := &ProcessFileDescription{
44+
tid: task.ThreadID(),
45+
k: task.Kernel(),
46+
pidns: task.PIDNamespace(),
47+
nonblock: flags&linux.O_NONBLOCK != 0,
48+
}
49+
50+
fileFlags := uint32(linux.O_RDWR)
51+
if flags&linux.O_NONBLOCK != 0 {
52+
fileFlags |= linux.O_NONBLOCK
53+
}
54+
55+
vd := task.Kernel().VFS().NewAnonVirtualDentry("[pidfd]")
56+
defer vd.DecRef(task)
57+
58+
if err := fd.vfsfd.Init(fd, fileFlags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
59+
UseDentryMetadata: true,
60+
}); err != nil {
61+
return nil, err
62+
}
63+
64+
return &fd.vfsfd, nil
65+
}
66+
67+
// PID returns the process ID associated with this pidfd.
68+
func (pfd *ProcessFileDescription) PID() kernel.ThreadID {
69+
return pfd.tid
70+
}
71+
72+
// Nonblocking returns whether this pidfd is nonblocking.
73+
func (pfd *ProcessFileDescription) Nonblocking() bool {
74+
return pfd.nonblock
75+
}
76+
77+
// TaskExited returns whether the task associated with this pidfd has exited.
78+
func (pfd *ProcessFileDescription) TaskExited() bool {
79+
if task := pfd.pidns.TaskWithID(pfd.tid); task != nil {
80+
return task.ExitState() != kernel.TaskExitNone
81+
}
82+
return true
83+
}
84+
85+
// Release implements vfs.FileDescriptionImpl.Release.
86+
func (pfd *ProcessFileDescription) Release(context.Context) {
87+
}
88+
89+
// Readiness implements waiter.Waitable.Readiness.
90+
func (pfd *ProcessFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
91+
ready := waiter.EventMask(0)
92+
if pfd.TaskExited() {
93+
ready |= waiter.ReadableEvents
94+
}
95+
return mask & ready
96+
}
97+
98+
// EventRegister implements waiter.Waitable.EventRegister.
99+
func (pfd *ProcessFileDescription) EventRegister(e *waiter.Entry) error {
100+
if task := pfd.pidns.TaskWithID(pfd.tid); task != nil {
101+
task.PidfdEventRegister(e)
102+
}
103+
return nil
104+
}
105+
106+
// EventUnregister implements waiter.Waitable.EventUnregister.
107+
func (pfd *ProcessFileDescription) EventUnregister(e *waiter.Entry) {
108+
if task := pfd.pidns.TaskWithID(pfd.tid); task != nil {
109+
task.PidfdEventUnregister(e)
110+
}
111+
}
112+
113+
// Epollable implements FileDescriptionImpl.Epollable.
114+
func (pfd *ProcessFileDescription) Epollable() bool {
115+
return true
116+
}

Diff for: pkg/sentry/kernel/task.go

+13
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,9 @@ type Task struct {
186186
// The task only broadcast a notification on signal delivery.
187187
signalQueue waiter.Queue
188188

189+
// pidfdQueue is a set of registered waiters for pidfd-related events.
190+
pidfdQueue waiter.Queue
191+
189192
// If groupStopPending is true, the task should participate in a group
190193
// stop in the interrupt path.
191194
//
@@ -877,3 +880,13 @@ func (t *Task) ResetKcov() {
877880
t.kcov = nil
878881
}
879882
}
883+
884+
// PidfdEventRegister registers a waiter entry for pidfd events.
885+
func (t *Task) PidfdEventRegister(e *waiter.Entry) {
886+
t.pidfdQueue.EventRegister(e)
887+
}
888+
889+
// PidfdEventUnregister unregisters a waiter entry for pidfd events.
890+
func (t *Task) PidfdEventUnregister(e *waiter.Entry) {
891+
t.pidfdQueue.EventUnregister(e)
892+
}

Diff for: pkg/sentry/kernel/task_exit.go

+3
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,9 @@ func (*runExitNotify) execute(t *Task) taskRunState {
602602
defer t.tg.pidns.owner.mu.Unlock()
603603
t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
604604
t.tg.liveTasks--
605+
606+
t.pidfdQueue.Notify(waiter.ReadableEvents)
607+
605608
// Check if this completes a sibling's execve.
606609
if t.tg.execing != nil && t.tg.liveTasks == 1 {
607610
// execing blocks the addition of new tasks to the thread group, so

Diff for: pkg/sentry/syscalls/linux/BUILD

+2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ go_library(
3333
"sys_mount.go",
3434
"sys_mq.go",
3535
"sys_msgqueue.go",
36+
"sys_pidfd.go",
3637
"sys_pipe.go",
3738
"sys_poll.go",
3839
"sys_prctl.go",
@@ -89,6 +90,7 @@ go_library(
8990
"//pkg/sentry/fsimpl/host",
9091
"//pkg/sentry/fsimpl/iouringfs",
9192
"//pkg/sentry/fsimpl/lock",
93+
"//pkg/sentry/fsimpl/pidfd",
9294
"//pkg/sentry/fsimpl/pipefs",
9395
"//pkg/sentry/fsimpl/signalfd",
9496
"//pkg/sentry/fsimpl/timerfd",

Diff for: pkg/sentry/syscalls/linux/linux64.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ var AMD64 = &kernel.SyscallTable{
402402
431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil),
403403
432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil),
404404
433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil),
405-
434: syscalls.ErrorWithEvent("pidfd_open", linuxerr.ENOSYS, "", nil),
405+
434: syscalls.Supported("pidfd_open", PidfdOpen),
406406
435: syscalls.PartiallySupported("clone3", Clone3, "Options CLONE_PIDFD, CLONE_NEWCGROUP, CLONE_INTO_CGROUP, CLONE_NEWTIME, CLONE_CLEAR_SIGHAND, CLONE_PARENT, CLONE_SYSVSEM and, SetTid are not supported.", nil),
407407
436: syscalls.Supported("close_range", CloseRange),
408408
439: syscalls.Supported("faccessat2", Faccessat2),
@@ -723,7 +723,7 @@ var ARM64 = &kernel.SyscallTable{
723723
431: syscalls.ErrorWithEvent("fsconfig", linuxerr.ENOSYS, "", nil),
724724
432: syscalls.ErrorWithEvent("fsmount", linuxerr.ENOSYS, "", nil),
725725
433: syscalls.ErrorWithEvent("fspick", linuxerr.ENOSYS, "", nil),
726-
434: syscalls.ErrorWithEvent("pidfd_open", linuxerr.ENOSYS, "", nil),
726+
434: syscalls.Supported("pidfd_open", PidfdOpen),
727727
435: syscalls.PartiallySupported("clone3", Clone3, "Options CLONE_PIDFD, CLONE_NEWCGROUP, CLONE_INTO_CGROUP, CLONE_NEWTIME, CLONE_CLEAR_SIGHAND, CLONE_PARENT, CLONE_SYSVSEM and clone_args.set_tid are not supported.", nil),
728728
436: syscalls.Supported("close_range", CloseRange),
729729
439: syscalls.Supported("faccessat2", Faccessat2),

Diff for: pkg/sentry/syscalls/linux/sys_pidfd.go

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// Copyright 2024 The gVisor Authors.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package linux
16+
17+
import (
18+
"gvisor.dev/gvisor/pkg/abi/linux"
19+
"gvisor.dev/gvisor/pkg/errors/linuxerr"
20+
"gvisor.dev/gvisor/pkg/sentry/arch"
21+
"gvisor.dev/gvisor/pkg/sentry/kernel"
22+
"gvisor.dev/gvisor/pkg/sentry/fsimpl/pidfd"
23+
)
24+
25+
// PidfdOpen implements Linux syscall pidfd_open(2).
26+
func PidfdOpen(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
27+
pid := kernel.ThreadID(args[0].Int())
28+
flags := args[1].Int()
29+
30+
fd, err := pidfd_open(t, pid, flags)
31+
if err != nil {
32+
return 0, nil, err
33+
}
34+
35+
return uintptr(fd), nil, nil
36+
}
37+
38+
func pidfd_open(t *kernel.Task, pid kernel.ThreadID, flags int32) (int32, error) {
39+
if flags & ^linux.O_NONBLOCK != 0 {
40+
return 0, linuxerr.EINVAL
41+
}
42+
43+
if pid <= 0 {
44+
return 0, linuxerr.EINVAL
45+
}
46+
47+
targetTask := t.PIDNamespace().TaskWithID(pid)
48+
if targetTask == nil {
49+
return 0, linuxerr.ESRCH
50+
}
51+
52+
file, err := pidfd.New(targetTask, uint32(flags))
53+
if err != nil {
54+
return 0, err
55+
}
56+
defer file.DecRef(t)
57+
58+
fd, err := t.NewFDFrom(0, file, kernel.FDFlags{
59+
CloseOnExec: true,
60+
})
61+
if err != nil {
62+
return 0, err
63+
}
64+
65+
return fd, nil
66+
}

Diff for: pkg/sentry/syscalls/linux/sys_thread.go

+18
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"gvisor.dev/gvisor/pkg/hostarch"
2222
"gvisor.dev/gvisor/pkg/marshal/primitive"
2323
"gvisor.dev/gvisor/pkg/sentry/arch"
24+
"gvisor.dev/gvisor/pkg/sentry/fsimpl/pidfd"
2425
"gvisor.dev/gvisor/pkg/sentry/kernel"
2526
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
2627
"gvisor.dev/gvisor/pkg/sentry/loader"
@@ -381,6 +382,23 @@ func Waitid(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr,
381382
wopts.SpecificTID = kernel.ThreadID(id)
382383
case linux.P_PGID:
383384
wopts.SpecificPGID = kernel.ProcessGroupID(id)
385+
case linux.P_PIDFD:
386+
file := t.GetFile(int32(id))
387+
if file == nil {
388+
return 0, nil, linuxerr.EINVAL
389+
}
390+
defer file.DecRef(t)
391+
392+
pfd, ok := file.Impl().(*pidfd.ProcessFileDescription)
393+
if !ok {
394+
return 0, nil, linuxerr.EINVAL
395+
}
396+
397+
if pfd.Nonblocking() && !pfd.TaskExited() {
398+
return 0, nil, linuxerr.EAGAIN
399+
}
400+
401+
wopts.SpecificTID = pfd.PID()
384402
default:
385403
return 0, nil, linuxerr.EINVAL
386404
}

Diff for: test/syscalls/BUILD

+4
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,10 @@ syscall_test(
479479
test = "//test/syscalls/linux:pause_test",
480480
)
481481

482+
# Registers the pidfd syscall test; only the test target is set, so every
# other syscall_test attribute keeps its default.
syscall_test(
    test = "//test/syscalls/linux:pidfd_test",
)
485+
482486
syscall_test(
483487
size = "medium",
484488
add_hostinet = True,

Diff for: test/syscalls/linux/BUILD

+14
Original file line numberDiff line numberDiff line change
@@ -1674,6 +1674,20 @@ cc_binary(
16741674
],
16751675
)
16761676

1677+
# Builds the pidfd syscall test binary from pidfd.cc.
cc_binary(
    name = "pidfd_test",
    testonly = 1,
    srcs = ["pidfd.cc"],
    linkstatic = 1,
    malloc = "//test/util:errno_safe_allocator",
    deps = select_gtest() + [
        "//test/util:test_main",
        "//test/util:test_util",
        "//test/util:thread_util",
        "//test/util:time_util",
    ],
)
1690+
16771691
cc_binary(
16781692
name = "ping_socket_test",
16791693
testonly = 1,

0 commit comments

Comments
 (0)