Skip to content

Commit 6bd4bd9

Browse files
committed
*: handle unprivileged operations and !dumpable
Effectively, !dumpable makes implementing rootless containers quite hard, due to a bunch of different operations on /proc/self no longer being possible without reordering everything. !dumpable only really makes sense when you are switching between different security contexts, which is only the case when we are joining namespaces. Unfortunately this means that !dumpable will still have issues in this instance, and it should only be necessary to set !dumpable if we are not joining USER namespaces (new kernels have protections that make !dumpable no longer necessary). But that's a topic for another time. This also includes code to unset and then re-set dumpable when doing the USER namespace mappings. This should also be safe because in principle processes in a container can't see us until after we fork into the PID namespace (which happens after the user mapping). In rootless containers, it is not possible to set a non-dumpable process's /proc/self/oom_score_adj (it's owned by root and thus not writeable). Thus, it needs to be set inside nsexec before we set ourselves as non-dumpable. Signed-off-by: Aleksa Sarai <[email protected]>
1 parent ef9a4b3 commit 6bd4bd9

File tree

5 files changed

+68
-32
lines changed

5 files changed

+68
-32
lines changed

libcontainer/container_linux.go

+6
Original file line numberDiff line numberDiff line change
@@ -1455,5 +1455,11 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
14551455
}
14561456
}
14571457

1458+
// write oom_score_adj
1459+
r.AddData(&Bytemsg{
1460+
Type: OomScoreAdjAttr,
1461+
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
1462+
})
1463+
14581464
return bytes.NewReader(r.Serialize()), nil
14591465
}

libcontainer/init_linux.go

-8
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,8 @@ import (
66
"encoding/json"
77
"fmt"
88
"io"
9-
"io/ioutil"
109
"net"
1110
"os"
12-
"strconv"
1311
"strings"
1412
"syscall"
1513
"unsafe"
@@ -369,12 +367,6 @@ func setupRlimits(limits []configs.Rlimit, pid int) error {
369367
return nil
370368
}
371369

372-
func setOomScoreAdj(oomScoreAdj int, pid int) error {
373-
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
374-
375-
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
376-
}
377-
378370
const _P_PID = 1
379371

380372
type siginfo struct {

libcontainer/message_linux.go

+8-6
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,14 @@ import (
1111
// list of known message types we want to send to bootstrap program
1212
// The number is randomly chosen to not conflict with known netlink types
1313
const (
14-
InitMsg uint16 = 62000
15-
CloneFlagsAttr uint16 = 27281
16-
NsPathsAttr uint16 = 27282
17-
UidmapAttr uint16 = 27283
18-
GidmapAttr uint16 = 27284
19-
SetgroupAttr uint16 = 27285
14+
InitMsg uint16 = 62000
15+
CloneFlagsAttr uint16 = 27281
16+
NsPathsAttr uint16 = 27282
17+
UidmapAttr uint16 = 27283
18+
GidmapAttr uint16 = 27284
19+
SetgroupAttr uint16 = 27285
20+
OomScoreAdjAttr uint16 = 27286
21+
2022
// When syscall.NLA_HDRLEN is in gccgo, take this out.
2123
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
2224
)

libcontainer/nsenter/nsexec.c

+54-10
Original file line numberDiff line numberDiff line change
@@ -72,18 +72,21 @@ struct nlconfig_t {
7272
char *namespaces;
7373
size_t namespaces_len;
7474
uint8_t is_setgroup;
75+
char *oom_score_adj;
76+
size_t oom_score_adj_len;
7577
};
7678

7779
/*
7880
* List of netlink message types sent to us as part of bootstrapping the init.
7981
* These constants are defined in libcontainer/message_linux.go.
8082
*/
81-
#define INIT_MSG 62000
83+
#define INIT_MSG 62000
8284
#define CLONE_FLAGS_ATTR 27281
8385
#define NS_PATHS_ATTR 27282
84-
#define UIDMAP_ATTR 27283
85-
#define GIDMAP_ATTR 27284
86+
#define UIDMAP_ATTR 27283
87+
#define GIDMAP_ATTR 27284
8688
#define SETGROUP_ATTR 27285
89+
#define OOM_SCORE_ADJ_ATTR 27286
8790

8891
/*
8992
* Use the raw syscall for versions of glibc which don't include a function for
@@ -186,7 +189,7 @@ static void update_setgroups(int pid, enum policy_t setgroup)
186189
}
187190
}
188191

189-
static void update_uidmap(int pid, char *map, int map_len)
192+
static void update_uidmap(int pid, char *map, size_t map_len)
190193
{
191194
if (map == NULL || map_len <= 0)
192195
return;
@@ -195,7 +198,7 @@ static void update_uidmap(int pid, char *map, int map_len)
195198
bail("failed to update /proc/%d/uid_map", pid);
196199
}
197200

198-
static void update_gidmap(int pid, char *map, int map_len)
201+
static void update_gidmap(int pid, char *map, size_t map_len)
199202
{
200203
if (map == NULL || map_len <= 0)
201204
return;
@@ -204,6 +207,15 @@ static void update_gidmap(int pid, char *map, int map_len)
204207
bail("failed to update /proc/%d/gid_map", pid);
205208
}
206209

210+
static void update_oom_score_adj(char *data, size_t len)
211+
{
212+
if (data == NULL || len <= 0)
213+
return;
214+
215+
if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
216+
bail("failed to update /proc/self/oom_score_adj");
217+
}
218+
207219
/* A dummy function that just jumps to the given jumpval. */
208220
static int child_func(void *arg) __attribute__ ((noinline));
209221
static int child_func(void *arg)
@@ -317,6 +329,10 @@ static void nl_parse(int fd, struct nlconfig_t *config)
317329
case CLONE_FLAGS_ATTR:
318330
config->cloneflags = readint32(current);
319331
break;
332+
case OOM_SCORE_ADJ_ATTR:
333+
config->oom_score_adj = current;
334+
config->oom_score_adj_len = payload_len;
335+
break;
320336
case NS_PATHS_ATTR:
321337
config->namespaces = current;
322338
config->namespaces_len = payload_len;
@@ -425,14 +441,32 @@ void nsexec(void)
425441
if (pipenum == -1)
426442
return;
427443

428-
/* make the process non-dumpable */
429-
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) != 0) {
430-
bail("failed to set process as non-dumpable");
431-
}
432-
433444
/* Parse all of the netlink configuration. */
434445
nl_parse(pipenum, &config);
435446

447+
/* Set oom_score_adj. This has to be done before !dumpable because
448+
* /proc/self/oom_score_adj is not writeable unless you're an privileged
449+
* user (if !dumpable is set). All children inherit their parent's
450+
* oom_score_adj value on fork(2) so this will always be propagated
451+
* properly.
452+
*/
453+
update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
454+
455+
/*
456+
* Make the process non-dumpable, to avoid various race conditions that
457+
* could cause processes in namespaces we're joining to access host
458+
* resources (or potentially execute code).
459+
*
460+
* However, if the number of namespaces we are joining is 0, we are not
461+
* going to be switching to a different security context. Thus setting
462+
* ourselves to be non-dumpable only breaks things (like rootless
463+
* containers), which is the recommendation from the kernel folks.
464+
*/
465+
if (config.namespaces) {
466+
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
467+
bail("failed to set process as non-dumpable");
468+
}
469+
436470
/* Pipe so we can tell the child when we've finished setting up. */
437471
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
438472
bail("failed to setup sync pipe between parent and child");
@@ -681,6 +715,11 @@ void nsexec(void)
681715
* clone_parent rant). So signal our parent to hook us up.
682716
*/
683717

718+
/* Switching is only necessary if we joined namespaces. */
719+
if (config.namespaces) {
720+
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
721+
bail("failed to set process as dumpable");
722+
}
684723
s = SYNC_USERMAP_PLS;
685724
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
686725
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
@@ -691,6 +730,11 @@ void nsexec(void)
691730
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
692731
if (s != SYNC_USERMAP_ACK)
693732
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
733+
/* Switching is only necessary if we joined namespaces. */
734+
if (config.namespaces) {
735+
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
736+
bail("failed to set process as dumpable");
737+
}
694738
}
695739

696740
/*

libcontainer/process_linux.go

-8
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,6 @@ func (p *setnsProcess) start() (err error) {
8585
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
8686
}
8787
}
88-
// set oom_score_adj
89-
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
90-
return newSystemErrorWithCause(err, "setting oom score")
91-
}
9288
// set rlimits, this has to be done here because we lose permissions
9389
// to raise the limits once we enter a user-namespace
9490
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
@@ -285,10 +281,6 @@ func (p *initProcess) start() error {
285281
if err := p.manager.Set(p.config.Config); err != nil {
286282
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
287283
}
288-
// set oom_score_adj
289-
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
290-
return newSystemErrorWithCause(err, "setting oom score for ready process")
291-
}
292284
// set rlimits, this has to be done here because we lose permissions
293285
// to raise the limits once we enter a user-namespace
294286
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {

0 commit comments

Comments
 (0)