Skip to content

Commit a7f638f

Browse files
rientjes authored and torvalds committed
mm, oom: normalize oom scores to oom_score_adj scale only for userspace
The oom_score_adj scale ranges from -1000 to 1000 and represents the proportion of memory available to the process at allocation time. This means an oom_score_adj value of 300, for example, will bias a process as though it were using an extra 30.0% of available memory and a value of -350 will discount 35.0% of available memory from its usage.

The oom killer badness heuristic also uses this scale to report the oom score for each eligible process in determining the "best" process to kill. Thus, it can only differentiate each process's memory usage by 0.1% of system RAM.

On large systems, this can end up being a large amount of memory: 256MB on 256GB systems, for example.

This can be fixed by having the badness heuristic use the actual memory usage in scoring threads and then normalizing it to the oom_score_adj scale for userspace. This results in better comparison between eligible threads for kill and no change from the userspace perspective.

Suggested-by: KOSAKI Motohiro <[email protected]>
Tested-by: Dave Jones <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent fe35004 commit a7f638f

File tree

3 files changed

+22
-32
lines changed

3 files changed

+22
-32
lines changed

fs/proc/base.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -411,12 +411,13 @@ static const struct file_operations proc_lstats_operations = {
411411

412412
static int proc_oom_score(struct task_struct *task, char *buffer)
413413
{
414+
unsigned long totalpages = totalram_pages + total_swap_pages;
414415
unsigned long points = 0;
415416

416417
read_lock(&tasklist_lock);
417418
if (pid_alive(task))
418-
points = oom_badness(task, NULL, NULL,
419-
totalram_pages + total_swap_pages);
419+
points = oom_badness(task, NULL, NULL, totalpages) *
420+
1000 / totalpages;
420421
read_unlock(&tasklist_lock);
421422
return sprintf(buffer, "%lu\n", points);
422423
}

include/linux/oom.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,9 @@ enum oom_constraint {
4343
extern void compare_swap_oom_score_adj(int old_val, int new_val);
4444
extern int test_set_oom_score_adj(int new_val);
4545

46-
extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
47-
const nodemask_t *nodemask, unsigned long totalpages);
46+
extern unsigned long oom_badness(struct task_struct *p,
47+
struct mem_cgroup *memcg, const nodemask_t *nodemask,
48+
unsigned long totalpages);
4849
extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
4950
extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
5051

mm/oom_kill.c

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p,
180180
* predictable as possible. The goal is to return the highest value for the
181181
* task consuming the most memory to avoid subsequent oom failures.
182182
*/
183-
unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184-
const nodemask_t *nodemask, unsigned long totalpages)
183+
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184+
const nodemask_t *nodemask, unsigned long totalpages)
185185
{
186-
long points;
186+
unsigned long points;
187187

188188
if (oom_unkillable_task(p, memcg, nodemask))
189189
return 0;
@@ -197,46 +197,33 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
197197
return 0;
198198
}
199199

200-
/*
201-
* The memory controller may have a limit of 0 bytes, so avoid a divide
202-
* by zero, if necessary.
203-
*/
204-
if (!totalpages)
205-
totalpages = 1;
206-
207200
/*
208201
* The baseline for the badness score is the proportion of RAM that each
209202
* task's rss, pagetable and swap space use.
210203
*/
211-
points = get_mm_rss(p->mm) + p->mm->nr_ptes;
212-
points += get_mm_counter(p->mm, MM_SWAPENTS);
213-
214-
points *= 1000;
215-
points /= totalpages;
204+
points = get_mm_rss(p->mm) + p->mm->nr_ptes +
205+
get_mm_counter(p->mm, MM_SWAPENTS);
216206
task_unlock(p);
217207

218208
/*
219209
* Root processes get 3% bonus, just like the __vm_enough_memory()
220210
* implementation used by LSMs.
221211
*/
222212
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
223-
points -= 30;
213+
points -= 30 * totalpages / 1000;
224214

225215
/*
226216
* /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
227217
* either completely disable oom killing or always prefer a certain
228218
* task.
229219
*/
230-
points += p->signal->oom_score_adj;
220+
points += p->signal->oom_score_adj * totalpages / 1000;
231221

232222
/*
233-
* Never return 0 for an eligible task that may be killed since it's
234-
* possible that no single user task uses more than 0.1% of memory and
235-
* no single admin tasks uses more than 3.0%.
223+
* Never return 0 for an eligible task regardless of the root bonus and
224+
* oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
236225
*/
237-
if (points <= 0)
238-
return 1;
239-
return (points < 1000) ? points : 1000;
226+
return points ? points : 1;
240227
}
241228

242229
/*
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
314301
{
315302
struct task_struct *g, *p;
316303
struct task_struct *chosen = NULL;
317-
*ppoints = 0;
304+
unsigned long chosen_points = 0;
318305

319306
do_each_thread(g, p) {
320307
unsigned int points;
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
354341
*/
355342
if (p == current) {
356343
chosen = p;
357-
*ppoints = 1000;
344+
chosen_points = ULONG_MAX;
358345
} else if (!force_kill) {
359346
/*
360347
* If this task is not being ptraced on exit,
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
367354
}
368355

369356
points = oom_badness(p, memcg, nodemask, totalpages);
370-
if (points > *ppoints) {
357+
if (points > chosen_points) {
371358
chosen = p;
372-
*ppoints = points;
359+
chosen_points = points;
373360
}
374361
} while_each_thread(g, p);
375362

363+
*ppoints = chosen_points * 1000 / totalpages;
376364
return chosen;
377365
}
378366

@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
572560
}
573561

574562
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
575-
limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
563+
limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
576564
read_lock(&tasklist_lock);
577565
p = select_bad_process(&points, limit, memcg, NULL, false);
578566
if (p && PTR_ERR(p) != -1UL)

0 commit comments

Comments
 (0)