Skip to content

Commit dc6c9a3

Browse files
kiryltorvalds
authored and committed
mm: account pmd page tables to the process
Dave noticed that an unprivileged process can allocate a significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by the oom-killer and the memory cgroup. The trick is to allocate a lot of PMD page tables. The Linux kernel doesn't account PMD tables to the process, only PTE tables. The use-case below uses a few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0. #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/mman.h> #include <sys/prctl.h> #define PUD_SIZE (1UL << 30) #define PMD_SIZE (1UL << 21) #define NR_PUD 130000 int main(void) { char *addr = NULL; unsigned long i; prctl(PR_SET_THP_DISABLE); for (i = 0; i < NR_PUD ; i++) { addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); break; } *addr = 'x'; munmap(addr, PMD_SIZE); mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); if (addr == MAP_FAILED) perror("re-mmap"), exit(1); } printf("PID %d consumed %lu KiB in PMD page tables\n", getpid(), i * 4096 >> 10); return pause(); } The patch addresses the issue by accounting PMD tables to the process the same way we account PTE tables. The main places where PMD tables are accounted are __pmd_alloc() and free_pmd_range(). But there are a few corner cases: - HugeTLB can share PMD page tables. The patch handles this by accounting the table to all processes that share it. - x86 PAE pre-allocates a few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the sanity check on exit(2). Accounting only happens on configurations where the PMD page table level is present (PMD is not folded). As with nr_ptes, we use a per-mm counter. The counter value is used to calculate the baseline for the badness score by the oom-killer. Signed-off-by: Kirill A. 
Shutemov <[email protected]> Reported-by: Dave Hansen <[email protected]> Cc: Hugh Dickins <[email protected]> Reviewed-by: Cyrill Gorcunov <[email protected]> Cc: Pavel Emelyanov <[email protected]> Cc: David Rientjes <[email protected]> Tested-by: Sedat Dilek <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 8aa7687 commit dc6c9a3

File tree

11 files changed

+75
-29
lines changed

11 files changed

+75
-29
lines changed

Documentation/sysctl/vm.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -555,12 +555,12 @@ this is causing problems for your system/application.
555555

556556
oom_dump_tasks
557557

558-
Enables a system-wide task dump (excluding kernel threads) to be
559-
produced when the kernel performs an OOM-killing and includes such
560-
information as pid, uid, tgid, vm size, rss, nr_ptes, swapents,
561-
oom_score_adj score, and name. This is helpful to determine why the
562-
OOM killer was invoked, to identify the rogue task that caused it,
563-
and to determine why the OOM killer chose the task it did to kill.
558+
Enables a system-wide task dump (excluding kernel threads) to be produced
559+
when the kernel performs an OOM-killing and includes such information as
560+
pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
561+
score, and name. This is helpful to determine why the OOM killer was
562+
invoked, to identify the rogue task that caused it, and to determine why
563+
the OOM killer chose the task it did to kill.
564564

565565
If this is set to zero, this information is suppressed. On very
566566
large systems with thousands of tasks it may not be feasible to dump

arch/x86/mm/pgtable.c

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,18 +190,19 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
190190

191191
#endif /* CONFIG_X86_PAE */
192192

193-
static void free_pmds(pmd_t *pmds[])
193+
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
194194
{
195195
int i;
196196

197197
for(i = 0; i < PREALLOCATED_PMDS; i++)
198198
if (pmds[i]) {
199199
pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
200200
free_page((unsigned long)pmds[i]);
201+
mm_dec_nr_pmds(mm);
201202
}
202203
}
203204

204-
static int preallocate_pmds(pmd_t *pmds[])
205+
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
205206
{
206207
int i;
207208
bool failed = false;
@@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[])
215216
pmd = NULL;
216217
failed = true;
217218
}
219+
if (pmd)
220+
mm_inc_nr_pmds(mm);
218221
pmds[i] = pmd;
219222
}
220223

221224
if (failed) {
222-
free_pmds(pmds);
225+
free_pmds(mm, pmds);
223226
return -ENOMEM;
224227
}
225228

@@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
246249

247250
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
248251
pmd_free(mm, pmd);
252+
mm_dec_nr_pmds(mm);
249253
}
250254
}
251255
}
@@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
283287

284288
mm->pgd = pgd;
285289

286-
if (preallocate_pmds(pmds) != 0)
290+
if (preallocate_pmds(mm, pmds) != 0)
287291
goto out_free_pgd;
288292

289293
if (paravirt_pgd_alloc(mm) != 0)
@@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
304308
return pgd;
305309

306310
out_free_pmds:
307-
free_pmds(pmds);
311+
free_pmds(mm, pmds);
308312
out_free_pgd:
309313
free_page((unsigned long)pgd);
310314
out:

fs/proc/task_mmu.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
void task_mem(struct seq_file *m, struct mm_struct *mm)
2323
{
24-
unsigned long data, text, lib, swap;
24+
unsigned long data, text, lib, swap, ptes, pmds;
2525
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
2626

2727
/*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
4242
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
4343
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
4444
swap = get_mm_counter(mm, MM_SWAPENTS);
45+
ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
46+
pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
4547
seq_printf(m,
4648
"VmPeak:\t%8lu kB\n"
4749
"VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
5456
"VmExe:\t%8lu kB\n"
5557
"VmLib:\t%8lu kB\n"
5658
"VmPTE:\t%8lu kB\n"
59+
"VmPMD:\t%8lu kB\n"
5760
"VmSwap:\t%8lu kB\n",
5861
hiwater_vm << (PAGE_SHIFT-10),
5962
total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
6366
total_rss << (PAGE_SHIFT-10),
6467
data << (PAGE_SHIFT-10),
6568
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
66-
(PTRS_PER_PTE * sizeof(pte_t) *
67-
atomic_long_read(&mm->nr_ptes)) >> 10,
69+
ptes >> 10,
70+
pmds >> 10,
6871
swap << (PAGE_SHIFT-10));
6972
}
7073

include/linux/mm.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1438,8 +1438,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
14381438
{
14391439
return 0;
14401440
}
1441+
1442+
static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
1443+
{
1444+
return 0;
1445+
}
1446+
1447+
static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
1448+
static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
1449+
14411450
#else
14421451
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
1452+
1453+
static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
1454+
{
1455+
return atomic_long_read(&mm->nr_pmds);
1456+
}
1457+
1458+
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
1459+
{
1460+
atomic_long_inc(&mm->nr_pmds);
1461+
}
1462+
1463+
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
1464+
{
1465+
atomic_long_dec(&mm->nr_pmds);
1466+
}
14431467
#endif
14441468

14451469
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,

include/linux/mm_types.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,8 @@ struct mm_struct {
363363
pgd_t * pgd;
364364
atomic_t mm_users; /* How many users with user space? */
365365
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
366-
atomic_long_t nr_ptes; /* Page table pages */
366+
atomic_long_t nr_ptes; /* PTE page table pages */
367+
atomic_long_t nr_pmds; /* PMD page table pages */
367368
int map_count; /* number of VMAs */
368369

369370
spinlock_t page_table_lock; /* Protects page tables and some counters */

kernel/fork.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
555555
INIT_LIST_HEAD(&mm->mmlist);
556556
mm->core_state = NULL;
557557
atomic_long_set(&mm->nr_ptes, 0);
558+
#ifndef __PAGETABLE_PMD_FOLDED
559+
atomic_long_set(&mm->nr_pmds, 0);
560+
#endif
558561
mm->map_count = 0;
559562
mm->locked_vm = 0;
560563
mm->pinned_vm = 0;

mm/debug.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
173173
"get_unmapped_area %p\n"
174174
#endif
175175
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
176-
"pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
176+
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
177177
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
178178
"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
179179
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
206206
mm->pgd, atomic_read(&mm->mm_users),
207207
atomic_read(&mm->mm_count),
208208
atomic_long_read((atomic_long_t *)&mm->nr_ptes),
209+
mm_nr_pmds((struct mm_struct *)mm),
209210
mm->map_count,
210211
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
211212
mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,

mm/hugetlb.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
35983598
if (saddr) {
35993599
spte = huge_pte_offset(svma->vm_mm, saddr);
36003600
if (spte) {
3601+
mm_inc_nr_pmds(mm);
36013602
get_page(virt_to_page(spte));
36023603
break;
36033604
}
@@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
36093610

36103611
ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
36113612
spin_lock(ptl);
3612-
if (pud_none(*pud))
3613+
if (pud_none(*pud)) {
36133614
pud_populate(mm, pud,
36143615
(pmd_t *)((unsigned long)spte & PAGE_MASK));
3615-
else
3616+
} else {
36163617
put_page(virt_to_page(spte));
3618+
mm_inc_nr_pmds(mm);
3619+
}
36173620
spin_unlock(ptl);
36183621
out:
36193622
pte = (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
36443647

36453648
pud_clear(pud);
36463649
put_page(virt_to_page(ptep));
3650+
mm_dec_nr_pmds(mm);
36473651
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
36483652
return 1;
36493653
}

mm/memory.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
428428
pmd = pmd_offset(pud, start);
429429
pud_clear(pud);
430430
pmd_free_tlb(tlb, pmd, start);
431+
mm_dec_nr_pmds(tlb->mm);
431432
}
432433

433434
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
33223323

33233324
spin_lock(&mm->page_table_lock);
33243325
#ifndef __ARCH_HAS_4LEVEL_HACK
3325-
if (pud_present(*pud)) /* Another has populated it */
3326-
pmd_free(mm, new);
3327-
else
3326+
if (!pud_present(*pud)) {
3327+
mm_inc_nr_pmds(mm);
33283328
pud_populate(mm, pud, new);
3329-
#else
3330-
if (pgd_present(*pud)) /* Another has populated it */
3329+
} else /* Another has populated it */
33313330
pmd_free(mm, new);
3332-
else
3331+
#else
3332+
if (!pgd_present(*pud)) {
3333+
mm_inc_nr_pmds(mm);
33333334
pgd_populate(mm, pud, new);
3335+
} else /* Another has populated it */
3336+
pmd_free(mm, new);
33343337
#endif /* __ARCH_HAS_4LEVEL_HACK */
33353338
spin_unlock(&mm->page_table_lock);
33363339
return 0;

mm/mmap.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm)
28532853
vm_unacct_memory(nr_accounted);
28542854

28552855
WARN_ON(atomic_long_read(&mm->nr_ptes) >
2856-
(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2856+
round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT);
2857+
WARN_ON(mm_nr_pmds(mm) >
2858+
round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT);
28572859
}
28582860

28592861
/* Insert vm structure into process list sorted by address

mm/oom_kill.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
169169
* The baseline for the badness score is the proportion of RAM that each
170170
* task's rss, pagetable and swap space use.
171171
*/
172-
points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
173-
get_mm_counter(p->mm, MM_SWAPENTS);
172+
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
173+
atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
174174
task_unlock(p);
175175

176176
/*
@@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
351351
struct task_struct *p;
352352
struct task_struct *task;
353353

354-
pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
354+
pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
355355
rcu_read_lock();
356356
for_each_process(p) {
357357
if (oom_unkillable_task(p, memcg, nodemask))
@@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
367367
continue;
368368
}
369369

370-
pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
370+
pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
371371
task->pid, from_kuid(&init_user_ns, task_uid(task)),
372372
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
373373
atomic_long_read(&task->mm->nr_ptes),
374+
mm_nr_pmds(task->mm),
374375
get_mm_counter(task->mm, MM_SWAPENTS),
375376
task->signal->oom_score_adj, task->comm);
376377
task_unlock(task);

0 commit comments

Comments
 (0)