Commit 5fbda3e

sched: highmem: Store local kmaps in task struct
Instead of storing the map per CPU provide and use per task storage. That
prepares for local kmaps which are preemptible.

The context switch code is preparatory and not yet in use because
kmap_atomic() runs with preemption disabled. Will be made usable in the
next step.

The context switch logic is safe even when an interrupt happens after
clearing or before restoring the kmaps. The kmap index in task struct is
not modified so any nesting kmap in an interrupt will use unused indices
and on return the counter is the same as before.

Also add an assert into the return to user space code. Going back to user
space with an active kmap local is a nono.

Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Parent commit: 14df326
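The nesting argument above is the core of the change, so here is a minimal user-space sketch of the index arithmetic it relies on. This is not code from the commit: KM_INCR and KM_MAX_IDX values are assumed (non-debug layout), and the helpers only model kmap_local_idx_push()/kmap_local_idx_pop(). The invariant it demonstrates is the one stated in the message: the per-task index is only moved by push/pop, never by the context switch code, so a map taken from interrupt context lands in an unused slot and the counter ends up where the interrupted task left it.

/* Hypothetical user-space model, not kernel code. */
#include <assert.h>
#include <stdio.h>

#define KM_INCR		1	/* assumed non-debug stride */
#define KM_MAX_IDX	16	/* assumed; the real value comes from <asm/kmap_size.h> */

static struct { int idx; } kmap_ctrl;	/* stands in for current->kmap_ctrl */

static int idx_push(void)		/* models kmap_local_idx_push() */
{
	kmap_ctrl.idx += KM_INCR;
	assert(kmap_ctrl.idx < KM_MAX_IDX);
	return kmap_ctrl.idx - 1;
}

static void idx_pop(void)		/* models kmap_local_idx_pop() */
{
	kmap_ctrl.idx -= KM_INCR;
	assert(kmap_ctrl.idx >= 0);
}

int main(void)
{
	int task_slot = idx_push();	/* the task maps a page: slot 0 */

	/*
	 * "Sched out" clears the PTE for slot 0 but leaves idx alone, so an
	 * interrupt mapping now gets slot 1 (an unused slot) and its pop
	 * restores idx to exactly what the interrupted task expects.
	 */
	int irq_slot = idx_push();
	idx_pop();

	printf("task slot %d, irq slot %d, idx after irq %d\n",
	       task_slot, irq_slot, kmap_ctrl.idx);
	return 0;
}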

File tree

6 files changed: +136 -10 lines changed

  include/linux/highmem-internal.h
  include/linux/sched.h
  kernel/entry/common.c
  kernel/fork.c
  kernel/sched/core.c
  mm/highmem.c

include/linux/highmem-internal.h (+10)

@@ -9,6 +9,16 @@
 void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
 void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
 void kunmap_local_indexed(void *vaddr);
+void kmap_local_fork(struct task_struct *tsk);
+void __kmap_local_sched_out(void);
+void __kmap_local_sched_in(void);
+static inline void kmap_assert_nomap(void)
+{
+	DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
+}
+#else
+static inline void kmap_local_fork(struct task_struct *tsk) { }
+static inline void kmap_assert_nomap(void) { }
 #endif

 #ifdef CONFIG_HIGHMEM

include/linux/sched.h (+9)

@@ -34,6 +34,7 @@
 #include <linux/rseq.h>
 #include <linux/seqlock.h>
 #include <linux/kcsan.h>
+#include <asm/kmap_size.h>

 /* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
@@ -629,6 +630,13 @@ struct wake_q_node {
 	struct wake_q_node *next;
 };

+struct kmap_ctrl {
+#ifdef CONFIG_KMAP_LOCAL
+	int idx;
+	pte_t pteval[KM_MAX_IDX];
+#endif
+};
+
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/*
@@ -1294,6 +1302,7 @@ struct task_struct {
 	unsigned int sequential_io;
 	unsigned int sequential_io_avg;
 #endif
+	struct kmap_ctrl kmap_ctrl;
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 	unsigned long task_state_change;
 #endif
kernel/entry/common.c (+2)

@@ -2,6 +2,7 @@

 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
+#include <linux/highmem.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>

@@ -194,6 +195,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)

 	/* Ensure that the address limit is intact and no locks are held */
 	addr_limit_user_check();
+	kmap_assert_nomap();
 	lockdep_assert_irqs_disabled();
 	lockdep_sys_exit();
 }
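
A hypothetical sketch of the bug class the new assert catches. The only rule taken from the patch is that current->kmap_ctrl.idx must be zero on the way back to user space; the tiny model below fakes that counter in user space and shows the check firing when a kmap_local is never undone.

/* Hypothetical user-space model of the exit-to-user check. */
#include <stdio.h>

static int kmap_idx;			/* models current->kmap_ctrl.idx */

static void kmap_assert_nomap(void)	/* models the new assert */
{
	if (kmap_idx)
		fprintf(stderr, "BUG: returning to user space with %d active kmap_local slot(s)\n",
			kmap_idx);
}

int main(void)
{
	kmap_idx++;		/* a map that was never unmapped */
	kmap_assert_nomap();	/* warns, the way DEBUG_LOCKS_WARN_ON() would */
	return 0;
}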

kernel/fork.c (+1)

@@ -930,6 +930,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	account_kernel_stack(tsk, 1);

 	kcov_task_init(tsk);
+	kmap_local_fork(tsk);

 #ifdef CONFIG_FAULT_INJECTION
 	tsk->fail_nth = 0;

kernel/sched/core.c (+25)

@@ -4094,6 +4094,22 @@ static inline void finish_lock_switch(struct rq *rq)
 # define finish_arch_post_lock_switch() do { } while (0)
 #endif

+static inline void kmap_local_sched_out(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+	if (unlikely(current->kmap_ctrl.idx))
+		__kmap_local_sched_out();
+#endif
+}
+
+static inline void kmap_local_sched_in(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+	if (unlikely(current->kmap_ctrl.idx))
+		__kmap_local_sched_in();
+#endif
+}
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -4116,6 +4132,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	perf_event_task_sched_out(prev, next);
 	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
+	kmap_local_sched_out();
 	prepare_task(next);
 	prepare_arch_switch(next);
 }
@@ -4182,6 +4199,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	finish_lock_switch(rq);
 	finish_arch_post_lock_switch();
 	kcov_finish_switch(current);
+	/*
+	 * kmap_local_sched_out() is invoked with rq::lock held and
+	 * interrupts disabled. There is no requirement for that, but the
+	 * sched out code does not have an interrupt enabled section.
+	 * Restoring the maps on sched in does not require interrupts being
+	 * disabled either.
+	 */
+	kmap_local_sched_in();

 	fire_sched_in_preempt_notifiers(current);
 	/*
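
A user-space sketch of the ordering the two hooks establish around switch_to(): prepare_task_switch() clears the outgoing task's slots while the saved values stay in its task struct, and finish_task_switch() reinstalls the incoming task's saved values. The struct task, the integer "PTEs" and the fixmap array below are illustrative assumptions; only the clear-on-out / restore-on-in pairing comes from the patch.

/* Hypothetical user-space model, not kernel code. */
#include <stdio.h>

#define KM_MAX_IDX 16

struct task {
	int idx;			/* number of live mappings */
	int pteval[KM_MAX_IDX];		/* saved "PTEs", 0 == none */
};

static int fixmap[KM_MAX_IDX];		/* models the per-CPU fixmap slots */

static void sched_out(struct task *prev)	/* stand-in for kmap_local_sched_out() */
{
	for (int i = 0; i < prev->idx; i++)
		fixmap[i] = 0;		/* clear the slot, keep prev->pteval[i] */
}

static void sched_in(struct task *next)		/* stand-in for kmap_local_sched_in() */
{
	for (int i = 0; i < next->idx; i++)
		fixmap[i] = next->pteval[i];	/* reinstall the saved value */
}

static void context_switch(struct task *prev, struct task *next)
{
	sched_out(prev);		/* as in prepare_task_switch() */
	/* ... switch_to() ... */
	sched_in(next);			/* as in finish_task_switch() */
}

int main(void)
{
	struct task a = { 1, { 0xaaaa } };
	struct task b = { 2, { 0xb0b0, 0xb1b1 } };

	sched_in(&a);			/* A runs with its one mapping */
	context_switch(&a, &b);
	printf("slot0=%#x slot1=%#x  <- B's mappings installed\n",
	       fixmap[0], fixmap[1]);
	context_switch(&b, &a);
	printf("slot0=%#x slot1=%#x  <- A restored, B's extra slot cleared\n",
	       fixmap[0], fixmap[1]);
	return 0;
}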

mm/highmem.c (+89 -10)

@@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high);

 #include <asm/kmap_size.h>

-static DEFINE_PER_CPU(int, __kmap_local_idx);
-
 /*
  * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second
  * slot is unused which acts as a guard page
@@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_idx);

 static inline int kmap_local_idx_push(void)
 {
-	int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1;
-
 	WARN_ON_ONCE(in_irq() && !irqs_disabled());
-	BUG_ON(idx >= KM_MAX_IDX);
-	return idx;
+	current->kmap_ctrl.idx += KM_INCR;
+	BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX);
+	return current->kmap_ctrl.idx - 1;
 }

 static inline int kmap_local_idx(void)
 {
-	return __this_cpu_read(__kmap_local_idx) - 1;
+	return current->kmap_ctrl.idx - 1;
 }

 static inline void kmap_local_idx_pop(void)
 {
-	int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR);
-
-	BUG_ON(idx < 0);
+	current->kmap_ctrl.idx -= KM_INCR;
+	BUG_ON(current->kmap_ctrl.idx < 0);
 }

 #ifndef arch_kmap_local_post_map
@@ -464,6 +460,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
 	pteval = pfn_pte(pfn, prot);
 	set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval);
 	arch_kmap_local_post_map(vaddr, pteval);
+	current->kmap_ctrl.pteval[kmap_local_idx()] = pteval;
 	preempt_enable();

 	return (void *)vaddr;
@@ -522,10 +519,92 @@ void kunmap_local_indexed(void *vaddr)
 	arch_kmap_local_pre_unmap(addr);
 	pte_clear(&init_mm, addr, kmap_pte - idx);
 	arch_kmap_local_post_unmap(addr);
+	current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0);
 	kmap_local_idx_pop();
 	preempt_enable();
 }
 EXPORT_SYMBOL(kunmap_local_indexed);
+
+/*
+ * Invoked before switch_to(). This is safe even when during or after
+ * clearing the maps an interrupt which needs a kmap_local happens because
+ * the task::kmap_ctrl.idx is not modified by the unmapping code so a
+ * nested kmap_local will use the next unused index and restore the index
+ * on unmap. The already cleared kmaps of the outgoing task are irrelevant
+ * because the interrupt context does not know about them. The same applies
+ * when scheduling back in for an interrupt which happens before the
+ * restore is complete.
+ */
+void __kmap_local_sched_out(void)
+{
+	struct task_struct *tsk = current;
+	pte_t *kmap_pte = kmap_get_pte();
+	int i;
+
+	/* Clear kmaps */
+	for (i = 0; i < tsk->kmap_ctrl.idx; i++) {
+		pte_t pteval = tsk->kmap_ctrl.pteval[i];
+		unsigned long addr;
+		int idx;
+
+		/* With debug all even slots are unmapped and act as guard */
+		if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
+			WARN_ON_ONCE(!pte_none(pteval));
+			continue;
+		}
+		if (WARN_ON_ONCE(pte_none(pteval)))
+			continue;
+
+		/*
+		 * This is a horrible hack for XTENSA to calculate the
+		 * coloured PTE index. Uses the PFN encoded into the pteval
+		 * and the map index calculation because the actual mapped
+		 * virtual address is not stored in task::kmap_ctrl.
+		 * For any sane architecture this is optimized out.
+		 */
+		idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+
+		addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+		arch_kmap_local_pre_unmap(addr);
+		pte_clear(&init_mm, addr, kmap_pte - idx);
+		arch_kmap_local_post_unmap(addr);
+	}
+}
+
+void __kmap_local_sched_in(void)
+{
+	struct task_struct *tsk = current;
+	pte_t *kmap_pte = kmap_get_pte();
+	int i;
+
+	/* Restore kmaps */
+	for (i = 0; i < tsk->kmap_ctrl.idx; i++) {
+		pte_t pteval = tsk->kmap_ctrl.pteval[i];
+		unsigned long addr;
+		int idx;
+
+		/* With debug all even slots are unmapped and act as guard */
+		if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
+			WARN_ON_ONCE(!pte_none(pteval));
+			continue;
+		}
+		if (WARN_ON_ONCE(pte_none(pteval)))
+			continue;

+		/* See comment in __kmap_local_sched_out() */
+		idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+		addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+		set_pte_at(&init_mm, addr, kmap_pte - idx, pteval);
+		arch_kmap_local_post_map(addr, pteval);
+	}
+}
+
+void kmap_local_fork(struct task_struct *tsk)
+{
+	if (WARN_ON_ONCE(tsk->kmap_ctrl.idx))
+		memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl));
+}
+
 #endif

 #if defined(HASHED_PAGE_VIRTUAL)
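
The loops above skip every even slot when debugging is enabled, which follows from the guard layout described at the top of the file ("With DEBUG_KMAP_LOCAL the stack depth is doubled and every second slot is unused"). A minimal sketch of why, assuming the doubled debug stride means KM_INCR == 2: push then only hands out odd slots, so the even ones stay pte_none() and act as guards, which is exactly what the !(i & 0x01) check relies on.

/* Hypothetical user-space model, not kernel code. */
#include <stdio.h>

#define KM_INCR		2	/* assumed debug stride: every second slot unused */
#define KM_MAX_IDX	16

static int idx;

static int push(void)		/* models kmap_local_idx_push() under debug */
{
	idx += KM_INCR;
	return idx - 1;
}

int main(void)
{
	for (int n = 0; n < 4; n++)
		printf("map %d -> slot %d\n", n, push());
	/* prints slots 1, 3, 5, 7: slots 0, 2, 4, 6 stay unused as guards */
	return 0;
}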
