Skip to content

Commit 3425485

Browse files
laijsLai Jiangshan
authored and
Lai Jiangshan
committed
kvm: share upper halves among all pagetables
Fixes: #509 Signed-off-by: Lai Jiangshan <[email protected]> Signed-off-by: Lai Jiangshan <[email protected]>
1 parent dd05611 commit 3425485

14 files changed

+149
-65
lines changed

pkg/sentry/platform/kvm/kvm.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,7 @@ func (*KVM) MaxUserAddress() usermem.Addr {
158158
// NewAddressSpace returns a new pagetable root.
159159
func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
160160
// Allocate page tables and install system mappings.
161-
pageTables := pagetables.New(newAllocator())
162-
k.machine.mapUpperHalf(pageTables)
161+
pageTables := pagetables.NewWithUpper(newAllocator(), k.machine.upperSharedPageTables, ring0.KernelStartAddress)
163162

164163
// Return the new address space.
165164
return &addressSpace{

pkg/sentry/platform/kvm/machine.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ type machine struct {
4141
// slots are currently being updated, and the caller should retry.
4242
nextSlot uint32
4343

44+
// upperSharedPageTables tracks the read-only shared upper of all the pagetables.
45+
upperSharedPageTables *pagetables.PageTables
46+
4447
// kernel is the set of global structures.
4548
kernel ring0.Kernel
4649

@@ -199,9 +202,7 @@ func newMachine(vm int) (*machine, error) {
199202
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
200203
m.vCPUsByTID = make(map[uint64]*vCPU)
201204
m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
202-
m.kernel.Init(ring0.KernelOpts{
203-
PageTables: pagetables.New(newAllocator()),
204-
}, m.maxVCPUs)
205+
m.kernel.Init(m.maxVCPUs)
205206

206207
// Pull the maximum slots.
207208
maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
@@ -213,6 +214,13 @@ func newMachine(vm int) (*machine, error) {
213214
log.Debugf("The maximum number of slots is %d.", m.maxSlots)
214215
m.usedSlots = make([]uintptr, m.maxSlots)
215216

217+
// Create the upper shared pagetables and kernel (sentry) pagetables.
218+
m.upperSharedPageTables = pagetables.New(newAllocator())
219+
m.mapUpperHalf(m.upperSharedPageTables)
220+
m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
221+
m.upperSharedPageTables.MarkReadOnlyShared()
222+
m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)
223+
216224
// Apply the physical mappings. Note that these mappings may point to
217225
// guest physical addresses that are not actually available. These
218226
// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -226,7 +234,6 @@ func newMachine(vm int) (*machine, error) {
226234

227235
return true // Keep iterating.
228236
})
229-
m.mapUpperHalf(m.kernel.PageTables)
230237

231238
var physicalRegionsReadOnly []physicalRegion
232239
var physicalRegionsAvailable []physicalRegion

pkg/sentry/platform/kvm/machine_amd64.go

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -432,30 +432,27 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
432432
return physicalRegions
433433
}
434434

435-
var execRegions = func() (regions []region) {
435+
func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
436+
// Map all the executable regions so that all the entry functions
437+
// are mapped in the upper half.
436438
applyVirtualRegions(func(vr virtualRegion) {
437439
if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
438440
return
439441
}
442+
440443
if vr.accessType.Execute {
441-
regions = append(regions, vr.region)
444+
r := vr.region
445+
physical, length, ok := translateToPhysical(r.virtual)
446+
if !ok || length < r.length {
447+
panic("impossible translation")
448+
}
449+
pageTable.Map(
450+
usermem.Addr(ring0.KernelStartAddress|r.virtual),
451+
r.length,
452+
pagetables.MapOpts{AccessType: usermem.Execute},
453+
physical)
442454
}
443455
})
444-
return
445-
}()
446-
447-
func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
448-
for _, r := range execRegions {
449-
physical, length, ok := translateToPhysical(r.virtual)
450-
if !ok || length < r.length {
451-
panic("impossilbe translation")
452-
}
453-
pageTable.Map(
454-
usermem.Addr(ring0.KernelStartAddress|r.virtual),
455-
r.length,
456-
pagetables.MapOpts{AccessType: usermem.Execute},
457-
physical)
458-
}
459456
for start, end := range m.kernel.EntryRegions() {
460457
regionLen := end - start
461458
physical, length, ok := translateToPhysical(start)

pkg/sentry/platform/ring0/defs.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ import (
2323
//
2424
// This contains global state, shared by multiple CPUs.
2525
type Kernel struct {
26+
// PageTables are the kernel pagetables; this must be provided.
27+
PageTables *pagetables.PageTables
28+
2629
KernelArchState
2730
}
2831

pkg/sentry/platform/ring0/defs_amd64.go

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,17 +66,9 @@ var (
6666
KernelDataSegment SegmentDescriptor
6767
)
6868

69-
// KernelOpts has initialization options for the kernel.
70-
type KernelOpts struct {
71-
// PageTables are the kernel pagetables; this must be provided.
72-
PageTables *pagetables.PageTables
73-
}
74-
7569
// KernelArchState contains architecture-specific state.
7670
type KernelArchState struct {
77-
KernelOpts
78-
79-
// cpuEntries is array of kernelEntry for all cpus
71+
// cpuEntries is array of kernelEntry for all cpus.
8072
cpuEntries []kernelEntry
8173

8274
// globalIDT is our set of interrupt gates.

pkg/sentry/platform/ring0/defs_arm64.go

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,8 @@ var (
3232
KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
3333
)
3434

35-
// KernelOpts has initialization options for the kernel.
36-
type KernelOpts struct {
37-
// PageTables are the kernel pagetables; this must be provided.
38-
PageTables *pagetables.PageTables
39-
}
40-
4135
// KernelArchState contains architecture-specific state.
4236
type KernelArchState struct {
43-
KernelOpts
4437
}
4538

4639
// CPUArchState contains CPU-specific arch state.

pkg/sentry/platform/ring0/kernel.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,9 @@ package ring0
1616

1717
// Init initializes a new kernel.
1818
//
19-
// N.B. that constraints on KernelOpts must be satisfied.
20-
//
2119
//go:nosplit
22-
func (k *Kernel) Init(opts KernelOpts, maxCPUs int) {
23-
k.init(opts, maxCPUs)
20+
func (k *Kernel) Init(maxCPUs int) {
21+
k.init(maxCPUs)
2422
}
2523

2624
// Halt halts execution.

pkg/sentry/platform/ring0/kernel_amd64.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ import (
2424
)
2525

2626
// init initializes architecture-specific state.
27-
func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
28-
// Save the root page tables.
29-
k.PageTables = opts.PageTables
30-
27+
func (k *Kernel) init(maxCPUs int) {
3128
entrySize := reflect.TypeOf(kernelEntry{}).Size()
3229
var (
3330
entries []kernelEntry

pkg/sentry/platform/ring0/kernel_arm64.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@ func HaltAndResume()
2525
func HaltEl1SvcAndResume()
2626

2727
// init initializes architecture-specific state.
28-
func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
29-
// Save the root page tables.
30-
k.PageTables = opts.PageTables
28+
func (k *Kernel) init(maxCPUs int) {
3129
}
3230

3331
// init initializes architecture-specific state.

pkg/sentry/platform/ring0/pagetables/pagetables.go

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ type PageTables struct {
3030
Allocator Allocator
3131

3232
// root is the pagetable root.
33+
//
34+
// For some archs such as amd64, the upper of the PTEs is cloned
35+
// from and owned by upperSharedPageTables which are shared among
36+
// many PageTables if upperSharedPageTables is not nil.
3337
root *PTEs
3438

3539
// rootPhysical is the cached physical address of the root.
@@ -39,15 +43,52 @@ type PageTables struct {
3943

4044
// archPageTables includes architecture-specific features.
4145
archPageTables
46+
47+
// upperSharedPageTables represents a read-only shared upper
48+
// of the Pagetable. When it is not nil, the upper is not
49+
// allowed to be modified.
50+
upperSharedPageTables *PageTables
51+
52+
// upperStart is the start address of the upper portion that
53+
// is shared from upperSharedPageTables.
54+
upperStart uintptr
55+
56+
// readOnlyShared indicates the Pagetables are read-only and
57+
// own the ranges that are shared with other Pagetables.
58+
readOnlyShared bool
4259
}
4360

44-
// New returns new PageTables.
45-
func New(a Allocator) *PageTables {
61+
// NewWithUpper returns new PageTables.
62+
//
63+
// upperSharedPageTables are used for mapping the upper of addresses,
64+
// starting at upperStart. These pageTables should not be touched (as
65+
// invalidations may be incorrect) after they are passed as an
66+
// upperSharedPageTables. Only when all dependent PageTables are gone
67+
// may they be used. The intended use case is for kernel page tables,
68+
// which are static and fixed.
69+
//
70+
// Precondition: upperStart must be between canonical ranges.
71+
// Precondition: upperStart must be pgdSize aligned.
72+
// Precondition: upperSharedPageTables must be marked read-only shared.
73+
func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uintptr) *PageTables {
4674
p := new(PageTables)
4775
p.Init(a)
76+
if upperSharedPageTables != nil {
77+
if !upperSharedPageTables.readOnlyShared {
78+
panic("Only read-only shared pagetables can be used as upper")
79+
}
80+
p.upperSharedPageTables = upperSharedPageTables
81+
p.upperStart = upperStart
82+
p.cloneUpperShared()
83+
}
4884
return p
4985
}
5086

87+
// New returns new PageTables.
88+
func New(a Allocator) *PageTables {
89+
return NewWithUpper(a, nil, 0)
90+
}
91+
5192
// mapVisitor is used for map.
5293
type mapVisitor struct {
5394
target uintptr // Input.
@@ -90,6 +131,21 @@ func (*mapVisitor) requiresSplit() bool { return true }
90131
//
91132
//go:nosplit
92133
func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
134+
if p.readOnlyShared {
135+
panic("Should not modify read-only shared pagetables.")
136+
}
137+
if uintptr(addr)+length < uintptr(addr) {
138+
panic("addr & length overflow")
139+
}
140+
if p.upperSharedPageTables != nil {
141+
// Ignore changes to the read-only upper shared portion.
142+
if uintptr(addr) >= p.upperStart {
143+
return false
144+
}
145+
if uintptr(addr)+length > p.upperStart {
146+
length = p.upperStart - uintptr(addr)
147+
}
148+
}
93149
if !opts.AccessType.Any() {
94150
return p.Unmap(addr, length)
95151
}
@@ -128,12 +184,27 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
128184
//
129185
// True is returned iff there was a previous mapping in the range.
130186
//
131-
// Precondition: addr & length must be page-aligned.
187+
// Precondition: addr & length must be page-aligned, their sum must not overflow.
132188
//
133189
// +checkescape:hard,stack
134190
//
135191
//go:nosplit
136192
func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
193+
if p.readOnlyShared {
194+
panic("Should not modify read-only shared pagetables.")
195+
}
196+
if uintptr(addr)+length < uintptr(addr) {
197+
panic("addr & length overflow")
198+
}
199+
if p.upperSharedPageTables != nil {
200+
// Ignore changes to the read-only upper shared portion.
201+
if uintptr(addr) >= p.upperStart {
202+
return false
203+
}
204+
if uintptr(addr)+length > p.upperStart {
205+
length = p.upperStart - uintptr(addr)
206+
}
207+
}
137208
w := unmapWalker{
138209
pageTables: p,
139210
visitor: unmapVisitor{
@@ -218,3 +289,10 @@ func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts)
218289
w.iterateRange(uintptr(addr), uintptr(addr)+1)
219290
return w.visitor.physical + offset, w.visitor.opts
220291
}
292+
293+
// MarkReadOnlyShared marks the pagetables as read-only so that they can be shared.
294+
//
295+
// It is usually used on the pagetables that are used as the upper shared pagetables.
296+
func (p *PageTables) MarkReadOnlyShared() {
297+
p.readOnlyShared = true
298+
}

pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,6 @@ import (
2424

2525
// archPageTables is architecture-specific data.
2626
type archPageTables struct {
27-
// root is the pagetable root for kernel space.
28-
root *PTEs
29-
30-
// rootPhysical is the cached physical address of the root.
31-
//
32-
// This is saved only to prevent constant translation.
33-
rootPhysical uintptr
34-
3527
asid uint16
3628
}
3729

@@ -46,7 +38,7 @@ func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 {
4638
//
4739
//go:nosplit
4840
func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {
49-
return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
41+
return uint64(p.upperSharedPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
5042
}
5143

5244
// Bits in page table entries.

pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,26 @@ func (p *PageTables) Init(allocator Allocator) {
5050
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
5151
}
5252

53+
func pgdIndex(upperStart uintptr) uintptr {
54+
if upperStart&(pgdSize-1) != 0 {
55+
panic("upperStart should be pgd size aligned")
56+
}
57+
if upperStart >= upperBottom {
58+
return entriesPerPage/2 + (upperStart-upperBottom)/pgdSize
59+
}
60+
if upperStart < lowerTop {
61+
return upperStart / pgdSize
62+
}
63+
panic("upperStart should be in canonical range")
64+
}
65+
66+
// cloneUpperShared clones the upper from the upper shared page tables.
67+
//
68+
//go:nosplit
69+
func (p *PageTables) cloneUpperShared() {
70+
start := pgdIndex(p.upperStart)
71+
copy(p.root[start:entriesPerPage], p.upperSharedPageTables.root[start:entriesPerPage])
72+
}
73+
5374
// PTEs is a collection of entries.
5475
type PTEs [entriesPerPage]PTE

pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,17 @@ func (p *PageTables) Init(allocator Allocator) {
4949
p.Allocator = allocator
5050
p.root = p.Allocator.NewPTEs()
5151
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
52-
p.archPageTables.root = p.Allocator.NewPTEs()
53-
p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root)
52+
}
53+
54+
// cloneUpperShared clones the upper from the upper shared page tables.
55+
//
56+
//go:nosplit
57+
func (p *PageTables) cloneUpperShared() {
58+
if p.upperStart != upperBottom {
59+
panic("upperStart should be the same as upperBottom")
60+
}
61+
62+
// nothing to do for arm.
5463
}
5564

5665
// PTEs is a collection of entries.

pkg/sentry/platform/ring0/pagetables/walker_arm64.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ func next(start uintptr, size uintptr) uintptr {
116116
func (w *Walker) iterateRangeCanonical(start, end uintptr) {
117117
pgdEntryIndex := w.pageTables.root
118118
if start >= upperBottom {
119-
pgdEntryIndex = w.pageTables.archPageTables.root
119+
pgdEntryIndex = w.pageTables.upperSharedPageTables.root
120120
}
121121

122122
for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {

0 commit comments

Comments
 (0)