Skip to content

Commit df6ad69

Browse files
Jérôme Glissetorvalds
Jérôme Glisse
authored and committed
mm/device-public-memory: device memory cache coherent with CPU
Platforms with an advanced system bus (like CAPI or CCIX) allow device memory to be accessible from the CPU in a cache-coherent fashion. Add a new type of ZONE_DEVICE to represent such memory. The use cases are the same as for un-addressable device memory, but without all the corner cases. Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Jérôme Glisse <[email protected]> Cc: Aneesh Kumar <[email protected]> Cc: Paul E. McKenney <[email protected]> Cc: Benjamin Herrenschmidt <[email protected]> Cc: Dan Williams <[email protected]> Cc: Ross Zwisler <[email protected]> Cc: Balbir Singh <[email protected]> Cc: David Nellans <[email protected]> Cc: Evgeny Baskakov <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: John Hubbard <[email protected]> Cc: Kirill A. Shutemov <[email protected]> Cc: Mark Hairgrove <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Sherry Cheung <[email protected]> Cc: Subhash Gutti <[email protected]> Cc: Vladimir Davydov <[email protected]> Cc: Bob Liu <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 8315ada commit df6ad69

File tree

14 files changed

+159
-47
lines changed

14 files changed

+159
-47
lines changed

fs/proc/task_mmu.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1267,7 +1267,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
12671267
if (pm->show_pfn)
12681268
frame = pte_pfn(pte);
12691269
flags |= PM_PRESENT;
1270-
page = vm_normal_page(vma, addr, pte);
1270+
page = _vm_normal_page(vma, addr, pte, true);
12711271
if (pte_soft_dirty(pte))
12721272
flags |= PM_SOFT_DIRTY;
12731273
} else if (is_swap_pte(pte)) {

include/linux/hmm.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
327327
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
328328

329329

330-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
330+
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
331331
struct hmm_devmem;
332332

333333
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
@@ -494,7 +494,7 @@ struct hmm_device {
494494
*/
495495
struct hmm_device *hmm_device_new(void *drvdata);
496496
void hmm_device_put(struct hmm_device *hmm_device);
497-
#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
497+
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
498498

499499

500500
/* Below are for HMM internal use only! Not to be used by device driver! */

include/linux/ioport.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ enum {
131131
IORES_DESC_PERSISTENT_MEMORY = 4,
132132
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
133133
IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
134+
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
134135
};
135136

136137
/* helpers to define resources */

include/linux/memremap.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
5757
*
5858
* A more complete discussion of unaddressable memory may be found in
5959
* include/linux/hmm.h and Documentation/vm/hmm.txt.
60+
*
61+
* MEMORY_DEVICE_PUBLIC:
62+
* Device memory that is cache coherent from device and CPU point of view. This
63+
* is used on platforms that have an advanced system bus (like CAPI or CCIX). A
64+
* driver can hotplug the device memory using ZONE_DEVICE and with that memory
65+
* type. Any page of a process can be migrated to such memory. However no one
66+
* should be allowed to pin such memory so that it can always be evicted.
6067
*/
6168
enum memory_type {
6269
MEMORY_DEVICE_HOST = 0,
6370
MEMORY_DEVICE_PRIVATE,
71+
MEMORY_DEVICE_PUBLIC,
6472
};
6573

6674
/*
@@ -92,6 +100,8 @@ enum memory_type {
92100
* The page_free() callback is called once the page refcount reaches 1
93101
* (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
94102
* This allows the device driver to implement its own memory management.)
103+
*
104+
* For MEMORY_DEVICE_PUBLIC only the page_free() callback matters.
95105
*/
96106
typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
97107
unsigned long addr,
@@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct page *page)
134144
return is_zone_device_page(page) &&
135145
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
136146
}
147+
148+
static inline bool is_device_public_page(const struct page *page)
149+
{
150+
return is_zone_device_page(page) &&
151+
page->pgmap->type == MEMORY_DEVICE_PUBLIC;
152+
}
137153
#else
138154
static inline void *devm_memremap_pages(struct device *dev,
139155
struct resource *res, struct percpu_ref *ref,
@@ -157,6 +173,11 @@ static inline bool is_device_private_page(const struct page *page)
157173
{
158174
return false;
159175
}
176+
177+
static inline bool is_device_public_page(const struct page *page)
178+
{
179+
return false;
180+
}
160181
#endif
161182

162183
/**

include/linux/mm.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -800,15 +800,16 @@ static inline bool is_zone_device_page(const struct page *page)
800800
}
801801
#endif
802802

803-
#ifdef CONFIG_DEVICE_PRIVATE
804-
void put_zone_device_private_page(struct page *page);
803+
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
804+
void put_zone_device_private_or_public_page(struct page *page);
805805
#else
806-
static inline void put_zone_device_private_page(struct page *page)
806+
static inline void put_zone_device_private_or_public_page(struct page *page)
807807
{
808808
}
809-
#endif
809+
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
810810

811811
static inline bool is_device_private_page(const struct page *page);
812+
static inline bool is_device_public_page(const struct page *page);
812813

813814
DECLARE_STATIC_KEY_FALSE(device_private_key);
814815

@@ -834,8 +835,9 @@ static inline void put_page(struct page *page)
834835
* include/linux/memremap.h and HMM for details.
835836
*/
836837
if (static_branch_unlikely(&device_private_key) &&
837-
unlikely(is_device_private_page(page))) {
838-
put_zone_device_private_page(page);
838+
unlikely(is_device_private_page(page) ||
839+
is_device_public_page(page))) {
840+
put_zone_device_private_or_public_page(page);
839841
return;
840842
}
841843

@@ -1224,8 +1226,10 @@ struct zap_details {
12241226
pgoff_t last_index; /* Highest page->index to unmap */
12251227
};
12261228

1227-
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
1228-
pte_t pte);
1229+
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
1230+
pte_t pte, bool with_public_device);
1231+
#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
1232+
12291233
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
12301234
pmd_t pmd);
12311235

kernel/memremap.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -501,8 +501,8 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
501501
#endif /* CONFIG_ZONE_DEVICE */
502502

503503

504-
#ifdef CONFIG_DEVICE_PRIVATE
505-
void put_zone_device_private_page(struct page *page)
504+
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
505+
void put_zone_device_private_or_public_page(struct page *page)
506506
{
507507
int count = page_ref_dec_return(page);
508508

@@ -522,5 +522,5 @@ void put_zone_device_private_page(struct page *page)
522522
} else if (!count)
523523
__put_page(page);
524524
}
525-
EXPORT_SYMBOL(put_zone_device_private_page);
526-
#endif /* CONFIG_DEVICE_PRIVATE */
525+
EXPORT_SYMBOL(put_zone_device_private_or_public_page);
526+
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */

mm/Kconfig

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,12 +720,23 @@ config HMM_MIRROR
720720
config DEVICE_PRIVATE
721721
bool "Unaddressable device memory (GPU memory, ...)"
722722
depends on ARCH_HAS_HMM
723+
select HMM
723724

724725
help
725726
Allows creation of struct pages to represent unaddressable device
726727
memory; i.e., memory that is only accessible from the device (or
727728
group of devices). You likely also want to select HMM_MIRROR.
728729

730+
config DEVICE_PUBLIC
731+
bool "Addressable device memory (like GPU memory)"
732+
depends on ARCH_HAS_HMM
733+
select HMM
734+
735+
help
736+
Allows creation of struct pages to represent addressable device
737+
memory; i.e., memory that is accessible from both the device and
738+
the CPU
739+
729740
config FRAME_VECTOR
730741
bool
731742

mm/gup.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
456456
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
457457
goto unmap;
458458
*page = pte_page(*pte);
459+
460+
/*
461+
* This should never happen (a device public page in the gate
462+
* area).
463+
*/
464+
if (is_device_public_page(*page))
465+
goto unmap;
459466
}
460467
get_page(*page);
461468
out:

mm/hmm.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault);
737737
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
738738

739739

740-
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
740+
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
741741
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
742742
unsigned long addr)
743743
{
@@ -1177,4 +1177,4 @@ static int __init hmm_init(void)
11771177
}
11781178

11791179
device_initcall(hmm_init);
1180-
#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
1180+
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */

mm/madvise.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
355355
continue;
356356
}
357357

358-
page = vm_normal_page(vma, addr, ptent);
358+
page = _vm_normal_page(vma, addr, ptent, true);
359359
if (!page)
360360
continue;
361361

mm/memcontrol.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4623,10 +4623,11 @@ static int mem_cgroup_move_account(struct page *page,
46234623
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
46244624
* target for charge migration. if @target is not NULL, the entry is stored
46254625
* in target->ent.
4626-
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
4627-
* (so ZONE_DEVICE page and thus not on the lru). For now we such page is
4628-
* charge like a regular page would be as for all intent and purposes it is
4629-
* just special memory taking the place of a regular page.
4626+
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
4627+
* or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
4628+
* For now such a page is charged like a regular page would be, as for all
4629+
* intents and purposes it is just special memory taking the place of a
4630+
* regular page.
46304631
*
46314632
* See Documentation/vm/hmm.txt and include/linux/hmm.h
46324633
*
@@ -4657,7 +4658,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
46574658
*/
46584659
if (page->mem_cgroup == mc.from) {
46594660
ret = MC_TARGET_PAGE;
4660-
if (is_device_private_page(page))
4661+
if (is_device_private_page(page) ||
4662+
is_device_public_page(page))
46614663
ret = MC_TARGET_DEVICE;
46624664
if (target)
46634665
target->page = page;

mm/memory.c

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
818818
#else
819819
# define HAVE_PTE_SPECIAL 0
820820
#endif
821-
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
822-
pte_t pte)
821+
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
822+
pte_t pte, bool with_public_device)
823823
{
824824
unsigned long pfn = pte_pfn(pte);
825825

@@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
830830
return vma->vm_ops->find_special_page(vma, addr);
831831
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
832832
return NULL;
833-
if (!is_zero_pfn(pfn))
834-
print_bad_pte(vma, addr, pte, NULL);
833+
if (is_zero_pfn(pfn))
834+
return NULL;
835+
836+
/*
837+
* Device public pages are special pages (they are ZONE_DEVICE
838+
* pages but different from persistent memory). They behave
839+
* almost like normal pages. The difference is that they are
840+
* not on the lru and thus should never be involved with any-
841+
* thing that involves lru manipulation (mlock, numa balancing,
842+
* ...).
843+
*
844+
* This is why we still want to return NULL for such page from
845+
* vm_normal_page() so that we do not have to special case all
846+
* call sites of vm_normal_page().
847+
*/
848+
if (likely(pfn < highest_memmap_pfn)) {
849+
struct page *page = pfn_to_page(pfn);
850+
851+
if (is_device_public_page(page)) {
852+
if (with_public_device)
853+
return page;
854+
return NULL;
855+
}
856+
}
857+
print_bad_pte(vma, addr, pte, NULL);
835858
return NULL;
836859
}
837860

@@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
10121035
get_page(page);
10131036
page_dup_rmap(page, false);
10141037
rss[mm_counter(page)]++;
1038+
} else if (pte_devmap(pte)) {
1039+
page = pte_page(pte);
1040+
1041+
/*
1042+
* Cache coherent device memory behaves like a regular page and
1043+
* not like a persistent memory page. For more information see
1044+
* MEMORY_DEVICE_PUBLIC in include/linux/memremap.h
1045+
*/
1046+
if (is_device_public_page(page)) {
1047+
get_page(page);
1048+
page_dup_rmap(page, false);
1049+
rss[mm_counter(page)]++;
1050+
}
10151051
}
10161052

10171053
out_set_pte:
@@ -1267,7 +1303,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
12671303
if (pte_present(ptent)) {
12681304
struct page *page;
12691305

1270-
page = vm_normal_page(vma, addr, ptent);
1306+
page = _vm_normal_page(vma, addr, ptent, true);
12711307
if (unlikely(details) && page) {
12721308
/*
12731309
* unmap_shared_mapping_pages() wants to

0 commit comments

Comments
 (0)