diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 83697bf28c02a45e4c5c07c91ab34a935684cb9c..14a0123acc30d8e9f63648174c84663e4a4cc7b3 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1233,6 +1233,7 @@ CONFIG_LRU_GEN=y CONFIG_ARM64_HAFT=y CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y +CONFIG_GMEM=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y # CONFIG_ASCEND_FEATURES is not set @@ -7022,6 +7023,13 @@ CONFIG_CPU_INSPECTOR_ATF=m CONFIG_ROH=m CONFIG_ROH_HNS=m CONFIG_ARM_SPE_MEM_SAMPLING=y + +# +# remote pager device +# +CONFIG_REMOTE_PAGER=m +CONFIG_REMOTE_PAGER_MASTER=m +# end of remote pager device # end of Device Drivers # diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index e6a211001bd38edbb8fa3922b17ec59e6d12bbc4..ccdeffcefbfff33b53b9541a452222d43ae4ab26 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -9,6 +9,7 @@ #include #include +#include "string.h" #define RSI_GRANULE_SHIFT 12 #define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT) diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 4050098c07757c8d479f13d67deb38276c36b1a0..e9445d996e462887be865a5f95d4d57d380b1ffc 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1204,6 +1204,7 @@ CONFIG_LRU_GEN=y # CONFIG_LRU_GEN_STATS is not set CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y +CONFIG_GMEM=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y CONFIG_PAGE_CACHE_LIMIT=y @@ -8209,6 +8210,13 @@ CONFIG_INTEL_TH_PTI=m # # CONFIG_CPU_INSPECT is not set # end of CPU Inspect + +# +# remote pager device +# +CONFIG_REMOTE_PAGER=m +CONFIG_REMOTE_PAGER_MASTER=m +# end of remote pager device # end of Device Drivers # diff --git a/drivers/base/node.c b/drivers/base/node.c index 4d588f4658c85cc1471da691fecbe744811812b4..b9e095cf349822c6ddb97271d2b32fd1a227fd36 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -931,6 +931,9 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS), +#endif }; static struct attribute *node_state_attrs[] = { @@ -943,6 +946,9 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, +#ifdef CONFIG_GMEM + &node_state_attr[N_HETEROGENEOUS].attr.attr, +#endif NULL }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a691365061c1704ebdbbb2e5fbd08c99b5f1d36..84faaddafddfeed4a92dc9bab090def9ce174ac9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -698,6 +698,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +#ifdef CONFIG_GMEM + [ilog2(VM_PEER_SHARED)] = "ps", +#endif #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif diff --git a/include/linux/device.h b/include/linux/device.h index 54a4967c496cd562715a592931144acfc35f0169..94262735406a44bf96461e304b0dc60ecee20c42 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -826,7 +826,13 @@ struct device { KABI_RESERVE(2) KABI_RESERVE(3) #endif + 
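/*
 * Illustrative note (not part of this patch): under CONFIG_GMEM the reserved
 * KABI slot 4 becomes a back-pointer from struct device to its GMEM device.
 * A peripheral driver would be expected to fill it after registering with
 * GMEM, roughly as sketched below (function and variable names are
 * hypothetical; example_mmu stands for the driver's struct gm_mmu table):
 *
 *	struct gm_dev *gmdev;
 *
 *	if (gm_dev_create(&example_mmu, drv_data, GM_DEV_CAP_PEER, &gmdev))
 *		return -ENOMEM;
 *	gmdev->dma_dev = dev;		// used for dma_map_page() in fault paths
 *	gm_dev_register_hnode(gmdev);	// allocates and registers an h-NUMA node
 *	dev->gm_dev = gmdev;		// the KABI slot introduced here
 */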
+#ifdef CONFIG_GMEM + KABI_USE(4, void *gm_dev) +#else KABI_RESERVE(4) +#endif + KABI_RESERVE(5) KABI_RESERVE(6) KABI_RESERVE(7) diff --git a/include/linux/gmem.h b/include/linux/gmem.h new file mode 100644 index 0000000000000000000000000000000000000000..23e87f2d7fe317230ade7b02df0babc4fc037252 --- /dev/null +++ b/include/linux/gmem.h @@ -0,0 +1,449 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ +#ifndef _GMEM_H +#define _GMEM_H + +#include + +struct hnode; + +/* + * enum gm_ret - The return value of GMEM KPI that can be used to tell + * the core VM or peripheral driver whether the GMEM KPI was + * executed successfully. + * + * @GM_RET_SUCCESS: The invoked GMEM KPI behaved as expected. + * @GM_RET_FAILURE_UNKNOWN: The GMEM KPI failed with unknown reason. + * Any external status related to this KPI invocation changes must be rolled back. + */ +enum gm_ret { + GM_RET_SUCCESS = 0, + GM_RET_NOMEM, + GM_RET_PAGE_EXIST, + GM_RET_DMA_ERROR, + GM_RET_MIGRATING, + GM_RET_FAILURE_UNKNOWN, + GM_RET_UNIMPLEMENTED, +}; + +/* + * Defines a contiguous range of virtual addresses inside a struct gm_as + * As an analogy, this is conceptually similar as virtual_address_struct + */ +struct gm_region { + unsigned long start_va; + unsigned long end_va; + struct rb_node node; + struct gm_as *as; /* The address space that it belongs to */ + + /* Do we need another list_node to maintain a tailQ of allocated VMAs inside a gm_as? */ + struct list_head mapping_set_link; + + void (*callback_op)(void *args); + void *cb_args; +}; + +/* This holds a list of regions that must not be concurrently manipulated. */ +struct gm_mapping_set { + unsigned int region_cnt; + struct list_head gm_region_list; +}; + +/** + * enum gm_mmu_mode - defines the method to share a physical page table. + * + * @GM_MMU_MODE_SHARE: Literally share a physical page table with another + * attached device's MMU. Nothing is guaranteed about the allocated address. + * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds + * exclusive mapping entries, so that device memory accesses can trigger fault-driven + * migration for automatic data locality optimizations. + * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates physical + * mapping entries whenever a physical mapping is installed inside the address space, so + * that it may minimize the page faults to be triggered by this device. + */ +enum gm_mmu_mode { + GM_MMU_MODE_SHARE, + GM_MMU_MODE_COHERENT_EXCLUSIVE, + GM_MMU_MODE_REPLICATE, +}; + +/* + * This is the parameter list of peer_map/unmap mmu operations. + * if device should copy data to/from host, set copy and dma_addr + */ +struct gm_fault_t { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long pfn; + unsigned long va; + unsigned long size; + unsigned long prot; + bool copy; + dma_addr_t dma_addr; + int behavior; +}; + +enum gm_memcpy_kind { + GM_MEMCPY_INIT, + GM_MEMCPY_H2H, + GM_MEMCPY_H2D, + GM_MEMCPY_D2H, + GM_MEMCPY_D2D, + GM_MEMCPY_KIND_INVALID, +}; + +struct gm_memcpy_t { + struct mm_struct *mm; + struct gm_dev *dev; + dma_addr_t src; + dma_addr_t dest; + + size_t size; + enum gm_memcpy_kind kind; +}; + +/** + * + * This struct defines a series of MMU functions registered by a peripheral + * device that is to be invoked by GMEM. + * + * pmap is an opaque pointer that identifies a physical page table of a device. 
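 *
 * As an illustrative sketch (assumed usage, not taken from this patch), the
 * opaque cookie produced by pmap_create() is the one later handed back to the
 * mapping and TLB-invalidation hooks:
 *
 *	void *pmap;
 *
 *	dev->mmu->pmap_create(dev, &pmap);
 *	dev->mmu->pmap_enter(pmap, va, HPAGE_SIZE, pa, prot);
 *	dev->mmu->tlb_invl(pmap, va, HPAGE_SIZE);
 *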
+ * A physical page table holds the physical mappings that can be interpreted by + * the hardware MMU. + */ +struct gm_mmu { + /* + * Each bit indicates a supported page size for page-based TLB. + * Currently we do not consider range TLBs. + */ + unsigned long pgsize_bitmap; + + /* + * cookie identifies the type of the MMU. If two gm_mmu shares the same cookie, + * then it means their page table formats are compatible. + * In that case, they can share the same void *pmap as the input arg. + */ + unsigned long cookie; + + /* Synchronize VMA in a peer OS to interact with the host OS */ + enum gm_ret (*peer_va_alloc_fixed)(struct gm_fault_t *gmf); + enum gm_ret (*peer_va_free)(struct gm_fault_t *gmf); + + /* Create physical mappings on peer host. + * If copy is set, copy data [dma_addr, dma_addr + size] to peer host + */ + enum gm_ret (*peer_map)(struct gm_fault_t *gmf); + /* + * Destroy physical mappings on peer host. + * If copy is set, copy data back to [dma_addr, dma_addr + size] + */ + enum gm_ret (*peer_unmap)(struct gm_fault_t *gmf); + + enum gm_ret (*import_phys_mem)(struct mm_struct *mm, int hnid, unsigned long page_cnt); + + /* Create or destroy a device's physical page table. */ + enum gm_ret (*pmap_create)(struct gm_dev *dev, void **pmap); + enum gm_ret (*pmap_destroy)(void *pmap); + + /* Create or destroy a physical mapping of a created physical page table */ + enum gm_ret (*pmap_enter)(void *pmap, unsigned long va, unsigned long size, + unsigned long pa, unsigned long prot); + enum gm_ret (*pmap_release)(void *pmap, unsigned long va, unsigned long size); + + /* Change the protection of a virtual page */ + enum gm_ret (*pmap_protect)(void *pmap, unsigned long va, unsigned long size, + unsigned long new_prot); + + /* Invalidation functions of the MMU TLB */ + enum gm_ret (*tlb_invl)(void *pmap, unsigned long va, unsigned long size); + enum gm_ret (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); + + // copy one area of memory from device to host or from host to device + enum gm_ret (*peer_hmemcpy)(struct gm_memcpy_t *gmc); +}; + +/** + * unsigned long defines a composable flag to describe the capabilities of a device. + * + * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. + * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS + */ +#define GM_DEV_CAP_REPLAYABLE 0x00000001 +#define GM_DEV_CAP_PEER 0x00000010 + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +struct gm_context { + struct gm_as *as; + struct gm_dev *dev; + void *pmap; + /* + * consider a better container to maintain multiple ctx inside a device or multiple ctx + * inside a va space. + * A device may simultaneously have multiple contexts for time-sliced ctx switching + */ + struct list_head gm_dev_link; + + /* A va space may have multiple gm_context */ + struct list_head gm_as_link; +}; +#define get_gm_context(head) (list_entry((head)->prev, struct gm_context, ctx_link)) + +struct gm_dev { + int id; + + /* identifies the device capability + * For example, whether the device supports page faults or whether it has its + * own OS that manages the VA and PA resources. + */ + unsigned long capability; + struct gm_mmu *mmu; + void *dev_data; + /* + * TODO: Use a better container of struct gm_context to support time-sliced context switch. + * A collection of device contexts. If the device does not support time-sliced context + * switch, then the size of the collection should never be greater than one. 
+ * We need to think about what operators should the container be optimized for. + * A list, a radix-tree or what? What would gm_dev_activate require? + * Are there any accelerators that are really going to support time-sliced context switch? + */ + struct gm_context *current_ctx; + + struct list_head gm_ctx_list; + + /* Add tracking of registered device local physical memory. */ + nodemask_t registered_hnodes; + struct device *dma_dev; + + struct gm_mapping *gm_mapping; +}; + +#define GM_MAPPING_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ +#define GM_MAPPING_DEVICE 0x20 +#define GM_MAPPING_NOMAP 0x40 +#define GM_MAPPING_PINNED 0x80 +#define GM_MAPPING_WILLNEED 0x100 + +#define GM_MAPPING_TYPE_MASK (GM_MAPPING_CPU | GM_MAPPING_DEVICE | GM_MAPPING_NOMAP) + +/* Records the status of a page-size physical page */ +struct gm_mapping { + unsigned int flag; + + union { + struct page *page; /* CPU node */ + struct gm_page *gm_page; /* hetero-node */ + }; + + struct gm_dev *dev; + struct mutex lock; +}; + +static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) +{ + if (flags & GM_MAPPING_TYPE_MASK) + gm_mapping->flag &= ~GM_MAPPING_TYPE_MASK; + + gm_mapping->flag |= flags; +} + +static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int flags) +{ + gm_mapping->flag &= ~flags; +} + +static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_MAPPING_CPU); +} + +static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_MAPPING_DEVICE); +} + +static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_MAPPING_NOMAP); +} + +#define test_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define set_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } + +/* GMEM Device KPI */ +extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev); +extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); +extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); +extern int gm_dev_register_hnode(struct gm_dev *dev); +enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, + struct gm_dev *dev, int behavior); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); + +/* GMEM address space KPI */ +extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, + unsigned long cache_quantum, struct gm_as **new_as); +extern enum gm_ret gm_as_destroy(struct gm_as *as); +extern enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx); +extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned long size, + unsigned long align, unsigned long no_cross, unsigned long max_va, + struct gm_region **new_region); + +extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); +extern int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size); + +enum gmem_stats_item { + NR_PAGE_MIGRATING_H2D, + NR_PAGE_MIGRATING_D2H, + NR_GMEM_STAT_ITEMS +}; + +extern void gmem_stats_counter(enum gmem_stats_item item, int val); +extern void gmem_stats_counter_show(void); + +/* h-NUMA topology */ +struct hnode { + unsigned int id; + struct gm_dev *dev; + + struct task_struct *swapd_task; 
+ + struct list_head freelist; + struct list_head activelist; + spinlock_t freelist_lock; + spinlock_t activelist_lock; + atomic_t nr_free_pages; + atomic_t nr_active_pages; + + unsigned long max_memsize; + + bool import_failed; +}; + +static inline void hnode_active_pages_inc(struct hnode *hnode) +{ + atomic_inc(&hnode->nr_active_pages); +} + +static inline void hnode_active_pages_dec(struct hnode *hnode) +{ + atomic_dec(&hnode->nr_active_pages); +} + +static inline void hnode_free_pages_inc(struct hnode *hnode) +{ + atomic_inc(&hnode->nr_free_pages); +} + +static inline void hnode_free_pages_dec(struct hnode *hnode) +{ + atomic_dec(&hnode->nr_free_pages); +} + +static inline bool is_hnode(int node) +{ + return (node < MAX_NUMNODES) && !node_isset(node, node_possible_map) && + node_isset(node, hnode_map); +} + +static inline int get_hnuma_id(struct gm_dev *gm_dev) +{ + return first_node(gm_dev->registered_hnodes); +} + +void __init hnuma_init(void); +unsigned int alloc_hnode_id(void); +void free_hnode_id(unsigned int nid); +struct hnode *get_hnode(unsigned int hnid); +struct gm_dev *get_gm_dev(unsigned int nid); +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); +void hnode_deinit(unsigned int hnid, struct gm_dev *dev); + +struct gm_page { + struct list_head gm_page_list; + + unsigned long flags; + unsigned long dev_pfn; + unsigned long dev_dma_addr; + unsigned int hnid; + + /* + * The same functionality as rmap, we need know which process + * maps to this gm_page with which virtual address. + * */ + unsigned long va; + struct mm_struct *mm; + spinlock_t rmap_lock; + + unsigned int flag; + atomic_t refcount; +}; + +#define GM_PAGE_EVICTING 0x1 +#define GM_PAGE_PINNED 0x2 + +static inline void gm_page_flags_set(struct gm_page *gm_page, int flags) +{ + gm_page->flag |= flags; +} + +static inline void gm_page_flags_clear(struct gm_page *gm_page, int flags) +{ + gm_page->flag &= ~flags; +} + +static inline bool gm_page_evicting(struct gm_page *gm_page) +{ + return !!(gm_page->flag & GM_PAGE_EVICTING); +} + +static inline bool gm_page_pinned(struct gm_page *gm_page) +{ + return !!(gm_page->flag & GM_PAGE_PINNED); +} + +#define NUM_IMPORT_PAGES 16 + +int __init gm_page_cachep_init(void); +void gm_page_cachep_destroy(void); +struct gm_page *alloc_gm_page_struct(void); +void hnode_freelist_add(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); +void mark_gm_page_active(struct gm_page *gm_page); +void mark_gm_page_pinned(struct gm_page *gm_page); +void mark_gm_page_unpinned(struct gm_page *gm_page); +void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va); +void gm_page_remove_rmap(struct gm_page *gm_page); +int gm_add_pages(unsigned int hnid, struct list_head *pages); +void gm_free_page(struct gm_page *gm_page); +struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode); + +static inline void get_gm_page(struct gm_page *gm_page) +{ + atomic_inc(&gm_page->refcount); +} + +static inline void put_gm_page(struct gm_page *gm_page) +{ + if (atomic_dec_and_test(&gm_page->refcount)) + gm_free_page(gm_page); +} + +int hnode_init_sysfs(unsigned int hnid); +int gm_init_sysfs(void); +void gm_deinit_sysfs(void); + +#define gmem_err(fmt, ...) 
\ + ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) + +#endif /* _GMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f36bf9ee02f7e9ae1a3d9cc8e16666d0260d663..5850701096ca0bdffcb315e8cf736240abd07731 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -342,6 +342,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) + +#ifdef CONFIG_GMEM +#define VM_PEER_SHARED BIT(56) +#else +#define VM_PEER_SHARED VM_NONE +#endif #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -3404,6 +3410,12 @@ unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +#ifdef CONFIG_GMEM +extern unsigned long get_unmapped_area_aligned(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags, unsigned long align); +#endif + extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); @@ -4213,4 +4225,28 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma); /* added to mm.h to avoid every caller adding new header file */ #include + +#ifdef CONFIG_GMEM +DECLARE_STATIC_KEY_FALSE(gmem_status); + +static inline bool gmem_is_enabled(void) +{ + return static_branch_likely(&gmem_status); +} + +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + if (!gmem_is_enabled()) + return false; + + return !!(vma->vm_flags & VM_PEER_SHARED); +} +#else +static inline bool gmem_is_enabled(void) { return false; } +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 64c38b09e18d5579dd362cc160f68d6535c70428..f012f7c7c4d4a11c5532e33bf4331ce114687233 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -612,6 +612,74 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +#ifdef CONFIG_GMEM +/* + * Defines a centralized logical mapping table that reflects the mapping information + * regardless of the underlying arch-specific MMUs. + * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well + * as the filemap address_space struct from Linux page cache. + * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that + * the coordiantion between page tables must happen with CPU page table involved. That + * is to say, a generalized process unit must involve in a UVA-programming model, otherwise + * there is no point to support UVA programming. + * However, a VMA only needs to maintain logical mappings if the process has been + * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless + * we later build a reservation system on top of the logical mapping tables to support + * reservation-based superpages and rangeTLBs). + * A GM_REGION does not need to maintain logical mappings. In the case that a device wants + * to support its private address space with local physical memory, GMEM should forward address + * space management to the core VM, using VMAs, instead of using GM_REGIONs. 
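 *
 * As an illustrative sketch (assumed usage, not taken from this patch), code
 * that consults the logical page table is expected to check
 * vma_is_peer_shared() first, since ordinary VMAs never populate vm_obj:
 *
 *	if (vma_is_peer_shared(vma)) {
 *		struct gm_mapping *gm_mapping =
 *			vm_object_lookup(vma->vm_obj, addr & ~(HPAGE_SIZE - 1));
 *		...	// gm_mapping records whether addr lives on the CPU or a device
 *	}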
+ */ +struct vm_object { + spinlock_t lock; + struct vm_area_struct *vma; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; + + /* + * a vm object might be referred by multiple VMAs to share + * memory. + */ + atomic_t ref_count; +}; + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. */ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of struct gm_as */ + struct rb_root rbroot; /*root of gm_region_t */ + enum gm_as_alloc policy; + unsigned long start_va; + unsigned long end_va; + /* defines the VA unit size if an object cache is applied */ + unsigned long cache_quantum; + /* tracks device contexts attached to this va space, using gm_as_link */ + struct list_head gm_ctx_list; +}; +#endif + struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. */ @@ -735,7 +803,11 @@ struct vm_area_struct { #ifdef CONFIG_SHARE_POOL struct sp_area *spa; #endif +#ifdef CONFIG_GMEM + KABI_USE(1, struct vm_object *vm_obj) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -1016,7 +1088,11 @@ struct mm_struct { #else KABI_RESERVE(1) #endif +#ifdef CONFIG_GMEM + KABI_USE(2, struct gm_as *gm_as) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5) diff --git a/include/linux/mman.h b/include/linux/mman.h index 8ddca62d6460bd461b8afff731bb64a5203b822a..30ec68346f6b0409155afbf32aa3d40e8afb305b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -55,7 +55,8 @@ | MAP_32BIT \ | MAP_ABOVE4G \ | MAP_HUGE_2MB \ - | MAP_HUGE_1GB) + | MAP_HUGE_1GB \ + | MAP_PEER_SHARED) extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 8d07116caaf1b037c3121bd8ca5011dd4568cdc2..f005f3d903aedc52d0d9423f3077b6cfedd10865 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -407,6 +407,11 @@ enum node_states { N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ +#ifdef CONFIG_GMEM +#ifndef __GENKSYMS__ + N_HETEROGENEOUS, /* The node has heterogeneous memory */ +#endif +#endif NR_NODE_STATES }; @@ -536,6 +541,13 @@ static inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +#ifdef CONFIG_GMEM +/* For h-NUMA topology */ +#define hnode_map node_states[N_HETEROGENEOUS] +#define num_hnodes() num_node_state(N_HETEROGENEOUS) +#define for_each_hnode(node) for_each_node_state(node, N_HETEROGENEOUS) +#endif + /* * For nodemask scratch area. 
* NODEMASK_ALLOC(type, name) allocates an object with a specified type and diff --git a/include/linux/remote_pager/msg_chan.h b/include/linux/remote_pager/msg_chan.h new file mode 100644 index 0000000000000000000000000000000000000000..a8049def052d6686a59474846b83c59576cd2263 --- /dev/null +++ b/include/linux/remote_pager/msg_chan.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __RPG_MSG_CHAN_H__ +#define __RPG_MSG_CHAN_H__ + +#include + +/* + * struct phys_channel_ops - Channel physical layer ops + * @open: Open the communication channel of node nid and alloc physical resources, + * returns the channel ID + * @notify: Notify peer of chan_id to receive messages + * @copy_to: Copy the msg_data message from origin to peer + * @copy_from: Copy the msg_data message from peer to origin + * @close: Close channel and free physical resources + */ +struct phys_channel_ops { + char *name; + int (*open)(int nid); + int (*notify)(int chan_id); + int (*copy_to)(int chan_id, void *msg_data, size_t msg_len, int flags); + int (*copy_from)(int chan_id, void *buf, size_t len, int flags); + int (*migrate_page)(void *peer_addr, struct page *local_page, size_t size, int dir); + int (*close)(int chan_id); +}; + +int msg_layer_install_phy_ops(struct phys_channel_ops *ops, int default_chan_id); +int msg_layer_uninstall_phy_ops(struct phys_channel_ops *ops); + +#define log_err(fmt, ...) pr_err("[%s:%d]" fmt, __func__, __LINE__, ##__VA_ARGS__) +#define log_info(fmt, ...) pr_info("[%s:%d]" fmt, __func__, __LINE__, ##__VA_ARGS__) + +#define MSG_CMD_START 0x1 +#define MSG_CMD_IRQ_END 0x2 +#define MSG_CMD_FIFO_NO_MEM 0x3 +#define MSG_CMD_CHANN_OPEN 0x4 + +#define CHAN_STAT_ENABLE 1 +#define CHAN_STAT_DISABLE 0 + +#define TO_PEER 0 +#define FROM_PEER 1 + +#endif diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h new file mode 100644 index 0000000000000000000000000000000000000000..480bb12fb6a351bad891295380bd44bd4801fe8a --- /dev/null +++ b/include/linux/vm_object.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VM_OBJECT_H +#define _VM_OBJECT_H + +#include +#include + +#ifdef CONFIG_GMEM +/* vm_object KPI */ +int __init vm_object_init(void); +struct vm_object *vm_object_create(struct vm_area_struct *vma); +void vm_object_drop_locked(struct vm_area_struct *vma); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared); +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end); +void vm_object_merge(struct vm_area_struct *vma, unsigned long addr); +void vm_object_split(struct vm_area_struct *old_vma, struct vm_area_struct *new_vma); +void dup_peer_shared_vma(struct vm_area_struct *vma); + +struct gm_mapping *alloc_gm_mapping(void); +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); +void vm_object_mapping_create(struct vm_object *obj, unsigned long start); +void free_gm_mappings(struct vm_area_struct *vma); +#else +static inline void __init vm_object_init(void) {} +static inline struct vm_object *vm_object_create(struct vm_area_struct *vma) { return NULL; } +static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void dup_vm_object(struct vm_area_struct *dst, + struct vm_area_struct *src, bool dst_peer_shared) {} +static inline void dup_peer_shared_vma(struct vm_area_struct *vma) {} +static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end) {} + +static inline struct gm_mapping 
*alloc_gm_mapping(void) { return NULL; } +static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, + unsigned long va) { return NULL; } +static inline void vm_object_mapping_create(struct vm_object *obj, + unsigned long start) {} +static inline void free_gm_mappings(struct vm_area_struct *vma) {} +#endif + +#endif /* _VM_OBJECT_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 14e5498efd7acab203c0d43e48e0536ed52ffead..19e22492a85b8d24e9d094b90a759e38ccb24154 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,6 +33,8 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ +#define MAP_PEER_SHARED 0x1000000 + /* * Flags for mlock */ @@ -79,6 +81,12 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* for hmadvise */ +#define MADV_GMEM_BASE 0x1000 +#define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ +#define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ +#define MADV_PINNED_REMOVE (MADV_GMEM_BASE+2) /* unpin these pages */ + #define MADV_ETMEM_BASE 0x1100 #define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ #define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) diff --git a/kernel/fork.c b/kernel/fork.c index 78663ca681600ff7b78150acb521d115e3f1f1a9..7c7f87bd1110a0f5654db12f7dcd94ac3a7a72af 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -99,6 +99,11 @@ #include #include #include + +#ifdef CONFIG_GMEM +#include +#endif + #ifdef CONFIG_QOS_SCHED_SMART_GRID #include #endif @@ -110,10 +115,15 @@ #include #include #include + #ifdef CONFIG_FAST_SYSCALL #include #endif +#ifdef CONFIG_GMEM +#include +#endif + #include #define CREATE_TRACE_POINTS @@ -526,6 +536,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); +#ifdef CONFIG_GMEM + dup_peer_shared_vma(new); +#endif + return new; } @@ -551,6 +565,10 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) void vm_area_free(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + vm_object_drop_locked(vma); +#endif #ifdef CONFIG_PER_VMA_LOCK call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); #else @@ -1766,7 +1784,9 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, err = dup_mmap(mm, oldmm); if (err) goto free_pt; - +#ifdef CONFIG_GMEM + mm->gm_as = NULL; +#endif mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; diff --git a/mm/Kconfig b/mm/Kconfig index bdd8372552ffd0fd17a1c879c5fe1545f99f0f0c..829a0d6a0fb5e50caef77fe02423ecd053119b1c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1480,6 +1480,21 @@ config NUMABALANCING_MEM_SAMPLING if unsure, say N to disable the NUMABALANCING_MEM_SAMPLING. +config GMEM + bool "gmem subsystem for multi-MMU cooperative management" + depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE + select ARCH_USES_HIGH_VMA_FLAGS + default y + help + This provides a high-level interface that decouples MMU-specific functions. + Device drivers can thus attach themselves to a process’s address space and + let the OS take charge of their memory management. This eliminates + the need for device drivers to reinvent the wheel and allows them to + benefit from general memory optimizations integrated by GMEM. 
+ + say Y here to enable gmem subsystem + + source "mm/damon/Kconfig" config THP_CONTROL diff --git a/mm/Makefile b/mm/Makefile index 08fcaca0d8cd1b8743e5781df785c72e47fa6b45..db7c51e1f563181c982fa027dc8d27ec801eb471 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o - +mmu-$(CONFIG_GMEM) += gmem.o gmem_phys.o gmem_stat.o vm_object.o ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/gmem.c b/mm/gmem.c new file mode 100644 index 0000000000000000000000000000000000000000..227717b2408e70440a57b9288611e9ae70865c68 --- /dev/null +++ b/mm/gmem.c @@ -0,0 +1,1109 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(gmem_status); +EXPORT_SYMBOL_GPL(gmem_status); + +static struct kmem_cache *gm_as_cache; +static struct kmem_cache *gm_dev_cache; +static struct kmem_cache *gm_ctx_cache; +static struct kmem_cache *gm_region_cache; +static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); + +static bool enable_gmem; + +static inline unsigned long pe_mask(unsigned int order) +{ + if (order == 0) + return PAGE_MASK; + if (order == PMD_ORDER) + return HPAGE_PMD_MASK; + if (order == PUD_ORDER) + return HPAGE_PUD_MASK; + return 0; +} + +static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; + +void gmem_stats_counter(enum gmem_stats_item item, int val) +{ + if (!gmem_is_enabled()) + return; + + if (WARN_ON_ONCE(unlikely(item >= NR_GMEM_STAT_ITEMS))) + return; + + percpu_counter_add(&g_gmem_stats[item], val); +} + +static int gmem_stats_init(void) +{ + int i, rc; + + for (i = 0; i < NR_GMEM_STAT_ITEMS; i++) { + rc = percpu_counter_init(&g_gmem_stats[i], 0, GFP_KERNEL); + if (rc) { + int j; + + for (j = i-1; j >= 0; j--) + percpu_counter_destroy(&g_gmem_stats[j]); + + break; /* break the initialization process */ + } + } + + return rc; +} + +#ifdef CONFIG_PROC_FS +static int gmem_stats_show(struct seq_file *m, void *arg) +{ + if (!gmem_is_enabled()) + return 0; + + seq_printf( + m, "migrating H2D : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING_H2D])); + seq_printf( + m, "migrating D2H : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING_D2H])); + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static struct workqueue_struct *prefetch_wq; + +#define GM_WORK_CONCURRENCY 4 + +static int __init gmem_init(void) +{ + int err = -ENOMEM; + + if (!enable_gmem) + return 0; + + gm_as_cache = KMEM_CACHE(gm_as, 0); + if (!gm_as_cache) + goto out; + + gm_dev_cache = KMEM_CACHE(gm_dev, 0); + if (!gm_dev_cache) + goto free_as; + + gm_ctx_cache = KMEM_CACHE(gm_context, 0); + if (!gm_ctx_cache) + goto free_dev; + + gm_region_cache = KMEM_CACHE(gm_region, 0); + if (!gm_region_cache) + goto free_ctx; + + err = gm_page_cachep_init(); + if (err) + goto free_region; + + err = gm_init_sysfs(); + if (err) + goto free_gm_page; + + err = vm_object_init(); + if (err) + goto free_gm_sysfs; + + err = 
gmem_stats_init(); + if (err) + goto free_region; + + prefetch_wq = alloc_workqueue("prefetch", + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); + if (!prefetch_wq) { + gmem_err("fail to alloc workqueue prefetch_wq\n"); + err = -EFAULT; + goto free_region; + } + +#ifdef CONFIG_PROC_FS + proc_create_single("gmemstats", 0444, NULL, gmem_stats_show); +#endif + + static_branch_enable(&gmem_status); + + return 0; + +free_gm_sysfs: + gm_deinit_sysfs(); +free_gm_page: + gm_page_cachep_destroy(); +free_region: + kmem_cache_destroy(gm_region_cache); +free_ctx: + kmem_cache_destroy(gm_ctx_cache); +free_dev: + kmem_cache_destroy(gm_dev_cache); +free_as: + kmem_cache_destroy(gm_as_cache); +out: + return -ENOMEM; +} +subsys_initcall(gmem_init); + +static int __init setup_gmem(char *str) +{ + strtobool(str, &enable_gmem); + + return 1; +} +__setup("gmem=", setup_gmem); + +/* + * Create a GMEM device, register its MMU function and the page table. + * The returned device pointer will be passed by new_dev. + * A unique id will be assigned to the GMEM device, using Linux's xarray. + */ +enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev) +{ + struct gm_dev *dev; + + if (!gmem_is_enabled()) + return GM_RET_FAILURE_UNKNOWN; + + dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); + if (!dev) + return GM_RET_NOMEM; + + if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, + GFP_KERNEL)) { + kmem_cache_free(gm_dev_cache, dev); + return GM_RET_NOMEM; + } + + dev->capability = cap; + dev->mmu = mmu; + dev->dev_data = dev_data; + dev->current_ctx = NULL; + INIT_LIST_HEAD(&dev->gm_ctx_list); + *new_dev = dev; + nodes_clear(dev->registered_hnodes); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_create); + +/* Handle the page fault triggered by a given device with mmap lock*/ +enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + int behavior) +{ + enum gm_ret ret = GM_RET_SUCCESS; + struct gm_mmu *mmu = dev->mmu; + struct hnode *hnode; + struct device *dma_dev = dev->dma_dev; + struct vm_area_struct *vma; + struct vm_object *obj; + struct gm_mapping *gm_mapping; + struct gm_page *gm_page; + unsigned long size = HPAGE_SIZE; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior + }; + struct page *page = NULL; + + hnode = get_hnode(get_hnuma_id(dev)); + if (!hnode) { + gmem_err("gmem device should correspond to a hnuma node"); + ret = -EINVAL; + goto out; + } + + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) { + gmem_err("%s failed to find vma", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + obj = vma->vm_obj; + if (!obj) { + gmem_err("%s no vm_obj", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + vm_object_mapping_create(obj, addr); + gm_mapping = vm_object_lookup(obj, addr); + } + xa_unlock(obj->logical_page_table); + + if (unlikely(!gm_mapping)) { + gmem_err("OOM when creating vm_obj!"); + ret = GM_RET_NOMEM; + goto out; + } + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + goto peer_map; + } else if (gm_mapping_device(gm_mapping)) { + switch (behavior) { + case MADV_PINNED: + mark_gm_page_pinned(gm_mapping->gm_page); + fallthrough; + case MADV_WILLNEED: + mark_gm_page_active(gm_mapping->gm_page); + goto unlock; + case MADV_PINNED_REMOVE: + 
mark_gm_page_unpinned(gm_mapping->gm_page); + goto unlock; + default: + ret = 0; + goto unlock; + } + } else if (gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (!page) { + gmem_err("host gm_mapping page is NULL. Set nomap"); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); + goto unlock; + } + get_page(page); + /* zap_page_range_single can be used in Linux 6.4 and later versions. */ + zap_page_range_single(vma, addr, size, NULL); + gmf.dma_addr = + dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) + gmem_err("dma map failed"); + + gmf.copy = true; + } + +peer_map: + gm_page = gm_alloc_page(mm, hnode); + if (!gm_page) { + gmem_err("Alloc gm_page for device fault failed."); + ret = -ENOMEM; + goto unlock; + } + + gmf.pfn = gm_page->dev_pfn; + + ret = mmu->peer_map(&gmf); + if (ret != GM_RET_SUCCESS) { + gmem_err("peer map failed"); + if (page) + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + put_gm_page(gm_page); + goto unlock; + } + + if (page) { + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + folio_put(page_folio(page)); + } + + gm_mapping_flags_set(gm_mapping, GM_MAPPING_DEVICE); + gm_mapping->dev = dev; + gm_page_add_rmap(gm_page, mm, addr); + gm_mapping->gm_page = gm_page; + if (behavior == MADV_PINNED) { + mark_gm_page_pinned(gm_page); + } else if (behavior == MADV_PINNED_REMOVE) { + mark_gm_page_unpinned(gm_page); + } + hnode_activelist_add(hnode, gm_page); + hnode_active_pages_inc(hnode); +unlock: + mutex_unlock(&gm_mapping->lock); +out: + return ret; +} +EXPORT_SYMBOL_GPL(gm_dev_fault_locked); + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, + unsigned int order) +{ + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & pe_mask(order); + struct vm_object *obj = vma->vm_obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_dev *dev; + struct hnode *hnode; + struct device *dma_dev; + struct gm_fault_t gmf = { + .mm = vma->vm_mm, + .va = addr, + .size = size, + .copy = true, + }; + + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + gmem_err("host fault gm_mapping should not be NULL\n"); + return VM_FAULT_SIGBUS; + } + + dev = gm_mapping->dev; + gmf.dev = dev; + gmf.pfn = gm_mapping->gm_page->dev_pfn; + dma_dev = dev->dma_dev; + gmf.dma_addr = + dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + gmem_err("host fault dma mapping error\n"); + return VM_FAULT_SIGBUS; + } + if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { + gmem_err("peer unmap failed\n"); + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return VM_FAULT_SIGBUS; + } + + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + hnode = get_hnode(gm_mapping->gm_page->hnid); + gm_page_remove_rmap(gm_mapping->gm_page); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + return ret; +} + +/* GMEM Virtual Address Space API */ +enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, + unsigned long cache_quantum, struct gm_as **new_as) +{ + struct gm_as *as; + + if (!new_as) + return -EINVAL; + + as = kmem_cache_alloc(gm_as_cache, GFP_ATOMIC); + if (!as) + return -ENOMEM; + + spin_lock_init(&as->rbtree_lock); + as->rbroot = RB_ROOT; + as->start_va = begin; + as->end_va = end; + as->policy = policy; + + INIT_LIST_HEAD(&as->gm_ctx_list); + + 
*new_as = as; + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_create); + +enum gm_ret gm_as_destroy(struct gm_as *as) +{ + struct gm_context *ctx, *tmp_ctx; + + list_for_each_entry_safe(ctx, tmp_ctx, &as->gm_ctx_list, gm_as_link) + kfree(ctx); + + kmem_cache_free(gm_as_cache, as); + + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_destroy); + +enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx) +{ + struct gm_context *ctx; + int nid; + int ret; + + ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); + if (!ctx) + return GM_RET_NOMEM; + + ctx->as = as; + ctx->dev = dev; + ctx->pmap = NULL; + ret = dev->mmu->pmap_create(dev, &ctx->pmap); + if (ret) { + kmem_cache_free(gm_ctx_cache, ctx); + return ret; + } + + INIT_LIST_HEAD(&ctx->gm_dev_link); + INIT_LIST_HEAD(&ctx->gm_as_link); + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); + list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); + + if (activate) { + /* + * Here we should really have a callback function to perform the context switch + * for the hardware. E.g. in x86 this function is effectively + * flushing the CR3 value. Currently we do not care time-sliced context switch, + * unless someone wants to support it. + */ + dev->current_ctx = ctx; + } + *out_ctx = ctx; + + /* + * gm_as_attach will be used to attach device to process address space. + * Handle this case and add hnodes registered by device to process mems_allowed. + */ + for_each_node_mask(nid, dev->registered_hnodes) + node_set(nid, current->mems_allowed); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_attach); + +struct prefetch_data { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long addr; + size_t size; + struct work_struct work; + int behavior; + int *res; +}; + +static void prefetch_work_cb(struct work_struct *work) +{ + struct prefetch_data *d = + container_of(work, struct prefetch_data, work); + unsigned long addr = d->addr, end = d->addr + d->size; + int page_size = HPAGE_SIZE; + int ret; + + do { + /* MADV_WILLNEED: dev will soon access this addr. */ + mmap_read_lock(d->mm); + ret = gm_dev_fault_locked(d->mm, addr, d->dev, d->behavior); + mmap_read_unlock(d->mm); + if (ret == GM_RET_PAGE_EXIST) { + gmem_err("%s: device has done page fault, ignore prefetch\n", + __func__); + } else if (ret != GM_RET_SUCCESS) { + *d->res = -EFAULT; + gmem_err("%s: call dev fault error %d\n", __func__, ret); + } + } while (addr += page_size, addr != end); + + kfree(d); +} + +static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size, int behavior) +{ + unsigned long start, end, per_size; + int page_size = HPAGE_SIZE; + struct prefetch_data *data; + struct vm_area_struct *vma; + int res = GM_RET_SUCCESS; + unsigned long old_start; + + /* overflow */ + if (check_add_overflow(addr, size, &end)) { + gmem_err("addr plus size will cause overflow!\n"); + return -EINVAL; + } + + old_start = end; + + /* Align addr by rounding outward to make page cover addr. 
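 * For example, with 2MB huge pages an addr of 0x200100 and size of 0x1000
 * round to start = 0x200000 and end = 0x400000, so the single huge page
 * covering the requested bytes is prefetched.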
*/ + end = round_up(end, page_size); + start = round_down(addr, page_size); + size = end - start; + + if (!end && old_start) { + gmem_err("end addr align up 2M causes invalid addr\n"); + return -EINVAL; + } + + if (size == 0) + return 0; + + mmap_read_lock(current->mm); + vma = find_vma(current->mm, start); + if (!vma || start < vma->vm_start || end > vma->vm_end) { + mmap_read_unlock(current->mm); + gmem_err("failed to find vma by invalid start or size.\n"); + return GM_RET_FAILURE_UNKNOWN; + } else if (!vma_is_peer_shared(vma)) { + mmap_read_unlock(current->mm); + gmem_err("%s the vma does not use VM_PEER_SHARED\n", __func__); + return GM_RET_FAILURE_UNKNOWN; + } + mmap_read_unlock(current->mm); + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (start < end) { + data = kzalloc(sizeof(struct prefetch_data), GFP_KERNEL); + if (!data) { + flush_workqueue(prefetch_wq); + return GM_RET_NOMEM; + } + + INIT_WORK(&data->work, prefetch_work_cb); + data->mm = current->mm; + data->dev = dev; + data->addr = start; + data->behavior = behavior; + data->res = &res; + if (per_size == 0) + data->size = size; + else + /* Process (1.x * per_size) for the last time */ + data->size = (end - start < 2 * per_size) ? + (end - start) : + per_size; + queue_work(prefetch_wq, &data->work); + start += data->size; + } + + flush_workqueue(prefetch_wq); + return res; +} + +static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int page_size) +{ + struct gm_fault_t gmf = { + .mm = current->mm, + .size = page_size, + .copy = false, + }; + struct gm_mapping *gm_mapping; + struct vm_object *obj; + struct hnode *hnode; + int ret; + + obj = vma->vm_obj; + if (!obj) { + gmem_err("peer-shared vma should have vm_object\n"); + return -EINVAL; + } + + for (; start < end; start += page_size) { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } else if (gm_mapping_cpu(gm_mapping)) { + zap_page_range_single(vma, start, page_size, NULL); + } else { + gmf.va = start; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret) { + gmem_err("peer_unmap failed. ret %d\n", ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + hnode = get_hnode(gm_mapping->gm_page->hnid); + gm_page_remove_rmap(gm_mapping->gm_page); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + } + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); + mutex_unlock(&gm_mapping->lock); + } + + return 0; +} + +static int hmadvise_do_eagerfree(unsigned long addr, size_t size) +{ + unsigned long start, end, i_start, i_end; + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + unsigned long old_start; + + /* overflow */ + if (check_add_overflow(addr, size, &end)) { + gmem_err("addr plus size will cause overflow!\n"); + return -EINVAL; + } + + old_start = addr; + + /* Align addr by rounding inward to avoid excessive page release. 
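 * For example, with 2MB huge pages an addr of 0x200100 and size of 0x400000
 * round to start = 0x400000 and end = 0x600000, so only the huge page that
 * lies entirely inside the requested range is released.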
*/ + end = round_down(end, page_size); + start = round_up(addr, page_size); + if (start >= end) { + pr_debug("gmem:start align up 2M >= end align down 2M.\n"); + return ret; + } + + /* Check to see whether len was rounded up from small -ve to zero */ + if (old_start && !start) { + gmem_err("start addr align up 2M causes invalid addr"); + return -EINVAL; + } + + mmap_read_lock(current->mm); + do { + vma = find_vma_intersection(current->mm, start, end); + if (!vma) { + gmem_err("gmem: there is no valid vma\n"); + break; + } + + if (!vma_is_peer_shared(vma)) { + pr_debug("gmem:not peer-shared vma, skip dontneed\n"); + start = vma->vm_end; + continue; + } + + i_start = start > vma->vm_start ? start : vma->vm_start; + i_end = end < vma->vm_end ? end : vma->vm_end; + ret = gmem_unmap_vma_pages(vma, i_start, i_end, page_size); + if (ret) + break; + + start = vma->vm_end; + } while (start < end); + + mmap_read_unlock(current->mm); + return ret; +} + +static bool check_hmadvise_behavior(int behavior) +{ + return behavior == MADV_DONTNEED; +} + +int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) +{ + int error = -EINVAL; + struct gm_dev *dev = NULL; + + if (hnid == -1) { + if (check_hmadvise_behavior(behavior)) { + goto no_hnid; + } else { + gmem_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); + return error; + } + } + + if (hnid < 0) { + gmem_err("hmadvise: invalid hnid %d < 0\n", hnid); + return error; + } + + if (!is_hnode(hnid)) { + gmem_err("hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); + return error; + } + + dev = get_gm_dev(hnid); + if (!dev) { + gmem_err("hmadvise: hnode id %d is invalid\n", hnid); + return error; + } + +no_hnid: + switch (behavior) { + case MADV_PREFETCH: + behavior = MADV_WILLNEED; + fallthrough; + case MADV_PINNED_REMOVE: + fallthrough; + case MADV_PINNED: + return hmadvise_do_prefetch(dev, start, len_in, behavior); + case MADV_DONTNEED: + return hmadvise_do_eagerfree(start, len_in); + default: + gmem_err("hmadvise: unsupported behavior %d\n", behavior); + } + + return error; +} +EXPORT_SYMBOL_GPL(hmadvise_inner); + +static bool hnid_match_dest(int hnid, struct gm_mapping *dest) +{ + return (hnid < 0) ? 
gm_mapping_cpu(dest) : gm_mapping_device(dest); +} + +static void cpu_page_copy(struct page *dst_page, unsigned long dst_offset, + struct page *src_page, unsigned long src_offset, size_t size) +{ + unsigned long src, dst; + + src = (unsigned long)page_address(src_page) + src_offset; + dst = (unsigned long)page_address(dst_page) + dst_offset; + if (!src || !dst) { + gmem_err("%s: src (%lx) or dst (%lx) is invalid!", src, dst); + return; + } + memcpy((void *)dst, (void *)src, size); +} + +static enum gmem_copy_dir { + COPY_GMEM_TO_NORM, + COPY_NORM_TO_GMEM, + COPY_GMEM_TO_GMEM, +}; + +static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, + unsigned long src, size_t size) +{ + enum gm_ret ret; + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma_dest, *vma_src; + struct gm_mapping *gm_mapping_dest, *gm_mapping_src; + struct gm_dev *dev = NULL; + struct gm_memcpy_t gmc = {0}; + enum gmem_copy_dir dir; + struct page *trans_hpage; + void *trans_addr; + + if (size == 0) + return; + + mmap_read_lock(mm); + vma_dest = find_vma(mm, dest); + vma_src = find_vma(mm, src); + + if (!vma_src || vma_src->vm_start > src || !vma_dest || vma_dest->vm_start > dest) { + gmem_err("hmemcpy: the vma find by src/dest is NULL!"); + goto unlock_mm; + } + + if (vma_is_peer_shared(vma_src) && vma_is_peer_shared(vma_dest)) { + dir = COPY_GMEM_TO_GMEM; + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + } else if (vma_is_peer_shared(vma_src)) { + dir = COPY_GMEM_TO_NORM; + gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + gm_mapping_dest = NULL; + } else if (vma_is_peer_shared(vma_dest)) { + dir = COPY_NORM_TO_GMEM; + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mapping_src = NULL; + } else { + gmem_err("%s: src %lx and dest %lx both not gmem addr!", __func__, src, dest); + goto unlock_mm; + } + + trans_hpage = alloc_pages(GFP_TRANSHUGE, HPAGE_PMD_ORDER); + if (!trans_hpage) { + gmem_err("%s: alloc trans_hpage failed!", __func__); + goto unlock_mm; + } + trans_addr = page_to_virt(trans_hpage); + + if (dir != COPY_NORM_TO_GMEM && (!gm_mapping_src || gm_mapping_nomap(gm_mapping_src))) { + gmem_err("%s: gm_mapping_src is NULL or still not mapped! 
addr is %lx", __func__, src); + } + + if (hnid != -1) { + dev = get_gm_dev(hnid); + if (!dev) { + gmem_err("hmemcpy: hnode's dev is NULL"); + goto free_trans_page; + } + } + + // Trigger dest page fault on host or device + if (!gm_mapping_dest || gm_mapping_nomap(gm_mapping_dest) + || !hnid_match_dest(hnid, gm_mapping_dest)) { + if (hnid == -1) { + if (gm_mapping_dest && gm_mapping_device(gm_mapping_dest) && gm_page_pinned(gm_mapping_dest->gm_page)) { + gmem_err("%s: dest %lx is pinned on device, skip handle_mm_fault", __func__, dest); + } else { + ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | + FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); + if (ret) { + gmem_err("%s: failed to execute host page fault, ret:%d", + __func__, ret); + goto free_trans_page; + } + } + } else { + ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); + if (ret != GM_RET_SUCCESS) { + gmem_err("%s: failed to excecute dev page fault.", __func__); + goto free_trans_page; + } + } + } + if (!gm_mapping_dest && dir != COPY_GMEM_TO_NORM) + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); + + if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) + mutex_lock(&gm_mapping_dest->lock); + if (gm_mapping_src) + mutex_lock(&gm_mapping_src->lock); + // Use memcpy when there is no device address, otherwise use peer_memcpy + if (dir == COPY_GMEM_TO_NORM) { + if (!gm_mapping_src) { + gmem_err("%s: do COPY_GMEM_TO_NORM but gm_mapping_src is NULL!", __func__); + goto unlock_gm_mapping; + } + if (gm_mapping_cpu(gm_mapping_src)) { // host to host + cpu_page_copy(trans_hpage, (unsigned long)trans_addr & (page_size - 1), + gm_mapping_src->page, src & (page_size - 1), size); + goto copy_to_norm_dest; + } else if (gm_mapping_device(gm_mapping_src)) { // device to host + dev = gm_mapping_src->dev; + gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + + ((unsigned long)trans_addr & (page_size - 1))); + gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1)); + gmc.kind = GM_MEMCPY_D2H; + } else { + gmem_err("gm_mapping_src bad status, dir is COPY_GMEM_TO_NORM"); + goto unlock_gm_mapping; + } + } else if (dir == COPY_NORM_TO_GMEM) { + if (!gm_mapping_dest) { + gmem_err("%s: do COPY_NORM_TO_GMEM but gm_mapping_dest is NULL!", __func__); + goto unlock_gm_mapping; + } + if (copy_from_user(trans_addr, (void __user *)src, size) > 0) + gmem_err("copy normal src %lx to trans failed", src); + if (gm_mapping_cpu(gm_mapping_dest)) { // host to host + cpu_page_copy(gm_mapping_dest->page, dest & (page_size - 1), + trans_hpage, (unsigned long)trans_addr & (page_size - 1), size); + goto unlock_gm_mapping; + } else if (gm_mapping_device(gm_mapping_dest)) { + if (!dev) { + gmem_err("%s: do COPY_NORM_TO_GMEM but dev is NULL, hnid is %d", __func__, hnid); + goto unlock_gm_mapping; + } + gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(trans_hapge) + + ((unsigned long)trans_addr & (page_size - 1))); + gmc.kind = GM_MEMCPY_H2D; + } else { // device to device + gmem_err("gm_mapping_dest bad status, dir is COPY_NORM_TO_GMEM\n"); + goto unlock_gm_mmaping; + } + } else if (dir == COPY_GMEM_TO_GMEM) { + if (gm_mapping_cpu(gm_mapping_src)) { + if (gm_mapping_cpu(gm_mapping_dest)) { + cpu_page_copy(gm_mapping_dest->page, dest & (page_size - 1), + gm_mapping_src->page, src & (page_size - 1), size); + goto unlock_gm_mapping; + } else if 
(gm_mapping_device(gm_mapping_dest)) { + dev = gm_mapping_dest->dev; + gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(gm_mapping_src->page) + + (src & (page_size - 1))); + gmc.kind = GM_MEMCPY_H2D; + } else { + gmem_err("gm_mapping_dest bad status, src is on host!"); + goto unlock_gm_mapping; + } + } else if (gm_mapping_device(gm_mapping_src)) { + if (gm_mapping_cpu(gm_mapping_dest)) { + dev = gm_mapping_src->dev; + gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(gm_mapping_dest->page) + + (dest & (page_size - 1))); + gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1)); + gmc.kind = GM_MEMCPY_D2H; + } else if (gm_mapping_device(gm_mapping_dest)) { + dev = gm_mapping_src->dev; + gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + + ((unsigned long)trans_addr & (page_size - 1))); + gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1)); + gmc.kind = GM_MEMCPY_D2H; + gmc.mm = mm; + gmc.dev = dev; + gmc.size = size; + dev->mmu->peer_hmemcpy(&gmc); + + dev = gm_mapping_dest->dev; + gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) + + ((unsigned long)trans_addr& (page_size - 1))); + gmc.kind = GM_MEMCPY_H2D; + gmc.mm = mm; + gmc.dev = dev; + gmc.size = size; + dev->mmu->peer_hmemcpy(&gmc); + + goto unlock_gm_mapping; + } else { + gmem_err("gm_mapping_dest bad status, src is on device!"); + goto unlock_gm_mapping; + } + } else { + gmem_err("gm_mapping_src bad status, dir is COPY_GMEM_TO_GMEM"); + goto unlock_gm_mapping; + } + } + gmc.mm = mm; + gmc.dev = dev; + gmc.size = size; + dev->mmu->peer_hmemcpy(&gmc); + +copy_to_norm_dest: + if (dir == COPY_GMEM_TO_NORM) { + if (copy_to_user((void __user *)dest, trans_addr, size) > 0) + gmem_err("copy trans to normal dest %lx failed!", dest); + } + +unlock_gm_mmaping: + if (gm_mapping_src) + mutex_unlock(&gm_mapping_src->lock); + if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) + mutex_unlock(&gm_mapping_dest->lock); +free_trans_page: + __free_pages(trans_hpage, HPAGE_PMD_ORDER); +unlock_mm: + mmap_read_unlock(mm); +} + +/* + * Each page needs to be copied in three parts when the address is not aligned. + * | ml <--0-->|<1><--2-> | + * | -------|--------- | + * | / /| / / | + * | / / | / / | + * | / / |/ / | + * | ----------|------ | + * | | | + * |<----page x---->|<----page y---->| + */ + +static void __hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) +{ + int i = 0; + // offsets within the huge page for the source and destination addresses + int src_offset = src & (HPAGE_SIZE - 1); + int dst_offset = dest & (HPAGE_SIZE - 1); + // Divide each page into three parts according to the align + int ml[3] = { + HPAGE_SIZE - (src_offset < dst_offset ? dst_offset : src_offset), + src_offset < dst_offset ? (dst_offset - src_offset) : (src_offset - dst_offset), + src_offset < dst_offset ? 
src_offset : dst_offset + }; + struct mm_struct *mm = current->mm; + + if (size == 0) + return; + + while (size >= ml[i]) { + if (ml[i] > 0) { + do_hmemcpy(mm, hnid, dest, src, ml[i]); + src += ml[i]; + dest += ml[i]; + size -= ml[i]; + } + i = (i + 1) % 3; + } + + if (size > 0) + do_hmemcpy(mm, hnid, dest, src, size); +} + +int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) +{ + struct vm_area_struct *vma_dest, *vma_src; + struct mm_struct *mm = current->mm; + + if (hnid < 0) { + if (hnid != -1) { + gmem_err("hmemcpy: invalid hnid %d < 0\n", hnid); + return -EINVAL; + } + } else if (!is_hnode(hnid)) { + gmem_err("hmemcpy: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); + return -EINVAL; + } + + mmap_read_lock(mm); + vma_dest = find_vma(mm, dest); + vma_src = find_vma(mm, src); + + if ((ULONG_MAX - size < src) || !vma_src || vma_src->vm_start > src || + vma_src->vm_end < (src + size)) { + gmem_err("failed to find peer_shared vma by invalid src or size\n"); + goto unlock; + } + + if ((ULONG_MAX - size < dest) || !vma_dest || vma_dest->vm_start > dest || + vma_dest->vm_end < (dest + size)) { + gmem_err("failed to find peer_shared vma by invalid dest or size\n"); + goto unlock; + } + + if (!vma_is_peer_shared(vma_src) && !vma_is_peer_shared(vma_dest)) { + mmap_read_unlock(mm); + return -EAGAIN; + } + + if (!(vma_dest->vm_flags & VM_WRITE)) { + gmem_err("dest is not writable.\n"); + goto unlock; + } + mmap_read_unlock(mm); + + __hmemcpy(hnid, dest, src, size); + + return 0; + +unlock: + mmap_read_unlock(mm); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(hmemcpy); + diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c new file mode 100644 index 0000000000000000000000000000000000000000..10531edccfc357879de6e7df6cb8eefbe6a6a9f3 --- /dev/null +++ b/mm/gmem_phys.c @@ -0,0 +1,563 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GMEM physical memory management. + * + * Copyright (C) 2025- Huawei, Inc. 
+ * Author: Bin Wang + * + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define NUM_SWAP_PAGES 16 +#define MAX_SWAP_RETRY_TIMES 10 + +static struct kmem_cache *gm_page_cachep; + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +void __init hnuma_init(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + for_each_node(node) + node_set(node, hnode_map); + spin_unlock(&hnode_lock); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) +{ + hnode->id = hnid; + hnode->dev = dev; + INIT_LIST_HEAD(&hnode->freelist); + INIT_LIST_HEAD(&hnode->activelist); + spin_lock_init(&hnode->freelist_lock); + spin_lock_init(&hnode->activelist_lock); + atomic_set(&hnode->nr_free_pages, 0); + atomic_set(&hnode->nr_active_pages, 0); + hnode->import_failed = false; + hnode->max_memsize = 0; + + node_set(hnid, dev->registered_hnodes); + hnodes[hnid] = hnode; +} + +void hnode_deinit(unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + hnodes[hnid] = NULL; +} + +struct hnode *get_hnode(unsigned int hnid) +{ + if (!hnodes[hnid]) + gmem_err("h-NUMA node for hnode id %u is NULL.", hnid); + return hnodes[hnid]; +} + +struct gm_dev *get_gm_dev(unsigned int nid) +{ + struct hnode *hnode; + struct gm_dev *dev = NULL; + + spin_lock(&hnode_lock); + hnode = get_hnode(nid); + if (hnode) + dev = hnode->dev; + spin_unlock(&hnode_lock); + return dev; +} + +static void init_swapd(struct hnode *hnode); + +int gm_dev_register_hnode(struct gm_dev *dev) +{ + unsigned int hnid; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + int ret; + + if (!hnode) + return -ENOMEM; + + spin_lock(&hnode_lock); + hnid = alloc_hnode_id(); + spin_unlock(&hnode_lock); + + if (hnid == MAX_NUMNODES) + goto free_hnode; + + ret = hnode_init_sysfs(hnid); + if (ret) + goto free_hnode; + + hnode_init(hnode, hnid, dev); + init_swapd(hnode); + + return GM_RET_SUCCESS; + +free_hnode: + kfree(hnode); + return -EBUSY; +} +EXPORT_SYMBOL_GPL(gm_dev_register_hnode); + +int __init gm_page_cachep_init(void) +{ + gm_page_cachep = KMEM_CACHE(gm_page, 0); + if (!gm_page_cachep) + return -EINVAL; + return 0; +} + +void gm_page_cachep_destroy(void) +{ + kmem_cache_destroy(gm_page_cachep); +} + +struct gm_page *alloc_gm_page_struct(void) +{ + struct gm_page *gm_page = kmem_cache_zalloc(gm_page_cachep, GFP_KERNEL); + + if (!gm_page) + return NULL; + atomic_set(&gm_page->refcount, 0); + spin_lock_init(&gm_page->rmap_lock); + return gm_page; +} +EXPORT_SYMBOL(alloc_gm_page_struct); + +void hnode_freelist_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->freelist_lock); + list_add(&gm_page->gm_page_list, &hnode->freelist); + spin_unlock(&hnode->freelist_lock); +} + +void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + list_add_tail(&gm_page->gm_page_list, &hnode->activelist); + spin_unlock(&hnode->activelist_lock); +} + +void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + /* If a gm_page is being evicted, it is currently located in the + * temporary linked list. 
*/ + if (!gm_page_evicting(gm_page)) + list_del_init(&gm_page->gm_page_list); + spin_unlock(&hnode->activelist_lock); +} + +void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + list_move_tail(&gm_page->gm_page_list, &hnode->activelist); + spin_unlock(&hnode->activelist_lock); +} + +void mark_gm_page_active(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + hnode_activelist_del_and_add(hnode, gm_page); +} + +void mark_gm_page_pinned(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + spin_lock(&hnode->activelist_lock); + if (gm_page_evicting(gm_page)) { + gmem_err("%s: maybe page has been evicted!", __func__); + goto unlock; + } else if (gm_page_pinned(gm_page)) { + goto unlock; + } + gm_page_flags_set(gm_page, GM_PAGE_PINNED); + +unlock: + spin_unlock(&hnode->activelist_lock); + return; +} + +void mark_gm_page_unpinned(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + spin_lock(&hnode->activelist_lock); + if (!gm_page_pinned(gm_page) || gm_page_evicting(gm_page)) { + goto unlock; + } + gm_page_flags_clear(gm_page, GM_PAGE_PINNED); + +unlock: + spin_unlock(&hnode->activelist_lock); + return; +} + +int gm_add_pages(unsigned int hnid, struct list_head *pages) +{ + struct hnode *hnode; + struct gm_page *gm_page, *n; + + hnode = get_hnode(hnid); + if (!hnode) + return -EINVAL; + + list_for_each_entry_safe(gm_page, n, pages, gm_page_list) { + list_del(&gm_page->gm_page_list); + hnode_freelist_add(hnode, gm_page); + hnode_free_pages_inc(hnode); + gm_page_flags_clear(gm_page, GM_PAGE_PINNED); + } + + return 0; +} +EXPORT_SYMBOL(gm_add_pages); + +void gm_free_page(struct gm_page *gm_page) +{ + struct hnode *hnode; + + hnode = get_hnode(gm_page->hnid); + if (!hnode) + return; + hnode_freelist_add(hnode, gm_page); + hnode_free_pages_inc(hnode); +} + +void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va) +{ + spin_lock(&gm_page->rmap_lock); + gm_page->mm = mm; + gm_page->va = va; + spin_unlock(&gm_page->rmap_lock); +} + +void gm_page_remove_rmap(struct gm_page *gm_page) +{ + spin_lock(&gm_page->rmap_lock); + gm_page->mm = NULL; + gm_page->va = 0; + spin_unlock(&gm_page->rmap_lock); +} + +enum gm_evict_ret { + GM_EVICT_SUCCESS = 0, + GM_EVICT_UNMAP, + GM_EVICT_FALLBACK, + GM_EVICT_DEVERR, +}; + +enum gm_evict_ret gm_evict_page_locked(struct gm_page *gm_page) +{ + struct gm_dev *gm_dev; + struct gm_mapping *gm_mapping; + struct vm_area_struct *vma; + struct mm_struct *mm; + struct page *page; + struct device *dma_dev; + unsigned long va; + struct folio *folio = NULL; + struct gm_fault_t gmf = { + .size = HPAGE_SIZE, + .copy = true + }; + enum gm_evict_ret ret = GM_EVICT_SUCCESS; + int err; + + gm_dev = get_gm_dev(gm_page->hnid); + if (!gm_dev) + return GM_EVICT_DEVERR; + + spin_lock(&gm_page->rmap_lock); + if (!gm_page->mm) { + /* Evicting gm_page conflicts with unmap.*/ + ret = GM_EVICT_UNMAP; + goto rmap_unlock; + } + + mm = gm_page->mm; + va = gm_page->va; + vma = find_vma(mm, va); + if (!vma || !vma->vm_obj) { + gmem_err("%s: cannot find vma or vma->vm_obj is null for va %lx", __func__, va); + ret = GM_EVICT_UNMAP; + goto rmap_unlock; + } + + gm_mapping = vm_object_lookup(vma->vm_obj, va); + if (!gm_mapping) { + gmem_err("%s: no gm_mapping for va %lx", __func__, va); + ret = GM_EVICT_UNMAP; + goto rmap_unlock; + } + + 
spin_unlock(&gm_page->rmap_lock); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + /* Evicting gm_page conflicts with unmap.*/ + ret = GM_EVICT_UNMAP; + goto gm_mapping_unlock; + } + + if (gm_mapping->gm_page != gm_page) { + /* gm_mapping maps to another gm_page. */ + ret = GM_EVICT_UNMAP; + goto gm_mapping_unlock; + } + + folio = vma_alloc_folio(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, va, true); + if (!folio) { + gmem_err("%s: allocate host page failed.", __func__); + ret = GM_EVICT_FALLBACK; + goto gm_mapping_unlock; + } + page = &folio->page; + + gmf.mm = mm; + gmf.va = va; + gmf.dev = gm_dev; + gmf.pfn = gm_page->dev_pfn; + dma_dev = gm_dev->dma_dev; + gmf.dma_addr = dma_map_page(dma_dev, page, 0, HPAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + gmem_err("%s: dma map failed.", __func__); + ret = GM_EVICT_FALLBACK; + goto gm_mapping_unlock; + } + + err = gm_dev->mmu->peer_unmap(&gmf); + if (err) { + gmem_err("%s: peer_unmap failed.", __func__); + ret = GM_EVICT_DEVERR; + goto dma_unmap; + } + + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + gm_page_remove_rmap(gm_page); + gm_mapping->page = page; + put_gm_page(gm_page); +dma_unmap: + dma_unmap_page(dma_dev, gmf.dma_addr, HPAGE_SIZE, DMA_BIDIRECTIONAL); +gm_mapping_unlock: + mutex_unlock(&gm_mapping->lock); + return ret; +rmap_unlock: + spin_unlock(&gm_page->rmap_lock); + return ret; +} + +enum gm_evict_ret gm_evict_page(struct gm_page *gm_page) +{ + struct mm_struct *mm = gm_page->mm; + enum gm_evict_ret ret; + + mmap_read_lock(mm); + ret = gm_evict_page_locked(gm_page); + mmap_read_unlock(mm); + return ret; +} + +static void gm_do_swap(struct hnode *hnode) +{ + struct list_head swap_list; + struct gm_page *gm_page, *n; + unsigned int nr_swap_pages = 0; + int ret; + + INIT_LIST_HEAD(&swap_list); + + spin_lock(&hnode->activelist_lock); + list_for_each_entry_safe(gm_page, n, &hnode->activelist, gm_page_list) { + if (gm_page_pinned(gm_page)) { + gmem_err("%s: va %lx is pinned!", __func__, gm_page->va); + continue; + } + /* Move gm_page to temporary list. */ + get_gm_page(gm_page); + gm_page_flags_set(gm_page, GM_PAGE_EVICTING); + list_move(&gm_page->gm_page_list, &swap_list); + nr_swap_pages++; + if (nr_swap_pages >= NUM_SWAP_PAGES) + break; + } + spin_unlock(&hnode->activelist_lock); + + list_for_each_entry_safe(gm_page, n, &swap_list, gm_page_list) { + list_del_init(&gm_page->gm_page_list); + ret = gm_evict_page_locked(gm_page); + gm_page_flags_clear(gm_page, GM_PAGE_EVICTING); + if (ret == GM_EVICT_UNMAP) { + /* Evicting gm_page conflicts with unmap.*/ + put_gm_page(gm_page); + } else if (ret == GM_EVICT_FALLBACK) { + /* An error occurred with the host, and gm_page needs + * to be added back to the activelist. */ + hnode_activelist_add(hnode, gm_page); + put_gm_page(gm_page); + } else if (ret == GM_EVICT_DEVERR) { + /* It generally occurs when the process has already + * exited, at which point gm_page needs to be returned + * to the freelist. 
*/ + put_gm_page(gm_page); + } else { + hnode_active_pages_dec(hnode); + put_gm_page(gm_page); + } + } +}; + +static inline bool need_wake_up_swapd(struct hnode *hnode) +{ + return false; +} + +static int swapd_func(void *data) +{ + struct hnode *hnode = (struct hnode *)data; + + while (!kthread_should_stop()) { + if (!need_wake_up_swapd(hnode)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + + gm_do_swap(hnode); + } + + return 0; +}; + +static void init_swapd(struct hnode *hnode) +{ + hnode->swapd_task = kthread_run(swapd_func, NULL, "gm_swapd/%u", hnode->id); + if (IS_ERR(hnode->swapd_task)) { + gmem_err("%s: create swapd task failed", __func__); + hnode->swapd_task = NULL; + } +} + +static void wake_up_swapd(struct hnode *hnode) +{ + if (likely(hnode->swapd_task)) + wake_up_process(hnode->swapd_task); +} + +static bool can_import(struct hnode *hnode) +{ + unsigned long nr_pages; + unsigned long used_mem; + + nr_pages = atomic_read(&hnode->nr_free_pages) + atomic_read(&hnode->nr_active_pages); + used_mem = nr_pages * HPAGE_SIZE; + + /* GMEM usable memory is unlimited if max_memsize is zero. */ + if (!hnode->max_memsize) + return true; + return used_mem < hnode->max_memsize; +} + +static struct gm_page *get_gm_page_from_freelist(struct hnode *hnode) +{ + struct gm_page *gm_page; + + spin_lock(&hnode->freelist_lock); + gm_page = list_first_entry_or_null(&hnode->freelist, struct gm_page, gm_page_list); + /* Delete from freelist. */ + if (gm_page) { + if (gm_page_pinned(gm_page)) { + gmem_err("%s: gm_page %lx from freelist has pinned flag, clear it!", __func__, (unsigned long)gm_page); + gm_page_flags_clear(gm_page, GM_PAGE_PINNED); + } + list_del_init(&gm_page->gm_page_list); + hnode_free_pages_dec(hnode); + get_gm_page(gm_page); + /* TODO: wakeup swapd if needed. */ + if (need_wake_up_swapd(hnode)) + wake_up_swapd(hnode); + } + spin_unlock(&hnode->freelist_lock); + + return gm_page; +} + +/* + * gm_alloc_page - Allocate a gm_page. + * + * Allocate a gm_page from hnode freelist. If failed to allocate gm_page, try + * to import memory from device. And if failed to import memory, try to swap + * several gm_pages to host and allocate gm_page again. + */ +struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode) +{ + struct gm_page *gm_page; + struct gm_dev *gm_dev; + int retry_times = 0; + int ret = 0; + + if (hnode->dev) + gm_dev = hnode->dev; + else + return NULL; + +retry: + gm_page = get_gm_page_from_freelist(hnode); + if (!gm_page && can_import(hnode) && !hnode->import_failed) { + /* Import pages from device. */ + ret = gm_dev->mmu->import_phys_mem(mm, hnode->id, NUM_IMPORT_PAGES); + if (!ret) + goto retry; + hnode->import_failed = true; + } + + /* Try to swap pages. */ + if (!gm_page) { + if (retry_times > MAX_SWAP_RETRY_TIMES) + return NULL; + gm_do_swap(hnode); + retry_times++; + goto retry; + } + + return gm_page; +} + diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c new file mode 100644 index 0000000000000000000000000000000000000000..dbf4de2151cb4c5bb6827590490fce6f4b87f59d --- /dev/null +++ b/mm/gmem_stat.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GMEM statistics. + * + * Copyright (C) 2025- Huawei, Inc. 
+ * Author: Bin Wang + * + */ + +#include +#include +#include + +static struct kobject *gm_kobj; + +struct hnode_kobject { + struct kobject kobj; + unsigned int hnid; +}; + +#define HNODE_NAME_LEN 32 + +static struct hnode *get_hnode_kobj(struct kobject *kobj) +{ + struct hnode *hnode; + struct hnode_kobject *hnode_kobj; + + hnode_kobj = container_of(kobj, struct hnode_kobject, kobj); + hnode = get_hnode(hnode_kobj->hnid); + if (!hnode) { + gmem_err("%s: failed to get hnode from kobject", __func__); + return NULL; + } + + return hnode; +} + + +static ssize_t max_memsize_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%lu\n", hnode->max_memsize); +} + +static ssize_t max_memsize_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + hnode->max_memsize = memparse(buf, NULL) & (~(HPAGE_SIZE - 1)); + return count; +} + +static struct kobj_attribute max_memsize_attr = + __ATTR(max_memsize, 0640, max_memsize_show, max_memsize_store); + +static ssize_t nr_freepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%u\n", atomic_read(&hnode->nr_free_pages)); +} + +static struct kobj_attribute nr_freepages_attr = + __ATTR(nr_freepages, 0440, nr_freepages_show, NULL); + +static ssize_t nr_activepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%u\n", atomic_read(&hnode->nr_active_pages)); +} + +static struct kobj_attribute nr_activepages_attr = + __ATTR(nr_activepages, 0444, nr_activepages_show, NULL); + +static ssize_t nr_freelist_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int nr_freelist = 0; + struct gm_page *gm_page; + struct hnode *hnode = get_hnode_kobj(kobj); + if (!hnode) + return -EINVAL; + + spin_lock(&hnode->freelist_lock); + list_for_each_entry(gm_page, &hnode->freelist, gm_page_list) { + nr_freelist++; + } + spin_unlock(&hnode->freelist_lock); + return sprintf(buf, "%u\n", nr_freelist); +} + +static struct kobj_attribute nr_freelist_attr = + __ATTR(nr_freelist, 0440, nr_freelist_show, NULL); + +static ssize_t nr_activelist_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int nr_activelist = 0; + struct gm_page *gm_page; + struct hnode *hnode = get_hnode_kobj(kobj); + if (!hnode) + return -EINVAL; + + spin_lock(&hnode->activelist_lock); + list_for_each_entry(gm_page, &hnode->activelist, gm_page_list) { + nr_activelist++; + } + spin_unlock(&hnode->activelist_lock); + return sprintf(buf, "%u\n", nr_activelist); +} + +static struct kobj_attribute nr_activelist_attr = + __ATTR(nr_activelist, 0440, nr_activelist_show, NULL); + +static struct attribute *hnode_attrs[] = { + &max_memsize_attr.attr, + &nr_freepages_attr.attr, + &nr_activepages_attr.attr, + &nr_freelist_attr.attr, + &nr_activelist_attr.attr, + NULL, +}; + +static struct attribute_group hnode_attr_group = { + .attrs = hnode_attrs, +}; + +static void hnode_kobj_release(struct kobject *kobj) +{ + struct hnode_kobject *hnode_kobj = + container_of(kobj, struct hnode_kobject, kobj); + kfree(hnode_kobj); +} + +static const struct kobj_type hnode_kobj_ktype = { + 
.release = hnode_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +int hnode_init_sysfs(unsigned int hnid) +{ + int ret; + struct hnode_kobject *hnode_kobj; + + hnode_kobj = kzalloc(sizeof(struct hnode_kobject), GFP_KERNEL); + if (!hnode_kobj) + return -ENOMEM; + + ret = kobject_init_and_add(&hnode_kobj->kobj, &hnode_kobj_ktype, + gm_kobj, "hnode%u", hnid); + if (ret) { + gmem_err("%s: failed to init hnode object", __func__); + goto free_hnode_kobj; + } + + ret = sysfs_create_group(&hnode_kobj->kobj, &hnode_attr_group); + if (ret) { + gmem_err("%s: failed to register hnode group", __func__); + goto delete_hnode_kobj; + } + + hnode_kobj->hnid = hnid; + return 0; + +delete_hnode_kobj: + kobject_put(&hnode_kobj->kobj); +free_hnode_kobj: + kfree(hnode_kobj); + return ret; +} +EXPORT_SYMBOL(hnode_init_sysfs); + +int __init gm_init_sysfs(void) +{ + gm_kobj = kobject_create_and_add("gmem", mm_kobj); + if (!gm_kobj) { + gmem_err("%s: failed to create gmem object", __func__); + return -ENOMEM; + } + + return 0; + +} +EXPORT_SYMBOL(gm_init_sysfs); + +void gm_deinit_sysfs(void) +{ + kobject_put(gm_kobj); +} diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a28dda79997820f95a241fed211ce489cd80dc19..59f546540fd0260a0ab714298e3fcf1079ccf130 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -38,6 +38,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -1337,6 +1341,46 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); +static struct folio *vma_alloc_peer_shared_folio_pmd(struct vm_area_struct *vma, + unsigned long haddr, gm_mapping_t *gm_mapping) +{ + struct folio *folio; + gfp_t gfp = GFP_TRANSHUGE; + + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + return NULL; + } + + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); + count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + return NULL; + } + folio_throttle_swaprate(folio, gfp); + + /* + * gmem device overcommit needs to reload the swapped page, + * so skip it to avoid clearing device data. + */ + if (!gm_mapping_cpu(gm_mapping)) + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); + + /* + * The memory barrier inside __folio_mark_uptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. 
+ */ + __folio_mark_uptodate(folio); + return folio; +} + static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, unsigned long addr) { @@ -1344,6 +1388,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, const int order = HPAGE_PMD_ORDER; struct folio *folio; +#ifdef CONFIG_GMEM + /* always try to compact hugepage for peer shared vma */ + if (vma_is_peer_shared(vma)) + gfp = GFP_TRANSHUGE; +#endif + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); if (unlikely(!folio)) { @@ -1391,6 +1441,101 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } +struct gm_mapping *vma_prepare_gm_mapping(struct vm_area_struct *vma, unsigned long haddr) +{ + struct gm_mapping *gm_mapping; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + if (!gm_mapping) { + vm_object_mapping_create(vma->vm_obj, haddr); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + } + xa_unlock(vma->vm_obj->logical_page_table); + + return gm_mapping; +} + +static vm_fault_t __do_peer_shared_anonymous_page(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct folio *folio = NULL; + bool is_new_folio = false; + pgtable_t pgtable = NULL; + struct gm_mapping *gm_mapping; + vm_fault_t ret = 0; + + gm_mapping = vma_prepare_gm_mapping(vma, haddr); + if (!gm_mapping) + return VM_FAULT_OOM; + + mutex_lock(&gm_mapping->lock); + + if (gm_mapping_device(gm_mapping) && gm_page_pinned(gm_mapping->gm_page)) { + pr_err("page is pinned! addr is %lx\n", gm_mapping->gm_page->va); + ret = VM_FAULT_SIGBUS; + goto release; + } + + if (gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + folio = vma_alloc_anon_folio_pmd(vma, haddr); + is_new_folio = true; + } + + if (unlikely(!folio)) { + ret = VM_FAULT_FALLBACK; + goto release; + } + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } + + /** + * if page is mapped in device, release device mapping and + * deliver the page content to host. 
+ */ + if (gm_mapping_device(gm_mapping)) { + vmf->page = &folio->page; + ret = gm_host_fault_locked(vmf, PMD_ORDER); + if (ret) + goto release; + } + + /* map page in pgtable */ + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + + BUG_ON(!pmd_none(*vmf->pmd)); + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(vmf->ptl); + + /* finally setup cpu mapping */ + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + gm_mapping->page = &folio->page; + mutex_unlock(&gm_mapping->lock); + + return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + if (is_new_folio) + folio_put(folio); + mutex_unlock(&gm_mapping->lock); + return ret; +} + static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) { unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -1440,7 +1585,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); folio_put(folio); return ret; - } /* @@ -1506,6 +1650,9 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return ret; khugepaged_enter_vma(vma, vma->vm_flags); + if (vma_is_peer_shared(vma)) + return __do_peer_shared_anonymous_page(vmf); + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { @@ -1545,7 +1692,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - return __do_huge_pmd_anonymous_page(vmf); } diff --git a/mm/memory.c b/mm/memory.c index 4bb3acfc3dd9c885956ec02bac7545b5f5e9947f..8891831579e45b9cc8e32282a196305c744b1b79 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -1710,6 +1714,50 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, return addr; } +#ifdef CONFIG_GMEM +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + struct gm_mapping *gm_mapping = NULL; + struct page *page = NULL; + + if (!vma->vm_obj) + return; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, addr); + + if (gm_mapping && gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (page && (page_ref_count(page) != 0)) { + put_page(page); + gm_mapping->page = NULL; + } + } + xa_unlock(vma->vm_obj->logical_page_table); +} + +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + zap_logic_pmd_range(vma, addr, next); + } while (addr = next, addr != end); +} +#else +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +#endif + static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, @@ -1740,6 +1788,19 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } +#ifdef CONFIG_GMEM + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. 
This is + * because MADV_DONTNEED holds the mmap_lock in read + * mode. + */ + if (vma_is_peer_shared(vma)) { + if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) + zap_logic_pmd_range(vma, addr, next); + } +#endif if (pmd_none(*pmd)) { addr = next; continue; @@ -1771,8 +1832,11 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, goto next; /* fall through */ } - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); @@ -1792,8 +1856,11 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none_or_clear_bad(p4d)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); @@ -1813,8 +1880,13 @@ void unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); +#endif continue; + } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); @@ -1865,6 +1937,77 @@ static void unmap_single_vma(struct mmu_gather *tlb, } } +static void unmap_single_peer_shared_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end, addr; + struct vm_object *obj = vma->vm_obj; + enum gm_ret ret; + struct gm_mapping *gm_mapping; + struct hnode *hnode; + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return; + addr = start; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (!obj) + return; + + if (!mm->gm_as) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + /* In fact, during the exit_mmap process of the host, we do not + * need to call peer_unmap to release the memory within the NPU + * card, as the NPU card has an independent process that will + * handle the unmap operation. */ + //gmf.va = addr; + //gmf.size = HPAGE_SIZE; + //gmf.pfn = gm_mapping->gm_page->dev_pfn; + //gmf.dev = gm_mapping->dev; + //ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + //if (ret != GM_RET_SUCCESS) + // gmem_err("%s: call dev peer_unmap error %d", __func__, ret); + + /* + * Regardless of whether the gm_page is unmapped, we should release it. 
+ */ + hnode = get_hnode(gm_mapping->gm_page->hnid); + if (!hnode) { + mutex_unlock(&gm_mapping->lock); + continue; + } + gm_page_remove_rmap(gm_mapping->gm_page); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + gm_mapping->gm_page = NULL; + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); +} + /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather @@ -1908,6 +2051,9 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); +#ifdef CONFIG_GMEM + unmap_single_peer_shared_vma(vma->vm_mm, vma, start, end); +#endif vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); @@ -5802,7 +5948,9 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; - +#ifdef CONFIG_GMEM + char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; +#endif pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) @@ -5855,9 +6003,21 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; + if (vma_is_peer_shared(vma)) + return VM_FAULT_OOM; } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && pmd_none(*vmf.pmd) && + (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))) { + /* if transparent hugepage is not enabled, return pagefault failed */ + gmem_err("transparent hugepage is not enabled. check %s\n", + thp_enable_path); + return VM_FAULT_SIGBUS; + } +#endif + if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a82aab7ab47a5ea444b2cc01c77520dea817b683..bb35f9fafcf61083b4b040d7f98ad9787175c98f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1902,8 +1902,13 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) + return false; +#else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; +#endif /* * DAX device mappings require predictable access latency, so avoid diff --git a/mm/mm_init.c b/mm/mm_init.c index 6677aaa5972d4e97fe5604d64d73dab3903fe7c6..1a3d3b6e52c9c20d73f7b557663b67eb86d71960 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,6 +30,9 @@ #include "internal.h" #include "slab.h" #include "shuffle.h" +#ifdef CONFIG_GMEM +#include +#endif #include @@ -2797,6 +2800,9 @@ static void __init mem_init_print_info(void) */ void __init mm_core_init(void) { +#ifdef CONFIG_GMEM + hnuma_init(); +#endif /* Initializations relying on SMP setup */ build_all_zonelists(NULL); page_alloc_init_cpuhp(); diff --git a/mm/mmap.c b/mm/mmap.c index fb54df419ea2c360f1e0c23f921f54627a08c5b5..2e777ad31323629354257169f8f11557fb1695d1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -645,7 +649,9 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, */ if (src->anon_vma && !dst->anon_vma) { int ret; - +#ifdef CONFIG_GMEM + dup_vm_object(dst, src, true); +#endif vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = 
anon_vma_clone(dst, src); @@ -701,6 +707,13 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Only handles expanding */ VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); + if (vma_is_peer_shared(vma)) { + if (!remove_next) + vm_object_adjust(vma, start, end); + else + vm_object_merge(vma, next->vm_end); + } + /* Note: vma iterator must be pointing to 'start' */ vma_iter_config(vmi, start, end); if (vma_iter_prealloc(vmi, vma)) @@ -752,6 +765,9 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); + if (vma_is_peer_shared(vma)) + vm_object_adjust(vma, start, end); + vma_iter_clear(vmi); vma->vm_start = start; vma->vm_end = end; @@ -1002,6 +1018,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (!next->anon_vma) err = dup_anon_vma(prev, curr, &anon_dup); } + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, next->vm_end); + } } else if (merge_prev) { /* case 2 */ if (curr) { vma_start_write(curr); @@ -1020,6 +1039,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } if (!err) err = dup_anon_vma(prev, curr, &anon_dup); + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, end); + } } } else { /* merge_next */ vma_start_write(next); @@ -1030,6 +1052,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, adjust = next; adj_start = -(prev->vm_end - addr); err = dup_anon_vma(next, prev, &anon_dup); + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, addr); + } } else { /* * Note that cases 3 and 8 are the ONLY ones where prev @@ -1045,6 +1070,8 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, remove = curr; err = dup_anon_vma(next, curr, &anon_dup); } + if (vma_is_peer_shared(curr)) + vm_object_merge(vma, next->vm_end); } } @@ -1316,11 +1343,21 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. 
*/ +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + len = round_up(len, PMD_SIZE); + addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, + PMD_SIZE); + } else { + addr = get_unmapped_area(file, addr, len, pgoff, flags); + } +#else addr = get_unmapped_area(file, addr, len, pgoff, flags); +#endif if (IS_ERR_VALUE(addr)) return addr; - if (flags & MAP_FIXED_NOREPLACE) { + if ((flags & MAP_FIXED_NOREPLACE) || (gmem_is_enabled() && (flags & MAP_PEER_SHARED))) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } @@ -1439,6 +1476,14 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } +#ifdef CONFIG_GMEM + if (flags & MAP_PEER_SHARED) { + if (gmem_is_enabled()) + vm_flags |= VM_PEER_SHARED; + else + return -EINVAL; + } +#endif addr = __mmap_region_ext(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && @@ -1447,6 +1492,7 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon *populate = len; return addr; } +EXPORT_SYMBOL(__do_mmap_mm); unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1465,7 +1511,26 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_SHARED) && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED and MAP_SHARE cannot be used together.\n"); + goto out_fput; + } + if (gmem_is_enabled() && (flags & MAP_HUGETLB) && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED and MAP_HUGETLB cannot be used together.\n"); + goto out_fput; + } +#endif if (!(flags & MAP_ANONYMOUS)) { +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED cannot map file page.\n"); + goto out_fput; + } +#endif audit_mmap_fd(fd, flags); file = fget(fd); if (!file) @@ -1933,6 +1998,29 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +#ifdef CONFIG_GMEM +unsigned long +get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, unsigned long align) +{ + if (len > TASK_SIZE) + return -ENOMEM; + + addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + addr = round_up(addr, align); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + return addr; +} +EXPORT_SYMBOL(get_unmapped_area_aligned); +#endif + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
@@ -2471,7 +2559,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; - +#ifdef COFNIG_GMEM + dup_vm_object(new, vma, false); +#endif if (new->vm_file) get_file(new->vm_file); @@ -2486,6 +2576,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + if (vma_is_peer_shared(vma)) + vm_object_split(vma, new); + if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; @@ -2523,6 +2616,135 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +#ifdef CONFIG_GMEM +static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end, addr; + struct vm_object *obj = vma->vm_obj; + enum gm_ret ret; + struct gm_context *ctx, *tmp; + struct gm_mapping *gm_mapping; + struct hnode *hnode; + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return; + addr = start; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (!obj) + return; + + if (!mm->gm_as) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + gmf.va = addr; + gmf.size = HPAGE_SIZE; + gmf.pfn = gm_mapping->gm_page->dev_pfn; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret != GM_RET_SUCCESS) + gmem_err("%s: call dev peer_unmap error %d", __func__, ret); + + /* + * Regardless of whether the gm_page is unmapped, we should release it. 
+ */ + hnode = get_hnode(gm_mapping->gm_page->hnid); + if (!hnode) { + mutex_unlock(&gm_mapping->lock); + continue; + } + gm_page_remove_rmap(gm_mapping->gm_page); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); + gm_mapping->gm_page = NULL; + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); + + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + if (!ctx->dev->mmu->peer_va_free) + continue; + + gmf.va = start; + gmf.size = end - start; + gmf.dev = ctx->dev; + + ret = ctx->dev->mmu->peer_va_free(&gmf); + if (ret != GM_RET_SUCCESS) + pr_debug("gmem: free_vma failed, ret %d\n", ret); + } +} + +static void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + if (vma_is_peer_shared(vma)) + munmap_single_vma_in_peer_devices(mm, vma, start, end); + } +} + +static unsigned long gmem_unmap_align(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma, *vma_end; + + vma = find_vma_intersection(mm, start, start + len); + vma_end = find_vma(mm, start + len); + if (!vma || !vma_is_peer_shared(vma)) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + } + + /* Prevents partial release of the peer_share page. */ + if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) + len = round_up(len, SZ_2M); + return len; +} + +static void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end, ret; + + ret = gmem_unmap_align(mm, start, len); + + if (!ret || IS_ERR_VALUE(ret)) + return; + + end = start + ret; + munmap_in_peer_devices(mm, start, end); +} +#endif + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 
* @vmi: The vma iterator @@ -2653,6 +2875,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, prev = vma_iter_prev_range(vmi); next = vma_next(vmi); + if (next) vma_iter_prev_range(vmi); @@ -2711,6 +2934,17 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) { + unsigned long ret = gmem_unmap_align(mm, start, len); + + if (IS_ERR_VALUE(ret)) + return ret; + else if (ret) + len = ret; + } +#endif + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2745,6 +2979,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, { VMA_ITERATOR(vmi, mm, start); +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + gmem_unmap_region(mm, start, len); +#endif return do_vmi_munmap(&vmi, mm, start, len, uf, false); } @@ -2774,21 +3012,24 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) + (len >> PAGE_SHIFT) - nr_pages)) { return -ENOMEM; + } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { return -ENOMEM; + } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) + if (security_vm_enough_memory_mm(mm, charged)) { return -ENOMEM; + } vm_flags |= VM_ACCOUNT; } @@ -3029,6 +3270,11 @@ static int __vm_munmap(unsigned long start, size_t len, bool unlock) if (sp_check_addr(start)) return -EINVAL; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + gmem_unmap_region(mm, start, len); +#endif + if (mmap_write_lock_killable(mm)) return -EINTR; @@ -3410,6 +3656,10 @@ void exit_mmap(struct mm_struct *mm) __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && mm->gm_as) + gm_as_destroy(mm->gm_as); +#endif } /* Insert vm structure into process list sorted by address diff --git a/mm/mprotect.c b/mm/mprotect.c index e65363eb603e64c46fd742cf01087f88c73491e5..4eac8ad8a7181e0979dfb8addf56174b3ce8dd41 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -693,7 +693,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, unsigned long prot, int pkey) { unsigned long nstart, end, tmp, reqprot; +#ifdef CONFIG_GMEM + struct vm_area_struct *vma, *prev, *vma_end; +#else struct vm_area_struct *vma, *prev; +#endif int error; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && @@ -736,7 +740,19 @@ static int do_mprotect_pkey(unsigned long start, size_t len, error = -ENOMEM; if (!vma) goto out; - +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + start = ALIGN_DOWN(start, HPAGE_SIZE); + vma_end = find_vma(current->mm, end); + if (vma_end && vma_end->vm_start < end && vma_is_peer_shared(vma_end)) + end = ALIGN(end, HPAGE_SIZE); + if (end <= start) { + error = -ENOMEM; + goto out; + } + len = end - start; + } +#endif if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c233d61d0d06df9a48b779ad600d094ddd95510a..80b29d946a0d92ff57891a903cd4f252bd0434ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -194,6 +194,9 @@ EXPORT_SYMBOL(latent_entropy); nodemask_t 
node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, [N_ONLINE] = { { [0] = 1UL } }, +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = NODE_MASK_NONE, +#endif #ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM diff --git a/mm/util.c b/mm/util.c index 7a5eed15c98fd9ce86fc16b7c27706209a6f8627..65392c97b1e993028dc29508bf134243dba4cf85 100644 --- a/mm/util.c +++ b/mm/util.c @@ -27,6 +27,9 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif #include "internal.h" #include "swap.h" @@ -540,6 +543,114 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) } EXPORT_SYMBOL_GPL(account_locked_vm); +#ifdef CONFIG_GMEM +static unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, + unsigned long flag) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct gm_context *ctx, *tmp; + unsigned long prot = VM_NONE; + enum gm_ret ret; + char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; + + vma = find_vma(mm, addr); + if (!vma) { + gmem_err("vma for addr %lx is NULL, should not happen\n", addr); + return -EINVAL; + } + + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) { + gmem_err("transparent hugepage is not enabled. check %s\n", + thp_enable_path); + return -EINVAL; + } + + prot |= vma->vm_flags; + + if (!mm->gm_as) { + ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, HPAGE_SIZE, &mm->gm_as); + if (ret) { + gmem_err("gm_as_create failed\n"); + return ret; + } + } + + ret = -ENODEV; + // TODO: consider the concurrency problem of device attaching/detaching from the gm_as. + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + struct gm_fault_t gmf = { + .mm = mm, + .dev = ctx->dev, + .va = addr, + .size = len, + .prot = prot, + }; + + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); + if (ret != GM_RET_SUCCESS) { + gmem_err("device mmap failed\n"); + return ret; + } + } + + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + + return ret; +} + +struct gmem_vma_list { + unsigned long start; + size_t len; + struct list_head list; +}; + +static void gmem_reserve_vma(struct mm_struct *mm, unsigned long start, + size_t len, struct list_head *head) +{ + struct vm_area_struct *vma; + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + vma = find_vma(mm, start); + if (!vma || vma->vm_start >= start + len) { + kfree(node); + return; + } + vm_flags_set(vma, ~VM_PEER_SHARED); + + node->start = start; + node->len = round_up(len, SZ_2M); + list_add_tail(&node->list, head); +} + +static void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + unsigned long start = node->start; + size_t len = node->len; + + if (len) + vm_munmap(start, len); + + list_del(&node->list); + kfree(node); + } +} +#endif + unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) @@ -548,7 +659,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); - +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); +retry: +#endif ret 
= security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) @@ -559,6 +674,27 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && !IS_ERR_VALUE(ret) && flag & MAP_PEER_SHARED) { + enum gm_ret gm_ret = 0; + + gm_ret = alloc_va_in_peer_devices(ret, len, flag); + /* + * if alloc_va_in_peer_devices failed + * add vma to reserve_list and release after find a proper vma + */ + if (gm_ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + gmem_reserve_vma(mm, ret, len, &reserve_list); + goto retry; + } else if (gm_ret != GM_RET_SUCCESS) { + gmem_err("alloc vma ret %lu\n", ret); + gmem_reserve_vma(mm, ret, len, &reserve_list); + ret = -ENOMEM; + } + gmem_release_vma(mm, &reserve_list); + } +#endif } return ret; } diff --git a/mm/vm_object.c b/mm/vm_object.c new file mode 100644 index 0000000000000000000000000000000000000000..42219e8ff42b77827c5bff56381bd1b289e09251 --- /dev/null +++ b/mm/vm_object.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Logical Mapping Management + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi zhu, chao Liu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Sine VM_OBJECT maintains the logical page table under each VMA, and each VMA + * points to a VM_OBJECT. Ultimately VM_OBJECTs must be maintained as long as VMA + * gets changed: merge, split, adjust + */ +static struct kmem_cache *vm_object_cachep; +static struct kmem_cache *gm_mapping_cachep; + +/* gm_mapping will not be release dynamically */ +struct gm_mapping *alloc_gm_mapping(void) +{ + struct gm_mapping *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + + if (!gm_mapping) + return NULL; + + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); + mutex_init(&gm_mapping->lock); + + return gm_mapping; +} +EXPORT_SYMBOL(alloc_gm_mapping); + +static inline void release_gm_mapping(struct gm_mapping *mapping) +{ + kmem_cache_free(gm_mapping_cachep, mapping); +} + +static inline struct gm_mapping *lookup_gm_mapping(struct vm_object *obj, unsigned long pindex) +{ + return xa_load(obj->logical_page_table, pindex); +} + +int __init vm_object_init(void) +{ + vm_object_cachep = KMEM_CACHE(vm_object, 0); + if (!vm_object_cachep) + goto out; + + gm_mapping_cachep = KMEM_CACHE(gm_mapping, 0); + if (!gm_mapping_cachep) + goto free_vm_object; + + return 0; +free_vm_object: + kmem_cache_destroy(vm_object_cachep); +out: + return -ENOMEM; +} + +/* + * Create a VM_OBJECT and attach it to a VMA + * This should be called when a VMA is created. + */ +struct vm_object *vm_object_create(struct vm_area_struct *vma) +{ + struct vm_object *obj = kmem_cache_alloc(vm_object_cachep, GFP_KERNEL); + + if (!obj) + return NULL; + + spin_lock_init(&obj->lock); + obj->vma = vma; + + /* + * The logical page table maps linear_page_index(obj->vma, va) + * to pointers of struct gm_mapping. 
+ */ + obj->logical_page_table = kmalloc(sizeof(struct xarray), GFP_KERNEL); + if (!obj->logical_page_table) { + kmem_cache_free(vm_object_cachep, obj); + return NULL; + } + + xa_init(obj->logical_page_table); + atomic_set(&obj->nr_pages, 0); + atomic_set(&obj->ref_count, 1); + + return obj; +} + +/* This should be called when a VMA no longer refers to a VM_OBJECT */ +void vm_object_drop_locked(struct vm_area_struct *vma) +{ + struct vm_object *obj = vma->vm_obj; + + if (!obj) { + pr_err("vm_object: vm_obj of the vma is NULL\n"); + return; + } + + /* + * We must enter this with VMA write-locked, which is unfortunately a giant lock. + * Note that Linux 6.0 has per-VMA lock: + * https://lwn.net/Articles/906852/ + * https://lwn.net/Articles/906833/ + */ + free_gm_mappings(vma); + mmap_assert_write_locked(vma->vm_mm); + vma->vm_obj = NULL; + + if (atomic_dec_and_test(&obj->ref_count)) { + xa_destroy(obj->logical_page_table); + kfree(obj->logical_page_table); + kmem_cache_free(vm_object_cachep, obj); + } +} + +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared) +{ + unsigned long index; + struct gm_mapping *mapping; + unsigned long moved_pages = 0; + + if (dst_peer_shared) { + if (!vma_is_peer_shared(dst)) + return; + } else { + if (!vma_is_peer_shared(src)) + return; + } + + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); + + xa_lock(dst->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, mapping, linear_page_index(src, src->vm_end)) { + index = xas.xa_index - src->vm_pgoff + dst->vm_pgoff + + ((src->vm_start - dst->vm_start) >> PAGE_SHIFT); + __xa_store(dst->vm_obj->logical_page_table, index, mapping, GFP_KERNEL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &dst->vm_obj->nr_pages); + xa_unlock(dst->vm_obj->logical_page_table); +} + +void dup_peer_shared_vma(struct vm_area_struct *vma) +{ + if (vma_is_peer_shared(vma)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + vma->vm_obj = vm_object_create(vma); + } +} + +/** + * new_vma is part of old_vma, so old_vma->vm_start <= new_vma->vm_start + * and new_vma->vm_end < old_vma->vm_end + */ +void vm_object_split(struct vm_area_struct *old_vma, struct vm_area_struct *new_vma) +{ + unsigned long index; + struct gm_mapping *page; + unsigned long transferred_pages = 0; + pgoff_t pgoff = linear_page_index(old_vma, new_vma->vm_start); + + XA_STATE(xas, old_vma->vm_obj->logical_page_table, pgoff); + + xa_lock(old_vma->vm_obj->logical_page_table); + xa_lock(new_vma->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(old_vma, new_vma->vm_end - SZ_2M)) { + index = xas.xa_index - old_vma->vm_pgoff + new_vma->vm_pgoff + - ((new_vma->vm_start - old_vma->vm_start) >> PAGE_SHIFT); + __xa_store(new_vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + transferred_pages++; + } + + atomic_sub(transferred_pages, &old_vma->vm_obj->nr_pages); + atomic_add(transferred_pages, &new_vma->vm_obj->nr_pages); + xa_unlock(new_vma->vm_obj->logical_page_table); + xa_unlock(old_vma->vm_obj->logical_page_table); +} + +void vm_object_merge(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long index; + struct gm_mapping *page; + struct vm_area_struct *next, *n_next; + unsigned long moved_pages = 0; + pgoff_t pgoff; + + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); + next = vma_next(&vmi); + next = vma_next(&vmi); + if (!next) + return; + + if (addr < vma->vm_end) { + /* case 4: 
move logical mapping in [end, vma->vm_end) from vma to next */ + pgoff = linear_page_index(vma, addr); + XA_STATE(xas, vma->vm_obj->logical_page_table, pgoff); + + xa_lock(vma->vm_obj->logical_page_table); + xa_lock(next->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(vma, vma->vm_end - SZ_2M)) { + index = xas.xa_index - vma->vm_pgoff + next->vm_pgoff + - ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(next->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + atomic_sub(moved_pages, &vma->vm_obj->nr_pages); + atomic_add(moved_pages, &next->vm_obj->nr_pages); + xa_unlock(next->vm_obj->logical_page_table); + xa_unlock(vma->vm_obj->logical_page_table); + } else { + n_next = vma_next(&vmi); + + if (addr == next->vm_end) { + /* case 1, 7, 8: copy all logical mappings from next to vma */ + pgoff = linear_page_index(next, next->vm_start); + XA_STATE(xas, next->vm_obj->logical_page_table, pgoff); + + xa_lock(vma->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, page, linear_page_index(next, next->vm_end - SZ_2M)) { + index = xas.xa_index - next->vm_pgoff + vma->vm_pgoff + + ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &vma->vm_obj->nr_pages); + xa_unlock(vma->vm_obj->logical_page_table); + } else if (next->vm_start < addr && addr < next->vm_end) { + /* case 5: move logical mapping in [next->vm_start, end) from next to vma */ + pgoff = linear_page_index(next, next->vm_start); + XA_STATE(xas, next->vm_obj->logical_page_table, pgoff); + + xa_lock(vma->vm_obj->logical_page_table); + xa_lock(next->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(next, addr - SZ_2M)) { + index = xas.xa_index - next->vm_pgoff + vma->vm_pgoff + + ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + atomic_add(moved_pages, &vma->vm_obj->nr_pages); + atomic_sub(moved_pages, &next->vm_obj->nr_pages); + xa_unlock(next->vm_obj->logical_page_table); + xa_unlock(vma->vm_obj->logical_page_table); + } else if (n_next && addr == n_next->vm_end) { + /* case 6: copy all logical mappings from next and n_next to vma */ + pgoff = linear_page_index(next, next->vm_start); + XA_STATE(xas_next, next->vm_obj->logical_page_table, pgoff); + pgoff = linear_page_index(n_next, n_next->vm_start); + XA_STATE(xas_n_next, n_next->vm_obj->logical_page_table, pgoff); + + xa_lock(vma->vm_obj->logical_page_table); + rcu_read_lock(); + + xas_for_each(&xas_next, page, linear_page_index(next, next->vm_end - SZ_2M)) { + index = xas_next.xa_index - next->vm_pgoff + vma->vm_pgoff + + ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas_next, NULL); + moved_pages++; + } + + xas_for_each(&xas_n_next, page, linear_page_index(n_next, n_next->vm_end - SZ_2M)) { + index = xas_n_next.xa_index - n_next->vm_pgoff + vma->vm_pgoff + + ((n_next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas_n_next, NULL); + moved_pages++; + } + + rcu_read_unlock(); + atomic_add(moved_pages, &vma->vm_obj->nr_pages); + xa_unlock(vma->vm_obj->logical_page_table); + } + } + /* case 2, 3: do nothing */ +} + 
+void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+	/* remove logical mapping in [vma->vm_start, start) and [end, vma->vm_end) */
+	unsigned long removed_pages = 0;
+	struct gm_mapping *mapping;
+	pgoff_t pgoff = linear_page_index(vma, vma->vm_start);
+
+	XA_STATE(xas, vma->vm_obj->logical_page_table, pgoff);
+
+	xas_lock(&xas);
+	if (vma->vm_start < start) {
+		xas_for_each(&xas, mapping, linear_page_index(vma, start)) {
+			xas_store(&xas, NULL);
+			removed_pages++;
+		}
+	}
+
+	if (vma->vm_end > end) {
+		xas_set(&xas, linear_page_index(vma, end));
+
+		xas_for_each(&xas, mapping, linear_page_index(vma, vma->vm_end)) {
+			xas_store(&xas, NULL);
+			removed_pages++;
+		}
+	}
+	atomic_sub(removed_pages, &vma->vm_obj->nr_pages);
+	xas_unlock(&xas);
+}
+
+/*
+ * Given a VA, the page_index is computed by
+ * page_index = linear_page_index(struct vm_area_struct *vma, unsigned long address)
+ */
+struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va)
+{
+	return lookup_gm_mapping(obj, linear_page_index(obj->vma, va));
+}
+EXPORT_SYMBOL_GPL(vm_object_lookup);
+
+void vm_object_mapping_create(struct vm_object *obj, unsigned long start)
+{
+	pgoff_t index = linear_page_index(obj->vma, start);
+	struct gm_mapping *gm_mapping;
+
+	gm_mapping = alloc_gm_mapping();
+	if (!gm_mapping)
+		return;
+
+	__xa_store(obj->logical_page_table, index, gm_mapping, GFP_KERNEL);
+}
+
+void free_gm_mappings(struct vm_area_struct *vma)
+{
+	struct gm_mapping *gm_mapping;
+	pgoff_t pgoff = linear_page_index(vma, vma->vm_start);
+	XA_STATE(xas, vma->vm_obj->logical_page_table, pgoff);
+
+	xa_lock(vma->vm_obj->logical_page_table);
+	xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end - SZ_2M)) {
+		release_gm_mapping(gm_mapping);
+		xas_store(&xas, NULL);
+	}
+	xa_unlock(vma->vm_obj->logical_page_table);
+}
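
The three-way split in __hmemcpy() in mm/gmem.c is easiest to see with concrete numbers. The standalone program below is not part of the patch: the addresses are made up and a 2 MiB HPAGE_SIZE is assumed; only the ml[] formula and the copy loop are taken from __hmemcpy(). It prints the per-chunk do_hmemcpy() calls and shows that every chunk ends exactly at a huge-page boundary of either the source or the destination, so no single copy crosses a 2 MiB page on either side.

/* Standalone userspace illustration of the chunking done by __hmemcpy(). */
#include <stdio.h>

#define HPAGE_SIZE	(2UL << 20)	/* 2 MiB huge pages assumed */

int main(void)
{
	unsigned long src  = 0x7f0000001000UL;	/* 4 KiB into its 2 MiB page  */
	unsigned long dest = 0x7f1000003000UL;	/* 12 KiB into its 2 MiB page */
	unsigned long size = 5 * HPAGE_SIZE;

	unsigned long src_offset = src & (HPAGE_SIZE - 1);
	unsigned long dst_offset = dest & (HPAGE_SIZE - 1);
	/* same three segment lengths as __hmemcpy(): they always sum to HPAGE_SIZE */
	unsigned long ml[3] = {
		HPAGE_SIZE - (src_offset < dst_offset ? dst_offset : src_offset),
		src_offset < dst_offset ? (dst_offset - src_offset) : (src_offset - dst_offset),
		src_offset < dst_offset ? src_offset : dst_offset
	};
	int i = 0;

	/* same walk as __hmemcpy(), with printf standing in for do_hmemcpy() */
	while (size >= ml[i]) {
		if (ml[i] > 0) {
			printf("copy %#9lx bytes  src %#lx  dest %#lx\n", ml[i], src, dest);
			src += ml[i];
			dest += ml[i];
			size -= ml[i];
		}
		i = (i + 1) % 3;
	}
	if (size > 0)
		printf("copy %#9lx bytes  src %#lx  dest %#lx (tail)\n", size, src, dest);

	return 0;
}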
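
gm_alloc_page() in mm/gmem_phys.c refills an exhausted hnode by calling the device's import_phys_mem() callback and then retrying the freelist. A peripheral driver would typically implement that callback by allocating gm_page descriptors with alloc_gm_page_struct(), filling in the device-side fields, and handing the list to gm_add_pages(). The sketch below is one plausible shape for such a callback, not a reference implementation: the prototype is inferred from the call site in gm_alloc_page(), and my_dev_alloc_chunk()/my_dev_free_chunk() are hypothetical helpers standing in for the driver's own device-memory allocator.

/* Sketch of a driver-side import_phys_mem() implementation (assumptions noted above). */
#include <linux/errno.h>
#include <linux/gmem.h>
#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/types.h>

/* Hypothetical driver helpers: reserve/release one HPAGE_SIZE chunk of device memory. */
static int my_dev_alloc_chunk(unsigned long *dev_pfn, dma_addr_t *dev_dma_addr);
static void my_dev_free_chunk(unsigned long dev_pfn);

/* Returns 0 on success, after which gm_alloc_page() retries the hnode freelist. */
static int my_import_phys_mem(struct mm_struct *mm, unsigned int hnid, int nr_pages)
{
	LIST_HEAD(pages);
	struct gm_page *gm_page;
	int i, imported = 0;

	for (i = 0; i < nr_pages; i++) {
		unsigned long dev_pfn;
		dma_addr_t dev_dma;

		/* Device-specific reservation of one huge page of device memory. */
		if (my_dev_alloc_chunk(&dev_pfn, &dev_dma))
			break;

		gm_page = alloc_gm_page_struct();
		if (!gm_page) {
			my_dev_free_chunk(dev_pfn);
			break;
		}

		gm_page->hnid = hnid;
		gm_page->dev_pfn = dev_pfn;
		gm_page->dev_dma_addr = dev_dma;
		list_add_tail(&gm_page->gm_page_list, &pages);
		imported++;
	}

	if (!imported)
		return -ENOMEM;

	/* Moves every descriptor onto the hnode freelist and updates nr_free_pages. */
	return gm_add_pages(hnid, &pages);
}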
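
When dup_vm_object() and vm_object_split() in mm/vm_object.c move gm_mapping pointers between two logical page tables, the xa_index rewrite is just a change of base: the destination index equals linear_page_index() of the same virtual address computed against the destination VMA. The standalone program below uses made-up VMA layouts; only PAGE_SHIFT and the re-indexing formula from dup_vm_object() are taken from the patch. It shows that the translated index matches a direct linear_page_index() computation on the destination VMA, so a VA keeps resolving to the same entry after the move.

/* Standalone userspace illustration of the logical-page-table re-indexing. */
#include <stdio.h>

#define PAGE_SHIFT	12

struct demo_vma {
	unsigned long vm_start;
	unsigned long vm_pgoff;
};

/* linear_page_index() as used by the logical page table */
static unsigned long demo_linear_page_index(const struct demo_vma *vma, unsigned long va)
{
	return ((va - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

/* The re-indexing step from dup_vm_object(): translate an index that is valid
 * in src's logical page table into the index the same VA gets in dst's table. */
static unsigned long demo_translate_index(unsigned long src_index,
					  const struct demo_vma *src,
					  const struct demo_vma *dst)
{
	return src_index - src->vm_pgoff + dst->vm_pgoff +
	       ((src->vm_start - dst->vm_start) >> PAGE_SHIFT);
}

int main(void)
{
	/* hypothetical anonymous VMAs, both 2 MiB aligned, with vm_pgoff set the
	 * way anonymous mappings usually get it (vm_start >> PAGE_SHIFT) */
	struct demo_vma dst = { .vm_start = 0x200000, .vm_pgoff = 0x200000 >> PAGE_SHIFT };
	struct demo_vma src = { .vm_start = 0x600000, .vm_pgoff = 0x600000 >> PAGE_SHIFT };
	unsigned long va = 0x800000;

	unsigned long src_index = demo_linear_page_index(&src, va);
	unsigned long dst_index = demo_translate_index(src_index, &src, &dst);

	/* dst_index and the direct computation agree, so the gm_mapping entry for
	 * this VA lands at the right slot in the destination xarray. */
	printf("src index %lu -> dst index %lu (expected %lu)\n",
	       src_index, dst_index, demo_linear_page_index(&dst, va));
	return 0;
}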