From 01febd6752fba4599e95af149c40d69ceb2a9feb Mon Sep 17 00:00:00 2001 From: nicunshu Date: Tue, 24 Jun 2025 17:11:35 +0800 Subject: [PATCH 01/27] mm: gmem: Introduce CONFIG_GMEM euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Introduce config GMEM in preparation for isolation code for gmem. Signed-off-by: nicunshu --- mm/Kconfig | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index bdd8372552ff..88be25f465b3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1480,6 +1480,21 @@ config NUMABALANCING_MEM_SAMPLING if unsure, say N to disable the NUMABALANCING_MEM_SAMPLING. +config GMEM + bool "gmem subsystem for multi-MMU cooperative management" + depends on (ARM64 || X86_64) && MMU && TRANSPARENT_HUGEPAGE + select ARCH_USES_HIGH_VMA_FLAGS + default y + help + This provides a high-level interface that decouples MMUspecific functions. + Device drivers can thus attach themselves to a process’s address space and + let the OS take charge of their memory management. This eliminates + the need for device drivers to reinvent the wheel and allows them to + benefit from general memory optimizations integrated by GMEM. + + say Y here to enable gmem subsystem + + source "mm/damon/Kconfig" config THP_CONTROL -- Gitee From 6baef857874e991c1d1abdeda5d19711a89e8d43 Mon Sep 17 00:00:00 2001 From: Ni Cunshu Date: Tue, 24 Jun 2025 17:32:20 +0800 Subject: [PATCH 02/27] mm: gmem: Introduce new node state N_HETEROGENEOUS euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Introduce new node state N_HETEROGENEOUS to indicate heterogeneous memory devices. Co-developed-by: Jiangtian Feng Signed-off-by: Jiangtian Feng Co-developed-by: liuzixian Signed-off-by: liuzixian Signed-off-by: Ni Cunshu --- drivers/base/node.c | 6 ++++++ include/linux/nodemask.h | 12 ++++++++++++ mm/page_alloc.c | 3 +++ 3 files changed, 21 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 4d588f4658c8..b9e095cf3498 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -931,6 +931,9 @@ static struct node_attr node_state_attr[] = { [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, N_GENERIC_INITIATOR), +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = _NODE_ATTR(has_hetero_memory, N_HETEROGENEOUS), +#endif }; static struct attribute *node_state_attrs[] = { @@ -943,6 +946,9 @@ static struct attribute *node_state_attrs[] = { &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, &node_state_attr[N_GENERIC_INITIATOR].attr.attr, +#ifdef CONFIG_GMEM + &node_state_attr[N_HETEROGENEOUS].attr.attr, +#endif NULL }; diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 8d07116caaf1..f005f3d903ae 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -407,6 +407,11 @@ enum node_states { N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ +#ifdef CONFIG_GMEM +#ifndef __GENKSYMS__ + N_HETEROGENEOUS, /* The node has heterogeneous memory */ +#endif +#endif NR_NODE_STATES }; @@ -536,6 +541,13 @@ static inline int node_random(const nodemask_t *maskp) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +#ifdef CONFIG_GMEM 
+/* For h-NUMA topology */ +#define hnode_map node_states[N_HETEROGENEOUS] +#define num_hnodes() num_node_state(N_HETEROGENEOUS) +#define for_each_hnode(node) for_each_node_state(node, N_HETEROGENEOUS) +#endif + /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c233d61d0d06..80b29d946a0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -194,6 +194,9 @@ EXPORT_SYMBOL(latent_entropy); nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_POSSIBLE] = NODE_MASK_ALL, [N_ONLINE] = { { [0] = 1UL } }, +#ifdef CONFIG_GMEM + [N_HETEROGENEOUS] = NODE_MASK_NONE, +#endif #ifndef CONFIG_NUMA [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM -- Gitee From 1a4d893c09aaa3584b1b513c01aea18defc2a58a Mon Sep 17 00:00:00 2001 From: Yang Yanchao Date: Tue, 24 Jun 2025 19:21:37 +0800 Subject: [PATCH 03/27] mm: gmem: Introduce gmem related madvise euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues//ICHFJN --------------------------------------------- Introduce hmadvise via ioctl in order to specific gmem behavior. Introduce new madvise opcode for hmadvise: MADV_PREFETCH: prefetch pages for hNUMA node MADV_PINNED: pin pages In order to avoid conflict to existing or new madvise opcode, make the new one begin with 0x1000. Signed-off-by: Yang Yanchao --- include/uapi/asm-generic/mman-common.h | 5 +++++ init/main.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 14e5498efd7a..5bd675448f53 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,11 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* for hmadvise */ +#define MADV_GMEM_BASE 0x1000 +#define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ +#define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ + #define MADV_ETMEM_BASE 0x1100 #define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ #define MADV_SWAPFLAG_REMOVE (MADV_SWAPFLAG + 1) diff --git a/init/main.c b/init/main.c index f97f06547078..51395ee7a27d 100644 --- a/init/main.c +++ b/init/main.c @@ -102,6 +102,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include #include @@ -905,6 +909,10 @@ void start_kernel(void) smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ boot_cpu_hotplug_init(); +#ifdef CONFIG_GMEM + hnuma_init(); +#endif + pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ jump_label_init(); -- Gitee From 9804ea9d4a81632f6965cbe76213b5bc66935f65 Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Tue, 24 Jun 2025 19:39:12 +0800 Subject: [PATCH 04/27] mm: gmem: Introduce vm_object in preparation for gmem euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Defines a centrailized logical mapping table that reflects the mapping information regardless of the underlying arch-specific MMUs. 
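For a rough picture of how the logical page table is meant to be driven, the sketch below exercises the struct vm_object introduced by this patch through its xarray. It is illustrative only: the PMD-granular keying and the logical_* helper names are assumptions made for this example, since the concrete implementation is only added later in the series as mm/vm_object.c.

#include <linux/mm_types.h>
#include <linux/gmem.h>
#include <linux/xarray.h>

/* Illustrative accessors over vm_object->logical_page_table (not from this series). */
static struct gm_mapping *logical_lookup(struct vm_object *obj, unsigned long va)
{
	/* Assumed keying: one slot per PMD-sized (2MB) chunk of the VMA. */
	return xa_load(obj->logical_page_table, va >> PMD_SHIFT);
}

static int logical_insert(struct vm_object *obj, unsigned long va,
			  struct gm_mapping *map)
{
	void *old = xa_store(obj->logical_page_table, va >> PMD_SHIFT, map,
			     GFP_KERNEL);

	if (xa_is_err(old))
		return xa_err(old);
	atomic_inc(&obj->nr_pages);
	return 0;
}

Keeping the table per-VMA rather than per-mm is what lets an ordinary CPU-only process skip this bookkeeping entirely, as the comment in the mm_types.h hunk below spells out.
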
Co-developed-by: Ni Cunshu Signed-off-by: Ni Cunshu Signed-off-by: Liu Chao --- include/linux/mm_types.h | 44 +++++++++++++++++++++++++++++++++++++++ include/linux/vm_object.h | 16 ++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 include/linux/vm_object.h diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 64c38b09e18d..bcfbaa36bbbb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,10 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -612,6 +616,43 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +#ifdef CONFIG_GMEM +/* + * Defines a centralized logical mapping table that reflects the mapping information + * regardless of the underlying arch-specific MMUs. + * The implementation of this data structure borrows the VM_OBJECT from FreeBSD as well + * as the filemap address_space struct from Linux page cache. + * Only VMAs point to VM_OBJECTs and maintain logical mappings, because we assume that + * the coordiantion between page tables must happen with CPU page table involved. That + * is to say, a generalized process unit must involve in a UVA-programming model, otherwise + * there is no point to support UVA programming. + * However, a VMA only needs to maintain logical mappings if the process has been + * attached to a GMEM VA space. In normal cases, a CPU process does not need it. (unless + * we later build a reservation system on top of the logical mapping tables to support + * reservation-based superpages and rangeTLBs). + * A GM_REGION does not need to maintain logical mappings. In the case that a device wants + * to support its private address space with local physical memory, GMEM should forward address + * space management to the core VM, using VMAs, instead of using GM_REGIONs. + */ +struct vm_object { + spinlock_t lock; + struct vm_area_struct *vma; + + /* + * The logical_page_table is a container that holds the mapping + * information between a VA and a struct page. + */ + struct xarray *logical_page_table; + atomic_t nr_pages; + + /* + * a vm object might be referred by multiple VMAs to share + * memory. + */ + atomic_t ref_count; +}; +#endif + struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. 
*/ @@ -732,6 +773,9 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_GMEM + struct vm_object *vm_obj; +#endif #ifdef CONFIG_SHARE_POOL struct sp_area *spa; #endif diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h new file mode 100644 index 000000000000..d39b461799f2 --- /dev/null +++ b/include/linux/vm_object.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VM_OBJECT_H +#define _VM_OBJECT_H + +#include +#include + +#ifdef CONFIG_GMEM +/* vm_object KAPI */ +static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, + unsigned long va) { return NULL; } +static inline void vm_object_mapping_create(struct vm_object *obj, + unsigned long start) { return 0; } +#endif + +#endif /* _VM_OBJECT_H */ -- Gitee From 7a4e896a7c0f8546879d1d583d3542b41fe9addf Mon Sep 17 00:00:00 2001 From: wangbin Date: Tue, 24 Jun 2025 19:45:30 +0800 Subject: [PATCH 05/27] mm: gmem: Introduce GMEM euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- The functions of the GMEM can be summarized as follows: First, the accelerator driver can directly reuse the core VM code of Linux by providing the MMU operation function required by the GMEM, and no independent memory management mechanism is required. Second, the GMEM can coordinate a page table between multiple heterogeneous MMUs, so as to implement memory coherence (memory coherence) between the CPU and the accelerator in a same address space. From a kernel point of view, the driver code for memory management with repetitive functions is greatly reduced. From the perspective of driver programming, the development and maintenance workload of driver code is greatly reduced. From the perspective of application development, the same address space greatly reduces programming complexity, while GMEM provides heterogeneous memory semantics to enhance flexibility and ease of use in performance tuning. To enable gmem, add "gmem=on" in kernel commandline. Co-developed-by: Yang Yanchao Signed-off-by: Yang Yanchao Co-developed-by: Ni Cunshu Signed-off-by: Ni Cunshu Co-developed-by: luochunsheng Signed-off-by: luochunsheng Co-developed-by: Weixi Zhu Signed-off-by: Weixi Zhu Signed-off-by: wangbin --- include/linux/gmem.h | 347 ++++++++++++++++ include/linux/gmem_as.h | 36 ++ include/linux/mm.h | 41 ++ include/linux/vm_object.h | 1 + mm/Makefile | 2 +- mm/gmem.c | 836 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 1262 insertions(+), 1 deletion(-) create mode 100644 include/linux/gmem.h create mode 100644 include/linux/gmem_as.h create mode 100644 mm/gmem.c diff --git a/include/linux/gmem.h b/include/linux/gmem.h new file mode 100644 index 000000000000..3216b55d659d --- /dev/null +++ b/include/linux/gmem.h @@ -0,0 +1,347 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ +#ifndef _GMEM_H +#define _GMEM_H + +#include + +struct hnode; + +/* + * enum gm_ret - The return value of GMEM KPI that can be used to tell + * the core VM or peripheral driver whether the GMEM KPI was + * executed successfully. + * + * @GM_RET_SUCCESS: The invoked GMEM KPI behaved as expected. + * @GM_RET_FAILURE_UNKNOWN: The GMEM KPI failed with unknown reason. + * Any external status related to this KPI invocation changes must be rolled back. 
+ */ +enum gm_ret { + GM_RET_SUCCESS = 0, + GM_RET_NOMEM, + GM_RET_PAGE_EXIST, + GM_RET_DMA_ERROR, + GM_RET_MIGRATING, + GM_RET_FAILURE_UNKNOWN, + GM_RET_UNIMPLEMENTED, +}; + +/* + * Defines a contiguous range of virtual addresses inside a struct gm_as + * As an analogy, this is conceptually similar as virtual_address_struct + */ +struct gm_region { + unsigned long start_va; + unsigned long end_va; + struct rb_node node; + struct gm_as *as; /* The address space that it belongs to */ + + /* Do we need another list_node to maintain a tailQ of allocated VMAs inside a gm_as? */ + struct list_head mapping_set_link; + + void (*callback_op)(void *args); + void *cb_args; +}; + +/* This holds a list of regions that must not be concurrently manipulated. */ +struct gm_mapping_set { + unsigned int region_cnt; + struct list_head gm_region_list; +}; + +/** + * enum gm_mmu_mode - defines the method to share a physical page table. + * + * @GM_MMU_MODE_SHARE: Literally share a physical page table with another + * attached device's MMU. Nothing is guaranteed about the allocated address. + * @GM_MMU_MODE_COHERENT_EXCLUSIVE: Maintain a coherent page table that holds + * exclusive mapping entries, so that device memory accesses can trigger fault-driven + * migration for automatic data locality optimizations. + * @GM_MMU_MODE_REPLICATE: Maintain a coherent page table that replicates physical + * mapping entries whenever a physical mapping is installed inside the address space, so + * that it may minimize the page faults to be triggered by this device. + */ +enum gm_mmu_mode { + GM_MMU_MODE_SHARE, + GM_MMU_MODE_COHERENT_EXCLUSIVE, + GM_MMU_MODE_REPLICATE, +}; + +/* + * This is the parameter list of peer_map/unmap mmu operations. + * if device should copy data to/from host, set copy and dma_addr + */ +struct gm_fault_t { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long va; + unsigned long size; + unsigned long prot; + bool copy; + dma_addr_t dma_addr; + int behavior; +}; + +struct gm_memcpy_t { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long src; + unsigned long dest; + dma_addr_t dma_addr; + size_t size; +}; + +/** + * + * This struct defines a series of MMU functions registered by a peripheral + * device that is to be invoked by GMEM. + * + * pmap is an opaque pointer that identifies a physical page table of a device. + * A physical page table holds the physical mappings that can be interpreted by + * the hardware MMU. + */ +struct gm_mmu { + /* + * Each bit indicates a supported page size for page-based TLB. + * Currently we do not consider range TLBs. + */ + unsigned long pgsize_bitmap; + + /* + * cookie identifies the type of the MMU. If two gm_mmu shares the same cookie, + * then it means their page table formats are compatible. + * In that case, they can share the same void *pmap as the input arg. + */ + unsigned long cookie; + + /* Synchronize VMA in a peer OS to interact with the host OS */ + enum gm_ret (*peer_va_alloc_fixed)(struct mm_struct *mm, unsigned long va, + unsigned long size, unsigned long prot); + enum gm_ret (*peer_va_free)(struct mm_struct *mm, unsigned long va, + unsigned long size); + + /* Create physical mappings on peer host. + * If copy is set, copy data [dma_addr, dma_addr + size] to peer host + */ + enum gm_ret (*peer_map)(struct gm_fault_t *gmf); + /* + * Destroy physical mappings on peer host. 
+ * If copy is set, copy data back to [dma_addr, dma_addr + size] + */ + enum gm_ret (*peer_unmap)(struct gm_fault_t *gmf); + + /* Create or destroy a device's physical page table. */ + enum gm_ret (*pmap_create)(struct gm_dev *dev, void **pmap); + enum gm_ret (*pmap_destroy)(void *pmap); + + /* Create or destroy a physical mapping of a created physical page table */ + enum gm_ret (*pmap_enter)(void *pmap, unsigned long va, unsigned long size, + unsigned long pa, unsigned long prot); + enum gm_ret (*pmap_release)(void *pmap, unsigned long va, unsigned long size); + + /* Change the protection of a virtual page */ + enum gm_ret (*pmap_protect)(void *pmap, unsigned long va, unsigned long size, + unsigned long new_prot); + + /* Invalidation functions of the MMU TLB */ + enum gm_ret (*tlb_invl)(void *pmap, unsigned long va, unsigned long size); + enum gm_ret (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); +}; + +/** + * unsigned long defines a composable flag to describe the capabilities of a device. + * + * @GM_DEV_CAP_REPLAYABLE: Memory accesses can be replayed to recover page faults. + * @GM_DEV_CAP_PEER: The device has its own VMA/PA management, controlled by another peer OS + */ +#define GM_DEV_CAP_REPLAYABLE 0x00000001 +#define GM_DEV_CAP_PEER 0x00000010 + +#define gm_dev_is_peer(dev) (((dev)->capability & GM_DEV_CAP_PEER) != 0) + +struct gm_context { + struct gm_as *as; + struct gm_dev *dev; + void *pmap; + /* + * consider a better container to maintain multiple ctx inside a device or multiple ctx + * inside a va space. + * A device may simultaneously have multiple contexts for time-sliced ctx switching + */ + struct list_head gm_dev_link; + + /* A va space may have multiple gm_context */ + struct list_head gm_as_link; +}; +#define get_gm_context(head) (list_entry((head)->prev, struct gm_context, ctx_link)) + +struct gm_dev { + int id; + + /* identifies the device capability + * For example, whether the device supports page faults or whether it has its + * own OS that manages the VA and PA resources. + */ + unsigned long capability; + struct gm_mmu *mmu; + void *dev_data; + /* + * TODO: Use a better container of struct gm_context to support time-sliced context switch. + * A collection of device contexts. If the device does not support time-sliced context + * switch, then the size of the collection should never be greater than one. + * We need to think about what operators should the container be optimized for. + * A list, a radix-tree or what? What would gm_dev_activate require? + * Are there any accelerators that are really going to support time-sliced context switch? + */ + struct gm_context *current_ctx; + + struct list_head gm_ctx_list; + + /* Add tracking of registered device local physical memory. */ + nodemask_t registered_hnodes; + struct device *dma_dev; + + struct gm_mapping *gm_mapping; +}; + +#define GM_PAGE_DIRTY 0x8 /* Whether the page is dirty */ +#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. 
*/ +#define GM_PAGE_DEVICE 0x20 +#define GM_PAGE_NOMAP 0x40 +#define GM_PAGE_PINNED 0x80 +#define GM_PAGE_WILLNEED 0x100 + +#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) + +/* Records the status of a page-size physical page */ +struct gm_mapping { + unsigned int flag; + + union { + struct page *page; /* CPU node */ + struct gm_dev *dev; /* hetero-node */ + unsigned long pfn; + }; + + struct mutex lock; +}; + +static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) +{ + if (flags & GM_PAGE_TYPE_MASK) + gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + + gm_mapping->flag |= flags; +} + +static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int flags) +{ + gm_mapping->flag &= ~flags; +} + +static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_CPU); +} + +static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_DEVICE); +} + +static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_NOMAP); +} + +static inline bool gm_mapping_willneed(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_WILLNEED); +} + +static inline bool gm_mapping_pinned(struct gm_mapping *gm_mapping) +{ + return !!(gm_mapping->flag & GM_PAGE_PINNED); +} + +#define test_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define set_gm_mapping_mapped_on_node(i) { /* implement this */ } +#define unset_gm_mapping_mapped_on_node(i) { /* implement this */ } + +/* GMEM Device KPI */ +extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev); +extern enum gm_ret gm_dev_destroy(struct gm_dev *dev); +extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); +extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); +extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, + unsigned long end); +enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + int behavior); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size); + +/* GMEM address space KPI */ +extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, + unsigned long end); +extern void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid); +extern struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order); +extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, + unsigned long cache_quantum, struct gm_as **new_as); +extern enum gm_ret gm_as_destroy(struct gm_as *as); +extern enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx); +extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned long size, + unsigned long align, unsigned long no_cross, unsigned long max_va, + struct gm_region **new_region); + +extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); + +enum gmem_stat_item { + NR_PAGE_MIGRATING, + NR_GMEM_STAT_ITEMS +}; + +extern void gmem_state_counter(enum gmem_stat_item item, int val); +extern void gmem_state_counter_show(void); + +/* h-NUMA topology */ +struct hnode { + unsigned int id; + + struct gm_dev *dev; + + struct xarray pages; +}; + +extern struct hnode *hnodes[]; + +static inline bool is_hnode(int node) +{ + return (node < 
MAX_NUMNODES) && !node_isset(node, node_possible_map) && + node_isset(node, hnode_map); +} + +static inline bool is_hnode_allowed(int node) +{ + return (node < MAX_NUMNODES) && is_hnode(node) && + node_isset(node, current->mems_allowed); +} + +static inline struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +void __init hnuma_init(void); +unsigned int alloc_hnode_id(void); +void free_hnode_id(unsigned int nid); +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); +void hnode_deinit(unsigned int hnid, struct gm_dev *dev); + +#endif /* _GMEM_H */ diff --git a/include/linux/gmem_as.h b/include/linux/gmem_as.h new file mode 100644 index 000000000000..d691de1162eb --- /dev/null +++ b/include/linux/gmem_as.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _GMEM_AS_H +#define _GMEM_AS_H + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. */ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of struct gm_as */ + struct rb_root rbroot; /*root of gm_region_t */ + enum gm_as_alloc policy; + unsigned long start_va; + unsigned long end_va; + /* defines the VA unit size if an object cache is applied */ + unsigned long cache_quantum; + /* tracks device contexts attached to this va space, using gm_as_link */ + struct list_head gm_ctx_list; +}; + +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f36bf9ee02f..da5d2b0ea066 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -342,6 +342,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) + +#ifdef CONFIG_GMEM +#define VM_PEER_SHARED BIT(56) +#else +#define VM_PEER_SHARED VM_NONE +#endif #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -576,6 +582,13 @@ struct vm_fault { KABI_RESERVE(3) }; +/* page entry size for vm->huge_fault() */ +enum page_entry_size { + PE_SIZE_PTE = 0, + PE_SIZE_PMD, + PE_SIZE_PUD, +}; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -3404,6 +3417,10 @@ unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +extern unsigned long get_unmapped_area_aligned(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags, unsigned long align); + extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); @@ -4213,4 +4230,28 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma); /* added to mm.h to avoid every caller adding new header file */ #include + +#ifdef CONFIG_GMEM +DECLARE_STATIC_KEY_FALSE(gmem_status); + +static inline bool 
gmem_is_enabled(void) +{ + return static_branch_likely(&gmem_status); +} + +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + if (!gmem_is_enabled()) + return false; + + return !!(vma->vm_flags & VM_PEER_SHARED); +} +#else +static inline bool gmem_is_enabled(void) { return false; } +static inline bool vma_is_peer_shared(struct vm_area_struct *vma) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index d39b461799f2..083a1278901a 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -7,6 +7,7 @@ #ifdef CONFIG_GMEM /* vm_object KAPI */ +static inline int __init vm_object_init(void) { return 0; } static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) { return NULL; } static inline void vm_object_mapping_create(struct vm_object *obj, diff --git a/mm/Makefile b/mm/Makefile index 08fcaca0d8cd..5f9b94bdc5ae 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o - +mmu-$(CONFIG_GMEM) += gmem.o ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/gmem.c b/mm/gmem.c new file mode 100644 index 000000000000..add5062296b6 --- /dev/null +++ b/mm/gmem.c @@ -0,0 +1,836 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi Zhu + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(gmem_status); +EXPORT_SYMBOL_GPL(gmem_status); + +static struct kmem_cache *gm_as_cache; +static struct kmem_cache *gm_dev_cache; +static struct kmem_cache *gm_ctx_cache; +static struct kmem_cache *gm_region_cache; +static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); + +static bool enable_gmem; + +static inline unsigned long pe_mask(unsigned int order) +{ + if (order == 0) + return PAGE_MASK; + if (order == PMD_ORDER) + return HPAGE_PMD_MASK; + if (order == PUD_ORDER) + return HPAGE_PUD_MASK; + return ~0; +} + +static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; + +void gmem_state_counter(enum gmem_stat_item item, int val) +{ + if (!gmem_is_enabled()) + return; + + if (WARN_ON_ONCE(unlikely(item >= NR_GMEM_STAT_ITEMS))) + return; + + percpu_counter_add(&g_gmem_stats[item], val); +} + +static int gmem_stat_init(void) +{ + int i, rc; + + for (i = 0; i < NR_GMEM_STAT_ITEMS; i++) { + rc = percpu_counter_init(&g_gmem_stats[i], 0, GFP_KERNEL); + if (rc) { + for (i--; i >= 0; i--) + percpu_counter_destroy(&g_gmem_stats[i]); + + break; /* break the initialization process */ + } + } + + return rc; +} + +#ifdef CONFIG_PROC_FS +static int gmemstat_show(struct seq_file *m, void *arg) +{ + if (!gmem_is_enabled()) + return 0; + + seq_printf( + m, "migrating : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING])); + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static struct workqueue_struct *prefetch_wq; + +#define GM_WORK_CONCURRENCY 4 + +static int __init gmem_init(void) +{ + int err = -ENOMEM; + + if (!enable_gmem) + return 0; + + gm_as_cache = 
KMEM_CACHE(gm_as, 0); + if (!gm_as_cache) + goto out; + + gm_dev_cache = KMEM_CACHE(gm_dev, 0); + if (!gm_dev_cache) + goto free_as; + + gm_ctx_cache = KMEM_CACHE(gm_context, 0); + if (!gm_ctx_cache) + goto free_dev; + + gm_region_cache = KMEM_CACHE(gm_region, 0); + if (!gm_region_cache) + goto free_ctx; + + err = vm_object_init(); + if (err) + goto free_ctx; + + err = gmem_stat_init(); + if (err) + goto free_ctx; + + prefetch_wq = alloc_workqueue("prefetch", + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | + WQ_CPU_INTENSIVE, + GM_WORK_CONCURRENCY); + if (!prefetch_wq) { + pr_info("fail to alloc workqueue prefetch_wq\n"); + err = -EFAULT; + goto free_ctx; + } + +#ifdef CONFIG_PROC_FS + proc_create_single("gmemstat", 0444, NULL, gmemstat_show); +#endif + + static_branch_enable(&gmem_status); + + return 0; + +free_ctx: + kmem_cache_destroy(gm_ctx_cache); +free_dev: + kmem_cache_destroy(gm_dev_cache); +free_as: + kmem_cache_destroy(gm_as_cache); +out: + return -ENOMEM; +} +subsys_initcall(gmem_init); + +static int __init setup_gmem(char *str) +{ + strtobool(str, &enable_gmem); + + return 1; +} +__setup("gmem=", setup_gmem); + +/* + * Create a GMEM device, register its MMU function and the page table. + * The returned device pointer will be passed by new_dev. + * A unique id will be assigned to the GMEM device, using Linux's xarray. + */ +gm_ret_t gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, + struct gm_dev **new_dev) +{ + struct gm_dev *dev; + + if (!gmem_is_enabled()) + return GM_RET_FAILURE_UNKNOWN; + + dev = kmem_cache_alloc(gm_dev_cache, GFP_KERNEL); + if (!dev) + return GM_RET_NOMEM; + + if (xa_alloc(&gm_dev_id_pool, &dev->id, dev, xa_limit_32b, + GFP_KERNEL)) { + kmem_cache_free(gm_dev_cache, dev); + return GM_RET_NOMEM; + } + + dev->capability = cap; + dev->mmu = mmu; + dev->dev_data = dev_data; + dev->current_ctx = NULL; + INIT_LIST_HEAD(&dev->gm_ctx_list); + *new_dev = dev; + nodes_clear(dev->registered_hnodes); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_create); + +// Destroy a GMEM device and reclaim the resources. 
+gm_ret_t gm_dev_destroy(struct gm_dev *dev) +{ + // TODO: implement it + xa_erase(&gm_dev_id_pool, dev->id); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_dev_destroy); + +/* Handle the page fault triggered by a given device */ +gm_ret_t gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + int behavior) +{ + gm_ret_t ret = GM_RET_SUCCESS; + struct gm_mmu *mmu = dev->mmu; + struct device *dma_dev = dev->dma_dev; + struct vm_area_struct *vma; + vm_object_t *obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_fault_t gmf = { .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior }; + struct page *page = NULL; + + mmap_read_lock(mm); + + vma = find_vma(mm, addr); + if (!vma) { + pr_info("gmem: %s no vma\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + obj = vma->vm_obj; + if (!obj) { + pr_info("gmem: %s no vm_obj\n", __func__); + ret = GM_RET_FAILURE_UNKNOWN; + goto mmap_unlock; + } + + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + vm_object_mapping_create(obj, addr); + gm_mapping = vm_object_lookup(obj, addr); + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + goto peer_map; + } else if (gm_mapping_device(gm_mapping)) { + if (behavior == MADV_WILLNEED || behavior == MADV_PINNED) { + goto peer_map; + } else { + ret = 0; + goto unlock; + } + } else if (gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (!page) { + pr_err("gmem: host gm_mapping page is NULL. Set nomap\n"); + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + goto unlock; + } + get_page(page); + zap_page_range_single(vma, addr, size, NULL); + gmf.dma_addr = + dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) + pr_info("gmem: dma map failed\n"); + + gmf.copy = true; + } + +peer_map: + ret = mmu->peer_map(&gmf); + if (ret != GM_RET_SUCCESS) { + if (ret == GM_RET_MIGRATING) { + /* + * gmem page is migrating due to overcommit. 
+ * update page to willneed and this will stop page evicting + */ + gm_mapping_flags_set(gm_mapping, GM_PAGE_WILLNEED); + gmem_state_counter(NR_PAGE_MIGRATING, 1); + ret = GM_RET_SUCCESS; + } else { + pr_err("gmem: peer map failed\n"); + if (page) { + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + put_page(page); + } + } + goto unlock; + } + + if (page) { + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + put_page(page); + } + + gm_mapping_flags_set(gm_mapping, GM_PAGE_DEVICE); + gm_mapping->dev = dev; +unlock: + mutex_unlock(&gm_mapping->lock); +mmap_unlock: + mmap_read_unlock(mm); + return ret; +} +EXPORT_SYMBOL_GPL(gm_dev_fault); + +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, + unsigned int order) +{ + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address & pe_mask(order); + vm_object_t *obj = vma->vm_obj; + struct gm_mapping *gm_mapping; + unsigned long size = HPAGE_SIZE; + struct gm_dev *dev; + struct device *dma_dev; + struct gm_fault_t gmf = { + .mm = vma->vm_mm, + .va = addr, + .size = size, + .copy = true, + }; + + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + pr_err("gmem: host fault gm_mapping should not be NULL\n"); + return VM_FAULT_SIGBUS; + } + + dev = gm_mapping->dev; + gmf.dev = dev; + dma_dev = dev->dma_dev; + gmf.dma_addr = + dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + pr_err("gmem: host fault dma mapping error\n"); + return VM_FAULT_SIGBUS; + } + if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { + pr_err("gmem: peer unmap failed\n"); + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return VM_FAULT_SIGBUS; + } + + dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + return ret; +} + +/* + * Register the local physical memory of a gmem device. + * This implies dynamically creating + * the struct page data structures. 
+ */ +gm_ret_t gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) +{ + struct gm_mapping *mapping; + unsigned long addr = PAGE_ALIGN(begin); + unsigned int nid; + int i, page_num = (end - addr) >> PAGE_SHIFT; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + + if (!hnode) + goto err; + + nid = alloc_hnode_id(); + if (nid == MAX_NUMNODES) + goto free_hnode; + hnode_init(hnode, nid, dev); + + mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); + if (!mapping) + goto deinit_hnode; + + for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { + mapping[i].pfn = addr >> PAGE_SHIFT; + mapping[i].flag = 0; + } + + xa_lock(&hnode->pages); + for (i = 0; i < page_num; i++) { + if (xa_err(__xa_store(&hnode->pages, i, mapping + i, + GFP_KERNEL))) { + /* Probably nomem */ + kvfree(mapping); + xa_unlock(&hnode->pages); + goto deinit_hnode; + } + __xa_set_mark(&hnode->pages, i, XA_MARK_0); + } + xa_unlock(&hnode->pages); + + return GM_RET_SUCCESS; + +deinit_hnode: + hnode_deinit(nid, dev); + free_hnode_id(nid); +free_hnode: + kfree(hnode); +err: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(gm_dev_register_physmem); + +void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid) +{ + struct hnode *hnode = get_hnode(nid); + struct gm_mapping *mapping = xa_load(&hnode->pages, 0); + + kvfree(mapping); + hnode_deinit(nid, dev); + free_hnode_id(nid); + kfree(hnode); +} +EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); + +struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order) +{ + struct gm_mapping *mapping; + struct hnode *node = get_hnode(nid); + XA_STATE(xas, &node->pages, 0); + + /* TODO: support order > 0 */ + if (order != 0) + return ERR_PTR(-EINVAL); + + xa_lock(&node->pages); + mapping = xas_find_marked(&xas, ULONG_MAX, XA_MARK_0); + if (!mapping) { + xa_unlock(&node->pages); + return ERR_PTR(-ENOMEM); + } + + xas_clear_mark(&xas, XA_MARK_0); + xa_unlock(&node->pages); + + return mapping; +} +EXPORT_SYMBOL_GPL(gm_mappings_alloc); + +/* GMEM Virtual Address Space API */ +gm_ret_t gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, + unsigned long cache_quantum, struct gm_as **new_as) +{ + struct gm_as *as; + + if (!new_as) + return -EINVAL; + + as = kmem_cache_alloc(gm_as_cache, GFP_ATOMIC); + if (!as) + return -ENOMEM; + + spin_lock_init(&as->rbtree_lock); + as->rbroot = RB_ROOT; + as->start_va = begin; + as->end_va = end; + as->policy = policy; + + INIT_LIST_HEAD(&as->gm_ctx_list); + + *new_as = as; + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_create); + +gm_ret_t gm_as_destroy(struct gm_as *as) +{ + struct gm_context *ctx, *tmp_ctx; + + list_for_each_entry_safe(ctx, tmp_ctx, &as->gm_ctx_list, gm_as_link) + kfree(ctx); + + kmem_cache_free(gm_as_cache, as); + + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_destroy); + +gm_ret_t gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, + bool activate, struct gm_context **out_ctx) +{ + struct gm_context *ctx; + int nid; + int ret; + + ctx = kmem_cache_alloc(gm_ctx_cache, GFP_KERNEL); + if (!ctx) + return GM_RET_NOMEM; + + ctx->as = as; + ctx->dev = dev; + ctx->pmap = NULL; + ret = dev->mmu->pmap_create(dev, &ctx->pmap); + if (ret) { + kmem_cache_free(gm_ctx_cache, ctx); + return ret; + } + + INIT_LIST_HEAD(&ctx->gm_dev_link); + INIT_LIST_HEAD(&ctx->gm_as_link); + list_add_tail(&dev->gm_ctx_list, &ctx->gm_dev_link); + list_add_tail(&ctx->gm_as_link, &as->gm_ctx_list); + + if (activate) { + /* + * Here we should 
really have a callback function to perform the context switch + * for the hardware. E.g. in x86 this function is effectively + * flushing the CR3 value. Currently we do not care time-sliced context switch, + * unless someone wants to support it. + */ + dev->current_ctx = ctx; + } + *out_ctx = ctx; + + /* + * gm_as_attach will be used to attach device to process address space. + * Handle this case and add hnodes registered by device to process mems_allowed. + */ + for_each_node_mask(nid, dev->registered_hnodes) + node_set(nid, current->mems_allowed); + return GM_RET_SUCCESS; +} +EXPORT_SYMBOL_GPL(gm_as_attach); + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +void __init hnuma_init(void) +{ + unsigned int node; + + for_each_node(node) + node_set(node, hnode_map); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + spin_unlock(&hnode_lock); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid] = hnode; + hnodes[hnid]->id = hnid; + hnodes[hnid]->dev = dev; + node_set(hnid, dev->registered_hnodes); + xa_init(&hnodes[hnid]->pages); +} + +void hnode_deinit(unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + xa_destroy(&hnodes[hnid]->pages); + hnodes[hnid] = NULL; +} + +struct prefetch_data { + struct mm_struct *mm; + struct gm_dev *dev; + unsigned long addr; + size_t size; + struct work_struct work; + int *res; +}; + +static void prefetch_work_cb(struct work_struct *work) +{ + struct prefetch_data *d = + container_of(work, struct prefetch_data, work); + unsigned long addr = d->addr, end = d->addr + d->size; + int page_size = HPAGE_SIZE; + int ret; + + do { + /* MADV_WILLNEED: dev will soon access this addr. */ + ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); + if (ret == GM_RET_PAGE_EXIST) { + pr_info("%s: device has done page fault, ignore prefetch\n", + __func__); + } else if (ret != GM_RET_SUCCESS) { + *d->res = -EFAULT; + pr_err("%s: call dev fault error %d\n", __func__, ret); + } + } while (addr += page_size, addr != end); + + kfree(d); +} + +static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size) +{ + unsigned long start, end, per_size; + int page_size = HPAGE_SIZE; + struct prefetch_data *data; + struct vm_area_struct *vma; + int res = GM_RET_SUCCESS; + unsigned long old_start; + + /* overflow */ + if (check_add_overflow(addr, size, &end)) + return -EINVAL; + + old_start = end; + + /* Align addr by rounding outward to make page cover addr. 
*/ + end = round_up(end, page_size); + start = round_down(addr, page_size); + size = end - start; + + if (!end && old_start) + return -EINVAL; + + if (size == 0) + return 0; + + mmap_read_lock(current->mm); + vma = find_vma(current->mm, start); + if (!vma || start < vma->vm_start || end > vma->vm_end) { + mmap_read_unlock(current->mm); + return GM_RET_FAILURE_UNKNOWN; + } + mmap_read_unlock(current->mm); + + per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); + + while (start < end) { + data = kzalloc(sizeof(struct prefetch_data), GFP_KERNEL); + if (!data) { + flush_workqueue(prefetch_wq); + return GM_RET_NOMEM; + } + + INIT_WORK(&data->work, prefetch_work_cb); + data->mm = current->mm; + data->dev = dev; + data->addr = start; + data->res = &res; + if (per_size == 0) + data->size = size; + else + /* Process (1.x * per_size) for the last time */ + data->size = (end - start < 2 * per_size) ? + (end - start) : + per_size; + queue_work(prefetch_wq, &data->work); + start += data->size; + } + + flush_workqueue(prefetch_wq); + return res; +} + +static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int page_size) +{ + struct gm_fault_t gmf = { + .mm = current->mm, + .size = page_size, + .copy = false, + }; + struct gm_mapping *gm_mapping; + vm_object_t *obj; + int ret; + + obj = vma->vm_obj; + if (!obj) { + pr_err("gmem: peer-shared vma should have vm_object\n"); + return -EINVAL; + } + + for (; start < end; start += page_size) { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, start); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + mutex_lock(&gm_mapping->lock); + if (gm_mapping_nomap(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } else if (gm_mapping_cpu(gm_mapping)) { + zap_page_range_single(vma, start, page_size, NULL); + } else { + gmf.va = start; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret) { + pr_err("gmem: peer_unmap failed. ret %d\n", + ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + } + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + mutex_unlock(&gm_mapping->lock); + } + + return 0; +} + +static int hmadvise_do_eagerfree(unsigned long addr, size_t size) +{ + unsigned long start, end, i_start, i_end; + int page_size = HPAGE_SIZE; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + unsigned long old_start; + + /* overflow */ + if (check_add_overflow(addr, size, &end)) + return -EINVAL; + + old_start = addr; + + /* Align addr by rounding inward to avoid excessive page release. */ + end = round_down(end, page_size); + start = round_up(addr, page_size); + if (start >= end) + return ret; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (old_start && !start) + return -EINVAL; + + mmap_read_lock(current->mm); + do { + vma = find_vma_intersection(current->mm, start, end); + if (!vma) { + pr_info("gmem: there is no valid vma\n"); + break; + } + + if (!vma_is_peer_shared(vma)) { + pr_debug("gmem: not peer-shared vma, skip dontneed\n"); + start = vma->vm_end; + continue; + } + + i_start = start > vma->vm_start ? start : vma->vm_start; + i_end = end < vma->vm_end ? 
end : vma->vm_end; + ret = gmem_unmap_vma_pages(vma, i_start, i_end, page_size); + if (ret) + break; + + start = vma->vm_end; + } while (start < end); + + mmap_read_unlock(current->mm); + return ret; +} + +static bool check_hmadvise_behavior(int behavior) +{ + return behavior == MADV_DONTNEED; +} + +int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) +{ + int error = -EINVAL; + struct hnode *node; + + if (hnid == -1) { + if (check_hmadvise_behavior(behavior)) { + goto no_hnid; + } else { + pr_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); + return error; + } + } + + if (hnid < 0) + return error; + + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) + return error; + + node = get_hnode(hnid); + if (!node) { + pr_err("hmadvise: hnode id %d is invalid\n", hnid); + return error; + } + +no_hnid: + switch (behavior) { + case MADV_PREFETCH: + return hmadvise_do_prefetch(node->dev, start, len_in); + case MADV_DONTNEED: + return hmadvise_do_eagerfree(start, len_in); + default: + pr_err("hmadvise: unsupported behavior %d\n", behavior); + } + + return error; +} +EXPORT_SYMBOL_GPL(hmadvise_inner); -- Gitee From 6274abb2ca432bfc38f7e418687e7925927ee75f Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Tue, 24 Jun 2025 19:49:28 +0800 Subject: [PATCH 06/27] mm: gmem: Add gm_dev in struct device euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Add gm_dev in struct device to keep track on gmem device. Co-developed-by: Jiangtian Feng Signed-off-by: Jiangtian Feng Co-developed-by: luochunsheng Signed-off-by: luochunsheng Signed-off-by: Chen Jun --- include/linux/device.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index 54a4967c496c..94262735406a 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -826,7 +826,13 @@ struct device { KABI_RESERVE(2) KABI_RESERVE(3) #endif + +#ifdef CONFIG_GMEM + KABI_USE(4, void *gm_dev) +#else KABI_RESERVE(4) +#endif + KABI_RESERVE(5) KABI_RESERVE(6) KABI_RESERVE(7) -- Gitee From cd65e9f54eef24ee0e5dfe5d83467fc9ba11499a Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Wed, 25 Jun 2025 10:36:28 +0800 Subject: [PATCH 07/27] mm: gmem: Introduce vm_object for gmem euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Introduce vm_object for gmem. 
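For reference, the lookup-or-create pattern that the fault paths in this patch follow (vma_prepare_gm_mapping() in huge_memory.c below, mirroring gm_dev_fault() from the earlier GMEM patch) condenses to roughly the sketch below; error handling is trimmed and the wrapper name is illustrative, not part of the series.

#include <linux/mm.h>
#include <linux/vm_object.h>

/* Condensed illustration of the peer-shared fault-path pattern. */
static struct gm_mapping *peer_shared_mapping(struct vm_area_struct *vma,
					      unsigned long addr)
{
	struct gm_mapping *gm_mapping;

	/* Only peer-shared VMAs carry a vm_object. */
	if (!vma_is_peer_shared(vma))
		return NULL;

	xa_lock(vma->vm_obj->logical_page_table);
	gm_mapping = vm_object_lookup(vma->vm_obj, addr);
	if (!gm_mapping) {
		vm_object_mapping_create(vma->vm_obj, addr);
		gm_mapping = vm_object_lookup(vma->vm_obj, addr);
	}
	xa_unlock(vma->vm_obj->logical_page_table);

	return gm_mapping;
}

The gm_mapping returned here is what later carries either the host page or the owning gm_dev, so the CPU and device fault handlers serialize on its mutex.
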
Co-developed-by: fangchuangchuang Signed-off-by: fangchuangchuang Co-developed-by: Lemmy Huang Signed-off-by: Lemmy Huang Signed-off-by: Liu Chao --- include/linux/vm_object.h | 25 ++- include/uapi/asm-generic/mman-common.h | 2 + kernel/fork.c | 12 ++ mm/Makefile | 2 +- mm/gmem.c | 6 +- mm/huge_memory.c | 149 +++++++++++++- mm/memory.c | 82 ++++++-- mm/mempolicy.c | 4 + mm/mmap.c | 261 ++++++++++++++++++++++++- mm/vm_object.c | 228 +++++++++++++++++++++ 10 files changed, 745 insertions(+), 26 deletions(-) create mode 100644 mm/vm_object.c diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index 083a1278901a..e5327665b6b7 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -6,12 +6,31 @@ #include #ifdef CONFIG_GMEM -/* vm_object KAPI */ -static inline int __init vm_object_init(void) { return 0; } +/* vm_object KPI */ +int __init vm_object_init(void); +struct vm_object *vm_object_create(struct vm_area_struct *vma); +void vm_object_drop_locked(struct vm_area_struct *vma); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end); + +gm_mapping_t *alloc_gm_mapping(void); +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); +void vm_object_mapping_create(struct vm_object *obj, unsigned long start); +void free_gm_mappings(struct vm_area_struct *vma); +#else +static inline void __init vm_object_init(void) {} +static inline struct vm_object *vm_object_create(struct vm_area_struct *vma) { return NULL; } +static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end) {} + +static inline gm_mapping_t *alloc_gm_mapping(void) { return NULL; } static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) { return NULL; } static inline void vm_object_mapping_create(struct vm_object *obj, - unsigned long start) { return 0; } + unsigned long start) {} +static inline void free_gm_mappings(struct vm_area_struct *vma) {} #endif #endif /* _VM_OBJECT_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 5bd675448f53..cdcb59fbfe7f 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,6 +33,8 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ +#define MAP_PEER_SHARED 0x8000000 + /* * Flags for mlock */ diff --git a/kernel/fork.c b/kernel/fork.c index 78663ca68160..d984d93b3d39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -99,6 +99,11 @@ #include #include #include + +#ifdef CONFIG_GMEM +#include +#endif + #ifdef CONFIG_QOS_SCHED_SMART_GRID #include #endif @@ -526,6 +531,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(orig)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + new->vm_obj = vm_object_create(new); + } +#endif + return new; } diff --git a/mm/Makefile b/mm/Makefile index 5f9b94bdc5ae..e4aa8e1cd329 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o -mmu-$(CONFIG_GMEM) += gmem.o +mmu-$(CONFIG_GMEM) += gmem.o vm_object.o ifdef 
CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/gmem.c b/mm/gmem.c index add5062296b6..ebf6a93bc33a 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -231,7 +231,7 @@ gm_ret_t gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *d struct gm_mmu *mmu = dev->mmu; struct device *dma_dev = dev->dma_dev; struct vm_area_struct *vma; - vm_object_t *obj; + struct vm_object *obj; struct gm_mapping *gm_mapping; unsigned long size = HPAGE_SIZE; struct gm_fault_t gmf = { .mm = mm, @@ -334,7 +334,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, vm_fault_t ret = 0; struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & pe_mask(order); - vm_object_t *obj = vma->vm_obj; + struct vm_object *obj = vma->vm_obj; struct gm_mapping *gm_mapping; unsigned long size = HPAGE_SIZE; struct gm_dev *dev; @@ -697,7 +697,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, .copy = false, }; struct gm_mapping *gm_mapping; - vm_object_t *obj; + struct vm_object *obj; int ret; obj = vma->vm_obj; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a28dda799978..1070688be9d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -38,6 +38,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -1337,6 +1341,47 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); + +static struct folio *vma_alloc_peer_shared_folio_pmd(struct vm_area_struct *vma, + unsigned long haddr, gm_mapping_t *gm_mapping) +{ + struct folio *folio; + gfp_t gfp = GFP_TRANSHUGE; + + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + return NULL; + } + + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { + folio_put(folio); + count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + return NULL; + } + folio_throttle_swaprate(folio, gfp); + + /* + * gmem device overcommit needs to reload the swapped page, + * so skip it to avoid clearing device data. + */ + if (!gm_mapping_cpu(gm_mapping)) + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); + + /* + * The memory barrier inside __folio_mark_uptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. 
+ */ + __folio_mark_uptodate(folio); + return folio; +} + static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, unsigned long addr) { @@ -1344,6 +1389,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, const int order = HPAGE_PMD_ORDER; struct folio *folio; +#ifdef CONFIG_GMEM + /* always try to compact hugepage for peer shared vma */ + if (vma_is_peer_shared(vma)) + gfp = GFP_TRANSHUGE; +#endif + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); if (unlikely(!folio)) { @@ -1391,6 +1442,95 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } +struct gm_mapping *vma_prepare_gm_mapping(struct vm_area_struct *vma, unsigned long haddr) +{ + struct gm_mapping *gm_mapping; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + if (!gm_mapping) { + vm_object_mapping_create(vma->vm_obj, haddr); + gm_mapping = vm_object_lookup(vma->vm_obj, haddr); + } + xa_unlock(vma->vm_obj->logical_page_table); + + return gm_mapping; +} + +static vm_fault_t __do_peer_shared_anonymous_page(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct folio *folio = NULL; + bool is_new_folio = false; + pgtable_t pgtable = NULL; + struct gm_mapping *gm_mapping; + vm_fault_t ret = 0; + + gm_mapping = vma_prepare_gm_mapping(vma, haddr); + if (!gm_mapping) + return VM_FAULT_OOM; + + mutex_lock(&gm_mapping->lock); + + if (gm_mapping_cpu(gm_mapping)) + folio = page_folio(gm_mapping->page); + if (!folio) { + folio = vma_alloc_anon_folio_pmd(vma, haddr); + is_new_folio = true; + } + + if (unlikely(!folio)) { + ret = VM_FAULT_FALLBACK; + goto release; + } + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } + + /** + * if page is mapped in device, release device mapping and + * deliver the page content to host. 
+ */ + if (gm_mapping_device(gm_mapping)) { + vmf->page = &folio->page; + ret = gm_host_fault_locked(vmf, PMD_ORDER); + if (ret) + goto release; + } + + /* map page in pgtable */ + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + + BUG_ON(!pmd_none(*vmf->pmd)); + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(vmf->ptl); + + /* finally setup cpu mapping */ + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + gm_mapping->page = &folio->page; + mutex_unlock(&gm_mapping->lock); + + return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + if (is_new_folio) + folio_put(folio); + mutex_unlock(&gm_mapping->lock); + return ret; +} + static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) { unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -1424,7 +1564,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + goto gm_mapping_release; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); @@ -1496,16 +1636,17 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - vm_fault_t ret; - if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; ret = vmf_anon_prepare(vmf); if (ret) return ret; + khugepaged_enter_vma(vma, vma->vm_flags); + if (vma_is_peer_shared(vma)) + return __do_peer_shared_anonymous_page(vmf); + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { diff --git a/mm/memory.c b/mm/memory.c index 4bb3acfc3dd9..9aa4d8174724 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -1710,6 +1714,47 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, return addr; } +#ifdef CONFIG_GMEM +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + gm_mapping_t *gm_mapping = NULL; + struct page *page = NULL; + + xa_lock(vma->vm_obj->logical_page_table); + gm_mapping = vm_object_lookup(vma->vm_obj, addr); + + if (gm_mapping && gm_mapping_cpu(gm_mapping)) { + page = gm_mapping->page; + if (page && (page_ref_count(page) != 0)) { + put_page(page); + gm_mapping->page = NULL; + } + } + xa_unlock(vma->vm_obj->logical_page_table); +} + +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) +{ + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + zap_logic_pmd_range(vma, addr, next); + } while (addr = next, addr != end); +} +#else +static inline void zap_logic_pmd_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +static inline void zap_logic_pud_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long end) {} +#endif + static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, @@ -1724,10 +1769,8 @@ static inline unsigned long zap_pmd_range(struct 
mmu_gather *tlb, if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); - else if (zap_huge_pmd(tlb, vma, pmd, addr)) { - addr = next; - continue; - } + else if (zap_huge_pmd(tlb, vma, pmd, addr)) + goto next; /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && @@ -1740,18 +1783,30 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } - if (pmd_none(*pmd)) { - addr = next; - continue; + + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_lock in read + * mode. + */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pmd_range(vma, addr, next); + goto next; } - addr = zap_pte_range(tlb, vma, pmd, addr, next, details); - if (addr != next) - pmd--; - } while (pmd++, cond_resched(), addr != end); + + next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: + cond_resched(); + } while (pmd++, addr = next, addr != end); return addr; } + + static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, @@ -1813,8 +1868,11 @@ void unmap_page_range(struct mmu_gather *tlb, pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a82aab7ab47a..fb11f24b6685 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1902,7 +1902,11 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) +#else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) +#endif return false; /* diff --git a/mm/mmap.c b/mm/mmap.c index fb54df419ea2..1ea24d8f2fd0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,10 @@ #include #include #include +#ifdef CONFIG_GMEM +#include +#endif + #include #include @@ -644,6 +648,10 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. 
*/ if (src->anon_vma && !dst->anon_vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(dst)) + dup_vm_object(dst, src); +#endif int ret; vma_assert_write_locked(dst); @@ -760,6 +768,39 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_GMEM +struct gmem_vma_list { + struct vm_area_struct *vma; + struct list_head list; +}; + +void gmem_reserve_vma(struct vm_area_struct *value, struct list_head *head) +{ + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + if (!node) + return; + + node->vma = value; + list_add_tail(&node->list, head); +} + +void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + struct vm_area_struct *vma = node->vma; + + if (vma != NULL) + vm_area_free(vma); + + list_del(&node->list); + kfree(node); + } +} +#endif + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates @@ -1082,6 +1123,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_iter_store(vmi, vma); if (adj_start) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(adjust)) + vm_object_adjust(adjust, adjust->vm_start + adj_start, + adjust->vm_end); +#endif adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { @@ -1316,7 +1362,17 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + len = round_up(len, SZ_2M); + addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, + SZ_2M); + } else { + addr = get_unmapped_area(file, addr, len, pgoff, flags); + } +#else addr = get_unmapped_area(file, addr, len, pgoff, flags); +#endif if (IS_ERR_VALUE(addr)) return addr; @@ -1439,6 +1495,10 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) + vm_flags |= VM_PEER_SHARED; +#endif addr = __mmap_region_ext(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && @@ -1447,6 +1507,7 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon *populate = len; return addr; } +EXPORT_SYMBOL(__do_mmap_mm); unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -1933,6 +1994,27 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +unsigned long +get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, unsigned long align) +{ + if (len > TASK_SIZE) + return -ENOMEM; + + addr = current->mm->get_unmapped_area(file, addr, len + align, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + addr = round_up(addr, align); + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (!IS_ALIGNED(addr, PMD_SIZE)) + return -EINVAL; + + return addr; +} +EXPORT_SYMBOL(get_unmapped_area_aligned); + /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. 
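The helper above deliberately searches for len + align bytes and only then rounds the result up, so a 2MB-aligned start is always left with the full len inside the window that the search returned. A standalone sketch of that arithmetic (user-space C with illustrative constants, not kernel code):

#include <assert.h>
#include <stdint.h>

#define SZ_2M (2UL << 20)

/* round up to a power-of-two alignment, as the helper above does */
static uint64_t round_up_p2(uint64_t x, uint64_t align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	uint64_t len     = 3 * SZ_2M;           /* request, already rounded to 2MB */
	uint64_t start   = 0x7f1234567000ULL;   /* unaligned hit from the area search */
	uint64_t end     = start + len + SZ_2M; /* the search asked for len + align bytes */
	uint64_t aligned = round_up_p2(start, SZ_2M);

	/* the extra SZ_2M of slack guarantees len bytes still fit after rounding */
	assert(aligned + len <= end);
	assert((aligned & (SZ_2M - 1)) == 0);
	return 0;
}

Because every MAP_PEER_SHARED mapping starts on a 2MB boundary, the munmap path later in this patch can insist on PMD-aligned starts and round lengths up to 2MB.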
@@ -2472,6 +2554,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_mpol; +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + dup_vm_object(new, vma); +#endif + if (new->vm_file) get_file(new->vm_file); @@ -2486,6 +2573,18 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + if (new_below) { + vm_object_adjust(new, new->vm_start, addr); + vm_object_adjust(vma, addr, vma->vm_end); + } else { + vm_object_adjust(vma, vma->vm_start, addr); + vm_object_adjust(new, addr, new->vm_end); + } + } +#endif + if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; @@ -2523,6 +2622,68 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +#ifdef CONFIG_GMEM +static void munmap_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + unsigned long addr = start; + struct vm_object *obj = vma->vm_obj; + gm_ret_t ret; + gm_context_t *ctx, *tmp; + gm_mapping_t *gm_mapping; + + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + if (!obj) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + gmf.va = addr; + gmf.size = HPAGE_SIZE; + gmf.dev = gm_mapping->dev; + ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + if (ret != GM_RET_SUCCESS) { + pr_err("%s: call dev peer_unmap error %d\n", __func__, ret); + mutex_unlock(&gm_mapping->lock); + continue; + } + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); + + if (!mm->gm_as) + return; + + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + if (!ctx->dev->mmu->peer_va_free) + continue; + + ret = ctx->dev->mmu->peer_va_free(mm, start, end - start); + if (ret != GM_RET_SUCCESS) + pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", + start, end - start, ret); + } +} +#endif + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. 
* @vmi: The vma iterator @@ -2653,6 +2814,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, prev = vma_iter_prev_range(vmi); next = vma_next(vmi); +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + munmap_in_peer_devices(mm, vma, start, end); +#endif if (next) vma_iter_prev_range(vmi); @@ -2711,6 +2876,17 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; + if (gmem_is_enabled()) { + vma = find_vma_intersection(mm, start, start + len); + if (!vma) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + + len = round_up(len, SZ_2M); + } + } if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2748,6 +2924,48 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } +#ifdef CONFIG_GMEM +static int alloc_va_in_peer_devices(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr, unsigned long len, + vm_flags_t vm_flags) +{ + gm_context_t *ctx, *tmp; + gm_prot_t prot = VM_NONE; + gm_ret_t ret; + + pr_debug("gmem: start mmap, as %p\n", mm->gm_as); + if (!mm->gm_as) + return -ENODEV; + + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + /* + * TODO: consider the concurrency problem of device + * attaching/detaching from the gm_as. + */ + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + pr_debug("gmem: call vma_alloc\n"); + ret = ctx->dev->mmu->peer_va_alloc_fixed(mm, addr, len, vm_flags); + if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + return ret; + } + } + + return GM_RET_SUCCESS; +} +#endif + static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, @@ -2762,7 +2980,12 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); +retry: +#endif /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -2774,21 +2997,33 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) + (len >> PAGE_SHIFT) - nr_pages)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } } /* Unmap any existing mapping in the area */ - if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) + if (security_vm_enough_memory_mm(mm, charged)) { +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return -ENOMEM; + } vm_flags |= VM_ACCOUNT; } @@ -2931,6 +3166,23 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, file = vma->vm_file; ksm_add_vma(vma); expanded: +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + gm_ret_t ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + + if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + addr = get_unmapped_area(file, addr, len, pgoff, 0); + gmem_reserve_vma(vma, &reserve_list); + goto retry; + } else if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + error = -ENOMEM; + goto free_vma; + } + gmem_release_vma(mm, &reserve_list); + } +#endif perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -2974,6 +3226,9 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unacct_error: if (charged) vm_unacct_memory(charged); +#ifdef CONFIG_GMEM + gmem_release_vma(mm, &reserve_list); +#endif return error; } diff --git a/mm/vm_object.c b/mm/vm_object.c new file mode 100644 index 000000000000..8d3d6b121649 --- /dev/null +++ b/mm/vm_object.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Logical Mapping Management + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Weixi zhu, chao Liu + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Sine VM_OBJECT maintains the logical page table under each VMA, and each VMA + * points to a VM_OBJECT. 
Ultimately VM_OBJECTs must be maintained as long as VMA + * gets changed: merge, split, adjust + */ +static struct kmem_cache *vm_object_cachep; +static struct kmem_cache *gm_mapping_cachep; + +/* gm_mapping will not be release dynamically */ +gm_mapping_t *alloc_gm_mapping(void) +{ + gm_mapping_t *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + + if (!gm_mapping) + return NULL; + + gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + mutex_init(&gm_mapping->lock); + + return gm_mapping; +} +EXPORT_SYMBOL(alloc_gm_mapping); + +static inline void release_gm_mapping(gm_mapping_t *mapping) +{ + kmem_cache_free(gm_mapping_cachep, mapping); +} + +static inline gm_mapping_t *lookup_gm_mapping(struct vm_object *obj, unsigned long pindex) +{ + return xa_load(obj->logical_page_table, pindex); +} + +int __init vm_object_init(void) +{ + vm_object_cachep = KMEM_CACHE(vm_object, 0); + if (!vm_object_cachep) + goto out; + + gm_mapping_cachep = KMEM_CACHE(gm_mapping, 0); + if (!gm_mapping_cachep) + goto free_vm_object; + + return 0; +free_vm_object: + kmem_cache_destroy(vm_object_cachep); +out: + return -ENOMEM; +} + +/* + * Create a VM_OBJECT and attach it to a VMA + * This should be called when a VMA is created. + */ +struct vm_object *vm_object_create(struct vm_area_struct *vma) +{ + struct vm_object *obj = kmem_cache_alloc(vm_object_cachep, GFP_KERNEL); + + if (!obj) + return NULL; + + spin_lock_init(&obj->lock); + obj->vma = vma; + + /* + * The logical page table maps linear_page_index(obj->vma, va) + * to pointers of struct gm_mapping. + */ + obj->logical_page_table = kmalloc(sizeof(struct xarray), GFP_KERNEL); + if (!obj->logical_page_table) { + kmem_cache_free(vm_object_cachep, obj); + return NULL; + } + + xa_init(obj->logical_page_table); + atomic_set(&obj->nr_pages, 0); + atomic_set(&obj->ref_count, 1); + + return obj; +} + +/* This should be called when a VMA no longer refers to a VM_OBJECT */ +void vm_object_drop_locked(struct vm_area_struct *vma) +{ + struct vm_object *obj = vma->vm_obj; + + if (!obj) { + pr_err("vm_object: vm_obj of the vma is NULL\n"); + return; + } + + /* + * We must enter this with VMA write-locked, which is unfortunately a giant lock. 
+ * Note that Linux 6.0 has per-VMA lock: + * https://lwn.net/Articles/906852/ + * https://lwn.net/Articles/906833/ + */ + free_gm_mappings(vma); + mmap_assert_write_locked(vma->vm_mm); + vma->vm_obj = NULL; + + if (atomic_dec_and_test(&obj->ref_count)) { + xa_destroy(obj->logical_page_table); + kfree(obj->logical_page_table); + kmem_cache_free(vm_object_cachep, obj); + } +} + +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) +{ + unsigned long index; + gm_mapping_t *mapping; + unsigned long moved_pages = 0; + + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); + + xa_lock(dst->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, mapping, linear_page_index(src, src->vm_end)) { + index = xas.xa_index - src->vm_pgoff + dst->vm_pgoff + + ((src->vm_start - dst->vm_start) >> PAGE_SHIFT); + __xa_store(dst->vm_obj->logical_page_table, index, mapping, GFP_KERNEL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &dst->vm_obj->nr_pages); + xa_unlock(dst->vm_obj->logical_page_table); +} + +void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + /* remove logical mapping in [vma->vm_start, start) and [end, vm->vm_end) */ + unsigned long removed_pages = 0; + gm_mapping_t *mapping; + + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xas_lock(&xas); + if (vma->vm_start < start) { + xas_for_each(&xas, mapping, linear_page_index(vma, start)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + + if (vma->vm_end > end) { + xas_set(&xas, linear_page_index(vma, end)); + + xas_for_each(&xas, mapping, linear_page_index(vma, vma->vm_end)) { + xas_store(&xas, NULL); + removed_pages++; + } + } + atomic_sub(removed_pages, &vma->vm_obj->nr_pages); + xas_unlock(&xas); +} + +/* + * Given a VA, the page_index is computed by + * page_index = linear_page_index(struct vm_area_struct *vma, unsigned long address) + */ +struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) +{ + return lookup_gm_mapping(obj, linear_page_index(obj->vma, va)); +} +EXPORT_SYMBOL_GPL(vm_object_lookup); + +void vm_object_mapping_create(struct vm_object *obj, unsigned long start) +{ + pgoff_t index = linear_page_index(obj->vma, start); + gm_mapping_t *gm_mapping; + + gm_mapping = alloc_gm_mapping(); + if (!gm_mapping) + return; + + __xa_store(obj->logical_page_table, index, gm_mapping, GFP_KERNEL); +} + +void free_gm_mappings(struct vm_area_struct *vma) +{ + gm_mapping_t *gm_mapping; + XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + + xa_lock(vma->vm_obj->logical_page_table); + xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end)) { + release_gm_mapping(gm_mapping); + xas_store(&xas, NULL); + } + xa_unlock(vma->vm_obj->logical_page_table); +} -- Gitee From 204398ce50856da2db1ea734dc593728b50adfdd Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 25 Jun 2025 10:49:25 +0800 Subject: [PATCH 08/27] openeuler_defconfig: Enable gmem related configs euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Enable gmem related configs. 
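With these options enabled, a process requests peer-shared memory by passing MAP_PEER_SHARED to mmap(); the kernel rounds the length up to 2MB and backs the range with huge pages on first touch. A rough user-space sketch, assuming MAP_PEER_SHARED is exported through the uapi mman headers (its numeric value is not shown in this series, so the fallback define below is a placeholder only):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_PEER_SHARED
#define MAP_PEER_SHARED 0x8000000	/* placeholder; take the real value from the uapi headers */
#endif

int main(void)
{
	size_t len = 4UL << 20;	/* multiple of 2MB; peer-shared lengths are rounded up anyway */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_PEER_SHARED)");
		return 1;
	}

	memset(p, 0, len);	/* first touch goes through the peer-shared huge-page fault path */
	return munmap(p, len);
}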
Signed-off-by: Ma Wupeng --- arch/arm64/configs/openeuler_defconfig | 1 + arch/x86/configs/openeuler_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 83697bf28c02..456a6a491e40 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1233,6 +1233,7 @@ CONFIG_LRU_GEN=y CONFIG_ARM64_HAFT=y CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y +CONFIG_GMEM=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y # CONFIG_ASCEND_FEATURES is not set diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 4050098c0775..40db2117da87 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1204,6 +1204,7 @@ CONFIG_LRU_GEN=y # CONFIG_LRU_GEN_STATS is not set CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y +CONFIG_GMEM=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y CONFIG_PAGE_CACHE_LIMIT=y -- Gitee From a358a8f128f2c40ed3b2390ba650f5f060e75a06 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 25 Jun 2025 10:57:25 +0800 Subject: [PATCH 09/27] mm: gmem: Display VM_PEER_SHARED as ps during smaps euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Display VM_PEER_SHARED as ps during smaps. Signed-off-by: Ma Wupeng --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a691365061c..84faaddafddf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -698,6 +698,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +#ifdef CONFIG_GMEM + [ilog2(VM_PEER_SHARED)] = "ps", +#endif #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif -- Gitee From 150377da8adbc266873bc810b8a72bea1aa8e26f Mon Sep 17 00:00:00 2001 From: Ni Cunshu Date: Wed, 25 Jun 2025 16:44:12 +0800 Subject: [PATCH 10/27] drivers: remote_pager: introduce remote_pager module for gmem euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Introduce remote_pager, an extension module for GMEM, which can be used for communication between the remote and host Signed-off-by: Ni Cunshu --- arch/arm64/configs/openeuler_defconfig | 7 + arch/x86/configs/openeuler_defconfig | 7 + drivers/Kconfig | 2 + drivers/Makefile | 2 + drivers/remote_pager/Kconfig | 38 + drivers/remote_pager/Makefile | 12 + drivers/remote_pager/main.c | 32 + .../msg_chan/msg_layer/msg_layer.c | 271 +++++++ .../msg_chan/msg_layer/msg_layer.h | 48 ++ drivers/remote_pager/msg_handler.h | 132 ++++ drivers/remote_pager/msg_handler_comm.c | 142 ++++ drivers/remote_pager/msg_handler_origin.c | 474 +++++++++++++ drivers/remote_pager/msg_handler_peer.c | 667 ++++++++++++++++++ drivers/remote_pager/svm_proc_mng.c | 419 +++++++++++ drivers/remote_pager/svm_proc_mng.h | 65 ++ drivers/remote_pager/wait_station.c | 81 +++ drivers/remote_pager/wait_station.h | 30 + include/linux/remote_pager/msg_chan.h | 43 ++ 18 files changed, 2472 insertions(+) create mode 100644 drivers/remote_pager/Kconfig create mode 100644 drivers/remote_pager/Makefile create mode 100644 drivers/remote_pager/main.c create mode 100644 
drivers/remote_pager/msg_chan/msg_layer/msg_layer.c create mode 100644 drivers/remote_pager/msg_chan/msg_layer/msg_layer.h create mode 100644 drivers/remote_pager/msg_handler.h create mode 100644 drivers/remote_pager/msg_handler_comm.c create mode 100644 drivers/remote_pager/msg_handler_origin.c create mode 100644 drivers/remote_pager/msg_handler_peer.c create mode 100644 drivers/remote_pager/svm_proc_mng.c create mode 100644 drivers/remote_pager/svm_proc_mng.h create mode 100644 drivers/remote_pager/wait_station.c create mode 100644 drivers/remote_pager/wait_station.h create mode 100644 include/linux/remote_pager/msg_chan.h diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 456a6a491e40..14a0123acc30 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -7023,6 +7023,13 @@ CONFIG_CPU_INSPECTOR_ATF=m CONFIG_ROH=m CONFIG_ROH_HNS=m CONFIG_ARM_SPE_MEM_SAMPLING=y + +# +# remote pager device +# +CONFIG_REMOTE_PAGER=m +CONFIG_REMOTE_PAGER_MASTER=m +# end of remote pager device # end of Device Drivers # diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 40db2117da87..e9445d996e46 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -8210,6 +8210,13 @@ CONFIG_INTEL_TH_PTI=m # # CONFIG_CPU_INSPECT is not set # end of CPU Inspect + +# +# remote pager device +# +CONFIG_REMOTE_PAGER=m +CONFIG_REMOTE_PAGER_MASTER=m +# end of remote pager device # end of Device Drivers # diff --git a/drivers/Kconfig b/drivers/Kconfig index da6544d0c108..64acbbd060ee 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -247,6 +247,8 @@ source "drivers/hte/Kconfig" source "drivers/cdx/Kconfig" +source "drivers/remote_pager/Kconfig" + source "drivers/cpuinspect/Kconfig" source "drivers/roh/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 9af19fcf784c..b66caa9a69a4 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -201,6 +201,8 @@ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ +obj-$(CONFIG_REMOTE_PAGER) += remote_pager/ + obj-$(CONFIG_S390) += s390/ obj-$(CONFIG_ROH) += roh/ diff --git a/drivers/remote_pager/Kconfig b/drivers/remote_pager/Kconfig new file mode 100644 index 000000000000..bf0d0f58a3d4 --- /dev/null +++ b/drivers/remote_pager/Kconfig @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "remote pager device" + +config REMOTE_PAGER + tristate "remote pager" + default m + depends on GMEM + help + Module used for gmem. + This is comm part, including send and recv message function + Used for memory management + If unsure, say Y. + +config REMOTE_PAGER_MASTER + tristate "remote pager master" + default m + depends on REMOTE_PAGER + help + Module used for gmem. + This is host part, used for send and recv message from device + Used for memory management + If unsure, say Y. + +config REMOTE_PAGER_SLAVE + tristate "remote pager slave" + default n + depends on ARM64 + depends on REMOTE_PAGER + help + Module used for gmem. + This is host part, used for send and recv message from device + Used for memory management + If you want to remote_pager driver to support the peer function, + say m. + If unsure, say Y. 
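+
+# Note: REMOTE_PAGER_MASTER builds the origin-side message handlers
+# (msg_handler_origin.o), while REMOTE_PAGER_SLAVE builds the peer-side
+# handlers (msg_handler_peer.o); see the Makefile added below.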
+ +endmenu diff --git a/drivers/remote_pager/Makefile b/drivers/remote_pager/Makefile new file mode 100644 index 000000000000..cb723290af59 --- /dev/null +++ b/drivers/remote_pager/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_REMOTE_PAGER) += remote_pager.o + +remote_pager-$(CONFIG_REMOTE_PAGER) := main.o \ + wait_station.o \ + msg_handler_comm.o \ + msg_chan/msg_layer/msg_layer.o \ + svm_proc_mng.o + +remote_pager-$(CONFIG_REMOTE_PAGER_MASTER) += msg_handler_origin.o +remote_pager-$(CONFIG_REMOTE_PAGER_SLAVE) += msg_handler_peer.o \ No newline at end of file diff --git a/drivers/remote_pager/main.c b/drivers/remote_pager/main.c new file mode 100644 index 000000000000..1e4aec881b9e --- /dev/null +++ b/drivers/remote_pager/main.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (c) 2023- Huawei, Inc. + * Author: Chunsheng Luo + * Co-Author: Jun Chen + */ +#include +#include "msg_chan/msg_layer/msg_layer.h" +#include "msg_handler.h" + +static int __init remote_pager_init(void) +{ + msg_handle_init(); + return 0; +} + +static void __exit remote_pager_exit(void) +{ + /* + * If module_init() is implemented, module_exit() + * should be implemented as well. + */ +} + +module_init(remote_pager_init); +module_exit(remote_pager_exit); + +MODULE_AUTHOR("Huawei Tech. Co., Ltd."); +MODULE_DESCRIPTION("Remote-pager"); +MODULE_ALIAS("Remote-pager"); diff --git a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c new file mode 100644 index 000000000000..eceff696c77f --- /dev/null +++ b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (c) 2023- Huawei, Inc. 
+ * Author: Chunsheng Luo + * Co-Author: Jiangtian Feng, Jun Chen + */ +#include +#include +#include +#include + +#include "msg_layer.h" + +#define MAX_NUM_NODES 16 +#define MSG_SLEEP_MIN 2 +#define MSG_SLEEP_MAX 3 + +/* Per-node handle */ +struct sock_handle { + int nid; + int status; + int chan_id; + struct task_struct *recv_handler; +}; + +static struct sock_handle sock_handles[MAX_NUM_NODES]; +static struct phys_channel_ops *g_phys_chan_ops; + +int msg_send(int chan_id, void *msg_data, size_t msg_len) +{ + int ret = 0; + + if (!g_phys_chan_ops) + return -ENOENT; + + ret = g_phys_chan_ops->copy_to(chan_id, msg_data, msg_len, 1); + ret |= g_phys_chan_ops->notify(chan_id); + + return ret; +} + +static inline int build_msg(int type, int from_nid, int to_nid, void *msg_data, size_t msg_len) +{ + struct rpg_kmsg_message *msg = (struct rpg_kmsg_message *)msg_data; + + msg->header.type = type; + msg->header.prio = RPG_KMSG_PRIO_NORMAL; + msg->header.size = msg_len; + msg->header.from_nid = from_nid; + msg->header.to_nid = to_nid; + + return 0; +} + +int msg_send_nid(int type, int from_nid, int to_nid, void *msg_data, size_t msg_len) +{ + struct sock_handle *sh = sock_handles + to_nid; + + build_msg(type, from_nid, to_nid, msg_data, msg_len); + + return msg_send(sh->chan_id, msg_data, msg_len); +} +EXPORT_SYMBOL(msg_send_nid); + +int msg_recv(int chan_id, void *buf, size_t len) +{ + if (!g_phys_chan_ops) + return -ENOENT; + + return g_phys_chan_ops->copy_from(chan_id, buf, len, 1); +} + +extern int handle_remote_pager_work(void *msg); +static int recv_handler(void *arg) +{ + struct sock_handle *sh = arg; + + log_info("RECV handler for %d is ready ha %ld\n", sh->nid, sizeof(struct rpg_kmsg_hdr)); + + while (!kthread_should_stop()) { + size_t len; + int ret; + size_t offset; + struct rpg_kmsg_hdr header; + char *data = NULL; + size_t msg_len = 0; + + /* compose header */ + offset = 0; + len = sizeof(header); + while (len > 0) { + ret = msg_recv(sh->chan_id, (char *)(&header) + offset, len); + if (ret == -ENOENT) { + pr_err("no msg chan failed\n"); + usleep_range(MSG_SLEEP_MIN, MSG_SLEEP_MAX); + break; + } + + if ((ret == -1) || kthread_should_stop()) + return 0; + + offset += ret; + len -= ret; + } + + if (ret < 0) + break; + + msg_len = header.size; + if (!msg_len) { + pr_err("msg_len is zero failed? 
from_nid %d prio:%d type:%d size:%ld\n", + header.from_nid, header.prio, header.type, header.size); + continue; + } + + /* compose body */ + data = kmalloc(msg_len, GFP_KERNEL); + if WARN_ON_ONCE(!data && "Unable to alloc a message") + return -1; + memcpy(data, &header, sizeof(header)); + + offset = sizeof(header); + len = msg_len - offset; + + while (len > 0) { + ret = msg_recv(sh->chan_id, data + offset, len); + if (ret == -1 || kthread_should_stop()) + return 0; + + offset += ret; + len -= ret; + } + + if (ret < 0) + break; + + /* Call pcn_kmsg upper layer */ + handle_remote_pager_work(data); + } + + return 0; +} + +int msg_open(int nid) +{ + int chan_id = 0; + struct sock_handle *sh = sock_handles + nid; + struct task_struct *tsk_recv; + + if (sh->status == MSG_CHAN_ENABLE) { + pr_err("node:%d msg chan is enabled\n", nid); + return 0; + } + + if (!g_phys_chan_ops) + return -ENOENT; + + chan_id = g_phys_chan_ops->open(nid); + if (chan_id < 0) { + log_err("open msg channel failed %d\n", chan_id); + return chan_id; + } + + tsk_recv = kthread_run(recv_handler, sock_handles + nid, "remote-pager-recv"); + if (IS_ERR(tsk_recv)) { + log_err("Cannot create %s handler, %ld\n", "remote-pager-recv", PTR_ERR(tsk_recv)); + return PTR_ERR(tsk_recv); + } + + sh->chan_id = chan_id; + sh->status = MSG_CHAN_ENABLE; + sh->nid = nid; + sh->recv_handler = tsk_recv; + + pr_err("%s chanid %d\n", __func__, chan_id); + + return chan_id; +} +EXPORT_SYMBOL(msg_open); + +int msg_close(int nid) +{ + struct sock_handle *sh = sock_handles + nid; + + /* TODO: Get sock_handle, then set sock_handle disable and destroy recv task */ + if (sh->status != MSG_CHAN_ENABLE) { + pr_err("node:%d msg chan is disabled\n", nid); + return 0; + } + + if (sh->recv_handler) { + kthread_stop(sh->recv_handler); + sh->recv_handler = NULL; + } + + if (g_phys_chan_ops) + g_phys_chan_ops->close(sh->chan_id); + + sh->chan_id = 0; + sh->status = MSG_CHAN_DISABLE; + + return 0; +} +EXPORT_SYMBOL(msg_close); + +int handle_migrate_page(void *peer_addr, struct page *local_page, size_t size, int dir) +{ + if (!g_phys_chan_ops) + return -ENOENT; + + return g_phys_chan_ops->migrate_page(peer_addr, local_page, size, dir); +} +EXPORT_SYMBOL(handle_migrate_page); + +static DEFINE_SPINLOCK(install_lock); +static int default_msg_chan_id; +int msg_layer_install_phy_ops(struct phys_channel_ops *ops, int default_chan_id) +{ + int ret = 0; + + if (!ops) { + pr_err("install NULL as msg channel\n"); + return -EINVAL; + } + + spin_lock(&install_lock); + if (g_phys_chan_ops) { + ret = -EEXIST; + pr_err("phy_ops areadly be installed\n"); + goto unlock; + } + + /* must before msg_open */ + g_phys_chan_ops = ops; + if (default_chan_id >= 0) { + ret = msg_open(default_chan_id); + if (ret) { + pr_err("can not open msg channel %d\n", default_chan_id); + g_phys_chan_ops = NULL; + goto unlock; + } + } + + default_msg_chan_id = default_chan_id; + +unlock: + spin_unlock(&install_lock); + return ret; +} +EXPORT_SYMBOL(msg_layer_install_phy_ops); + +int msg_layer_uninstall_phy_ops(struct phys_channel_ops *ops) +{ + if (!ops || ops != g_phys_chan_ops) { + pr_err("Invalid phy_ops\n"); + return -EINVAL; + } + + spin_lock(&install_lock); + if (default_msg_chan_id >= 0) + msg_close(default_msg_chan_id); + + g_phys_chan_ops = NULL; + default_msg_chan_id = -1; + spin_unlock(&install_lock); + + return 0; +} +EXPORT_SYMBOL(msg_layer_uninstall_phy_ops); diff --git a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h new file mode 
100644 index 000000000000..1217dafeaf52 --- /dev/null +++ b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Generalized Memory Management. + * + * Copyright (c) 2023- Huawei, Inc. + * Author: Chunsheng Luo + * Co-Author: Jiangtian Feng + */ +#ifndef __MSG_LAYER_H__ +#define __MSG_LAYER_H__ + +#include +#include + +#define RPG_KMSG_MAX_SIZE (64UL << 10) +#define RPG_KMSG_MAX_PAYLOAD_SIZE \ + (RPG_KMSG_MAX_SIZE - sizeof(struct rpg_kmsg_hdr)) + +/* Enumerate message priority. XXX Priority is not supported yet. */ +enum rpg_kmsg_prio { + RPG_KMSG_PRIO_LOW, + RPG_KMSG_PRIO_NORMAL, + RPG_KMSG_PRIO_HIGH, +}; + +#define MSG_CHAN_DISABLE 0 +#define MSG_CHAN_ENABLE 1 + +struct rpg_kmsg_hdr { + int from_nid :6; + int to_nid :6; + enum rpg_kmsg_prio prio :2; + int type :8; + size_t size; +} __packed; + +struct rpg_kmsg_message { + struct rpg_kmsg_hdr header; + unsigned char data[RPG_KMSG_MAX_PAYLOAD_SIZE]; +} __packed; + +int msg_send_nid(int type, int from_nid, int to_nid, void *msg_data, size_t msg_len); +int msg_send(int chan_id, void *msg_data, size_t msg_len); +int msg_recv(int chan_id, void *buf, size_t len); +int msg_open(int nid); +int msg_close(int nid); +int handle_migrate_page(void *peer_addr, struct page *local_page, size_t size, int dir); + diff --git a/drivers/remote_pager/msg_handler.h b/drivers/remote_pager/msg_handler.h new file mode 100644 index 000000000000..cb08fe765c77 --- /dev/null +++ b/drivers/remote_pager/msg_handler.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generalized Memory Management. + * + * Copyright (c) 2023- Huawei, Inc. + * Author: Liming Huang + * Co-Author: Jun Chen + * + */ +#ifndef _REMOTE_PAGER_MSG_HANDLER_H_ +#define _REMOTE_PAGER_MSG_HANDLER_H_ + +#include +#include + +#ifdef WITH_GMEM +#include +#endif + +#include "wait_station.h" +#include "msg_chan/msg_layer/msg_layer.h" + +#define PXD_JUDGE(pxd) (((pxd) == NULL) || (pxd##_none(*(pxd##_t *)(pxd)) != 0) || \ + (pxd##_bad(*(pxd##_t *)(pxd)) != 0)) +#define PMD_JUDGE(pmd) (((pmd) == NULL) || (pmd_none(*(pmd_t *)(pmd)) != 0) || \ + (pmd_bad(*(pmd_t *)(pmd)) != 0)) + +#define GMEM_COPY_PAGE 1 + +/* Function pointer to callback function */ +typedef int (*rpg_kmsg_cbftn)(struct rpg_kmsg_message *); + +enum rpg_kmsg_type { + /* TASK CMD */ + GMEM_TASK_PAIRING_REQUEST, + GMEM_TASK_EXIT_ORIGIN, + GMEM_TASK_EXIT_REMOTE, + + /* VMA CMD */ + GMEM_ALLOC_VMA_REQUEST, + GMEM_FREE_VMA_REQUEST, + + /* PAGE CMD */ + GMEM_ALLOC_PAGE_REQUEST, + GMEM_FREE_PAGE_REQUEST, + GMEM_PAGE_FAULT_REQUEST, + GMEM_EVICT_PAGE_REQUEST, + + /* ADVISE CMD */ + GMEM_HMADVISE_REQUEST, + GMEM_HMEMCPY_REQUEST, + + GMEM_COMMON_RESPONSE, + GMEM_MSG_MAX_ID, +}; + +enum msg_location { + MSG_ON_ORIGIN, + MSG_ON_REMOTE, +}; + +struct rpg_kmsg_work { + struct work_struct work; + void *msg; +}; + +struct msg_handler_st { + rpg_kmsg_cbftn fnt; +}; + +struct comm_msg_rsp { + struct rpg_kmsg_hdr header; + int peer_ws; + int ret; +}; + +struct gm_pair_msg_rq { + struct rpg_kmsg_hdr header; + unsigned int my_ws; + unsigned int my_pid; + unsigned int peer_nid; + unsigned int peer_pid; +}; + +struct gm_pager_msg_rq { + struct rpg_kmsg_hdr header; + unsigned int my_ws; + unsigned int peer_pid; + unsigned long va; + unsigned long dma_addr; + unsigned long size; + unsigned long prot; + unsigned long flags; + int behavior; +}; + +struct gm_evict_page_msg_rq { + struct rpg_kmsg_hdr header; + unsigned int peer_pid; + unsigned int ws; + unsigned long va; + unsigned long 
size; +}; + + +int gmem_register_pair_remote_task(int origin_nid, int origin_pid, int remote_nid, int remote_pid); + +#ifdef WITH_GMEM +gm_dev_t *gmem_id_to_device(unsigned int id); +#endif + + +/* msg handler */ +int gmem_handle_task_pairing(struct rpg_kmsg_message *msg); +int gmem_handle_comm_msg_rsp(struct rpg_kmsg_message *msg); +int gmem_handle_alloc_vma_fixed(struct rpg_kmsg_message *msg); +int gmem_handle_free_vma(struct rpg_kmsg_message *msg); + +int gmem_handle_alloc_page(struct rpg_kmsg_message *msg); +int gmem_handle_free_page(struct rpg_kmsg_message *msg); +int gmem_handle_hmadvise(struct rpg_kmsg_message *msg); +int gmem_handle_hmemcpy(struct rpg_kmsg_message *msg); +int gmem_handle_dev_fault(struct rpg_kmsg_message *msg); +int gmem_handle_evict_page(struct rpg_kmsg_message *msg); + +int gmem_add_to_svm_proc(int my_nid, int my_pid, int peer_nid, int peer_pid); +int gmem_send_comm_msg_reply(unsigned int from_nid, unsigned int to_nid, + unsigned int peer_ws, int ret); + +int handle_remote_pager_work(void *msg); +int msg_handle_init(void); + diff --git a/drivers/remote_pager/msg_handler_comm.c b/drivers/remote_pager/msg_handler_comm.c new file mode 100644 index 000000000000..f195ad15e646 --- /dev/null +++ b/drivers/remote_pager/msg_handler_comm.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Chushu Ni + * Co-Author: Chunsheng Luo + */ +#include +#include +#include +#include + +#include "msg_handler.h" +#include "svm_proc_mng.h" + +static struct workqueue_struct *remote_pager_wq; + +struct msg_handler_st rpg_kmsg_cbftns[GMEM_MSG_MAX_ID] = { +#if IS_ENABLED(CONFIG_REMOTE_PAGER_SLAVE) + /* HOST TO REMOTE */ + [GMEM_TASK_PAIRING_REQUEST] = { + gmem_handle_task_pairing + }, + [GMEM_ALLOC_VMA_REQUEST] = { + gmem_handle_alloc_vma_fixed + }, + [GMEM_FREE_VMA_REQUEST] = { + gmem_handle_free_vma + }, + [GMEM_ALLOC_PAGE_REQUEST] = { + gmem_handle_alloc_page + }, + [GMEM_FREE_PAGE_REQUEST] = { + gmem_handle_free_page + }, + [GMEM_HMADVISE_REQUEST] = { + gmem_handle_hmadvise + }, + [GMEM_HMEMCPY_REQUEST] = { + gmem_handle_hmemcpy + }, +#endif + +#if IS_ENABLED(CONFIG_REMOTE_PAGER_MASTER) + /* REMOTE TO HOST */ + [GMEM_PAGE_FAULT_REQUEST] = { + gmem_handle_dev_fault + }, + [GMEM_EVICT_PAGE_REQUEST] = { + gmem_handle_evict_page + }, +#endif + + /* BOTH */ + [GMEM_COMMON_RESPONSE] = { + gmem_handle_comm_msg_rsp + }, +}; + +int gmem_handle_comm_msg_rsp(struct rpg_kmsg_message *msg) +{ + struct comm_msg_rsp *rsp = (struct comm_msg_rsp *)msg; + struct wait_station *my_ws = wait_station(rsp->peer_ws); + + my_ws->private = rsp; + /* must first set my_ws */ + smp_rmb(); + complete(&my_ws->pendings); + + return 0; +} + +int gmem_send_comm_msg_reply(unsigned int from_nid, unsigned int to_nid, + unsigned int peer_ws, int reply) +{ + struct comm_msg_rsp rsp; + int ret = reply; + + rsp.ret = reply; + rsp.peer_ws = peer_ws; + ret = msg_send_nid(GMEM_COMMON_RESPONSE, from_nid, + to_nid, &rsp, sizeof(struct comm_msg_rsp)); + + return ret; +} + +int gmem_add_to_svm_proc(int my_nid, int my_pid, int peer_nid, int peer_pid) +{ + struct svm_proc *peer_proc; + + peer_proc = alloc_svm_proc(my_nid, my_pid, peer_nid, peer_pid); + if (!peer_proc) + return -1; + + return 0; +} + +void process_remote_pager_work(struct work_struct *work) +{ + struct rpg_kmsg_work *w = container_of(work, struct rpg_kmsg_work, work); + struct rpg_kmsg_message *msg = w->msg; + rpg_kmsg_cbftn ftn; + + ftn = 
rpg_kmsg_cbftns[msg->header.type].fnt; + if (ftn != NULL) + ftn(msg); + else + pr_err("No callback registered for %d\n", msg->header.type); + kfree(w); +} + +int handle_remote_pager_work(void *msg) +{ + struct rpg_kmsg_work *w = kmalloc(sizeof(*w), GFP_ATOMIC); + + w->msg = msg; + + INIT_WORK(&w->work, process_remote_pager_work); + /* should firstly initialize w */ + smp_wmb(); + queue_work(remote_pager_wq, &w->work); + + return 0; +} + +int msg_handle_init(void) +{ + unsigned int flags = __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE; + + remote_pager_wq = alloc_workqueue("remote_wq", flags, 0); + if (!remote_pager_wq) { + pr_err("%s alloc workqueue failed %lx\n", __func__, (unsigned long)remote_pager_wq); + return -1; + } + + pr_err("%s alloc workqueue%lx\n", __func__, (unsigned long)remote_pager_wq); +#ifndef WITH_GMEM + msg_open(0); +#endif + return 0; +} diff --git a/drivers/remote_pager/msg_handler_origin.c b/drivers/remote_pager/msg_handler_origin.c new file mode 100644 index 000000000000..8c861b6d2eb0 --- /dev/null +++ b/drivers/remote_pager/msg_handler_origin.c @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (c) 2023- Huawei, Inc. + * Author: Bin Wang + * Co-Author: Chunsheng Luo, Cunshu Ni + * + */ +#include +#include +#include +#include +#include +#include + +#include "msg_handler.h" +#include "wait_station.h" +#include "svm_proc_mng.h" + +#define NPU_PAGE_SIZE PAGE_SIZE +#define MAX_NR_NPU 8 +#define GMEM_DEBUG 0 + +static gm_dev_t *gm_devs[MAX_NR_NPU]; + +gm_dev_t *gmem_id_to_device(unsigned int id) +{ + if (id >= MAX_NR_NPU) { + pr_err("device id is invalid. (dev_id = %u)\n", id); + return NULL; + } + + return gm_devs[id]; +} + +int gmem_register_pair_remote_task(int origin_nid, int origin_pid, int remote_nid, int remote_pid) +{ + struct gm_pair_msg_rq req; + struct comm_msg_rsp *rsp; + int ret = 0; + struct wait_station *ws; + + /* open msg chan */ + pr_err("%s origin_nid %d, origin_pid %d, remote_nid %d, remote_pid %d\n", __func__, + origin_nid, origin_pid, remote_nid, remote_pid); + ret = msg_open(remote_nid); + if (ret < 0) { + pr_err("%s open msg chan failed\n", __func__); + return ret; + } + + /* start pairing */ + ws = get_wait_station(); + req.my_pid = origin_pid; + req.my_ws = ws->id; + req.peer_nid = remote_nid; + req.peer_pid = remote_pid; + + ret = msg_send_nid(GMEM_TASK_PAIRING_REQUEST, origin_nid, + remote_nid, &req, sizeof(struct gm_pair_msg_rq)); + rsp = wait_at_station(ws); + if ((long)rsp != -ETIMEDOUT) { + ret = rsp->ret; + kfree(rsp); + gmem_add_to_svm_proc(origin_nid, origin_pid, remote_nid, remote_pid); + } + + return ret; +} +EXPORT_SYMBOL(gmem_register_pair_remote_task); + +int gmem_handle_dev_fault(struct rpg_kmsg_message *msg) +{ + int ret; + struct gm_pager_msg_rq *recv = (struct gm_pager_msg_rq *)msg; + unsigned int my_pid = recv->peer_pid; + unsigned int nid = recv->header.to_nid; + unsigned int peer_nid = recv->header.from_nid; + unsigned int peer_ws = recv->my_ws; + gm_dev_t *dev = gm_devs[peer_nid]; + struct task_struct *tsk; + struct mm_struct *mm; + + tsk = find_get_task_by_vpid(my_pid); + if (!tsk) { + pr_err("svm process does not have task_struct\n"); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + + mm = get_task_mm(tsk); + if (!mm) { + pr_err("no mm\n"); + ret = GM_RET_FAILURE_UNKNOWN; + goto put_task; + } + + if (!dev) { + pr_info("gmem: device get failed, dev_id %ld\n", (unsigned long)peer_nid); + ret = -ENODEV; + goto put_mm; + } + + ret = gm_dev_fault(mm, 
recv->va, dev, 0); + if (ret != GM_RET_SUCCESS && ret != GM_RET_PAGE_EXIST) { + pr_info("gmem dev fault failed\n"); + ret = -EFAULT; + goto put_mm; + } + +put_mm: + mmput(mm); +put_task: + put_task_struct(tsk); +out: + gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); + kfree(msg); + return ret; +} + +gm_ret_t gmem_map(struct gm_fault_t *gmf) +{ + int ret = 0; + struct wait_station *ws; + struct comm_msg_rsp *rsp; + struct mm_struct *mm = gmf->mm; + struct svm_proc *proc = search_svm_proc_by_mm(mm); + struct gm_pager_msg_rq req = { + .peer_pid = proc->peer_pid, + .va = gmf->va, + .size = gmf->size, + .behavior = gmf->behavior + }; + + if (!proc) { + pr_err("can not find proc\n"); + return -EBUSY; + } + + ws = get_wait_station(); + req.my_ws = ws->id; + + if (gmf->copy) { + req.flags |= GMEM_COPY_PAGE; + req.dma_addr = gmf->dma_addr; + } + + ret = msg_send_nid(GMEM_ALLOC_PAGE_REQUEST, proc->nid, proc->peer_nid, + &req, sizeof(struct gm_pager_msg_rq)); + rsp = wait_at_station(ws); + if ((long)rsp == -ETIMEDOUT) + return -EBUSY; + ret |= rsp->ret; + kfree(rsp); + if (ret) { + if (ret == GM_RET_MIGRATING) + pr_info("gmem: race with migrating\n"); + else + pr_info("send alloc page message failed %d\n", ret); + return ret; + } + + return GM_RET_SUCCESS; +} + +gm_ret_t gmem_unmap(struct gm_fault_t *gmf) +{ + int ret; + struct wait_station *ws; + struct comm_msg_rsp *rsp; + struct mm_struct *mm = gmf->mm; + struct svm_proc *proc = search_svm_proc_by_mm(mm); + struct gm_pager_msg_rq req = { + .peer_pid = proc->peer_pid, + .va = gmf->va, + .size = gmf->size, + }; + + if (!proc) { + pr_err("can not find proc\n"); + return -EBUSY; + } + + if (gmf->copy) { + req.flags |= GMEM_COPY_PAGE; + req.dma_addr = gmf->dma_addr; + } + + ws = get_wait_station(); + req.my_ws = ws->id; + + ret = msg_send_nid(GMEM_FREE_PAGE_REQUEST, proc->nid, proc->peer_nid, + &req, sizeof(struct gm_pager_msg_rq)); + rsp = wait_at_station(ws); + if ((long)rsp == -ETIMEDOUT) + return -EBUSY; + ret |= rsp->ret; + kfree(rsp); + if (ret) { + pr_info("send free page message failed %d\n", ret); + return GM_RET_FAILURE_UNKNOWN; + } + + return GM_RET_SUCCESS; +} + +gm_ret_t gmem_alloc(struct mm_struct *mm, unsigned long va, unsigned long size, + unsigned long prot) +{ + int ret = 0; + struct wait_station *ws; + struct comm_msg_rsp *rsp; + struct svm_proc *proc = search_svm_proc_by_mm(mm); + struct gm_pager_msg_rq req = { + .peer_pid = proc->peer_pid, + .va = va, + .size = size, + .prot = prot, + }; + + if (!proc) { + pr_err("can not find proc\n"); + return -EBUSY; + } + + ws = get_wait_station(); + req.my_ws = ws->id; + ret = msg_send_nid(GMEM_ALLOC_VMA_REQUEST, proc->nid, proc->peer_nid, + &req, sizeof(struct gm_pager_msg_rq)); + rsp = wait_at_station(ws); + if ((long)rsp == -ETIMEDOUT) + return -EBUSY; + ret |= rsp->ret; + kfree(rsp); + if (ret) { + pr_info("send alloc vma message failed %d\n", ret); + return GM_RET_NOMEM; + } + + return GM_RET_SUCCESS; +} + +gm_ret_t gmem_free(struct mm_struct *mm, unsigned long va, unsigned long size) +{ + int ret = 0; + struct wait_station *ws; + struct comm_msg_rsp *rsp; + struct svm_proc *proc = search_svm_proc_by_mm(mm); + struct gm_pager_msg_rq req = { + .peer_pid = proc->peer_pid, + .va = va, + .size = size, + }; + + if (!proc) { + pr_err("can not find proc\n"); + return -EBUSY; + } + + ws = get_wait_station(); + req.my_ws = ws->id; + ret = msg_send_nid(GMEM_FREE_VMA_REQUEST, proc->nid, proc->peer_nid, + &req, sizeof(struct gm_pager_msg_rq)); + rsp = wait_at_station(ws); + if ((long)rsp == 
-ETIMEDOUT) + return -EBUSY; + ret |= rsp->ret; + kfree(rsp); + if (ret) { + pr_info("send free vma message failed %d\n", ret); + return GM_RET_FAILURE_UNKNOWN; + } + + return GM_RET_SUCCESS; +} + +int gmem_handle_evict_page(struct rpg_kmsg_message *msg) +{ + struct gm_evict_page_msg_rq *recv = (struct gm_evict_page_msg_rq *)msg; + unsigned int nid = recv->header.to_nid; + unsigned int peer_nid = recv->header.from_nid; + unsigned int peer_ws = recv->ws; + unsigned int pid = recv->peer_pid; + unsigned long size = recv->size; + unsigned long addr = recv->va; + struct vm_area_struct *vma; + struct page *page; + dma_addr_t dma_addr; + gm_mapping_t *gm_mapping; + struct device *dma_dev; + struct gm_fault_t gmf; + struct svm_proc *proc; + struct task_struct *tsk; + struct mm_struct *mm; + int ret; + struct folio *folio = NULL; + + proc = search_svm_proc_by_pid(pid); + if (!proc) { + pr_err("can not find svm_proc of task-%d\n", pid); + ret = -EINVAL; + goto response; + } + + tsk = find_get_task_by_vpid(pid); + if (!tsk) { + pr_err("can not find task of task-%d\n", pid); + ret = -EINVAL; + goto response; + } + + mm = get_task_mm(tsk); + if (!mm) { + pr_err("task-%d exited\n", pid); + ret = -EINTR; + goto put_task; + } + + if (mm != proc->mm) { + pr_err("miss match\n"); + ret = -EINTR; + goto put_mm; + } + + gmf.mm = mm; + gmf.va = addr; + gmf.size = size; + gmf.copy = GMEM_COPY_PAGE; + + vma = find_vma(mm, addr); + if (!vma || !vma->vm_obj) { + pr_err("evict addr %lx vma %lx vm_obj %lx, no vma or vm_obj\n", addr, + (unsigned long)vma, vma ? (unsigned long)vma->vm_obj : 0); + ret = -EINVAL; + goto put_mm; + } + + gm_mapping = vm_object_lookup(vma->vm_obj, addr); + if (!gm_mapping) { + pr_err("evictim gm_page is NULL\n"); + ret = -EINVAL; + goto put_mm; + } + + mutex_lock(&gm_mapping->lock); + if (gm_mapping_willneed(gm_mapping)) { + pr_info("gmem: racing with prefetch or willneed so cancel evict\n"); + gm_mapping_flags_clear(gm_mapping, GM_PAGE_WILLNEED); + ret = -EINVAL; + goto unlock; + } + + if (!gm_mapping_device(gm_mapping)) { + pr_info("gmem: page is not in device\n"); + ret = -EINVAL; + goto unlock; + } + + if (size == HPAGE_PMD_SIZE) { + folio = vma_alloc_folio(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, addr, true); + page = &folio->page; + } else { + page = alloc_page(GFP_KERNEL); + } + + if (!page) { + pr_err("gmem: gmem_evict_page alloc hugepage failed\n"); + ret = -ENOMEM; + goto unlock; + } + + dma_dev = gm_mapping->dev->dma_dev; + dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); + gmf.dev = gm_mapping->dev; + gmf.dma_addr = dma_addr; + + ret = gmem_unmap(&gmf); + dma_unmap_page(dma_dev, dma_addr, size, DMA_BIDIRECTIONAL); + if (ret) { + pr_err("gmem_unmap failed, ret %d\n", ret); + put_page(page); + goto unlock; + } + + gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); + gm_mapping->page = page; + +unlock: + mutex_unlock(&gm_mapping->lock); +put_mm: + mmput(mm); +put_task: + put_task_struct(tsk); +response: + gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); + kfree(msg); + return ret; +} + +gm_ret_t gmem_create(gm_dev_t *dev, void **pmap) +{ + return GM_RET_SUCCESS; +} + +gm_mmu_t gm_mmu = { + .peer_va_alloc_fixed = gmem_alloc, + .pmap_create = gmem_create, + .peer_va_free = gmem_free, + .peer_map = gmem_map, + .peer_unmap = gmem_unmap, +}; + +#define ASCEND910_HBM_START 0x0000000800000000 +#define ASCEND910_HBM_END 0x0000000fffffffff + +gm_ret_t mmu_dev_create(struct device *dev, int devid) +{ + gm_ret_t ret; + + ret = gm_dev_create(&gm_mmu, NULL, 
GM_DEV_CAP_REPLAYABLE | GM_DEV_CAP_PEER, &dev->gm_dev); + if (ret != GM_RET_SUCCESS) { + pr_err("NPU gmem device create failed\n"); + return ret; + } + + ret = gm_dev_register_physmem(dev->gm_dev, ASCEND910_HBM_START, ASCEND910_HBM_END); + if (ret != GM_RET_SUCCESS) { + pr_err("NPU gmem device register physical memory failed\n"); + goto free_gm_dev; + } + + dev->gm_dev->dma_dev = dev; + gm_devs[devid] = dev->gm_dev; + + pr_info("Create NPU gmem device and register HBM\n"); + return ret; +free_gm_dev: + gm_dev_destroy(dev->gm_dev); + dev->gm_dev = NULL; + return ret; +} +EXPORT_SYMBOL(mmu_dev_create); + +gm_ret_t mmu_as_attach(struct device *dev) +{ + gm_ret_t ret; + gm_dev_t *gm_dev = dev->gm_dev; + gm_context_t *gm_ctx; + + if (!gm_dev) { + pr_err("NPU device gm_dev is NULL\n"); + return GM_RET_FAILURE_UNKNOWN; + } + + if (!current->mm->gm_as) { + ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, NPU_PAGE_SIZE, + ¤t->mm->gm_as); + if (ret != GM_RET_SUCCESS) { + pr_err("Process %d create gm_as failed\n", current->pid); + return ret; + } + } + + ret = gm_as_attach(current->mm->gm_as, gm_dev, 0, 1, &gm_ctx); + if (ret != GM_RET_SUCCESS) { + pr_err("gm_dev attach to process %d failed\n", current->pid); + return ret; + } + + pr_info("Attach gm_dev to process %d\n", current->pid); + return ret; +} +EXPORT_SYMBOL(mmu_as_attach); diff --git a/drivers/remote_pager/msg_handler_peer.c b/drivers/remote_pager/msg_handler_peer.c new file mode 100644 index 000000000000..68c16f6b3ff9 --- /dev/null +++ b/drivers/remote_pager/msg_handler_peer.c @@ -0,0 +1,667 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. + * Author: Chunsheng Luo + * Co-Author: Weixi Zhu, Jun Chen, Jiangtian Feng + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "msg_handler.h" +#include "svm_proc_mng.h" + +#define MAX_RETRY_TIME 10 + +static inline vm_fault_t get_page_size(enum page_entry_size pe_size, + unsigned int *page_size, + unsigned long *addr) +{ + switch (pe_size) { + case PE_SIZE_PTE: + *page_size = PAGE_SIZE; + break; + case PE_SIZE_PMD: + *page_size = HPAGE_SIZE; + *addr = round_down(*addr, HPAGE_SIZE); + break; + default: + return VM_FAULT_FALLBACK; + } + return 0; +} + +static inline bool addr_is_mapped(unsigned long addr, pmd_t *pmd, + enum page_entry_size pe_size) +{ + pte_t *pte; + bool ret; + + if (pe_size == PE_SIZE_PMD) + return !pmd_none(*pmd); + if (pmd_none(*pmd)) + return false; + pte = pte_offset_map(pmd, addr); + ret = !pte_none(*pte); + pte_unmap(pte); + return ret; +} + +static vm_fault_t __gmem_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + vm_fault_t ret = VM_FAULT_SIGBUS; + int msg_ret = GM_RET_FAILURE_UNKNOWN; + unsigned long addr = vmf->address; + unsigned int page_size; + struct gm_pager_msg_rq req = { 0 }; + struct comm_msg_rsp *rsp; + struct wait_station *ws; + struct page_info *page_info; + struct mm_struct *mm; + struct svm_proc *proc; + + ret = get_page_size(pe_size, &page_size, &addr); + if (ret) + return ret; + + mm = vmf->vma->vm_mm; + proc = search_svm_proc_by_mm(mm); + if (!proc) { + pr_err("%s: failed to get svm proc\n", __func__); + return VM_FAULT_SIGBUS; + } + + page_info = get_page_info(&proc->pager, addr, page_size, page_size); + if (!page_info) { + pr_err("%s: failed to get page_info\n", __func__); + return VM_FAULT_SIGBUS; + } + 
mutex_lock(&page_info->lock); + + if (addr_is_mapped(addr, vmf->pmd, pe_size)) + goto unlock; + + req.va = addr; + req.size = page_size; + + /* start fault */ + ws = get_wait_station(); + req.my_ws = ws->id; + req.peer_pid = proc->peer_pid; + + ret = msg_send_nid(GMEM_PAGE_FAULT_REQUEST, proc->nid, proc->peer_nid, + &req, sizeof(req)); + rsp = wait_at_station(ws); + if ((long)rsp != -ETIMEDOUT) { + msg_ret = rsp->ret; + kfree(rsp); + } + if (msg_ret == GM_RET_PAGE_EXIST) { + pr_warn("gmem: weird page exist\n"); + } else if (msg_ret != GM_RET_SUCCESS) { + ret = VM_FAULT_SIGBUS; + goto unlock; + } + + ret = VM_FAULT_NOPAGE; + +unlock: + mutex_unlock(&page_info->lock); + return ret; +} + +static vm_fault_t gmem_fault(struct vm_fault *vmf) +{ + return __gmem_fault(vmf, PE_SIZE_PTE); +} + +static vm_fault_t gmem_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + int ret = 0; + + ret = __gmem_fault(vmf, pe_size); + + return ret; +} + +static const struct vm_operations_struct gmem_vma_ops = { + .fault = gmem_fault, + .huge_fault = gmem_huge_fault, +}; + +int gmem_handle_task_pairing(struct rpg_kmsg_message *msg) +{ + struct gm_pair_msg_rq *recv = (struct gm_pair_msg_rq *)msg; + unsigned int peer_nid = recv->header.from_nid; + unsigned int peer_pid = recv->my_pid; + unsigned int peer_ws = recv->my_ws; + unsigned int my_nid = recv->peer_nid; + unsigned int my_pid = recv->peer_pid; + int ret = 0; + + gmem_add_to_svm_proc(my_nid, my_pid, peer_nid, peer_pid); + gmem_send_comm_msg_reply(my_nid, peer_nid, peer_ws, ret); + kfree(msg); + return 0; +} + +int vma_is_gmem(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_PEER_SHARED) != 0; +} + +int gmem_handle_alloc_vma_fixed(struct rpg_kmsg_message *msg) +{ + struct gm_pager_msg_rq *data = (struct gm_pager_msg_rq *)msg; + unsigned long va = data->va; + unsigned long size = data->size; + unsigned long gmem_prot = data->prot; + unsigned int my_pid = data->peer_pid; + unsigned int peer_nid = data->header.from_nid; + unsigned int nid = data->header.to_nid; + unsigned int peer_ws = data->my_ws; + unsigned long prot = 0; + unsigned long populate; + struct task_struct *tsk; + struct mm_struct *mm; + unsigned long addr; + struct vm_area_struct *vma; + int ret = GM_RET_SUCCESS; + + if (gmem_prot & VM_READ) + prot |= PROT_READ; + if (gmem_prot & VM_WRITE) + prot |= PROT_WRITE; + if (gmem_prot & VM_EXEC) + prot |= PROT_EXEC; + + tsk = find_get_task_by_vpid(my_pid); + if (!tsk) { + pr_err("svm process does not have task_struct\n"); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + + mm = get_task_mm(tsk); + if (!mm) { + pr_err("no mm\n"); + ret = -1; + goto put_task; + } + + mmap_write_lock(mm); + current->mm = mm; + addr = __do_mmap_mm(mm, NULL, va, size, prot, + MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, 0, + 0, &populate, NULL); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto unlock; + } + + vma = find_vma(mm, addr); + if (!vma) { + ret = GM_RET_FAILURE_UNKNOWN; + goto unlock; + } + + vma->vm_ops = &gmem_vma_ops; + vm_flags_set(vma, VM_HUGEPAGE | VM_PEER_SHARED); + +unlock: + current->mm = NULL; + mmap_write_unlock(mm); + mmput(mm); +put_task: + put_task_struct(tsk); +out: + pr_info("%s va %lx vma message %d\n", __func__, va, ret); + gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); + kfree(msg); + return ret; +} + +int gmem_handle_free_vma(struct rpg_kmsg_message *msg) +{ + struct gm_pager_msg_rq *recv = (struct gm_pager_msg_rq *)msg; + unsigned long va = recv->va; + unsigned long size = recv->size; + unsigned int my_pid = 
recv->peer_pid; + unsigned int nid = recv->header.to_nid; + unsigned int peer_nid = recv->header.from_nid; + struct mm_struct *old_mm = current->mm; + + int ret = 0; + + tsk = find_get_task_by_vpid(my_pid); + if (!tsk) { + pr_err("svm process does not have task_struct\n"); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + + mm = get_task_mm(tsk); + if (!mm) { + pr_err("no mm\n"); + ret = -1; + goto put_task; + } + + current->mm = mm; + ret = vm_munmap(va, size); + current->mm = old_mm; + + if (ret < 0) + ret = GM_RET_FAILURE_UNKNOWN; + else + ret = GM_RET_SUCCESS; + + mmput(mm); +put_task: + put_task_struct(tsk); +out: + gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); + kfree(msg); + return ret; +} + +pmd_t *get_huge_pmd(const struct vm_area_struct *vma, u64 va) +{ + pgd_t *pgd = NULL; + p4d_t *p4d = NULL; + pud_t *pud = NULL; + pmd_t *pmd = NULL; + + if ((vma == NULL) || (vma->vm_mm == NULL)) { + pr_err("Vm_mm none. (va=0x%llx)\n", va); + return NULL; + } + /* too much log, not print */ + pgd = pgd_offset(vma->vm_mm, va); + if (PXD_JUDGE(pgd)) + return NULL; + + p4d = p4d_offset(pgd, va); + if (PXD_JUDGE(p4d) != 0) + return NULL; + + pud = pud_offset(p4d, va); + if (PXD_JUDGE(pud) != 0) + return NULL; + + pmd = pmd_offset(pud, va); + return pmd; +} + +static inline struct page *alloc_transhuge_page_node(int nid, int zero) +{ + struct page *page; + gfp_t gfp_mask = GFP_TRANSHUGE | __GFP_THISNODE | __GFP_NOWARN; + + if (zero) + gfp_mask |= __GFP_ZERO; + + page = alloc_pages_node(nid, gfp_mask, HPAGE_PMD_ORDER); + if (!page) + return NULL; + + return page; +} + +int gmem_hugepage_remap_owner(struct svm_proc *svm_proc, u64 addr, + pgprot_t prot, struct page *hpage) +{ + int ret; + + ret = hugetlb_insert_hugepage_pte(svm_proc->mm, addr, prot, hpage); + if (ret != 0) { + pr_err("insert_hugepage owner fail. (va=0x%llx)\n", addr); + return ret; + } + + return 0; +} + +int gmem_hugepage_remap_local(struct svm_proc *svm_proc, u64 addr, + pgprot_t prot, struct page *hpage) +{ + int ret = 0; + struct local_pair_proc *item = NULL; + struct local_pair_proc *next = NULL; + + list_for_each_entry_safe(item, next, &svm_proc->tasks_list, node) { + ret = hugetlb_insert_hugepage_pte(item->mm, addr, prot, hpage); + if (ret != 0) { + pr_err("insert_hugepage local fail. (va=0x%llx)\n", addr); + return ret; + } + } + + return 0; +} + + +int gmem_hugepage_remap(struct svm_proc *svm_proc, u64 addr, pgprot_t prot, + struct page *hpage) +{ + int ret; + + ret = gmem_hugepage_remap_owner(svm_proc, addr, prot, hpage); + if (ret != 0) { + pr_err("gmem_hugepage_remap_owner fail. (va=0x%llx)\n", addr); + return ret; + } + + ret = gmem_hugepage_remap_local(svm_proc, addr, prot, hpage); + if (ret != 0) { + pr_err("gmem_hugepage_remap_local fail. 
(va=0x%llx)\n", addr); + return ret; + } + + return 0; +} + +int gmem_handle_alloc_page(struct rpg_kmsg_message *msg) +{ + struct gm_pager_msg_rq *recv = (struct gm_pager_msg_rq *)msg; + unsigned long addr = recv->va; + unsigned int page_size = recv->size; + unsigned int my_pid = recv->peer_pid; + unsigned int peer_ws = recv->my_ws; + int nid = recv->header.to_nid; + int peer_nid = recv->header.from_nid; + struct page_info *page_info; + struct svm_proc *proc = search_svm_proc_by_pid(my_pid); + struct page *page; + unsigned long long prot_val; + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct *vma; + int ret = 0; + + if (!proc) { + pr_info("can not find proc of %d\n", my_pid); + ret = -EINVAL; + goto out; + } + + page_info = get_page_info(&proc->pager, addr, page_size, page_size); + if (!page_info) { + pr_err("%s: failed to get page_info\n", __func__); + ret = -EINVAL; + goto out; + } + + if (recv->behavior == MADV_WILLNEED) { + if (!page_info->page) + goto new_page; + + goto out; + } + +new_page: + /* TODO: How Can Know HBM node */ + page = alloc_transhuge_page_node(1, !recv->dma_addr); + if (!page) { + ret = -ENOMEM; + goto out; + } + + /* We need a condition */ + if (need_wake_up_swapd()) + wake_up_swapd(); + + if (recv->dma_addr) { + handle_migrate_page((void *)recv->dma_addr, page, page_size, + FROM_PEER); + } + + tsk = find_get_task_by_vpid(my_pid); + if (!tsk) { + pr_err("svm process does not have task_struct\n"); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + + mm = get_task_mm(tsk); + if (!mm) { + pr_err("no mm\n"); + ret = -1; + goto put_task; + } + + vma = find_vma(mm, addr); + if (vma->vm_flags & VM_WRITE) { + prot_val = (pgprot_val(PAGE_SHARED_EXEC) & (~PTE_RDONLY)) | + PTE_DIRTY; + } else { + prot_val = pgprot_val(PAGE_READONLY_EXEC); + } + + /* TODO: 9 Consider multiple processes bind */ + ret = gmem_hugepage_remap(proc, addr, __pgprot(prot_val), page); + if (ret) + goto put_mm; + + page_info->page = page; + +put_mm: + mmput(mm); +put_task: + put_task_struct(tsk); +out: + gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); + kfree(msg); + return ret; +} + +static inline void zap_clear_pmd(struct vm_area_struct *vma, u64 vaddr, + pmd_t *pmd) +{ + pmd_clear(pmd); + flush_tlb_range(vma, vaddr, vaddr + HPAGE_SIZE); +} + +void zap_vma_pmd(struct vm_area_struct *vma, u64 vaddr) +{ + pmd_t *pmd = NULL; + + pmd = get_huge_pmd(vma, vaddr); + + if (pmd == NULL) + return; + + zap_clear_pmd(vma, vaddr, pmd); +} + +void gmem_hugepage_unmap_local(struct svm_proc *svm_proc, u64 addr) +{ + struct local_pair_proc *item = NULL; + struct local_pair_proc *next = NULL; + struct vm_area_struct *vma; + + list_for_each_entry_safe(item, next, &svm_proc->tasks_list, node) { + vma = find_vma(item->mm, addr); + if (!vma) + zap_vma_pmd(vma, addr); + } +} + +void gmem_unmap_hugepage(struct svm_proc *svm_proc, u64 addr) +{ + struct vm_area_struct *vma; + + vma = find_vma(svm_proc->mm, addr); + + if (!vma) + zap_vma_pmd(vma, addr); + + gmem_hugepage_unmap_local(svm_proc, addr); +} + +int gmem_handle_free_page(struct rpg_kmsg_message *msg) +{ + struct gm_pager_msg_rq *recv = (struct gm_pager_msg_rq *)msg; + unsigned long addr = recv->va; + unsigned long page_size = recv->size; + unsigned int my_pid = recv->peer_pid; + unsigned int peer_ws = recv->my_ws; + int peer_nid = recv->header.from_nid; + int nid = recv->header.to_nid; + struct task_struct *tsk; + struct mm_struct *mm; + struct page_info *page_info; + struct svm_proc *proc = search_svm_proc_by_pid(my_pid); + struct page 
*page = NULL; + int ret = 0; + + if (!proc) { + pr_info("can not find proc of %d\n", my_pid); + ret = -EINVAL; + goto out; + } + + page_info = get_page_info(&proc->pager, addr, page_size, page_size); + if (!page_info) { + pr_err("%s: failed to get page_info\n", __func__); + ret = -EINVAL; + goto out; + } + + page = page_info->page; + if (!page) { + pr_err("%s: page reference in page_info is NULL\n", __func__); + ret = -EINVAL; + goto out; + } + + tsk = find_get_task_by_vpid(my_pid); + if (!tsk) { + pr_err("svm process does not have task_struct\n"); + ret = GM_RET_FAILURE_UNKNOWN; + goto out; + } + + mm = get_task_mm(tsk); + if (!mm) { + pr_err("no mm\n"); + ret = -1; + goto put_task; + } + + gmem_unmap_hugepage(proc, addr); + mmput(mm); + + if (recv->dma_addr) + handle_migrate_page((void *)recv->dma_addr, page, page_size, + TO_PEER); + + free_page_info(&proc->pager, page_info); + put_page(page); + +put_task: + put_task_struct(tsk); +out: + gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); + kfree(msg); + return ret; +} + +int gmem_handle_hmadvise(struct rpg_kmsg_message *msg) +{ + kfree(msg); + return 0; +} + +int gmem_handle_hmemcpy(struct rpg_kmsg_message *msg) +{ + kfree(msg); + return 0; +} + +static int sync_gmem_vma_to_custom_process(struct svm_proc *svm_proc, + struct local_pair_proc *local_proc) +{ + struct mm_struct *mm = svm_proc->mm; + struct vm_area_struct *vma, *local_vma; + unsigned long populate; + struct mm_struct *old_mm = current->mm; + unsigned long addr; + unsigned long prot = PROT_READ; + + VMA_ITERATOR(vmi, mm, 0); + + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + if (!vma_is_peer_shared(vma)) + continue; + current->mm = local_proc->mm; + pr_debug("%s cur %lx local %lx start %lx -- end %lx\n", __func__, + (unsigned long)current->mm, + (unsigned long)local_proc->mm, vma->vm_start, + vma->vm_end); + prot = PROT_READ; + if (vma->vm_flags & VM_WRITE) + prot |= PROT_WRITE; + addr = __do_mmap_mm(local_proc->mm, NULL, vma->vm_start, + vma->vm_end - vma->vm_start, prot, + MAP_SHARED | MAP_ANONYMOUS | + MAP_FIXED_NOREPLACE, 0, + 0, &populate, NULL); + current->mm = old_mm; + if (IS_ERR_VALUE(addr)) { + pr_err("%s failed start %lx - end %lx ret %ld\n", + __func__, vma->vm_start, vma->vm_end, addr); + continue; + } + local_vma = find_vma(local_proc->mm, addr); + if (!local_vma) { + local_vma->vm_ops = vma->vm_ops; + vm_flags_set(vma, VM_HUGEPAGE | VM_PEER_SHARED); + } + } + mmap_write_unlock(mm); + + return 0; +} + +int gmem_register_pair_local_task(unsigned int bind_to_pid, + unsigned int local_pid) +{ + int ret = 0; + struct svm_proc *proc = search_svm_proc_by_pid(bind_to_pid); + struct local_pair_proc *local_proc; + + pr_debug("%s bind_to_pid %d local_pid %d\n", __func__, bind_to_pid, + local_pid); + + local_proc = insert_local_proc(proc, local_pid); + if (IS_ERR(local_proc)) { + pr_err("%s failed\n", __func__); + return PTR_ERR(local_proc); + } + + /* sync vma and vma_ops to local_pid */ + sync_gmem_vma_to_custom_process(proc, local_proc); + + return ret; +} diff --git a/drivers/remote_pager/svm_proc_mng.c b/drivers/remote_pager/svm_proc_mng.c new file mode 100644 index 000000000000..fab63d4c5be9 --- /dev/null +++ b/drivers/remote_pager/svm_proc_mng.c @@ -0,0 +1,419 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Jiangtian Feng + * Co-Author: Jun Chen, Chuangchuang Fang + * + */ + +#include +#include +#include +#include +#include + +#include "svm_proc_mng.h" + +struct svm_proc_node { + struct svm_proc svm_proc; + struct hlist_node list; +}; + +static inline struct svm_proc_node *to_proc_node(struct svm_proc *proc) +{ + return list_entry(proc, struct svm_proc_node, svm_proc); +} + +#define _PROC_LIST_MAX 0x0f +#define _PROC_LIST_SHIFT 4 +static DEFINE_RWLOCK(svm_proc_hash_rwlock); +static DEFINE_HASHTABLE(svm_proc_hashtable, _PROC_LIST_SHIFT); + +static unsigned int get_hash_tag(int pid) +{ + return (unsigned int)pid % _PROC_LIST_MAX; +} + +static void add_to_hashtable(struct svm_proc *proc) +{ + struct svm_proc_node *node = to_proc_node(proc); + unsigned int tag = get_hash_tag(proc->pid); + + write_lock(&svm_proc_hash_rwlock); + hash_add(svm_proc_hashtable, &node->list, tag); + write_unlock(&svm_proc_hash_rwlock); +} + +static void del_from_hashtable(struct svm_proc *proc) +{ + struct svm_proc_node *node; + + write_lock(&svm_proc_hash_rwlock); + node = to_proc_node(proc); + hash_del(&node->list); + write_unlock(&svm_proc_hash_rwlock); +} + +struct svm_proc *search_svm_proc_by_mm(struct mm_struct *mm) +{ + struct svm_proc_node *node; + unsigned int tag; + + read_lock(&svm_proc_hash_rwlock); + hash_for_each(svm_proc_hashtable, tag, node, list) { + if (node->svm_proc.mm == mm) { + read_unlock(&svm_proc_hash_rwlock); + return &node->svm_proc; + } + } + read_unlock(&svm_proc_hash_rwlock); + + return search_svm_proc_by_local_mm(mm); +} + +struct svm_proc *search_svm_proc_by_local_mm(struct mm_struct *mm) +{ + struct svm_proc_node *node; + unsigned int hash_tag; + struct local_pair_proc *item = NULL; + struct local_pair_proc *next = NULL; + + read_lock(&svm_proc_hash_rwlock); + hash_for_each(svm_proc_hashtable, hash_tag, node, list) { + list_for_each_entry_safe(item, next, &node->svm_proc.tasks_list, node) { + if (item->mm == mm) { + read_unlock(&svm_proc_hash_rwlock); + return &node->svm_proc; + } + } + } + read_unlock(&svm_proc_hash_rwlock); + + return NULL; +} + +struct svm_proc *search_svm_proc_by_pid(unsigned int pid) +{ + struct svm_proc_node *node; + unsigned int tag = get_hash_tag(pid); + + read_lock(&svm_proc_hash_rwlock); + hash_for_each_possible(svm_proc_hashtable, node, list, tag) { + if (node->svm_proc.pid == pid) { + read_unlock(&svm_proc_hash_rwlock); + return &node->svm_proc; + } + } + read_unlock(&svm_proc_hash_rwlock); + + return NULL; +} + +static struct page_info *__search_page_info(struct page_mng *pager, + unsigned long va, unsigned long len) +{ + struct rb_node *node = pager->rbtree.rb_node; + struct page_info *page_info = NULL; + + while (node) { + page_info = rb_entry(node, struct page_info, node); + + if (va + len <= page_info->va) + node = node->rb_left; + else if (va >= page_info->va + page_info->len) + node = node->rb_right; + else + break; + } + + if (page_info) { + if (va < page_info->va || va + len > page_info->va + page_info->len) + return NULL; + } + return page_info; +} + +struct page_info *search_page_info(struct page_mng *pager, unsigned long va, unsigned long len) +{ + struct page_info *page_info; + + if (!pager) + return NULL; + + down_read(&pager->rw_sem); + page_info = __search_page_info(pager, va, len); + up_read(&pager->rw_sem); + + return page_info; +} + +static int insert_page_info(struct page_mng *pager, struct page_info *page_info) +{ + struct rb_node **new_node; + struct rb_node *parent = NULL; + struct page_info *cur = NULL; + + 
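+	/*
+	 * page_info nodes track disjoint [va, va + len) ranges: walk the
+	 * rbtree to find the insertion point and reject any overlapping
+	 * range with -EFAULT.
+	 */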
down_write(&pager->rw_sem); + new_node = &(pager->rbtree.rb_node); + + /* Figure out where to put new node */ + while (*new_node) { + cur = rb_entry(*new_node, struct page_info, node); + parent = *new_node; + if (page_info->va + page_info->len <= cur->va) { + new_node = &((*new_node)->rb_left); + } else if (page_info->va >= cur->va + cur->len) { + new_node = &((*new_node)->rb_right); + } else { + up_write(&pager->rw_sem); + return -EFAULT; + } + } + /* Add new node and rebalance tree. */ + rb_link_node(&page_info->node, parent, new_node); + rb_insert_color(&page_info->node, &pager->rbtree); + + up_write(&pager->rw_sem); + + return 0; +} + +static void erase_page_info(struct page_mng *pager, struct page_info *page_info) +{ + rb_erase(&page_info->node, &pager->rbtree); +} + +static struct page_info *alloc_page_info(unsigned long va, unsigned long len, + unsigned int page_size) +{ + + struct page_info *page_info; + size_t size; + + size = sizeof(struct page_info); + page_info = kzalloc(size, GFP_KERNEL); + if (!page_info) { + pr_err("alloc page_info failed: (size=%lx)\n", (unsigned long)size); + return NULL; + } + + page_info->va = va; + page_info->len = len; + mutex_init(&page_info->lock); + + return page_info; +} + +struct page_info *get_page_info(struct page_mng *pager, + unsigned long va, unsigned long len, unsigned int page_size) +{ + struct page_info *page_info = search_page_info(pager, va, len); + + if (page_info) + return page_info; + + page_info = alloc_page_info(va, len, page_size); + if (page_info) { + if (insert_page_info(pager, page_info)) { + kfree(page_info); + page_info = search_page_info(pager, va, len); + } + } + + return page_info; +} + +void free_page_info(struct page_mng *pager, struct page_info *page_info) +{ + down_write(&pager->rw_sem); + erase_page_info(pager, page_info); + up_write(&pager->rw_sem); + kfree(page_info); +} + +static void free_pager(struct page_mng *pager) +{ + struct page_info *page_info = NULL; + struct rb_node *node = NULL; + + down_write(&pager->rw_sem); + node = rb_first(&pager->rbtree); + while (node) { + page_info = rb_entry(node, struct page_info, node); + node = rb_next(node); + erase_page_info(pager, page_info); + kfree(page_info); + } + up_write(&pager->rw_sem); +} + +static void free_svm_proc(struct svm_proc *proc) +{ + struct local_pair_proc *item = NULL; + struct local_pair_proc *next = NULL; + struct mm_struct *mm = proc->mm; + int count; + + free_pager(&proc->pager); + del_from_hashtable(proc); + + count = atomic_read(&mm->mm_users); + if (count) { + pr_err("mm_users is %d\n", count); + mmput(mm); + } + + if (!list_empty(&proc->tasks_list)) { + list_for_each_entry_safe(item, next, &proc->tasks_list, node) + list_del(&item->node); + } + pr_err("svm proc clean up done pid %d, peer_pid %d\n", proc->pid, proc->peer_pid); +} + +static void svm_proc_mm_release(struct mmu_notifier *subscription, struct mm_struct *mm) +{ + struct svm_proc *proc = container_of(subscription, struct svm_proc, notifier); + + free_svm_proc(proc); + kfree(proc); +} + +static const struct mmu_notifier_ops svm_proc_mmu_notifier_ops = { + .release = svm_proc_mm_release, +}; + +static int svm_proc_mmu_notifier_register(struct svm_proc *proc) +{ + proc->notifier.ops = &svm_proc_mmu_notifier_ops; + + return mmu_notifier_register(&proc->notifier, proc->mm); +} + +static void local_pair_proc_mm_release(struct mmu_notifier *subscription, struct mm_struct *mm) +{ + struct local_pair_proc *local_proc = + container_of(subscription, struct local_pair_proc, notifier); + + 
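+	/*
+	 * The paired task's mm is going away: unlink this local_pair_proc
+	 * from the owning svm_proc's tasks_list and free it.
+	 */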
list_del(&local_proc->node); + kfree(local_proc); + pr_debug("clean pair proc resources\n"); +} + +static const struct mmu_notifier_ops local_pair_proc_mmu_notifier_ops = { + .release = local_pair_proc_mm_release, +}; + +static int local_pair_proc_mmu_notifier_register(struct local_pair_proc *local_proc) +{ + local_proc->notifier.ops = &local_pair_proc_mmu_notifier_ops; + + return mmu_notifier_register(&local_proc->notifier, local_proc->mm); +} + +struct local_pair_proc *insert_local_proc(struct svm_proc *proc, unsigned int pid) +{ + int ret = 0; + struct local_pair_proc *local_proc = kzalloc(sizeof(struct local_pair_proc), GFP_KERNEL); + + if (!local_proc) + return ERR_PTR(-ENOMEM); + + local_proc->tsk = find_get_task_by_vpid(pid); + if (!local_proc->tsk) { + pr_err("can not find process by pid %d\n", pid); + ret = -EINVAL; + goto free; + } + + local_proc->pid = pid; + local_proc->mm = get_task_mm(local_proc->tsk); + /* task is exiting */ + if (!local_proc->mm) { + pr_err("can not get process[%d] mm\n", pid); + ret = -EINTR; + goto put_task; + } + + ret = local_pair_proc_mmu_notifier_register(local_proc); + if (ret) { + pr_err("register mmu notifier failed\n"); + goto put_mm; + } + + mmput(local_proc->mm); + put_task_struct(local_proc->tsk); + + list_add(&local_proc->node, &proc->tasks_list); + pr_debug("%s bind_to_pid %d local_pid %d\n", __func__, proc->pid, local_proc->pid); + + return local_proc; + +put_mm: + mmput(local_proc->mm); +put_task: + put_task_struct(local_proc->tsk); +free: + kfree(local_proc); + return ERR_PTR(ret); +} + +struct svm_proc *alloc_svm_proc(int nid, int pid, int peer_nid, int peer_pid) +{ + struct svm_proc *proc; + int ret; + + proc = kzalloc(sizeof(struct svm_proc), GFP_KERNEL); + if (!proc) + return ERR_PTR(-ENOMEM); + + proc->pager.rbtree = RB_ROOT; + init_rwsem(&proc->pager.rw_sem); + + proc->pid = pid; + proc->nid = nid; + proc->peer_nid = peer_nid; + proc->peer_pid = peer_pid; + INIT_LIST_HEAD(&proc->tasks_list); + + proc->tsk = find_get_task_by_vpid(pid); + if (!proc->tsk) { + pr_err("can not find process by pid %d\n", pid); + ret = -EINVAL; + goto free; + } + + proc->mm = get_task_mm(proc->tsk); + /* task is exiting */ + if (!proc->mm) { + pr_err("can not get process[%d] mm\n", pid); + ret = -EINTR; + goto put_task; + } + + ret = svm_proc_mmu_notifier_register(proc); + if (ret) { + pr_err("register mmu notifier failed\n"); + goto put_mm; + } + + /* + * destroying svm_proc depends on mmu_notifier. + * we have to put mm to make sure mmu_notifier can be called + */ + mmput(proc->mm); + put_task_struct(proc->tsk); + + add_to_hashtable(proc); + + return proc; + +put_mm: + mmput(proc->mm); +put_task: + put_task_struct(proc->tsk); +free: + kfree(proc); + return ERR_PTR(ret); +} diff --git a/drivers/remote_pager/svm_proc_mng.h b/drivers/remote_pager/svm_proc_mng.h new file mode 100644 index 000000000000..7af28610c285 --- /dev/null +++ b/drivers/remote_pager/svm_proc_mng.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * Generalized Memory Management. + * + * Copyright (C) 2023- Huawei, Inc. 
+ * Author: Jiangtian Feng + * Co-Author: Jun Chen + */ + +#ifndef _REMOTE_PAGER_PROC_MNG_H_ +#define _REMOTE_PAGER_PROC_MNG_H_ + +#include +#include +#include +#include +#include + +struct page_info { + struct rb_node node; + unsigned long va; + unsigned long len; + struct mutex lock; + struct page *page; +}; + +struct page_mng { + struct rw_semaphore rw_sem; + struct rb_root rbtree; +}; + +struct local_pair_proc { + struct list_head node; + pid_t pid; + struct task_struct *tsk; + struct mm_struct *mm; + struct mmu_notifier notifier; +}; + +struct svm_proc { + int pid; + int nid; + int peer_pid; + int peer_nid; + struct mm_struct *mm; /* never dereference */ + struct task_struct *tsk; + struct list_head tasks_list; /* bind to svm_proc local tasks */ + struct mmu_notifier notifier; + + struct page_mng pager; +}; + +struct page_info *search_page_info(struct page_mng *pager, + unsigned long va, unsigned long len); +struct page_info *get_page_info(struct page_mng *pager, + unsigned long va, unsigned long len, unsigned int page_size); +void free_page_info(struct page_mng *pager, struct page_info *page_info); + +struct svm_proc *alloc_svm_proc(int nid, int pid, int peer_nid, int peer_pid); +struct svm_proc *search_svm_proc_by_mm(struct mm_struct *mm); +struct svm_proc *search_svm_proc_by_pid(unsigned int pid); +struct local_pair_proc *insert_local_proc(struct svm_proc *proc, + unsigned int local_pid); +struct svm_proc *search_svm_proc_by_local_mm(struct mm_struct *mm); + diff --git a/drivers/remote_pager/wait_station.c b/drivers/remote_pager/wait_station.c new file mode 100644 index 000000000000..df5075c744fd --- /dev/null +++ b/drivers/remote_pager/wait_station.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Waiting stations allows threads to be waited for a given + * number of events are completed + * + * Original file developed by SSRG at Virginia Tech. 
+ * + * author, Javier Malave, Rebecca Shapiro, Andrew Hughes, + * Narf Industries 2020 (modifications for upstream RFC) + * + */ + +#include +#include +#include + +#include "wait_station.h" + +#define MAX_WAIT_STATIONS 1024 +#define MAX_WAIT_IO_TIMEOUT (300 * HZ) + +static struct wait_station wait_stations[MAX_WAIT_STATIONS]; + +static DEFINE_SPINLOCK(wait_station_lock); +static DECLARE_BITMAP(wait_station_available, MAX_WAIT_STATIONS) = { 0 }; + +struct wait_station *get_wait_station(void) +{ + int id; + struct wait_station *ws; + + spin_lock(&wait_station_lock); + id = find_first_zero_bit(wait_station_available, MAX_WAIT_STATIONS); + ws = wait_stations + id; + set_bit(id, wait_station_available); + spin_unlock(&wait_station_lock); + + ws->id = id; + ws->private = (void *)0xbad0face; + init_completion(&ws->pendings); + + return ws; +} +EXPORT_SYMBOL_GPL(get_wait_station); + +struct wait_station *wait_station(int id) +{ + /* memory barrier */ + smp_rmb(); + return wait_stations + id; +} +EXPORT_SYMBOL_GPL(wait_station); + +void put_wait_station(struct wait_station *ws) +{ + int id = ws->id; + + spin_lock(&wait_station_lock); + clear_bit(id, wait_station_available); + spin_unlock(&wait_station_lock); +} +EXPORT_SYMBOL_GPL(put_wait_station); + +void *wait_at_station(struct wait_station *ws) +{ + void *ret; + + if (!try_wait_for_completion(&ws->pendings)) { + if (wait_for_completion_io_timeout(&ws->pendings, MAX_WAIT_IO_TIMEOUT) == 0) { + pr_err("%s timeout\n", __func__); + ret = ERR_PTR(-ETIMEDOUT); + goto out; + } + } + /* memory barrier */ + smp_rmb(); + ret = ws->private; +out: + put_wait_station(ws); + return ret; +} diff --git a/drivers/remote_pager/wait_station.h b/drivers/remote_pager/wait_station.h new file mode 100644 index 000000000000..43ff9271288f --- /dev/null +++ b/drivers/remote_pager/wait_station.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * /kernel/popcorn/wait_station.c + * + * Waiting stations allows threads to be waited for a given + * number of events are completed + * + * Original file developed by SSRG at Virginia Tech. 
+ * + * author, Javier Malave, Rebecca Shapiro, Andrew Hughes, + * Narf Industries 2020 (modifications for upstream RFC) + * + */ + +#ifndef _REMOTE_PAGER_WAIT_STATION_H_ +#define _REMOTE_PAGER_WAIT_STATION_H_ + +#include +#include + +struct wait_station { + unsigned int id; + void *private; + struct completion pendings; +}; + +struct wait_station *get_wait_station(void); +struct wait_station *wait_station(int id); +void put_wait_station(struct wait_station *ws); +void *wait_at_station(struct wait_station *ws); diff --git a/include/linux/remote_pager/msg_chan.h b/include/linux/remote_pager/msg_chan.h new file mode 100644 index 000000000000..b6e46b6f7378 --- /dev/null +++ b/include/linux/remote_pager/msg_chan.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __RPG_MSG_CHAN_H__ +#define __RPG_MSG_CHAN_H__ + +#include + +/* + * struct phys_channel_ops - Channel physical layer ops + * @open: Open the communication channel of node nid and alloc physical resources, + * returns the channel ID + * @notify: Notify peer of chan_id to receive messages + * @copy_to: Copy the msg_data message from origin to peer + * @copy_from: Copy the msg_data message from peer to origin + * @close: Close channel and free physical resources + */ +struct phys_channel_ops { + char *name; + int (*open)(int nid); + int (*notify)(int chan_id); + int (*copy_to)(int chan_id, void *msg_data, size_t msg_len, int flags); + int (*copy_from)(int chan_id, void *buf, size_t len, int flags); + int (*migrate_page)(void *peer_addr, struct page *local_page, size_t size, int dir); + int (*close)(int chan_id); +}; + +int msg_layer_install_phy_ops(struct phys_channel_ops *ops, int default_chan_id); +int msg_layer_uninstall_phy_ops(struct phys_channel_ops *ops); + +#define log_err(fmt, ...) pr_err("[%s:%d]" fmt, __func__, __LINE__, ##__VA_ARGS__) +#define log_info(fmt, ...) 
pr_info("[%s:%d]" fmt, __func__, __LINE__, ##__VA_ARGS__) + +#define MSG_CMD_START 0x1 +#define MSG_CMD_IRQ_END 0x2 +#define MSG_CMD_FIFO_NO_MEM 0x3 +#define MSG_CMD_CHANN_OPEN 0x4 + +#define CHAN_STAT_ENABLE 1 +#define CHAN_STAT_DISABLE 0 + +#define TO_PEER 0 +#define FROM_PEER 1 + -- Gitee From c96af8065139e2f7fedad97b190e33fc287bb105 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 10 Jul 2025 20:32:48 +0800 Subject: [PATCH 11/27] mm: gmem: use thp_vma_suitable_order instead of transhuge_vma_suitable euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- use thp_vma_suitable_order instead of transhuge_vma_suitable and add config isolation huge_mm.h Signed-off-by: nicunshu --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1070688be9d1..34af481e6176 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1564,7 +1564,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); - goto gm_mapping_release; + return ret; } pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); -- Gitee From 26a80939b1ab75c2c54e6be41c9373e6dafe4aee Mon Sep 17 00:00:00 2001 From: nicunshu Date: Sat, 28 Jun 2025 10:17:48 +0800 Subject: [PATCH 12/27] mm: gmem: change conflict flag euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- change conflict mmap flag used in gmem Fixes: 4c627cebab85 ("mm: gmem: Introduce GMEM") Signed-off-by: nicunshu --- include/uapi/asm-generic/mman-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index cdcb59fbfe7f..d8857c71d4bb 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -33,7 +33,7 @@ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ -#define MAP_PEER_SHARED 0x8000000 +#define MAP_PEER_SHARED 0x1000000 /* * Flags for mlock -- Gitee From d048b45bcc4f8ffaa9e30b10c373638993454078 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 10 Jul 2025 20:40:51 +0800 Subject: [PATCH 13/27] mm: gmem: use kabi_reserve to avoid kabi breakage euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- use kabi_reserve to avoid kabi breakage Fixes: 4c627cebab85 ("mm: gmem: Introduce GMEM") Signed-off-by: nicunshu --- include/linux/mm_types.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bcfbaa36bbbb..28b308a7d5c6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1060,7 +1060,11 @@ struct mm_struct { #else KABI_RESERVE(1) #endif +#ifdef CONFIG_GMEM + KABI_USE(2, gm_as_t *gm_as) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5) -- Gitee From 11ab9450154cf3b37de41f8e63e0fb848b34ea76 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 10 Jul 2025 20:47:48 +0800 Subject: [PATCH 14/27] driver: remote_pager: fix several flaws MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit euleros inclusion category: bugfix bugzilla: 
https://gitee.com/openeuler/kernel/issues/ICHFJN

---------------------------------------------

1. show info when msg_open fails or kmalloc fails during message processing
2. change the printk level of the init success message to debug
3. change the maximum device num to 16

Fixes: 0c2df2139d4d ("drivers: remote_pager: introduce remote_pager module for gmem")
Signed-off-by: nicunshu
---
 drivers/remote_pager/msg_chan/msg_layer/msg_layer.c | 4 +++-
 drivers/remote_pager/msg_handler_comm.c | 5 ++++-
 drivers/remote_pager/msg_handler_origin.c | 4 ++--
 drivers/remote_pager/svm_proc_mng.c | 2 +-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c
index eceff696c77f..edf98c5f1834 100644
--- a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c
+++ b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c
@@ -37,6 +37,8 @@ int msg_send(int chan_id, void *msg_data, size_t msg_len)
 	ret = g_phys_chan_ops->copy_to(chan_id, msg_data, msg_len, 1);
 	ret |= g_phys_chan_ops->notify(chan_id);
+	if (ret < 0)
+		pr_err("%s failed in chan %d\n", __func__, chan_id);
 	return ret;
 }
@@ -236,7 +238,7 @@ int msg_layer_install_phy_ops(struct phys_channel_ops *ops, int default_chan_id)
 	g_phys_chan_ops = ops;
 	if (default_chan_id >= 0) {
 		ret = msg_open(default_chan_id);
-		if (ret) {
+		if (ret < 0) {
 			pr_err("can not open msg channel %d\n", default_chan_id);
 			g_phys_chan_ops = NULL;
 			goto unlock;
diff --git a/drivers/remote_pager/msg_handler_comm.c b/drivers/remote_pager/msg_handler_comm.c
index f195ad15e646..cefd9d9b7ccf 100644
--- a/drivers/remote_pager/msg_handler_comm.c
+++ b/drivers/remote_pager/msg_handler_comm.c
@@ -113,7 +113,10 @@ void process_remote_pager_work(struct work_struct *work)
 int handle_remote_pager_work(void *msg)
 {
 	struct rpg_kmsg_work *w = kmalloc(sizeof(*w), GFP_ATOMIC);
-
+	if (IS_ERR_OR_NULL(w)) {
+		pr_err("can not alloc memory for rpg_kmsg_work\n");
+		goto PTR_ERR(w);
+	}
 	w->msg = msg;
 	INIT_WORK(&w->work, process_remote_pager_work);
diff --git a/drivers/remote_pager/msg_handler_origin.c b/drivers/remote_pager/msg_handler_origin.c
index 8c861b6d2eb0..501c627dbb0f 100644
--- a/drivers/remote_pager/msg_handler_origin.c
+++ b/drivers/remote_pager/msg_handler_origin.c
@@ -19,7 +19,7 @@
 #include "svm_proc_mng.h"
 #define NPU_PAGE_SIZE PAGE_SIZE
-#define MAX_NR_NPU 8
+#define MAX_NR_NPU 16
 #define GMEM_DEBUG 0
 static gm_dev_t *gm_devs[MAX_NR_NPU];
@@ -42,7 +42,7 @@ int gmem_register_pair_remote_task(int origin_nid, int origin_pid, int remote_ni
 	struct wait_station *ws;
 
 	/* open msg chan */
-	pr_err("%s origin_nid %d, origin_pid %d, remote_nid %d, remote_pid %d\n", __func__,
+	pr_debug("%s origin_nid %d, origin_pid %d, remote_nid %d, remote_pid %d\n", __func__,
 		origin_nid, origin_pid, remote_nid, remote_pid);
 	ret = msg_open(remote_nid);
 	if (ret < 0) {
diff --git a/drivers/remote_pager/svm_proc_mng.c b/drivers/remote_pager/svm_proc_mng.c
index fab63d4c5be9..c339b407bc27 100644
--- a/drivers/remote_pager/svm_proc_mng.c
+++ b/drivers/remote_pager/svm_proc_mng.c
@@ -269,7 +269,7 @@ static void free_svm_proc(struct svm_proc *proc)
 		list_for_each_entry_safe(item, next, &proc->tasks_list, node)
 			list_del(&item->node);
 	}
-	pr_err("svm proc clean up done pid %d, peer_pid %d\n", proc->pid, proc->peer_pid);
+	pr_debug("svm proc clean up done pid %d, peer_pid %d\n", proc->pid, proc->peer_pid);
 }
 
 static void svm_proc_mm_release(struct mmu_notifier *subscription, struct mm_struct *mm)
-- Gitee From 
baaabb5c5b18c5c13975f3a07f39c979632a2b79 Mon Sep 17 00:00:00 2001 From: xujunming Date: Thu, 10 Jul 2025 20:55:59 +0800 Subject: [PATCH 15/27] drivers: remote_pager: remove remote_pager_peer euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- remove remote_pager_peer which should be compiled in npu OS. Signed-off-by: xujunming --- drivers/remote_pager/Kconfig | 13 ----- drivers/remote_pager/Makefile | 3 +- drivers/remote_pager/main.c | 1 + .../msg_chan/msg_layer/msg_layer.h | 1 + drivers/remote_pager/msg_handler.h | 13 ++--- drivers/remote_pager/msg_handler_comm.c | 27 +--------- drivers/remote_pager/msg_handler_peer.c | 52 ++++++++++--------- drivers/remote_pager/svm_proc_mng.c | 52 +++++++++++-------- drivers/remote_pager/svm_proc_mng.h | 2 +- drivers/remote_pager/wait_station.c | 4 +- drivers/remote_pager/wait_station.h | 1 + include/linux/remote_pager/msg_chan.h | 1 + 12 files changed, 72 insertions(+), 98 deletions(-) diff --git a/drivers/remote_pager/Kconfig b/drivers/remote_pager/Kconfig index bf0d0f58a3d4..414a676f02b0 100644 --- a/drivers/remote_pager/Kconfig +++ b/drivers/remote_pager/Kconfig @@ -22,17 +22,4 @@ config REMOTE_PAGER_MASTER Used for memory management If unsure, say Y. -config REMOTE_PAGER_SLAVE - tristate "remote pager slave" - default n - depends on ARM64 - depends on REMOTE_PAGER - help - Module used for gmem. - This is host part, used for send and recv message from device - Used for memory management - If you want to remote_pager driver to support the peer function, - say m. - If unsure, say Y. - endmenu diff --git a/drivers/remote_pager/Makefile b/drivers/remote_pager/Makefile index cb723290af59..8b1d735ae546 100644 --- a/drivers/remote_pager/Makefile +++ b/drivers/remote_pager/Makefile @@ -8,5 +8,4 @@ remote_pager-$(CONFIG_REMOTE_PAGER) := main.o \ msg_chan/msg_layer/msg_layer.o \ svm_proc_mng.o -remote_pager-$(CONFIG_REMOTE_PAGER_MASTER) += msg_handler_origin.o -remote_pager-$(CONFIG_REMOTE_PAGER_SLAVE) += msg_handler_peer.o \ No newline at end of file +remote_pager-$(CONFIG_REMOTE_PAGER_MASTER) += msg_handler_origin.o \ No newline at end of file diff --git a/drivers/remote_pager/main.c b/drivers/remote_pager/main.c index 1e4aec881b9e..afa17ee2ce07 100644 --- a/drivers/remote_pager/main.c +++ b/drivers/remote_pager/main.c @@ -30,3 +30,4 @@ module_exit(remote_pager_exit); MODULE_AUTHOR("Huawei Tech. 
Co., Ltd."); MODULE_DESCRIPTION("Remote-pager"); MODULE_ALIAS("Remote-pager"); +MODULE_LICENSE("GPL"); diff --git a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h index 1217dafeaf52..221842e8c434 100644 --- a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h +++ b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h @@ -46,3 +46,4 @@ int msg_open(int nid); int msg_close(int nid); int handle_migrate_page(void *peer_addr, struct page *local_page, size_t size, int dir); +#endif diff --git a/drivers/remote_pager/msg_handler.h b/drivers/remote_pager/msg_handler.h index cb08fe765c77..d7a0cd7231a9 100644 --- a/drivers/remote_pager/msg_handler.h +++ b/drivers/remote_pager/msg_handler.h @@ -111,17 +111,9 @@ gm_dev_t *gmem_id_to_device(unsigned int id); /* msg handler */ -int gmem_handle_task_pairing(struct rpg_kmsg_message *msg); +int gmem_handle_evict_page(struct rpg_kmsg_message *msg); int gmem_handle_comm_msg_rsp(struct rpg_kmsg_message *msg); -int gmem_handle_alloc_vma_fixed(struct rpg_kmsg_message *msg); -int gmem_handle_free_vma(struct rpg_kmsg_message *msg); - -int gmem_handle_alloc_page(struct rpg_kmsg_message *msg); -int gmem_handle_free_page(struct rpg_kmsg_message *msg); -int gmem_handle_hmadvise(struct rpg_kmsg_message *msg); -int gmem_handle_hmemcpy(struct rpg_kmsg_message *msg); int gmem_handle_dev_fault(struct rpg_kmsg_message *msg); -int gmem_handle_evict_page(struct rpg_kmsg_message *msg); int gmem_add_to_svm_proc(int my_nid, int my_pid, int peer_nid, int peer_pid); int gmem_send_comm_msg_reply(unsigned int from_nid, unsigned int to_nid, @@ -130,3 +122,6 @@ int gmem_send_comm_msg_reply(unsigned int from_nid, unsigned int to_nid, int handle_remote_pager_work(void *msg); int msg_handle_init(void); +#endif + + diff --git a/drivers/remote_pager/msg_handler_comm.c b/drivers/remote_pager/msg_handler_comm.c index cefd9d9b7ccf..82308249afda 100644 --- a/drivers/remote_pager/msg_handler_comm.c +++ b/drivers/remote_pager/msg_handler_comm.c @@ -17,31 +17,6 @@ static struct workqueue_struct *remote_pager_wq; struct msg_handler_st rpg_kmsg_cbftns[GMEM_MSG_MAX_ID] = { -#if IS_ENABLED(CONFIG_REMOTE_PAGER_SLAVE) - /* HOST TO REMOTE */ - [GMEM_TASK_PAIRING_REQUEST] = { - gmem_handle_task_pairing - }, - [GMEM_ALLOC_VMA_REQUEST] = { - gmem_handle_alloc_vma_fixed - }, - [GMEM_FREE_VMA_REQUEST] = { - gmem_handle_free_vma - }, - [GMEM_ALLOC_PAGE_REQUEST] = { - gmem_handle_alloc_page - }, - [GMEM_FREE_PAGE_REQUEST] = { - gmem_handle_free_page - }, - [GMEM_HMADVISE_REQUEST] = { - gmem_handle_hmadvise - }, - [GMEM_HMEMCPY_REQUEST] = { - gmem_handle_hmemcpy - }, -#endif - #if IS_ENABLED(CONFIG_REMOTE_PAGER_MASTER) /* REMOTE TO HOST */ [GMEM_PAGE_FAULT_REQUEST] = { @@ -115,7 +90,7 @@ int handle_remote_pager_work(void *msg) struct rpg_kmsg_work *w = kmalloc(sizeof(*w), GFP_ATOMIC); if (IS_ERR_OR_NULL(w)) { pr_err("can not alloc memory for rpg_kmsg_work\n"); - goto PTR_ERR(w); + return PTR_ERR(w); } w->msg = msg; diff --git a/drivers/remote_pager/msg_handler_peer.c b/drivers/remote_pager/msg_handler_peer.c index 68c16f6b3ff9..5bb809f5258c 100644 --- a/drivers/remote_pager/msg_handler_peer.c +++ b/drivers/remote_pager/msg_handler_peer.c @@ -108,7 +108,7 @@ static vm_fault_t __gmem_fault(struct vm_fault *vmf, req.peer_pid = proc->peer_pid; ret = msg_send_nid(GMEM_PAGE_FAULT_REQUEST, proc->nid, proc->peer_nid, - &req, sizeof(req)); + &req, sizeof(req)); rsp = wait_at_station(ws); if ((long)rsp != -ETIMEDOUT) { msg_ret = rsp->ret; @@ -211,8 +211,8 @@ int 
gmem_handle_alloc_vma_fixed(struct rpg_kmsg_message *msg) mmap_write_lock(mm); current->mm = mm; addr = __do_mmap_mm(mm, NULL, va, size, prot, - MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, 0, - 0, &populate, NULL); + MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, 0, + 0, &populate, NULL); if (IS_ERR_VALUE(addr)) { ret = addr; goto unlock; @@ -248,6 +248,9 @@ int gmem_handle_free_vma(struct rpg_kmsg_message *msg) unsigned int my_pid = recv->peer_pid; unsigned int nid = recv->header.to_nid; unsigned int peer_nid = recv->header.from_nid; + unsigned int peer_ws = recv->my_ws; + struct task_struct *tsk; + struct mm_struct *mm; struct mm_struct *old_mm = current->mm; int ret = 0; @@ -328,7 +331,7 @@ static inline struct page *alloc_transhuge_page_node(int nid, int zero) } int gmem_hugepage_remap_owner(struct svm_proc *svm_proc, u64 addr, - pgprot_t prot, struct page *hpage) + pgprot_t prot, struct page *hpage) { int ret; @@ -342,7 +345,7 @@ int gmem_hugepage_remap_owner(struct svm_proc *svm_proc, u64 addr, } int gmem_hugepage_remap_local(struct svm_proc *svm_proc, u64 addr, - pgprot_t prot, struct page *hpage) + pgprot_t prot, struct page *hpage) { int ret = 0; struct local_pair_proc *item = NULL; @@ -351,7 +354,8 @@ int gmem_hugepage_remap_local(struct svm_proc *svm_proc, u64 addr, list_for_each_entry_safe(item, next, &svm_proc->tasks_list, node) { ret = hugetlb_insert_hugepage_pte(item->mm, addr, prot, hpage); if (ret != 0) { - pr_err("insert_hugepage local fail. (va=0x%llx)\n", addr); + pr_err("insert_hugepage local fail. (va=0x%llx)\n", + addr); return ret; } } @@ -359,9 +363,8 @@ int gmem_hugepage_remap_local(struct svm_proc *svm_proc, u64 addr, return 0; } - int gmem_hugepage_remap(struct svm_proc *svm_proc, u64 addr, pgprot_t prot, - struct page *hpage) + struct page *hpage) { int ret; @@ -432,7 +435,7 @@ int gmem_handle_alloc_page(struct rpg_kmsg_message *msg) if (recv->dma_addr) { handle_migrate_page((void *)recv->dma_addr, page, page_size, - FROM_PEER); + FROM_PEER); } tsk = find_get_task_by_vpid(my_pid); @@ -452,7 +455,7 @@ int gmem_handle_alloc_page(struct rpg_kmsg_message *msg) vma = find_vma(mm, addr); if (vma->vm_flags & VM_WRITE) { prot_val = (pgprot_val(PAGE_SHARED_EXEC) & (~PTE_RDONLY)) | - PTE_DIRTY; + PTE_DIRTY; } else { prot_val = pgprot_val(PAGE_READONLY_EXEC); } @@ -475,7 +478,7 @@ int gmem_handle_alloc_page(struct rpg_kmsg_message *msg) } static inline void zap_clear_pmd(struct vm_area_struct *vma, u64 vaddr, - pmd_t *pmd) + pmd_t *pmd) { pmd_clear(pmd); flush_tlb_range(vma, vaddr, vaddr + HPAGE_SIZE); @@ -573,7 +576,7 @@ int gmem_handle_free_page(struct rpg_kmsg_message *msg) if (recv->dma_addr) handle_migrate_page((void *)recv->dma_addr, page, page_size, - TO_PEER); + TO_PEER); free_page_info(&proc->pager, page_info); put_page(page); @@ -599,7 +602,7 @@ int gmem_handle_hmemcpy(struct rpg_kmsg_message *msg) } static int sync_gmem_vma_to_custom_process(struct svm_proc *svm_proc, - struct local_pair_proc *local_proc) + struct local_pair_proc *local_proc) { struct mm_struct *mm = svm_proc->mm; struct vm_area_struct *vma, *local_vma; @@ -615,22 +618,22 @@ static int sync_gmem_vma_to_custom_process(struct svm_proc *svm_proc, if (!vma_is_peer_shared(vma)) continue; current->mm = local_proc->mm; - pr_debug("%s cur %lx local %lx start %lx -- end %lx\n", __func__, - (unsigned long)current->mm, - (unsigned long)local_proc->mm, vma->vm_start, - vma->vm_end); + pr_debug("%s cur %lx local %lx start %lx -- end %lx\n", + __func__, (unsigned long)current->mm, + (unsigned 
long)local_proc->mm, vma->vm_start, + vma->vm_end); prot = PROT_READ; if (vma->vm_flags & VM_WRITE) prot |= PROT_WRITE; addr = __do_mmap_mm(local_proc->mm, NULL, vma->vm_start, - vma->vm_end - vma->vm_start, prot, - MAP_SHARED | MAP_ANONYMOUS | - MAP_FIXED_NOREPLACE, 0, - 0, &populate, NULL); + vma->vm_end - vma->vm_start, prot, + MAP_SHARED | MAP_ANONYMOUS | + MAP_FIXED_NOREPLACE, + 0, 0, &populate, NULL); current->mm = old_mm; if (IS_ERR_VALUE(addr)) { pr_err("%s failed start %lx - end %lx ret %ld\n", - __func__, vma->vm_start, vma->vm_end, addr); + __func__, vma->vm_start, vma->vm_end, addr); continue; } local_vma = find_vma(local_proc->mm, addr); @@ -645,14 +648,14 @@ static int sync_gmem_vma_to_custom_process(struct svm_proc *svm_proc, } int gmem_register_pair_local_task(unsigned int bind_to_pid, - unsigned int local_pid) + unsigned int local_pid) { int ret = 0; struct svm_proc *proc = search_svm_proc_by_pid(bind_to_pid); struct local_pair_proc *local_proc; pr_debug("%s bind_to_pid %d local_pid %d\n", __func__, bind_to_pid, - local_pid); + local_pid); local_proc = insert_local_proc(proc, local_pid); if (IS_ERR(local_proc)) { @@ -665,3 +668,4 @@ int gmem_register_pair_local_task(unsigned int bind_to_pid, return ret; } +EXPORT_SYMBOL(gmem_register_pair_local_task); diff --git a/drivers/remote_pager/svm_proc_mng.c b/drivers/remote_pager/svm_proc_mng.c index c339b407bc27..201c30885437 100644 --- a/drivers/remote_pager/svm_proc_mng.c +++ b/drivers/remote_pager/svm_proc_mng.c @@ -26,8 +26,8 @@ static inline struct svm_proc_node *to_proc_node(struct svm_proc *proc) return list_entry(proc, struct svm_proc_node, svm_proc); } -#define _PROC_LIST_MAX 0x0f -#define _PROC_LIST_SHIFT 4 +#define _PROC_LIST_MAX 0x0f +#define _PROC_LIST_SHIFT 4 static DEFINE_RWLOCK(svm_proc_hash_rwlock); static DEFINE_HASHTABLE(svm_proc_hashtable, _PROC_LIST_SHIFT); @@ -82,7 +82,8 @@ struct svm_proc *search_svm_proc_by_local_mm(struct mm_struct *mm) read_lock(&svm_proc_hash_rwlock); hash_for_each(svm_proc_hashtable, hash_tag, node, list) { - list_for_each_entry_safe(item, next, &node->svm_proc.tasks_list, node) { + list_for_each_entry_safe(item, next, &node->svm_proc.tasks_list, + node) { if (item->mm == mm) { read_unlock(&svm_proc_hash_rwlock); return &node->svm_proc; @@ -112,7 +113,7 @@ struct svm_proc *search_svm_proc_by_pid(unsigned int pid) } static struct page_info *__search_page_info(struct page_mng *pager, - unsigned long va, unsigned long len) + unsigned long va, unsigned long len) { struct rb_node *node = pager->rbtree.rb_node; struct page_info *page_info = NULL; @@ -129,13 +130,15 @@ static struct page_info *__search_page_info(struct page_mng *pager, } if (page_info) { - if (va < page_info->va || va + len > page_info->va + page_info->len) + if (va < page_info->va || + va + len > page_info->va + page_info->len) return NULL; } return page_info; } -struct page_info *search_page_info(struct page_mng *pager, unsigned long va, unsigned long len) +struct page_info *search_page_info(struct page_mng *pager, unsigned long va, + unsigned long len) { struct page_info *page_info; @@ -186,18 +189,15 @@ static void erase_page_info(struct page_mng *pager, struct page_info *page_info) } static struct page_info *alloc_page_info(unsigned long va, unsigned long len, - unsigned int page_size) + unsigned int page_size) { - struct page_info *page_info; size_t size; size = sizeof(struct page_info); page_info = kzalloc(size, GFP_KERNEL); - if (!page_info) { - pr_err("alloc page_info failed: (size=%lx)\n", (unsigned long)size); + if 
(!page_info) return NULL; - } page_info->va = va; page_info->len = len; @@ -206,8 +206,8 @@ static struct page_info *alloc_page_info(unsigned long va, unsigned long len, return page_info; } -struct page_info *get_page_info(struct page_mng *pager, - unsigned long va, unsigned long len, unsigned int page_size) +struct page_info *get_page_info(struct page_mng *pager, unsigned long va, + unsigned long len, unsigned int page_size) { struct page_info *page_info = search_page_info(pager, va, len); @@ -269,12 +269,15 @@ static void free_svm_proc(struct svm_proc *proc) list_for_each_entry_safe(item, next, &proc->tasks_list, node) list_del(&item->node); } - pr_debug("svm proc clean up done pid %d, peer_pid %d\n", proc->pid, proc->peer_pid); + pr_debug("svm proc clean up done pid %d, peer_pid %d\n", proc->pid, + proc->peer_pid); } -static void svm_proc_mm_release(struct mmu_notifier *subscription, struct mm_struct *mm) +static void svm_proc_mm_release(struct mmu_notifier *subscription, + struct mm_struct *mm) { - struct svm_proc *proc = container_of(subscription, struct svm_proc, notifier); + struct svm_proc *proc = + container_of(subscription, struct svm_proc, notifier); free_svm_proc(proc); kfree(proc); @@ -291,10 +294,11 @@ static int svm_proc_mmu_notifier_register(struct svm_proc *proc) return mmu_notifier_register(&proc->notifier, proc->mm); } -static void local_pair_proc_mm_release(struct mmu_notifier *subscription, struct mm_struct *mm) +static void local_pair_proc_mm_release(struct mmu_notifier *subscription, + struct mm_struct *mm) { struct local_pair_proc *local_proc = - container_of(subscription, struct local_pair_proc, notifier); + container_of(subscription, struct local_pair_proc, notifier); list_del(&local_proc->node); kfree(local_proc); @@ -305,17 +309,20 @@ static const struct mmu_notifier_ops local_pair_proc_mmu_notifier_ops = { .release = local_pair_proc_mm_release, }; -static int local_pair_proc_mmu_notifier_register(struct local_pair_proc *local_proc) +static int +local_pair_proc_mmu_notifier_register(struct local_pair_proc *local_proc) { local_proc->notifier.ops = &local_pair_proc_mmu_notifier_ops; return mmu_notifier_register(&local_proc->notifier, local_proc->mm); } -struct local_pair_proc *insert_local_proc(struct svm_proc *proc, unsigned int pid) +struct local_pair_proc *insert_local_proc(struct svm_proc *proc, + unsigned int pid) { int ret = 0; - struct local_pair_proc *local_proc = kzalloc(sizeof(struct local_pair_proc), GFP_KERNEL); + struct local_pair_proc *local_proc = + kzalloc(sizeof(struct local_pair_proc), GFP_KERNEL); if (!local_proc) return ERR_PTR(-ENOMEM); @@ -346,7 +353,8 @@ struct local_pair_proc *insert_local_proc(struct svm_proc *proc, unsigned int pi put_task_struct(local_proc->tsk); list_add(&local_proc->node, &proc->tasks_list); - pr_debug("%s bind_to_pid %d local_pid %d\n", __func__, proc->pid, local_proc->pid); + pr_debug("%s bind_to_pid %d local_pid %d\n", __func__, proc->pid, + local_proc->pid); return local_proc; diff --git a/drivers/remote_pager/svm_proc_mng.h b/drivers/remote_pager/svm_proc_mng.h index 7af28610c285..85a014c79b41 100644 --- a/drivers/remote_pager/svm_proc_mng.h +++ b/drivers/remote_pager/svm_proc_mng.h @@ -62,4 +62,4 @@ struct svm_proc *search_svm_proc_by_pid(unsigned int pid); struct local_pair_proc *insert_local_proc(struct svm_proc *proc, unsigned int local_pid); struct svm_proc *search_svm_proc_by_local_mm(struct mm_struct *mm); - +#endif diff --git a/drivers/remote_pager/wait_station.c b/drivers/remote_pager/wait_station.c index 
df5075c744fd..858859fb1605 100644
--- a/drivers/remote_pager/wait_station.c
+++ b/drivers/remote_pager/wait_station.c
@@ -66,7 +66,8 @@ void *wait_at_station(struct wait_station *ws)
 	void *ret;
 	if (!try_wait_for_completion(&ws->pendings)) {
-		if (wait_for_completion_io_timeout(&ws->pendings, MAX_WAIT_IO_TIMEOUT) == 0) {
+		if (wait_for_completion_io_timeout(&ws->pendings,
+						   MAX_WAIT_IO_TIMEOUT) == 0) {
 			pr_err("%s timeout\n", __func__);
 			ret = ERR_PTR(-ETIMEDOUT);
 			goto out;
@@ -79,3 +80,4 @@ void *wait_at_station(struct wait_station *ws)
 	put_wait_station(ws);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(wait_at_station);
diff --git a/drivers/remote_pager/wait_station.h b/drivers/remote_pager/wait_station.h
index 43ff9271288f..7833b40be638 100644
--- a/drivers/remote_pager/wait_station.h
+++ b/drivers/remote_pager/wait_station.h
@@ -28,3 +28,4 @@ struct wait_station *get_wait_station(void);
 struct wait_station *wait_station(int id);
 void put_wait_station(struct wait_station *ws);
 void *wait_at_station(struct wait_station *ws);
+#endif
diff --git a/include/linux/remote_pager/msg_chan.h b/include/linux/remote_pager/msg_chan.h
index b6e46b6f7378..a8049def052d 100644
--- a/include/linux/remote_pager/msg_chan.h
+++ b/include/linux/remote_pager/msg_chan.h
@@ -41,3 +41,4 @@ int msg_layer_uninstall_phy_ops(struct phys_channel_ops *ops);
 #define TO_PEER 0
 #define FROM_PEER 1
+#endif
-- Gitee From 8ba1aa8cd1de8baa6f2bdf7a1126b17240ba6fc3 Mon Sep 17 00:00:00 2001
From: nicunshu
Date: Thu, 10 Jul 2025 21:19:16 +0800
Subject: [PATCH 16/27] mm: gmem: remove deprecated function

euleros inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN

---------------------------------------------

Remove the deprecated helper pmd_none_or_trans_huge_or_clear_bad(); use
pmd_none_or_clear_bad() and pmd_trans_huge() instead. Also remove the
deprecated enum page_entry_size (pass the mapping order instead) and
avoid a KABI breakage.
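
As an illustration only (not taken from the diff below), a caller that
used the removed helper can open-code the two remaining checks roughly
as follows; the function name is made up for this sketch and the usual
pgtable headers are assumed:

	/* illustrative sketch only, not part of this patch */
	static bool pmd_is_mapped_sketch(pmd_t *pmd)
	{
		if (pmd_trans_huge(*pmd))	/* PMD-level (huge) mapping */
			return true;
		if (pmd_none_or_clear_bad(pmd))	/* empty or corrupt entry */
			return false;
		return true;			/* normal page table below */
	}
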
Fixes: 4c627cebab85 ("mm: gmem: Introduce GMEM") Signed-off-by: nicunshu --- .../msg_chan/msg_layer/msg_layer.c | 2 +- drivers/remote_pager/msg_handler.h | 2 +- drivers/remote_pager/msg_handler_origin.c | 39 ++++++++++--------- drivers/remote_pager/msg_handler_peer.c | 28 ++++++------- include/linux/gmem.h | 2 +- include/linux/mm.h | 7 ---- include/linux/mm_types.h | 9 +++-- include/linux/vm_object.h | 4 +- mm/gmem.c | 18 ++++----- mm/huge_memory.c | 2 - mm/memory.c | 31 ++++++++------- mm/mmap.c | 15 +++---- mm/vm_object.c | 16 ++++---- 13 files changed, 87 insertions(+), 88 deletions(-) diff --git a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c index edf98c5f1834..ad1705b9634d 100644 --- a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c +++ b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c @@ -119,7 +119,7 @@ static int recv_handler(void *arg) /* compose body */ data = kmalloc(msg_len, GFP_KERNEL); - if WARN_ON_ONCE(!data && "Unable to alloc a message") + if (WARN_ON_ONCE(!data && "Unable to alloc a message")) return -1; memcpy(data, &header, sizeof(header)); diff --git a/drivers/remote_pager/msg_handler.h b/drivers/remote_pager/msg_handler.h index d7a0cd7231a9..48a9dbb90c39 100644 --- a/drivers/remote_pager/msg_handler.h +++ b/drivers/remote_pager/msg_handler.h @@ -106,7 +106,7 @@ struct gm_evict_page_msg_rq { int gmem_register_pair_remote_task(int origin_nid, int origin_pid, int remote_nid, int remote_pid); #ifdef WITH_GMEM -gm_dev_t *gmem_id_to_device(unsigned int id); +struct gm_dev *gmem_id_to_device(unsigned int id); #endif diff --git a/drivers/remote_pager/msg_handler_origin.c b/drivers/remote_pager/msg_handler_origin.c index 501c627dbb0f..05547d88c3c5 100644 --- a/drivers/remote_pager/msg_handler_origin.c +++ b/drivers/remote_pager/msg_handler_origin.c @@ -22,9 +22,9 @@ #define MAX_NR_NPU 16 #define GMEM_DEBUG 0 -static gm_dev_t *gm_devs[MAX_NR_NPU]; +static struct gm_dev *gm_devs[MAX_NR_NPU]; -gm_dev_t *gmem_id_to_device(unsigned int id) +struct gm_dev *gmem_id_to_device(unsigned int id) { if (id >= MAX_NR_NPU) { pr_err("device id is invalid. 
(dev_id = %u)\n", id); @@ -78,7 +78,7 @@ int gmem_handle_dev_fault(struct rpg_kmsg_message *msg) unsigned int nid = recv->header.to_nid; unsigned int peer_nid = recv->header.from_nid; unsigned int peer_ws = recv->my_ws; - gm_dev_t *dev = gm_devs[peer_nid]; + struct gm_dev *dev = gm_devs[peer_nid]; struct task_struct *tsk; struct mm_struct *mm; @@ -119,7 +119,7 @@ int gmem_handle_dev_fault(struct rpg_kmsg_message *msg) return ret; } -gm_ret_t gmem_map(struct gm_fault_t *gmf) +enum gm_ret gmem_map(struct gm_fault_t *gmf) { int ret = 0; struct wait_station *ws; @@ -164,7 +164,7 @@ gm_ret_t gmem_map(struct gm_fault_t *gmf) return GM_RET_SUCCESS; } -gm_ret_t gmem_unmap(struct gm_fault_t *gmf) +enum gm_ret gmem_unmap(struct gm_fault_t *gmf) { int ret; struct wait_station *ws; @@ -205,7 +205,7 @@ gm_ret_t gmem_unmap(struct gm_fault_t *gmf) return GM_RET_SUCCESS; } -gm_ret_t gmem_alloc(struct mm_struct *mm, unsigned long va, unsigned long size, +enum gm_ret gmem_alloc(struct mm_struct *mm, unsigned long va, unsigned long size, unsigned long prot) { int ret = 0; @@ -241,7 +241,7 @@ gm_ret_t gmem_alloc(struct mm_struct *mm, unsigned long va, unsigned long size, return GM_RET_SUCCESS; } -gm_ret_t gmem_free(struct mm_struct *mm, unsigned long va, unsigned long size) +enum gm_ret gmem_free(struct mm_struct *mm, unsigned long va, unsigned long size) { int ret = 0; struct wait_station *ws; @@ -287,7 +287,7 @@ int gmem_handle_evict_page(struct rpg_kmsg_message *msg) struct vm_area_struct *vma; struct page *page; dma_addr_t dma_addr; - gm_mapping_t *gm_mapping; + struct gm_mapping *gm_mapping; struct device *dma_dev; struct gm_fault_t gmf; struct svm_proc *proc; @@ -398,12 +398,12 @@ int gmem_handle_evict_page(struct rpg_kmsg_message *msg) return ret; } -gm_ret_t gmem_create(gm_dev_t *dev, void **pmap) +enum gm_ret gmem_create(struct gm_dev *dev, void **pmap) { return GM_RET_SUCCESS; } -gm_mmu_t gm_mmu = { +struct gm_mmu gm_mmu = { .peer_va_alloc_fixed = gmem_alloc, .pmap_create = gmem_create, .peer_va_free = gmem_free, @@ -414,11 +414,12 @@ gm_mmu_t gm_mmu = { #define ASCEND910_HBM_START 0x0000000800000000 #define ASCEND910_HBM_END 0x0000000fffffffff -gm_ret_t mmu_dev_create(struct device *dev, int devid) +enum gm_ret mmu_dev_create(struct device *dev, int devid) { - gm_ret_t ret; + enum gm_ret ret; - ret = gm_dev_create(&gm_mmu, NULL, GM_DEV_CAP_REPLAYABLE | GM_DEV_CAP_PEER, &dev->gm_dev); + ret = gm_dev_create(&gm_mmu, NULL, GM_DEV_CAP_REPLAYABLE | GM_DEV_CAP_PEER, + (struct gm_dev **)&dev->gm_dev); if (ret != GM_RET_SUCCESS) { pr_err("NPU gmem device create failed\n"); return ret; @@ -430,8 +431,8 @@ gm_ret_t mmu_dev_create(struct device *dev, int devid) goto free_gm_dev; } - dev->gm_dev->dma_dev = dev; - gm_devs[devid] = dev->gm_dev; + ((struct gm_dev *)dev->gm_dev)->dma_dev = dev; + gm_devs[devid] = (struct gm_dev *)dev->gm_dev; pr_info("Create NPU gmem device and register HBM\n"); return ret; @@ -442,11 +443,11 @@ gm_ret_t mmu_dev_create(struct device *dev, int devid) } EXPORT_SYMBOL(mmu_dev_create); -gm_ret_t mmu_as_attach(struct device *dev) +enum gm_ret mmu_as_attach(struct device *dev) { - gm_ret_t ret; - gm_dev_t *gm_dev = dev->gm_dev; - gm_context_t *gm_ctx; + enum gm_ret ret; + struct gm_dev *gm_dev = dev->gm_dev; + struct gm_context *gm_ctx; if (!gm_dev) { pr_err("NPU device gm_dev is NULL\n"); diff --git a/drivers/remote_pager/msg_handler_peer.c b/drivers/remote_pager/msg_handler_peer.c index 5bb809f5258c..e9222a1b60e4 100644 --- a/drivers/remote_pager/msg_handler_peer.c +++ 
b/drivers/remote_pager/msg_handler_peer.c @@ -30,15 +30,15 @@ #define MAX_RETRY_TIME 10 -static inline vm_fault_t get_page_size(enum page_entry_size pe_size, - unsigned int *page_size, - unsigned long *addr) +static inline vm_fault_t get_page_size(unsigned int order, + unsigned int *page_size, + unsigned long *addr) { - switch (pe_size) { - case PE_SIZE_PTE: + switch (order) { + case 0: *page_size = PAGE_SIZE; break; - case PE_SIZE_PMD: + case PMD_ORDER: *page_size = HPAGE_SIZE; *addr = round_down(*addr, HPAGE_SIZE); break; @@ -49,12 +49,12 @@ static inline vm_fault_t get_page_size(enum page_entry_size pe_size, } static inline bool addr_is_mapped(unsigned long addr, pmd_t *pmd, - enum page_entry_size pe_size) + unsigned int order) { pte_t *pte; bool ret; - if (pe_size == PE_SIZE_PMD) + if (order == PMD_ORDER) return !pmd_none(*pmd); if (pmd_none(*pmd)) return false; @@ -65,7 +65,7 @@ static inline bool addr_is_mapped(unsigned long addr, pmd_t *pmd, } static vm_fault_t __gmem_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { vm_fault_t ret = VM_FAULT_SIGBUS; int msg_ret = GM_RET_FAILURE_UNKNOWN; @@ -78,7 +78,7 @@ static vm_fault_t __gmem_fault(struct vm_fault *vmf, struct mm_struct *mm; struct svm_proc *proc; - ret = get_page_size(pe_size, &page_size, &addr); + ret = get_page_size(order, &page_size, &addr); if (ret) return ret; @@ -96,7 +96,7 @@ static vm_fault_t __gmem_fault(struct vm_fault *vmf, } mutex_lock(&page_info->lock); - if (addr_is_mapped(addr, vmf->pmd, pe_size)) + if (addr_is_mapped(addr, vmf->pmd, order)) goto unlock; req.va = addr; @@ -130,15 +130,15 @@ static vm_fault_t __gmem_fault(struct vm_fault *vmf, static vm_fault_t gmem_fault(struct vm_fault *vmf) { - return __gmem_fault(vmf, PE_SIZE_PTE); + return __gmem_fault(vmf, 0); } static vm_fault_t gmem_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { int ret = 0; - ret = __gmem_fault(vmf, pe_size); + ret = __gmem_fault(vmf, order); return ret; } diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 3216b55d659d..fefe17d6f50d 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -284,7 +284,7 @@ extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long beg unsigned long end); enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, int behavior); -vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, enum page_entry_size pe_size); +vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); /* GMEM address space KPI */ extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, diff --git a/include/linux/mm.h b/include/linux/mm.h index da5d2b0ea066..c024f8cd7bdf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -582,13 +582,6 @@ struct vm_fault { KABI_RESERVE(3) }; -/* page entry size for vm->huge_fault() */ -enum page_entry_size { - PE_SIZE_PTE = 0, - PE_SIZE_PMD, - PE_SIZE_PUD, -}; - /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 28b308a7d5c6..e9cd4439e08d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -773,13 +773,14 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -#ifdef CONFIG_GMEM - struct vm_object *vm_obj; -#endif #ifdef CONFIG_SHARE_POOL struct 
sp_area *spa; #endif +#ifdef CONFIG_GMEM + KABI_USE(1, struct vm_object *vm_obj) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) @@ -1061,7 +1062,7 @@ struct mm_struct { KABI_RESERVE(1) #endif #ifdef CONFIG_GMEM - KABI_USE(2, gm_as_t *gm_as) + KABI_USE(2, struct gm_as *gm_as) #else KABI_RESERVE(2) #endif diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index e5327665b6b7..f17d78a62416 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -14,7 +14,7 @@ void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end); -gm_mapping_t *alloc_gm_mapping(void); +struct gm_mapping *alloc_gm_mapping(void); struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); void vm_object_mapping_create(struct vm_object *obj, unsigned long start); void free_gm_mappings(struct vm_area_struct *vma); @@ -25,7 +25,7 @@ static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) {} -static inline gm_mapping_t *alloc_gm_mapping(void) { return NULL; } +static inline struct gm_mapping *alloc_gm_mapping(void) { return NULL; } static inline struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va) { return NULL; } static inline void vm_object_mapping_create(struct vm_object *obj, diff --git a/mm/gmem.c b/mm/gmem.c index ebf6a93bc33a..adf640790df5 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -61,7 +61,7 @@ static inline unsigned long pe_mask(unsigned int order) return HPAGE_PMD_MASK; if (order == PUD_ORDER) return HPAGE_PUD_MASK; - return ~0; + return 0; } static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; @@ -185,7 +185,7 @@ __setup("gmem=", setup_gmem); * The returned device pointer will be passed by new_dev. * A unique id will be assigned to the GMEM device, using Linux's xarray. */ -gm_ret_t gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, +enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, struct gm_dev **new_dev) { struct gm_dev *dev; @@ -215,7 +215,7 @@ gm_ret_t gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, EXPORT_SYMBOL_GPL(gm_dev_create); // Destroy a GMEM device and reclaim the resources. -gm_ret_t gm_dev_destroy(struct gm_dev *dev) +enum gm_ret gm_dev_destroy(struct gm_dev *dev) { // TODO: implement it xa_erase(&gm_dev_id_pool, dev->id); @@ -224,10 +224,10 @@ gm_ret_t gm_dev_destroy(struct gm_dev *dev) EXPORT_SYMBOL_GPL(gm_dev_destroy); /* Handle the page fault triggered by a given device */ -gm_ret_t gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, +enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, int behavior) { - gm_ret_t ret = GM_RET_SUCCESS; + enum gm_ret ret = GM_RET_SUCCESS; struct gm_mmu *mmu = dev->mmu; struct device *dma_dev = dev->dma_dev; struct vm_area_struct *vma; @@ -376,7 +376,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, * This implies dynamically creating * the struct page data structures. 
*/ -gm_ret_t gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) +enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) { struct gm_mapping *mapping; unsigned long addr = PAGE_ALIGN(begin); @@ -463,7 +463,7 @@ struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order) EXPORT_SYMBOL_GPL(gm_mappings_alloc); /* GMEM Virtual Address Space API */ -gm_ret_t gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, +enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as) { struct gm_as *as; @@ -488,7 +488,7 @@ gm_ret_t gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc p } EXPORT_SYMBOL_GPL(gm_as_create); -gm_ret_t gm_as_destroy(struct gm_as *as) +enum gm_ret gm_as_destroy(struct gm_as *as) { struct gm_context *ctx, *tmp_ctx; @@ -501,7 +501,7 @@ gm_ret_t gm_as_destroy(struct gm_as *as) } EXPORT_SYMBOL_GPL(gm_as_destroy); -gm_ret_t gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, +enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, bool activate, struct gm_context **out_ctx) { struct gm_context *ctx; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 34af481e6176..3f6f3c92514e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1341,7 +1341,6 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); - static struct folio *vma_alloc_peer_shared_folio_pmd(struct vm_area_struct *vma, unsigned long haddr, gm_mapping_t *gm_mapping) { @@ -1580,7 +1579,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) pte_free(vma->vm_mm, pgtable); folio_put(folio); return ret; - } /* diff --git a/mm/memory.c b/mm/memory.c index 9aa4d8174724..70d92ba26cc2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1719,7 +1719,7 @@ static inline void zap_logic_pmd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { - gm_mapping_t *gm_mapping = NULL; + struct gm_mapping *gm_mapping = NULL; struct page *page = NULL; xa_lock(vma->vm_obj->logical_page_table); @@ -1769,8 +1769,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); - else if (zap_huge_pmd(tlb, vma, pmd, addr)) - goto next; + else if (zap_huge_pmd(tlb, vma, pmd, addr)) { + addr = next; + continue; + } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && @@ -1783,7 +1785,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, */ spin_unlock(ptl); } - +#ifdef CONFIG_GMEM /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is @@ -1791,22 +1793,23 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * because MADV_DONTNEED holds the mmap_lock in read * mode. 
*/ - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) { + if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) { if (vma_is_peer_shared(vma)) zap_logic_pmd_range(vma, addr, next); - goto next; } - - next = zap_pte_range(tlb, vma, pmd, addr, next, details); -next: - cond_resched(); - } while (pmd++, addr = next, addr != end); +#endif + if (pmd_none(*pmd)) { + addr = next; + continue; + } + addr = zap_pte_range(tlb, vma, pmd, addr, next, details); + if (addr != next) + pmd--; + } while (pmd++, cond_resched(), addr != end); return addr; } - - static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, @@ -1869,8 +1872,10 @@ void unmap_page_range(struct mmu_gather *tlb, do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { +#ifdef CONFIG_GMEM if (vma_is_peer_shared(vma)) zap_logic_pud_range(vma, addr, next); +#endif continue; } next = zap_p4d_range(tlb, vma, pgd, addr, next, details); diff --git a/mm/mmap.c b/mm/mmap.c index 1ea24d8f2fd0..7e931deefe6a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1994,6 +1994,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +#ifdef CONFIG_GMEM unsigned long get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long align) @@ -2014,6 +2015,7 @@ get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long l return addr; } EXPORT_SYMBOL(get_unmapped_area_aligned); +#endif /** * find_vma_intersection() - Look up the first VMA which intersects the interval @@ -2628,9 +2630,9 @@ static void munmap_in_peer_devices(struct mm_struct *mm, { unsigned long addr = start; struct vm_object *obj = vma->vm_obj; - gm_ret_t ret; - gm_context_t *ctx, *tmp; - gm_mapping_t *gm_mapping; + enum gm_ret ret; + struct gm_context *ctx, *tmp; + struct gm_mapping *gm_mapping; struct gm_fault_t gmf = { .mm = mm, @@ -2929,9 +2931,8 @@ static int alloc_va_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long len, vm_flags_t vm_flags) { - gm_context_t *ctx, *tmp; - gm_prot_t prot = VM_NONE; - gm_ret_t ret; + struct gm_context *ctx, *tmp; + enum gm_ret ret; pr_debug("gmem: start mmap, as %p\n", mm->gm_as); if (!mm->gm_as) @@ -3168,7 +3169,7 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, expanded: #ifdef CONFIG_GMEM if (vma_is_peer_shared(vma)) { - gm_ret_t ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + enum gm_ret ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { retry_times++; diff --git a/mm/vm_object.c b/mm/vm_object.c index 8d3d6b121649..6ac4c172cfdd 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -44,9 +44,9 @@ static struct kmem_cache *vm_object_cachep; static struct kmem_cache *gm_mapping_cachep; /* gm_mapping will not be release dynamically */ -gm_mapping_t *alloc_gm_mapping(void) +struct gm_mapping *alloc_gm_mapping(void) { - gm_mapping_t *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); + struct gm_mapping *gm_mapping = kmem_cache_zalloc(gm_mapping_cachep, GFP_KERNEL); if (!gm_mapping) return NULL; @@ -58,12 +58,12 @@ gm_mapping_t *alloc_gm_mapping(void) } EXPORT_SYMBOL(alloc_gm_mapping); -static inline void release_gm_mapping(gm_mapping_t *mapping) +static inline void release_gm_mapping(struct gm_mapping *mapping) { 
kmem_cache_free(gm_mapping_cachep, mapping); } -static inline gm_mapping_t *lookup_gm_mapping(struct vm_object *obj, unsigned long pindex) +static inline struct gm_mapping *lookup_gm_mapping(struct vm_object *obj, unsigned long pindex) { return xa_load(obj->logical_page_table, pindex); } @@ -146,7 +146,7 @@ void vm_object_drop_locked(struct vm_area_struct *vma) void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) { unsigned long index; - gm_mapping_t *mapping; + struct gm_mapping *mapping; unsigned long moved_pages = 0; XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); @@ -168,7 +168,7 @@ void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned { /* remove logical mapping in [vma->vm_start, start) and [end, vm->vm_end) */ unsigned long removed_pages = 0; - gm_mapping_t *mapping; + struct gm_mapping *mapping; XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); @@ -205,7 +205,7 @@ EXPORT_SYMBOL_GPL(vm_object_lookup); void vm_object_mapping_create(struct vm_object *obj, unsigned long start) { pgoff_t index = linear_page_index(obj->vma, start); - gm_mapping_t *gm_mapping; + struct gm_mapping *gm_mapping; gm_mapping = alloc_gm_mapping(); if (!gm_mapping) @@ -216,7 +216,7 @@ void vm_object_mapping_create(struct vm_object *obj, unsigned long start) void free_gm_mappings(struct vm_area_struct *vma) { - gm_mapping_t *gm_mapping; + struct gm_mapping *gm_mapping; XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); xa_lock(vma->vm_obj->logical_page_table); -- Gitee From a5da9f6f7ef60b938cfc030dd9dae03deb7f60dd Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 17 Jul 2025 19:41:25 +0800 Subject: [PATCH 17/27] mm: gmem: fix code style problems euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Fix multiple code style problems. Fixes: 4c627cebab85 ("mm: gmem: Introduce GMEM") Signed-off-by: nicunshu --- include/linux/gmem.h | 9 +++++---- include/linux/vm_object.h | 6 +++++- init/main.c | 8 -------- kernel/fork.c | 5 +---- mm/Kconfig | 2 +- mm/gmem.c | 15 +++++++++------ mm/huge_memory.c | 6 ++++-- mm/memory.c | 4 ++-- mm/mempolicy.c | 5 +---- mm/mm_init.c | 6 ++++++ mm/mmap.c | 17 ++++++----------- mm/vm_object.c | 18 +++++++++++++++++- 12 files changed, 57 insertions(+), 44 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index fefe17d6f50d..b0cdb6d0ab9a 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -302,13 +302,14 @@ extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); -enum gmem_stat_item { - NR_PAGE_MIGRATING, +enum gmem_stats_item { + NR_PAGE_MIGRATING_H2D, + NR_PAGE_MIGRATING_D2H, NR_GMEM_STAT_ITEMS }; -extern void gmem_state_counter(enum gmem_stat_item item, int val); -extern void gmem_state_counter_show(void); +extern void gmem_stats_counter(enum gmem_stats_item item, int val); +extern void gmem_stats_counter_show(void); /* h-NUMA topology */ struct hnode { diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index f17d78a62416..ca82642eb2df 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -10,9 +10,10 @@ int __init vm_object_init(void); struct vm_object *vm_object_create(struct vm_area_struct *vma); void vm_object_drop_locked(struct vm_area_struct *vma); -void 
dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src); +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared); void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end); +void dup_peer_shared_vma(struct vm_area_struct *vma); struct gm_mapping *alloc_gm_mapping(void); struct gm_mapping *vm_object_lookup(struct vm_object *obj, unsigned long va); @@ -22,6 +23,9 @@ void free_gm_mappings(struct vm_area_struct *vma); static inline void __init vm_object_init(void) {} static inline struct vm_object *vm_object_create(struct vm_area_struct *vma) { return NULL; } static inline void vm_object_drop_locked(struct vm_area_struct *vma) {} +static inline void dup_vm_object(struct vm_area_struct *dst, + struct vm_area_struct *src, bool dst_peer_shared) {} +static inline void dup_peer_shared_vma(struct vm_area_struct *vma) {} static inline void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) {} diff --git a/init/main.c b/init/main.c index 51395ee7a27d..f97f06547078 100644 --- a/init/main.c +++ b/init/main.c @@ -102,10 +102,6 @@ #include #include -#ifdef CONFIG_GMEM -#include -#endif - #include #include #include @@ -909,10 +905,6 @@ void start_kernel(void) smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ boot_cpu_hotplug_init(); -#ifdef CONFIG_GMEM - hnuma_init(); -#endif - pr_notice("Kernel command line: %s\n", saved_command_line); /* parameters may set static keys */ jump_label_init(); diff --git a/kernel/fork.c b/kernel/fork.c index d984d93b3d39..cf44a02680d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,10 +532,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) dup_anon_vma_name(orig, new); #ifdef CONFIG_GMEM - if (vma_is_peer_shared(orig)) { - pr_debug("gmem: peer-shared vma should not be dup\n"); - new->vm_obj = vm_object_create(new); - } + dup_peer_shared_vma(new); #endif return new; diff --git a/mm/Kconfig b/mm/Kconfig index 88be25f465b3..829a0d6a0fb5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1486,7 +1486,7 @@ config GMEM select ARCH_USES_HIGH_VMA_FLAGS default y help - This provides a high-level interface that decouples MMUspecific functions. + This provides a high-level interface that decouples MMU-specific functions. Device drivers can thus attach themselves to a process’s address space and let the OS take charge of their memory management. 
This eliminates the need for device drivers to reinvent the wheel and allows them to diff --git a/mm/gmem.c b/mm/gmem.c index adf640790df5..c484c2c40101 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -66,7 +66,7 @@ static inline unsigned long pe_mask(unsigned int order) static struct percpu_counter g_gmem_stats[NR_GMEM_STAT_ITEMS]; -void gmem_state_counter(enum gmem_stat_item item, int val) +void gmem_stats_counter(enum gmem_stats_item item, int val) { if (!gmem_is_enabled()) return; @@ -95,14 +95,17 @@ static int gmem_stat_init(void) } #ifdef CONFIG_PROC_FS -static int gmemstat_show(struct seq_file *m, void *arg) +static int gmem_stats_show(struct seq_file *m, void *arg) { if (!gmem_is_enabled()) return 0; seq_printf( - m, "migrating : %lld\n", - percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING])); + m, "migrating H2D : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING_H2D])); + seq_printf( + m, "migrating D2H : %lld\n", + percpu_counter_read_positive(&g_gmem_stats[NR_PAGE_MIGRATING_D2H])); return 0; } @@ -154,7 +157,7 @@ static int __init gmem_init(void) } #ifdef CONFIG_PROC_FS - proc_create_single("gmemstat", 0444, NULL, gmemstat_show); + proc_create_single("gmemstat", 0444, NULL, gmem_stats_show); #endif static_branch_enable(&gmem_status); @@ -301,7 +304,7 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev * update page to willneed and this will stop page evicting */ gm_mapping_flags_set(gm_mapping, GM_PAGE_WILLNEED); - gmem_state_counter(NR_PAGE_MIGRATING, 1); + gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); ret = GM_RET_SUCCESS; } else { pr_err("gmem: peer map failed\n"); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3f6f3c92514e..437182b72cd6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1634,12 +1634,14 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret; + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; ret = vmf_anon_prepare(vmf); if (ret) return ret; - khugepaged_enter_vma(vma, vma->vm_flags); if (vma_is_peer_shared(vma)) @@ -1684,10 +1686,10 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - return __do_huge_pmd_anonymous_page(vmf); } + static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, pgtable_t pgtable) diff --git a/mm/memory.c b/mm/memory.c index 70d92ba26cc2..ef556a62670e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1793,8 +1793,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * because MADV_DONTNEED holds the mmap_lock in read * mode. 
*/ - if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) { - if (vma_is_peer_shared(vma)) + if (vma_is_peer_shared(vma)) { + if (pmd_none_or_clear_bad(pmd) || pmd_trans_huge(*pmd)) zap_logic_pmd_range(vma, addr, next); } #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fb11f24b6685..9290304e3741 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1902,11 +1902,8 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { -#ifdef CONFIG_GMEM + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) -#else - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) -#endif return false; /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 6677aaa5972d..1a3d3b6e52c9 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -30,6 +30,9 @@ #include "internal.h" #include "slab.h" #include "shuffle.h" +#ifdef CONFIG_GMEM +#include +#endif #include @@ -2797,6 +2800,9 @@ static void __init mem_init_print_info(void) */ void __init mm_core_init(void) { +#ifdef CONFIG_GMEM + hnuma_init(); +#endif /* Initializations relying on SMP setup */ build_all_zonelists(NULL); page_alloc_init_cpuhp(); diff --git a/mm/mmap.c b/mm/mmap.c index 7e931deefe6a..cb36c96a9619 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -648,12 +648,10 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. */ if (src->anon_vma && !dst->anon_vma) { + int ret; #ifdef CONFIG_GMEM - if (vma_is_peer_shared(dst)) - dup_vm_object(dst, src); + dup_vm_object(dst, src, true); #endif - int ret; - vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src); @@ -1364,9 +1362,9 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon */ #ifdef CONFIG_GMEM if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { - len = round_up(len, SZ_2M); + len = round_up(len, PMD_SIZE); addr = get_unmapped_area_aligned(file, addr, len, pgoff, flags, - SZ_2M); + PMD_SIZE); } else { addr = get_unmapped_area(file, addr, len, pgoff, flags); } @@ -2555,12 +2553,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; - -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) - dup_vm_object(new, vma); +#ifdef CONFIG_GMEM + dup_vm_object(new, vma, false); #endif - if (new->vm_file) get_file(new->vm_file); diff --git a/mm/vm_object.c b/mm/vm_object.c index 6ac4c172cfdd..25af359def56 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -143,12 +143,20 @@ void vm_object_drop_locked(struct vm_area_struct *vma) } } -void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) +void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared) { unsigned long index; struct gm_mapping *mapping; unsigned long moved_pages = 0; + if (dst_peer_shared) { + if (!vma_is_peer_shared(dst)) + return; + } else { + if (!vma_is_peer_shared(src)) + return; + } + XA_STATE(xas, src->vm_obj->logical_page_table, linear_page_index(src, src->vm_start)); xa_lock(dst->vm_obj->logical_page_table); @@ -164,6 +172,14 @@ void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src) xa_unlock(dst->vm_obj->logical_page_table); } +void dup_peer_shared_vma(struct vm_area_struct *vma) +{ + if (vma_is_peer_shared(vma)) { + pr_debug("gmem: peer-shared vma should not be dup\n"); + vma->vm_obj = vm_object_create(vma); + } +} + void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) { /* remove logical mapping in [vma->vm_start, start) and 
[end, vm->vm_end) */ -- Gitee From 8448d9478216e94cb954dcd325ebb23f7c975a9e Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 17 Jul 2025 19:59:11 +0800 Subject: [PATCH 18/27] drivers: remote_pager: remove remote_pager euleros inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- remote_pager can not be used currently, remove it from kernel Fixes: 3d5767f93a1b ("drivers: remote_pager: introduce remote_pager module for gmem") Signed-off-by: nicunshu --- drivers/Kconfig | 2 - drivers/Makefile | 2 - drivers/remote_pager/Kconfig | 25 - drivers/remote_pager/Makefile | 11 - drivers/remote_pager/main.c | 33 - .../msg_chan/msg_layer/msg_layer.c | 273 ------- .../msg_chan/msg_layer/msg_layer.h | 49 -- drivers/remote_pager/msg_handler.h | 127 ---- drivers/remote_pager/msg_handler_comm.c | 120 ---- drivers/remote_pager/msg_handler_origin.c | 475 ------------- drivers/remote_pager/msg_handler_peer.c | 671 ------------------ drivers/remote_pager/svm_proc_mng.c | 427 ----------- drivers/remote_pager/svm_proc_mng.h | 65 -- drivers/remote_pager/wait_station.c | 83 --- drivers/remote_pager/wait_station.h | 31 - 15 files changed, 2394 deletions(-) delete mode 100644 drivers/remote_pager/Kconfig delete mode 100644 drivers/remote_pager/Makefile delete mode 100644 drivers/remote_pager/main.c delete mode 100644 drivers/remote_pager/msg_chan/msg_layer/msg_layer.c delete mode 100644 drivers/remote_pager/msg_chan/msg_layer/msg_layer.h delete mode 100644 drivers/remote_pager/msg_handler.h delete mode 100644 drivers/remote_pager/msg_handler_comm.c delete mode 100644 drivers/remote_pager/msg_handler_origin.c delete mode 100644 drivers/remote_pager/msg_handler_peer.c delete mode 100644 drivers/remote_pager/svm_proc_mng.c delete mode 100644 drivers/remote_pager/svm_proc_mng.h delete mode 100644 drivers/remote_pager/wait_station.c delete mode 100644 drivers/remote_pager/wait_station.h diff --git a/drivers/Kconfig b/drivers/Kconfig index 64acbbd060ee..da6544d0c108 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -247,8 +247,6 @@ source "drivers/hte/Kconfig" source "drivers/cdx/Kconfig" -source "drivers/remote_pager/Kconfig" - source "drivers/cpuinspect/Kconfig" source "drivers/roh/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index b66caa9a69a4..9af19fcf784c 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -201,8 +201,6 @@ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ -obj-$(CONFIG_REMOTE_PAGER) += remote_pager/ - obj-$(CONFIG_S390) += s390/ obj-$(CONFIG_ROH) += roh/ diff --git a/drivers/remote_pager/Kconfig b/drivers/remote_pager/Kconfig deleted file mode 100644 index 414a676f02b0..000000000000 --- a/drivers/remote_pager/Kconfig +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -menu "remote pager device" - -config REMOTE_PAGER - tristate "remote pager" - default m - depends on GMEM - help - Module used for gmem. - This is comm part, including send and recv message function - Used for memory management - If unsure, say Y. - -config REMOTE_PAGER_MASTER - tristate "remote pager master" - default m - depends on REMOTE_PAGER - help - Module used for gmem. - This is host part, used for send and recv message from device - Used for memory management - If unsure, say Y. 
- -endmenu diff --git a/drivers/remote_pager/Makefile b/drivers/remote_pager/Makefile deleted file mode 100644 index 8b1d735ae546..000000000000 --- a/drivers/remote_pager/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -obj-$(CONFIG_REMOTE_PAGER) += remote_pager.o - -remote_pager-$(CONFIG_REMOTE_PAGER) := main.o \ - wait_station.o \ - msg_handler_comm.o \ - msg_chan/msg_layer/msg_layer.o \ - svm_proc_mng.o - -remote_pager-$(CONFIG_REMOTE_PAGER_MASTER) += msg_handler_origin.o \ No newline at end of file diff --git a/drivers/remote_pager/main.c b/drivers/remote_pager/main.c deleted file mode 100644 index afa17ee2ce07..000000000000 --- a/drivers/remote_pager/main.c +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Generalized Memory Management. - * - * Copyright (c) 2023- Huawei, Inc. - * Author: Chunsheng Luo - * Co-Author: Jun Chen - */ -#include -#include "msg_chan/msg_layer/msg_layer.h" -#include "msg_handler.h" - -static int __init remote_pager_init(void) -{ - msg_handle_init(); - return 0; -} - -static void __exit remote_pager_exit(void) -{ - /* - * If module_init() is implemented, module_exit() - * should be implemented as well. - */ -} - -module_init(remote_pager_init); -module_exit(remote_pager_exit); - -MODULE_AUTHOR("Huawei Tech. Co., Ltd."); -MODULE_DESCRIPTION("Remote-pager"); -MODULE_ALIAS("Remote-pager"); -MODULE_LICENSE("GPL"); diff --git a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c deleted file mode 100644 index ad1705b9634d..000000000000 --- a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.c +++ /dev/null @@ -1,273 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Generalized Memory Management. - * - * Copyright (c) 2023- Huawei, Inc. 
- * Author: Chunsheng Luo - * Co-Author: Jiangtian Feng, Jun Chen - */ -#include -#include -#include -#include - -#include "msg_layer.h" - -#define MAX_NUM_NODES 16 -#define MSG_SLEEP_MIN 2 -#define MSG_SLEEP_MAX 3 - -/* Per-node handle */ -struct sock_handle { - int nid; - int status; - int chan_id; - struct task_struct *recv_handler; -}; - -static struct sock_handle sock_handles[MAX_NUM_NODES]; -static struct phys_channel_ops *g_phys_chan_ops; - -int msg_send(int chan_id, void *msg_data, size_t msg_len) -{ - int ret = 0; - - if (!g_phys_chan_ops) - return -ENOENT; - - ret = g_phys_chan_ops->copy_to(chan_id, msg_data, msg_len, 1); - ret |= g_phys_chan_ops->notify(chan_id); - if (ret < 0) - pr_err("%s failed in chan %d\n", __func__, chan_id); - - return ret; -} - -static inline int build_msg(int type, int from_nid, int to_nid, void *msg_data, size_t msg_len) -{ - struct rpg_kmsg_message *msg = (struct rpg_kmsg_message *)msg_data; - - msg->header.type = type; - msg->header.prio = RPG_KMSG_PRIO_NORMAL; - msg->header.size = msg_len; - msg->header.from_nid = from_nid; - msg->header.to_nid = to_nid; - - return 0; -} - -int msg_send_nid(int type, int from_nid, int to_nid, void *msg_data, size_t msg_len) -{ - struct sock_handle *sh = sock_handles + to_nid; - - build_msg(type, from_nid, to_nid, msg_data, msg_len); - - return msg_send(sh->chan_id, msg_data, msg_len); -} -EXPORT_SYMBOL(msg_send_nid); - -int msg_recv(int chan_id, void *buf, size_t len) -{ - if (!g_phys_chan_ops) - return -ENOENT; - - return g_phys_chan_ops->copy_from(chan_id, buf, len, 1); -} - -extern int handle_remote_pager_work(void *msg); -static int recv_handler(void *arg) -{ - struct sock_handle *sh = arg; - - log_info("RECV handler for %d is ready ha %ld\n", sh->nid, sizeof(struct rpg_kmsg_hdr)); - - while (!kthread_should_stop()) { - size_t len; - int ret; - size_t offset; - struct rpg_kmsg_hdr header; - char *data = NULL; - size_t msg_len = 0; - - /* compose header */ - offset = 0; - len = sizeof(header); - while (len > 0) { - ret = msg_recv(sh->chan_id, (char *)(&header) + offset, len); - if (ret == -ENOENT) { - pr_err("no msg chan failed\n"); - usleep_range(MSG_SLEEP_MIN, MSG_SLEEP_MAX); - break; - } - - if ((ret == -1) || kthread_should_stop()) - return 0; - - offset += ret; - len -= ret; - } - - if (ret < 0) - break; - - msg_len = header.size; - if (!msg_len) { - pr_err("msg_len is zero failed? 
from_nid %d prio:%d type:%d size:%ld\n", - header.from_nid, header.prio, header.type, header.size); - continue; - } - - /* compose body */ - data = kmalloc(msg_len, GFP_KERNEL); - if (WARN_ON_ONCE(!data && "Unable to alloc a message")) - return -1; - memcpy(data, &header, sizeof(header)); - - offset = sizeof(header); - len = msg_len - offset; - - while (len > 0) { - ret = msg_recv(sh->chan_id, data + offset, len); - if (ret == -1 || kthread_should_stop()) - return 0; - - offset += ret; - len -= ret; - } - - if (ret < 0) - break; - - /* Call pcn_kmsg upper layer */ - handle_remote_pager_work(data); - } - - return 0; -} - -int msg_open(int nid) -{ - int chan_id = 0; - struct sock_handle *sh = sock_handles + nid; - struct task_struct *tsk_recv; - - if (sh->status == MSG_CHAN_ENABLE) { - pr_err("node:%d msg chan is enabled\n", nid); - return 0; - } - - if (!g_phys_chan_ops) - return -ENOENT; - - chan_id = g_phys_chan_ops->open(nid); - if (chan_id < 0) { - log_err("open msg channel failed %d\n", chan_id); - return chan_id; - } - - tsk_recv = kthread_run(recv_handler, sock_handles + nid, "remote-pager-recv"); - if (IS_ERR(tsk_recv)) { - log_err("Cannot create %s handler, %ld\n", "remote-pager-recv", PTR_ERR(tsk_recv)); - return PTR_ERR(tsk_recv); - } - - sh->chan_id = chan_id; - sh->status = MSG_CHAN_ENABLE; - sh->nid = nid; - sh->recv_handler = tsk_recv; - - pr_err("%s chanid %d\n", __func__, chan_id); - - return chan_id; -} -EXPORT_SYMBOL(msg_open); - -int msg_close(int nid) -{ - struct sock_handle *sh = sock_handles + nid; - - /* TODO: Get sock_handle, then set sock_handle disable and destroy recv task */ - if (sh->status != MSG_CHAN_ENABLE) { - pr_err("node:%d msg chan is disabled\n", nid); - return 0; - } - - if (sh->recv_handler) { - kthread_stop(sh->recv_handler); - sh->recv_handler = NULL; - } - - if (g_phys_chan_ops) - g_phys_chan_ops->close(sh->chan_id); - - sh->chan_id = 0; - sh->status = MSG_CHAN_DISABLE; - - return 0; -} -EXPORT_SYMBOL(msg_close); - -int handle_migrate_page(void *peer_addr, struct page *local_page, size_t size, int dir) -{ - if (!g_phys_chan_ops) - return -ENOENT; - - return g_phys_chan_ops->migrate_page(peer_addr, local_page, size, dir); -} -EXPORT_SYMBOL(handle_migrate_page); - -static DEFINE_SPINLOCK(install_lock); -static int default_msg_chan_id; -int msg_layer_install_phy_ops(struct phys_channel_ops *ops, int default_chan_id) -{ - int ret = 0; - - if (!ops) { - pr_err("install NULL as msg channel\n"); - return -EINVAL; - } - - spin_lock(&install_lock); - if (g_phys_chan_ops) { - ret = -EEXIST; - pr_err("phy_ops areadly be installed\n"); - goto unlock; - } - - /* must before msg_open */ - g_phys_chan_ops = ops; - if (default_chan_id >= 0) { - ret = msg_open(default_chan_id); - if (ret < 0) { - pr_err("can not open msg channel %d\n", default_chan_id); - g_phys_chan_ops = NULL; - goto unlock; - } - } - - default_msg_chan_id = default_chan_id; - -unlock: - spin_unlock(&install_lock); - return ret; -} -EXPORT_SYMBOL(msg_layer_install_phy_ops); - -int msg_layer_uninstall_phy_ops(struct phys_channel_ops *ops) -{ - if (!ops || ops != g_phys_chan_ops) { - pr_err("Invalid phy_ops\n"); - return -EINVAL; - } - - spin_lock(&install_lock); - if (default_msg_chan_id >= 0) - msg_close(default_msg_chan_id); - - g_phys_chan_ops = NULL; - default_msg_chan_id = -1; - spin_unlock(&install_lock); - - return 0; -} -EXPORT_SYMBOL(msg_layer_uninstall_phy_ops); diff --git a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h b/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h deleted 
file mode 100644 index 221842e8c434..000000000000 --- a/drivers/remote_pager/msg_chan/msg_layer/msg_layer.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Generalized Memory Management. - * - * Copyright (c) 2023- Huawei, Inc. - * Author: Chunsheng Luo - * Co-Author: Jiangtian Feng - */ -#ifndef __MSG_LAYER_H__ -#define __MSG_LAYER_H__ - -#include -#include - -#define RPG_KMSG_MAX_SIZE (64UL << 10) -#define RPG_KMSG_MAX_PAYLOAD_SIZE \ - (RPG_KMSG_MAX_SIZE - sizeof(struct rpg_kmsg_hdr)) - -/* Enumerate message priority. XXX Priority is not supported yet. */ -enum rpg_kmsg_prio { - RPG_KMSG_PRIO_LOW, - RPG_KMSG_PRIO_NORMAL, - RPG_KMSG_PRIO_HIGH, -}; - -#define MSG_CHAN_DISABLE 0 -#define MSG_CHAN_ENABLE 1 - -struct rpg_kmsg_hdr { - int from_nid :6; - int to_nid :6; - enum rpg_kmsg_prio prio :2; - int type :8; - size_t size; -} __packed; - -struct rpg_kmsg_message { - struct rpg_kmsg_hdr header; - unsigned char data[RPG_KMSG_MAX_PAYLOAD_SIZE]; -} __packed; - -int msg_send_nid(int type, int from_nid, int to_nid, void *msg_data, size_t msg_len); -int msg_send(int chan_id, void *msg_data, size_t msg_len); -int msg_recv(int chan_id, void *buf, size_t len); -int msg_open(int nid); -int msg_close(int nid); -int handle_migrate_page(void *peer_addr, struct page *local_page, size_t size, int dir); - -#endif diff --git a/drivers/remote_pager/msg_handler.h b/drivers/remote_pager/msg_handler.h deleted file mode 100644 index 48a9dbb90c39..000000000000 --- a/drivers/remote_pager/msg_handler.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Generalized Memory Management. - * - * Copyright (c) 2023- Huawei, Inc. - * Author: Liming Huang - * Co-Author: Jun Chen - * - */ -#ifndef _REMOTE_PAGER_MSG_HANDLER_H_ -#define _REMOTE_PAGER_MSG_HANDLER_H_ - -#include -#include - -#ifdef WITH_GMEM -#include -#endif - -#include "wait_station.h" -#include "msg_chan/msg_layer/msg_layer.h" - -#define PXD_JUDGE(pxd) (((pxd) == NULL) || (pxd##_none(*(pxd##_t *)(pxd)) != 0) || \ - (pxd##_bad(*(pxd##_t *)(pxd)) != 0)) -#define PMD_JUDGE(pmd) (((pmd) == NULL) || (pmd_none(*(pmd_t *)(pmd)) != 0) || \ - (pmd_bad(*(pmd_t *)(pmd)) != 0)) - -#define GMEM_COPY_PAGE 1 - -/* Function pointer to callback function */ -typedef int (*rpg_kmsg_cbftn)(struct rpg_kmsg_message *); - -enum rpg_kmsg_type { - /* TASK CMD */ - GMEM_TASK_PAIRING_REQUEST, - GMEM_TASK_EXIT_ORIGIN, - GMEM_TASK_EXIT_REMOTE, - - /* VMA CMD */ - GMEM_ALLOC_VMA_REQUEST, - GMEM_FREE_VMA_REQUEST, - - /* PAGE CMD */ - GMEM_ALLOC_PAGE_REQUEST, - GMEM_FREE_PAGE_REQUEST, - GMEM_PAGE_FAULT_REQUEST, - GMEM_EVICT_PAGE_REQUEST, - - /* ADVISE CMD */ - GMEM_HMADVISE_REQUEST, - GMEM_HMEMCPY_REQUEST, - - GMEM_COMMON_RESPONSE, - GMEM_MSG_MAX_ID, -}; - -enum msg_location { - MSG_ON_ORIGIN, - MSG_ON_REMOTE, -}; - -struct rpg_kmsg_work { - struct work_struct work; - void *msg; -}; - -struct msg_handler_st { - rpg_kmsg_cbftn fnt; -}; - -struct comm_msg_rsp { - struct rpg_kmsg_hdr header; - int peer_ws; - int ret; -}; - -struct gm_pair_msg_rq { - struct rpg_kmsg_hdr header; - unsigned int my_ws; - unsigned int my_pid; - unsigned int peer_nid; - unsigned int peer_pid; -}; - -struct gm_pager_msg_rq { - struct rpg_kmsg_hdr header; - unsigned int my_ws; - unsigned int peer_pid; - unsigned long va; - unsigned long dma_addr; - unsigned long size; - unsigned long prot; - unsigned long flags; - int behavior; -}; - -struct gm_evict_page_msg_rq { - struct rpg_kmsg_hdr header; - unsigned int peer_pid; - unsigned int ws; - unsigned long 
va; - unsigned long size; -}; - - -int gmem_register_pair_remote_task(int origin_nid, int origin_pid, int remote_nid, int remote_pid); - -#ifdef WITH_GMEM -struct gm_dev *gmem_id_to_device(unsigned int id); -#endif - - -/* msg handler */ -int gmem_handle_evict_page(struct rpg_kmsg_message *msg); -int gmem_handle_comm_msg_rsp(struct rpg_kmsg_message *msg); -int gmem_handle_dev_fault(struct rpg_kmsg_message *msg); - -int gmem_add_to_svm_proc(int my_nid, int my_pid, int peer_nid, int peer_pid); -int gmem_send_comm_msg_reply(unsigned int from_nid, unsigned int to_nid, - unsigned int peer_ws, int ret); - -int handle_remote_pager_work(void *msg); -int msg_handle_init(void); - -#endif - - diff --git a/drivers/remote_pager/msg_handler_comm.c b/drivers/remote_pager/msg_handler_comm.c deleted file mode 100644 index 82308249afda..000000000000 --- a/drivers/remote_pager/msg_handler_comm.c +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Generalized Memory Management. - * - * Copyright (C) 2023- Huawei, Inc. - * Author: Chushu Ni - * Co-Author: Chunsheng Luo - */ -#include -#include -#include -#include - -#include "msg_handler.h" -#include "svm_proc_mng.h" - -static struct workqueue_struct *remote_pager_wq; - -struct msg_handler_st rpg_kmsg_cbftns[GMEM_MSG_MAX_ID] = { -#if IS_ENABLED(CONFIG_REMOTE_PAGER_MASTER) - /* REMOTE TO HOST */ - [GMEM_PAGE_FAULT_REQUEST] = { - gmem_handle_dev_fault - }, - [GMEM_EVICT_PAGE_REQUEST] = { - gmem_handle_evict_page - }, -#endif - - /* BOTH */ - [GMEM_COMMON_RESPONSE] = { - gmem_handle_comm_msg_rsp - }, -}; - -int gmem_handle_comm_msg_rsp(struct rpg_kmsg_message *msg) -{ - struct comm_msg_rsp *rsp = (struct comm_msg_rsp *)msg; - struct wait_station *my_ws = wait_station(rsp->peer_ws); - - my_ws->private = rsp; - /* must first set my_ws */ - smp_rmb(); - complete(&my_ws->pendings); - - return 0; -} - -int gmem_send_comm_msg_reply(unsigned int from_nid, unsigned int to_nid, - unsigned int peer_ws, int reply) -{ - struct comm_msg_rsp rsp; - int ret = reply; - - rsp.ret = reply; - rsp.peer_ws = peer_ws; - ret = msg_send_nid(GMEM_COMMON_RESPONSE, from_nid, - to_nid, &rsp, sizeof(struct comm_msg_rsp)); - - return ret; -} - -int gmem_add_to_svm_proc(int my_nid, int my_pid, int peer_nid, int peer_pid) -{ - struct svm_proc *peer_proc; - - peer_proc = alloc_svm_proc(my_nid, my_pid, peer_nid, peer_pid); - if (!peer_proc) - return -1; - - return 0; -} - -void process_remote_pager_work(struct work_struct *work) -{ - struct rpg_kmsg_work *w = container_of(work, struct rpg_kmsg_work, work); - struct rpg_kmsg_message *msg = w->msg; - rpg_kmsg_cbftn ftn; - - ftn = rpg_kmsg_cbftns[msg->header.type].fnt; - if (ftn != NULL) - ftn(msg); - else - pr_err("No callback registered for %d\n", msg->header.type); - kfree(w); -} - -int handle_remote_pager_work(void *msg) -{ - struct rpg_kmsg_work *w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (IS_ERR_OR_NULL(w)) { - pr_err("can not alloc memory for rpg_kmsg_work\n"); - return PTR_ERR(w); - } - w->msg = msg; - - INIT_WORK(&w->work, process_remote_pager_work); - /* should firstly initialize w */ - smp_wmb(); - queue_work(remote_pager_wq, &w->work); - - return 0; -} - -int msg_handle_init(void) -{ - unsigned int flags = __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE; - - remote_pager_wq = alloc_workqueue("remote_wq", flags, 0); - if (!remote_pager_wq) { - pr_err("%s alloc workqueue failed %lx\n", __func__, (unsigned long)remote_pager_wq); - return -1; - } - - pr_err("%s alloc workqueue%lx\n", __func__, (unsigned 
long)remote_pager_wq); -#ifndef WITH_GMEM - msg_open(0); -#endif - return 0; -} diff --git a/drivers/remote_pager/msg_handler_origin.c b/drivers/remote_pager/msg_handler_origin.c deleted file mode 100644 index 05547d88c3c5..000000000000 --- a/drivers/remote_pager/msg_handler_origin.c +++ /dev/null @@ -1,475 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Generalized Memory Management. - * - * Copyright (c) 2023- Huawei, Inc. - * Author: Bin Wang - * Co-Author: Chunsheng Luo, Cunshu Ni - * - */ -#include -#include -#include -#include -#include -#include - -#include "msg_handler.h" -#include "wait_station.h" -#include "svm_proc_mng.h" - -#define NPU_PAGE_SIZE PAGE_SIZE -#define MAX_NR_NPU 16 -#define GMEM_DEBUG 0 - -static struct gm_dev *gm_devs[MAX_NR_NPU]; - -struct gm_dev *gmem_id_to_device(unsigned int id) -{ - if (id >= MAX_NR_NPU) { - pr_err("device id is invalid. (dev_id = %u)\n", id); - return NULL; - } - - return gm_devs[id]; -} - -int gmem_register_pair_remote_task(int origin_nid, int origin_pid, int remote_nid, int remote_pid) -{ - struct gm_pair_msg_rq req; - struct comm_msg_rsp *rsp; - int ret = 0; - struct wait_station *ws; - - /* open msg chan */ - pr_debug("%s origin_nid %d, origin_pid %d, remote_nid %d, remote_pid %d\n", __func__, - origin_nid, origin_pid, remote_nid, remote_pid); - ret = msg_open(remote_nid); - if (ret < 0) { - pr_err("%s open msg chan failed\n", __func__); - return ret; - } - - /* start pairing */ - ws = get_wait_station(); - req.my_pid = origin_pid; - req.my_ws = ws->id; - req.peer_nid = remote_nid; - req.peer_pid = remote_pid; - - ret = msg_send_nid(GMEM_TASK_PAIRING_REQUEST, origin_nid, - remote_nid, &req, sizeof(struct gm_pair_msg_rq)); - rsp = wait_at_station(ws); - if ((long)rsp != -ETIMEDOUT) { - ret = rsp->ret; - kfree(rsp); - gmem_add_to_svm_proc(origin_nid, origin_pid, remote_nid, remote_pid); - } - - return ret; -} -EXPORT_SYMBOL(gmem_register_pair_remote_task); - -int gmem_handle_dev_fault(struct rpg_kmsg_message *msg) -{ - int ret; - struct gm_pager_msg_rq *recv = (struct gm_pager_msg_rq *)msg; - unsigned int my_pid = recv->peer_pid; - unsigned int nid = recv->header.to_nid; - unsigned int peer_nid = recv->header.from_nid; - unsigned int peer_ws = recv->my_ws; - struct gm_dev *dev = gm_devs[peer_nid]; - struct task_struct *tsk; - struct mm_struct *mm; - - tsk = find_get_task_by_vpid(my_pid); - if (!tsk) { - pr_err("svm process does not have task_struct\n"); - ret = GM_RET_FAILURE_UNKNOWN; - goto out; - } - - mm = get_task_mm(tsk); - if (!mm) { - pr_err("no mm\n"); - ret = GM_RET_FAILURE_UNKNOWN; - goto put_task; - } - - if (!dev) { - pr_info("gmem: device get failed, dev_id %ld\n", (unsigned long)peer_nid); - ret = -ENODEV; - goto put_mm; - } - - ret = gm_dev_fault(mm, recv->va, dev, 0); - if (ret != GM_RET_SUCCESS && ret != GM_RET_PAGE_EXIST) { - pr_info("gmem dev fault failed\n"); - ret = -EFAULT; - goto put_mm; - } - -put_mm: - mmput(mm); -put_task: - put_task_struct(tsk); -out: - gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); - kfree(msg); - return ret; -} - -enum gm_ret gmem_map(struct gm_fault_t *gmf) -{ - int ret = 0; - struct wait_station *ws; - struct comm_msg_rsp *rsp; - struct mm_struct *mm = gmf->mm; - struct svm_proc *proc = search_svm_proc_by_mm(mm); - struct gm_pager_msg_rq req = { - .peer_pid = proc->peer_pid, - .va = gmf->va, - .size = gmf->size, - .behavior = gmf->behavior - }; - - if (!proc) { - pr_err("can not find proc\n"); - return -EBUSY; - } - - ws = get_wait_station(); - req.my_ws = ws->id; - - if 
(gmf->copy) { - req.flags |= GMEM_COPY_PAGE; - req.dma_addr = gmf->dma_addr; - } - - ret = msg_send_nid(GMEM_ALLOC_PAGE_REQUEST, proc->nid, proc->peer_nid, - &req, sizeof(struct gm_pager_msg_rq)); - rsp = wait_at_station(ws); - if ((long)rsp == -ETIMEDOUT) - return -EBUSY; - ret |= rsp->ret; - kfree(rsp); - if (ret) { - if (ret == GM_RET_MIGRATING) - pr_info("gmem: race with migrating\n"); - else - pr_info("send alloc page message failed %d\n", ret); - return ret; - } - - return GM_RET_SUCCESS; -} - -enum gm_ret gmem_unmap(struct gm_fault_t *gmf) -{ - int ret; - struct wait_station *ws; - struct comm_msg_rsp *rsp; - struct mm_struct *mm = gmf->mm; - struct svm_proc *proc = search_svm_proc_by_mm(mm); - struct gm_pager_msg_rq req = { - .peer_pid = proc->peer_pid, - .va = gmf->va, - .size = gmf->size, - }; - - if (!proc) { - pr_err("can not find proc\n"); - return -EBUSY; - } - - if (gmf->copy) { - req.flags |= GMEM_COPY_PAGE; - req.dma_addr = gmf->dma_addr; - } - - ws = get_wait_station(); - req.my_ws = ws->id; - - ret = msg_send_nid(GMEM_FREE_PAGE_REQUEST, proc->nid, proc->peer_nid, - &req, sizeof(struct gm_pager_msg_rq)); - rsp = wait_at_station(ws); - if ((long)rsp == -ETIMEDOUT) - return -EBUSY; - ret |= rsp->ret; - kfree(rsp); - if (ret) { - pr_info("send free page message failed %d\n", ret); - return GM_RET_FAILURE_UNKNOWN; - } - - return GM_RET_SUCCESS; -} - -enum gm_ret gmem_alloc(struct mm_struct *mm, unsigned long va, unsigned long size, - unsigned long prot) -{ - int ret = 0; - struct wait_station *ws; - struct comm_msg_rsp *rsp; - struct svm_proc *proc = search_svm_proc_by_mm(mm); - struct gm_pager_msg_rq req = { - .peer_pid = proc->peer_pid, - .va = va, - .size = size, - .prot = prot, - }; - - if (!proc) { - pr_err("can not find proc\n"); - return -EBUSY; - } - - ws = get_wait_station(); - req.my_ws = ws->id; - ret = msg_send_nid(GMEM_ALLOC_VMA_REQUEST, proc->nid, proc->peer_nid, - &req, sizeof(struct gm_pager_msg_rq)); - rsp = wait_at_station(ws); - if ((long)rsp == -ETIMEDOUT) - return -EBUSY; - ret |= rsp->ret; - kfree(rsp); - if (ret) { - pr_info("send alloc vma message failed %d\n", ret); - return GM_RET_NOMEM; - } - - return GM_RET_SUCCESS; -} - -enum gm_ret gmem_free(struct mm_struct *mm, unsigned long va, unsigned long size) -{ - int ret = 0; - struct wait_station *ws; - struct comm_msg_rsp *rsp; - struct svm_proc *proc = search_svm_proc_by_mm(mm); - struct gm_pager_msg_rq req = { - .peer_pid = proc->peer_pid, - .va = va, - .size = size, - }; - - if (!proc) { - pr_err("can not find proc\n"); - return -EBUSY; - } - - ws = get_wait_station(); - req.my_ws = ws->id; - ret = msg_send_nid(GMEM_FREE_VMA_REQUEST, proc->nid, proc->peer_nid, - &req, sizeof(struct gm_pager_msg_rq)); - rsp = wait_at_station(ws); - if ((long)rsp == -ETIMEDOUT) - return -EBUSY; - ret |= rsp->ret; - kfree(rsp); - if (ret) { - pr_info("send free vma message failed %d\n", ret); - return GM_RET_FAILURE_UNKNOWN; - } - - return GM_RET_SUCCESS; -} - -int gmem_handle_evict_page(struct rpg_kmsg_message *msg) -{ - struct gm_evict_page_msg_rq *recv = (struct gm_evict_page_msg_rq *)msg; - unsigned int nid = recv->header.to_nid; - unsigned int peer_nid = recv->header.from_nid; - unsigned int peer_ws = recv->ws; - unsigned int pid = recv->peer_pid; - unsigned long size = recv->size; - unsigned long addr = recv->va; - struct vm_area_struct *vma; - struct page *page; - dma_addr_t dma_addr; - struct gm_mapping *gm_mapping; - struct device *dma_dev; - struct gm_fault_t gmf; - struct svm_proc *proc; - struct task_struct 
*tsk; - struct mm_struct *mm; - int ret; - struct folio *folio = NULL; - - proc = search_svm_proc_by_pid(pid); - if (!proc) { - pr_err("can not find svm_proc of task-%d\n", pid); - ret = -EINVAL; - goto response; - } - - tsk = find_get_task_by_vpid(pid); - if (!tsk) { - pr_err("can not find task of task-%d\n", pid); - ret = -EINVAL; - goto response; - } - - mm = get_task_mm(tsk); - if (!mm) { - pr_err("task-%d exited\n", pid); - ret = -EINTR; - goto put_task; - } - - if (mm != proc->mm) { - pr_err("miss match\n"); - ret = -EINTR; - goto put_mm; - } - - gmf.mm = mm; - gmf.va = addr; - gmf.size = size; - gmf.copy = GMEM_COPY_PAGE; - - vma = find_vma(mm, addr); - if (!vma || !vma->vm_obj) { - pr_err("evict addr %lx vma %lx vm_obj %lx, no vma or vm_obj\n", addr, - (unsigned long)vma, vma ? (unsigned long)vma->vm_obj : 0); - ret = -EINVAL; - goto put_mm; - } - - gm_mapping = vm_object_lookup(vma->vm_obj, addr); - if (!gm_mapping) { - pr_err("evictim gm_page is NULL\n"); - ret = -EINVAL; - goto put_mm; - } - - mutex_lock(&gm_mapping->lock); - if (gm_mapping_willneed(gm_mapping)) { - pr_info("gmem: racing with prefetch or willneed so cancel evict\n"); - gm_mapping_flags_clear(gm_mapping, GM_PAGE_WILLNEED); - ret = -EINVAL; - goto unlock; - } - - if (!gm_mapping_device(gm_mapping)) { - pr_info("gmem: page is not in device\n"); - ret = -EINVAL; - goto unlock; - } - - if (size == HPAGE_PMD_SIZE) { - folio = vma_alloc_folio(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, addr, true); - page = &folio->page; - } else { - page = alloc_page(GFP_KERNEL); - } - - if (!page) { - pr_err("gmem: gmem_evict_page alloc hugepage failed\n"); - ret = -ENOMEM; - goto unlock; - } - - dma_dev = gm_mapping->dev->dma_dev; - dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); - gmf.dev = gm_mapping->dev; - gmf.dma_addr = dma_addr; - - ret = gmem_unmap(&gmf); - dma_unmap_page(dma_dev, dma_addr, size, DMA_BIDIRECTIONAL); - if (ret) { - pr_err("gmem_unmap failed, ret %d\n", ret); - put_page(page); - goto unlock; - } - - gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); - gm_mapping->page = page; - -unlock: - mutex_unlock(&gm_mapping->lock); -put_mm: - mmput(mm); -put_task: - put_task_struct(tsk); -response: - gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); - kfree(msg); - return ret; -} - -enum gm_ret gmem_create(struct gm_dev *dev, void **pmap) -{ - return GM_RET_SUCCESS; -} - -struct gm_mmu gm_mmu = { - .peer_va_alloc_fixed = gmem_alloc, - .pmap_create = gmem_create, - .peer_va_free = gmem_free, - .peer_map = gmem_map, - .peer_unmap = gmem_unmap, -}; - -#define ASCEND910_HBM_START 0x0000000800000000 -#define ASCEND910_HBM_END 0x0000000fffffffff - -enum gm_ret mmu_dev_create(struct device *dev, int devid) -{ - enum gm_ret ret; - - ret = gm_dev_create(&gm_mmu, NULL, GM_DEV_CAP_REPLAYABLE | GM_DEV_CAP_PEER, - (struct gm_dev **)&dev->gm_dev); - if (ret != GM_RET_SUCCESS) { - pr_err("NPU gmem device create failed\n"); - return ret; - } - - ret = gm_dev_register_physmem(dev->gm_dev, ASCEND910_HBM_START, ASCEND910_HBM_END); - if (ret != GM_RET_SUCCESS) { - pr_err("NPU gmem device register physical memory failed\n"); - goto free_gm_dev; - } - - ((struct gm_dev *)dev->gm_dev)->dma_dev = dev; - gm_devs[devid] = (struct gm_dev *)dev->gm_dev; - - pr_info("Create NPU gmem device and register HBM\n"); - return ret; -free_gm_dev: - gm_dev_destroy(dev->gm_dev); - dev->gm_dev = NULL; - return ret; -} -EXPORT_SYMBOL(mmu_dev_create); - -enum gm_ret mmu_as_attach(struct device *dev) -{ - enum gm_ret ret; - struct gm_dev *gm_dev = 
dev->gm_dev; - struct gm_context *gm_ctx; - - if (!gm_dev) { - pr_err("NPU device gm_dev is NULL\n"); - return GM_RET_FAILURE_UNKNOWN; - } - - if (!current->mm->gm_as) { - ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, NPU_PAGE_SIZE, - ¤t->mm->gm_as); - if (ret != GM_RET_SUCCESS) { - pr_err("Process %d create gm_as failed\n", current->pid); - return ret; - } - } - - ret = gm_as_attach(current->mm->gm_as, gm_dev, 0, 1, &gm_ctx); - if (ret != GM_RET_SUCCESS) { - pr_err("gm_dev attach to process %d failed\n", current->pid); - return ret; - } - - pr_info("Attach gm_dev to process %d\n", current->pid); - return ret; -} -EXPORT_SYMBOL(mmu_as_attach); diff --git a/drivers/remote_pager/msg_handler_peer.c b/drivers/remote_pager/msg_handler_peer.c deleted file mode 100644 index e9222a1b60e4..000000000000 --- a/drivers/remote_pager/msg_handler_peer.c +++ /dev/null @@ -1,671 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Generalized Memory Management. - * - * Copyright (C) 2023- Huawei, Inc. - * Author: Chunsheng Luo - * Co-Author: Weixi Zhu, Jun Chen, Jiangtian Feng - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "msg_handler.h" -#include "svm_proc_mng.h" - -#define MAX_RETRY_TIME 10 - -static inline vm_fault_t get_page_size(unsigned int order, - unsigned int *page_size, - unsigned long *addr) -{ - switch (order) { - case 0: - *page_size = PAGE_SIZE; - break; - case PMD_ORDER: - *page_size = HPAGE_SIZE; - *addr = round_down(*addr, HPAGE_SIZE); - break; - default: - return VM_FAULT_FALLBACK; - } - return 0; -} - -static inline bool addr_is_mapped(unsigned long addr, pmd_t *pmd, - unsigned int order) -{ - pte_t *pte; - bool ret; - - if (order == PMD_ORDER) - return !pmd_none(*pmd); - if (pmd_none(*pmd)) - return false; - pte = pte_offset_map(pmd, addr); - ret = !pte_none(*pte); - pte_unmap(pte); - return ret; -} - -static vm_fault_t __gmem_fault(struct vm_fault *vmf, - unsigned int order) -{ - vm_fault_t ret = VM_FAULT_SIGBUS; - int msg_ret = GM_RET_FAILURE_UNKNOWN; - unsigned long addr = vmf->address; - unsigned int page_size; - struct gm_pager_msg_rq req = { 0 }; - struct comm_msg_rsp *rsp; - struct wait_station *ws; - struct page_info *page_info; - struct mm_struct *mm; - struct svm_proc *proc; - - ret = get_page_size(order, &page_size, &addr); - if (ret) - return ret; - - mm = vmf->vma->vm_mm; - proc = search_svm_proc_by_mm(mm); - if (!proc) { - pr_err("%s: failed to get svm proc\n", __func__); - return VM_FAULT_SIGBUS; - } - - page_info = get_page_info(&proc->pager, addr, page_size, page_size); - if (!page_info) { - pr_err("%s: failed to get page_info\n", __func__); - return VM_FAULT_SIGBUS; - } - mutex_lock(&page_info->lock); - - if (addr_is_mapped(addr, vmf->pmd, order)) - goto unlock; - - req.va = addr; - req.size = page_size; - - /* start fault */ - ws = get_wait_station(); - req.my_ws = ws->id; - req.peer_pid = proc->peer_pid; - - ret = msg_send_nid(GMEM_PAGE_FAULT_REQUEST, proc->nid, proc->peer_nid, - &req, sizeof(req)); - rsp = wait_at_station(ws); - if ((long)rsp != -ETIMEDOUT) { - msg_ret = rsp->ret; - kfree(rsp); - } - if (msg_ret == GM_RET_PAGE_EXIST) { - pr_warn("gmem: weird page exist\n"); - } else if (msg_ret != GM_RET_SUCCESS) { - ret = VM_FAULT_SIGBUS; - goto unlock; - } - - ret = VM_FAULT_NOPAGE; - -unlock: - mutex_unlock(&page_info->lock); - return ret; -} - -static vm_fault_t gmem_fault(struct vm_fault *vmf) -{ - return 
__gmem_fault(vmf, 0); -} - -static vm_fault_t gmem_huge_fault(struct vm_fault *vmf, - unsigned int order) -{ - int ret = 0; - - ret = __gmem_fault(vmf, order); - - return ret; -} - -static const struct vm_operations_struct gmem_vma_ops = { - .fault = gmem_fault, - .huge_fault = gmem_huge_fault, -}; - -int gmem_handle_task_pairing(struct rpg_kmsg_message *msg) -{ - struct gm_pair_msg_rq *recv = (struct gm_pair_msg_rq *)msg; - unsigned int peer_nid = recv->header.from_nid; - unsigned int peer_pid = recv->my_pid; - unsigned int peer_ws = recv->my_ws; - unsigned int my_nid = recv->peer_nid; - unsigned int my_pid = recv->peer_pid; - int ret = 0; - - gmem_add_to_svm_proc(my_nid, my_pid, peer_nid, peer_pid); - gmem_send_comm_msg_reply(my_nid, peer_nid, peer_ws, ret); - kfree(msg); - return 0; -} - -int vma_is_gmem(struct vm_area_struct *vma) -{ - return (vma->vm_flags & VM_PEER_SHARED) != 0; -} - -int gmem_handle_alloc_vma_fixed(struct rpg_kmsg_message *msg) -{ - struct gm_pager_msg_rq *data = (struct gm_pager_msg_rq *)msg; - unsigned long va = data->va; - unsigned long size = data->size; - unsigned long gmem_prot = data->prot; - unsigned int my_pid = data->peer_pid; - unsigned int peer_nid = data->header.from_nid; - unsigned int nid = data->header.to_nid; - unsigned int peer_ws = data->my_ws; - unsigned long prot = 0; - unsigned long populate; - struct task_struct *tsk; - struct mm_struct *mm; - unsigned long addr; - struct vm_area_struct *vma; - int ret = GM_RET_SUCCESS; - - if (gmem_prot & VM_READ) - prot |= PROT_READ; - if (gmem_prot & VM_WRITE) - prot |= PROT_WRITE; - if (gmem_prot & VM_EXEC) - prot |= PROT_EXEC; - - tsk = find_get_task_by_vpid(my_pid); - if (!tsk) { - pr_err("svm process does not have task_struct\n"); - ret = GM_RET_FAILURE_UNKNOWN; - goto out; - } - - mm = get_task_mm(tsk); - if (!mm) { - pr_err("no mm\n"); - ret = -1; - goto put_task; - } - - mmap_write_lock(mm); - current->mm = mm; - addr = __do_mmap_mm(mm, NULL, va, size, prot, - MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, 0, - 0, &populate, NULL); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto unlock; - } - - vma = find_vma(mm, addr); - if (!vma) { - ret = GM_RET_FAILURE_UNKNOWN; - goto unlock; - } - - vma->vm_ops = &gmem_vma_ops; - vm_flags_set(vma, VM_HUGEPAGE | VM_PEER_SHARED); - -unlock: - current->mm = NULL; - mmap_write_unlock(mm); - mmput(mm); -put_task: - put_task_struct(tsk); -out: - pr_info("%s va %lx vma message %d\n", __func__, va, ret); - gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); - kfree(msg); - return ret; -} - -int gmem_handle_free_vma(struct rpg_kmsg_message *msg) -{ - struct gm_pager_msg_rq *recv = (struct gm_pager_msg_rq *)msg; - unsigned long va = recv->va; - unsigned long size = recv->size; - unsigned int my_pid = recv->peer_pid; - unsigned int nid = recv->header.to_nid; - unsigned int peer_nid = recv->header.from_nid; - unsigned int peer_ws = recv->my_ws; - struct task_struct *tsk; - struct mm_struct *mm; - struct mm_struct *old_mm = current->mm; - - int ret = 0; - - tsk = find_get_task_by_vpid(my_pid); - if (!tsk) { - pr_err("svm process does not have task_struct\n"); - ret = GM_RET_FAILURE_UNKNOWN; - goto out; - } - - mm = get_task_mm(tsk); - if (!mm) { - pr_err("no mm\n"); - ret = -1; - goto put_task; - } - - current->mm = mm; - ret = vm_munmap(va, size); - current->mm = old_mm; - - if (ret < 0) - ret = GM_RET_FAILURE_UNKNOWN; - else - ret = GM_RET_SUCCESS; - - mmput(mm); -put_task: - put_task_struct(tsk); -out: - gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); - 
kfree(msg); - return ret; -} - -pmd_t *get_huge_pmd(const struct vm_area_struct *vma, u64 va) -{ - pgd_t *pgd = NULL; - p4d_t *p4d = NULL; - pud_t *pud = NULL; - pmd_t *pmd = NULL; - - if ((vma == NULL) || (vma->vm_mm == NULL)) { - pr_err("Vm_mm none. (va=0x%llx)\n", va); - return NULL; - } - /* too much log, not print */ - pgd = pgd_offset(vma->vm_mm, va); - if (PXD_JUDGE(pgd)) - return NULL; - - p4d = p4d_offset(pgd, va); - if (PXD_JUDGE(p4d) != 0) - return NULL; - - pud = pud_offset(p4d, va); - if (PXD_JUDGE(pud) != 0) - return NULL; - - pmd = pmd_offset(pud, va); - return pmd; -} - -static inline struct page *alloc_transhuge_page_node(int nid, int zero) -{ - struct page *page; - gfp_t gfp_mask = GFP_TRANSHUGE | __GFP_THISNODE | __GFP_NOWARN; - - if (zero) - gfp_mask |= __GFP_ZERO; - - page = alloc_pages_node(nid, gfp_mask, HPAGE_PMD_ORDER); - if (!page) - return NULL; - - return page; -} - -int gmem_hugepage_remap_owner(struct svm_proc *svm_proc, u64 addr, - pgprot_t prot, struct page *hpage) -{ - int ret; - - ret = hugetlb_insert_hugepage_pte(svm_proc->mm, addr, prot, hpage); - if (ret != 0) { - pr_err("insert_hugepage owner fail. (va=0x%llx)\n", addr); - return ret; - } - - return 0; -} - -int gmem_hugepage_remap_local(struct svm_proc *svm_proc, u64 addr, - pgprot_t prot, struct page *hpage) -{ - int ret = 0; - struct local_pair_proc *item = NULL; - struct local_pair_proc *next = NULL; - - list_for_each_entry_safe(item, next, &svm_proc->tasks_list, node) { - ret = hugetlb_insert_hugepage_pte(item->mm, addr, prot, hpage); - if (ret != 0) { - pr_err("insert_hugepage local fail. (va=0x%llx)\n", - addr); - return ret; - } - } - - return 0; -} - -int gmem_hugepage_remap(struct svm_proc *svm_proc, u64 addr, pgprot_t prot, - struct page *hpage) -{ - int ret; - - ret = gmem_hugepage_remap_owner(svm_proc, addr, prot, hpage); - if (ret != 0) { - pr_err("gmem_hugepage_remap_owner fail. (va=0x%llx)\n", addr); - return ret; - } - - ret = gmem_hugepage_remap_local(svm_proc, addr, prot, hpage); - if (ret != 0) { - pr_err("gmem_hugepage_remap_local fail. 
(va=0x%llx)\n", addr); - return ret; - } - - return 0; -} - -int gmem_handle_alloc_page(struct rpg_kmsg_message *msg) -{ - struct gm_pager_msg_rq *recv = (struct gm_pager_msg_rq *)msg; - unsigned long addr = recv->va; - unsigned int page_size = recv->size; - unsigned int my_pid = recv->peer_pid; - unsigned int peer_ws = recv->my_ws; - int nid = recv->header.to_nid; - int peer_nid = recv->header.from_nid; - struct page_info *page_info; - struct svm_proc *proc = search_svm_proc_by_pid(my_pid); - struct page *page; - unsigned long long prot_val; - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct *vma; - int ret = 0; - - if (!proc) { - pr_info("can not find proc of %d\n", my_pid); - ret = -EINVAL; - goto out; - } - - page_info = get_page_info(&proc->pager, addr, page_size, page_size); - if (!page_info) { - pr_err("%s: failed to get page_info\n", __func__); - ret = -EINVAL; - goto out; - } - - if (recv->behavior == MADV_WILLNEED) { - if (!page_info->page) - goto new_page; - - goto out; - } - -new_page: - /* TODO: How Can Know HBM node */ - page = alloc_transhuge_page_node(1, !recv->dma_addr); - if (!page) { - ret = -ENOMEM; - goto out; - } - - /* We need a condition */ - if (need_wake_up_swapd()) - wake_up_swapd(); - - if (recv->dma_addr) { - handle_migrate_page((void *)recv->dma_addr, page, page_size, - FROM_PEER); - } - - tsk = find_get_task_by_vpid(my_pid); - if (!tsk) { - pr_err("svm process does not have task_struct\n"); - ret = GM_RET_FAILURE_UNKNOWN; - goto out; - } - - mm = get_task_mm(tsk); - if (!mm) { - pr_err("no mm\n"); - ret = -1; - goto put_task; - } - - vma = find_vma(mm, addr); - if (vma->vm_flags & VM_WRITE) { - prot_val = (pgprot_val(PAGE_SHARED_EXEC) & (~PTE_RDONLY)) | - PTE_DIRTY; - } else { - prot_val = pgprot_val(PAGE_READONLY_EXEC); - } - - /* TODO: 9 Consider multiple processes bind */ - ret = gmem_hugepage_remap(proc, addr, __pgprot(prot_val), page); - if (ret) - goto put_mm; - - page_info->page = page; - -put_mm: - mmput(mm); -put_task: - put_task_struct(tsk); -out: - gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); - kfree(msg); - return ret; -} - -static inline void zap_clear_pmd(struct vm_area_struct *vma, u64 vaddr, - pmd_t *pmd) -{ - pmd_clear(pmd); - flush_tlb_range(vma, vaddr, vaddr + HPAGE_SIZE); -} - -void zap_vma_pmd(struct vm_area_struct *vma, u64 vaddr) -{ - pmd_t *pmd = NULL; - - pmd = get_huge_pmd(vma, vaddr); - - if (pmd == NULL) - return; - - zap_clear_pmd(vma, vaddr, pmd); -} - -void gmem_hugepage_unmap_local(struct svm_proc *svm_proc, u64 addr) -{ - struct local_pair_proc *item = NULL; - struct local_pair_proc *next = NULL; - struct vm_area_struct *vma; - - list_for_each_entry_safe(item, next, &svm_proc->tasks_list, node) { - vma = find_vma(item->mm, addr); - if (!vma) - zap_vma_pmd(vma, addr); - } -} - -void gmem_unmap_hugepage(struct svm_proc *svm_proc, u64 addr) -{ - struct vm_area_struct *vma; - - vma = find_vma(svm_proc->mm, addr); - - if (!vma) - zap_vma_pmd(vma, addr); - - gmem_hugepage_unmap_local(svm_proc, addr); -} - -int gmem_handle_free_page(struct rpg_kmsg_message *msg) -{ - struct gm_pager_msg_rq *recv = (struct gm_pager_msg_rq *)msg; - unsigned long addr = recv->va; - unsigned long page_size = recv->size; - unsigned int my_pid = recv->peer_pid; - unsigned int peer_ws = recv->my_ws; - int peer_nid = recv->header.from_nid; - int nid = recv->header.to_nid; - struct task_struct *tsk; - struct mm_struct *mm; - struct page_info *page_info; - struct svm_proc *proc = search_svm_proc_by_pid(my_pid); - struct page 
*page = NULL; - int ret = 0; - - if (!proc) { - pr_info("can not find proc of %d\n", my_pid); - ret = -EINVAL; - goto out; - } - - page_info = get_page_info(&proc->pager, addr, page_size, page_size); - if (!page_info) { - pr_err("%s: failed to get page_info\n", __func__); - ret = -EINVAL; - goto out; - } - - page = page_info->page; - if (!page) { - pr_err("%s: page reference in page_info is NULL\n", __func__); - ret = -EINVAL; - goto out; - } - - tsk = find_get_task_by_vpid(my_pid); - if (!tsk) { - pr_err("svm process does not have task_struct\n"); - ret = GM_RET_FAILURE_UNKNOWN; - goto out; - } - - mm = get_task_mm(tsk); - if (!mm) { - pr_err("no mm\n"); - ret = -1; - goto put_task; - } - - gmem_unmap_hugepage(proc, addr); - mmput(mm); - - if (recv->dma_addr) - handle_migrate_page((void *)recv->dma_addr, page, page_size, - TO_PEER); - - free_page_info(&proc->pager, page_info); - put_page(page); - -put_task: - put_task_struct(tsk); -out: - gmem_send_comm_msg_reply(nid, peer_nid, peer_ws, ret); - kfree(msg); - return ret; -} - -int gmem_handle_hmadvise(struct rpg_kmsg_message *msg) -{ - kfree(msg); - return 0; -} - -int gmem_handle_hmemcpy(struct rpg_kmsg_message *msg) -{ - kfree(msg); - return 0; -} - -static int sync_gmem_vma_to_custom_process(struct svm_proc *svm_proc, - struct local_pair_proc *local_proc) -{ - struct mm_struct *mm = svm_proc->mm; - struct vm_area_struct *vma, *local_vma; - unsigned long populate; - struct mm_struct *old_mm = current->mm; - unsigned long addr; - unsigned long prot = PROT_READ; - - VMA_ITERATOR(vmi, mm, 0); - - mmap_write_lock(mm); - for_each_vma(vmi, vma) { - if (!vma_is_peer_shared(vma)) - continue; - current->mm = local_proc->mm; - pr_debug("%s cur %lx local %lx start %lx -- end %lx\n", - __func__, (unsigned long)current->mm, - (unsigned long)local_proc->mm, vma->vm_start, - vma->vm_end); - prot = PROT_READ; - if (vma->vm_flags & VM_WRITE) - prot |= PROT_WRITE; - addr = __do_mmap_mm(local_proc->mm, NULL, vma->vm_start, - vma->vm_end - vma->vm_start, prot, - MAP_SHARED | MAP_ANONYMOUS | - MAP_FIXED_NOREPLACE, - 0, 0, &populate, NULL); - current->mm = old_mm; - if (IS_ERR_VALUE(addr)) { - pr_err("%s failed start %lx - end %lx ret %ld\n", - __func__, vma->vm_start, vma->vm_end, addr); - continue; - } - local_vma = find_vma(local_proc->mm, addr); - if (!local_vma) { - local_vma->vm_ops = vma->vm_ops; - vm_flags_set(vma, VM_HUGEPAGE | VM_PEER_SHARED); - } - } - mmap_write_unlock(mm); - - return 0; -} - -int gmem_register_pair_local_task(unsigned int bind_to_pid, - unsigned int local_pid) -{ - int ret = 0; - struct svm_proc *proc = search_svm_proc_by_pid(bind_to_pid); - struct local_pair_proc *local_proc; - - pr_debug("%s bind_to_pid %d local_pid %d\n", __func__, bind_to_pid, - local_pid); - - local_proc = insert_local_proc(proc, local_pid); - if (IS_ERR(local_proc)) { - pr_err("%s failed\n", __func__); - return PTR_ERR(local_proc); - } - - /* sync vma and vma_ops to local_pid */ - sync_gmem_vma_to_custom_process(proc, local_proc); - - return ret; -} -EXPORT_SYMBOL(gmem_register_pair_local_task); diff --git a/drivers/remote_pager/svm_proc_mng.c b/drivers/remote_pager/svm_proc_mng.c deleted file mode 100644 index 201c30885437..000000000000 --- a/drivers/remote_pager/svm_proc_mng.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Generalized Memory Management. - * - * Copyright (C) 2023- Huawei, Inc. 
- * Author: Jiangtian Feng - * Co-Author: Jun Chen, Chuangchuang Fang - * - */ - -#include -#include -#include -#include -#include - -#include "svm_proc_mng.h" - -struct svm_proc_node { - struct svm_proc svm_proc; - struct hlist_node list; -}; - -static inline struct svm_proc_node *to_proc_node(struct svm_proc *proc) -{ - return list_entry(proc, struct svm_proc_node, svm_proc); -} - -#define _PROC_LIST_MAX 0x0f -#define _PROC_LIST_SHIFT 4 -static DEFINE_RWLOCK(svm_proc_hash_rwlock); -static DEFINE_HASHTABLE(svm_proc_hashtable, _PROC_LIST_SHIFT); - -static unsigned int get_hash_tag(int pid) -{ - return (unsigned int)pid % _PROC_LIST_MAX; -} - -static void add_to_hashtable(struct svm_proc *proc) -{ - struct svm_proc_node *node = to_proc_node(proc); - unsigned int tag = get_hash_tag(proc->pid); - - write_lock(&svm_proc_hash_rwlock); - hash_add(svm_proc_hashtable, &node->list, tag); - write_unlock(&svm_proc_hash_rwlock); -} - -static void del_from_hashtable(struct svm_proc *proc) -{ - struct svm_proc_node *node; - - write_lock(&svm_proc_hash_rwlock); - node = to_proc_node(proc); - hash_del(&node->list); - write_unlock(&svm_proc_hash_rwlock); -} - -struct svm_proc *search_svm_proc_by_mm(struct mm_struct *mm) -{ - struct svm_proc_node *node; - unsigned int tag; - - read_lock(&svm_proc_hash_rwlock); - hash_for_each(svm_proc_hashtable, tag, node, list) { - if (node->svm_proc.mm == mm) { - read_unlock(&svm_proc_hash_rwlock); - return &node->svm_proc; - } - } - read_unlock(&svm_proc_hash_rwlock); - - return search_svm_proc_by_local_mm(mm); -} - -struct svm_proc *search_svm_proc_by_local_mm(struct mm_struct *mm) -{ - struct svm_proc_node *node; - unsigned int hash_tag; - struct local_pair_proc *item = NULL; - struct local_pair_proc *next = NULL; - - read_lock(&svm_proc_hash_rwlock); - hash_for_each(svm_proc_hashtable, hash_tag, node, list) { - list_for_each_entry_safe(item, next, &node->svm_proc.tasks_list, - node) { - if (item->mm == mm) { - read_unlock(&svm_proc_hash_rwlock); - return &node->svm_proc; - } - } - } - read_unlock(&svm_proc_hash_rwlock); - - return NULL; -} - -struct svm_proc *search_svm_proc_by_pid(unsigned int pid) -{ - struct svm_proc_node *node; - unsigned int tag = get_hash_tag(pid); - - read_lock(&svm_proc_hash_rwlock); - hash_for_each_possible(svm_proc_hashtable, node, list, tag) { - if (node->svm_proc.pid == pid) { - read_unlock(&svm_proc_hash_rwlock); - return &node->svm_proc; - } - } - read_unlock(&svm_proc_hash_rwlock); - - return NULL; -} - -static struct page_info *__search_page_info(struct page_mng *pager, - unsigned long va, unsigned long len) -{ - struct rb_node *node = pager->rbtree.rb_node; - struct page_info *page_info = NULL; - - while (node) { - page_info = rb_entry(node, struct page_info, node); - - if (va + len <= page_info->va) - node = node->rb_left; - else if (va >= page_info->va + page_info->len) - node = node->rb_right; - else - break; - } - - if (page_info) { - if (va < page_info->va || - va + len > page_info->va + page_info->len) - return NULL; - } - return page_info; -} - -struct page_info *search_page_info(struct page_mng *pager, unsigned long va, - unsigned long len) -{ - struct page_info *page_info; - - if (!pager) - return NULL; - - down_read(&pager->rw_sem); - page_info = __search_page_info(pager, va, len); - up_read(&pager->rw_sem); - - return page_info; -} - -static int insert_page_info(struct page_mng *pager, struct page_info *page_info) -{ - struct rb_node **new_node; - struct rb_node *parent = NULL; - struct page_info *cur = NULL; - - 
down_write(&pager->rw_sem); - new_node = &(pager->rbtree.rb_node); - - /* Figure out where to put new node */ - while (*new_node) { - cur = rb_entry(*new_node, struct page_info, node); - parent = *new_node; - if (page_info->va + page_info->len <= cur->va) { - new_node = &((*new_node)->rb_left); - } else if (page_info->va >= cur->va + cur->len) { - new_node = &((*new_node)->rb_right); - } else { - up_write(&pager->rw_sem); - return -EFAULT; - } - } - /* Add new node and rebalance tree. */ - rb_link_node(&page_info->node, parent, new_node); - rb_insert_color(&page_info->node, &pager->rbtree); - - up_write(&pager->rw_sem); - - return 0; -} - -static void erase_page_info(struct page_mng *pager, struct page_info *page_info) -{ - rb_erase(&page_info->node, &pager->rbtree); -} - -static struct page_info *alloc_page_info(unsigned long va, unsigned long len, - unsigned int page_size) -{ - struct page_info *page_info; - size_t size; - - size = sizeof(struct page_info); - page_info = kzalloc(size, GFP_KERNEL); - if (!page_info) - return NULL; - - page_info->va = va; - page_info->len = len; - mutex_init(&page_info->lock); - - return page_info; -} - -struct page_info *get_page_info(struct page_mng *pager, unsigned long va, - unsigned long len, unsigned int page_size) -{ - struct page_info *page_info = search_page_info(pager, va, len); - - if (page_info) - return page_info; - - page_info = alloc_page_info(va, len, page_size); - if (page_info) { - if (insert_page_info(pager, page_info)) { - kfree(page_info); - page_info = search_page_info(pager, va, len); - } - } - - return page_info; -} - -void free_page_info(struct page_mng *pager, struct page_info *page_info) -{ - down_write(&pager->rw_sem); - erase_page_info(pager, page_info); - up_write(&pager->rw_sem); - kfree(page_info); -} - -static void free_pager(struct page_mng *pager) -{ - struct page_info *page_info = NULL; - struct rb_node *node = NULL; - - down_write(&pager->rw_sem); - node = rb_first(&pager->rbtree); - while (node) { - page_info = rb_entry(node, struct page_info, node); - node = rb_next(node); - erase_page_info(pager, page_info); - kfree(page_info); - } - up_write(&pager->rw_sem); -} - -static void free_svm_proc(struct svm_proc *proc) -{ - struct local_pair_proc *item = NULL; - struct local_pair_proc *next = NULL; - struct mm_struct *mm = proc->mm; - int count; - - free_pager(&proc->pager); - del_from_hashtable(proc); - - count = atomic_read(&mm->mm_users); - if (count) { - pr_err("mm_users is %d\n", count); - mmput(mm); - } - - if (!list_empty(&proc->tasks_list)) { - list_for_each_entry_safe(item, next, &proc->tasks_list, node) - list_del(&item->node); - } - pr_debug("svm proc clean up done pid %d, peer_pid %d\n", proc->pid, - proc->peer_pid); -} - -static void svm_proc_mm_release(struct mmu_notifier *subscription, - struct mm_struct *mm) -{ - struct svm_proc *proc = - container_of(subscription, struct svm_proc, notifier); - - free_svm_proc(proc); - kfree(proc); -} - -static const struct mmu_notifier_ops svm_proc_mmu_notifier_ops = { - .release = svm_proc_mm_release, -}; - -static int svm_proc_mmu_notifier_register(struct svm_proc *proc) -{ - proc->notifier.ops = &svm_proc_mmu_notifier_ops; - - return mmu_notifier_register(&proc->notifier, proc->mm); -} - -static void local_pair_proc_mm_release(struct mmu_notifier *subscription, - struct mm_struct *mm) -{ - struct local_pair_proc *local_proc = - container_of(subscription, struct local_pair_proc, notifier); - - list_del(&local_proc->node); - kfree(local_proc); - pr_debug("clean pair proc 
resources\n"); -} - -static const struct mmu_notifier_ops local_pair_proc_mmu_notifier_ops = { - .release = local_pair_proc_mm_release, -}; - -static int -local_pair_proc_mmu_notifier_register(struct local_pair_proc *local_proc) -{ - local_proc->notifier.ops = &local_pair_proc_mmu_notifier_ops; - - return mmu_notifier_register(&local_proc->notifier, local_proc->mm); -} - -struct local_pair_proc *insert_local_proc(struct svm_proc *proc, - unsigned int pid) -{ - int ret = 0; - struct local_pair_proc *local_proc = - kzalloc(sizeof(struct local_pair_proc), GFP_KERNEL); - - if (!local_proc) - return ERR_PTR(-ENOMEM); - - local_proc->tsk = find_get_task_by_vpid(pid); - if (!local_proc->tsk) { - pr_err("can not find process by pid %d\n", pid); - ret = -EINVAL; - goto free; - } - - local_proc->pid = pid; - local_proc->mm = get_task_mm(local_proc->tsk); - /* task is exiting */ - if (!local_proc->mm) { - pr_err("can not get process[%d] mm\n", pid); - ret = -EINTR; - goto put_task; - } - - ret = local_pair_proc_mmu_notifier_register(local_proc); - if (ret) { - pr_err("register mmu notifier failed\n"); - goto put_mm; - } - - mmput(local_proc->mm); - put_task_struct(local_proc->tsk); - - list_add(&local_proc->node, &proc->tasks_list); - pr_debug("%s bind_to_pid %d local_pid %d\n", __func__, proc->pid, - local_proc->pid); - - return local_proc; - -put_mm: - mmput(local_proc->mm); -put_task: - put_task_struct(local_proc->tsk); -free: - kfree(local_proc); - return ERR_PTR(ret); -} - -struct svm_proc *alloc_svm_proc(int nid, int pid, int peer_nid, int peer_pid) -{ - struct svm_proc *proc; - int ret; - - proc = kzalloc(sizeof(struct svm_proc), GFP_KERNEL); - if (!proc) - return ERR_PTR(-ENOMEM); - - proc->pager.rbtree = RB_ROOT; - init_rwsem(&proc->pager.rw_sem); - - proc->pid = pid; - proc->nid = nid; - proc->peer_nid = peer_nid; - proc->peer_pid = peer_pid; - INIT_LIST_HEAD(&proc->tasks_list); - - proc->tsk = find_get_task_by_vpid(pid); - if (!proc->tsk) { - pr_err("can not find process by pid %d\n", pid); - ret = -EINVAL; - goto free; - } - - proc->mm = get_task_mm(proc->tsk); - /* task is exiting */ - if (!proc->mm) { - pr_err("can not get process[%d] mm\n", pid); - ret = -EINTR; - goto put_task; - } - - ret = svm_proc_mmu_notifier_register(proc); - if (ret) { - pr_err("register mmu notifier failed\n"); - goto put_mm; - } - - /* - * destroying svm_proc depends on mmu_notifier. - * we have to put mm to make sure mmu_notifier can be called - */ - mmput(proc->mm); - put_task_struct(proc->tsk); - - add_to_hashtable(proc); - - return proc; - -put_mm: - mmput(proc->mm); -put_task: - put_task_struct(proc->tsk); -free: - kfree(proc); - return ERR_PTR(ret); -} diff --git a/drivers/remote_pager/svm_proc_mng.h b/drivers/remote_pager/svm_proc_mng.h deleted file mode 100644 index 85a014c79b41..000000000000 --- a/drivers/remote_pager/svm_proc_mng.h +++ /dev/null @@ -1,65 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only - * - * Generalized Memory Management. - * - * Copyright (C) 2023- Huawei, Inc. 
- * Author: Jiangtian Feng - * Co-Author: Jun Chen - */ - -#ifndef _REMOTE_PAGER_PROC_MNG_H_ -#define _REMOTE_PAGER_PROC_MNG_H_ - -#include -#include -#include -#include -#include - -struct page_info { - struct rb_node node; - unsigned long va; - unsigned long len; - struct mutex lock; - struct page *page; -}; - -struct page_mng { - struct rw_semaphore rw_sem; - struct rb_root rbtree; -}; - -struct local_pair_proc { - struct list_head node; - pid_t pid; - struct task_struct *tsk; - struct mm_struct *mm; - struct mmu_notifier notifier; -}; - -struct svm_proc { - int pid; - int nid; - int peer_pid; - int peer_nid; - struct mm_struct *mm; /* never dereference */ - struct task_struct *tsk; - struct list_head tasks_list; /* bind to svm_proc local tasks */ - struct mmu_notifier notifier; - - struct page_mng pager; -}; - -struct page_info *search_page_info(struct page_mng *pager, - unsigned long va, unsigned long len); -struct page_info *get_page_info(struct page_mng *pager, - unsigned long va, unsigned long len, unsigned int page_size); -void free_page_info(struct page_mng *pager, struct page_info *page_info); - -struct svm_proc *alloc_svm_proc(int nid, int pid, int peer_nid, int peer_pid); -struct svm_proc *search_svm_proc_by_mm(struct mm_struct *mm); -struct svm_proc *search_svm_proc_by_pid(unsigned int pid); -struct local_pair_proc *insert_local_proc(struct svm_proc *proc, - unsigned int local_pid); -struct svm_proc *search_svm_proc_by_local_mm(struct mm_struct *mm); -#endif diff --git a/drivers/remote_pager/wait_station.c b/drivers/remote_pager/wait_station.c deleted file mode 100644 index 858859fb1605..000000000000 --- a/drivers/remote_pager/wait_station.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Waiting stations allows threads to be waited for a given - * number of events are completed - * - * Original file developed by SSRG at Virginia Tech. 
- * - * author, Javier Malave, Rebecca Shapiro, Andrew Hughes, - * Narf Industries 2020 (modifications for upstream RFC) - * - */ - -#include -#include -#include - -#include "wait_station.h" - -#define MAX_WAIT_STATIONS 1024 -#define MAX_WAIT_IO_TIMEOUT (300 * HZ) - -static struct wait_station wait_stations[MAX_WAIT_STATIONS]; - -static DEFINE_SPINLOCK(wait_station_lock); -static DECLARE_BITMAP(wait_station_available, MAX_WAIT_STATIONS) = { 0 }; - -struct wait_station *get_wait_station(void) -{ - int id; - struct wait_station *ws; - - spin_lock(&wait_station_lock); - id = find_first_zero_bit(wait_station_available, MAX_WAIT_STATIONS); - ws = wait_stations + id; - set_bit(id, wait_station_available); - spin_unlock(&wait_station_lock); - - ws->id = id; - ws->private = (void *)0xbad0face; - init_completion(&ws->pendings); - - return ws; -} -EXPORT_SYMBOL_GPL(get_wait_station); - -struct wait_station *wait_station(int id) -{ - /* memory barrier */ - smp_rmb(); - return wait_stations + id; -} -EXPORT_SYMBOL_GPL(wait_station); - -void put_wait_station(struct wait_station *ws) -{ - int id = ws->id; - - spin_lock(&wait_station_lock); - clear_bit(id, wait_station_available); - spin_unlock(&wait_station_lock); -} -EXPORT_SYMBOL_GPL(put_wait_station); - -void *wait_at_station(struct wait_station *ws) -{ - void *ret; - - if (!try_wait_for_completion(&ws->pendings)) { - if (wait_for_completion_io_timeout(&ws->pendings, - MAX_WAIT_IO_TIMEOUT) == 0) { - pr_err("%s timeout\n", __func__); - ret = ERR_PTR(-ETIMEDOUT); - goto out; - } - } - /* memory barrier */ - smp_rmb(); - ret = ws->private; -out: - put_wait_station(ws); - return ret; -} -EXPORT_SYMBOL_GPL(wait_at_station); diff --git a/drivers/remote_pager/wait_station.h b/drivers/remote_pager/wait_station.h deleted file mode 100644 index 7833b40be638..000000000000 --- a/drivers/remote_pager/wait_station.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * /kernel/popcorn/wait_station.c - * - * Waiting stations allows threads to be waited for a given - * number of events are completed - * - * Original file developed by SSRG at Virginia Tech. 
- * - * author, Javier Malave, Rebecca Shapiro, Andrew Hughes, - * Narf Industries 2020 (modifications for upstream RFC) - * - */ - -#ifndef _REMOTE_PAGER_WAIT_STATION_H_ -#define _REMOTE_PAGER_WAIT_STATION_H_ - -#include -#include - -struct wait_station { - unsigned int id; - void *private; - struct completion pendings; -}; - -struct wait_station *get_wait_station(void); -struct wait_station *wait_station(int id); -void put_wait_station(struct wait_station *ws); -void *wait_at_station(struct wait_station *ws); -#endif -- Gitee From 38de80e650c2fb06464989810c294df0577a5e13 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Thu, 17 Jul 2025 21:59:35 +0800 Subject: [PATCH 19/27] mm: gmem: introduce hmemcpy and add multiple protection euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- add multiple param check for protection Signed-off-by: nicunshu --- include/linux/gmem.h | 15 +- include/linux/gmem_as.h | 36 ---- include/linux/mm.h | 2 + include/linux/mm_types.h | 35 +++- include/linux/printk.h | 6 + kernel/fork.c | 9 + mm/gmem.c | 363 ++++++++++++++++++++++++++++++++++----- mm/huge_memory.c | 1 - mm/memory.c | 10 +- mm/mmap.c | 148 +++++++++++----- 10 files changed, 496 insertions(+), 129 deletions(-) delete mode 100644 include/linux/gmem_as.h diff --git a/include/linux/gmem.h b/include/linux/gmem.h index b0cdb6d0ab9a..d37e79a7052d 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -121,10 +121,8 @@ struct gm_mmu { unsigned long cookie; /* Synchronize VMA in a peer OS to interact with the host OS */ - enum gm_ret (*peer_va_alloc_fixed)(struct mm_struct *mm, unsigned long va, - unsigned long size, unsigned long prot); - enum gm_ret (*peer_va_free)(struct mm_struct *mm, unsigned long va, - unsigned long size); + enum gm_ret (*peer_va_alloc_fixed)(struct gm_fault_t *gmf); + enum gm_ret (*peer_va_free)(struct gm_fault_t *gmf); /* Create physical mappings on peer host. * If copy is set, copy data [dma_addr, dma_addr + size] to peer host @@ -152,6 +150,9 @@ struct gm_mmu { /* Invalidation functions of the MMU TLB */ enum gm_ret (*tlb_invl)(void *pmap, unsigned long va, unsigned long size); enum gm_ret (*tlb_invl_coalesced)(void *pmap, struct list_head *mappings); + + // copy one area of memory from device to host or from host to device + enum gm_ret (*peer_hmemcpy)(struct gm_memcpy_t *gmc); }; /** @@ -301,6 +302,7 @@ extern unsigned long gm_as_alloc(struct gm_as *as, unsigned long hint, unsigned struct gm_region **new_region); extern int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior); +extern int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size); enum gmem_stats_item { NR_PAGE_MIGRATING_H2D, @@ -339,6 +341,11 @@ static inline struct hnode *get_hnode(unsigned int hnid) return hnodes[hnid]; } +static inline int get_hnuma_id(struct gm_dev *gm_dev) +{ + return first_node(gm_dev->registered_hnodes); +} + void __init hnuma_init(void); unsigned int alloc_hnode_id(void); void free_hnode_id(unsigned int nid); diff --git a/include/linux/gmem_as.h b/include/linux/gmem_as.h deleted file mode 100644 index d691de1162eb..000000000000 --- a/include/linux/gmem_as.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _GMEM_AS_H -#define _GMEM_AS_H - -#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ - -/** - * enum gm_as_alloc - defines different allocation policy for virtual addresses. 
- * - * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. - * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. - * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. - * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, - * beginning where the previous search ended. - */ -enum gm_as_alloc { - GM_AS_ALLOC_DEFAULT = 0, - GM_AS_ALLOC_FIRSTFIT, - GM_AS_ALLOC_BESTFIT, - GM_AS_ALLOC_NEXTFIT, -}; - -/* Defines an address space. */ -struct gm_as { - spinlock_t rbtree_lock; /* spinlock of struct gm_as */ - struct rb_root rbroot; /*root of gm_region_t */ - enum gm_as_alloc policy; - unsigned long start_va; - unsigned long end_va; - /* defines the VA unit size if an object cache is applied */ - unsigned long cache_quantum; - /* tracks device contexts attached to this va space, using gm_as_link */ - struct list_head gm_ctx_list; -}; - -#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index c024f8cd7bdf..5850701096ca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3410,9 +3410,11 @@ unsigned long randomize_page(unsigned long start, unsigned long range); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +#ifdef CONFIG_GMEM extern unsigned long get_unmapped_area_aligned(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long align); +#endif extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e9cd4439e08d..f012f7c7c4d4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,10 +20,6 @@ #include #include -#ifdef CONFIG_GMEM -#include -#endif - #include #include @@ -651,6 +647,37 @@ struct vm_object { */ atomic_t ref_count; }; + +#define GMEM_MMAP_RETRY_TIMES 10 /* gmem retry times before OOM */ + +/** + * enum gm_as_alloc - defines different allocation policy for virtual addresses. + * + * @GM_AS_ALLOC_DEFAULT: An object cache is applied to accelerate VA allocations. + * @GM_AS_ALLOC_FIRSTFIT: Prefer allocation efficiency. + * @GM_AS_ALLOC_BESTFIT: Prefer space efficiency. + * @GM_AS_ALLOC_NEXTFIT: Perform an address-ordered search for free addresses, + * beginning where the previous search ended. + */ +enum gm_as_alloc { + GM_AS_ALLOC_DEFAULT = 0, + GM_AS_ALLOC_FIRSTFIT, + GM_AS_ALLOC_BESTFIT, + GM_AS_ALLOC_NEXTFIT, +}; + +/* Defines an address space. */ +struct gm_as { + spinlock_t rbtree_lock; /* spinlock of struct gm_as */ + struct rb_root rbroot; /*root of gm_region_t */ + enum gm_as_alloc policy; + unsigned long start_va; + unsigned long end_va; + /* defines the VA unit size if an object cache is applied */ + unsigned long cache_quantum; + /* tracks device contexts attached to this va space, using gm_as_link */ + struct list_head gm_ctx_list; +}; #endif struct anon_vma_name { diff --git a/include/linux/printk.h b/include/linux/printk.h index e4878bb58f66..c4fc04998932 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -761,3 +761,9 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) #endif + +#ifdef CONFIG_GMEM +#define gmem_err(fmt, ...) 
\ + ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) + +#endif diff --git a/kernel/fork.c b/kernel/fork.c index cf44a02680d6..7e9612e408da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -115,10 +115,15 @@ #include #include #include + #ifdef CONFIG_FAST_SYSCALL #include #endif +#ifdef CONFIG_GMEM +#include +#endif + #include #define CREATE_TRACE_POINTS @@ -560,6 +565,10 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) void vm_area_free(struct vm_area_struct *vma) { +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) + vm_object_drop_locked(vma); +#endif #ifdef CONFIG_PER_VMA_LOCK call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); #else diff --git a/mm/gmem.c b/mm/gmem.c index c484c2c40101..ce591b9ed8ca 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -87,7 +87,7 @@ static int gmem_stat_init(void) for (i--; i >= 0; i--) percpu_counter_destroy(&g_gmem_stats[i]); - break; /* break the initialization process */ + break; /* break the initialization process */ } } @@ -112,6 +112,7 @@ static int gmem_stats_show(struct seq_file *m, void *arg) #endif /* CONFIG_PROC_FS */ static struct workqueue_struct *prefetch_wq; +static struct workqueue_struct *hmemcpy_wq; #define GM_WORK_CONCURRENCY 4 @@ -140,20 +141,27 @@ static int __init gmem_init(void) err = vm_object_init(); if (err) - goto free_ctx; + goto free_region; err = gmem_stat_init(); if (err) - goto free_ctx; + goto free_region; prefetch_wq = alloc_workqueue("prefetch", - __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | - WQ_CPU_INTENSIVE, - GM_WORK_CONCURRENCY); + __WQ_LEGACY | WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); if (!prefetch_wq) { - pr_info("fail to alloc workqueue prefetch_wq\n"); + gmem_err("fail to alloc workqueue prefetch_wq\n"); err = -EFAULT; - goto free_ctx; + goto free_region; + } + + hmemcpy_wq = alloc_workqueue("hmemcpy", __WQ_LEGACY | WQ_UNBOUND + | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); + if (!hmemcpy_wq) { + gmem_err("fail to alloc workqueue hmemcpy_wq\n"); + err = -EFAULT; + destroy_workqueue(prefetch_wq); + goto free_region; } #ifdef CONFIG_PROC_FS @@ -164,6 +172,8 @@ static int __init gmem_init(void) return 0; +free_region: + kmem_cache_destroy(gm_region_cache); free_ctx: kmem_cache_destroy(gm_ctx_cache); free_dev: @@ -237,25 +247,28 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev struct vm_object *obj; struct gm_mapping *gm_mapping; unsigned long size = HPAGE_SIZE; - struct gm_fault_t gmf = { .mm = mm, - .va = addr, - .dev = dev, - .size = size, - .copy = false, - .behavior = behavior }; + struct gm_fault_t gmf = { + .mm = mm, + .va = addr, + .dev = dev, + .size = size, + .copy = false, + .behavior = behavior + }; struct page *page = NULL; mmap_read_lock(mm); vma = find_vma(mm, addr); - if (!vma) { + if (!vma || vma->vm_start > addr) { + gmem_err("%s failed to find vma by addr %p\n", __func__, (void *)addr); pr_info("gmem: %s no vma\n", __func__); ret = GM_RET_FAILURE_UNKNOWN; goto mmap_unlock; } obj = vma->vm_obj; if (!obj) { - pr_info("gmem: %s no vm_obj\n", __func__); + gmem_err("%s no vm_obj\n", __func__); ret = GM_RET_FAILURE_UNKNOWN; goto mmap_unlock; } @@ -268,6 +281,11 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev } xa_unlock(obj->logical_page_table); + if (unlikely(!gm_mapping)) { + gmem_err("OOM when creating vm_obj!\n"); + ret = GM_RET_NOMEM; + goto mmap_unlock; + } mutex_lock(&gm_mapping->lock); if (gm_mapping_nomap(gm_mapping)) { goto peer_map; @@ -281,16 +299,17 @@ enum gm_ret gm_dev_fault(struct mm_struct 
*mm, unsigned long addr, struct gm_dev } else if (gm_mapping_cpu(gm_mapping)) { page = gm_mapping->page; if (!page) { - pr_err("gmem: host gm_mapping page is NULL. Set nomap\n"); + gmem_err("host gm_mapping page is NULL. Set nomap\n"); gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); goto unlock; } get_page(page); + /* zap_page_range_single can be used in Linux 6.4 and later versions. */ zap_page_range_single(vma, addr, size, NULL); gmf.dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_dev, gmf.dma_addr)) - pr_info("gmem: dma map failed\n"); + gmem_err("dma map failed\n"); gmf.copy = true; } @@ -307,7 +326,7 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); ret = GM_RET_SUCCESS; } else { - pr_err("gmem: peer map failed\n"); + gmem_err("peer map failed\n"); if (page) { gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); put_page(page); @@ -351,7 +370,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, gm_mapping = vm_object_lookup(obj, addr); if (!gm_mapping) { - pr_err("gmem: host fault gm_mapping should not be NULL\n"); + gmem_err("host fault gm_mapping should not be NULL\n"); return VM_FAULT_SIGBUS; } @@ -361,11 +380,11 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, gmf.dma_addr = dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_dev, gmf.dma_addr)) { - pr_err("gmem: host fault dma mapping error\n"); + gmem_err("host fault dma mapping error\n"); return VM_FAULT_SIGBUS; } if (dev->mmu->peer_unmap(&gmf) != GM_RET_SUCCESS) { - pr_err("gmem: peer unmap failed\n"); + gmem_err("peer unmap failed\n"); dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); return VM_FAULT_SIGBUS; } @@ -467,7 +486,7 @@ EXPORT_SYMBOL_GPL(gm_mappings_alloc); /* GMEM Virtual Address Space API */ enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, - unsigned long cache_quantum, struct gm_as **new_as) + unsigned long cache_quantum, struct gm_as **new_as) { struct gm_as *as; @@ -505,7 +524,7 @@ enum gm_ret gm_as_destroy(struct gm_as *as) EXPORT_SYMBOL_GPL(gm_as_destroy); enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode mode, - bool activate, struct gm_context **out_ctx) + bool activate, struct gm_context **out_ctx) { struct gm_context *ctx; int nid; @@ -617,11 +636,11 @@ static void prefetch_work_cb(struct work_struct *work) /* MADV_WILLNEED: dev will soon access this addr. 
*/ ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); if (ret == GM_RET_PAGE_EXIST) { - pr_info("%s: device has done page fault, ignore prefetch\n", + gmem_err("%s: device has done page fault, ignore prefetch\n", __func__); } else if (ret != GM_RET_SUCCESS) { *d->res = -EFAULT; - pr_err("%s: call dev fault error %d\n", __func__, ret); + gmem_err("%s: call dev fault error %d\n", __func__, ret); } } while (addr += page_size, addr != end); @@ -638,8 +657,10 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s unsigned long old_start; /* overflow */ - if (check_add_overflow(addr, size, &end)) + if (check_add_overflow(addr, size, &end)) { + gmem_err("addr plus size will cause overflow!\n"); return -EINVAL; + } old_start = end; @@ -648,8 +669,10 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s start = round_down(addr, page_size); size = end - start; - if (!end && old_start) + if (!end && old_start) { + gmem_err("end addr align up 2M causes invalid addr %p\n", (void *)end); return -EINVAL; + } if (size == 0) return 0; @@ -658,6 +681,12 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s vma = find_vma(current->mm, start); if (!vma || start < vma->vm_start || end > vma->vm_end) { mmap_read_unlock(current->mm); + gmem_err("failed to find vma by invalid start %p or size 0x%zx.\n", + (void *)start, size); + return GM_RET_FAILURE_UNKNOWN; + } else if (!vma_is_peer_shared(vma)) { + mmap_read_unlock(current->mm); + gmem_err("%s the vma does not use VM_PEER_SHARED\n", __func__); return GM_RET_FAILURE_UNKNOWN; } mmap_read_unlock(current->mm); @@ -705,7 +734,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, obj = vma->vm_obj; if (!obj) { - pr_err("gmem: peer-shared vma should have vm_object\n"); + gmem_err("peer-shared vma should have vm_object\n"); return -EINVAL; } @@ -728,8 +757,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, gmf.dev = gm_mapping->dev; ret = gm_mapping->dev->mmu->peer_unmap(&gmf); if (ret) { - pr_err("gmem: peer_unmap failed. ret %d\n", - ret); + gmem_err("peer_unmap failed. ret %d\n", ret); mutex_unlock(&gm_mapping->lock); continue; } @@ -750,31 +778,38 @@ static int hmadvise_do_eagerfree(unsigned long addr, size_t size) unsigned long old_start; /* overflow */ - if (check_add_overflow(addr, size, &end)) + if (check_add_overflow(addr, size, &end)) { + gmem_err("addr plus size will cause overflow!\n"); return -EINVAL; + } old_start = addr; /* Align addr by rounding inward to avoid excessive page release. 
*/ end = round_down(end, page_size); start = round_up(addr, page_size); - if (start >= end) + if (start >= end) { + pr_debug("gmem:start align up 2M >= end align down 2M.\n"); return ret; + } /* Check to see whether len was rounded up from small -ve to zero */ - if (old_start && !start) + if (old_start && !start) { + gmem_err("start addr align up 2M causes invalid addr %p", (void *)start); return -EINVAL; + } mmap_read_lock(current->mm); do { vma = find_vma_intersection(current->mm, start, end); if (!vma) { - pr_info("gmem: there is no valid vma\n"); + gmem_err("gmem: there is no valid vma\n"); break; } if (!vma_is_peer_shared(vma)) { - pr_debug("gmem: not peer-shared vma, skip dontneed\n"); + pr_debug("gmem:not peer-shared vma %p-%p, skip dontneed\n", + (void *)vma->vm_start, (void *)vma->vm_end); start = vma->vm_end; continue; } @@ -806,21 +841,25 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) if (check_hmadvise_behavior(behavior)) { goto no_hnid; } else { - pr_err("hmadvise: behavior %d need hnid or is invalid\n", - behavior); + gmem_err("hmadvise: behavior %d need hnid or is invalid\n", + behavior); return error; } } - if (hnid < 0) + if (hnid < 0) { + gmem_err("hmadvise: invalid hnid %d < 0\n", hnid); return error; + } - if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) + if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + gmem_err("hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); return error; + } node = get_hnode(hnid); if (!node) { - pr_err("hmadvise: hnode id %d is invalid\n", hnid); + gmem_err("hmadvise: hnode id %d is invalid\n", hnid); return error; } @@ -831,9 +870,249 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) case MADV_DONTNEED: return hmadvise_do_eagerfree(start, len_in); default: - pr_err("hmadvise: unsupported behavior %d\n", behavior); + gmem_err("hmadvise: unsupported behavior %d\n", behavior); } return error; } EXPORT_SYMBOL_GPL(hmadvise_inner); + +struct hmemcpy_data { + struct mm_struct *mm; + int hnid; + unsigned long src; + unsigned long dest; + size_t size; + struct work_struct work; +}; + +static bool hnid_match_dest(int hnid, struct gm_mapping *dest) +{ + return (hnid < 0) ? 
gm_mapping_cpu(dest) : gm_mapping_device(dest);
+}
+
+static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest,
+		unsigned long src, size_t size)
+{
+	enum gm_ret ret;
+	int page_size = HPAGE_SIZE;
+	struct vm_area_struct *vma_dest, *vma_src;
+	struct gm_mapping *gm_mapping_dest, *gm_mapping_src;
+	struct gm_dev *dev = NULL;
+	struct hnode *node;
+	struct gm_memcpy_t gmc = {0};
+
+	if (size == 0)
+		return;
+
+	vma_dest = find_vma(mm, dest);
+	vma_src = find_vma(mm, src);
+
+	gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1));
+	gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1));
+
+	if (!gm_mapping_src) {
+		gmem_err("%s: gm_mapping_src is NULL, src=%p; size=0x%zx\n",
+			 __func__, (void *)src, size);
+		return;
+	}
+
+	if (hnid != -1) {
+		node = get_hnode(hnid);
+		if (node)
+			dev = node->dev;
+		if (!dev) {
+			gmem_err("%s: hnode's dev is NULL\n", __func__);
+			return;
+		}
+	}
+
+	// Trigger dest page fault on host or device
+	if (!gm_mapping_dest || gm_mapping_nomap(gm_mapping_dest)
+			|| !hnid_match_dest(hnid, gm_mapping_dest)) {
+		if (hnid == -1) {
+			mmap_read_lock(mm);
+			handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER |
+					FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL);
+			mmap_read_unlock(mm);
+		} else {
+			ret = gm_dev_fault(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED);
+			if (ret != GM_RET_SUCCESS) {
+				gmem_err("%s: gm_dev_fault failed\n", __func__);
+				return;
+			}
+		}
+	}
+	if (!gm_mapping_dest)
+		gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size));
+
+	if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src)
+		mutex_lock(&gm_mapping_dest->lock);
+	mutex_lock(&gm_mapping_src->lock);
+	// Use memcpy when there is no device address, otherwise use peer_hmemcpy
+	if (hnid == -1) {
+		if (gm_mapping_cpu(gm_mapping_src)) { // host to host
+			memcpy(page_to_virt(gm_mapping_dest->page) + (dest & (page_size - 1)),
+			       page_to_virt(gm_mapping_src->page) + (src & (page_size - 1)),
+			       size);
+			goto unlock;
+		} else { // device to host
+			dev = gm_mapping_src->dev;
+			gmc.dma_addr = phys_to_dma(dev->dma_dev,
+				page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1)));
+			gmc.src = src;
+		}
+	} else {
+		if (gm_mapping_cpu(gm_mapping_src)) { // host to device
+			gmc.dest = dest;
+			gmc.dma_addr = phys_to_dma(dev->dma_dev,
+				page_to_phys(gm_mapping_src->page) + (src & (page_size - 1)));
+		} else { // device to device
+			if (dev == gm_mapping_src->dev) { // same device
+				gmc.dest = dest;
+				gmc.src = src;
+			} else { // TODO: different devices
+				gmem_err("%s: device to device is unimplemented\n", __func__);
+				goto unlock;
+			}
+		}
+	}
+	gmc.mm = mm;
+	gmc.dev = dev;
+	gmc.size = size;
+	dev->mmu->peer_hmemcpy(&gmc);
+
+unlock:
+	mutex_unlock(&gm_mapping_src->lock);
+	if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src)
+		mutex_unlock(&gm_mapping_dest->lock);
+}
+
+/*
+ * Each page needs to be copied in three parts when the address is not aligned.
+ * | <--a-->| |
+ * | -------|--------- |
+ * | / /| / / |
+ * | / / | / / |
+ * | / / |/ / |
+ * | ----------|------ |
+ * | <----b--->| |
+ * |<----page x---->|<----page y---->|
+ */
+
+static void hmemcpy_work_cb(struct work_struct *work)
+{
+	size_t i;
+	int remain, a, b, page_size = HPAGE_SIZE;
+	struct hmemcpy_data *d = container_of(work, struct hmemcpy_data, work);
+	unsigned long src = d->src, dest = d->dest;
+
+	a = min(page_size - (src & (page_size - 1)), page_size - (dest & (page_size - 1)));
+	b = max(page_size - (src & (page_size - 1)), page_size - (dest & (page_size - 1)));
+
+	for (i = page_size; i < d->size; i += page_size) {
+		if (a != 0)
+			do_hmemcpy(d->mm, d->hnid, dest, src, a);
+		if (b - a != 0)
+			do_hmemcpy(d->mm, d->hnid, dest + a, src + a, b - a);
+		if (page_size - b != 0)
+			do_hmemcpy(d->mm, d->hnid, dest + b, src + b, page_size - b);
+		src += page_size;
+		dest += page_size;
+	}
+
+	remain = d->size + page_size - i;
+	if (remain == 0)
+		goto out;
+
+	if (remain < a) {
+		do_hmemcpy(d->mm, d->hnid, dest, src, remain);
+	} else if (remain < b) {
+		do_hmemcpy(d->mm, d->hnid, dest, src, a);
+		do_hmemcpy(d->mm, d->hnid, dest + a, src + a, remain - a);
+	} else {
+		do_hmemcpy(d->mm, d->hnid, dest, src, a);
+		do_hmemcpy(d->mm, d->hnid, dest + a, src + a, b - a);
+		do_hmemcpy(d->mm, d->hnid, dest + b, src + b, remain - b);
+	}
+
+out:
+	kfree(d);
+}
+
+int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size)
+{
+	int page_size = HPAGE_SIZE;
+	unsigned long per_size, copied = 0;
+	struct hmemcpy_data *data;
+	struct vm_area_struct *vma_dest, *vma_src;
+
+	if (hnid < 0) {
+		if (hnid != -1) {
+			gmem_err("hmemcpy: invalid hnid %d < 0\n", hnid);
+			return -EINVAL;
+		}
+	} else if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) {
+		gmem_err("hmemcpy: can't find hnode by hnid:%d or hnode is not allowed\n",
+			 hnid);
+		return -EINVAL;
+	}
+
+	vma_dest = find_vma(current->mm, dest);
+	vma_src = find_vma(current->mm, src);
+
+	if (!vma_src || vma_src->vm_start > src || !vma_is_peer_shared(vma_src)
+			|| vma_src->vm_end < (src + size)) {
+		gmem_err("failed to find peer_shared vma by invalid src:%p or size: 0x%zx",
+			 (void *)src, size);
+		return -EINVAL;
+	}
+
+	if (!vma_dest || vma_dest->vm_start > dest || !vma_is_peer_shared(vma_dest)
+			|| vma_dest->vm_end < (dest + size)) {
+		gmem_err("failed to find peer_shared vma by invalid dest:%p or size: 0x%zx",
+			 (void *)dest, size);
+		return -EINVAL;
+	}
+
+	if (!(vma_dest->vm_flags & VM_WRITE)) {
+		gmem_err("dest is not writable.\n");
+		return -EINVAL;
+	}
+
+	per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1);
+
+	while (copied < size) {
+		data = kzalloc(sizeof(struct hmemcpy_data), GFP_KERNEL);
+		if (data == NULL) {
+			flush_workqueue(hmemcpy_wq);
+			return GM_RET_NOMEM;
+		}
+		INIT_WORK(&data->work, hmemcpy_work_cb);
+		data->mm = current->mm;
+		data->hnid = hnid;
+		data->src = src;
+		data->dest = dest;
+		if (per_size == 0) {
+			data->size = size;
+		} else {
+			// The last chunk absorbs the remainder (up to just under 2 * per_size)
+			data->size = (size - copied < 2 * per_size) ?
(size - copied) : per_size; + } + + queue_work(hmemcpy_wq, &data->work); + src += data->size; + dest += data->size; + copied += data->size; + } + + flush_workqueue(hmemcpy_wq); + return 0; +} +EXPORT_SYMBOL_GPL(hmemcpy); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 437182b72cd6..f4613cf7c6dc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1689,7 +1689,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return __do_huge_pmd_anonymous_page(vmf); } - static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, pgtable_t pgtable) diff --git a/mm/memory.c b/mm/memory.c index ef556a62670e..568f3e295fdb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1829,8 +1829,11 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, goto next; /* fall through */ } - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); @@ -1850,8 +1853,11 @@ static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_none_or_clear_bad(p4d)) + if (p4d_none_or_clear_bad(p4d)) { + if (vma_is_peer_shared(vma)) + zap_logic_pud_range(vma, addr, next); continue; + } next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); diff --git a/mm/mmap.c b/mm/mmap.c index cb36c96a9619..a50ea15e6c81 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1524,7 +1524,26 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_SHARED) && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED and MAP_SHARE cannot be used together.\n"); + goto out_fput; + } + if (gmem_is_enabled() && (flags & MAP_HUGETLB) && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED and MAP_HUGETLB cannot be used together.\n"); + goto out_fput; + } +#endif if (!(flags & MAP_ANONYMOUS)) { +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) { + retval = -EINVAL; + gmem_err(" MAP_PEER_SHARED cannot map file page.\n"); + goto out_fput; + } +#endif audit_mmap_fd(fd, flags); file = fget(fd); if (!file) @@ -2620,10 +2639,10 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, } #ifdef CONFIG_GMEM -static void munmap_in_peer_devices(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long start, unsigned long end) +static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) { - unsigned long addr = start; + unsigned long start, end, addr; struct vm_object *obj = vma->vm_obj; enum gm_ret ret; struct gm_context *ctx, *tmp; @@ -2634,9 +2653,20 @@ static void munmap_in_peer_devices(struct mm_struct *mm, .copy = false, }; + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return; + addr = start; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + if (!obj) return; + if (!mm->gm_as) + return; + do { xa_lock(obj->logical_page_table); gm_mapping = vm_object_lookup(obj, addr); @@ -2657,28 +2687,40 @@ static void munmap_in_peer_devices(struct mm_struct *mm, gmf.dev = gm_mapping->dev; ret = gm_mapping->dev->mmu->peer_unmap(&gmf); if (ret != 
GM_RET_SUCCESS) { - pr_err("%s: call dev peer_unmap error %d\n", __func__, ret); + gmem_err("%s: call dev peer_unmap error %d\n", __func__, ret); mutex_unlock(&gm_mapping->lock); continue; } mutex_unlock(&gm_mapping->lock); } while (addr += HPAGE_SIZE, addr != end); - if (!mm->gm_as) - return; - list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { if (!gm_dev_is_peer(ctx->dev)) continue; if (!ctx->dev->mmu->peer_va_free) continue; - ret = ctx->dev->mmu->peer_va_free(mm, start, end - start); + gmf.va = start; + gmf.size = end - start; + gmf.dev = ctx->dev; + + ret = ctx->dev->mmu->peer_va_free(&gmf); if (ret != GM_RET_SUCCESS) pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", start, end - start, ret); } } + +static void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + VMA_ITERATOR(vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + if (vma_is_peer_shared(vma)) + munmap_single_vma_in_peer_devices(mm, vma, start, end); + } +} #endif /* @@ -2755,6 +2797,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, locked_vm += vma_pages(next); count++; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + munmap_single_vma_in_peer_devices(mm, vma, start, end); +#endif if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2813,7 +2859,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, next = vma_next(vmi); #ifdef CONFIG_GMEM if (gmem_is_enabled()) - munmap_in_peer_devices(mm, vma, start, end); + munmap_in_peer_devices(mm, start, end); #endif if (next) vma_iter_prev_range(vmi); @@ -2873,17 +2919,23 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long end; struct vm_area_struct *vma; +#ifdef CONFIG_GMEM + struct vm_area_struct *vma_end; if (gmem_is_enabled()) { vma = find_vma_intersection(mm, start, start + len); + vma_end = find_vma(mm, start + len); if (!vma) return 0; if (vma_is_peer_shared(vma)) { if (!IS_ALIGNED(start, PMD_SIZE)) return -EINVAL; - - len = round_up(len, SZ_2M); } + /* Prevents partial release of the peer_share page. */ + if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) + len = round_up(len, SZ_2M); } +#endif + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2922,16 +2974,19 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, } #ifdef CONFIG_GMEM -static int alloc_va_in_peer_devices(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long addr, unsigned long len, - vm_flags_t vm_flags) +static int alloc_va_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, vm_flags_t vm_flags) { struct gm_context *ctx, *tmp; enum gm_ret ret; - pr_debug("gmem: start mmap, as %p\n", mm->gm_as); - if (!mm->gm_as) - return -ENODEV; + if (!mm->gm_as) { + ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, PAGE_SIZE, + &mm->gm_as); + if (ret) + return ret; + } + pr_debug("gmem: start mmap, as %p\n", (void *)mm->gm_as); if (!vma->vm_obj) vma->vm_obj = vm_object_create(vma); @@ -2941,7 +2996,16 @@ static int alloc_va_in_peer_devices(struct mm_struct *mm, * TODO: consider the concurrency problem of device * attaching/detaching from the gm_as. 
*/ + ret = -ENODEV; list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + struct gm_fault_t gmf = { + .mm = mm, + .dev = ctx->dev, + .va = addr, + .size = len, + .prot = vm_flags, + }; + if (!gm_dev_is_peer(ctx->dev)) continue; @@ -2949,16 +3013,15 @@ static int alloc_va_in_peer_devices(struct mm_struct *mm, pr_debug("gmem: mmu ops has no alloc_vma\n"); continue; } - pr_debug("gmem: call vma_alloc\n"); - ret = ctx->dev->mmu->peer_va_alloc_fixed(mm, addr, len, vm_flags); + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); if (ret != GM_RET_SUCCESS) { pr_debug("gmem: alloc_vma ret %d\n", ret); return ret; } } - return GM_RET_SUCCESS; + return ret; } #endif @@ -2995,7 +3058,8 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) { #ifdef CONFIG_GMEM - gmem_release_vma(mm, &reserve_list); + if (gmem_is_enabled()) + gmem_release_vma(mm, &reserve_list); #endif return -ENOMEM; } @@ -3004,7 +3068,8 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, /* Unmap any existing mapping in the area */ if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { #ifdef CONFIG_GMEM - gmem_release_vma(mm, &reserve_list); + if (gmem_is_enabled()) + gmem_release_vma(mm, &reserve_list); #endif return -ENOMEM; } @@ -3016,7 +3081,8 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, charged = len >> PAGE_SHIFT; if (security_vm_enough_memory_mm(mm, charged)) { #ifdef CONFIG_GMEM - gmem_release_vma(mm, &reserve_list); + if (gmem_is_enabled()) + gmem_release_vma(mm, &reserve_list); #endif return -ENOMEM; } @@ -3083,6 +3149,24 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + enum gm_ret ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); + + if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + addr = get_unmapped_area(file, addr, len, pgoff, 0); + gmem_reserve_vma(vma, &reserve_list); + goto retry; + } else if (ret != GM_RET_SUCCESS) { + pr_debug("gmem: alloc_vma ret %d\n", ret); + error = -ENOMEM; + goto free_vma; + } + gmem_release_vma(mm, &reserve_list); + } +#endif + if (vma_iter_prealloc(&vmi, vma)) { error = -ENOMEM; goto free_vma; @@ -3162,23 +3246,6 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, file = vma->vm_file; ksm_add_vma(vma); expanded: -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) { - enum gm_ret ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); - - if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { - retry_times++; - addr = get_unmapped_area(file, addr, len, pgoff, 0); - gmem_reserve_vma(vma, &reserve_list); - goto retry; - } else if (ret != GM_RET_SUCCESS) { - pr_debug("gmem: alloc_vma ret %d\n", ret); - error = -ENOMEM; - goto free_vma; - } - gmem_release_vma(mm, &reserve_list); - } -#endif perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -3223,6 +3290,7 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, if (charged) vm_unacct_memory(charged); #ifdef CONFIG_GMEM + if (gmem_is_enabled()) gmem_release_vma(mm, &reserve_list); #endif return error; -- Gitee From 91bd4469ec1c9c6f007d6a825656e3e28e91fb5c Mon Sep 17 00:00:00 2001 From: nicunshu Date: Sat, 26 Jul 2025 17:40:11 +0800 Subject: [PATCH 20/27] mm: gmem: optimize hmemcpy and other 
gmem functions euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- optimize multiple gmem related functions Signed-off-by: nicunshu --- include/linux/gmem.h | 16 +- include/linux/mman.h | 3 +- include/linux/printk.h | 6 - kernel/fork.c | 4 +- mm/gmem.c | 400 ++++++++++++++++++----------------------- mm/memory.c | 19 +- mm/mempolicy.c | 6 +- mm/mmap.c | 205 +++++++-------------- mm/mprotect.c | 18 +- mm/util.c | 138 +++++++++++++- mm/vm_object.c | 2 +- 11 files changed, 421 insertions(+), 396 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index d37e79a7052d..7beebc67c398 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -278,20 +278,18 @@ static inline bool gm_mapping_pinned(struct gm_mapping *gm_mapping) /* GMEM Device KPI */ extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, struct gm_dev **new_dev); -extern enum gm_ret gm_dev_destroy(struct gm_dev *dev); extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end); -enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, - int behavior); +enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, + struct gm_dev *dev, int behavior); vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); /* GMEM address space KPI */ extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end); extern void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid); -extern struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order); extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as); extern enum gm_ret gm_as_destroy(struct gm_as *as); @@ -322,8 +320,6 @@ struct hnode { struct xarray pages; }; -extern struct hnode *hnodes[]; - static inline bool is_hnode(int node) { return (node < MAX_NUMNODES) && !node_isset(node, node_possible_map) && @@ -336,11 +332,6 @@ static inline bool is_hnode_allowed(int node) node_isset(node, current->mems_allowed); } -static inline struct hnode *get_hnode(unsigned int hnid) -{ - return hnodes[hnid]; -} - static inline int get_hnuma_id(struct gm_dev *gm_dev) { return first_node(gm_dev->registered_hnodes); @@ -352,4 +343,7 @@ void free_hnode_id(unsigned int nid); void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); void hnode_deinit(unsigned int hnid, struct gm_dev *dev); +#define gmem_err(fmt, ...) 
\ + ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) + #endif /* _GMEM_H */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 8ddca62d6460..30ec68346f6b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -55,7 +55,8 @@ | MAP_32BIT \ | MAP_ABOVE4G \ | MAP_HUGE_2MB \ - | MAP_HUGE_1GB) + | MAP_HUGE_1GB \ + | MAP_PEER_SHARED) extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; diff --git a/include/linux/printk.h b/include/linux/printk.h index c4fc04998932..e4878bb58f66 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -761,9 +761,3 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) #endif - -#ifdef CONFIG_GMEM -#define gmem_err(fmt, ...) \ - ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) - -#endif diff --git a/kernel/fork.c b/kernel/fork.c index 7e9612e408da..7c7f87bd1110 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1784,7 +1784,9 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, err = dup_mmap(mm, oldmm); if (err) goto free_pt; - +#ifdef CONFIG_GMEM + mm->gm_as = NULL; +#endif mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; diff --git a/mm/gmem.c b/mm/gmem.c index ce591b9ed8ca..039f4cfe28db 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -53,6 +53,9 @@ static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); static bool enable_gmem; +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + static inline unsigned long pe_mask(unsigned int order) { if (order == 0) @@ -77,15 +80,17 @@ void gmem_stats_counter(enum gmem_stats_item item, int val) percpu_counter_add(&g_gmem_stats[item], val); } -static int gmem_stat_init(void) +static int gmem_stats_init(void) { int i, rc; for (i = 0; i < NR_GMEM_STAT_ITEMS; i++) { rc = percpu_counter_init(&g_gmem_stats[i], 0, GFP_KERNEL); if (rc) { - for (i--; i >= 0; i--) - percpu_counter_destroy(&g_gmem_stats[i]); + int j; + + for (j = i-1; j >= 0; j--) + percpu_counter_destroy(&g_gmem_stats[j]); break; /* break the initialization process */ } @@ -112,7 +117,6 @@ static int gmem_stats_show(struct seq_file *m, void *arg) #endif /* CONFIG_PROC_FS */ static struct workqueue_struct *prefetch_wq; -static struct workqueue_struct *hmemcpy_wq; #define GM_WORK_CONCURRENCY 4 @@ -143,7 +147,7 @@ static int __init gmem_init(void) if (err) goto free_region; - err = gmem_stat_init(); + err = gmem_stats_init(); if (err) goto free_region; @@ -155,17 +159,8 @@ static int __init gmem_init(void) goto free_region; } - hmemcpy_wq = alloc_workqueue("hmemcpy", __WQ_LEGACY | WQ_UNBOUND - | WQ_HIGHPRI | WQ_CPU_INTENSIVE, GM_WORK_CONCURRENCY); - if (!hmemcpy_wq) { - gmem_err("fail to alloc workqueue hmemcpy_wq\n"); - err = -EFAULT; - destroy_workqueue(prefetch_wq); - goto free_region; - } - #ifdef CONFIG_PROC_FS - proc_create_single("gmemstat", 0444, NULL, gmem_stats_show); + proc_create_single("gmemstats", 0444, NULL, gmem_stats_show); #endif static_branch_enable(&gmem_status); @@ -227,18 +222,9 @@ enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned long cap, } EXPORT_SYMBOL_GPL(gm_dev_create); -// Destroy a GMEM device and reclaim the resources. 
-enum gm_ret gm_dev_destroy(struct gm_dev *dev) -{ - // TODO: implement it - xa_erase(&gm_dev_id_pool, dev->id); - return GM_RET_SUCCESS; -} -EXPORT_SYMBOL_GPL(gm_dev_destroy); - -/* Handle the page fault triggered by a given device */ -enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, - int behavior) +/* Handle the page fault triggered by a given device with mmap lock*/ +enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, + int behavior) { enum gm_ret ret = GM_RET_SUCCESS; struct gm_mmu *mmu = dev->mmu; @@ -257,20 +243,18 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev }; struct page *page = NULL; - mmap_read_lock(mm); - vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) { - gmem_err("%s failed to find vma by addr %p\n", __func__, (void *)addr); + gmem_err("%s failed to find vma\n", __func__); pr_info("gmem: %s no vma\n", __func__); ret = GM_RET_FAILURE_UNKNOWN; - goto mmap_unlock; + goto out; } obj = vma->vm_obj; if (!obj) { gmem_err("%s no vm_obj\n", __func__); ret = GM_RET_FAILURE_UNKNOWN; - goto mmap_unlock; + goto out; } xa_lock(obj->logical_page_table); @@ -284,7 +268,7 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev if (unlikely(!gm_mapping)) { gmem_err("OOM when creating vm_obj!\n"); ret = GM_RET_NOMEM; - goto mmap_unlock; + goto out; } mutex_lock(&gm_mapping->lock); if (gm_mapping_nomap(gm_mapping)) { @@ -344,11 +328,10 @@ enum gm_ret gm_dev_fault(struct mm_struct *mm, unsigned long addr, struct gm_dev gm_mapping->dev = dev; unlock: mutex_unlock(&gm_mapping->lock); -mmap_unlock: - mmap_read_unlock(mm); +out: return ret; } -EXPORT_SYMBOL_GPL(gm_dev_fault); +EXPORT_SYMBOL_GPL(gm_dev_fault_locked); vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order) @@ -393,6 +376,24 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, return ret; } +static inline struct hnode *get_hnode(unsigned int hnid) +{ + return hnodes[hnid]; +} + +static struct gm_dev *get_gm_dev(unsigned int nid) +{ + struct hnode *hnode; + struct gm_dev *dev = NULL; + + spin_lock(&hnode_lock); + hnode = get_hnode(nid); + if (hnode) + dev = hnode->dev; + spin_unlock(&hnode_lock); + return dev; +} + /* * Register the local physical memory of a gmem device. 
* This implies dynamically creating @@ -409,15 +410,16 @@ enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, uns if (!hnode) goto err; + mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); + if (!mapping) + goto free_hnode; + + spin_lock(&hnode_lock); nid = alloc_hnode_id(); if (nid == MAX_NUMNODES) - goto free_hnode; + goto unlock_hnode; hnode_init(hnode, nid, dev); - mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); - if (!mapping) - goto deinit_hnode; - for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { mapping[i].pfn = addr >> PAGE_SHIFT; mapping[i].flag = 0; @@ -436,11 +438,14 @@ enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, uns } xa_unlock(&hnode->pages); + spin_unlock(&hnode_lock); return GM_RET_SUCCESS; deinit_hnode: hnode_deinit(nid, dev); free_hnode_id(nid); +unlock_hnode: + spin_unlock(&hnode_lock); free_hnode: kfree(hnode); err: @@ -450,40 +455,31 @@ EXPORT_SYMBOL_GPL(gm_dev_register_physmem); void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid) { - struct hnode *hnode = get_hnode(nid); - struct gm_mapping *mapping = xa_load(&hnode->pages, 0); + struct hnode *hnode = NULL; + struct gm_mapping *mapping = NULL; + + spin_lock(&hnode_lock); + + if (!node_isset(nid, dev->registered_hnodes)) + goto unlock; + + hnode = get_hnode(nid); + + if (!hnode) + goto unlock; + mapping = xa_load(&hnode->pages, 0); + + if (mapping) + kvfree(mapping); - kvfree(mapping); hnode_deinit(nid, dev); free_hnode_id(nid); kfree(hnode); +unlock: + spin_unlock(&hnode_lock); } EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); -struct gm_mapping *gm_mappings_alloc(unsigned int nid, unsigned int order) -{ - struct gm_mapping *mapping; - struct hnode *node = get_hnode(nid); - XA_STATE(xas, &node->pages, 0); - - /* TODO: support order > 0 */ - if (order != 0) - return ERR_PTR(-EINVAL); - - xa_lock(&node->pages); - mapping = xas_find_marked(&xas, ULONG_MAX, XA_MARK_0); - if (!mapping) { - xa_unlock(&node->pages); - return ERR_PTR(-ENOMEM); - } - - xas_clear_mark(&xas, XA_MARK_0); - xa_unlock(&node->pages); - - return mapping; -} -EXPORT_SYMBOL_GPL(gm_mappings_alloc); - /* GMEM Virtual Address Space API */ enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as) @@ -569,32 +565,30 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode } EXPORT_SYMBOL_GPL(gm_as_attach); -DEFINE_SPINLOCK(hnode_lock); -struct hnode *hnodes[MAX_NUMNODES]; - void __init hnuma_init(void) { unsigned int node; - + spin_lock(&hnode_lock); for_each_node(node) node_set(node, hnode_map); + spin_unlock(&hnode_lock); } unsigned int alloc_hnode_id(void) { unsigned int node; - spin_lock(&hnode_lock); node = first_unset_node(hnode_map); node_set(node, hnode_map); - spin_unlock(&hnode_lock); return node; } void free_hnode_id(unsigned int nid) { + spin_lock(&hnode_lock); node_clear(nid, hnode_map); + spin_unlock(&hnode_lock); } void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) @@ -634,7 +628,9 @@ static void prefetch_work_cb(struct work_struct *work) do { /* MADV_WILLNEED: dev will soon access this addr. 
*/ - ret = gm_dev_fault(d->mm, addr, d->dev, MADV_WILLNEED); + mmap_read_lock(d->mm); + ret = gm_dev_fault_locked(d->mm, addr, d->dev, MADV_WILLNEED); + mmap_read_unlock(d->mm); if (ret == GM_RET_PAGE_EXIST) { gmem_err("%s: device has done page fault, ignore prefetch\n", __func__); @@ -670,7 +666,7 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s size = end - start; if (!end && old_start) { - gmem_err("end addr align up 2M causes invalid addr %p\n", (void *)end); + gmem_err("end addr align up 2M causes invalid addr\n"); return -EINVAL; } @@ -681,8 +677,7 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s vma = find_vma(current->mm, start); if (!vma || start < vma->vm_start || end > vma->vm_end) { mmap_read_unlock(current->mm); - gmem_err("failed to find vma by invalid start %p or size 0x%zx.\n", - (void *)start, size); + gmem_err("failed to find vma by invalid start or size.\n"); return GM_RET_FAILURE_UNKNOWN; } else if (!vma_is_peer_shared(vma)) { mmap_read_unlock(current->mm); @@ -795,7 +790,7 @@ static int hmadvise_do_eagerfree(unsigned long addr, size_t size) /* Check to see whether len was rounded up from small -ve to zero */ if (old_start && !start) { - gmem_err("start addr align up 2M causes invalid addr %p", (void *)start); + gmem_err("start addr align up 2M causes invalid addr"); return -EINVAL; } @@ -808,8 +803,7 @@ static int hmadvise_do_eagerfree(unsigned long addr, size_t size) } if (!vma_is_peer_shared(vma)) { - pr_debug("gmem:not peer-shared vma %p-%p, skip dontneed\n", - (void *)vma->vm_start, (void *)vma->vm_end); + pr_debug("gmem:not peer-shared vma, skip dontneed\n"); start = vma->vm_end; continue; } @@ -835,7 +829,7 @@ static bool check_hmadvise_behavior(int behavior) int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) { int error = -EINVAL; - struct hnode *node; + struct gm_dev *dev = NULL; if (hnid == -1) { if (check_hmadvise_behavior(behavior)) { @@ -857,8 +851,8 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) return error; } - node = get_hnode(hnid); - if (!node) { + dev = get_gm_dev(hnid); + if (!dev) { gmem_err("hmadvise: hnode id %d is invalid\n", hnid); return error; } @@ -866,7 +860,7 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) no_hnid: switch (behavior) { case MADV_PREFETCH: - return hmadvise_do_prefetch(node->dev, start, len_in); + return hmadvise_do_prefetch(dev, start, len_in); case MADV_DONTNEED: return hmadvise_do_eagerfree(start, len_in); default: @@ -877,15 +871,6 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) } EXPORT_SYMBOL_GPL(hmadvise_inner); -struct hmemcpy_data { - struct mm_struct *mm; - int hnid; - unsigned long src; - unsigned long dest; - size_t size; - struct work_struct work; -}; - static bool hnid_match_dest(int hnid, struct gm_mapping *dest) { return (hnid < 0) ? 
gm_mapping_cpu(dest) : gm_mapping_device(dest); @@ -897,84 +882,82 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, enum gm_ret ret; int page_size = HPAGE_SIZE; struct vm_area_struct *vma_dest, *vma_src; - struct gm_mapping *gm_mmaping_dest, *gm_mmaping_src; + struct gm_mapping *gm_mapping_dest, *gm_mapping_src; struct gm_dev *dev = NULL; - struct hnode *node; struct gm_memcpy_t gmc = {0}; if (size == 0) return; + mmap_read_lock(mm); vma_dest = find_vma(mm, dest); vma_src = find_vma(mm, src); - gm_mmaping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); - gm_mmaping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + if (!vma_src || vma_src->vm_start > src || !vma_dest || vma_dest->vm_start > dest) { + gmem_err("hmemcpy: the vma find by src/dest is NULL!\n"); + goto unlock_mm; + } - if (!gm_mmaping_src) { - gmem_err("%s: gm_mmaping_src is NULL, src=%p; size=0x%zx\n", - __func__, (void *)src, size); - return; + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1)); + gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); + + if (!gm_mapping_src) { + gmem_err("hmemcpy: gm_mapping_src is NULL\n"); + goto unlock_mm; } if (hnid != -1) { - node = get_hnode(hnid); - if (node) - dev = node->dev; + dev = get_gm_dev(hnid); if (!dev) { - gmem_err("%s: hnode's dev is NULL\n", __func__); - return; + gmem_err("hmemcpy: hnode's dev is NULL\n"); + goto unlock_mm; } } // Trigger dest page fault on host or device - if (!gm_mmaping_dest || gm_mapping_nomap(gm_mmaping_dest) - || !hnid_match_dest(hnid, gm_mmaping_dest)) { + if (!gm_mapping_dest || gm_mapping_nomap(gm_mapping_dest) + || !hnid_match_dest(hnid, gm_mapping_dest)) { if (hnid == -1) { - mmap_read_lock(mm); - handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | - FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); - mmap_read_unlock(mm); + ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | + FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); + if (ret) { + gmem_err("%s: failed to execute host page fault, ret:%d\n", + __func__, ret); + goto unlock_mm; + } } else { - ret = gm_dev_fault(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); + ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); if (ret != GM_RET_SUCCESS) { - gmem_err("%s: gm_dev_fault failed\n", __func__); - return; + gmem_err("%s: failed to excecute dev page fault.\n", __func__); + goto unlock_mm; } } } - if (!gm_mmaping_dest) - gm_mmaping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); + if (!gm_mapping_dest) + gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size)); - if (gm_mmaping_dest && gm_mmaping_dest != gm_mmaping_src) - mutex_lock(&gm_mmaping_dest->lock); - mutex_lock(&gm_mmaping_src->lock); + if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) + mutex_lock(&gm_mapping_dest->lock); + mutex_lock(&gm_mapping_src->lock); // Use memcpy when there is no device address, otherwise use peer_memcpy if (hnid == -1) { - if (gm_mapping_cpu(gm_mmaping_src)) { // host to host - memcpy(page_to_virt(gm_mmaping_dest->page) + (dest & (page_size - 1)), - page_to_virt(gm_mmaping_src->page) + (src & (page_size - 1)), - size); - goto unlock; + if (gm_mapping_cpu(gm_mapping_src)) { // host to host + gmem_err("hmemcpy: host to host is unimplemented\n"); + goto unlock_gm_mmaping; } else { // device to host - dev = gm_mmaping_src->dev; + dev = gm_mapping_src->dev; gmc.dma_addr = 
phys_to_dma(dev->dma_dev, - page_to_phys(gm_mmaping_dest->page) + (dest & (page_size - 1))); + page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1))); gmc.src = src; } } else { - if (gm_mapping_cpu(gm_mmaping_src)) { // host to device + if (gm_mapping_cpu(gm_mapping_src)) { // host to device gmc.dest = dest; gmc.dma_addr = phys_to_dma(dev->dma_dev, - page_to_phys(gm_mmaping_src->page) + (src & (page_size - 1))); + page_to_phys(gm_mapping_src->page) + (src & (page_size - 1))); } else { // device to device - if (dev == gm_mmaping_src->dev) { // same device - gmc.dest = dest; - gmc.src = src; - } else { // TODO: different devices - gmem_err("%s: device to device is unimplemented\n", __func__); - goto unlock; - } + gmem_err("hmemcpy: device to device is unimplemented\n"); + goto unlock_gm_mmaping; } } gmc.mm = mm; @@ -982,137 +965,100 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, gmc.size = size; dev->mmu->peer_hmemcpy(&gmc); -unlock: - mutex_unlock(&gm_mmaping_src->lock); - if (gm_mmaping_dest && gm_mmaping_dest != gm_mmaping_src) - mutex_unlock(&gm_mmaping_dest->lock); +unlock_gm_mmaping: + mutex_unlock(&gm_mapping_src->lock); + if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src) + mutex_unlock(&gm_mapping_dest->lock); +unlock_mm: + mmap_read_unlock(mm); } /* * Each page needs to be copied in three parts when the address is not aligned. - * | <--a-->| | + * | ml <--0-->|<1><--2-> | * | -------|--------- | * | / /| / / | * | / / | / / | * | / / |/ / | * | ----------|------ | - * | <----b--->| | + * | | | * |<----page x---->|<----page y---->| */ -static void hmemcpy_work_cb(struct work_struct *work) +static void __hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) { - size_t i; - int remain, a, b, page_size = HPAGE_SIZE; - struct hmemcpy_data *d = container_of(work, struct hmemcpy_data, work); - unsigned long src = d->src, dest = d->dest; - - a = min(page_size - (src & (page_size - 1)), page_size - (dest & (page_size - 1))); - b = max(page_size - (src & (page_size - 1)), page_size - (dest & (page_size - 1))); - - for (i = page_size; i < d->size; i += page_size) { - if (a != 0) - do_hmemcpy(d->mm, d->hnid, dest, src, a); - if (b - a != 0) - do_hmemcpy(d->mm, d->hnid, dest + a, src + a, b - a); - if (page_size - b != 0) - do_hmemcpy(d->mm, d->hnid, dest + b, src + b, page_size - b); - src += page_size; - dest += page_size; - } + int i = 0; + // offsets within the huge page for the source and destination addresses + int src_offset = src & (HPAGE_SIZE - 1); + int dst_offset = dest & (HPAGE_SIZE - 1); + // Divide each page into three parts according to the align + int ml[3] = { + HPAGE_SIZE - (src_offset < dst_offset ? dst_offset : src_offset), + src_offset < dst_offset ? (dst_offset - src_offset) : (src_offset - dst_offset), + src_offset < dst_offset ? 
src_offset : dst_offset + }; + struct mm_struct *mm = current->mm; - remain = d->size + page_size - i; - if (remain == 0) - goto out; + if (size == 0) + return; - if (remain < a) { - do_hmemcpy(d->mm, d->hnid, dest, src, remain); - } else if (remain < b) { - do_hmemcpy(d->mm, d->hnid, dest, src, a); - do_hmemcpy(d->mm, d->hnid, dest + a, src + a, remain - a); - } else { - do_hmemcpy(d->mm, d->hnid, dest, src, a); - do_hmemcpy(d->mm, d->hnid, dest + a, src + a, b - a); - do_hmemcpy(d->mm, d->hnid, dest + b, src + b, remain - b); + while (size >= ml[i]) { + if (ml[i] > 0) { + do_hmemcpy(mm, hnid, dest, src, ml[i]); + src += ml[i]; + dest += ml[i]; + size -= ml[i]; + } + i = (i + 1) % 3; } -out: - kfree(d); + if (size > 0) + do_hmemcpy(mm, hnid, dest, src, size); } int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) { - int page_size = HPAGE_SIZE; - unsigned long per_size, copied = 0; - struct hmemcpy_data *data; struct vm_area_struct *vma_dest, *vma_src; + struct mm_struct *mm = current->mm; if (hnid < 0) { if (hnid != -1) { - gmem_err("hmadvise: invalid hnid %d < 0\n", hnid); + gmem_err("hmemcpy: invalid hnid %d < 0\n", hnid); return -EINVAL; } } else if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { - gmem_err( - "hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", - hnid); + gmem_err("hmemcpy: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); return -EINVAL; } - vma_dest = find_vma(current->mm, dest); - vma_src = find_vma(current->mm, src); - - if (!vma_src || vma_src->vm_start > src || !vma_is_peer_shared(vma_src) - || vma_src->vm_end < (src + size)) { - gmem_err("failed to find peer_shared vma by invalid src:%p or size :0x%zx", - (void *)src, size); - return -EINVAL; - } + mmap_read_lock(mm); + vma_dest = find_vma(mm, dest); + vma_src = find_vma(mm, src); - if (!vma_dest || vma_dest->vm_start > dest || !vma_is_peer_shared(vma_dest) - || vma_dest->vm_end < (dest + size)) { - gmem_err("failed to find peer_shared vma by invalid dest:%p or size :0x%zx", - (void *)dest, size); - return -EINVAL; + if ((ULONG_MAX - size < src) || !vma_src || vma_src->vm_start > src || + !vma_is_peer_shared(vma_src) || vma_src->vm_end < (src + size)) { + gmem_err("failed to find peer_shared vma by invalid src or size\n"); + goto unlock; } - if (!(vma_dest->vm_flags & VM_WRITE)) { - gmem_err("dest is not writable.\n"); - return -EINVAL; + if ((ULONG_MAX - size < dest) || !vma_dest || vma_dest->vm_start > dest || + !vma_is_peer_shared(vma_dest) || vma_dest->vm_end < (dest + size)) { + gmem_err("failed to find peer_shared vma by invalid dest or size\n"); + goto unlock; } if (!(vma_dest->vm_flags & VM_WRITE)) { gmem_err("dest is not writable.\n"); - return -EINVAL; + goto unlock; } + mmap_read_unlock(mm); - per_size = (size / GM_WORK_CONCURRENCY) & ~(page_size - 1); - - while (copied < size) { - data = kzalloc(sizeof(struct hmemcpy_data), GFP_KERNEL); - if (data == NULL) { - flush_workqueue(hmemcpy_wq); - return GM_RET_NOMEM; - } - INIT_WORK(&data->work, hmemcpy_work_cb); - data->mm = current->mm; - data->hnid = hnid; - data->src = src; - data->dest = dest; - if (per_size == 0) { - data->size = size; - } else { - // Process (1.x * per_size) for the last time - data->size = (size - copied < 2 * per_size) ? 
(size - copied) : per_size; - } - - queue_work(hmemcpy_wq, &data->work); - src += data->size; - dest += data->size; - copied += data->size; - } + __hmemcpy(hnid, dest, src, size); - flush_workqueue(hmemcpy_wq); return 0; + +unlock: + mmap_read_unlock(mm); + return -EINVAL; } EXPORT_SYMBOL_GPL(hmemcpy); diff --git a/mm/memory.c b/mm/memory.c index 568f3e295fdb..100769eae24f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1722,6 +1722,9 @@ static inline void zap_logic_pmd_range(struct vm_area_struct *vma, struct gm_mapping *gm_mapping = NULL; struct page *page = NULL; + if (!vma->vm_obj) + return; + xa_lock(vma->vm_obj->logical_page_table); gm_mapping = vm_object_lookup(vma->vm_obj, addr); @@ -5871,7 +5874,9 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; - +#ifdef CONFIG_GMEM + char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; +#endif pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) @@ -5924,9 +5929,21 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; + if (vma_is_peer_shared(vma)) + return VM_FAULT_OOM; } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma) && pmd_none(*vmf.pmd) && + (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))) { + /* if transparent hugepage is not enabled, return pagefault failed */ + gmem_err("transparent hugepage is not enabled. check %s\n", + thp_enable_path); + return VM_FAULT_SIGBUS; + } +#endif + if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9290304e3741..bb35f9fafcf6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1902,9 +1902,13 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, bool vma_migratable(struct vm_area_struct *vma) { - +#ifdef CONFIG_GMEM if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_PEER_SHARED)) return false; +#else + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + return false; +#endif /* * DAX device mappings require predictable access latency, so avoid diff --git a/mm/mmap.c b/mm/mmap.c index a50ea15e6c81..0c3e60f94a7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -766,39 +766,6 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } -#ifdef CONFIG_GMEM -struct gmem_vma_list { - struct vm_area_struct *vma; - struct list_head list; -}; - -void gmem_reserve_vma(struct vm_area_struct *value, struct list_head *head) -{ - struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); - - if (!node) - return; - - node->vma = value; - list_add_tail(&node->list, head); -} - -void gmem_release_vma(struct mm_struct *mm, struct list_head *head) -{ - struct gmem_vma_list *node, *next; - - list_for_each_entry_safe(node, next, head, list) { - struct vm_area_struct *vma = node->vma; - - if (vma != NULL) - vm_area_free(vma); - - list_del(&node->list); - kfree(node); - } -} -#endif - /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates @@ -1374,7 +1341,7 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon if (IS_ERR_VALUE(addr)) return addr; - if (flags & MAP_FIXED_NOREPLACE) { + if ((flags & MAP_FIXED_NOREPLACE) || (gmem_is_enabled() && (flags & MAP_PEER_SHARED))) { if (find_vma_intersection(mm, addr, 
addr + len)) return -EEXIST; } @@ -1494,8 +1461,12 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon vm_flags |= VM_NORESERVE; } #ifdef CONFIG_GMEM - if (gmem_is_enabled() && (flags & MAP_PEER_SHARED)) - vm_flags |= VM_PEER_SHARED; + if (flags & MAP_PEER_SHARED) { + if (gmem_is_enabled()) + vm_flags |= VM_PEER_SHARED; + else + return -EINVAL; + } #endif addr = __mmap_region_ext(mm, file, addr, len, vm_flags, pgoff, uf); @@ -2706,8 +2677,7 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar ret = ctx->dev->mmu->peer_va_free(&gmf); if (ret != GM_RET_SUCCESS) - pr_debug("gmem: free_vma(start:%lx, len:%lx) ret %d\n", - start, end - start, ret); + pr_debug("gmem: free_vma failed, ret %d\n", ret); } } @@ -2721,6 +2691,38 @@ static void munmap_in_peer_devices(struct mm_struct *mm, unsigned long start, un munmap_single_vma_in_peer_devices(mm, vma, start, end); } } + +static unsigned long gmem_unmap_align(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma, *vma_end; + + vma = find_vma_intersection(mm, start, start + len); + vma_end = find_vma(mm, start + len); + if (!vma || !vma_is_peer_shared(vma)) + return 0; + if (vma_is_peer_shared(vma)) { + if (!IS_ALIGNED(start, PMD_SIZE)) + return -EINVAL; + } + + /* Prevents partial release of the peer_share page. */ + if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) + len = round_up(len, SZ_2M); + return len; +} + +static void gmem_unmap_region(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end, ret; + + ret = gmem_unmap_align(mm, start, len); + + if (!ret || IS_ERR_VALUE(ret)) + return; + + end = start + ret; + munmap_in_peer_devices(mm, start, end); +} #endif /* @@ -2857,10 +2859,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, prev = vma_iter_prev_range(vmi); next = vma_next(vmi); -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - munmap_in_peer_devices(mm, start, end); -#endif + if (next) vma_iter_prev_range(vmi); @@ -2920,19 +2919,13 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *vma; #ifdef CONFIG_GMEM - struct vm_area_struct *vma_end; if (gmem_is_enabled()) { - vma = find_vma_intersection(mm, start, start + len); - vma_end = find_vma(mm, start + len); - if (!vma) - return 0; - if (vma_is_peer_shared(vma)) { - if (!IS_ALIGNED(start, PMD_SIZE)) - return -EINVAL; - } - /* Prevents partial release of the peer_share page. 
*/ - if (vma_end && vma_end->vm_start < (start + len) && vma_is_peer_shared(vma_end)) - len = round_up(len, SZ_2M); + unsigned long ret = gmem_unmap_align(mm, start, len); + + if (IS_ERR_VALUE(ret)) + return ret; + else if (ret) + len = ret; } #endif @@ -2970,60 +2963,12 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, { VMA_ITERATOR(vmi, mm, start); - return do_vmi_munmap(&vmi, mm, start, len, uf, false); -} - #ifdef CONFIG_GMEM -static int alloc_va_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, vm_flags_t vm_flags) -{ - struct gm_context *ctx, *tmp; - enum gm_ret ret; - - if (!mm->gm_as) { - ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, PAGE_SIZE, - &mm->gm_as); - if (ret) - return ret; - } - pr_debug("gmem: start mmap, as %p\n", (void *)mm->gm_as); - - if (!vma->vm_obj) - vma->vm_obj = vm_object_create(vma); - if (!vma->vm_obj) - return -ENOMEM; - /* - * TODO: consider the concurrency problem of device - * attaching/detaching from the gm_as. - */ - ret = -ENODEV; - list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { - struct gm_fault_t gmf = { - .mm = mm, - .dev = ctx->dev, - .va = addr, - .size = len, - .prot = vm_flags, - }; - - if (!gm_dev_is_peer(ctx->dev)) - continue; - - if (!ctx->dev->mmu->peer_va_alloc_fixed) { - pr_debug("gmem: mmu ops has no alloc_vma\n"); - continue; - } - pr_debug("gmem: call vma_alloc\n"); - ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); - if (ret != GM_RET_SUCCESS) { - pr_debug("gmem: alloc_vma ret %d\n", ret); - return ret; - } - } - - return ret; -} + if (gmem_is_enabled()) + gmem_unmap_region(mm, start, len); #endif + return do_vmi_munmap(&vmi, mm, start, len, uf, false); +} static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, @@ -3039,12 +2984,7 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); -#ifdef CONFIG_GMEM - unsigned int retry_times = 0; - LIST_HEAD(reserve_list); -retry: -#endif /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -3057,20 +2997,12 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) { -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - gmem_release_vma(mm, &reserve_list); -#endif return -ENOMEM; } } /* Unmap any existing mapping in the area */ if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) { -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - gmem_release_vma(mm, &reserve_list); -#endif return -ENOMEM; } @@ -3080,10 +3012,6 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory_mm(mm, charged)) { -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - gmem_release_vma(mm, &reserve_list); -#endif return -ENOMEM; } vm_flags |= VM_ACCOUNT; @@ -3149,24 +3077,6 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) { - enum gm_ret ret = alloc_va_in_peer_devices(mm, vma, addr, len, vm_flags); - - if (ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { - retry_times++; - addr = get_unmapped_area(file, addr, len, pgoff, 0); - gmem_reserve_vma(vma, &reserve_list); - goto retry; - } else if (ret != GM_RET_SUCCESS) { - pr_debug("gmem: alloc_vma ret %d\n", ret); - error = -ENOMEM; - goto free_vma; - } - gmem_release_vma(mm, &reserve_list); - } -#endif - if (vma_iter_prealloc(&vmi, vma)) { error = -ENOMEM; goto free_vma; @@ -3289,10 +3199,6 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, unacct_error: if (charged) vm_unacct_memory(charged); -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - gmem_release_vma(mm, &reserve_list); -#endif return error; } @@ -3348,6 +3254,11 @@ static int __vm_munmap(unsigned long start, size_t len, bool unlock) if (sp_check_addr(start)) return -EINVAL; +#ifdef CONFIG_GMEM + if (gmem_is_enabled()) + gmem_unmap_region(mm, start, len); +#endif + if (mmap_write_lock_killable(mm)) return -EINTR; @@ -3729,6 +3640,10 @@ void exit_mmap(struct mm_struct *mm) __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && mm->gm_as) + gm_as_destroy(mm->gm_as); +#endif } /* Insert vm structure into process list sorted by address diff --git a/mm/mprotect.c b/mm/mprotect.c index e65363eb603e..4eac8ad8a718 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -693,7 +693,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, unsigned long prot, int pkey) { unsigned long nstart, end, tmp, reqprot; +#ifdef CONFIG_GMEM + struct vm_area_struct *vma, *prev, *vma_end; +#else struct vm_area_struct *vma, *prev; +#endif int error; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && @@ -736,7 +740,19 @@ static int do_mprotect_pkey(unsigned long start, size_t len, error = -ENOMEM; if (!vma) goto out; - +#ifdef CONFIG_GMEM + if (vma_is_peer_shared(vma)) { + start = ALIGN_DOWN(start, HPAGE_SIZE); + vma_end = find_vma(current->mm, end); + if (vma_end && vma_end->vm_start < end && vma_is_peer_shared(vma_end)) + end = ALIGN(end, HPAGE_SIZE); + if (end <= start) { + error = -ENOMEM; + goto out; + } + len = end - start; + } +#endif if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; diff 
--git a/mm/util.c b/mm/util.c index 7a5eed15c98f..65392c97b1e9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -27,6 +27,9 @@ #include #include +#ifdef CONFIG_GMEM +#include +#endif #include "internal.h" #include "swap.h" @@ -540,6 +543,114 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) } EXPORT_SYMBOL_GPL(account_locked_vm); +#ifdef CONFIG_GMEM +static unsigned long alloc_va_in_peer_devices(unsigned long addr, unsigned long len, + unsigned long flag) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct gm_context *ctx, *tmp; + unsigned long prot = VM_NONE; + enum gm_ret ret; + char *thp_enable_path = "/sys/kernel/mm/transparent_hugepage/enabled"; + + vma = find_vma(mm, addr); + if (!vma) { + gmem_err("vma for addr %lx is NULL, should not happen\n", addr); + return -EINVAL; + } + + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) { + gmem_err("transparent hugepage is not enabled. check %s\n", + thp_enable_path); + return -EINVAL; + } + + prot |= vma->vm_flags; + + if (!mm->gm_as) { + ret = gm_as_create(0, ULONG_MAX, GM_AS_ALLOC_DEFAULT, HPAGE_SIZE, &mm->gm_as); + if (ret) { + gmem_err("gm_as_create failed\n"); + return ret; + } + } + + ret = -ENODEV; + // TODO: consider the concurrency problem of device attaching/detaching from the gm_as. + list_for_each_entry_safe(ctx, tmp, &mm->gm_as->gm_ctx_list, gm_as_link) { + struct gm_fault_t gmf = { + .mm = mm, + .dev = ctx->dev, + .va = addr, + .size = len, + .prot = prot, + }; + + if (!gm_dev_is_peer(ctx->dev)) + continue; + + if (!ctx->dev->mmu->peer_va_alloc_fixed) { + pr_debug("gmem: mmu ops has no alloc_vma\n"); + continue; + } + + ret = ctx->dev->mmu->peer_va_alloc_fixed(&gmf); + if (ret != GM_RET_SUCCESS) { + gmem_err("device mmap failed\n"); + return ret; + } + } + + if (!vma->vm_obj) + vma->vm_obj = vm_object_create(vma); + if (!vma->vm_obj) + return -ENOMEM; + + return ret; +} + +struct gmem_vma_list { + unsigned long start; + size_t len; + struct list_head list; +}; + +static void gmem_reserve_vma(struct mm_struct *mm, unsigned long start, + size_t len, struct list_head *head) +{ + struct vm_area_struct *vma; + struct gmem_vma_list *node = kmalloc(sizeof(struct gmem_vma_list), GFP_KERNEL); + + vma = find_vma(mm, start); + if (!vma || vma->vm_start >= start + len) { + kfree(node); + return; + } + vm_flags_set(vma, ~VM_PEER_SHARED); + + node->start = start; + node->len = round_up(len, SZ_2M); + list_add_tail(&node->list, head); +} + +static void gmem_release_vma(struct mm_struct *mm, struct list_head *head) +{ + struct gmem_vma_list *node, *next; + + list_for_each_entry_safe(node, next, head, list) { + unsigned long start = node->start; + size_t len = node->len; + + if (len) + vm_munmap(start, len); + + list_del(&node->list); + kfree(node); + } +} +#endif + unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) @@ -548,7 +659,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); - +#ifdef CONFIG_GMEM + unsigned int retry_times = 0; + LIST_HEAD(reserve_list); +retry: +#endif ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) @@ -559,6 +674,27 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); +#ifdef CONFIG_GMEM + if (gmem_is_enabled() && 
!IS_ERR_VALUE(ret) && flag & MAP_PEER_SHARED) { + enum gm_ret gm_ret = 0; + + gm_ret = alloc_va_in_peer_devices(ret, len, flag); + /* + * if alloc_va_in_peer_devices failed + * add vma to reserve_list and release after find a proper vma + */ + if (gm_ret == GM_RET_NOMEM && retry_times < GMEM_MMAP_RETRY_TIMES) { + retry_times++; + gmem_reserve_vma(mm, ret, len, &reserve_list); + goto retry; + } else if (gm_ret != GM_RET_SUCCESS) { + gmem_err("alloc vma ret %lu\n", ret); + gmem_reserve_vma(mm, ret, len, &reserve_list); + ret = -ENOMEM; + } + gmem_release_vma(mm, &reserve_list); + } +#endif } return ret; } diff --git a/mm/vm_object.c b/mm/vm_object.c index 25af359def56..3c8932c47270 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -236,7 +236,7 @@ void free_gm_mappings(struct vm_area_struct *vma) XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); xa_lock(vma->vm_obj->logical_page_table); - xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end)) { + xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end - SZ_2M)) { release_gm_mapping(gm_mapping); xas_store(&xas, NULL); } -- Gitee From 6cf4160da8cc45550b4888dff4fc5651fddbb1b6 Mon Sep 17 00:00:00 2001 From: nicunshu Date: Mon, 8 Sep 2025 11:38:17 +0800 Subject: [PATCH 21/27] mm:fix hnode and vma bug euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- remove is_hnode_allow which is not used. split or merge vm_object when split or merge vma. Signed-off-by: nicunshu --- arch/arm64/include/asm/rsi_cmds.h | 1 + include/linux/gmem.h | 6 -- include/linux/vm_object.h | 2 + mm/gmem.c | 4 +- mm/mmap.c | 39 +++++---- mm/vm_object.c | 139 +++++++++++++++++++++++++++++- 6 files changed, 165 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index e6a211001bd3..ccdeffcefbff 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -9,6 +9,7 @@ #include #include +#include "string.h" #define RSI_GRANULE_SHIFT 12 #define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 7beebc67c398..a2becb381cc9 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -326,12 +326,6 @@ static inline bool is_hnode(int node) node_isset(node, hnode_map); } -static inline bool is_hnode_allowed(int node) -{ - return (node < MAX_NUMNODES) && is_hnode(node) && - node_isset(node, current->mems_allowed); -} - static inline int get_hnuma_id(struct gm_dev *gm_dev) { return first_node(gm_dev->registered_hnodes); diff --git a/include/linux/vm_object.h b/include/linux/vm_object.h index ca82642eb2df..480bb12fb6a3 100644 --- a/include/linux/vm_object.h +++ b/include/linux/vm_object.h @@ -13,6 +13,8 @@ void vm_object_drop_locked(struct vm_area_struct *vma); void dup_vm_object(struct vm_area_struct *dst, struct vm_area_struct *src, bool dst_peer_shared); void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end); +void vm_object_merge(struct vm_area_struct *vma, unsigned long addr); +void vm_object_split(struct vm_area_struct *old_vma, struct vm_area_struct *new_vma); void dup_peer_shared_vma(struct vm_area_struct *vma); struct gm_mapping *alloc_gm_mapping(void); diff --git a/mm/gmem.c b/mm/gmem.c index 039f4cfe28db..1397a56e42bb 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -846,7 +846,7 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, 
int behavior) return error; } - if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + if (!is_hnode(hnid)) { gmem_err("hmadvise: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); return error; } @@ -1026,7 +1026,7 @@ int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) gmem_err("hmemcpy: invalid hnid %d < 0\n", hnid); return -EINVAL; } - } else if (!is_hnode(hnid) || !is_hnode_allowed(hnid)) { + } else if (!is_hnode(hnid)) { gmem_err("hmemcpy: can't find hnode by hnid:%d or hnode is not allowed\n", hnid); return -EINVAL; } diff --git a/mm/mmap.c b/mm/mmap.c index 0c3e60f94a7d..fa8a6ba93070 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -707,6 +707,13 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Only handles expanding */ VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); + if (vma_is_peer_shared(vma)) { + if (!remove_next) + vm_object_adjust(vma, start, end); + else + vm_object_merge(vma, next->vm_end); + } + /* Note: vma iterator must be pointing to 'start' */ vma_iter_config(vmi, start, end); if (vma_iter_prealloc(vmi, vma)) @@ -758,6 +765,9 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); + if (vma_is_peer_shared(vma)) + vm_object_adjust(vma, start, end); + vma_iter_clear(vmi); vma->vm_start = start; vma->vm_end = end; @@ -1008,6 +1018,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (!next->anon_vma) err = dup_anon_vma(prev, curr, &anon_dup); } + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, next->vm_end); + } } else if (merge_prev) { /* case 2 */ if (curr) { vma_start_write(curr); @@ -1026,6 +1039,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } if (!err) err = dup_anon_vma(prev, curr, &anon_dup); + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, end); + } } } else { /* merge_next */ vma_start_write(next); @@ -1036,6 +1052,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, adjust = next; adj_start = -(prev->vm_end - addr); err = dup_anon_vma(next, prev, &anon_dup); + if (vma_is_peer_shared(prev)) { + vm_object_merge(prev, addr); + } } else { /* * Note that cases 3 and 8 are the ONLY ones where prev @@ -1051,6 +1070,8 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, remove = curr; err = dup_anon_vma(next, curr, &anon_dup); } + if (vma_is_peer_shared(curr)) + vm_object_merge(vma, next->vm_end); } } @@ -1088,11 +1109,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_iter_store(vmi, vma); if (adj_start) { -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(adjust)) - vm_object_adjust(adjust, adjust->vm_start + adj_start, - adjust->vm_end); -#endif adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { @@ -2560,17 +2576,8 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); -#ifdef CONFIG_GMEM - if (vma_is_peer_shared(vma)) { - if (new_below) { - vm_object_adjust(new, new->vm_start, addr); - vm_object_adjust(vma, addr, vma->vm_end); - } else { - vm_object_adjust(vma, vma->vm_start, addr); - vm_object_adjust(new, addr, new->vm_end); - } - } -#endif + if (vma_is_peer_shared(vma)) + vm_object_split(vma, new); if (new_below) { vma->vm_start = addr; diff --git a/mm/vm_object.c b/mm/vm_object.c index 3c8932c47270..b30b69f81167 
100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -180,13 +180,147 @@ void dup_peer_shared_vma(struct vm_area_struct *vma) } } +/** + * new_vma is part of old_vma, so old_vma->vm_start <= new_vma->vm_start + * and new_vma->vm_end < old_vma->vm_end + */ +void vm_object_split(struct vm_area_struct *old_vma, struct vm_area_struct *new_vma) +{ + unsigned long index; + struct gm_mapping *page; + unsigned long transferred_pages = 0; + pgoff_t pgoff = linear_page_index(old_vma, new_vma->vm_start); + + XA_STATE(xas, old_vma->vm_obj->logical_page_table, pgoff); + + xa_lock(old_vma->vm_obj->logical_page_table); + xa_lock(new_vma->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(old_vma, new_vma->vm_end - SZ_2M)) { + index = xas.xa_index - old_vma->vm_pgoff + new_vma->vm_pgoff + - ((new_vma->vm_start - old_vma->vm_start) >> PAGE_SHIFT); + __xa_store(new_vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + transferred_pages++; + } + + atomic_sub(transferred_pages, &old_vma->vm_obj->nr_pages); + atomic_add(transferred_pages, &new_vma->vm_obj->nr_pages); + xa_unlock(new_vma->vm_obj->logical_page_table); + xa_unlock(old_vma->vm_obj->logical_page_table); +} + +void vm_object_merge(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long index; + struct gm_mapping *page; + struct vm_area_struct *next, *n_next; + unsigned long moved_pages = 0; + pgoff_t pgoff; + + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); + next = vma_next(&vmi); + next = vma_next(&vmi); + if (!next) + return; + + if (addr < vma->vm_end) { + /* case 4: move logical mapping in [end, vma->vm_end) from vma to next */ + pgoff = linear_page_index(vma, addr); + XA_STATE(xas, vma->vm_obj->logical_page_table, pgoff); + + xa_lock(vma->vm_obj->logical_page_table); + xa_lock(next->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(vma, vma->vm_end - SZ_2M)) { + index = xas.xa_index - vma->vm_pgoff + next->vm_pgoff + - ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(next->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + atomic_sub(moved_pages, &vma->vm_obj->nr_pages); + atomic_add(moved_pages, &next->vm_obj->nr_pages); + xa_unlock(next->vm_obj->logical_page_table); + xa_unlock(vma->vm_obj->logical_page_table); + } else { + n_next = vma_next(&vmi); + + if (addr == next->vm_end) { + /* case 1, 7, 8: copy all logical mappings from next to vma */ + pgoff = linear_page_index(next, next->vm_start); + XA_STATE(xas, next->vm_obj->logical_page_table, pgoff); + + xa_lock(vma->vm_obj->logical_page_table); + rcu_read_lock(); + xas_for_each(&xas, page, linear_page_index(next, next->vm_end - SZ_2M)) { + index = xas.xa_index - next->vm_pgoff + vma->vm_pgoff + + ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + rcu_read_unlock(); + atomic_add(moved_pages, &vma->vm_obj->nr_pages); + xa_unlock(vma->vm_obj->logical_page_table); + } else if (next->vm_start < addr && addr < next->vm_end) { + /* case 5: move logical mapping in [next->vm_start, end) from next to vma */ + pgoff = linear_page_index(next, next->vm_start); + XA_STATE(xas, next->vm_obj->logical_page_table, pgoff); + + xa_lock(vma->vm_obj->logical_page_table); + xa_lock(next->vm_obj->logical_page_table); + xas_for_each(&xas, page, linear_page_index(next, addr - SZ_2M)) { + index = xas.xa_index - next->vm_pgoff + vma->vm_pgoff + + 
((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas, NULL); + moved_pages++; + } + atomic_add(moved_pages, &vma->vm_obj->nr_pages); + atomic_sub(moved_pages, &next->vm_obj->nr_pages); + xa_unlock(next->vm_obj->logical_page_table); + xa_unlock(vma->vm_obj->logical_page_table); + } else if (n_next && addr == n_next->vm_end) { + /* case 6: copy all logical mappings from next and n_next to vma */ + pgoff = linear_page_index(next, next->vm_start); + XA_STATE(xas_next, next->vm_obj->logical_page_table, pgoff); + pgoff = linear_page_index(n_next, n_next->vm_start); + XA_STATE(xas_n_next, n_next->vm_obj->logical_page_table, pgoff); + + xa_lock(vma->vm_obj->logical_page_table); + rcu_read_lock(); + + xas_for_each(&xas_next, page, linear_page_index(next, next->vm_end - SZ_2M)) { + index = xas_next.xa_index - next->vm_pgoff + vma->vm_pgoff + + ((next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas_next, NULL); + moved_pages++; + } + + xas_for_each(&xas_n_next, page, linear_page_index(n_next, n_next->vm_end - SZ_2M)) { + index = xas_n_next.xa_index - n_next->vm_pgoff + vma->vm_pgoff + + ((n_next->vm_start - vma->vm_start) >> PAGE_SHIFT); + __xa_store(vma->vm_obj->logical_page_table, index, page, GFP_KERNEL); + xas_store(&xas_n_next, NULL); + moved_pages++; + } + + rcu_read_unlock(); + atomic_add(moved_pages, &vma->vm_obj->nr_pages); + xa_unlock(vma->vm_obj->logical_page_table); + } + } + /* case 2, 3: do nothing */ +} + void vm_object_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end) { /* remove logical mapping in [vma->vm_start, start) and [end, vm->vm_end) */ unsigned long removed_pages = 0; struct gm_mapping *mapping; + pgoff_t pgoff = linear_page_index(vma, vma->vm_start); - XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + XA_STATE(xas, vma->vm_obj->logical_page_table, pgoff); xas_lock(&xas); if (vma->vm_start < start) { @@ -233,7 +367,8 @@ void vm_object_mapping_create(struct vm_object *obj, unsigned long start) void free_gm_mappings(struct vm_area_struct *vma) { struct gm_mapping *gm_mapping; - XA_STATE(xas, vma->vm_obj->logical_page_table, linear_page_index(vma, vma->vm_start)); + pgoff_t pgoff = linear_page_index(vma, vma->vm_start); + XA_STATE(xas, vma->vm_obj->logical_page_table, pgoff); xa_lock(vma->vm_obj->logical_page_table); xas_for_each(&xas, gm_mapping, linear_page_index(vma, vma->vm_end - SZ_2M)) { -- Gitee From 961ecbccf4b298c5103e5508ad01fb9ed314c51a Mon Sep 17 00:00:00 2001 From: Bin Wang Date: Wed, 17 Sep 2025 09:31:43 +0800 Subject: [PATCH 22/27] gmem: support allocating overlimit pages in device. euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- gmem support swapping pages in device to host. This feature allows device releasing some oldest pages and allocate more memory than limit. 
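
For reference, a peer driver is expected to donate device physical pages to an hnode roughly as follows (illustrative sketch only; my_import(), my_dev_alloc_pfn() and my_dev_dma_addr() are hypothetical driver helpers, and the exact gm_page initialization expected by gm_add_pages() is an assumption):

  #include <linux/gmem.h>
  #include <linux/list.h>

  /* Hypothetical driver callback wired up as gm_mmu->import_phys_mem. */
  static enum gm_ret my_import(struct mm_struct *mm, int hnid, unsigned long page_cnt)
  {
          LIST_HEAD(pages);
          unsigned long i;

          for (i = 0; i < page_cnt; i++) {
                  struct gm_page *gm_page = alloc_gm_page_struct();

                  if (!gm_page)
                          return GM_RET_NOMEM;

                  /* my_dev_alloc_pfn()/my_dev_dma_addr() stand in for the driver's allocator. */
                  gm_page->hnid = hnid;
                  gm_page->dev_pfn = my_dev_alloc_pfn();
                  gm_page->dev_dma_addr = my_dev_dma_addr(gm_page->dev_pfn);
                  list_add_tail(&gm_page->gm_page_list, &pages);
          }

          /* Hand the batch to gmem; it is assumed to end up on the hnode freelist. */
          return gm_add_pages(hnid, &pages) ? GM_RET_FAILURE_UNKNOWN : GM_RET_SUCCESS;
  }
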
Signed-off-by: Bin Wang --- include/linux/gmem.h | 114 +++++++++-- mm/Makefile | 2 +- mm/gmem.c | 231 ++++++--------------- mm/gmem_phys.c | 466 +++++++++++++++++++++++++++++++++++++++++++ mm/gmem_stat.c | 166 +++++++++++++++ mm/mmap.c | 20 +- 6 files changed, 812 insertions(+), 187 deletions(-) create mode 100644 mm/gmem_phys.c create mode 100644 mm/gmem_stat.c diff --git a/include/linux/gmem.h b/include/linux/gmem.h index a2becb381cc9..736e2d4feb4d 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -80,6 +80,7 @@ enum gm_mmu_mode { struct gm_fault_t { struct mm_struct *mm; struct gm_dev *dev; + unsigned long pfn; unsigned long va; unsigned long size; unsigned long prot; @@ -88,13 +89,23 @@ struct gm_fault_t { int behavior; }; +enum gm_memcpy_kind { + GM_MEMCPY_INIT, + GM_MEMCPY_H2H, + GM_MEMCPY_H2D, + GM_MEMCPY_D2H, + GM_MEMCPY_D2D, + GM_MEMCPY_KIND_INVALID, +}; + struct gm_memcpy_t { struct mm_struct *mm; struct gm_dev *dev; - unsigned long src; - unsigned long dest; - dma_addr_t dma_addr; + dma_addr_t src; + dma_addr_t dest; + size_t size; + enum gm_memcpy_kind kind; }; /** @@ -134,6 +145,8 @@ struct gm_mmu { */ enum gm_ret (*peer_unmap)(struct gm_fault_t *gmf); + enum gm_ret (*import_phys_mem)(struct mm_struct *mm, int hnid, unsigned long page_cnt); + /* Create or destroy a device's physical page table. */ enum gm_ret (*pmap_create)(struct gm_dev *dev, void **pmap); enum gm_ret (*pmap_destroy)(void *pmap); @@ -225,11 +238,11 @@ struct gm_mapping { unsigned int flag; union { - struct page *page; /* CPU node */ - struct gm_dev *dev; /* hetero-node */ - unsigned long pfn; + struct page *page; /* CPU node */ + struct gm_page *gm_page; /* hetero-node */ }; + struct gm_dev *dev; struct mutex lock; }; @@ -280,16 +293,12 @@ extern enum gm_ret gm_dev_create(struct gm_mmu *mmu, void *dev_data, unsigned lo struct gm_dev **new_dev); extern enum gm_ret gm_dev_switch(struct gm_dev *dev, struct gm_as *as); extern enum gm_ret gm_dev_detach(struct gm_dev *dev, struct gm_as *as); -extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, - unsigned long end); +extern int gm_dev_register_hnode(struct gm_dev *dev); enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struct gm_dev *dev, int behavior); vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, unsigned int order); /* GMEM address space KPI */ -extern enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, - unsigned long end); -extern void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid); extern enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as); extern enum gm_ret gm_as_destroy(struct gm_as *as); @@ -314,12 +323,42 @@ extern void gmem_stats_counter_show(void); /* h-NUMA topology */ struct hnode { unsigned int id; - struct gm_dev *dev; - struct xarray pages; + struct task_struct *swapd_task; + + struct list_head freelist; + struct list_head activelist; + spinlock_t freelist_lock; + spinlock_t activelist_lock; + atomic_t nr_free_pages; + atomic_t nr_active_pages; + + unsigned long max_memsize; + + bool import_failed; }; +static inline void hnode_active_pages_inc(struct hnode *hnode) +{ + atomic_inc(&hnode->nr_active_pages); +} + +static inline void hnode_active_pages_dec(struct hnode *hnode) +{ + atomic_dec(&hnode->nr_active_pages); +} + +static inline void hnode_free_pages_inc(struct hnode *hnode) +{ + atomic_inc(&hnode->nr_free_pages); +} + +static inline 
void hnode_free_pages_dec(struct hnode *hnode) +{ + atomic_dec(&hnode->nr_free_pages); +} + static inline bool is_hnode(int node) { return (node < MAX_NUMNODES) && !node_isset(node, node_possible_map) && @@ -334,9 +373,58 @@ static inline int get_hnuma_id(struct gm_dev *gm_dev) void __init hnuma_init(void); unsigned int alloc_hnode_id(void); void free_hnode_id(unsigned int nid); +struct hnode *get_hnode(unsigned int hnid); +struct gm_dev *get_gm_dev(unsigned int nid); void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev); void hnode_deinit(unsigned int hnid, struct gm_dev *dev); +struct gm_page { + struct list_head gm_page_list; + + unsigned long flags; + unsigned long dev_pfn; + unsigned long dev_dma_addr; + unsigned int hnid; + + /* + * The same functionality as rmap, we need know which process + * maps to this gm_page with which virtual address. + * */ + unsigned long va; + struct mm_struct *mm; + + atomic_t refcount; +}; + +#define NUM_IMPORT_PAGES 16 + +int __init gm_page_cachep_init(void); +void gm_page_cachep_destroy(void); +struct gm_page *alloc_gm_page_struct(void); +void hnode_freelist_add(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); +void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); +void mark_gm_page_active(struct gm_page *gm_page); +int gm_add_pages(unsigned int hnid, struct list_head *pages); +void gm_free_page(struct gm_page *gm_page); +struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode); + +static inline void get_gm_page(struct gm_page *gm_page) +{ + atomic_inc(&gm_page->refcount); +} + +static inline void put_gm_page(struct gm_page *gm_page) +{ + if (atomic_dec_and_test(&gm_page->refcount)) + gm_free_page(gm_page); +} + +int hnode_init_sysfs(unsigned int hnid); +int gm_init_sysfs(void); +void gm_deinit_sysfs(void); + #define gmem_err(fmt, ...) 
\ ((void)pr_err("[gmem]" fmt "\n", ##__VA_ARGS__)) diff --git a/mm/Makefile b/mm/Makefile index e4aa8e1cd329..db7c51e1f563 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -41,7 +41,7 @@ mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ pgtable-generic.o rmap.o vmalloc.o -mmu-$(CONFIG_GMEM) += gmem.o vm_object.o +mmu-$(CONFIG_GMEM) += gmem.o gmem_phys.o gmem_stat.o vm_object.o ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/gmem.c b/mm/gmem.c index 1397a56e42bb..b18a4b3ed850 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -53,9 +53,6 @@ static DEFINE_XARRAY_ALLOC(gm_dev_id_pool); static bool enable_gmem; -DEFINE_SPINLOCK(hnode_lock); -struct hnode *hnodes[MAX_NUMNODES]; - static inline unsigned long pe_mask(unsigned int order) { if (order == 0) @@ -143,10 +140,18 @@ static int __init gmem_init(void) if (!gm_region_cache) goto free_ctx; - err = vm_object_init(); + err = gm_page_cachep_init(); if (err) goto free_region; + err = gm_init_sysfs(); + if (err) + goto free_gm_page; + + err = vm_object_init(); + if (err) + goto free_gm_sysfs; + err = gmem_stats_init(); if (err) goto free_region; @@ -167,6 +172,10 @@ static int __init gmem_init(void) return 0; +free_gm_sysfs: + gm_deinit_sysfs(); +free_gm_page: + gm_page_cachep_destroy(); free_region: kmem_cache_destroy(gm_region_cache); free_ctx: @@ -228,10 +237,12 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc { enum gm_ret ret = GM_RET_SUCCESS; struct gm_mmu *mmu = dev->mmu; + struct hnode *hnode; struct device *dma_dev = dev->dma_dev; struct vm_area_struct *vma; struct vm_object *obj; struct gm_mapping *gm_mapping; + struct gm_page *gm_page; unsigned long size = HPAGE_SIZE; struct gm_fault_t gmf = { .mm = mm, @@ -243,16 +254,22 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc }; struct page *page = NULL; + hnode = get_hnode(get_hnuma_id(dev)); + if (!hnode) { + gmem_err("gmem device should correspond to a hnuma node"); + ret = -EINVAL; + goto out; + } + vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) { - gmem_err("%s failed to find vma\n", __func__); - pr_info("gmem: %s no vma\n", __func__); + gmem_err("%s failed to find vma", __func__); ret = GM_RET_FAILURE_UNKNOWN; goto out; } obj = vma->vm_obj; if (!obj) { - gmem_err("%s no vm_obj\n", __func__); + gmem_err("%s no vm_obj", __func__); ret = GM_RET_FAILURE_UNKNOWN; goto out; } @@ -266,7 +283,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc xa_unlock(obj->logical_page_table); if (unlikely(!gm_mapping)) { - gmem_err("OOM when creating vm_obj!\n"); + gmem_err("OOM when creating vm_obj!"); ret = GM_RET_NOMEM; goto out; } @@ -274,8 +291,9 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc if (gm_mapping_nomap(gm_mapping)) { goto peer_map; } else if (gm_mapping_device(gm_mapping)) { - if (behavior == MADV_WILLNEED || behavior == MADV_PINNED) { - goto peer_map; + if (behavior == MADV_WILLNEED) { + mark_gm_page_active(gm_mapping->gm_page); + goto unlock; } else { ret = 0; goto unlock; @@ -283,7 +301,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc } else if (gm_mapping_cpu(gm_mapping)) { page = gm_mapping->page; if (!page) { - gmem_err("host gm_mapping page is NULL. Set nomap\n"); + gmem_err("host gm_mapping page is NULL. 
Set nomap"); gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); goto unlock; } @@ -293,12 +311,21 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc gmf.dma_addr = dma_map_page(dma_dev, page, 0, size, DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_dev, gmf.dma_addr)) - gmem_err("dma map failed\n"); + gmem_err("dma map failed"); gmf.copy = true; } peer_map: + gm_page = gm_alloc_page(mm, hnode); + if (!gm_page) { + gmem_err("Alloc gm_page for device fault failed."); + ret = -ENOMEM; + goto unlock; + } + + gmf.pfn = gm_page->dev_pfn; + ret = mmu->peer_map(&gmf); if (ret != GM_RET_SUCCESS) { if (ret == GM_RET_MIGRATING) { @@ -310,7 +337,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); ret = GM_RET_SUCCESS; } else { - gmem_err("peer map failed\n"); + gmem_err("peer map failed"); if (page) { gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); put_page(page); @@ -321,11 +348,16 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc if (page) { dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); - put_page(page); + folio_put(page_folio(page)); } gm_mapping_flags_set(gm_mapping, GM_PAGE_DEVICE); gm_mapping->dev = dev; + gm_page->va = addr; + gm_page->mm = mm; + gm_mapping->gm_page = gm_page; + hnode_activelist_add(hnode, gm_page); + hnode_active_pages_inc(hnode); unlock: mutex_unlock(&gm_mapping->lock); out: @@ -343,6 +375,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, struct gm_mapping *gm_mapping; unsigned long size = HPAGE_SIZE; struct gm_dev *dev; + struct hnode *hnode; struct device *dma_dev; struct gm_fault_t gmf = { .mm = vma->vm_mm, @@ -359,6 +392,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, dev = gm_mapping->dev; gmf.dev = dev; + gmf.pfn = gm_mapping->gm_page->dev_pfn; dma_dev = dev->dma_dev; gmf.dma_addr = dma_map_page(dma_dev, vmf->page, 0, size, DMA_BIDIRECTIONAL); @@ -373,113 +407,13 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, } dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); + hnode = get_hnode(gm_mapping->gm_page->hnid); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); return ret; } -static inline struct hnode *get_hnode(unsigned int hnid) -{ - return hnodes[hnid]; -} - -static struct gm_dev *get_gm_dev(unsigned int nid) -{ - struct hnode *hnode; - struct gm_dev *dev = NULL; - - spin_lock(&hnode_lock); - hnode = get_hnode(nid); - if (hnode) - dev = hnode->dev; - spin_unlock(&hnode_lock); - return dev; -} - -/* - * Register the local physical memory of a gmem device. - * This implies dynamically creating - * the struct page data structures. 
- */ -enum gm_ret gm_dev_register_physmem(struct gm_dev *dev, unsigned long begin, unsigned long end) -{ - struct gm_mapping *mapping; - unsigned long addr = PAGE_ALIGN(begin); - unsigned int nid; - int i, page_num = (end - addr) >> PAGE_SHIFT; - struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); - - if (!hnode) - goto err; - - mapping = kvmalloc_array(page_num, sizeof(struct gm_mapping), GFP_KERNEL); - if (!mapping) - goto free_hnode; - - spin_lock(&hnode_lock); - nid = alloc_hnode_id(); - if (nid == MAX_NUMNODES) - goto unlock_hnode; - hnode_init(hnode, nid, dev); - - for (i = 0; i < page_num; i++, addr += PAGE_SIZE) { - mapping[i].pfn = addr >> PAGE_SHIFT; - mapping[i].flag = 0; - } - - xa_lock(&hnode->pages); - for (i = 0; i < page_num; i++) { - if (xa_err(__xa_store(&hnode->pages, i, mapping + i, - GFP_KERNEL))) { - /* Probably nomem */ - kvfree(mapping); - xa_unlock(&hnode->pages); - goto deinit_hnode; - } - __xa_set_mark(&hnode->pages, i, XA_MARK_0); - } - xa_unlock(&hnode->pages); - - spin_unlock(&hnode_lock); - return GM_RET_SUCCESS; - -deinit_hnode: - hnode_deinit(nid, dev); - free_hnode_id(nid); -unlock_hnode: - spin_unlock(&hnode_lock); -free_hnode: - kfree(hnode); -err: - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(gm_dev_register_physmem); - -void gm_dev_unregister_physmem(struct gm_dev *dev, unsigned int nid) -{ - struct hnode *hnode = NULL; - struct gm_mapping *mapping = NULL; - - spin_lock(&hnode_lock); - - if (!node_isset(nid, dev->registered_hnodes)) - goto unlock; - - hnode = get_hnode(nid); - - if (!hnode) - goto unlock; - mapping = xa_load(&hnode->pages, 0); - - if (mapping) - kvfree(mapping); - - hnode_deinit(nid, dev); - free_hnode_id(nid); - kfree(hnode); -unlock: - spin_unlock(&hnode_lock); -} -EXPORT_SYMBOL_GPL(gm_dev_unregister_physmem); - /* GMEM Virtual Address Space API */ enum gm_ret gm_as_create(unsigned long begin, unsigned long end, enum gm_as_alloc policy, unsigned long cache_quantum, struct gm_as **new_as) @@ -565,50 +499,6 @@ enum gm_ret gm_as_attach(struct gm_as *as, struct gm_dev *dev, enum gm_mmu_mode } EXPORT_SYMBOL_GPL(gm_as_attach); -void __init hnuma_init(void) -{ - unsigned int node; - spin_lock(&hnode_lock); - for_each_node(node) - node_set(node, hnode_map); - spin_unlock(&hnode_lock); -} - -unsigned int alloc_hnode_id(void) -{ - unsigned int node; - - node = first_unset_node(hnode_map); - node_set(node, hnode_map); - - return node; -} - -void free_hnode_id(unsigned int nid) -{ - spin_lock(&hnode_lock); - node_clear(nid, hnode_map); - spin_unlock(&hnode_lock); -} - -void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) -{ - hnodes[hnid] = hnode; - hnodes[hnid]->id = hnid; - hnodes[hnid]->dev = dev; - node_set(hnid, dev->registered_hnodes); - xa_init(&hnodes[hnid]->pages); -} - -void hnode_deinit(unsigned int hnid, struct gm_dev *dev) -{ - hnodes[hnid]->id = 0; - hnodes[hnid]->dev = NULL; - node_clear(hnid, dev->registered_hnodes); - xa_destroy(&hnodes[hnid]->pages); - hnodes[hnid] = NULL; -} - struct prefetch_data { struct mm_struct *mm; struct gm_dev *dev; @@ -725,6 +615,7 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, }; struct gm_mapping *gm_mapping; struct vm_object *obj; + struct hnode *hnode; int ret; obj = vma->vm_obj; @@ -756,6 +647,10 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, mutex_unlock(&gm_mapping->lock); continue; } + hnode = get_hnode(gm_mapping->gm_page->hnid); + hnode_activelist_del(hnode, gm_mapping->gm_page); + 
hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); } gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); mutex_unlock(&gm_mapping->lock); @@ -946,15 +841,18 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, goto unlock_gm_mmaping; } else { // device to host dev = gm_mapping_src->dev; - gmc.dma_addr = phys_to_dma(dev->dma_dev, + gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1))); - gmc.src = src; + gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1)); + gmc.kind = GM_MEMCPY_D2H; } } else { if (gm_mapping_cpu(gm_mapping_src)) { // host to device - gmc.dest = dest; - gmc.dma_addr = phys_to_dma(dev->dma_dev, + gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + + (dest & (page_size - 1)); + gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(gm_mapping_src->page) + (src & (page_size - 1))); + gmc.kind = GM_MEMCPY_H2D; } else { // device to device gmem_err("hmemcpy: device to device is unimplemented\n"); goto unlock_gm_mmaping; @@ -1062,3 +960,4 @@ int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size) return -EINVAL; } EXPORT_SYMBOL_GPL(hmemcpy); + diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c new file mode 100644 index 000000000000..1ff4407d5aff --- /dev/null +++ b/mm/gmem_phys.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GMEM physical memory management. + * + * Copyright (C) 2025- Huawei, Inc. + * Author: Bin Wang + * + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define NUM_SWAP_PAGES 16 +#define MAX_SWAP_RETRY_TIMES 10 + +static struct kmem_cache *gm_page_cachep; + +DEFINE_SPINLOCK(hnode_lock); +struct hnode *hnodes[MAX_NUMNODES]; + +void __init hnuma_init(void) +{ + unsigned int node; + + spin_lock(&hnode_lock); + for_each_node(node) + node_set(node, hnode_map); + spin_unlock(&hnode_lock); +} + +unsigned int alloc_hnode_id(void) +{ + unsigned int node; + + node = first_unset_node(hnode_map); + node_set(node, hnode_map); + + return node; +} + +void free_hnode_id(unsigned int nid) +{ + node_clear(nid, hnode_map); +} + +void hnode_init(struct hnode *hnode, unsigned int hnid, struct gm_dev *dev) +{ + hnode->id = hnid; + hnode->dev = dev; + INIT_LIST_HEAD(&hnode->freelist); + INIT_LIST_HEAD(&hnode->activelist); + spin_lock_init(&hnode->freelist_lock); + spin_lock_init(&hnode->activelist_lock); + atomic_set(&hnode->nr_free_pages, 0); + atomic_set(&hnode->nr_active_pages, 0); + hnode->import_failed = false; + hnode->max_memsize = 0; + + node_set(hnid, dev->registered_hnodes); + hnodes[hnid] = hnode; +} + +void hnode_deinit(unsigned int hnid, struct gm_dev *dev) +{ + hnodes[hnid]->id = 0; + hnodes[hnid]->dev = NULL; + node_clear(hnid, dev->registered_hnodes); + hnodes[hnid] = NULL; +} + +struct hnode *get_hnode(unsigned int hnid) +{ + if (!hnodes[hnid]) + gmem_err("h-NUMA node for hnode id %u is NULL.", hnid); + return hnodes[hnid]; +} + +struct gm_dev *get_gm_dev(unsigned int nid) +{ + struct hnode *hnode; + struct gm_dev *dev = NULL; + + spin_lock(&hnode_lock); + hnode = get_hnode(nid); + if (hnode) + dev = hnode->dev; + spin_unlock(&hnode_lock); + return dev; +} + +static void init_swapd(struct hnode *hnode); + +int gm_dev_register_hnode(struct gm_dev *dev) +{ + unsigned int hnid; + struct hnode *hnode = kmalloc(sizeof(struct hnode), GFP_KERNEL); + int ret; + + if (!hnode) + return -ENOMEM; + + spin_lock(&hnode_lock); + hnid = alloc_hnode_id(); + spin_unlock(&hnode_lock); + + if (hnid == MAX_NUMNODES) + goto 
free_hnode; + + ret = hnode_init_sysfs(hnid); + if (ret) + goto free_hnode; + + hnode_init(hnode, hnid, dev); + init_swapd(hnode); + + return GM_RET_SUCCESS; + +free_hnode: + kfree(hnode); + return -EBUSY; +} +EXPORT_SYMBOL_GPL(gm_dev_register_hnode); + +int __init gm_page_cachep_init(void) +{ + gm_page_cachep = KMEM_CACHE(gm_page, 0); + if (!gm_page_cachep) + return -EINVAL; + return 0; +} + +void gm_page_cachep_destroy(void) +{ + kmem_cache_destroy(gm_page_cachep); +} + +struct gm_page *alloc_gm_page_struct(void) +{ + struct gm_page *gm_page = kmem_cache_zalloc(gm_page_cachep, GFP_KERNEL); + + if (!gm_page) + return NULL; + atomic_set(&gm_page->refcount, 0); + return gm_page; +} +EXPORT_SYMBOL(alloc_gm_page_struct); + +void hnode_freelist_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->freelist_lock); + list_add(&gm_page->gm_page_list, &hnode->freelist); + spin_unlock(&hnode->freelist_lock); +} + +void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + list_add_tail(&gm_page->gm_page_list, &hnode->activelist); + spin_unlock(&hnode->activelist_lock); +} + +void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + list_del(&gm_page->gm_page_list); + spin_unlock(&hnode->activelist_lock); +} + +void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page) +{ + spin_lock(&hnode->activelist_lock); + list_move_tail(&gm_page->gm_page_list, &hnode->activelist); + spin_unlock(&hnode->activelist_lock); +} + +void mark_gm_page_active(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + hnode_activelist_del_and_add(hnode, gm_page); +} + +int gm_add_pages(unsigned int hnid, struct list_head *pages) +{ + struct hnode *hnode; + struct gm_page *gm_page, *n; + + hnode = get_hnode(hnid); + if (!hnode) + return -EINVAL; + + list_for_each_entry_safe(gm_page, n, pages, gm_page_list) { + list_del(&gm_page->gm_page_list); + hnode_freelist_add(hnode, gm_page); + hnode_free_pages_inc(hnode); + } + + return 0; +} +EXPORT_SYMBOL(gm_add_pages); + +void gm_free_page(struct gm_page *gm_page) +{ + struct hnode *hnode; + + hnode = get_hnode(gm_page->hnid); + if (!hnode) + return; + hnode_freelist_add(hnode, gm_page); + hnode_free_pages_inc(hnode); +} + +static int gm_evict_page_locked(struct gm_page *gm_page) +{ + struct gm_dev *gm_dev; + struct gm_mapping *gm_mapping; + struct vm_area_struct *vma; + struct mm_struct *mm = gm_page->mm; + struct page *page; + struct device *dma_dev; + unsigned long va = gm_page->va; + struct folio *folio = NULL; + struct gm_fault_t gmf = { + .mm = mm, + .va = va, + .size = HPAGE_SIZE, + .copy = true + }; + int ret = 0; + + gm_dev = get_gm_dev(gm_page->hnid); + if (!gm_dev) + return -EINVAL; + + vma = find_vma(mm, va); + if (!vma || !vma->vm_obj) { + gmem_err("%s: cannot find vma or vma->vm_obj is null for va %lx", __func__, va); + return -EINVAL; + } + + gm_mapping = vm_object_lookup(vma->vm_obj, va); + if (!gm_mapping) { + gmem_err("%s: no gm_mapping for va %lx", __func__, va); + return -EINVAL; + } + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + gmem_err("%s: evicting gm_page conflicts with unmap.", __func__); + ret = 0; + goto gm_mapping_unlock; + } + + folio = vma_alloc_folio(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, va, true); + if (!folio) { + gmem_err("%s: allocate host page failed.", __func__); + ret = -ENOMEM; + goto gm_mapping_unlock; + } + 
page = &folio->page; + + gmf.dev = gm_dev; + gmf.pfn = gm_page->dev_pfn; + dma_dev = gm_dev->dma_dev; + gmf.dma_addr = dma_map_page(dma_dev, page, 0, HPAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dma_dev, gmf.dma_addr)) { + gmem_err("%s: dma map failed.", __func__); + ret = -EINVAL; + goto gm_mapping_unlock; + } + + ret = gm_dev->mmu->peer_unmap(&gmf); + if (ret) + gmem_err("%s: peer_unmap failed.", __func__); + + dma_unmap_page(dma_dev, gmf.dma_addr, HPAGE_SIZE, DMA_BIDIRECTIONAL); + gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); + gm_mapping->page = page; + put_gm_page(gm_page); +gm_mapping_unlock: + mutex_unlock(&gm_mapping->lock); + return ret; +} + +static int gm_evict_page(struct gm_page *gm_page) +{ + struct mm_struct *mm = gm_page->mm; + int ret; + + mmap_read_lock(mm); + ret = gm_evict_page_locked(gm_page); + mmap_read_unlock(mm); + return ret; +} + +static void gm_do_swap(struct hnode *hnode) +{ + struct list_head swap_list; + struct gm_page *gm_page, *n; + unsigned int nr_swap_pages = 0; + int ret; + + INIT_LIST_HEAD(&swap_list); + + spin_lock(&hnode->activelist_lock); + list_for_each_entry_safe(gm_page, n, &hnode->activelist, gm_page_list) { + /* Move gm_page to temporary list. */ + get_gm_page(gm_page); + list_move(&gm_page->gm_page_list, &swap_list); + nr_swap_pages++; + if (nr_swap_pages >= NUM_SWAP_PAGES) + break; + } + spin_unlock(&hnode->activelist_lock); + + list_for_each_entry_safe(gm_page, n, &swap_list, gm_page_list) { + list_del(&gm_page->gm_page_list); + ret = gm_evict_page_locked(gm_page); + if (ret) { + gmem_err("%s: evict gm_page %lx failed, va %lx", __func__, + (unsigned long)gm_page, gm_page->va); + if (ret == -ENOMEM) { + /* + * Failed to allocate host page, so return gm_page + * to activelist. + */ + hnode_activelist_add(hnode, gm_page); + } else { + /* + * Conflicts with process exit, so return gm_page + * to freelist to avoid memory leak. + */ + atomic_set(&gm_page->refcount, 0); + hnode_freelist_add(hnode, gm_page); + hnode_active_pages_dec(hnode); + hnode_free_pages_inc(hnode); + } + put_gm_page(gm_page); + continue; + } + + hnode_active_pages_dec(hnode); + put_gm_page(gm_page); + } +}; + +static inline bool need_wake_up_swapd(struct hnode *hnode) +{ + return false; +} + +static int swapd_func(void *data) +{ + struct hnode *hnode = (struct hnode *)data; + + while (!kthread_should_stop()) { + if (!need_wake_up_swapd(hnode)) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + + gm_do_swap(hnode); + } + + return 0; +}; + +static void init_swapd(struct hnode *hnode) +{ + hnode->swapd_task = kthread_run(swapd_func, NULL, "gm_swapd/%u", hnode->id); + if (IS_ERR(hnode->swapd_task)) { + gmem_err("%s: create swapd task failed", __func__); + hnode->swapd_task = NULL; + } +} + +static void wake_up_swapd(struct hnode *hnode) +{ + if (likely(hnode->swapd_task)) + wake_up_process(hnode->swapd_task); +} + +static bool can_import(struct hnode *hnode) +{ + unsigned long nr_pages; + unsigned long used_mem; + + nr_pages = atomic_read(&hnode->nr_free_pages) + atomic_read(&hnode->nr_active_pages); + used_mem = nr_pages * HPAGE_SIZE; + + /* GMEM usable memory is unlimited if max_memsize is zero. */ + if (!hnode->max_memsize) + return true; + return used_mem < hnode->max_memsize; +} + +static struct gm_page *get_gm_page_from_freelist(struct hnode *hnode) +{ + struct gm_page *gm_page; + + spin_lock(&hnode->freelist_lock); + gm_page = list_first_entry_or_null(&hnode->freelist, struct gm_page, gm_page_list); + /* Delete from freelist. 
*/ + if (gm_page) { + list_del(&gm_page->gm_page_list); + hnode_free_pages_dec(hnode); + get_gm_page(gm_page); + /* TODO: wakeup swapd if needed. */ + if (need_wake_up_swapd(hnode)) + wake_up_swapd(hnode); + } + spin_unlock(&hnode->freelist_lock); + + return gm_page; +} + +/* + * gm_alloc_page - Allocate a gm_page. + * + * Allocate a gm_page from hnode freelist. If failed to allocate gm_page, try + * to import memory from device. And if failed to import memory, try to swap + * several gm_pages to host and allocate gm_page again. + */ +struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode) +{ + struct gm_page *gm_page; + struct gm_dev *gm_dev; + int retry_times = 0; + int ret = 0; + + if (hnode->dev) + gm_dev = hnode->dev; + else + return NULL; + +retry: + gm_page = get_gm_page_from_freelist(hnode); + if (!gm_page && can_import(hnode) && !hnode->import_failed) { + /* Import pages from device. */ + ret = gm_dev->mmu->import_phys_mem(mm, hnode->id, NUM_IMPORT_PAGES); + if (!ret) + goto retry; + hnode->import_failed = true; + } + + /* Try to swap pages. */ + if (!gm_page) { + if (retry_times > MAX_SWAP_RETRY_TIMES) + return NULL; + gm_do_swap(hnode); + retry_times++; + goto retry; + } + + return gm_page; +} + diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c new file mode 100644 index 000000000000..18bb7a950215 --- /dev/null +++ b/mm/gmem_stat.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GMEM statistics. + * + * Copyright (C) 2025- Huawei, Inc. + * Author: Bin Wang + * + */ + +#include +#include +#include + +static struct kobject *gm_kobj; + +struct hnode_kobject { + struct kobject kobj; + unsigned int hnid; +}; + +#define HNODE_NAME_LEN 32 + +static struct hnode *get_hnode_kobj(struct kobject *kobj) +{ + struct hnode *hnode; + struct hnode_kobject *hnode_kobj; + + hnode_kobj = container_of(kobj, struct hnode_kobject, kobj); + hnode = get_hnode(hnode_kobj->hnid); + if (!hnode) { + gmem_err("%s: failed to get hnode from kobject", __func__); + return NULL; + } + + return hnode; +} + + +static ssize_t max_memsize_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%lu\n", hnode->max_memsize); +} + +static ssize_t max_memsize_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + hnode->max_memsize = memparse(buf, NULL) & (~(HPAGE_SIZE - 1)); + return count; +} + +static struct kobj_attribute max_memsize_attr = + __ATTR(max_memsize, 0644, max_memsize_show, max_memsize_store); + +static ssize_t nr_freepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%u\n", atomic_read(&hnode->nr_free_pages)); +} + +static struct kobj_attribute nr_freepages_attr = + __ATTR(nr_freepages, 0444, nr_freepages_show, NULL); + +static ssize_t nr_activepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hnode *hnode = get_hnode_kobj(kobj); + + if (!hnode) + return -EINVAL; + + return sprintf(buf, "%u\n", atomic_read(&hnode->nr_active_pages)); +} + +static struct kobj_attribute nr_activepages_attr = + __ATTR(nr_activepages, 0444, nr_activepages_show, NULL); + +static struct attribute *hnode_attrs[] = { + &max_memsize_attr.attr, + &nr_freepages_attr.attr, + 
&nr_activepages_attr.attr, + NULL, +}; + +static struct attribute_group hnode_attr_group = { + .attrs = hnode_attrs, +}; + +static void hnode_kobj_release(struct kobject *kobj) +{ + struct hnode_kobject *hnode_kobj = + container_of(kobj, struct hnode_kobject, kobj); + kfree(hnode_kobj); +} + +static const struct kobj_type hnode_kobj_ktype = { + .release = hnode_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +int hnode_init_sysfs(unsigned int hnid) +{ + int ret; + struct hnode_kobject *hnode_kobj; + + hnode_kobj = kzalloc(sizeof(struct hnode_kobject), GFP_KERNEL); + if (!hnode_kobj) + return -ENOMEM; + + ret = kobject_init_and_add(&hnode_kobj->kobj, &hnode_kobj_ktype, + gm_kobj, "hnode%u", hnid); + if (ret) { + gmem_err("%s: failed to init hnode object", __func__); + goto free_hnode_kobj; + } + + ret = sysfs_create_group(&hnode_kobj->kobj, &hnode_attr_group); + if (ret) { + gmem_err("%s: failed to register hnode group", __func__); + goto delete_hnode_kobj; + } + + hnode_kobj->hnid = hnid; + return 0; + +delete_hnode_kobj: + kobject_put(&hnode_kobj->kobj); +free_hnode_kobj: + kfree(hnode_kobj); + return ret; +} +EXPORT_SYMBOL(hnode_init_sysfs); + +int __init gm_init_sysfs(void) +{ + gm_kobj = kobject_create_and_add("gmem", mm_kobj); + if (!gm_kobj) { + gmem_err("%s: failed to create gmem object", __func__); + return -ENOMEM; + } + + return 0; + +} +EXPORT_SYMBOL(gm_init_sysfs); + +void gm_deinit_sysfs(void) +{ + kobject_put(gm_kobj); +} diff --git a/mm/mmap.c b/mm/mmap.c index fa8a6ba93070..02f67a68e3e3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2625,7 +2625,7 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar enum gm_ret ret; struct gm_context *ctx, *tmp; struct gm_mapping *gm_mapping; - + struct hnode *hnode; struct gm_fault_t gmf = { .mm = mm, .copy = false, @@ -2664,11 +2664,21 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar gmf.size = HPAGE_SIZE; gmf.dev = gm_mapping->dev; ret = gm_mapping->dev->mmu->peer_unmap(&gmf); - if (ret != GM_RET_SUCCESS) { - gmem_err("%s: call dev peer_unmap error %d\n", __func__, ret); + if (ret != GM_RET_SUCCESS) + gmem_err("%s: call dev peer_unmap error %d", __func__, ret); + + /* + * Regardless of whether the gm_page is unmapped, we should release it. + */ + hnode = get_hnode(gm_mapping->gm_page->hnid); + if (!hnode) { mutex_unlock(&gm_mapping->lock); continue; } + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + gm_mapping->gm_page = NULL; mutex_unlock(&gm_mapping->lock); } while (addr += HPAGE_SIZE, addr != end); @@ -2806,10 +2816,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, locked_vm += vma_pages(next); count++; -#ifdef CONFIG_GMEM - if (gmem_is_enabled()) - munmap_single_vma_in_peer_devices(mm, vma, start, end); -#endif if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas -- Gitee From 46f800586bd5812e0ac72c24d3fef8de0ce1699d Mon Sep 17 00:00:00 2001 From: Bin Wang Date: Tue, 23 Sep 2025 21:33:47 +0800 Subject: [PATCH 23/27] gmem_phys: Fix the oops issue caused by concurrency in the evict and unmap processes. euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- When exiting process unmaps vma, swap process may be in a window when it can see rmap (gm_page->mm and gm_page->va). This cause a UAF bug. add rmap and remove rmap should be protected by lock. 
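The window is closed by making both sides agree on one lock: the fault path installs
the reverse mapping, and the unmap/exit and eviction paths clear or read it, only under
gm_page->rmap_lock. The following is a condensed user-space model of that protocol
(pthread spinlock and a dummy mm_struct standing in for the kernel types); it is
illustrative only, not the kernel code.

    #include <pthread.h>
    #include <stddef.h>

    struct mm_struct { int dummy; };        /* stand-in for the kernel type */

    struct gm_page_model {
        pthread_spinlock_t rmap_lock;
        struct mm_struct *mm;               /* NULL once the VA is unmapped */
        unsigned long va;
    };

    static void add_rmap(struct gm_page_model *p, struct mm_struct *mm, unsigned long va)
    {
        pthread_spin_lock(&p->rmap_lock);
        p->mm = mm;
        p->va = va;
        pthread_spin_unlock(&p->rmap_lock);
    }

    static void remove_rmap(struct gm_page_model *p)    /* unmap / exit side */
    {
        pthread_spin_lock(&p->rmap_lock);
        p->mm = NULL;
        p->va = 0;
        pthread_spin_unlock(&p->rmap_lock);
    }

    /* Eviction side: take a consistent snapshot or back off. Returns 0 on
     * success, -1 if the page was unmapped before the lock was taken. */
    static int evict_lookup(struct gm_page_model *p, struct mm_struct **mm, unsigned long *va)
    {
        int ret = -1;

        pthread_spin_lock(&p->rmap_lock);
        if (p->mm) {
            *mm = p->mm;
            *va = p->va;
            ret = 0;
        }
        pthread_spin_unlock(&p->rmap_lock);
        return ret;
    }

    int main(void)
    {
        struct gm_page_model page;
        struct mm_struct fake_mm;
        struct mm_struct *mm;
        unsigned long va;

        pthread_spin_init(&page.rmap_lock, PTHREAD_PROCESS_PRIVATE);
        add_rmap(&page, &fake_mm, 0x200000);
        remove_rmap(&page);                     /* the exiting task wins */
        return evict_lookup(&page, &mm, &va) ? 0 : 1;   /* expect back-off */
    }

In the kernel the snapshot also has to stay valid after rmap_lock is dropped, which is
why gm_evict_page_locked() re-checks gm_mapping_device() and gm_mapping->gm_page under
gm_mapping->lock before touching the mm.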
Signed-off-by: Bin Wang --- include/linux/gmem.h | 54 ++++++++++------- mm/gmem.c | 44 ++++++-------- mm/gmem_phys.c | 139 +++++++++++++++++++++++++++++-------------- mm/gmem_stat.c | 42 +++++++++++++ mm/memory.c | 74 +++++++++++++++++++++++ mm/mmap.c | 5 +- mm/vm_object.c | 2 +- 7 files changed, 266 insertions(+), 94 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 736e2d4feb4d..393d84de499e 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -224,14 +224,13 @@ struct gm_dev { struct gm_mapping *gm_mapping; }; -#define GM_PAGE_DIRTY 0x8 /* Whether the page is dirty */ -#define GM_PAGE_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ -#define GM_PAGE_DEVICE 0x20 -#define GM_PAGE_NOMAP 0x40 -#define GM_PAGE_PINNED 0x80 -#define GM_PAGE_WILLNEED 0x100 +#define GM_MAPPING_CPU 0x10 /* Determines whether page is a pointer or a pfn number. */ +#define GM_MAPPING_DEVICE 0x20 +#define GM_MAPPING_NOMAP 0x40 +#define GM_MAPPING_PINNED 0x80 +#define GM_MAPPING_WILLNEED 0x100 -#define GM_PAGE_TYPE_MASK (GM_PAGE_CPU | GM_PAGE_DEVICE | GM_PAGE_NOMAP) +#define GM_MAPPING_TYPE_MASK (GM_MAPPING_CPU | GM_MAPPING_DEVICE | GM_MAPPING_NOMAP) /* Records the status of a page-size physical page */ struct gm_mapping { @@ -248,8 +247,8 @@ struct gm_mapping { static inline void gm_mapping_flags_set(struct gm_mapping *gm_mapping, int flags) { - if (flags & GM_PAGE_TYPE_MASK) - gm_mapping->flag &= ~GM_PAGE_TYPE_MASK; + if (flags & GM_MAPPING_TYPE_MASK) + gm_mapping->flag &= ~GM_MAPPING_TYPE_MASK; gm_mapping->flag |= flags; } @@ -261,27 +260,17 @@ static inline void gm_mapping_flags_clear(struct gm_mapping *gm_mapping, int fla static inline bool gm_mapping_cpu(struct gm_mapping *gm_mapping) { - return !!(gm_mapping->flag & GM_PAGE_CPU); + return !!(gm_mapping->flag & GM_MAPPING_CPU); } static inline bool gm_mapping_device(struct gm_mapping *gm_mapping) { - return !!(gm_mapping->flag & GM_PAGE_DEVICE); + return !!(gm_mapping->flag & GM_MAPPING_DEVICE); } static inline bool gm_mapping_nomap(struct gm_mapping *gm_mapping) { - return !!(gm_mapping->flag & GM_PAGE_NOMAP); -} - -static inline bool gm_mapping_willneed(struct gm_mapping *gm_mapping) -{ - return !!(gm_mapping->flag & GM_PAGE_WILLNEED); -} - -static inline bool gm_mapping_pinned(struct gm_mapping *gm_mapping) -{ - return !!(gm_mapping->flag & GM_PAGE_PINNED); + return !!(gm_mapping->flag & GM_MAPPING_NOMAP); } #define test_gm_mapping_mapped_on_node(i) { /* implement this */ } @@ -392,10 +381,29 @@ struct gm_page { * */ unsigned long va; struct mm_struct *mm; + spinlock_t rmap_lock; + unsigned int flag; atomic_t refcount; }; +#define GM_PAGE_EVICTING 0x1 + +static inline void gm_page_flags_set(struct gm_page *gm_page, int flags) +{ + gm_page->flag |= flags; +} + +static inline void gm_page_flags_clear(struct gm_page *gm_page, int flags) +{ + gm_page->flag &= ~flags; +} + +static inline bool gm_page_evicting(struct gm_page *gm_page) +{ + return !!(gm_page->flag & GM_PAGE_EVICTING); +} + #define NUM_IMPORT_PAGES 16 int __init gm_page_cachep_init(void); @@ -406,6 +414,8 @@ void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); void mark_gm_page_active(struct gm_page *gm_page); +void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va); +void gm_page_remove_rmap(struct gm_page *gm_page); int 
gm_add_pages(unsigned int hnid, struct list_head *pages); void gm_free_page(struct gm_page *gm_page); struct gm_page *gm_alloc_page(struct mm_struct *mm, struct hnode *hnode); diff --git a/mm/gmem.c b/mm/gmem.c index b18a4b3ed850..6898126221cd 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -302,7 +302,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc page = gm_mapping->page; if (!page) { gmem_err("host gm_mapping page is NULL. Set nomap"); - gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); goto unlock; } get_page(page); @@ -328,21 +328,9 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc ret = mmu->peer_map(&gmf); if (ret != GM_RET_SUCCESS) { - if (ret == GM_RET_MIGRATING) { - /* - * gmem page is migrating due to overcommit. - * update page to willneed and this will stop page evicting - */ - gm_mapping_flags_set(gm_mapping, GM_PAGE_WILLNEED); - gmem_stats_counter(NR_PAGE_MIGRATING_D2H, 1); - ret = GM_RET_SUCCESS; - } else { - gmem_err("peer map failed"); - if (page) { - gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); - put_page(page); - } - } + gmem_err("peer map failed"); + if (page) + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); goto unlock; } @@ -351,10 +339,9 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc folio_put(page_folio(page)); } - gm_mapping_flags_set(gm_mapping, GM_PAGE_DEVICE); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_DEVICE); gm_mapping->dev = dev; - gm_page->va = addr; - gm_page->mm = mm; + gm_page_add_rmap(gm_page, mm, addr); gm_mapping->gm_page = gm_page; hnode_activelist_add(hnode, gm_page); hnode_active_pages_inc(hnode); @@ -408,6 +395,7 @@ vm_fault_t gm_host_fault_locked(struct vm_fault *vmf, dma_unmap_page(dma_dev, gmf.dma_addr, size, DMA_BIDIRECTIONAL); hnode = get_hnode(gm_mapping->gm_page->hnid); + gm_page_remove_rmap(gm_mapping->gm_page); hnode_activelist_del(hnode, gm_mapping->gm_page); hnode_active_pages_dec(hnode); put_gm_page(gm_mapping->gm_page); @@ -648,11 +636,12 @@ static int gmem_unmap_vma_pages(struct vm_area_struct *vma, unsigned long start, continue; } hnode = get_hnode(gm_mapping->gm_page->hnid); + gm_page_remove_rmap(gm_mapping->gm_page); hnode_activelist_del(hnode, gm_mapping->gm_page); hnode_active_pages_dec(hnode); put_gm_page(gm_mapping->gm_page); } - gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); mutex_unlock(&gm_mapping->lock); } @@ -789,7 +778,7 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, vma_src = find_vma(mm, src); if (!vma_src || vma_src->vm_start > src || !vma_dest || vma_dest->vm_start > dest) { - gmem_err("hmemcpy: the vma find by src/dest is NULL!\n"); + gmem_err("hmemcpy: the vma find by src/dest is NULL!"); goto unlock_mm; } @@ -797,14 +786,19 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest, gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1)); if (!gm_mapping_src) { - gmem_err("hmemcpy: gm_mapping_src is NULL\n"); + gmem_err("hmemcpy: gm_mapping_src is NULL"); + goto unlock_mm; + } + + if (gm_mapping_nomap(gm_mapping_src)) { + gmem_err("hmemcpy: src address is not mapping to CPU or device"); goto unlock_mm; } if (hnid != -1) { dev = get_gm_dev(hnid); if (!dev) { - gmem_err("hmemcpy: hnode's dev is NULL\n"); + gmem_err("hmemcpy: hnode's dev is NULL"); goto unlock_mm; } } @@ -816,14 +810,14 @@ static void do_hmemcpy(struct mm_struct *mm, 
int hnid, unsigned long dest, ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER | FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL); if (ret) { - gmem_err("%s: failed to execute host page fault, ret:%d\n", + gmem_err("%s: failed to execute host page fault, ret:%d", __func__, ret); goto unlock_mm; } } else { ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED); if (ret != GM_RET_SUCCESS) { - gmem_err("%s: failed to excecute dev page fault.\n", __func__); + gmem_err("%s: failed to excecute dev page fault.", __func__); goto unlock_mm; } } diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c index 1ff4407d5aff..cd4d4b875c3d 100644 --- a/mm/gmem_phys.c +++ b/mm/gmem_phys.c @@ -147,6 +147,7 @@ struct gm_page *alloc_gm_page_struct(void) if (!gm_page) return NULL; atomic_set(&gm_page->refcount, 0); + spin_lock_init(&gm_page->rmap_lock); return gm_page; } EXPORT_SYMBOL(alloc_gm_page_struct); @@ -168,7 +169,10 @@ void hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page) void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page) { spin_lock(&hnode->activelist_lock); - list_del(&gm_page->gm_page_list); + /* If a gm_page is being evicted, it is currently located in the + * temporary linked list. */ + if (!gm_page_evicting(gm_page)) + list_del_init(&gm_page->gm_page_list); spin_unlock(&hnode->activelist_lock); } @@ -219,82 +223,133 @@ void gm_free_page(struct gm_page *gm_page) hnode_free_pages_inc(hnode); } -static int gm_evict_page_locked(struct gm_page *gm_page) +void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va) +{ + spin_lock(&gm_page->rmap_lock); + gm_page->mm = mm; + gm_page->va = va; + spin_unlock(&gm_page->rmap_lock); +} + +void gm_page_remove_rmap(struct gm_page *gm_page) +{ + spin_lock(&gm_page->rmap_lock); + gm_page->mm = NULL; + gm_page->va = 0; + spin_unlock(&gm_page->rmap_lock); +} + +enum gm_evict_ret { + GM_EVICT_SUCCESS = 0, + GM_EVICT_UNMAP, + GM_EVICT_FALLBACK, + GM_EVICT_DEVERR, +}; + +enum gm_evict_ret gm_evict_page_locked(struct gm_page *gm_page) { struct gm_dev *gm_dev; struct gm_mapping *gm_mapping; struct vm_area_struct *vma; - struct mm_struct *mm = gm_page->mm; + struct mm_struct *mm; struct page *page; struct device *dma_dev; - unsigned long va = gm_page->va; + unsigned long va; struct folio *folio = NULL; struct gm_fault_t gmf = { - .mm = mm, - .va = va, .size = HPAGE_SIZE, .copy = true }; - int ret = 0; + enum gm_evict_ret ret = GM_EVICT_SUCCESS; + int err; gm_dev = get_gm_dev(gm_page->hnid); if (!gm_dev) - return -EINVAL; + return GM_EVICT_DEVERR; + + spin_lock(&gm_page->rmap_lock); + if (!gm_page->mm) { + /* Evicting gm_page conflicts with unmap.*/ + ret = GM_EVICT_UNMAP; + goto rmap_unlock; + } + mm = gm_page->mm; + va = gm_page->va; vma = find_vma(mm, va); if (!vma || !vma->vm_obj) { gmem_err("%s: cannot find vma or vma->vm_obj is null for va %lx", __func__, va); - return -EINVAL; + ret = GM_EVICT_UNMAP; + goto rmap_unlock; } gm_mapping = vm_object_lookup(vma->vm_obj, va); if (!gm_mapping) { gmem_err("%s: no gm_mapping for va %lx", __func__, va); - return -EINVAL; + ret = GM_EVICT_UNMAP; + goto rmap_unlock; } + spin_unlock(&gm_page->rmap_lock); + mutex_lock(&gm_mapping->lock); if (!gm_mapping_device(gm_mapping)) { - gmem_err("%s: evicting gm_page conflicts with unmap.", __func__); - ret = 0; + /* Evicting gm_page conflicts with unmap.*/ + ret = GM_EVICT_UNMAP; + goto gm_mapping_unlock; + } + + if (gm_mapping->gm_page != gm_page) { + /* gm_mapping maps to another gm_page. 
*/ + ret = GM_EVICT_UNMAP; goto gm_mapping_unlock; } folio = vma_alloc_folio(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma, va, true); if (!folio) { gmem_err("%s: allocate host page failed.", __func__); - ret = -ENOMEM; + ret = GM_EVICT_FALLBACK; goto gm_mapping_unlock; } page = &folio->page; + gmf.mm = mm; + gmf.va = va; gmf.dev = gm_dev; gmf.pfn = gm_page->dev_pfn; dma_dev = gm_dev->dma_dev; gmf.dma_addr = dma_map_page(dma_dev, page, 0, HPAGE_SIZE, DMA_BIDIRECTIONAL); if (dma_mapping_error(dma_dev, gmf.dma_addr)) { gmem_err("%s: dma map failed.", __func__); - ret = -EINVAL; + ret = GM_EVICT_FALLBACK; goto gm_mapping_unlock; } - ret = gm_dev->mmu->peer_unmap(&gmf); - if (ret) + err = gm_dev->mmu->peer_unmap(&gmf); + if (err) { gmem_err("%s: peer_unmap failed.", __func__); + ret = GM_EVICT_DEVERR; + goto dma_unmap; + } - dma_unmap_page(dma_dev, gmf.dma_addr, HPAGE_SIZE, DMA_BIDIRECTIONAL); - gm_mapping_flags_set(gm_mapping, GM_PAGE_CPU); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + gm_page_remove_rmap(gm_page); gm_mapping->page = page; put_gm_page(gm_page); +dma_unmap: + dma_unmap_page(dma_dev, gmf.dma_addr, HPAGE_SIZE, DMA_BIDIRECTIONAL); gm_mapping_unlock: mutex_unlock(&gm_mapping->lock); return ret; +rmap_unlock: + spin_unlock(&gm_page->rmap_lock); + return ret; } -static int gm_evict_page(struct gm_page *gm_page) +enum gm_evict_ret gm_evict_page(struct gm_page *gm_page) { struct mm_struct *mm = gm_page->mm; - int ret; + enum gm_evict_ret ret; mmap_read_lock(mm); ret = gm_evict_page_locked(gm_page); @@ -315,6 +370,7 @@ static void gm_do_swap(struct hnode *hnode) list_for_each_entry_safe(gm_page, n, &hnode->activelist, gm_page_list) { /* Move gm_page to temporary list. */ get_gm_page(gm_page); + gm_page_flags_set(gm_page, GM_PAGE_EVICTING); list_move(&gm_page->gm_page_list, &swap_list); nr_swap_pages++; if (nr_swap_pages >= NUM_SWAP_PAGES) @@ -323,33 +379,26 @@ static void gm_do_swap(struct hnode *hnode) spin_unlock(&hnode->activelist_lock); list_for_each_entry_safe(gm_page, n, &swap_list, gm_page_list) { - list_del(&gm_page->gm_page_list); + list_del_init(&gm_page->gm_page_list); ret = gm_evict_page_locked(gm_page); - if (ret) { - gmem_err("%s: evict gm_page %lx failed, va %lx", __func__, - (unsigned long)gm_page, gm_page->va); - if (ret == -ENOMEM) { - /* - * Failed to allocate host page, so return gm_page - * to activelist. - */ - hnode_activelist_add(hnode, gm_page); - } else { - /* - * Conflicts with process exit, so return gm_page - * to freelist to avoid memory leak. - */ - atomic_set(&gm_page->refcount, 0); - hnode_freelist_add(hnode, gm_page); - hnode_active_pages_dec(hnode); - hnode_free_pages_inc(hnode); - } + gm_page_flags_clear(gm_page, GM_PAGE_EVICTING); + if (ret == GM_EVICT_UNMAP) { + /* Evicting gm_page conflicts with unmap.*/ + put_gm_page(gm_page); + } else if (ret == GM_EVICT_FALLBACK) { + /* An error occurred with the host, and gm_page needs + * to be added back to the activelist. */ + hnode_activelist_add(hnode, gm_page); + put_gm_page(gm_page); + } else if (ret == GM_EVICT_DEVERR) { + /* It generally occurs when the process has already + * exited, at which point gm_page needs to be returned + * to the freelist. 
*/ + put_gm_page(gm_page); + } else { + hnode_active_pages_dec(hnode); put_gm_page(gm_page); - continue; } - - hnode_active_pages_dec(hnode); - put_gm_page(gm_page); } }; @@ -411,7 +460,7 @@ static struct gm_page *get_gm_page_from_freelist(struct hnode *hnode) gm_page = list_first_entry_or_null(&hnode->freelist, struct gm_page, gm_page_list); /* Delete from freelist. */ if (gm_page) { - list_del(&gm_page->gm_page_list); + list_del_init(&gm_page->gm_page_list); hnode_free_pages_dec(hnode); get_gm_page(gm_page); /* TODO: wakeup swapd if needed. */ diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c index 18bb7a950215..9c9eb3ef0de2 100644 --- a/mm/gmem_stat.c +++ b/mm/gmem_stat.c @@ -91,10 +91,52 @@ static ssize_t nr_activepages_show(struct kobject *kobj, static struct kobj_attribute nr_activepages_attr = __ATTR(nr_activepages, 0444, nr_activepages_show, NULL); +static ssize_t nr_freelist_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int nr_freelist = 0; + struct gm_page *gm_page; + struct hnode *hnode = get_hnode_kobj(kobj); + if (!hnode) + return -EINVAL; + + spin_lock(&hnode->freelist_lock); + list_for_each_entry(gm_page, &hnode->freelist, gm_page_list) { + nr_freelist++; + } + spin_unlock(&hnode->freelist_lock); + return sprintf(buf, "%u\n", nr_freelist); +} + +static struct kobj_attribute nr_freelist_attr = + __ATTR(nr_freelist, 0444, nr_freelist_show, NULL); + +static ssize_t nr_activelist_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int nr_activelist = 0; + struct gm_page *gm_page; + struct hnode *hnode = get_hnode_kobj(kobj); + if (!hnode) + return -EINVAL; + + spin_lock(&hnode->activelist_lock); + list_for_each_entry(gm_page, &hnode->activelist, gm_page_list) { + nr_activelist++; + } + spin_unlock(&hnode->activelist_lock); + return sprintf(buf, "%u\n", nr_activelist); +} + +static struct kobj_attribute nr_activelist_attr = + __ATTR(nr_activelist, 0444, nr_activelist_show, NULL); + static struct attribute *hnode_attrs[] = { &max_memsize_attr.attr, &nr_freepages_attr.attr, &nr_activepages_attr.attr, + &nr_freelist_attr.attr, + &nr_activelist_attr.attr, NULL, }; diff --git a/mm/memory.c b/mm/memory.c index 100769eae24f..8891831579e4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1937,6 +1937,77 @@ static void unmap_single_vma(struct mmu_gather *tlb, } } +static void unmap_single_peer_shared_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end, addr; + struct vm_object *obj = vma->vm_obj; + enum gm_ret ret; + struct gm_mapping *gm_mapping; + struct hnode *hnode; + struct gm_fault_t gmf = { + .mm = mm, + .copy = false, + }; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return; + addr = start; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (!obj) + return; + + if (!mm->gm_as) + return; + + do { + xa_lock(obj->logical_page_table); + gm_mapping = vm_object_lookup(obj, addr); + if (!gm_mapping) { + xa_unlock(obj->logical_page_table); + continue; + } + xa_unlock(obj->logical_page_table); + + mutex_lock(&gm_mapping->lock); + if (!gm_mapping_device(gm_mapping)) { + mutex_unlock(&gm_mapping->lock); + continue; + } + + /* In fact, during the exit_mmap process of the host, we do not + * need to call peer_unmap to release the memory within the NPU + * card, as the NPU card has an independent process that will + * handle the unmap operation. 
*/ + //gmf.va = addr; + //gmf.size = HPAGE_SIZE; + //gmf.pfn = gm_mapping->gm_page->dev_pfn; + //gmf.dev = gm_mapping->dev; + //ret = gm_mapping->dev->mmu->peer_unmap(&gmf); + //if (ret != GM_RET_SUCCESS) + // gmem_err("%s: call dev peer_unmap error %d", __func__, ret); + + /* + * Regardless of whether the gm_page is unmapped, we should release it. + */ + hnode = get_hnode(gm_mapping->gm_page->hnid); + if (!hnode) { + mutex_unlock(&gm_mapping->lock); + continue; + } + gm_page_remove_rmap(gm_mapping->gm_page); + hnode_activelist_del(hnode, gm_mapping->gm_page); + hnode_active_pages_dec(hnode); + put_gm_page(gm_mapping->gm_page); + gm_mapping->gm_page = NULL; + mutex_unlock(&gm_mapping->lock); + } while (addr += HPAGE_SIZE, addr != end); +} + /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather @@ -1980,6 +2051,9 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); +#ifdef CONFIG_GMEM + unmap_single_peer_shared_vma(vma->vm_mm, vma, start, end); +#endif vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); diff --git a/mm/mmap.c b/mm/mmap.c index 02f67a68e3e3..2e777ad31323 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2618,7 +2618,7 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, #ifdef CONFIG_GMEM static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr) + unsigned long start_addr, unsigned long end_addr) { unsigned long start, end, addr; struct vm_object *obj = vma->vm_obj; @@ -2662,6 +2662,7 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar gmf.va = addr; gmf.size = HPAGE_SIZE; + gmf.pfn = gm_mapping->gm_page->dev_pfn; gmf.dev = gm_mapping->dev; ret = gm_mapping->dev->mmu->peer_unmap(&gmf); if (ret != GM_RET_SUCCESS) @@ -2675,9 +2676,11 @@ static void munmap_single_vma_in_peer_devices(struct mm_struct *mm, struct vm_ar mutex_unlock(&gm_mapping->lock); continue; } + gm_page_remove_rmap(gm_mapping->gm_page); hnode_activelist_del(hnode, gm_mapping->gm_page); hnode_active_pages_dec(hnode); put_gm_page(gm_mapping->gm_page); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); gm_mapping->gm_page = NULL; mutex_unlock(&gm_mapping->lock); } while (addr += HPAGE_SIZE, addr != end); diff --git a/mm/vm_object.c b/mm/vm_object.c index b30b69f81167..42219e8ff42b 100644 --- a/mm/vm_object.c +++ b/mm/vm_object.c @@ -51,7 +51,7 @@ struct gm_mapping *alloc_gm_mapping(void) if (!gm_mapping) return NULL; - gm_mapping_flags_set(gm_mapping, GM_PAGE_NOMAP); + gm_mapping_flags_set(gm_mapping, GM_MAPPING_NOMAP); mutex_init(&gm_mapping->lock); return gm_mapping; -- Gitee From 0aa0cb551a1f11158a02c65483a2c1e275feb974 Mon Sep 17 00:00:00 2001 From: Bin Wang Date: Thu, 25 Sep 2025 09:14:02 +0800 Subject: [PATCH 24/27] gmem_phys: Fix memory leak issue after peer_map failure. euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Release gm_page when fail to do device mapping. 
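The leak comes from the reference that gm_alloc_page() hands back: the page leaves the
freelist with its refcount raised in get_gm_page_from_freelist(), so every exit path
after the allocation must either hand that reference over to gm_mapping->gm_page or
drop it with put_gm_page(), which returns the page to the freelist when the count hits
zero. A reduced user-space model of that discipline, illustrative only:

    #include <stdatomic.h>
    #include <stdio.h>

    struct gm_page_model {
        atomic_int refcount;
        int on_freelist;
    };

    /* Mirrors put_gm_page(): the last reference sends the page back. */
    static void put_page_model(struct gm_page_model *p)
    {
        if (atomic_fetch_sub(&p->refcount, 1) == 1)
            p->on_freelist = 1;             /* gm_free_page() equivalent */
    }

    /* Mirrors the device-fault path: allocation takes a reference, and a
     * failed peer_map must give it back (the line added by this patch). */
    static int fault_path_model(struct gm_page_model *p, int peer_map_ok)
    {
        atomic_fetch_add(&p->refcount, 1);  /* gm_alloc_page() */
        p->on_freelist = 0;

        if (!peer_map_ok) {
            put_page_model(p);              /* without this the page is lost */
            return -1;
        }
        return 0;                           /* ref now owned by gm_mapping */
    }

    int main(void)
    {
        struct gm_page_model page = { 0 };

        fault_path_model(&page, 0);
        printf("returned to freelist after failed peer_map: %d\n", page.on_freelist);
        return 0;
    }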
Signed-off-by: Bin Wang --- mm/gmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/gmem.c b/mm/gmem.c index 6898126221cd..6378242c111d 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -331,6 +331,7 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc gmem_err("peer map failed"); if (page) gm_mapping_flags_set(gm_mapping, GM_MAPPING_CPU); + put_gm_page(gm_page); goto unlock; } -- Gitee From 76d78abac719bcc7a8c115907740936ca695f443 Mon Sep 17 00:00:00 2001 From: Bin Wang Date: Thu, 25 Sep 2025 09:34:35 +0800 Subject: [PATCH 25/27] gmem_stat: Remove read permission for regular users euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --------------------------------------------- Only root can read active and free pages number. Signed-off-by: Bin Wang --- mm/gmem_stat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/gmem_stat.c b/mm/gmem_stat.c index 9c9eb3ef0de2..dbf4de2151cb 100644 --- a/mm/gmem_stat.c +++ b/mm/gmem_stat.c @@ -61,7 +61,7 @@ static ssize_t max_memsize_store(struct kobject *kobj, } static struct kobj_attribute max_memsize_attr = - __ATTR(max_memsize, 0644, max_memsize_show, max_memsize_store); + __ATTR(max_memsize, 0640, max_memsize_show, max_memsize_store); static ssize_t nr_freepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -75,7 +75,7 @@ static ssize_t nr_freepages_show(struct kobject *kobj, } static struct kobj_attribute nr_freepages_attr = - __ATTR(nr_freepages, 0444, nr_freepages_show, NULL); + __ATTR(nr_freepages, 0440, nr_freepages_show, NULL); static ssize_t nr_activepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -109,7 +109,7 @@ static ssize_t nr_freelist_show(struct kobject *kobj, } static struct kobj_attribute nr_freelist_attr = - __ATTR(nr_freelist, 0444, nr_freelist_show, NULL); + __ATTR(nr_freelist, 0440, nr_freelist_show, NULL); static ssize_t nr_activelist_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -129,7 +129,7 @@ static ssize_t nr_activelist_show(struct kobject *kobj, } static struct kobj_attribute nr_activelist_attr = - __ATTR(nr_activelist, 0444, nr_activelist_show, NULL); + __ATTR(nr_activelist, 0440, nr_activelist_show, NULL); static struct attribute *hnode_attrs[] = { &max_memsize_attr.attr, -- Gitee From 60355e03466e3206bbdfbbd7e6d4b560b30d8659 Mon Sep 17 00:00:00 2001 From: Super User Date: Wed, 15 Oct 2025 19:08:55 +0800 Subject: [PATCH 26/27] gmem: support pin and unpin gm_page euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN --- include/linux/gmem.h | 8 +++++ include/uapi/asm-generic/mman-common.h | 1 + mm/gmem.c | 37 +++++++++++++++----- mm/gmem_phys.c | 48 ++++++++++++++++++++++++++ mm/huge_memory.c | 6 ++++ 5 files changed, 91 insertions(+), 9 deletions(-) diff --git a/include/linux/gmem.h b/include/linux/gmem.h index 393d84de499e..23e87f2d7fe3 100644 --- a/include/linux/gmem.h +++ b/include/linux/gmem.h @@ -388,6 +388,7 @@ struct gm_page { }; #define GM_PAGE_EVICTING 0x1 +#define GM_PAGE_PINNED 0x2 static inline void gm_page_flags_set(struct gm_page *gm_page, int flags) { @@ -404,6 +405,11 @@ static inline bool gm_page_evicting(struct gm_page *gm_page) return !!(gm_page->flag & GM_PAGE_EVICTING); } +static inline bool gm_page_pinned(struct gm_page *gm_page) +{ + return !!(gm_page->flag & GM_PAGE_PINNED); +} + #define NUM_IMPORT_PAGES 16 int __init gm_page_cachep_init(void); @@ -414,6 +420,8 @@ void 
hnode_activelist_add(struct hnode *hnode, struct gm_page *gm_page); void hnode_activelist_del(struct hnode *hnode, struct gm_page *gm_page); void hnode_activelist_del_and_add(struct hnode *hnode, struct gm_page *gm_page); void mark_gm_page_active(struct gm_page *gm_page); +void mark_gm_page_pinned(struct gm_page *gm_page); +void mark_gm_page_unpinned(struct gm_page *gm_page); void gm_page_add_rmap(struct gm_page *gm_page, struct mm_struct *mm, unsigned long va); void gm_page_remove_rmap(struct gm_page *gm_page); int gm_add_pages(unsigned int hnid, struct list_head *pages); diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index d8857c71d4bb..19e22492a85b 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -85,6 +85,7 @@ #define MADV_GMEM_BASE 0x1000 #define MADV_PREFETCH MADV_GMEM_BASE /* prefetch pages for hNUMA node */ #define MADV_PINNED (MADV_GMEM_BASE+1) /* pin these pages */ +#define MADV_PINNED_REMOVE (MADV_GMEM_BASE+2) /* unpin these pages */ #define MADV_ETMEM_BASE 0x1100 #define MADV_SWAPFLAG MADV_ETMEM_BASE /* for memory to be swap out */ diff --git a/mm/gmem.c b/mm/gmem.c index 6378242c111d..abb910d84abe 100644 --- a/mm/gmem.c +++ b/mm/gmem.c @@ -291,12 +291,19 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc if (gm_mapping_nomap(gm_mapping)) { goto peer_map; } else if (gm_mapping_device(gm_mapping)) { - if (behavior == MADV_WILLNEED) { - mark_gm_page_active(gm_mapping->gm_page); - goto unlock; - } else { - ret = 0; - goto unlock; + switch (behavior) { + case MADV_PINNED: + mark_gm_page_pinned(gm_mapping->gm_page); + fallthrough; + case MADV_WILLNEED: + mark_gm_page_active(gm_mapping->gm_page); + goto unlock; + case MADV_PINNED_REMOVE: + mark_gm_page_unpinned(gm_mapping->gm_page); + goto unlock; + default: + ret = 0; + goto unlock; } } else if (gm_mapping_cpu(gm_mapping)) { page = gm_mapping->page; @@ -344,6 +351,11 @@ enum gm_ret gm_dev_fault_locked(struct mm_struct *mm, unsigned long addr, struc gm_mapping->dev = dev; gm_page_add_rmap(gm_page, mm, addr); gm_mapping->gm_page = gm_page; + if (behavior == MADV_PINNED) { + mark_gm_page_pinned(gm_page); + } else if (behavior == MADV_PINNED_REMOVE) { + mark_gm_page_unpinned(gm_page); + } hnode_activelist_add(hnode, gm_page); hnode_active_pages_inc(hnode); unlock: @@ -494,6 +506,7 @@ struct prefetch_data { unsigned long addr; size_t size; struct work_struct work; + int behavior; int *res; }; @@ -508,7 +521,7 @@ static void prefetch_work_cb(struct work_struct *work) do { /* MADV_WILLNEED: dev will soon access this addr. 
*/ mmap_read_lock(d->mm); - ret = gm_dev_fault_locked(d->mm, addr, d->dev, MADV_WILLNEED); + ret = gm_dev_fault_locked(d->mm, addr, d->dev, d->behavior); mmap_read_unlock(d->mm); if (ret == GM_RET_PAGE_EXIST) { gmem_err("%s: device has done page fault, ignore prefetch\n", @@ -522,7 +535,7 @@ static void prefetch_work_cb(struct work_struct *work) kfree(d); } -static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size) +static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t size, int behavior) { unsigned long start, end, per_size; int page_size = HPAGE_SIZE; @@ -578,6 +591,7 @@ static int hmadvise_do_prefetch(struct gm_dev *dev, unsigned long addr, size_t s data->mm = current->mm; data->dev = dev; data->addr = start; + data->behavior = behavior; data->res = &res; if (per_size == 0) data->size = size; @@ -745,7 +759,12 @@ int hmadvise_inner(int hnid, unsigned long start, size_t len_in, int behavior) no_hnid: switch (behavior) { case MADV_PREFETCH: - return hmadvise_do_prefetch(dev, start, len_in); + behavior = MADV_WILLNEED; + fallthrough; + case MADV_PINNED_REMOVE: + fallthrough; + case MADV_PINNED: + return hmadvise_do_prefetch(dev, start, len_in, behavior); case MADV_DONTNEED: return hmadvise_do_eagerfree(start, len_in); default: diff --git a/mm/gmem_phys.c b/mm/gmem_phys.c index cd4d4b875c3d..10531edccfc3 100644 --- a/mm/gmem_phys.c +++ b/mm/gmem_phys.c @@ -193,6 +193,45 @@ void mark_gm_page_active(struct gm_page *gm_page) hnode_activelist_del_and_add(hnode, gm_page); } +void mark_gm_page_pinned(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + spin_lock(&hnode->activelist_lock); + if (gm_page_evicting(gm_page)) { + gmem_err("%s: maybe page has been evicted!", __func__); + goto unlock; + } else if (gm_page_pinned(gm_page)) { + goto unlock; + } + gm_page_flags_set(gm_page, GM_PAGE_PINNED); + +unlock: + spin_unlock(&hnode->activelist_lock); + return; +} + +void mark_gm_page_unpinned(struct gm_page *gm_page) +{ + struct hnode *hnode = get_hnode(gm_page->hnid); + + if (!hnode) + return; + + spin_lock(&hnode->activelist_lock); + if (!gm_page_pinned(gm_page) || gm_page_evicting(gm_page)) { + goto unlock; + } + gm_page_flags_clear(gm_page, GM_PAGE_PINNED); + +unlock: + spin_unlock(&hnode->activelist_lock); + return; +} + int gm_add_pages(unsigned int hnid, struct list_head *pages) { struct hnode *hnode; @@ -206,6 +245,7 @@ int gm_add_pages(unsigned int hnid, struct list_head *pages) list_del(&gm_page->gm_page_list); hnode_freelist_add(hnode, gm_page); hnode_free_pages_inc(hnode); + gm_page_flags_clear(gm_page, GM_PAGE_PINNED); } return 0; @@ -368,6 +408,10 @@ static void gm_do_swap(struct hnode *hnode) spin_lock(&hnode->activelist_lock); list_for_each_entry_safe(gm_page, n, &hnode->activelist, gm_page_list) { + if (gm_page_pinned(gm_page)) { + gmem_err("%s: va %lx is pinned!", __func__, gm_page->va); + continue; + } /* Move gm_page to temporary list. */ get_gm_page(gm_page); gm_page_flags_set(gm_page, GM_PAGE_EVICTING); @@ -460,6 +504,10 @@ static struct gm_page *get_gm_page_from_freelist(struct hnode *hnode) gm_page = list_first_entry_or_null(&hnode->freelist, struct gm_page, gm_page_list); /* Delete from freelist. 
*/
 	if (gm_page) {
+		if (gm_page_pinned(gm_page)) {
+			gmem_err("%s: gm_page %lx from freelist has pinned flag, clear it!", __func__, (unsigned long)gm_page);
+			gm_page_flags_clear(gm_page, GM_PAGE_PINNED);
+		}
 		list_del_init(&gm_page->gm_page_list);
 		hnode_free_pages_dec(hnode);
 		get_gm_page(gm_page);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f4613cf7c6dc..59f546540fd0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1472,6 +1472,12 @@ static vm_fault_t __do_peer_shared_anonymous_page(struct vm_fault *vmf)
 	mutex_lock(&gm_mapping->lock);
 
+	if (gm_mapping_device(gm_mapping) && gm_page_pinned(gm_mapping->gm_page)) {
+		pr_err("page is pinned! addr is %lx\n", gm_mapping->gm_page->va);
+		ret = VM_FAULT_SIGBUS;
+		goto release;
+	}
+
 	if (gm_mapping_cpu(gm_mapping))
 		folio = page_folio(gm_mapping->page);
 	if (!folio) {
-- Gitee
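For readers following the series, the pinning protocol added above reduces to a per-gm_page flag that is set and cleared under the owning hnode's activelist lock and consulted by the eviction scan. The short standalone C model below is illustrative only, not kernel code: the pthread mutex stands in for hnode->activelist_lock, model_gm_page stands in for struct gm_page, and evict_scan() mirrors how gm_do_swap() skips pinned pages.

/*
 * Minimal userspace model of the GM_PAGE_PINNED protocol from patch 26.
 * Build with: cc -pthread pin_model.c
 */
#include <pthread.h>
#include <stdio.h>

#define GM_PAGE_EVICTING 0x1
#define GM_PAGE_PINNED   0x2

struct model_gm_page {
	unsigned long va;
	unsigned int flag;
};

static pthread_mutex_t activelist_lock = PTHREAD_MUTEX_INITIALIZER;

static void mark_pinned(struct model_gm_page *p)
{
	pthread_mutex_lock(&activelist_lock);
	/* A page that is already being evicted can no longer be pinned. */
	if (!(p->flag & GM_PAGE_EVICTING))
		p->flag |= GM_PAGE_PINNED;
	pthread_mutex_unlock(&activelist_lock);
}

static void mark_unpinned(struct model_gm_page *p)
{
	pthread_mutex_lock(&activelist_lock);
	p->flag &= ~GM_PAGE_PINNED;
	pthread_mutex_unlock(&activelist_lock);
}

/* Eviction scan: pinned pages stay resident, everything else is queued. */
static void evict_scan(struct model_gm_page *pages, int n)
{
	pthread_mutex_lock(&activelist_lock);
	for (int i = 0; i < n; i++) {
		if (pages[i].flag & GM_PAGE_PINNED) {
			printf("va %#lx is pinned, skipping\n", pages[i].va);
			continue;
		}
		pages[i].flag |= GM_PAGE_EVICTING;
		printf("va %#lx queued for eviction\n", pages[i].va);
	}
	pthread_mutex_unlock(&activelist_lock);
}

int main(void)
{
	struct model_gm_page pages[2] = { { 0x200000, 0 }, { 0x400000, 0 } };

	mark_pinned(&pages[0]);
	evict_scan(pages, 2);	/* only the second page is queued */
	mark_unpinned(&pages[0]);
	return 0;
}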
From 16da5061b50e20eb5fdb1423cd0d4dba76b4ab21 Mon Sep 17 00:00:00 2001
From: Super User
Date: Wed, 15 Oct 2025 20:20:02 +0800
Subject: [PATCH 27/27] gmem: Expand hmemcpy to support copies between gmem and non-gmem memory and handle the overlimit case

euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ICHFJN

---
 mm/gmem.c | 198 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 165 insertions(+), 33 deletions(-)

diff --git a/mm/gmem.c b/mm/gmem.c
index abb910d84abe..227717b2408e 100644
--- a/mm/gmem.c
+++ b/mm/gmem.c
@@ -780,6 +780,26 @@ static bool hnid_match_dest(int hnid, struct gm_mapping *dest)
 	return (hnid < 0) ? gm_mapping_cpu(dest) : gm_mapping_device(dest);
 }
 
+static void cpu_page_copy(struct page *dst_page, unsigned long dst_offset,
+		struct page *src_page, unsigned long src_offset, size_t size)
+{
+	unsigned long src, dst;
+
+	src = (unsigned long)page_address(src_page) + src_offset;
+	dst = (unsigned long)page_address(dst_page) + dst_offset;
+	if (!src || !dst) {
+		gmem_err("%s: src (%lx) or dst (%lx) is invalid!", __func__, src, dst);
+		return;
+	}
+	memcpy((void *)dst, (void *)src, size);
+}
+
+enum gmem_copy_dir {
+	COPY_GMEM_TO_NORM,
+	COPY_NORM_TO_GMEM,
+	COPY_GMEM_TO_GMEM,
+};
+
 static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest,
 		unsigned long src, size_t size)
 {
@@ -789,6 +809,9 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest,
 	struct gm_mapping *gm_mapping_dest, *gm_mapping_src;
 	struct gm_dev *dev = NULL;
 	struct gm_memcpy_t gmc = {0};
+	enum gmem_copy_dir dir;
+	struct page *trans_hpage;
+	void *trans_addr;
 
 	if (size == 0)
 		return;
@@ -802,24 +825,39 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest,
 		goto unlock_mm;
 	}
 
-	gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1));
-	gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1));
-
-	if (!gm_mapping_src) {
-		gmem_err("hmemcpy: gm_mapping_src is NULL");
+	if (vma_is_peer_shared(vma_src) && vma_is_peer_shared(vma_dest)) {
+		dir = COPY_GMEM_TO_GMEM;
+		gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1));
+		gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1));
+	} else if (vma_is_peer_shared(vma_src)) {
+		dir = COPY_GMEM_TO_NORM;
+		gm_mapping_src = vm_object_lookup(vma_src->vm_obj, src & ~(page_size - 1));
+		gm_mapping_dest = NULL;
+	} else if (vma_is_peer_shared(vma_dest)) {
+		dir = COPY_NORM_TO_GMEM;
+		gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, dest & ~(page_size - 1));
+		gm_mapping_src = NULL;
+	} else {
+		gmem_err("%s: src %lx and dest %lx both not gmem addr!", __func__, src, dest);
 		goto unlock_mm;
 	}
 
-	if (gm_mapping_nomap(gm_mapping_src)) {
-		gmem_err("hmemcpy: src address is not mapping to CPU or device");
+	trans_hpage = alloc_pages(GFP_TRANSHUGE, HPAGE_PMD_ORDER);
+	if (!trans_hpage) {
+		gmem_err("%s: alloc trans_hpage failed!", __func__);
 		goto unlock_mm;
 	}
+	trans_addr = page_to_virt(trans_hpage);
+
+	if (dir != COPY_NORM_TO_GMEM && (!gm_mapping_src || gm_mapping_nomap(gm_mapping_src))) {
+		gmem_err("%s: gm_mapping_src is NULL or still not mapped! addr is %lx", __func__, src);
+	}
 
 	if (hnid != -1) {
 		dev = get_gm_dev(hnid);
 		if (!dev) {
 			gmem_err("hmemcpy: hnode's dev is NULL");
-			goto unlock_mm;
+			goto free_trans_page;
 		}
 	}
@@ -827,60 +865,149 @@ static void do_hmemcpy(struct mm_struct *mm, int hnid, unsigned long dest,
 	if (!gm_mapping_dest || gm_mapping_nomap(gm_mapping_dest) || !hnid_match_dest(hnid, gm_mapping_dest)) {
 		if (hnid == -1) {
-			ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER |
-					FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL);
-			if (ret) {
-				gmem_err("%s: failed to execute host page fault, ret:%d",
-						__func__, ret);
-				goto unlock_mm;
+			if (gm_mapping_dest && gm_mapping_device(gm_mapping_dest) && gm_page_pinned(gm_mapping_dest->gm_page)) {
+				gmem_err("%s: dest %lx is pinned on device, skip handle_mm_fault", __func__, dest);
+			} else {
+				ret = handle_mm_fault(vma_dest, dest & ~(page_size - 1), FAULT_FLAG_USER |
+						FAULT_FLAG_INSTRUCTION | FAULT_FLAG_WRITE, NULL);
+				if (ret) {
+					gmem_err("%s: failed to execute host page fault, ret:%d",
+							__func__, ret);
+					goto free_trans_page;
+				}
 			}
 		} else {
 			ret = gm_dev_fault_locked(mm, dest & ~(page_size - 1), dev, MADV_WILLNEED);
 			if (ret != GM_RET_SUCCESS) {
 				gmem_err("%s: failed to excecute dev page fault.", __func__);
-				goto unlock_mm;
+				goto free_trans_page;
 			}
 		}
 	}
-	if (!gm_mapping_dest)
+	if (!gm_mapping_dest && dir != COPY_GMEM_TO_NORM)
 		gm_mapping_dest = vm_object_lookup(vma_dest->vm_obj, round_down(dest, page_size));
 	if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src)
 		mutex_lock(&gm_mapping_dest->lock);
-	mutex_lock(&gm_mapping_src->lock);
+	if (gm_mapping_src)
+		mutex_lock(&gm_mapping_src->lock);
 
 	// Use memcpy when there is no device address, otherwise use peer_memcpy
-	if (hnid == -1) {
+	if (dir == COPY_GMEM_TO_NORM) {
+		if (!gm_mapping_src) {
+			gmem_err("%s: do COPY_GMEM_TO_NORM but gm_mapping_src is NULL!", __func__);
+			goto unlock_gm_mapping;
+		}
 		if (gm_mapping_cpu(gm_mapping_src)) { // host to host
-			gmem_err("hmemcpy: host to host is unimplemented\n");
-			goto unlock_gm_mmaping;
-		} else { // device to host
+			cpu_page_copy(trans_hpage, (unsigned long)trans_addr & (page_size - 1),
+					gm_mapping_src->page, src & (page_size - 1), size);
+			goto copy_to_norm_dest;
+		} else if (gm_mapping_device(gm_mapping_src)) { // device to host
 			dev = gm_mapping_src->dev;
-			gmc.dest = phys_to_dma(dev->dma_dev,
-					page_to_phys(gm_mapping_dest->page) + (dest & (page_size - 1)));
+			gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) +
+					((unsigned long)trans_addr & (page_size - 1)));
 			gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1));
 			gmc.kind = GM_MEMCPY_D2H;
+		} else {
+			gmem_err("gm_mapping_src bad status, dir is COPY_GMEM_TO_NORM");
+			goto unlock_gm_mapping;
 		}
-	} else {
-		if (gm_mapping_cpu(gm_mapping_src)) { // host to device
-			gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr
-				+ (dest & (page_size - 1));
-			gmc.src = phys_to_dma(dev->dma_dev,
-					page_to_phys(gm_mapping_src->page) + (src & (page_size - 1)));
+	} else if (dir == COPY_NORM_TO_GMEM) {
+		if (!gm_mapping_dest) {
+			gmem_err("%s: do COPY_NORM_TO_GMEM but gm_mapping_dest is NULL!", __func__);
+			goto unlock_gm_mapping;
+		}
+		if (copy_from_user(trans_addr, (void __user *)src, size) > 0)
+			gmem_err("copy normal src %lx to trans failed", src);
+		if (gm_mapping_cpu(gm_mapping_dest)) { // host to host
+			cpu_page_copy(gm_mapping_dest->page, dest & (page_size - 1),
+					trans_hpage, (unsigned long)trans_addr & (page_size - 1), size);
+			goto unlock_gm_mapping;
+		} else if (gm_mapping_device(gm_mapping_dest)) {
+			if (!dev) {
+				gmem_err("%s: do COPY_NORM_TO_GMEM but dev is NULL, hnid is %d", __func__, hnid);
+				goto unlock_gm_mapping;
+			}
+			gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr
+				+ (dest & (page_size - 1));
+			gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) +
+					((unsigned long)trans_addr & (page_size - 1)));
 			gmc.kind = GM_MEMCPY_H2D;
 		} else { // device to device
-			gmem_err("hmemcpy: device to device is unimplemented\n");
+			gmem_err("gm_mapping_dest bad status, dir is COPY_NORM_TO_GMEM\n");
-			goto unlock_gm_mmaping;
+			goto unlock_gm_mapping;
 		}
+	} else if (dir == COPY_GMEM_TO_GMEM) {
+		if (gm_mapping_cpu(gm_mapping_src)) {
+			if (gm_mapping_cpu(gm_mapping_dest)) {
+				cpu_page_copy(gm_mapping_dest->page, dest & (page_size - 1),
+						gm_mapping_src->page, src & (page_size - 1), size);
+				goto unlock_gm_mapping;
+			} else if (gm_mapping_device(gm_mapping_dest)) {
+				dev = gm_mapping_dest->dev;
+				gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + (dest & (page_size - 1));
+				gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(gm_mapping_src->page) +
+						(src & (page_size - 1)));
+				gmc.kind = GM_MEMCPY_H2D;
+			} else {
+				gmem_err("gm_mapping_dest bad status, src is on host!");
+				goto unlock_gm_mapping;
+			}
+		} else if (gm_mapping_device(gm_mapping_src)) {
+			if (gm_mapping_cpu(gm_mapping_dest)) {
+				dev = gm_mapping_src->dev;
+				gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(gm_mapping_dest->page) +
+						(dest & (page_size - 1)));
+				gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1));
+				gmc.kind = GM_MEMCPY_D2H;
+			} else if (gm_mapping_device(gm_mapping_dest)) {
+				dev = gm_mapping_src->dev;
+				gmc.dest = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) +
+						((unsigned long)trans_addr & (page_size - 1)));
+				gmc.src = gm_mapping_src->gm_page->dev_dma_addr + (src & (page_size - 1));
+				gmc.kind = GM_MEMCPY_D2H;
+				gmc.mm = mm;
+				gmc.dev = dev;
+				gmc.size = size;
+				dev->mmu->peer_hmemcpy(&gmc);
+
+				dev = gm_mapping_dest->dev;
+				gmc.dest = gm_mapping_dest->gm_page->dev_dma_addr + (dest & (page_size - 1));
+				gmc.src = phys_to_dma(dev->dma_dev, page_to_phys(trans_hpage) +
+						((unsigned long)trans_addr & (page_size - 1)));
+				gmc.kind = GM_MEMCPY_H2D;
+				gmc.mm = mm;
+				gmc.dev = dev;
+				gmc.size = size;
+				dev->mmu->peer_hmemcpy(&gmc);
+
+				goto unlock_gm_mapping;
+			} else {
+				gmem_err("gm_mapping_dest bad status, src is on device!");
+				goto unlock_gm_mapping;
+			}
+		} else {
+			gmem_err("gm_mapping_src bad status, dir is COPY_GMEM_TO_GMEM");
+			goto unlock_gm_mapping;
+		}
 	}
 
 	gmc.mm = mm;
 	gmc.dev = dev;
 	gmc.size = size;
 	dev->mmu->peer_hmemcpy(&gmc);
 
+copy_to_norm_dest:
+	if (dir == COPY_GMEM_TO_NORM) {
+		if (copy_to_user((void __user *)dest, trans_addr, size) > 0)
+			gmem_err("copy trans to normal dest %lx failed!", dest);
+	}
+
-unlock_gm_mmaping:
+unlock_gm_mapping:
-	mutex_unlock(&gm_mapping_src->lock);
+	if (gm_mapping_src)
+		mutex_unlock(&gm_mapping_src->lock);
 	if (gm_mapping_dest && gm_mapping_dest != gm_mapping_src)
 		mutex_unlock(&gm_mapping_dest->lock);
+free_trans_page:
+	__free_pages(trans_hpage, HPAGE_PMD_ORDER);
 unlock_mm:
 	mmap_read_unlock(mm);
 }
@@ -948,17 +1075,22 @@ int hmemcpy(int hnid, unsigned long dest, unsigned long src, size_t size)
 	vma_src = find_vma(mm, src);
 	if ((ULONG_MAX - size < src) || !vma_src || vma_src->vm_start > src ||
-			!vma_is_peer_shared(vma_src) || vma_src->vm_end < (src + size)) {
+			vma_src->vm_end < (src + size)) {
 		gmem_err("failed to find peer_shared vma by invalid src or size\n");
 		goto unlock;
 	}
 
 	if ((ULONG_MAX - size < dest) || !vma_dest || vma_dest->vm_start > dest ||
-			!vma_is_peer_shared(vma_dest) || vma_dest->vm_end < (dest + size)) {
+			vma_dest->vm_end < (dest + size)) {
 		gmem_err("failed to find peer_shared vma by invalid dest or size\n");
 		goto unlock;
 	}
 
+	if (!vma_is_peer_shared(vma_src) && !vma_is_peer_shared(vma_dest)) {
+		mmap_read_unlock(mm);
+		return -EAGAIN;
+	}
+
 	if (!(vma_dest->vm_flags & VM_WRITE)) {
 		gmem_err("dest is not writable.\n");
 		goto unlock;
-- Gitee
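As a companion to the last patch, the sketch below models in plain userspace C how the expanded hmemcpy() path chooses a copy direction from the two VMAs and stages transfers through a bounce page. It is illustrative only and not kernel code: classify() stands in for the vma_is_peer_shared() checks, COPY_NORM_TO_NORM stands in for the -EAGAIN rejection, the 4 KB bounce buffer stands in for the 2 MB trans_hpage, and plain memcpy() replaces the peer_hmemcpy() DMA calls.

/*
 * Standalone model of the direction selection and bounce-buffer staging
 * used by the expanded do_hmemcpy(). Build with: cc hmemcpy_model.c
 */
#include <stdio.h>
#include <string.h>
#include <stdbool.h>

enum gmem_copy_dir {
	COPY_GMEM_TO_NORM,
	COPY_NORM_TO_GMEM,
	COPY_GMEM_TO_GMEM,
	COPY_NORM_TO_NORM,	/* rejected: hmemcpy() returns -EAGAIN */
};

static enum gmem_copy_dir classify(bool src_is_gmem, bool dst_is_gmem)
{
	if (src_is_gmem && dst_is_gmem)
		return COPY_GMEM_TO_GMEM;
	if (src_is_gmem)
		return COPY_GMEM_TO_NORM;
	if (dst_is_gmem)
		return COPY_NORM_TO_GMEM;
	return COPY_NORM_TO_NORM;
}

/* Device transfers are modeled as memcpy through a staging bounce buffer. */
static void staged_copy(void *dst, const void *src, size_t size,
			enum gmem_copy_dir dir)
{
	static char bounce[4096];	/* stands in for the 2 MB trans_hpage */

	if (size > sizeof(bounce))
		size = sizeof(bounce);

	switch (dir) {
	case COPY_GMEM_TO_NORM:		/* device -> bounce -> user buffer */
	case COPY_NORM_TO_GMEM:		/* user buffer -> bounce -> device */
		memcpy(bounce, src, size);
		memcpy(dst, bounce, size);
		break;
	case COPY_GMEM_TO_GMEM:		/* direct, or dev -> bounce -> dev when both sides are devices */
		memcpy(dst, src, size);
		break;
	case COPY_NORM_TO_NORM:
		fprintf(stderr, "both sides are normal memory, caller should fall back\n");
		break;
	}
}

int main(void)
{
	char a[32] = "gmem source data", b[32] = {0};

	staged_copy(b, a, sizeof(a), classify(true, false));
	printf("%s\n", b);
	return 0;
}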