diff --git a/Documentation/ABI/testing/configfs-tsm b/Documentation/ABI/testing/configfs-tsm new file mode 100644 index 0000000000000000000000000000000000000000..dd24202b5ba52ba9cc7034fca8897eee0be5a997 --- /dev/null +++ b/Documentation/ABI/testing/configfs-tsm @@ -0,0 +1,82 @@ +What: /sys/kernel/config/tsm/report/$name/inblob +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (WO) Up to 64 bytes of user specified binary data. For replay + protection this should include a nonce, but the kernel does not + place any restrictions on the content. + +What: /sys/kernel/config/tsm/report/$name/outblob +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) Binary attestation report generated from @inblob and other + options The format of the report is implementation specific + where the implementation is conveyed via the @provider + attribute. + +What: /sys/kernel/config/tsm/report/$name/auxblob +Date: October, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) Optional supplemental data that a TSM may emit, visibility + of this attribute depends on TSM, and may be empty if no + auxiliary data is available. + + When @provider is "sev_guest" this file contains the + "cert_table" from SEV-ES Guest-Hypervisor Communication Block + Standardization v2.03 Section 4.1.8.1 MSG_REPORT_REQ. + https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56421.pdf + +What: /sys/kernel/config/tsm/report/$name/provider +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) A name for the format-specification of @outblob like + "sev_guest" [1] or "tdx_guest" [2] in the near term, or a + common standard format in the future. + + [1]: SEV Secure Nested Paging Firmware ABI Specification + Revision 1.55 Table 22 + https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf + + [2]: IntelĀ® Trust Domain Extensions Data Center Attestation + Primitives : Quote Generation Library and Quote Verification + Library Revision 0.8 Appendix 4,5 + https://download.01.org/intel-sgx/latest/dcap-latest/linux/docs/Intel_TDX_DCAP_Quoting_Library_API.pdf + +What: /sys/kernel/config/tsm/report/$name/generation +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) The value in this attribute increments each time @inblob or + any option is written. Userspace can detect conflicts by + checking generation before writing to any attribute and making + sure the number of writes matches expectations after reading + @outblob, or it can prevent conflicts by creating a report + instance per requesting context. + +What: /sys/kernel/config/tsm/report/$name/privlevel +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (WO) Attribute is visible if a TSM implementation provider + supports the concept of attestation reports for TVMs running at + different privilege levels, like SEV-SNP "VMPL", specify the + privilege level via this attribute. The minimum acceptable + value is conveyed via @privlevel_floor and the maximum + acceptable value is TSM_PRIVLEVEL_MAX (3). + +What: /sys/kernel/config/tsm/report/$name/privlevel_floor +Date: September, 2023 +KernelVersion: v6.7 +Contact: linux-coco@lists.linux.dev +Description: + (RO) Indicates the minimum permissible value that can be written + to @privlevel. diff --git a/Documentation/arch/arm64/arm-cca.rst b/Documentation/arch/arm64/arm-cca.rst new file mode 100644 index 0000000000000000000000000000000000000000..c48b7d4ab6bde4833c427056cfcadb79f03b6c26 --- /dev/null +++ b/Documentation/arch/arm64/arm-cca.rst @@ -0,0 +1,69 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Arm Confidential Compute Architecture +===================================== + +Arm systems that support the Realm Management Extension (RME) contain +hardware to allow a VM guest to be run in a way which protects the code +and data of the guest from the hypervisor. It extends the older "two +world" model (Normal and Secure World) into four worlds: Normal, Secure, +Root and Realm. Linux can then also be run as a guest to a monitor +running in the Realm world. + +The monitor running in the Realm world is known as the Realm Management +Monitor (RMM) and implements the Realm Management Monitor +specification[1]. The monitor acts a bit like a hypervisor (e.g. it runs +in EL2 and manages the stage 2 page tables etc of the guests running in +Realm world), however much of the control is handled by a hypervisor +running in the Normal World. The Normal World hypervisor uses the Realm +Management Interface (RMI) defined by the RMM specification to request +the RMM to perform operations (e.g. mapping memory or executing a vCPU). + +The RMM defines an environment for guests where the address space (IPA) +is split into two. The lower half is protected - any memory that is +mapped in this half cannot be seen by the Normal World and the RMM +restricts what operations the Normal World can perform on this memory +(e.g. the Normal World cannot replace pages in this region without the +guest's cooperation). The upper half is shared, the Normal World is free +to make changes to the pages in this region, and is able to emulate MMIO +devices in this region too. + +A guest running in a Realm may also communicate with the RMM using the +Realm Services Interface (RSI) to request changes in its environment or +to perform attestation about its environment. In particular it may +request that areas of the protected address space are transitioned +between 'RAM' and 'EMPTY' (in either direction). This allows a Realm +guest to give up memory to be returned to the Normal World, or to +request new memory from the Normal World. Without an explicit request +from the Realm guest the RMM will otherwise prevent the Normal World +from making these changes. + +Linux as a Realm Guest +---------------------- + +To run Linux as a guest within a Realm, the following must be provided +either by the VMM or by a `boot loader` run in the Realm before Linux: + + * All protected RAM described to Linux (by DT or ACPI) must be marked + RIPAS RAM before handing control over to Linux. + + * MMIO devices must be either unprotected (e.g. emulated by the Normal + World) or marked RIPAS DEV. + + * MMIO devices emulated by the Normal World and used very early in boot + (specifically earlycon) must be specified in the upper half of IPA. + For earlycon this can be done by specifying the address on the + command line, e.g. with an IPA size of 33 bits and the base address + of the emulated UART at 0x1000000: ``earlycon=uart,mmio,0x101000000`` + + * Linux will use bounce buffers for communicating with unprotected + devices. It will transition some protected memory to RIPAS EMPTY and + expect to be able to access unprotected pages at the same IPA address + but with the highest valid IPA bit set. The expectation is that the + VMM will remove the physical pages from the protected mapping and + provide those pages as unprotected pages. + +References +---------- +[1] https://developer.arm.com/documentation/den0137/ diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f156d0be28a73c6036073494f3961e0..30164fb24a24641b41be4d8013e825583011cea0 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -41,6 +41,9 @@ to automatically locate and size all RAM, or it may use knowledge of the RAM in the machine, or any other method the boot loader designer sees fit.) +For Arm Confidential Compute Realms this includes ensuring that all +protected RAM has a Realm IPA state (RIPAS) of "RAM". + 2. Setup the device tree ------------------------- diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index d08e924204bf15a6528e4b20ff046e74e11c04aa..e352f4e6eb9675d6862310ac26588340ba059536 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -10,6 +10,7 @@ ARM64 Architecture acpi_object_usage amu arm-acpi + arm-cca asymmetric-32bit booting cpu-feature-registers diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d258ed3baed8e62a09aa8005569f8d08c3cdb89c..22c664fedbd60627742ae869e48013f72222c487 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -151,8 +151,20 @@ In order to create user controlled virtual machines on S390, check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as privileged user (CAP_SYS_ADMIN). -On arm64, the physical address size for a VM (IPA Size limit) is limited -to 40bits by default. The limit can be configured if the host supports the +On arm64, the machine type identifier is used to encode a type and the +physical address size for the VM. The lower byte (bits[7-0]) encode the +address size and the upper bits[11-8] encode a machine type. The machine +types that might be available are: + + ====================== ============================================ + KVM_VM_TYPE_ARM_NORMAL A standard VM + KVM_VM_TYPE_ARM_REALM A "Realm" VM using the Arm Confidential + Compute extensions, the VM's memory is + protected from the host. + ====================== ============================================ + +The physical address size for a VM (IPA Size limit) is limited to 40bits +by default. The limit can be configured if the host supports the extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type identifier, where IPA_Bits is the maximum width of any physical @@ -1271,6 +1283,8 @@ User space may need to inject several types of events to the guest. Set the pending SError exception state for this VCPU. It is not possible to 'cancel' an Serror that has been made pending. +User space cannot inject SErrors into Realms. + If the guest performed an access to I/O memory which could not be handled by userspace, for example because of missing instruction syndrome decode information or because there is no device mapped at the accessed IPA, then @@ -3520,6 +3534,11 @@ Possible features: - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can no longer be written using KVM_SET_ONE_REG. + - KVM_ARM_VCPU_REC: Allocate a REC (Realm Execution Context) for this + VCPU. This must be specified on all VCPUs created in a Realm VM. + Depends on KVM_CAP_ARM_RME. + Requires KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_REC). + 4.83 KVM_ARM_PREFERRED_TARGET ----------------------------- @@ -5055,6 +5074,7 @@ Recognised values for feature: ===== =========================================== arm64 KVM_ARM_VCPU_SVE (requires KVM_CAP_ARM_SVE) + arm64 KVM_ARM_VCPU_REC (requires KVM_CAP_ARM_RME) ===== =========================================== Finalizes the configuration of the specified vcpu feature. @@ -6336,6 +6356,30 @@ to the byte array. __u64 flags; } hypercall; +4.144 KVM_ARM_VCPU_RMM_PSCI_COMPLETE +------------------------------------ + +:Capability: KVM_CAP_ARM_RME +:Architectures: arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_arm_rmm_psci_complete (in) +:Returns: 0 if successful, < 0 on error + +:: + + struct kvm_arm_rmm_psci_complete { + __u64 target_mpidr; + __u32 psci_status; + __u32 padding[3]; + }; + +Where PSCI functions are handled by user space, the RMM needs to be informed of +the target of the operation using `target_mpidr`, along with the status +(`psci_status`). The RMM v1.0 specification defines two functions that require +this call: PSCI_CPU_ON and PSCI_AFFINITY_INFO. + +If the kernel is handling PSCI then this is done automatically and the VMM +doesn't need to call this ioctl. It is strongly recommended that userspace use ``KVM_EXIT_IO`` (x86) or ``KVM_EXIT_MMIO`` (all except s390) to implement functionality that @@ -7833,6 +7877,46 @@ This capability is aimed to mitigate the threat that malicious VMs can cause CPU stuck (due to event windows don't open up) and make the CPU unavailable to host or other VMs. +7.38 KVM_CAP_ARM_RME +-------------------- + +:Architectures: arm64 +:Target: VM +:Parameters: args[0] provides an action, args[1] points to a structure in + memory for some actions. +:Returns: 0 on success, negative value on error + +Used to configure and set up the memory for a Realm. The available actions are: + +================================= ============================================= + KVM_CAP_ARM_RME_CONFIG_REALM Takes struct arm_rme_config as args[1] and + configures realm parameters prior to it being + created. + + Options are ARM_RME_CONFIG_RPV to set the + "Realm Personalization Value" and + ARM_RME_CONFIG_HASH_ALGO to set the hash + algorithm. + + KVM_CAP_ARM_RME_CREATE_REALM Request the RMM create the realm. The realm's + configuration parameters must be set first. + + KVM_CAP_ARM_RME_INIT_RIPAS_REALM Takes struct arm_rme_init_ripas as args[1] + and sets the RIPAS (Realm IPA State) to + RIPAS_RAM of a specified area of the realm's + IPA. + + KVM_CAP_ARM_RME_POPULATE_REALM Takes struct arm_rme_init_ripas as args[1] + and populates a region of protected address + space by copying the data from the shared + alias. + + KVM_CAP_ARM_RME_ACTIVATE_REALM Request the RMM activate the realm. No + further changes can be made to the realm's + configuration, and VCPUs are not permitted to + enter the realm until it has been activated. +================================= ============================================= + 8. Other capabilities. ====================== @@ -8155,6 +8239,9 @@ is supported, than the other should as well and vice versa. For arm64 see Documentation/virt/kvm/devices/vcpu.rst "KVM_ARM_VCPU_PVTIME_CTRL". For x86 see Documentation/virt/kvm/x86/msr.rst "MSR_KVM_STEAL_TIME". +Note that steal time accounting is not available when a guest is running +within a Arm CCA realm (machine type KVM_VM_TYPE_ARM_REALM). + 8.25 KVM_CAP_S390_DIAG318 ------------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 082aac0730700e06852964fb9a731c6df8ee487d..1af3bc0d637169e0e092d7e3e5239d7bdcc6e765 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3015,6 +3015,8 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git F: Documentation/arch/arm64/ F: arch/arm64/ +F: drivers/virt/coco/arm-cca-guest/ +F: drivers/virt/coco/pkvm-guest/ F: tools/testing/selftests/arm64/ X: arch/arm64/boot/dts/ @@ -21970,6 +21972,14 @@ W: https://github.com/srcres258/linux-doc T: git git://github.com/srcres258/linux-doc.git doc-zh-tw F: Documentation/translations/zh_TW/ +TRUSTED SECURITY MODULE (TSM) ATTESTATION REPORTS +M: Dan Williams +L: linux-coco@lists.linux.dev +S: Maintained +F: Documentation/ABI/testing/configfs-tsm +F: drivers/virt/coco/tsm.c +F: include/linux/tsm.h + TTY LAYER AND SERIAL DRIVERS M: Greg Kroah-Hartman M: Jiri Slaby diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aff99b41981d58613cdc3faed9029aa130bb5683..7452254e7a5c24e03543a41a25c82b85f80c3abb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -20,6 +20,7 @@ config ARM64 select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CC_PLATFORM select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -32,6 +33,7 @@ config ARM64 select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP @@ -39,6 +41,8 @@ config ARM64 select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_STACKWALK select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config index fb4542b428f68e59803744779b30ab96cb99fa93..d6dceaabbf74e75ebae91e4a3ce96ccb5917d60c 100644 --- a/arch/arm64/configs/tencent.config +++ b/arch/arm64/configs/tencent.config @@ -1417,6 +1417,8 @@ CONFIG_MLX5_VFIO_PCI=m CONFIG_VFIO_PLATFORM=m CONFIG_VFIO_FSL_MC=m CONFIG_VIRT_DRIVERS=y +CONFIG_TSM_REPORTS=m +CONFIG_ARM_CCA_GUEST=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_VDPA=m CONFIG_VIRTIO_BALLOON=m diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 3b694511b98f833722a089e5e34d5cb0c4b273db..e9d6bcfddf35c368e015de3eb9689c264b24fed4 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * Generic IO read/write. These perform native-endian accesses. @@ -139,6 +140,10 @@ extern void __memset_io(volatile void __iomem *, int, size_t); * I/O memory mapping functions. */ +typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, + pgprot_t *prot); +int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); + #define ioremap_prot ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE @@ -182,4 +187,11 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, unsigned long flags); #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap +static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) +{ + if (unlikely(is_realm_world())) + return arm64_rsi_is_protected(phys_addr, size); + return false; +} + #endif /* __ASM_IO_H */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 3d6725ff0bf6d24577c84c6814fcc814b0681fb8..4cab0024e260357117faf1eb507f7c8b5f16b51d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -615,4 +615,44 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) kvm_write_cptr_el2(val); } + +static inline bool kvm_is_realm(struct kvm *kvm) +{ + if (static_branch_unlikely(&kvm_rme_is_available) && kvm) + return kvm->arch.is_realm; + return false; +} + +static inline enum realm_state kvm_realm_state(struct kvm *kvm) +{ + return READ_ONCE(kvm->arch.realm.state); +} + +static inline bool kvm_realm_is_created(struct kvm *kvm) +{ + return kvm_is_realm(kvm) && kvm_realm_state(kvm) != REALM_STATE_NONE; +} + +static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t ipa) +{ + if (kvm_is_realm(kvm)) { + struct realm *realm = &kvm->arch.realm; + + return ipa & ~BIT(realm->ia_bits - 1); + } + return ipa; +} + +static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu) +{ + if (static_branch_unlikely(&kvm_rme_is_available)) + return vcpu_has_feature(vcpu, KVM_ARM_VCPU_REC); + return false; +} + +static inline bool kvm_arm_rec_finalized(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.rec.mpidr != INVALID_HWID; +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index df73e452d2cc0e5fd815d398a932763a5aa7607a..9afcb17e1d19e84664eba16cbaf6c21315fc5b3b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include #include +#include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -38,7 +39,7 @@ #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS -#define KVM_VCPU_MAX_FEATURES 7 +#define KVM_VCPU_MAX_FEATURES 10 #define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1) #define KVM_REQ_SLEEP \ @@ -72,9 +73,9 @@ enum kvm_mode kvm_get_mode(void); static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; #endif -extern unsigned int __ro_after_init kvm_sve_max_vl; extern unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void); +unsigned int kvm_sve_get_max_vl(struct kvm *kvm); u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -256,6 +257,9 @@ struct kvm_arch { cpumask_var_t supported_cpus; + /* PMCR_EL0.N value for the guest */ + u8 pmcr_n; + /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; struct maple_tree smccc_filter; @@ -278,6 +282,9 @@ struct kvm_arch { * the associated pKVM instance in the hypervisor. */ struct kvm_protected_vm pkvm; + + bool is_realm; + struct realm realm; }; struct kvm_vcpu_fault_info { @@ -588,6 +595,9 @@ struct kvm_vcpu_arch { /* Per-vcpu CCSIDR override or NULL */ u32 *ccsidr; + + /* Realm meta data */ + struct realm_rec rec; }; /* diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index d3e354bb8351d7aa9762155e18cb49cffee938e0..4153b77d8aaee0fe6eeb370f5c3e10f8be69b816 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -44,6 +44,15 @@ typedef u64 kvm_pte_t; #define KVM_PHYS_INVALID (-1ULL) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) + +/* + * Used to indicate a pte for which a 'break-before-make' sequence is in + * progress. + */ +#define KVM_INVALID_PTE_LOCKED BIT(10) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h new file mode 100644 index 0000000000000000000000000000000000000000..2761e7eb5f3da56e8e5e5399792c4379e6495b91 --- /dev/null +++ b/arch/arm64/include/asm/kvm_rme.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_KVM_RME_H +#define __ASM_KVM_RME_H + +#include +#include + +/** + * enum realm_state - State of a Realm + */ +enum realm_state { + /** + * @REALM_STATE_NONE: + * Realm has not yet been created. rmi_realm_create() may be + * called to create the realm. + */ + REALM_STATE_NONE, + /** + * @REALM_STATE_NEW: + * Realm is under construction, not eligible for execution. Pages + * may be populated with rmi_data_create(). + */ + REALM_STATE_NEW, + /** + * @REALM_STATE_ACTIVE: + * Realm has been created and is eligible for execution with + * rmi_rec_enter(). Pages may no longer be populated with + * rmi_data_create(). + */ + REALM_STATE_ACTIVE, + /** + * @REALM_STATE_DYING: + * Realm is in the process of being destroyed or has already been + * destroyed. + */ + REALM_STATE_DYING, + /** + * @REALM_STATE_DEAD: + * Realm has been destroyed. + */ + REALM_STATE_DEAD +}; + +/** + * struct realm - Additional per VM data for a Realm + * + * @state: The lifetime state machine for the realm + * @rd: Kernel mapping of the Realm Descriptor (RD) + * @params: Parameters for the RMI_REALM_CREATE command + * @num_aux: The number of auxiliary pages required by the RMM + * @vmid: VMID to be used by the RMM for the realm + * @ia_bits: Number of valid Input Address bits in the IPA + */ +struct realm { + enum realm_state state; + + void *rd; + struct realm_params *params; + + unsigned long num_aux; + unsigned int vmid; + unsigned int ia_bits; +}; + +/** + * struct realm_rec - Additional per VCPU data for a Realm + * + * @mpidr: MPIDR (Multiprocessor Affinity Register) value to identify this VCPU + * @rec_page: Kernel VA of the RMM's private page for this REC + * @aux_pages: Additional pages private to the RMM for this REC + * @run: Kernel VA of the RmiRecRun structure shared with the RMM + */ +struct realm_rec { + unsigned long mpidr; + void *rec_page; + /* + * REC_PARAMS_AUX_GRANULES is the maximum number of granules that the + * RMM can require. By using that to size the array we know that it + * will be big enough as the page size is always at least as large as + * the granule size. In the case of a larger page size than 4k (or an + * RMM which requires fewer auxiliary granules), the array will be + * bigger than needed however the extra memory required is small and + * this keeps the code cleaner. + */ + struct page *aux_pages[REC_PARAMS_AUX_GRANULES]; + struct rec_run *run; +}; + +void kvm_init_rme(void); +u32 kvm_realm_ipa_limit(void); +u32 kvm_realm_vgic_nr_lr(void); +u8 kvm_realm_max_pmu_counters(void); +unsigned int kvm_realm_sve_max_vl(void); + +u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); + +bool kvm_rme_supports_sve(void); + +int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); +int kvm_init_realm_vm(struct kvm *kvm); +void kvm_destroy_realm(struct kvm *kvm); +void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits); +int kvm_create_rec(struct kvm_vcpu *vcpu); +void kvm_destroy_rec(struct kvm_vcpu *vcpu); + +int kvm_rec_enter(struct kvm_vcpu *vcpu); +int kvm_rec_pre_enter(struct kvm_vcpu *vcpu); +int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_status); + +void kvm_realm_unmap_range(struct kvm *kvm, + unsigned long ipa, + unsigned long size, + bool unmap_private); +int realm_map_protected(struct realm *realm, + unsigned long base_ipa, + kvm_pfn_t pfn, + unsigned long size, + struct kvm_mmu_memory_cache *memcache); +int realm_map_non_secure(struct realm *realm, + unsigned long ipa, + kvm_pfn_t pfn, + unsigned long size, + struct kvm_mmu_memory_cache *memcache); +int realm_psci_complete(struct kvm_vcpu *source, + struct kvm_vcpu *target, + unsigned long status); + +static inline bool kvm_realm_is_private_address(struct realm *realm, + unsigned long addr) +{ + return !(addr & BIT(realm->ia_bits - 1)); +} + +#endif /* __ASM_KVM_RME_H */ diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h new file mode 100644 index 0000000000000000000000000000000000000000..a2a1eeb36d4b5077e56fdd570739565b8ede2e3c --- /dev/null +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_MEM_ENCRYPT_H +#define __ASM_MEM_ENCRYPT_H + +#include + +struct arm64_mem_crypt_ops { + int (*encrypt)(unsigned long addr, int numpages); + int (*decrypt)(unsigned long addr, int numpages); +}; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +int realm_register_memory_enc_ops(void); + +static inline bool force_dma_unencrypted(struct device *dev) +{ + return is_realm_world(); +} + +/* + * For Arm CCA guests, canonical addresses are "encrypted", so no changes + * required for dma_addr_encrypted(). + * The unencrypted DMA buffers must be accessed via the unprotected IPA, + * "top IPA bit" set. + */ +#define dma_addr_unencrypted(x) ((x) | PROT_NS_SHARED) + +/* Clear the "top" IPA bit while converting back */ +#define dma_addr_canonical(x) ((x) & ~PROT_NS_SHARED) + +#endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index eed814b00a389755e050afa4c1d6eae908a42d4d..b8646e82db8777171a2d890886e3e25ebc45fc6f 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -65,8 +65,12 @@ #include #include +#include extern bool arm64_use_ng_mappings; +extern unsigned long prot_ns_shared; + +#define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 34cf8dcfa0b09d1b2174647808b113bce209fbf2..f6575f6f8c2d7302f8476058669b2993c8e7e92b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -552,6 +552,11 @@ static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, #define pgprot_nx(prot) \ __pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN) +#define pgprot_decrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED) +#define pgprot_encrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, 0) + /* * Mark the prot value as uncacheable and unbufferable. */ diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h new file mode 100644 index 0000000000000000000000000000000000000000..27cd2751f3bfa4c599c0498301ec3c8f611cdf1b --- /dev/null +++ b/arch/arm64/include/asm/rmi_cmds.h @@ -0,0 +1,508 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_RMI_CMDS_H +#define __ASM_RMI_CMDS_H + +#include + +#include + +struct rtt_entry { + unsigned long walk_level; + unsigned long desc; + int state; + int ripas; +}; + +/** + * rmi_data_create() - Create a data granule + * @rd: PA of the RD + * @data: PA of the target granule + * @ipa: IPA at which the granule will be mapped in the guest + * @src: PA of the source granule + * @flags: RMI_MEASURE_CONTENT if the contents should be measured + * + * Create a new data granule, copying contents from a non-secure granule. + * + * Return: RMI return code + */ +static inline int rmi_data_create(unsigned long rd, unsigned long data, + unsigned long ipa, unsigned long src, + unsigned long flags) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_DATA_CREATE, rd, data, ipa, src, + flags, &res); + + return res.a0; +} + +/** + * rmi_data_create_unknown() - Create a data granule with unknown contents + * @rd: PA of the RD + * @data: PA of the target granule + * @ipa: IPA at which the granule will be mapped in the guest + * + * Return: RMI return code + */ +static inline int rmi_data_create_unknown(unsigned long rd, + unsigned long data, + unsigned long ipa) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_DATA_CREATE_UNKNOWN, rd, data, ipa, &res); + + return res.a0; +} + +/** + * rmi_data_destroy() - Destroy a data granule + * @rd: PA of the RD + * @ipa: IPA at which the granule is mapped in the guest + * @data_out: PA of the granule which was destroyed + * @top_out: Top IPA of non-live RTT entries + * + * Unmap a protected IPA from stage 2, transitioning it to DESTROYED. + * The IPA cannot be used by the guest unless it is transitioned to RAM again + * by the realm guest. + * + * Return: RMI return code + */ +static inline int rmi_data_destroy(unsigned long rd, unsigned long ipa, + unsigned long *data_out, + unsigned long *top_out) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_DATA_DESTROY, rd, ipa, &res); + + if (data_out) + *data_out = res.a1; + if (top_out) + *top_out = res.a2; + + return res.a0; +} + +/** + * rmi_features() - Read feature register + * @index: Feature register index + * @out: Feature register value is written to this pointer + * + * Return: RMI return code + */ +static inline int rmi_features(unsigned long index, unsigned long *out) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_FEATURES, index, &res); + + if (out) + *out = res.a1; + return res.a0; +} + +/** + * rmi_granule_delegate() - Delegate a granule + * @phys: PA of the granule + * + * Delegate a granule for use by the realm world. + * + * Return: RMI return code + */ +static inline int rmi_granule_delegate(unsigned long phys) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_GRANULE_DELEGATE, phys, &res); + + return res.a0; +} + +/** + * rmi_granule_undelegate() - Undelegate a granule + * @phys: PA of the granule + * + * Undelegate a granule to allow use by the normal world. Will fail if the + * granule is in use. + * + * Return: RMI return code + */ +static inline int rmi_granule_undelegate(unsigned long phys) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_GRANULE_UNDELEGATE, phys, &res); + + return res.a0; +} + +/** + * rmi_psci_complete() - Complete pending PSCI command + * @calling_rec: PA of the calling REC + * @target_rec: PA of the target REC + * @status: Status of the PSCI request + * + * Completes a pending PSCI command which was called with an MPIDR argument, by + * providing the corresponding REC. + * + * Return: RMI return code + */ +static inline int rmi_psci_complete(unsigned long calling_rec, + unsigned long target_rec, + unsigned long status) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_PSCI_COMPLETE, calling_rec, target_rec, + status, &res); + + return res.a0; +} + +/** + * rmi_realm_activate() - Active a realm + * @rd: PA of the RD + * + * Mark a realm as Active signalling that creation is complete and allowing + * execution of the realm. + * + * Return: RMI return code + */ +static inline int rmi_realm_activate(unsigned long rd) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REALM_ACTIVATE, rd, &res); + + return res.a0; +} + +/** + * rmi_realm_create() - Create a realm + * @rd: PA of the RD + * @params: PA of realm parameters + * + * Create a new realm using the given parameters. + * + * Return: RMI return code + */ +static inline int rmi_realm_create(unsigned long rd, unsigned long params) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REALM_CREATE, rd, params, &res); + + return res.a0; +} + +/** + * rmi_realm_destroy() - Destroy a realm + * @rd: PA of the RD + * + * Destroys a realm, all objects belonging to the realm must be destroyed first. + * + * Return: RMI return code + */ +static inline int rmi_realm_destroy(unsigned long rd) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REALM_DESTROY, rd, &res); + + return res.a0; +} + +/** + * rmi_rec_aux_count() - Get number of auxiliary granules required + * @rd: PA of the RD + * @aux_count: Number of granules written to this pointer + * + * A REC may require extra auxiliary granules to be delegated for the RMM to + * store metadata (not visible to the normal world) in. This function provides + * the number of granules that are required. + * + * Return: RMI return code + */ +static inline int rmi_rec_aux_count(unsigned long rd, unsigned long *aux_count) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REC_AUX_COUNT, rd, &res); + + if (aux_count) + *aux_count = res.a1; + return res.a0; +} + +/** + * rmi_rec_create() - Create a REC + * @rd: PA of the RD + * @rec: PA of the target REC + * @params: PA of REC parameters + * + * Create a REC using the parameters specified in the struct rec_params pointed + * to by @params. + * + * Return: RMI return code + */ +static inline int rmi_rec_create(unsigned long rd, unsigned long rec, + unsigned long params) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REC_CREATE, rd, rec, params, &res); + + return res.a0; +} + +/** + * rmi_rec_destroy() - Destroy a REC + * @rec: PA of the target REC + * + * Destroys a REC. The REC must not be running. + * + * Return: RMI return code + */ +static inline int rmi_rec_destroy(unsigned long rec) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REC_DESTROY, rec, &res); + + return res.a0; +} + +/** + * rmi_rec_enter() - Enter a REC + * @rec: PA of the target REC + * @run_ptr: PA of RecRun structure + * + * Starts (or continues) execution within a REC. + * + * Return: RMI return code + */ +static inline int rmi_rec_enter(unsigned long rec, unsigned long run_ptr) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_REC_ENTER, rec, run_ptr, &res); + + return res.a0; +} + +/** + * rmi_rtt_create() - Creates an RTT + * @rd: PA of the RD + * @rtt: PA of the target RTT + * @ipa: Base of the IPA range described by the RTT + * @level: Depth of the RTT within the tree + * + * Creates an RTT (Realm Translation Table) at the specified level for the + * translation of the specified address within the realm. + * + * Return: RMI return code + */ +static inline int rmi_rtt_create(unsigned long rd, unsigned long rtt, + unsigned long ipa, long level) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_CREATE, rd, rtt, ipa, level, &res); + + return res.a0; +} + +/** + * rmi_rtt_destroy() - Destroy an RTT + * @rd: PA of the RD + * @ipa: Base of the IPA range described by the RTT + * @level: Depth of the RTT within the tree + * @out_rtt: Pointer to write the PA of the RTT which was destroyed + * @out_top: Pointer to write the top IPA of non-live RTT entries + * + * Destroys an RTT. The RTT must be non-live, i.e. none of the entries in the + * table are in ASSIGNED or TABLE state. + * + * Return: RMI return code. + */ +static inline int rmi_rtt_destroy(unsigned long rd, + unsigned long ipa, + long level, + unsigned long *out_rtt, + unsigned long *out_top) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_DESTROY, rd, ipa, level, &res); + + if (out_rtt) + *out_rtt = res.a1; + if (out_top) + *out_top = res.a2; + + return res.a0; +} + +/** + * rmi_rtt_fold() - Fold an RTT + * @rd: PA of the RD + * @ipa: Base of the IPA range described by the RTT + * @level: Depth of the RTT within the tree + * @out_rtt: Pointer to write the PA of the RTT which was destroyed + * + * Folds an RTT. If all entries with the RTT are 'homogeneous' the RTT can be + * folded into the parent and the RTT destroyed. + * + * Return: RMI return code + */ +static inline int rmi_rtt_fold(unsigned long rd, unsigned long ipa, + long level, unsigned long *out_rtt) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_FOLD, rd, ipa, level, &res); + + if (out_rtt) + *out_rtt = res.a1; + + return res.a0; +} + +/** + * rmi_rtt_init_ripas() - Set RIPAS for new realm + * @rd: PA of the RD + * @base: Base of target IPA region + * @top: Top of target IPA region + * @out_top: Top IPA of range whose RIPAS was modified + * + * Sets the RIPAS of a target IPA range to RAM, for a realm in the NEW state. + * + * Return: RMI return code + */ +static inline int rmi_rtt_init_ripas(unsigned long rd, unsigned long base, + unsigned long top, unsigned long *out_top) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_INIT_RIPAS, rd, base, top, &res); + + if (out_top) + *out_top = res.a1; + + return res.a0; +} + +/** + * rmi_rtt_map_unprotected() - Map NS granules into a realm + * @rd: PA of the RD + * @ipa: Base IPA of the mapping + * @level: Depth within the RTT tree + * @desc: RTTE descriptor + * + * Create a mapping from an Unprotected IPA to a Non-secure PA. + * + * Return: RMI return code + */ +static inline int rmi_rtt_map_unprotected(unsigned long rd, + unsigned long ipa, + long level, + unsigned long desc) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_MAP_UNPROTECTED, rd, ipa, level, + desc, &res); + + return res.a0; +} + +/** + * rmi_rtt_read_entry() - Read an RTTE + * @rd: PA of the RD + * @ipa: IPA for which to read the RTTE + * @level: RTT level at which to read the RTTE + * @rtt: Output structure describing the RTTE + * + * Reads a RTTE (Realm Translation Table Entry). + * + * Return: RMI return code + */ +static inline int rmi_rtt_read_entry(unsigned long rd, unsigned long ipa, + long level, struct rtt_entry *rtt) +{ + struct arm_smccc_1_2_regs regs = { + SMC_RMI_RTT_READ_ENTRY, + rd, ipa, level + }; + + arm_smccc_1_2_smc(®s, ®s); + + rtt->walk_level = regs.a1; + rtt->state = regs.a2 & 0xFF; + rtt->desc = regs.a3; + rtt->ripas = regs.a4 & 0xFF; + + return regs.a0; +} + +/** + * rmi_rtt_set_ripas() - Set RIPAS for an running realm + * @rd: PA of the RD + * @rec: PA of the REC making the request + * @base: Base of target IPA region + * @top: Top of target IPA region + * @out_top: Pointer to write top IPA of range whose RIPAS was modified + * + * Completes a request made by the realm to change the RIPAS of a target IPA + * range. + * + * Return: RMI return code + */ +static inline int rmi_rtt_set_ripas(unsigned long rd, unsigned long rec, + unsigned long base, unsigned long top, + unsigned long *out_top) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_SET_RIPAS, rd, rec, base, top, &res); + + if (out_top) + *out_top = res.a1; + + return res.a0; +} + +/** + * rmi_rtt_unmap_unprotected() - Remove a NS mapping + * @rd: PA of the RD + * @ipa: Base IPA of the mapping + * @level: Depth within the RTT tree + * @out_top: Pointer to write top IPA of non-live RTT entries + * + * Removes a mapping at an Unprotected IPA. + * + * Return: RMI return code + */ +static inline int rmi_rtt_unmap_unprotected(unsigned long rd, + unsigned long ipa, + long level, + unsigned long *out_top) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RMI_RTT_UNMAP_UNPROTECTED, rd, ipa, + level, &res); + + if (out_top) + *out_top = res.a1; + + return res.a0; +} + +#endif /* __ASM_RMI_CMDS_H */ diff --git a/arch/arm64/include/asm/rmi_smc.h b/arch/arm64/include/asm/rmi_smc.h new file mode 100644 index 0000000000000000000000000000000000000000..7a93a3e0ac6eb981c95468fd616d8c62a220f634 --- /dev/null +++ b/arch/arm64/include/asm/rmi_smc.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023-2024 ARM Ltd. + * + * The values and structures in this file are from the Realm Management Monitor + * specification (DEN0137) version 1.0-rel0: + * https://developer.arm.com/documentation/den0137/1-0rel0/ + */ + +#ifndef __ASM_RMI_SMC_H +#define __ASM_RMI_SMC_H + +#include + +#define SMC_RMI_CALL(func) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD, \ + (func)) + +#define SMC_RMI_VERSION SMC_RMI_CALL(0x0150) +#define SMC_RMI_GRANULE_DELEGATE SMC_RMI_CALL(0x0151) +#define SMC_RMI_GRANULE_UNDELEGATE SMC_RMI_CALL(0x0152) +#define SMC_RMI_DATA_CREATE SMC_RMI_CALL(0x0153) +#define SMC_RMI_DATA_CREATE_UNKNOWN SMC_RMI_CALL(0x0154) +#define SMC_RMI_DATA_DESTROY SMC_RMI_CALL(0x0155) + +#define SMC_RMI_REALM_ACTIVATE SMC_RMI_CALL(0x0157) +#define SMC_RMI_REALM_CREATE SMC_RMI_CALL(0x0158) +#define SMC_RMI_REALM_DESTROY SMC_RMI_CALL(0x0159) +#define SMC_RMI_REC_CREATE SMC_RMI_CALL(0x015a) +#define SMC_RMI_REC_DESTROY SMC_RMI_CALL(0x015b) +#define SMC_RMI_REC_ENTER SMC_RMI_CALL(0x015c) +#define SMC_RMI_RTT_CREATE SMC_RMI_CALL(0x015d) +#define SMC_RMI_RTT_DESTROY SMC_RMI_CALL(0x015e) +#define SMC_RMI_RTT_MAP_UNPROTECTED SMC_RMI_CALL(0x015f) + +#define SMC_RMI_RTT_READ_ENTRY SMC_RMI_CALL(0x0161) +#define SMC_RMI_RTT_UNMAP_UNPROTECTED SMC_RMI_CALL(0x0162) + +#define SMC_RMI_PSCI_COMPLETE SMC_RMI_CALL(0x0164) +#define SMC_RMI_FEATURES SMC_RMI_CALL(0x0165) +#define SMC_RMI_RTT_FOLD SMC_RMI_CALL(0x0166) +#define SMC_RMI_REC_AUX_COUNT SMC_RMI_CALL(0x0167) +#define SMC_RMI_RTT_INIT_RIPAS SMC_RMI_CALL(0x0168) +#define SMC_RMI_RTT_SET_RIPAS SMC_RMI_CALL(0x0169) + +#define RMI_ABI_MAJOR_VERSION 1 +#define RMI_ABI_MINOR_VERSION 0 + +#define RMI_ABI_VERSION_GET_MAJOR(version) ((version) >> 16) +#define RMI_ABI_VERSION_GET_MINOR(version) ((version) & 0xFFFF) +#define RMI_ABI_VERSION(major, minor) (((major) << 16) | (minor)) + +#define RMI_UNASSIGNED 0 +#define RMI_ASSIGNED 1 +#define RMI_TABLE 2 + +#define RMI_RETURN_STATUS(ret) ((ret) & 0xFF) +#define RMI_RETURN_INDEX(ret) (((ret) >> 8) & 0xFF) + +#define RMI_SUCCESS 0 +#define RMI_ERROR_INPUT 1 +#define RMI_ERROR_REALM 2 +#define RMI_ERROR_REC 3 +#define RMI_ERROR_RTT 4 + +enum rmi_ripas { + RMI_EMPTY = 0, + RMI_RAM = 1, + RMI_DESTROYED = 2, +}; + +#define RMI_NO_MEASURE_CONTENT 0 +#define RMI_MEASURE_CONTENT 1 + +#define RMI_FEATURE_REGISTER_0_S2SZ GENMASK(7, 0) +#define RMI_FEATURE_REGISTER_0_LPA2 BIT(8) +#define RMI_FEATURE_REGISTER_0_SVE_EN BIT(9) +#define RMI_FEATURE_REGISTER_0_SVE_VL GENMASK(13, 10) +#define RMI_FEATURE_REGISTER_0_NUM_BPS GENMASK(19, 14) +#define RMI_FEATURE_REGISTER_0_NUM_WPS GENMASK(25, 20) +#define RMI_FEATURE_REGISTER_0_PMU_EN BIT(26) +#define RMI_FEATURE_REGISTER_0_PMU_NUM_CTRS GENMASK(31, 27) +#define RMI_FEATURE_REGISTER_0_HASH_SHA_256 BIT(32) +#define RMI_FEATURE_REGISTER_0_HASH_SHA_512 BIT(33) +#define RMI_FEATURE_REGISTER_0_GICV3_NUM_LRS GENMASK(37, 34) +#define RMI_FEATURE_REGISTER_0_MAX_RECS_ORDER GENMASK(41, 38) +#define RMI_FEATURE_REGISTER_0_Reserved GENMASK(63, 42) + +#define RMI_REALM_PARAM_FLAG_LPA2 BIT(0) +#define RMI_REALM_PARAM_FLAG_SVE BIT(1) +#define RMI_REALM_PARAM_FLAG_PMU BIT(2) + +/* + * Note many of these fields are smaller than u64 but all fields have u64 + * alignment, so use u64 to ensure correct alignment. + */ +struct realm_params { + union { /* 0x0 */ + struct { + u64 flags; + u64 s2sz; + u64 sve_vl; + u64 num_bps; + u64 num_wps; + u64 pmu_num_ctrs; + u64 hash_algo; + }; + u8 padding0[0x400]; + }; + union { /* 0x400 */ + u8 rpv[64]; + u8 padding1[0x400]; + }; + union { /* 0x800 */ + struct { + u64 vmid; + u64 rtt_base; + s64 rtt_level_start; + u64 rtt_num_start; + }; + u8 padding2[0x800]; + }; +}; + +/* + * The number of GPRs (starting from X0) that are + * configured by the host when a REC is created. + */ +#define REC_CREATE_NR_GPRS 8 + +#define REC_PARAMS_FLAG_RUNNABLE BIT_ULL(0) + +#define REC_PARAMS_AUX_GRANULES 16 + +struct rec_params { + union { /* 0x0 */ + u64 flags; + u8 padding0[0x100]; + }; + union { /* 0x100 */ + u64 mpidr; + u8 padding1[0x100]; + }; + union { /* 0x200 */ + u64 pc; + u8 padding2[0x100]; + }; + union { /* 0x300 */ + u64 gprs[REC_CREATE_NR_GPRS]; + u8 padding3[0x500]; + }; + union { /* 0x800 */ + struct { + u64 num_rec_aux; + u64 aux[REC_PARAMS_AUX_GRANULES]; + }; + u8 padding4[0x800]; + }; +}; + +#define REC_ENTER_FLAG_EMULATED_MMIO BIT(0) +#define REC_ENTER_FLAG_INJECT_SEA BIT(1) +#define REC_ENTER_FLAG_TRAP_WFI BIT(2) +#define REC_ENTER_FLAG_TRAP_WFE BIT(3) +#define REC_ENTER_FLAG_RIPAS_RESPONSE BIT(4) + +#define REC_RUN_GPRS 31 +#define REC_MAX_GIC_NUM_LRS 16 + +struct rec_enter { + union { /* 0x000 */ + u64 flags; + u8 padding0[0x200]; + }; + union { /* 0x200 */ + u64 gprs[REC_RUN_GPRS]; + u8 padding1[0x100]; + }; + union { /* 0x300 */ + struct { + u64 gicv3_hcr; + u64 gicv3_lrs[REC_MAX_GIC_NUM_LRS]; + }; + u8 padding2[0x100]; + }; + u8 padding3[0x400]; +}; + +#define RMI_EXIT_SYNC 0x00 +#define RMI_EXIT_IRQ 0x01 +#define RMI_EXIT_FIQ 0x02 +#define RMI_EXIT_PSCI 0x03 +#define RMI_EXIT_RIPAS_CHANGE 0x04 +#define RMI_EXIT_HOST_CALL 0x05 +#define RMI_EXIT_SERROR 0x06 + +struct rec_exit { + union { /* 0x000 */ + u8 exit_reason; + u8 padding0[0x100]; + }; + union { /* 0x100 */ + struct { + u64 esr; + u64 far; + u64 hpfar; + }; + u8 padding1[0x100]; + }; + union { /* 0x200 */ + u64 gprs[REC_RUN_GPRS]; + u8 padding2[0x100]; + }; + union { /* 0x300 */ + struct { + u64 gicv3_hcr; + u64 gicv3_lrs[REC_MAX_GIC_NUM_LRS]; + u64 gicv3_misr; + u64 gicv3_vmcr; + }; + u8 padding3[0x100]; + }; + union { /* 0x400 */ + struct { + u64 cntp_ctl; + u64 cntp_cval; + u64 cntv_ctl; + u64 cntv_cval; + }; + u8 padding4[0x100]; + }; + union { /* 0x500 */ + struct { + u64 ripas_base; + u64 ripas_top; + u64 ripas_value; + }; + u8 padding5[0x100]; + }; + union { /* 0x600 */ + u16 imm; + u8 padding6[0x100]; + }; + union { /* 0x700 */ + struct { + u8 pmu_ovf_status; + }; + u8 padding7[0x100]; + }; +}; + +struct rec_run { + struct rec_enter enter; + struct rec_exit exit; +}; + +#endif /* __ASM_RMI_SMC_H */ diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h new file mode 100644 index 0000000000000000000000000000000000000000..5f9c8623183dddf6c96be5dbe9fa1c5e7136adaa --- /dev/null +++ b/arch/arm64/include/asm/rsi.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 ARM Ltd. + */ + +#ifndef __ASM_RSI_H_ +#define __ASM_RSI_H_ + +#include +#include +#include + +DECLARE_STATIC_KEY_FALSE(rsi_present); + +void __init arm64_rsi_init(void); + +bool arm64_rsi_is_protected(phys_addr_t base, size_t size); + +static inline bool is_realm_world(void) +{ + return static_branch_unlikely(&rsi_present); +} + +static inline int rsi_set_memory_range(phys_addr_t start, phys_addr_t end, + enum ripas state, unsigned long flags) +{ + unsigned long ret; + phys_addr_t top; + + while (start != end) { + ret = rsi_set_addr_range_state(start, end, state, flags, &top); + if (ret || top < start || top > end) + return -EINVAL; + start = top; + } + + return 0; +} + +/* + * Convert the specified range to RAM. Do not use this if you rely on the + * contents of a page that may already be in RAM state. + */ +static inline int rsi_set_memory_range_protected(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_CHANGE_DESTROYED); +} + +/* + * Convert the specified range to RAM. Do not convert any pages that may have + * been DESTROYED, without our permission. + */ +static inline int rsi_set_memory_range_protected_safe(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_NO_CHANGE_DESTROYED); +} + +static inline int rsi_set_memory_range_shared(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_EMPTY, + RSI_CHANGE_DESTROYED); +} +#endif /* __ASM_RSI_H_ */ diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h new file mode 100644 index 0000000000000000000000000000000000000000..e6a211001bd38edbb8fa3922b17ec59e6d12bbc4 --- /dev/null +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_RSI_CMDS_H +#define __ASM_RSI_CMDS_H + +#include + +#include + +#define RSI_GRANULE_SHIFT 12 +#define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT) + +enum ripas { + RSI_RIPAS_EMPTY = 0, + RSI_RIPAS_RAM = 1, + RSI_RIPAS_DESTROYED = 2, + RSI_RIPAS_DEV = 3, +}; + +static inline unsigned long rsi_request_version(unsigned long req, + unsigned long *out_lower, + unsigned long *out_higher) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_ABI_VERSION, req, 0, 0, 0, 0, 0, 0, &res); + + if (out_lower) + *out_lower = res.a1; + if (out_higher) + *out_higher = res.a2; + + return res.a0; +} + +static inline unsigned long rsi_get_realm_config(struct realm_config *cfg) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_REALM_CONFIG, virt_to_phys(cfg), + 0, 0, 0, 0, 0, 0, &res); + return res.a0; +} + +static inline unsigned long rsi_ipa_state_get(phys_addr_t start, + phys_addr_t end, + enum ripas *state, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_GET, + start, end, 0, 0, 0, 0, 0, + &res); + + if (res.a0 == RSI_SUCCESS) { + if (top) + *top = res.a1; + if (state) + *state = res.a2; + } + + return res.a0; +} + +static inline long rsi_set_addr_range_state(phys_addr_t start, + phys_addr_t end, + enum ripas state, + unsigned long flags, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_SET, start, end, state, + flags, 0, 0, 0, &res); + + if (top) + *top = res.a1; + + if (res.a2 != RSI_ACCEPT) + return -EPERM; + + return res.a0; +} + +/** + * rsi_attestation_token_init - Initialise the operation to retrieve an + * attestation token. + * + * @challenge: The challenge data to be used in the attestation token + * generation. + * @size: Size of the challenge data in bytes. + * + * Initialises the attestation token generation and returns an upper bound + * on the attestation token size that can be used to allocate an adequate + * buffer. The caller is expected to subsequently call + * rsi_attestation_token_continue() to retrieve the attestation token data on + * the same CPU. + * + * Returns: + * On success, returns the upper limit of the attestation report size. + * Otherwise, -EINVAL + */ +static inline long +rsi_attestation_token_init(const u8 *challenge, unsigned long size) +{ + struct arm_smccc_1_2_regs regs = { 0 }; + + /* The challenge must be at least 32bytes and at most 64bytes */ + if (!challenge || size < 32 || size > 64) + return -EINVAL; + + regs.a0 = SMC_RSI_ATTESTATION_TOKEN_INIT; + memcpy(®s.a1, challenge, size); + arm_smccc_1_2_smc(®s, ®s); + + if (regs.a0 == RSI_SUCCESS) + return regs.a1; + + return -EINVAL; +} + +/** + * rsi_attestation_token_continue - Continue the operation to retrieve an + * attestation token. + * + * @granule: {I}PA of the Granule to which the token will be written. + * @offset: Offset within Granule to start of buffer in bytes. + * @size: The size of the buffer. + * @len: The number of bytes written to the buffer. + * + * Retrieves up to a RSI_GRANULE_SIZE worth of token data per call. The caller + * is expected to call rsi_attestation_token_init() before calling this + * function to retrieve the attestation token. + * + * Return: + * * %RSI_SUCCESS - Attestation token retrieved successfully. + * * %RSI_INCOMPLETE - Token generation is not complete. + * * %RSI_ERROR_INPUT - A parameter was not valid. + * * %RSI_ERROR_STATE - Attestation not in progress. + */ +static inline unsigned long rsi_attestation_token_continue(phys_addr_t granule, + unsigned long offset, + unsigned long size, + unsigned long *len) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RSI_ATTESTATION_TOKEN_CONTINUE, + granule, offset, size, 0, &res); + + if (len) + *len = res.a1; + return res.a0; +} + +#endif /* __ASM_RSI_CMDS_H */ diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h new file mode 100644 index 0000000000000000000000000000000000000000..6cb070eca9e9b2b7650832c96aab3795a558058e --- /dev/null +++ b/arch/arm64/include/asm/rsi_smc.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_RSI_SMC_H_ +#define __ASM_RSI_SMC_H_ + +#include + +/* + * This file describes the Realm Services Interface (RSI) Application Binary + * Interface (ABI) for SMC calls made from within the Realm to the RMM and + * serviced by the RMM. + */ + +/* + * The major version number of the RSI implementation. This is increased when + * the binary format or semantics of the SMC calls change. + */ +#define RSI_ABI_VERSION_MAJOR UL(1) + +/* + * The minor version number of the RSI implementation. This is increased when + * a bug is fixed, or a feature is added without breaking binary compatibility. + */ +#define RSI_ABI_VERSION_MINOR UL(0) + +#define RSI_ABI_VERSION ((RSI_ABI_VERSION_MAJOR << 16) | \ + RSI_ABI_VERSION_MINOR) + +#define RSI_ABI_VERSION_GET_MAJOR(_version) ((_version) >> 16) +#define RSI_ABI_VERSION_GET_MINOR(_version) ((_version) & 0xFFFF) + +#define RSI_SUCCESS UL(0) +#define RSI_ERROR_INPUT UL(1) +#define RSI_ERROR_STATE UL(2) +#define RSI_INCOMPLETE UL(3) +#define RSI_ERROR_UNKNOWN UL(4) + +#define SMC_RSI_FID(n) ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD, \ + n) + +/* + * Returns RSI version. + * + * arg1 == Requested interface revision + * ret0 == Status / error + * ret1 == Lower implemented interface revision + * ret2 == Higher implemented interface revision + */ +#define SMC_RSI_ABI_VERSION SMC_RSI_FID(0x190) + +/* + * Read feature register. + * + * arg1 == Feature register index + * ret0 == Status / error + * ret1 == Feature register value + */ +#define SMC_RSI_FEATURES SMC_RSI_FID(0x191) + +/* + * Read measurement for the current Realm. + * + * arg1 == Index, which measurements slot to read + * ret0 == Status / error + * ret1 == Measurement value, bytes: 0 - 7 + * ret2 == Measurement value, bytes: 8 - 15 + * ret3 == Measurement value, bytes: 16 - 23 + * ret4 == Measurement value, bytes: 24 - 31 + * ret5 == Measurement value, bytes: 32 - 39 + * ret6 == Measurement value, bytes: 40 - 47 + * ret7 == Measurement value, bytes: 48 - 55 + * ret8 == Measurement value, bytes: 56 - 63 + */ +#define SMC_RSI_MEASUREMENT_READ SMC_RSI_FID(0x192) + +/* + * Extend Realm Extensible Measurement (REM) value. + * + * arg1 == Index, which measurements slot to extend + * arg2 == Size of realm measurement in bytes, max 64 bytes + * arg3 == Measurement value, bytes: 0 - 7 + * arg4 == Measurement value, bytes: 8 - 15 + * arg5 == Measurement value, bytes: 16 - 23 + * arg6 == Measurement value, bytes: 24 - 31 + * arg7 == Measurement value, bytes: 32 - 39 + * arg8 == Measurement value, bytes: 40 - 47 + * arg9 == Measurement value, bytes: 48 - 55 + * arg10 == Measurement value, bytes: 56 - 63 + * ret0 == Status / error + */ +#define SMC_RSI_MEASUREMENT_EXTEND SMC_RSI_FID(0x193) + +/* + * Initialize the operation to retrieve an attestation token. + * + * arg1 == Challenge value, bytes: 0 - 7 + * arg2 == Challenge value, bytes: 8 - 15 + * arg3 == Challenge value, bytes: 16 - 23 + * arg4 == Challenge value, bytes: 24 - 31 + * arg5 == Challenge value, bytes: 32 - 39 + * arg6 == Challenge value, bytes: 40 - 47 + * arg7 == Challenge value, bytes: 48 - 55 + * arg8 == Challenge value, bytes: 56 - 63 + * ret0 == Status / error + * ret1 == Upper bound of token size in bytes + */ +#define SMC_RSI_ATTESTATION_TOKEN_INIT SMC_RSI_FID(0x194) + +/* + * Continue the operation to retrieve an attestation token. + * + * arg1 == The IPA of token buffer + * arg2 == Offset within the granule of the token buffer + * arg3 == Size of the granule buffer + * ret0 == Status / error + * ret1 == Length of token bytes copied to the granule buffer + */ +#define SMC_RSI_ATTESTATION_TOKEN_CONTINUE SMC_RSI_FID(0x195) + +#ifndef __ASSEMBLY__ + +struct realm_config { + union { + struct { + unsigned long ipa_bits; /* Width of IPA in bits */ + unsigned long hash_algo; /* Hash algorithm */ + }; + u8 pad[0x200]; + }; + union { + u8 rpv[64]; /* Realm Personalization Value */ + u8 pad2[0xe00]; + }; + /* + * The RMM requires the configuration structure to be aligned to a 4k + * boundary, ensure this happens by aligning this structure. + */ +} __aligned(0x1000); + +#endif /* __ASSEMBLY__ */ + +/* + * Read configuration for the current Realm. + * + * arg1 == struct realm_config addr + * ret0 == Status / error + */ +#define SMC_RSI_REALM_CONFIG SMC_RSI_FID(0x196) + +/* + * Request RIPAS of a target IPA range to be changed to a specified value. + * + * arg1 == Base IPA address of target region + * arg2 == Top of the region + * arg3 == RIPAS value + * arg4 == flags + * ret0 == Status / error + * ret1 == Top of modified IPA range + * ret2 == Whether the Host accepted or rejected the request + */ +#define SMC_RSI_IPA_STATE_SET SMC_RSI_FID(0x197) + +#define RSI_NO_CHANGE_DESTROYED UL(0) +#define RSI_CHANGE_DESTROYED UL(1) + +#define RSI_ACCEPT UL(0) +#define RSI_REJECT UL(1) + +/* + * Get RIPAS of a target IPA range. + * + * arg1 == Base IPA of target region + * arg2 == End of target IPA region + * ret0 == Status / error + * ret1 == Top of IPA region which has the reported RIPAS value + * ret2 == RIPAS value + */ +#define SMC_RSI_IPA_STATE_GET SMC_RSI_FID(0x198) + +/* + * Make a Host call. + * + * arg1 == IPA of host call structure + * ret0 == Status / error + */ +#define SMC_RSI_HOST_CALL SMC_RSI_FID(0x199) + +#endif /* __ASM_RSI_SMC_H_ */ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 0f740b7811871cbd5b35fc99fbe550830739738e..37774c7930064c6442c6ab26cfaa454f1fab319f 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -3,6 +3,7 @@ #ifndef _ASM_ARM64_SET_MEMORY_H #define _ASM_ARM64_SET_MEMORY_H +#include #include bool can_set_direct_map(void); @@ -14,4 +15,7 @@ int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + #endif /* _ASM_ARM64_SET_MEMORY_H */ diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 261d6e9df2e1009f8efab95d90dda611fcba1b3e..12cf36c3818904fb68c3ee19434c9a89054ed84b 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -81,6 +81,7 @@ void __hyp_reset_vectors(void); bool is_kvm_arm_initialised(void); DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); +DECLARE_STATIC_KEY_FALSE(kvm_rme_is_available); /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index f7ddd73a8c0fa2dabffd2782f674f3b484079875..1d73809c38515a030a726b24fa87d8e2774e7ee1 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -110,6 +110,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ #define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */ +#define KVM_ARM_VCPU_REC 9 /* VCPU REC state as part of Realm */ struct kvm_vcpu_init { __u32 target; @@ -415,6 +416,54 @@ enum { #define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 #define KVM_DEV_ARM_ITS_CTRL_RESET 4 +/* KVM_CAP_ARM_RME on VM fd */ +#define KVM_CAP_ARM_RME_CONFIG_REALM 0 +#define KVM_CAP_ARM_RME_CREATE_REALM 1 +#define KVM_CAP_ARM_RME_INIT_RIPAS_REALM 2 +#define KVM_CAP_ARM_RME_POPULATE_REALM 3 +#define KVM_CAP_ARM_RME_ACTIVATE_REALM 4 + +/* List of configuration items accepted for KVM_CAP_ARM_RME_CONFIG_REALM */ +#define ARM_RME_CONFIG_RPV 0 +#define ARM_RME_CONFIG_HASH_ALGO 1 + +#define ARM_RME_CONFIG_MEASUREMENT_ALGO_SHA256 0 +#define ARM_RME_CONFIG_MEASUREMENT_ALGO_SHA512 1 + +#define ARM_RME_CONFIG_RPV_SIZE 64 + +struct arm_rme_config { + __u32 cfg; + union { + /* cfg == ARM_RME_CONFIG_RPV */ + struct { + __u8 rpv[ARM_RME_CONFIG_RPV_SIZE]; + }; + + /* cfg == ARM_RME_CONFIG_HASH_ALGO */ + struct { + __u32 hash_algo; + }; + + /* Fix the size of the union */ + __u8 reserved[256]; + }; +}; + +#define KVM_ARM_RME_POPULATE_FLAGS_MEASURE (1 << 0) +struct arm_rme_populate_realm { + __u64 base; + __u64 size; + __u32 flags; + __u32 reserved[3]; +}; + +struct arm_rme_init_ripas { + __u64 base; + __u64 size; + __u64 reserved[2]; +}; + /* Device Control API on vcpu fd */ #define KVM_ARM_VCPU_PMU_V3_CTRL 0 #define KVM_ARM_VCPU_PMU_V3_IRQ 0 diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index d95b3d6b471a7d63957c47151fd6cb404ca0f4c7..c6a5be77d77d5b438fd2a3ba328d74b5ecc620e2 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -34,7 +34,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ syscall.o proton-pack.o idreg-override.o idle.o \ - patching.o + patching.o rsi.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 2b478ca356b00f1f71c11d40e6bb6eb66cf9b173..76cf3e63422ac7d5242921f48169adc73f34a64a 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -32,8 +32,16 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } if (region_is_misaligned(md)) { static bool __initdata code_is_misaligned; diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 0000000000000000000000000000000000000000..be868cb5a4b45cef31119826dbf9ca79645ca33f --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + +DEFINE_STATIC_KEY_FALSE_RO(rsi_present); +EXPORT_SYMBOL(rsi_present); + +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + +static bool rsi_version_matches(void) +{ + unsigned long ver_lower, ver_higher; + unsigned long ret = rsi_request_version(RSI_ABI_VERSION, + &ver_lower, + &ver_higher); + + if (ret == SMCCC_RET_NOT_SUPPORTED) + return false; + + if (ret != RSI_SUCCESS) { + pr_err("RME: RMM doesn't support RSI version %lu.%lu. Supported range: %lu.%lu-%lu.%lu\n", + RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR, + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower), + RSI_ABI_VERSION_GET_MAJOR(ver_higher), + RSI_ABI_VERSION_GET_MINOR(ver_higher)); + return false; + } + + pr_info("RME: Using RSI version %lu.%lu\n", + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower)); + + return true; +} + +static void __init arm64_rsi_setup_memory(void) +{ + u64 i; + phys_addr_t start, end; + + /* + * Iterate over the available memory ranges and convert the state to + * protected memory. We should take extra care to ensure that we DO NOT + * permit any "DESTROYED" pages to be converted to "RAM". + * + * panic() is used because if the attempt to switch the memory to + * protected has failed here, then future accesses to the memory are + * simply going to be reflected as a SEA (Synchronous External Abort) + * which we can't handle. Bailing out early prevents the guest limping + * on and dying later. + */ + for_each_mem_range(i, &start, &end) { + if (rsi_set_memory_range_protected_safe(start, end)) { + panic("Failed to set memory range to protected: %pa-%pa", + &start, &end); + } + } +} + +/* + * Check if a given PA range is Trusted (e.g., Protected memory, a Trusted Device + * mapping, or an MMIO emulated in the Realm world). + * + * We can rely on the RIPAS value of the region to detect if a given region is + * protected. + * + * RIPAS_DEV - A trusted device memory or a trusted emulated MMIO (in the Realm + * world + * RIPAS_RAM - Memory (RAM), protected by the RMM guarantees. (e.g., Firmware + * reserved regions for data sharing). + * + * RIPAS_DESTROYED is a special case of one of the above, where the host did + * something without our permission and as such we can't do anything about it. + * + * The only case where something is emulated by the untrusted hypervisor or is + * backed by shared memory is indicated by RSI_RIPAS_EMPTY. + */ +bool arm64_rsi_is_protected(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ? */ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas == RSI_RIPAS_EMPTY) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(arm64_rsi_is_protected); + +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (arm64_rsi_is_protected(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + +void __init arm64_rsi_init(void) +{ + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) + return; + if (!rsi_version_matches()) + return; + if (!can_set_direct_map()) { + pr_err("RME: Cannot set the kernel direct map, consider CONFIG_RODATA_FULL_DEFAULT_ENABLED=y or rodata=full\n"); + return; + } + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); + + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + + if (realm_register_memory_enc_ops()) + return; + + arm64_rsi_setup_memory(); + + static_branch_enable(&rsi_present); +} + diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 040b0175334c057f955e63b65c51182efb4e8f8d..a5711b5bd77f43b4f1f7a33fbe8efb8ee22f2ac7 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -371,6 +372,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) else psci_acpi_init(); + arm64_rsi_init(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index c0c050e53157d9908c91fd781aa1b5d3271e4092..2eec980cbe5c073d7b611a0ce02bbc2af440952a 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -20,7 +20,8 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ - vgic/vgic-its.o vgic/vgic-debug.o + vgic/vgic-its.o vgic/vgic-debug.o \ + rme.o rme-exit.o kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index a795dad2b99ab1cb598f8430fe67cc76b7b8651e..70c1fb6c217ecfdc57102fadc66e97b704f475a2 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -162,6 +162,13 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset) { + struct kvm_vcpu *vcpu = ctxt->vcpu; + + if (kvm_is_realm(vcpu->kvm)) { + WARN_ON(offset); + return; + } + if (!ctxt->offset.vm_offset) { WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt)); return; @@ -460,6 +467,21 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, } } +void kvm_realm_timers_update(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *arch_timer = &vcpu->arch.timer_cpu; + int i; + + for (i = 0; i < NR_KVM_EL0_TIMERS; i++) { + struct arch_timer_context *timer = &arch_timer->timers[i]; + bool status = timer_get_ctl(timer) & ARCH_TIMER_CTRL_IT_STAT; + bool level = kvm_timer_irq_can_fire(timer) && status; + + if (level != timer->irq.level) + kvm_timer_update_irq(vcpu, level, timer); + } +} + /* Only called for a fully emulated timer */ static void timer_emulate(struct arch_timer_context *ctx) { @@ -829,6 +851,8 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) if (unlikely(!timer->enabled)) return; + kvm_timer_unblocking(vcpu); + get_timer_map(vcpu, &map); if (static_branch_likely(&has_gic_active_state)) { @@ -842,8 +866,6 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) kvm_timer_vcpu_load_nogic(vcpu); } - kvm_timer_unblocking(vcpu); - timer_restore_state(map.direct_vtimer); if (map.direct_ptimer) timer_restore_state(map.direct_ptimer); @@ -988,7 +1010,9 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) ctxt->vcpu = vcpu; - if (timerid == TIMER_VTIMER) + if (kvm_is_realm(vcpu->kvm)) + ctxt->offset.vm_offset = NULL; + else if (timerid == TIMER_VTIMER) ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; else ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; @@ -1011,13 +1035,19 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = vcpu_timer(vcpu); + u64 cntvoff; for (int i = 0; i < NR_KVM_TIMERS; i++) timer_context_init(vcpu, i); + if (kvm_is_realm(vcpu->kvm)) + cntvoff = 0; + else + cntvoff = kvm_phys_timer_read(); + /* Synchronize offsets across timers of a VM if not already provided */ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { - timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); + timer_set_offset(vcpu_vtimer(vcpu), cntvoff); timer_set_offset(vcpu_ptimer(vcpu), 0); } @@ -1525,6 +1555,13 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) return -EINVAL; } + /* + * We don't use mapped IRQs for Realms because the RMI doesn't allow + * us setting the LR.HW bit in the VGIC. + */ + if (vcpu_is_rec(vcpu)) + return 0; + get_timer_map(vcpu, &map); ret = kvm_vgic_map_phys_irq(vcpu, @@ -1656,6 +1693,9 @@ int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, if (offset->reserved) return -EINVAL; + if (kvm_is_realm(kvm)) + return -EINVAL; + mutex_lock(&kvm->lock); if (lock_all_vcpus(kvm)) { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fe4314af8eeccc6b80c9210df0ae5965911a3fb5..20607591763fffffdb02ec349e59f751f179ad78 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ #include #include #include +#include #include #include @@ -47,6 +49,8 @@ static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; +DEFINE_STATIC_KEY_FALSE(kvm_rme_is_available); + DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); @@ -115,6 +119,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->slots_lock); break; + case KVM_CAP_ARM_RME: + mutex_lock(&kvm->lock); + r = kvm_realm_enable_cap(kvm, cap); + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -146,6 +155,21 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_unlock(&kvm->lock); #endif + if (type & ~(KVM_VM_TYPE_ARM_MASK | KVM_VM_TYPE_ARM_IPA_SIZE_MASK)) + return -EINVAL; + + switch (type & KVM_VM_TYPE_ARM_MASK) { + case KVM_VM_TYPE_ARM_NORMAL: + break; + case KVM_VM_TYPE_ARM_REALM: + if (!static_branch_unlikely(&kvm_rme_is_available)) + return -EPERM; + kvm->arch.is_realm = true; + break; + default: + return -EINVAL; + } + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; @@ -175,6 +199,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); + /* Initialise the realm bits after the generic bits are enabled */ + if (kvm_is_realm(kvm)) { + ret = kvm_init_realm_vm(kvm); + if (ret) + goto err_free_cpumask; + } + return 0; err_free_cpumask: @@ -209,6 +240,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_unshare_hyp(kvm, kvm + 1); kvm_arm_teardown_hypercalls(kvm); + kvm_destroy_realm(kvm); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -226,21 +258,23 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ARM_PSCI: case KVM_CAP_ARM_PSCI_0_2: - case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: case KVM_CAP_ARM_NISV_TO_USER: case KVM_CAP_ARM_INJECT_EXT_DABT: - case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: case KVM_CAP_IRQFD_RESAMPLE: - case KVM_CAP_COUNTER_OFFSET: r = 1; break; + case KVM_CAP_COUNTER_OFFSET: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_SET_GUEST_DEBUG: + r = !kvm_is_realm(kvm); + break; case KVM_CAP_SET_GUEST_DEBUG2: return KVM_GUESTDBG_VALID_MASK; case KVM_CAP_ARM_SET_DEVICE_ADDR: @@ -280,16 +314,19 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = system_supports_mte(); break; case KVM_CAP_STEAL_TIME: - r = kvm_arm_pvtime_supported(); + if (kvm_is_realm(kvm)) + r = 0; + else + r = kvm_arm_pvtime_supported(); break; case KVM_CAP_ARM_EL1_32BIT: r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1); break; case KVM_CAP_GUEST_DEBUG_HW_BPS: - r = get_num_brps(); + r = kvm_is_realm(kvm) ? 0 : get_num_brps(); break; case KVM_CAP_GUEST_DEBUG_HW_WPS: - r = get_num_wrps(); + r = kvm_is_realm(kvm) ? 0 : get_num_wrps(); break; case KVM_CAP_ARM_PMU_V3: r = kvm_arm_support_pmu_v3(); @@ -301,7 +338,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = get_kvm_ipa_limit(); break; case KVM_CAP_ARM_SVE: - r = system_supports_sve(); + if (kvm_is_realm(kvm)) + r = kvm_rme_supports_sve(); + else + r = system_supports_sve(); break; case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: @@ -316,6 +356,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: r = kvm_supported_block_sizes(); break; + case KVM_CAP_ARM_RME: + r = static_key_enabled(&kvm_rme_is_available); + break; default: r = 0; } @@ -368,6 +411,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu_clear_flag(vcpu, VCPU_INITIALIZED); bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + vcpu->arch.rec.mpidr = INVALID_HWID; + vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; /* @@ -447,10 +492,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vgic_load(vcpu); kvm_timer_vcpu_load(vcpu); - if (has_vhe()) - kvm_vcpu_load_sysregs_vhe(vcpu); - kvm_arch_vcpu_load_fp(vcpu); - kvm_vcpu_pmu_restore_guest(vcpu); if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); @@ -463,23 +504,38 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu_ptrauth_disable(vcpu); kvm_arch_vcpu_load_debug_state_flags(vcpu); + /* No additional state needs to be loaded on Realmed VMs */ + if (vcpu_is_rec(vcpu)) + return; + + if (has_vhe()) + kvm_vcpu_load_sysregs_vhe(vcpu); + kvm_arch_vcpu_load_fp(vcpu); + kvm_vcpu_pmu_restore_guest(vcpu); + if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + kvm_timer_vcpu_put(vcpu); + kvm_vgic_put(vcpu); + + vcpu->cpu = -1; + + if (vcpu_is_rec(vcpu)) + return; + kvm_arch_vcpu_put_debug_state_flags(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_sysregs_vhe(vcpu); - kvm_timer_vcpu_put(vcpu); - kvm_vgic_put(vcpu); + kvm_vcpu_pmu_restore_host(vcpu); kvm_arm_vmid_clear_active(); vcpu_clear_on_unsupported_cpu(vcpu); - vcpu->cpu = -1; } static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) @@ -635,6 +691,11 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (kvm_vm_is_protected(kvm)) kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); + if (!irqchip_in_kernel(kvm) && kvm_is_realm(vcpu->kvm)) { + /* Userspace irqchip not yet supported with Realms */ + return -EOPNOTSUPP; + } + mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); @@ -796,8 +857,7 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) - kvm_pmu_handle_pmcr(vcpu, - __vcpu_sys_reg(vcpu, PMCR_EL0)); + kvm_pmu_handle_pmcr(vcpu, kvm_vcpu_read_pmcr(vcpu)); if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu)) kvm_vcpu_pmu_restore_guest(vcpu); @@ -921,6 +981,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_UNKNOWN; run->flags = 0; while (ret > 0) { + bool pmu_stopped = false; + /* * Check conditions before entering the guest */ @@ -931,6 +993,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (ret > 0) ret = check_vcpu_requests(vcpu); + if (ret > 0 && vcpu_is_rec(vcpu)) + ret = kvm_rec_pre_enter(vcpu); + /* * Preparing the interrupts to be injected also * involves poking the GIC, which must be done in a @@ -949,6 +1014,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_pmu_flush_hwstate(vcpu); + if (vcpu_is_rec(vcpu) && kvm_pmu_get_irq_level(vcpu)) { + pmu_stopped = true; + arm_pmu_set_phys_irq(false); + } + local_irq_disable(); kvm_vgic_flush_hwstate(vcpu); @@ -984,7 +1054,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) trace_kvm_entry(*vcpu_pc(vcpu)); guest_timing_enter_irqoff(); - ret = kvm_arm_vcpu_enter_exit(vcpu); + if (vcpu_is_rec(vcpu)) + ret = kvm_rec_enter(vcpu); + else + ret = kvm_arm_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; @@ -1038,13 +1111,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) local_irq_enable(); - trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); - /* Exit types that need handling before we can be preempted */ - handle_exit_early(vcpu, ret); + if (!vcpu_is_rec(vcpu)) { + trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), + *vcpu_pc(vcpu)); + + handle_exit_early(vcpu, ret); + } preempt_enable(); + if (pmu_stopped) + arm_pmu_set_phys_irq(true); + /* * The ARMv8 architecture doesn't give the hypervisor * a mechanism to prevent a guest from dropping to AArch32 EL0 @@ -1064,7 +1143,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) ret = ARM_EXCEPTION_IL; } - ret = handle_exit(vcpu, ret); + if (vcpu_is_rec(vcpu)) + ret = handle_rec_exit(vcpu, ret); + else + ret = handle_exit(vcpu, ret); } /* Tell userspace about in-kernel device output levels */ @@ -1185,6 +1267,19 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, return -EINVAL; } +static unsigned long system_supported_vcpu_features(struct kvm *kvm) +{ + unsigned long features = KVM_VCPU_VALID_FEATURES; + + if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1)) + clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features); + + if (!kvm_is_realm(kvm)) + clear_bit(KVM_ARM_VCPU_REC, &features); + + return features; +} + static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { @@ -1199,12 +1294,12 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, return -ENOENT; } + if (features & ~system_supported_vcpu_features(vcpu->kvm)) + return -EINVAL; + if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) return 0; - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) - return -EINVAL; - /* MTE is incompatible with AArch32 */ if (kvm_has_mte(vcpu->kvm)) return -EINVAL; @@ -1213,6 +1308,10 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features)) return -EINVAL; + /* RME is incompatible with AArch32 */ + if (test_bit(KVM_ARM_VCPU_REC, &features)) + return -EINVAL; + return 0; } @@ -1224,6 +1323,21 @@ static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu, return !bitmap_equal(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); } +static int kvm_setup_vcpu(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret = 0; + + /* + * When the vCPU has a PMU, but no PMU is set for the guest + * yet, set the default one. + */ + if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu) + ret = kvm_arm_set_default_pmu(kvm); + + return ret; +} + static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { @@ -1239,6 +1353,10 @@ static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); + ret = kvm_setup_vcpu(vcpu); + if (ret) + goto out_unlock; + /* Now we know what it is, we can reset it. */ ret = kvm_reset_vcpu(vcpu); if (ret) { @@ -1397,6 +1515,22 @@ static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, return __kvm_arm_vcpu_set_events(vcpu, events); } +static int kvm_arm_vcpu_rmm_psci_complete(struct kvm_vcpu *vcpu, + struct kvm_arm_rmm_psci_complete *arg) +{ + struct kvm_vcpu *target = kvm_mpidr_to_vcpu(vcpu->kvm, arg->target_mpidr); + + if (!target) + return -EINVAL; + + /* + * RMM v1.0 only supports PSCI_RET_SUCCESS or PSCI_RET_DENIED + * for the status. But, let us leave it to the RMM to filter + * for making this future proof. + */ + return realm_psci_complete(vcpu, target, arg->psci_status); +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1451,10 +1585,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (unlikely(!kvm_vcpu_initialized(vcpu))) break; - r = -EPERM; - if (!kvm_arm_vcpu_is_finalized(vcpu)) - break; - r = -EFAULT; if (copy_from_user(®_list, user_list, sizeof(reg_list))) break; @@ -1519,6 +1649,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return kvm_arm_vcpu_finalize(vcpu, what); } + case KVM_ARM_VCPU_RMM_PSCI_COMPLETE: { + struct kvm_arm_rmm_psci_complete req; + + if (!vcpu_is_rec(vcpu)) + return -EPERM; + if (copy_from_user(&req, argp, sizeof(req))) + return -EFAULT; + return kvm_arm_vcpu_rmm_psci_complete(vcpu, &req); + } default: r = -EINVAL; } @@ -2411,6 +2550,9 @@ static __init int kvm_arm_init(void) in_hyp_mode = is_kernel_in_hyp_mode(); + if (in_hyp_mode) + kvm_init_rme(); + if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || cpus_have_final_cap(ARM64_WORKAROUND_1508412)) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index efe82cc86bd1f3fc73900c85d3170d085f9be3ed..0566e3429366bab0cccd66eacfb98d2a209ba579 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -73,6 +73,24 @@ static u64 core_reg_offset_from_id(u64 id) return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); } +static bool kvm_realm_validate_core_reg(u64 off) +{ + /* + * Note that GPRs can only sometimes be controlled by the VMM. + * For PSCI only X0-X6 are used, higher registers are ignored (restored + * from the REC). + * For HOST_CALL all of X0-X30 are copied to the RsiHostCall structure. + * For emulated MMIO X0 is always used. + */ + switch (off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + case KVM_REG_ARM_CORE_REG(regs.pc): + return true; + } + return false; +} + static int core_reg_size_from_offset(const struct kvm_vcpu *vcpu, u64 off) { int size; @@ -342,7 +360,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (!vcpu_has_sve(vcpu)) return -ENOENT; - if (kvm_arm_vcpu_sve_finalized(vcpu)) + if (kvm_arm_vcpu_sve_finalized(vcpu) || kvm_realm_is_created(vcpu->kvm)) return -EPERM; /* too late! */ if (WARN_ON(vcpu->arch.sve_state)) @@ -356,7 +374,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (vq_present(vqs, vq)) max_vq = vq; - if (max_vq > sve_vq_from_vl(kvm_sve_max_vl)) + if (max_vq > sve_vq_from_vl(kvm_sve_get_max_vl(vcpu->kvm))) return -EINVAL; /* @@ -600,8 +618,6 @@ static const u64 timer_reg_list[] = { KVM_REG_ARM_PTIMER_CVAL, }; -#define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list) - static bool is_timer_reg(u64 index) { switch (index) { @@ -616,9 +632,14 @@ static bool is_timer_reg(u64 index) return false; } +static unsigned long num_timer_regs(struct kvm_vcpu *vcpu) +{ + return kvm_is_realm(vcpu->kvm) ? 0 : ARRAY_SIZE(timer_reg_list); +} + static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - for (int i = 0; i < NUM_TIMER_REGS; i++) { + for (int i = 0; i < num_timer_regs(vcpu); i++) { if (put_user(timer_reg_list[i], uindices)) return -EFAULT; uindices++; @@ -653,11 +674,11 @@ static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu) { const unsigned int slices = vcpu_sve_slices(vcpu); - if (!vcpu_has_sve(vcpu)) + if (!vcpu_has_sve(vcpu) || !kvm_arm_vcpu_sve_finalized(vcpu)) return 0; - /* Policed by KVM_GET_REG_LIST: */ - WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); + if (kvm_is_realm(vcpu->kvm)) + return 1; /* KVM_REG_ARM64_SVE_VLS */ return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */) + 1; /* KVM_REG_ARM64_SVE_VLS */ @@ -674,8 +695,8 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, if (!vcpu_has_sve(vcpu)) return 0; - /* Policed by KVM_GET_REG_LIST: */ - WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu)); + if (!kvm_arm_vcpu_sve_finalized(vcpu)) + return -EPERM; /* * Enumerate this first, so that userspace can save/restore in @@ -686,6 +707,9 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu, return -EFAULT; ++num_regs; + if (kvm_is_realm(vcpu->kvm)) + return num_regs; + for (i = 0; i < slices; i++) { for (n = 0; n < SVE_NUM_ZREGS; n++) { reg = KVM_REG_ARM64_SVE_ZREG(n, i); @@ -723,7 +747,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) res += num_sve_regs(vcpu); res += kvm_arm_num_sys_reg_descs(vcpu); res += kvm_arm_get_fw_num_regs(vcpu); - res += NUM_TIMER_REGS; + res += num_timer_regs(vcpu); return res; } @@ -755,7 +779,7 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) ret = copy_timer_indices(vcpu, uindices); if (ret < 0) return ret; - uindices += NUM_TIMER_REGS; + uindices += num_timer_regs(vcpu); return kvm_arm_copy_sys_reg_indices(vcpu, uindices); } @@ -780,12 +804,44 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) return kvm_arm_sys_reg_get_reg(vcpu, reg); } +#define KVM_REG_ARM_PMCR_EL0 ARM64_SYS_REG(3, 3, 9, 12, 0) +#define KVM_REG_ARM_ID_AA64DFR0_EL1 ARM64_SYS_REG(3, 0, 0, 5, 0) + +/* + * The RMI ABI only enables setting some GPRs and PC. The selection of GPRs + * that are available depends on the Realm state and the reason for the last + * exit. All other registers are reset to architectural or otherwise defined + * reset values by the RMM, except for a few configuration fields that + * correspond to Realm parameters. + */ +static bool validate_realm_set_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) { + u64 off = core_reg_offset_from_id(reg->id); + + return kvm_realm_validate_core_reg(off); + } else { + switch (reg->id) { + case KVM_REG_ARM_PMCR_EL0: + case KVM_REG_ARM_ID_AA64DFR0_EL1: + case KVM_REG_ARM64_SVE_VLS: + return true; + } + } + + return false; +} + int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { /* We currently use nothing arch-specific in upper 32 bits */ if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32) return -EINVAL; + if (kvm_is_realm(vcpu->kvm) && !validate_realm_set_reg(vcpu, reg)) + return -EINVAL; + switch (reg->id & KVM_REG_ARM_COPROC_MASK) { case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg); case KVM_REG_ARM_FW: @@ -837,6 +893,30 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, bool has_esr = events->exception.serror_has_esr; bool ext_dabt_pending = events->exception.ext_dabt_pending; + if (vcpu_is_rec(vcpu)) { + /* Cannot inject SError into a Realm. */ + if (serror_pending) + return -EINVAL; + + /* + * If a data abort is pending, set the flag and let the RMM + * inject an SEA when the REC is scheduled to be run. + */ + if (ext_dabt_pending) { + /* + * Can only inject SEA into a Realm if the previous exit + * was due to a data abort of an Unprotected IPA. + */ + if (!(vcpu->arch.rec.run->enter.flags & REC_ENTER_FLAG_EMULATED_MMIO)) + return -EINVAL; + + vcpu->arch.rec.run->enter.flags &= ~REC_ENTER_FLAG_EMULATED_MMIO; + vcpu->arch.rec.run->enter.flags |= REC_ENTER_FLAG_INJECT_SEA; + } + + return 0; + } + if (serror_pending && has_esr) { if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) return -EINVAL; diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 7fb4df0456dea53f9cdeccdd28c2ff47a8ce6ff3..c06e9352a80af34802d2200d1fe8aa01c1a8a346 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -397,14 +397,14 @@ void kvm_arm_teardown_hypercalls(struct kvm *kvm) int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu) { - return ARRAY_SIZE(kvm_arm_fw_reg_ids); + return kvm_is_realm(vcpu->kvm) ? 0 : ARRAY_SIZE(kvm_arm_fw_reg_ids); } int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { int i; - for (i = 0; i < ARRAY_SIZE(kvm_arm_fw_reg_ids); i++) { + for (i = 0; i < kvm_arm_get_fw_num_regs(vcpu); i++) { if (put_user(kvm_arm_fw_reg_ids[i], uindices++)) return -EFAULT; } diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 0bd93a5f21ce382506803d018ba7df7072ac0aa1..50f434af083822273a45524e993fc569940c791c 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -165,7 +165,9 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) */ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) { - if (vcpu_el1_is_32bit(vcpu)) + if (unlikely(vcpu_is_rec(vcpu))) + vcpu->arch.rec.run->enter.flags |= REC_ENTER_FLAG_INJECT_SEA; + else if (vcpu_el1_is_32bit(vcpu)) inject_abt32(vcpu, false, addr); else inject_abt64(vcpu, false, addr); @@ -224,6 +226,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu) */ void kvm_inject_undefined(struct kvm_vcpu *vcpu) { + WARN(vcpu_is_rec(vcpu), "Unexpected undefined exception injection to REC"); if (vcpu_el1_is_32bit(vcpu)) inject_undef32(vcpu); else diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 2aa503ff742ee57e5023453e7d143d93fe909121..ac214bd9dbc26ec268b1b7b24031174d06bcb473 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -6,6 +6,7 @@ #include #include +#include #include #include "trace.h" @@ -136,14 +137,21 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); - vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); + + if (vcpu_is_rec(vcpu)) + vcpu->arch.rec.run->enter.gprs[0] = data; + else + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); } /* * The MMIO instruction is emulated and should not be re-executed * in the guest. */ - kvm_incr_pc(vcpu); + if (vcpu_is_rec(vcpu)) + vcpu->arch.rec.run->enter.flags |= REC_ENTER_FLAG_EMULATED_MMIO; + else + kvm_incr_pc(vcpu); return 1; } @@ -163,6 +171,11 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) * volunteered to do so, and bail out otherwise. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { + if (vcpu_is_rec(vcpu)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 482280fe22d7c59d2539b9f0e758cfc95afad0ff..512a6b743d046a20c5b27f36e89ab921afa016e6 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -310,6 +310,7 @@ static void invalidate_icache_guest_page(void *va, size_t size) * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap * @may_block: Whether or not we are permitted to block + * @only_shared: If true then protected mappings should not be unmapped * * Clear a range of stage-2 mappings, lowering the various ref-counts. Must * be called while holding mmu_lock (unless for freeing the stage2 pgd before @@ -317,20 +318,26 @@ static void invalidate_icache_guest_page(void *va, size_t size) * with things behind our backs. */ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, - bool may_block) + bool may_block, bool only_shared) { struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); phys_addr_t end = start + size; lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap, - may_block)); + + if (kvm_is_realm(kvm)) + kvm_realm_unmap_range(kvm, start, size, !only_shared); + else + WARN_ON(stage2_apply_range(mmu, start, end, + kvm_pgtable_stage2_unmap, + may_block)); } -static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, + bool may_block) { - __unmap_stage2_range(mmu, start, size, true); + __unmap_stage2_range(mmu, start, size, may_block, false); } static void stage2_flush_memslot(struct kvm *kvm, @@ -339,7 +346,11 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush); + if (kvm_is_realm(kvm)) + kvm_realm_unmap_range(kvm, addr, end - addr, false); + else + stage2_apply_range_resched(&kvm->arch.mmu, addr, end, + kvm_pgtable_stage2_flush); } /** @@ -853,26 +864,15 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .icache_inval_pou = invalidate_icache_guest_page, }; -/** - * kvm_init_stage2_mmu - Initialise a S2 MMU structure - * @kvm: The pointer to the KVM structure - * @mmu: The pointer to the s2 MMU structure - * @type: The machine type of the virtual machine - * - * Allocates only the stage-2 HW PGD level table(s). - * Note we don't need locking here as this is only called when the VM is - * created, which can only be done once. - */ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +static int kvm_init_ipa_range(struct kvm *kvm, + struct kvm_s2_mmu *mmu, unsigned long type) { u32 kvm_ipa_limit = get_kvm_ipa_limit(); - int cpu, err; - struct kvm_pgtable *pgt; u64 mmfr0, mmfr1; u32 phys_shift; - if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) - return -EINVAL; + if (kvm_is_realm(kvm)) + kvm_ipa_limit = kvm_realm_ipa_limit(); phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (is_protected_kvm_enabled()) { @@ -894,11 +894,33 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + return 0; +} + +/** + * kvm_init_stage2_mmu - Initialise a S2 MMU structure + * @kvm: The pointer to the KVM structure + * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine + * + * Allocates only the stage-2 HW PGD level table(s). + * Note we don't need locking here as this is only called when the VM is + * created, which can only be done once. + */ +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) +{ + int cpu, err; + struct kvm_pgtable *pgt; + if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } + err = kvm_init_ipa_range(kvm, mmu, type); + if (err) + return err; + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; @@ -974,7 +996,8 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start, + true); } hva = vm_end; } while (hva < reg_end); @@ -993,6 +1016,10 @@ void stage2_unmap_vm(struct kvm *kvm) struct kvm_memory_slot *memslot; int idx, bkt; + /* For realms this is handled by the RMM so nothing to do here */ + if (kvm_is_realm(kvm)) + return; + idx = srcu_read_lock(&kvm->srcu); mmap_read_lock(current->mm); write_lock(&kvm->mmu_lock); @@ -1009,10 +1036,25 @@ void stage2_unmap_vm(struct kvm *kvm) void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) { struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); - struct kvm_pgtable *pgt = NULL; + struct kvm_pgtable *pgt; write_lock(&kvm->mmu_lock); pgt = mmu->pgt; + if (kvm_is_realm(kvm) && + (kvm_realm_state(kvm) != REALM_STATE_DEAD && + kvm_realm_state(kvm) != REALM_STATE_NONE)) { + unmap_stage2_range(mmu, 0, (~0ULL) & PAGE_MASK, false); + write_unlock(&kvm->mmu_lock); + kvm_realm_destroy_rtts(kvm, pgt->ia_bits); + + /* + * The physical PGD pages are delegated to the RMM, so cannot + * be freed at this point. This function will be called again + * from kvm_destroy_realm() after the physical pages have been + * returned at which point the memory can be freed. + */ + return; + } if (pgt) { mmu->pgd_phys = 0; mmu->pgt = NULL; @@ -1075,6 +1117,10 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (is_protected_kvm_enabled()) return -EPERM; + /* We don't support mapping special pages into a Realm */ + if (kvm_is_realm(kvm)) + return -EPERM; + size += offset_in_page(guest_ipa); guest_ipa &= PAGE_MASK; @@ -1391,6 +1437,25 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) return vma->vm_flags & VM_MTE_ALLOWED; } +static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa, + kvm_pfn_t pfn, unsigned long map_size, + enum kvm_pgtable_prot prot, + struct kvm_mmu_memory_cache *memcache) +{ + struct realm *realm = &kvm->arch.realm; + + if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W))) + return -EFAULT; + + ipa = ALIGN_DOWN(ipa, PAGE_SIZE); + + if (!kvm_realm_is_private_address(realm, ipa)) + return realm_map_non_secure(realm, ipa, pfn, map_size, + memcache); + + return realm_map_protected(realm, ipa, pfn, map_size, memcache); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -1414,6 +1479,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level); write_fault = kvm_is_write_fault(vcpu); + + /* + * Realms cannot map protected pages read-only + * FIXME: It should be possible to map unprotected pages read-only + */ + if (vcpu_is_rec(vcpu)) + write_fault = true; + exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); @@ -1455,6 +1528,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (logging_active) { force_pte = true; vma_shift = PAGE_SHIFT; + } else if (vcpu_is_rec(vcpu)) { + // Force PTE level mappings for realms + force_pte = true; + vma_shift = PAGE_SHIFT; } else { vma_shift = get_vma_page_shift(vma, hva); } @@ -1487,7 +1564,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) fault_ipa &= ~(vma_pagesize - 1); - gfn = fault_ipa >> PAGE_SHIFT; + gfn = kvm_gpa_from_fault(kvm, fault_ipa) >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); /* Don't use the VMA after the unlock -- it may have vanished */ @@ -1536,6 +1613,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault && device) return -ENOEXEC; + /* + * Adapted from cca-v8 + * Since OLK-6.6 does not implement the private_memslot_fault() + * that depends on the higher version feature guest memfd. + * Here we should handle protected addresses fault expect protected devices. + */ + if (device && vcpu_is_rec(vcpu) && + kvm_gpa_from_fault(kvm, fault_ipa) == fault_ipa) + return -EINVAL; + read_lock(&kvm->mmu_lock); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) @@ -1588,6 +1675,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, */ if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); + else if (kvm_is_realm(kvm)) + ret = realm_map_ipa(kvm, fault_ipa, pfn, vma_pagesize, + prot, memcache); else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, @@ -1697,7 +1787,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) idx = srcu_read_lock(&vcpu->kvm->srcu); - gfn = fault_ipa >> PAGE_SHIFT; + gfn = kvm_gpa_from_fault(vcpu->kvm, fault_ipa) >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); write_fault = kvm_is_write_fault(vcpu); @@ -1742,7 +1832,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * of the page size. */ fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, fault_ipa); + ret = io_mem_abort(vcpu, kvm_gpa_from_fault(vcpu->kvm, fault_ipa)); goto out_unlock; } @@ -1775,7 +1865,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, - range->may_block); + range->may_block, + !(range->attr_filter & KVM_FILTER_PRIVATE)); return false; } @@ -1821,6 +1912,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; + /* We don't support aging for Realms */ + if (kvm_is_realm(kvm)) + return true; + return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, true); @@ -1833,6 +1928,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; + /* We don't support aging for Realms */ + if (kvm_is_realm(kvm)) + return true; + return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, false); @@ -2085,7 +2184,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - unmap_stage2_range(&kvm->arch.mmu, gpa, size); + unmap_stage2_range(&kvm->arch.mmu, gpa, size, true); write_unlock(&kvm->mmu_lock); } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 3867d6d1f5d1bd3330f6106622be10425aced533..90b7755c7b66f2c442c306272b97969de8701b4e 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -72,7 +72,7 @@ static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc) static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc) { - u64 val = __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), PMCR_EL0); + u64 val = kvm_vcpu_read_pmcr(kvm_pmc_to_vcpu(pmc)); return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) || (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC)); @@ -250,9 +250,8 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) { - u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT; + u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); - val &= ARMV8_PMU_PMCR_N_MASK; if (val == 0) return BIT(ARMV8_PMU_CYCLE_IDX); else @@ -272,7 +271,7 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) if (!kvm_vcpu_has_pmu(vcpu)) return; - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val) + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) || !val) return; for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) { @@ -324,7 +323,10 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) { u64 reg = 0; - if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) { + if (vcpu_is_rec(vcpu)) + return vcpu->arch.rec.run->exit.pmu_ovf_status; + + if ((kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) { reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } @@ -425,7 +427,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, { int i; - if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) + if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E)) return; /* Weed out disabled counters */ @@ -568,7 +570,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc) { struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc); - return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) && + return (kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) && (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)); } @@ -716,10 +718,9 @@ static struct arm_pmu *kvm_pmu_probe_armpmu(void) * It is still necessary to get a valid cpu, though, to probe for the * default PMU instance as userspace is not required to specify a PMU * type. In order to uphold the preexisting behavior KVM selects the - * PMU instance for the core where the first call to the - * KVM_ARM_VCPU_PMU_V3_CTRL attribute group occurs. A dependent use case - * would be a user with disdain of all things big.LITTLE that affines - * the VMM to a particular cluster of cores. + * PMU instance for the core during vcpu init. A dependent use + * case would be a user with disdain of all things big.LITTLE that + * affines the VMM to a particular cluster of cores. * * In any case, userspace should just do the sane thing and use the UAPI * to select a PMU type directly. But, be wary of the baggage being @@ -873,6 +874,55 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) return true; } +static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) +{ + lockdep_assert_held(&kvm->arch.config_lock); + + kvm->arch.arm_pmu = arm_pmu; + kvm->arch.pmcr_n = kvm_arm_pmu_get_max_counters(kvm); +} + +/** + * kvm_arm_set_default_pmu - No PMU set, get the default one. + * @kvm: The kvm pointer + * + * The observant among you will notice that the supported_cpus + * mask does not get updated for the default PMU even though it + * is quite possible the selected instance supports only a + * subset of cores in the system. This is intentional, and + * upholds the preexisting behavior on heterogeneous systems + * where vCPUs can be scheduled on any core but the guest + * counters could stop working. + */ +int kvm_arm_set_default_pmu(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); + + if (!arm_pmu) + return -ENODEV; + + kvm_arm_set_pmu(kvm, arm_pmu); + return 0; +} + +/** + * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. + * @kvm: The kvm pointer + */ +u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; + + if (kvm_is_realm(kvm)) + return kvm_realm_max_pmu_counters(); + + /* + * The arm_pmu->num_events considers the cycle counter as well. + * Ignore that and return only the general-purpose counters. + */ + return arm_pmu->num_events - 1; +} + static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) { struct kvm *kvm = vcpu->kvm; @@ -892,7 +942,7 @@ static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) break; } - kvm->arch.arm_pmu = arm_pmu; + kvm_arm_set_pmu(kvm, arm_pmu); cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); ret = 0; break; @@ -915,23 +965,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (vcpu->arch.pmu.created) return -EBUSY; - if (!kvm->arch.arm_pmu) { - /* - * No PMU set, get the default one. - * - * The observant among you will notice that the supported_cpus - * mask does not get updated for the default PMU even though it - * is quite possible the selected instance supports only a - * subset of cores in the system. This is intentional, and - * upholds the preexisting behavior on heterogeneous systems - * where vCPUs can be scheduled on any core but the guest - * counters could stop working. - */ - kvm->arch.arm_pmu = kvm_pmu_probe_armpmu(); - if (!kvm->arch.arm_pmu) - return -ENODEV; - } - switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; @@ -1071,3 +1104,14 @@ u8 kvm_arm_pmu_get_pmuver_limit(void) ID_AA64DFR0_EL1_PMUVer_V3P5); return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), tmp); } + +/** + * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU + * @vcpu: The vcpu pointer + */ +u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) +{ + u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); + + return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); +} diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 1f69b667332b2ba9f9560dd6cfec0d8ce580104e..750b3899d462ef4e3fc429517a5d8f3c3eebc1c6 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -103,6 +103,12 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) reset_state->reset = true; kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); + /* + * Make sure we issue PSCI_COMPLETE before the VCPU can be + * scheduled. + */ + if (vcpu_is_rec(vcpu)) + realm_psci_complete(source_vcpu, vcpu, PSCI_RET_SUCCESS); /* * Make sure the reset request is observed if the RUNNABLE mp_state is @@ -115,6 +121,11 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) out_unlock: spin_unlock(&vcpu->arch.mp_state_lock); + if (vcpu_is_rec(vcpu) && ret != PSCI_RET_SUCCESS) { + realm_psci_complete(source_vcpu, vcpu, + ret == PSCI_RET_ALREADY_ON ? + PSCI_RET_SUCCESS : PSCI_RET_DENIED); + } return ret; } @@ -142,6 +153,25 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) /* Ignore other bits of target affinity */ target_affinity &= target_affinity_mask; + if (vcpu_is_rec(vcpu)) { + struct kvm_vcpu *target_vcpu; + + /* RMM supports only zero affinity level */ + if (lowest_affinity_level != 0) + return PSCI_RET_INVALID_PARAMS; + + target_vcpu = kvm_mpidr_to_vcpu(kvm, target_affinity); + if (!target_vcpu) + return PSCI_RET_INVALID_PARAMS; + + /* + * Provide the references of the source and target RECs to the + * RMM so that the RMM can complete the PSCI request. + */ + realm_psci_complete(vcpu, target_vcpu, PSCI_RET_SUCCESS); + return PSCI_RET_SUCCESS; + } + /* * If one or more VCPU matching target affinity are running * then ON else OFF diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 43a53a403f5102c81bba0d928c0f78331f800269..38282c3b52363780532c23254059eb21e3e048ce 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -45,7 +45,7 @@ static u32 __ro_after_init kvm_ipa_limit; #define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ PSR_AA32_I_BIT | PSR_AA32_F_BIT) -unsigned int __ro_after_init kvm_sve_max_vl; +static unsigned int __ro_after_init kvm_sve_max_vl; unsigned int __ro_after_init kvm_host_sve_max_vl; int __init kvm_arm_init_sve(void) @@ -76,12 +76,20 @@ int __init kvm_arm_init_sve(void) return 0; } +unsigned int kvm_sve_get_max_vl(struct kvm *kvm) +{ + if (kvm_is_realm(kvm)) + return kvm_realm_sve_max_vl(); + else + return kvm_sve_max_vl; +} + static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) { if (!system_supports_sve()) return -EINVAL; - vcpu->arch.sve_max_vl = kvm_sve_max_vl; + vcpu->arch.sve_max_vl = kvm_sve_get_max_vl(vcpu->kvm); /* * Userspace can still customize the vector lengths by writing @@ -142,6 +150,11 @@ int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature) return -EPERM; return kvm_vcpu_finalize_sve(vcpu); + case KVM_ARM_VCPU_REC: + if (!kvm_is_realm(vcpu->kvm) || !vcpu_is_rec(vcpu)) + return -EINVAL; + + return kvm_create_rec(vcpu); } return -EINVAL; @@ -152,6 +165,11 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) if (vcpu_has_sve(vcpu) && !kvm_arm_vcpu_sve_finalized(vcpu)) return false; + if (kvm_is_realm(vcpu->kvm) && + !(vcpu_is_rec(vcpu) && kvm_arm_rec_finalized(vcpu) && + READ_ONCE(vcpu->kvm->arch.realm.state) == REALM_STATE_ACTIVE)) + return false; + return true; } @@ -165,6 +183,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu)); kfree(sve_state); kfree(vcpu->arch.ccsidr); + kvm_destroy_rec(vcpu); } static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/rme-exit.c b/arch/arm64/kvm/rme-exit.c new file mode 100644 index 0000000000000000000000000000000000000000..76ae7ecc7bee8db3b50787a7db66e6b0e5226fa7 --- /dev/null +++ b/arch/arm64/kvm/rme-exit.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include + +#include +#include +#include +#include + +typedef int (*exit_handler_fn)(struct kvm_vcpu *vcpu); + +static int rec_exit_reason_notimpl(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + vcpu_err(vcpu, "Unhandled exit reason from realm (ESR: %#llx)\n", + rec->run->exit.esr); + return -ENXIO; +} + +static int rec_exit_sync_dabt(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + /* + * In the case of a write, copy over gprs[0] to the target GPR, + * preparing to handle MMIO write fault. The content to be written has + * been saved to gprs[0] by the RMM (even if another register was used + * by the guest). In the case of normal memory access this is redundant + * (the guest will replay the instruction), but the overhead is + * minimal. + */ + if (kvm_vcpu_dabt_iswrite(vcpu) && kvm_vcpu_dabt_isvalid(vcpu)) + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), + rec->run->exit.gprs[0]); + + return kvm_handle_guest_abort(vcpu); +} + +static int rec_exit_sync_iabt(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + vcpu_err(vcpu, "Unhandled instruction abort (ESR: %#llx).\n", + rec->run->exit.esr); + return -ENXIO; +} + +static int rec_exit_sys_reg(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long esr = kvm_vcpu_get_esr(vcpu); + int rt = kvm_vcpu_sys_get_rt(vcpu); + bool is_write = !(esr & 1); + int ret; + + if (is_write) + vcpu_set_reg(vcpu, rt, rec->run->exit.gprs[0]); + + ret = kvm_handle_sys_reg(vcpu); + if (ret > 0 && !is_write) + rec->run->enter.gprs[0] = vcpu_get_reg(vcpu, rt); + + return ret; +} + +static exit_handler_fn rec_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = rec_exit_reason_notimpl, + [ESR_ELx_EC_SYS64] = rec_exit_sys_reg, + [ESR_ELx_EC_DABT_LOW] = rec_exit_sync_dabt, + [ESR_ELx_EC_IABT_LOW] = rec_exit_sync_iabt +}; + +static int rec_exit_psci(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + int i; + + for (i = 0; i < REC_RUN_GPRS; i++) + vcpu_set_reg(vcpu, i, rec->run->exit.gprs[i]); + + return kvm_smccc_call_handler(vcpu); +} + +static int rec_exit_ripas_change(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct realm *realm = &kvm->arch.realm; + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long base = rec->run->exit.ripas_base; + unsigned long top = rec->run->exit.ripas_top; + unsigned long ripas = rec->run->exit.ripas_value; + + if (!kvm_realm_is_private_address(realm, base) || + !kvm_realm_is_private_address(realm, top - 1)) { + vcpu_err(vcpu, "Invalid RIPAS_CHANGE for %#lx - %#lx, ripas: %#lx\n", + base, top, ripas); + return -EINVAL; + } + + return 1; +} + +static int rec_exit_host_call(struct kvm_vcpu *vcpu) +{ + int i; + struct realm_rec *rec = &vcpu->arch.rec; + + vcpu->stat.hvc_exit_stat++; + + for (i = 0; i < REC_RUN_GPRS; i++) + vcpu_set_reg(vcpu, i, rec->run->exit.gprs[i]); + + return kvm_smccc_call_handler(vcpu); +} + +static void update_arch_timer_irq_lines(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + __vcpu_sys_reg(vcpu, CNTV_CTL_EL0) = rec->run->exit.cntv_ctl; + __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0) = rec->run->exit.cntv_cval; + __vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = rec->run->exit.cntp_ctl; + __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = rec->run->exit.cntp_cval; + + kvm_realm_timers_update(vcpu); +} + +/* + * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on + * proper exit to userspace. + */ +int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_ret) +{ + struct realm_rec *rec = &vcpu->arch.rec; + u8 esr_ec = ESR_ELx_EC(rec->run->exit.esr); + unsigned long status, index; + + status = RMI_RETURN_STATUS(rec_run_ret); + index = RMI_RETURN_INDEX(rec_run_ret); + + /* + * If a PSCI_SYSTEM_OFF request raced with a vcpu executing, we might + * see the following status code and index indicating an attempt to run + * a REC when the RD state is SYSTEM_OFF. In this case, we just need to + * return to user space which can deal with the system event or will try + * to run the KVM VCPU again, at which point we will no longer attempt + * to enter the Realm because we will have a sleep request pending on + * the VCPU as a result of KVM's PSCI handling. + */ + if (status == RMI_ERROR_REALM && index == 1) { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + return 0; + } + + if (rec_run_ret) + return -ENXIO; + + vcpu->arch.fault.esr_el2 = rec->run->exit.esr; + vcpu->arch.fault.far_el2 = rec->run->exit.far; + /* HPFAR_EL2 is only valid for RMI_EXIT_SYNC */ + vcpu->arch.fault.hpfar_el2 = 0; + + update_arch_timer_irq_lines(vcpu); + + /* Reset the emulation flags for the next run of the REC */ + rec->run->enter.flags = 0; + + switch (rec->run->exit.exit_reason) { + case RMI_EXIT_SYNC: + /* + * HPFAR_EL2_NS is hijacked to indicate a valid HPFAR value, + * see __get_fault_info() + */ + vcpu->arch.fault.hpfar_el2 = rec->run->exit.hpfar; + return rec_exit_handlers[esr_ec](vcpu); + case RMI_EXIT_IRQ: + case RMI_EXIT_FIQ: + return 1; + case RMI_EXIT_PSCI: + return rec_exit_psci(vcpu); + case RMI_EXIT_RIPAS_CHANGE: + return rec_exit_ripas_change(vcpu); + case RMI_EXIT_HOST_CALL: + return rec_exit_host_call(vcpu); + } + + kvm_pr_unimpl("Unsupported exit reason: %u\n", + rec->run->exit.exit_reason); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return 0; +} diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c new file mode 100644 index 0000000000000000000000000000000000000000..d29e4dd9abc5ab0e5cfb7cdc386801c30238fc0a --- /dev/null +++ b/arch/arm64/kvm/rme.c @@ -0,0 +1,1723 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include + +#include +#include +#include +#include + +#include + +static unsigned long rmm_feat_reg0; + +#define RMM_PAGE_SHIFT 12 +#define RMM_PAGE_SIZE BIT(RMM_PAGE_SHIFT) + +#define RMM_RTT_BLOCK_LEVEL 2 +#define RMM_RTT_MAX_LEVEL 3 + +/* See ARM64_HW_PGTABLE_LEVEL_SHIFT() */ +#define RMM_RTT_LEVEL_SHIFT(l) \ + ((RMM_PAGE_SHIFT - 3) * (4 - (l)) + 3) +#define RMM_L2_BLOCK_SIZE BIT(RMM_RTT_LEVEL_SHIFT(2)) + +static inline unsigned long rme_rtt_level_mapsize(int level) +{ + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return RMM_PAGE_SIZE; + + return (1UL << RMM_RTT_LEVEL_SHIFT(level)); +} + +static bool rme_has_feature(unsigned long feature) +{ + return !!u64_get_bits(rmm_feat_reg0, feature); +} + +bool kvm_rme_supports_sve(void) +{ + return rme_has_feature(RMI_FEATURE_REGISTER_0_SVE_EN); +} + +static int rmi_check_version(void) +{ + struct arm_smccc_res res; + unsigned short version_major, version_minor; + unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION, + RMI_ABI_MINOR_VERSION); + + arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res); + + if (res.a0 == SMCCC_RET_NOT_SUPPORTED) + return -ENXIO; + + version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1); + version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1); + + if (res.a0 != RMI_SUCCESS) { + unsigned short high_version_major, high_version_minor; + + high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2); + high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2); + + kvm_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n", + version_major, version_minor, + high_version_major, high_version_minor, + RMI_ABI_MAJOR_VERSION, + RMI_ABI_MINOR_VERSION); + return -ENXIO; + } + + kvm_info("RMI ABI version %d.%d\n", version_major, version_minor); + + return 0; +} + +u32 kvm_realm_ipa_limit(void) +{ + return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_S2SZ); +} + +u32 kvm_realm_vgic_nr_lr(void) +{ + return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_GICV3_NUM_LRS); +} + +u8 kvm_realm_max_pmu_counters(void) +{ + return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_PMU_NUM_CTRS); +} + +unsigned int kvm_realm_sve_max_vl(void) +{ + return sve_vl_from_vq(u64_get_bits(rmm_feat_reg0, + RMI_FEATURE_REGISTER_0_SVE_VL) + 1); +} + +u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) +{ + u32 bps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_BPS); + u32 wps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_WPS); + u32 ctx_cmps; + + if (!kvm_is_realm(vcpu->kvm)) + return val; + + /* Ensure CTX_CMPs is still valid */ + ctx_cmps = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs, val); + ctx_cmps = min(bps, ctx_cmps); + + val &= ~(ID_AA64DFR0_EL1_BRPs_MASK | ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_CTX_CMPs); + val |= FIELD_PREP(ID_AA64DFR0_EL1_BRPs_MASK, bps) | + FIELD_PREP(ID_AA64DFR0_EL1_WRPs_MASK, wps) | + FIELD_PREP(ID_AA64DFR0_EL1_CTX_CMPs, ctx_cmps); + + return val; +} + +static int get_start_level(struct realm *realm) +{ + return 4 - ((realm->ia_bits - 8) / (RMM_PAGE_SHIFT - 3)); +} + +static int find_map_level(struct realm *realm, + unsigned long start, + unsigned long end) +{ + int level = RMM_RTT_MAX_LEVEL; + + while (level > get_start_level(realm)) { + unsigned long map_size = rme_rtt_level_mapsize(level - 1); + + if (!IS_ALIGNED(start, map_size) || + (start + map_size) > end) + break; + + level--; + } + + return level; +} + +static phys_addr_t alloc_delegated_granule(struct kvm_mmu_memory_cache *mc) +{ + phys_addr_t phys; + void *virt; + + if (mc) + virt = kvm_mmu_memory_cache_alloc(mc); + else + virt = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + + if (!virt) + return PHYS_ADDR_MAX; + + phys = virt_to_phys(virt); + + if (rmi_granule_delegate(phys)) { + free_page((unsigned long)virt); + + return PHYS_ADDR_MAX; + } + + kvm_account_pgtable_pages(virt, 1); + + return phys; +} + +static void free_delegated_granule(phys_addr_t phys) +{ + if (WARN_ON(rmi_granule_undelegate(phys))) { + /* Undelegate failed: leak the page */ + return; + } + + kvm_account_pgtable_pages(phys_to_virt(phys), -1); + + free_page((unsigned long)phys_to_virt(phys)); +} + +int realm_psci_complete(struct kvm_vcpu *source, struct kvm_vcpu *target, + unsigned long status) +{ + int ret; + + ret = rmi_psci_complete(virt_to_phys(source->arch.rec.rec_page), + virt_to_phys(target->arch.rec.rec_page), + status); + if (ret) + return -EINVAL; + + return 0; +} + +static int realm_rtt_create(struct realm *realm, + unsigned long addr, + int level, + phys_addr_t phys) +{ + addr = ALIGN_DOWN(addr, rme_rtt_level_mapsize(level - 1)); + return rmi_rtt_create(virt_to_phys(realm->rd), phys, addr, level); +} + +static int realm_rtt_fold(struct realm *realm, + unsigned long addr, + int level, + phys_addr_t *rtt_granule) +{ + unsigned long out_rtt; + int ret; + + addr = ALIGN_DOWN(addr, rme_rtt_level_mapsize(level - 1)); + ret = rmi_rtt_fold(virt_to_phys(realm->rd), addr, level, &out_rtt); + + if (RMI_RETURN_STATUS(ret) == RMI_SUCCESS && rtt_granule) + *rtt_granule = out_rtt; + + return ret; +} + +static int realm_destroy_private_granule(struct realm *realm, + unsigned long ipa, + unsigned long *next_addr, + phys_addr_t *out_rtt) +{ + unsigned long rd = virt_to_phys(realm->rd); + unsigned long rtt_addr; + phys_addr_t rtt; + int ret; + +retry: + ret = rmi_data_destroy(rd, ipa, &rtt_addr, next_addr); + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + if (*next_addr > ipa) + return 0; /* UNASSIGNED */ + rtt = alloc_delegated_granule(NULL); + if (WARN_ON(rtt == PHYS_ADDR_MAX)) + return -ENOMEM; + /* + * ASSIGNED - ipa is mapped as a block, so split. The index + * from the return code should be 2 otherwise it appears + * there's a huge page bigger than KVM currently supports + */ + WARN_ON(RMI_RETURN_INDEX(ret) != 2); + ret = realm_rtt_create(realm, ipa, 3, rtt); + if (WARN_ON(ret)) { + free_delegated_granule(rtt); + return -ENXIO; + } + goto retry; + } else if (WARN_ON(ret)) { + return -ENXIO; + } + + ret = rmi_granule_undelegate(rtt_addr); + if (WARN_ON(ret)) + return -ENXIO; + + *out_rtt = rtt_addr; + + return 0; +} + +static int realm_unmap_private_page(struct realm *realm, + unsigned long ipa, + unsigned long *next_addr) +{ + unsigned long end = ALIGN(ipa + 1, PAGE_SIZE); + unsigned long addr; + phys_addr_t out_rtt = PHYS_ADDR_MAX; + int ret; + + for (addr = ipa; addr < end; addr = *next_addr) { + ret = realm_destroy_private_granule(realm, addr, next_addr, + &out_rtt); + if (ret) + return ret; + } + + return 0; +} + +/* + * Returns 0 on successful fold, a negative value on error, a positive value if + * we were not able to fold all tables at this level. + */ +static int realm_fold_rtt_level(struct realm *realm, int level, + unsigned long start, unsigned long end) +{ + int not_folded = 0; + ssize_t map_size; + unsigned long addr, next_addr; + + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return -EINVAL; + + map_size = rme_rtt_level_mapsize(level - 1); + + for (addr = start; addr < end; addr = next_addr) { + phys_addr_t rtt_granule; + int ret; + unsigned long align_addr = ALIGN(addr, map_size); + + next_addr = ALIGN(addr + 1, map_size); + + ret = realm_rtt_fold(realm, align_addr, level, &rtt_granule); + + switch (RMI_RETURN_STATUS(ret)) { + case RMI_SUCCESS: + free_delegated_granule(rtt_granule); + break; + case RMI_ERROR_RTT: + if (level == RMM_RTT_MAX_LEVEL || + RMI_RETURN_INDEX(ret) < level) { + not_folded++; + break; + } + /* Recurse a level deeper */ + ret = realm_fold_rtt_level(realm, + level + 1, + addr, + next_addr); + if (ret < 0) + return ret; + else if (ret == 0) + /* Try again at this level */ + next_addr = addr; + break; + default: + WARN_ON(1); + return -ENXIO; + } + } + + return not_folded; +} + +static void realm_unmap_shared_range(struct kvm *kvm, + int level, + unsigned long start, + unsigned long end) +{ + struct realm *realm = &kvm->arch.realm; + unsigned long rd = virt_to_phys(realm->rd); + ssize_t map_size = rme_rtt_level_mapsize(level); + unsigned long next_addr, addr; + unsigned long shared_bit = BIT(realm->ia_bits - 1); + + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return; + + start |= shared_bit; + end |= shared_bit; + + for (addr = start; addr < end; addr = next_addr) { + unsigned long align_addr = ALIGN(addr, map_size); + int ret; + + next_addr = ALIGN(addr + 1, map_size); + + if (align_addr != addr || next_addr > end) { + /* Need to recurse deeper */ + if (addr < align_addr) + next_addr = align_addr; + realm_unmap_shared_range(kvm, level + 1, addr, + min(next_addr, end)); + continue; + } + + ret = rmi_rtt_unmap_unprotected(rd, addr, level, &next_addr); + switch (RMI_RETURN_STATUS(ret)) { + case RMI_SUCCESS: + break; + case RMI_ERROR_RTT: + if (next_addr == addr) { + /* + * There's a mapping here, but it's not a block + * mapping, so reset next_addr to the next block + * boundary and recurse to clear out the pages + * one level deeper. + */ + next_addr = ALIGN(addr + 1, map_size); + realm_unmap_shared_range(kvm, level + 1, addr, + next_addr); + } + break; + default: + WARN_ON(1); + return; + } + + cond_resched_rwlock_write(&kvm->mmu_lock); + } + realm_fold_rtt_level(realm, get_start_level(realm) + 1, start, end); +} + +static int realm_init_sve_param(struct kvm *kvm, struct realm_params *params) +{ + int ret = 0; + unsigned long i; + struct kvm_vcpu *vcpu; + int vl, last_vl = -1; + + /* + * Get the preferred SVE configuration, set by userspace with the + * KVM_ARM_VCPU_SVE feature and KVM_REG_ARM64_SVE_VLS pseudo-register. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (vcpu_has_sve(vcpu)) { + if (!kvm_arm_vcpu_sve_finalized(vcpu)) + ret = -EINVAL; + vl = vcpu->arch.sve_max_vl; + } else { + vl = 0; + } + mutex_unlock(&vcpu->mutex); + if (ret) + return ret; + + /* We need all vCPUs to have the same SVE config */ + if (last_vl >= 0 && last_vl != vl) + return -EINVAL; + + last_vl = vl; + } + + if (last_vl > 0) { + params->sve_vl = sve_vq_from_vl(last_vl) - 1; + params->flags |= RMI_REALM_PARAM_FLAG_SVE; + } + return 0; +} + +/* Calculate the number of s2 root rtts needed */ +static int realm_num_root_rtts(struct realm *realm) +{ + unsigned int ipa_bits = realm->ia_bits; + unsigned int levels = 3 - get_start_level(realm); + unsigned int sl_ipa_bits = (levels + 1) * (RMM_PAGE_SHIFT - 3) + + RMM_PAGE_SHIFT; + + if (sl_ipa_bits >= ipa_bits) + return 1; + + return 1 << (ipa_bits - sl_ipa_bits); +} + +static int realm_create_rd(struct kvm *kvm) +{ + struct realm *realm = &kvm->arch.realm; + struct realm_params *params = realm->params; + void *rd = NULL; + phys_addr_t rd_phys, params_phys; + size_t pgd_size = kvm_pgtable_stage2_pgd_size(kvm->arch.vtcr); + u64 dfr0 = IDREG(kvm, SYS_ID_AA64DFR0_EL1); + int i, r; + int rtt_num_start; + + realm->ia_bits = VTCR_EL2_IPA(kvm->arch.vtcr); + rtt_num_start = realm_num_root_rtts(realm); + + if (WARN_ON(realm->rd || !realm->params)) + return -EEXIST; + + if (pgd_size / RMM_PAGE_SIZE < rtt_num_start) + return -EINVAL; + + rd = (void *)__get_free_page(GFP_KERNEL); + if (!rd) + return -ENOMEM; + + rd_phys = virt_to_phys(rd); + if (rmi_granule_delegate(rd_phys)) { + r = -ENXIO; + goto free_rd; + } + + for (i = 0; i < pgd_size; i += RMM_PAGE_SIZE) { + phys_addr_t pgd_phys = kvm->arch.mmu.pgd_phys + i; + + if (rmi_granule_delegate(pgd_phys)) { + r = -ENXIO; + goto out_undelegate_tables; + } + } + + params->s2sz = VTCR_EL2_IPA(kvm->arch.vtcr); + params->rtt_level_start = get_start_level(realm); + params->rtt_num_start = rtt_num_start; + params->rtt_base = kvm->arch.mmu.pgd_phys; + params->vmid = realm->vmid; + params->num_bps = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); + params->num_wps = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); + + if (kvm->arch.arm_pmu) { + params->pmu_num_ctrs = kvm->arch.pmcr_n; + params->flags |= RMI_REALM_PARAM_FLAG_PMU; + } + + r = realm_init_sve_param(kvm, params); + if (r) + goto out_undelegate_tables; + + params_phys = virt_to_phys(params); + + if (rmi_realm_create(rd_phys, params_phys)) { + r = -ENXIO; + goto out_undelegate_tables; + } + + if (WARN_ON(rmi_rec_aux_count(rd_phys, &realm->num_aux))) { + WARN_ON(rmi_realm_destroy(rd_phys)); + goto out_undelegate_tables; + } + + realm->rd = rd; + + return 0; + +out_undelegate_tables: + while (i > 0) { + i -= RMM_PAGE_SIZE; + + phys_addr_t pgd_phys = kvm->arch.mmu.pgd_phys + i; + + if (WARN_ON(rmi_granule_undelegate(pgd_phys))) { + /* Leak the pages if they cannot be returned */ + kvm->arch.mmu.pgt = NULL; + break; + } + } + if (WARN_ON(rmi_granule_undelegate(rd_phys))) { + /* Leak the page if it isn't returned */ + return r; + } +free_rd: + free_page((unsigned long)rd); + return r; +} + +static int realm_rtt_destroy(struct realm *realm, unsigned long addr, + int level, phys_addr_t *rtt_granule, + unsigned long *next_addr) +{ + unsigned long out_rtt; + int ret; + + ret = rmi_rtt_destroy(virt_to_phys(realm->rd), addr, level, + &out_rtt, next_addr); + + *rtt_granule = out_rtt; + + return ret; +} + +static int realm_create_rtt_levels(struct realm *realm, + unsigned long ipa, + int level, + int max_level, + struct kvm_mmu_memory_cache *mc) +{ + if (level == max_level) + return 0; + + while (level++ < max_level) { + phys_addr_t rtt = alloc_delegated_granule(mc); + int ret; + + if (rtt == PHYS_ADDR_MAX) + return -ENOMEM; + + ret = realm_rtt_create(realm, ipa, level, rtt); + + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT && + RMI_RETURN_INDEX(ret) == level - 1) { + /* The RTT already exists, continue */ + free_delegated_granule(rtt); + continue; + } + if (ret) { + WARN(1, "Failed to create RTT at level %d: %d\n", + level, ret); + free_delegated_granule(rtt); + return -ENXIO; + } + } + + return 0; +} + +static int realm_tear_down_rtt_level(struct realm *realm, int level, + unsigned long start, unsigned long end) +{ + ssize_t map_size; + unsigned long addr, next_addr; + + if (WARN_ON(level > RMM_RTT_MAX_LEVEL)) + return -EINVAL; + + map_size = rme_rtt_level_mapsize(level - 1); + + for (addr = start; addr < end; addr = next_addr) { + phys_addr_t rtt_granule; + int ret; + unsigned long align_addr = ALIGN(addr, map_size); + + next_addr = ALIGN(addr + 1, map_size); + + if (next_addr > end || align_addr != addr) { + /* + * The target range is smaller than what this level + * covers, recurse deeper. + */ + ret = realm_tear_down_rtt_level(realm, + level + 1, + addr, + min(next_addr, end)); + if (ret) + return ret; + continue; + } + + ret = realm_rtt_destroy(realm, addr, level, + &rtt_granule, &next_addr); + + switch (RMI_RETURN_STATUS(ret)) { + case RMI_SUCCESS: + free_delegated_granule(rtt_granule); + break; + case RMI_ERROR_RTT: + if (next_addr > addr) { + /* Missing RTT, skip */ + break; + } + /* + * We tear down the RTT range for the full IPA + * space, after everything is unmapped. Also we + * descend down only if we cannot tear down a + * top level RTT. Thus RMM must be able to walk + * to the requested level. e.g., a block mapping + * exists at L1 or L2. + */ + if (WARN_ON(RMI_RETURN_INDEX(ret) != level)) + return -EBUSY; + if (WARN_ON(level == RMM_RTT_MAX_LEVEL)) + return -EBUSY; + + /* + * The table has active entries in it, recurse deeper + * and tear down the RTTs. + */ + next_addr = ALIGN(addr + 1, map_size); + ret = realm_tear_down_rtt_level(realm, + level + 1, + addr, + next_addr); + if (ret) + return ret; + /* + * Now that the child RTTs are destroyed, + * retry at this level. + */ + next_addr = addr; + break; + default: + WARN_ON(1); + return -ENXIO; + } + } + + return 0; +} + +static int realm_tear_down_rtt_range(struct realm *realm, + unsigned long start, unsigned long end) +{ + return realm_tear_down_rtt_level(realm, get_start_level(realm) + 1, + start, end); +} + +void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits) +{ + struct realm *realm = &kvm->arch.realm; + + WARN_ON(realm_tear_down_rtt_range(realm, 0, (1UL << ia_bits))); +} + +static void realm_unmap_private_range(struct kvm *kvm, + unsigned long start, + unsigned long end) +{ + struct realm *realm = &kvm->arch.realm; + unsigned long next_addr, addr; + int ret; + + for (addr = start; addr < end; addr = next_addr) { + ret = realm_unmap_private_page(realm, addr, &next_addr); + + if (ret) + break; + } + + realm_fold_rtt_level(realm, get_start_level(realm) + 1, + start, end); +} + +void kvm_realm_unmap_range(struct kvm *kvm, unsigned long start, + unsigned long size, bool unmap_private) +{ + unsigned long end = start + size; + struct realm *realm = &kvm->arch.realm; + + end = min(BIT(realm->ia_bits - 1), end); + + if (realm->state == REALM_STATE_NONE) + return; + + realm_unmap_shared_range(kvm, find_map_level(realm, start, end), + start, end); + if (unmap_private) + realm_unmap_private_range(kvm, start, end); +} + +static int realm_create_protected_data_granule(struct realm *realm, + unsigned long ipa, + phys_addr_t dst_phys, + phys_addr_t src_phys, + unsigned long flags) +{ + phys_addr_t rd = virt_to_phys(realm->rd); + int ret; + + if (rmi_granule_delegate(dst_phys)) + return -ENXIO; + + ret = rmi_data_create(rd, dst_phys, ipa, src_phys, flags); + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + /* Create missing RTTs and retry */ + int level = RMI_RETURN_INDEX(ret); + + WARN_ON(level == RMM_RTT_MAX_LEVEL); + + ret = realm_create_rtt_levels(realm, ipa, level, + RMM_RTT_MAX_LEVEL, NULL); + if (ret) + return -EIO; + + ret = rmi_data_create(rd, dst_phys, ipa, src_phys, flags); + } + if (ret) + return -EIO; + + return 0; +} + +static int realm_create_protected_data_page(struct realm *realm, + unsigned long ipa, + struct page *dst_page, + struct page *src_page, + unsigned long flags) +{ + unsigned long rd = virt_to_phys(realm->rd); + phys_addr_t dst_phys, src_phys; + bool undelegate_failed = false; + int ret, offset; + + dst_phys = page_to_phys(dst_page); + src_phys = page_to_phys(src_page); + copy_page(page_address(src_page), page_address(dst_page)); + + for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) { + + ret = realm_create_protected_data_granule(realm, + ipa, + dst_phys, + src_phys, + flags); + if (ret) + goto err; + + ipa += RMM_PAGE_SIZE; + dst_phys += RMM_PAGE_SIZE; + src_phys += RMM_PAGE_SIZE; + } + + return 0; + +err: + if (ret == -EIO) { + /* current offset needs undelegating */ + if (WARN_ON(rmi_granule_undelegate(dst_phys))) + undelegate_failed = true; + } + while (offset > 0) { + ipa -= RMM_PAGE_SIZE; + offset -= RMM_PAGE_SIZE; + dst_phys -= RMM_PAGE_SIZE; + + rmi_data_destroy(rd, ipa, NULL, NULL); + + if (WARN_ON(rmi_granule_undelegate(dst_phys))) + undelegate_failed = true; + } + + if (undelegate_failed) { + /* + * A granule could not be undelegated, + * so the page has to be leaked + */ + get_page(dst_page); + } + + return -ENXIO; +} + +static int fold_rtt(struct realm *realm, unsigned long addr, int level) +{ + phys_addr_t rtt_addr; + int ret; + + ret = realm_rtt_fold(realm, addr, level, &rtt_addr); + if (ret) + return ret; + + free_delegated_granule(rtt_addr); + + return 0; +} + +int realm_map_protected(struct realm *realm, + unsigned long ipa, + kvm_pfn_t pfn, + unsigned long map_size, + struct kvm_mmu_memory_cache *memcache) +{ + phys_addr_t phys = __pfn_to_phys(pfn); + phys_addr_t rd = virt_to_phys(realm->rd); + unsigned long base_ipa = ipa; + unsigned long size; + int map_level; + int ret = 0; + + if (WARN_ON(!IS_ALIGNED(map_size, RMM_PAGE_SIZE))) + return -EINVAL; + + if (WARN_ON(!IS_ALIGNED(ipa, map_size))) + return -EINVAL; + + if (IS_ALIGNED(map_size, RMM_L2_BLOCK_SIZE)) + map_level = 2; + else + map_level = 3; + + if (map_level < RMM_RTT_MAX_LEVEL) { + /* + * A temporary RTT is needed during the map, precreate it, + * however if there is an error (e.g. missing parent tables) + * this will be handled below. + */ + realm_create_rtt_levels(realm, ipa, map_level, + RMM_RTT_MAX_LEVEL, memcache); + } + + for (size = 0; size < map_size; size += RMM_PAGE_SIZE) { + if (rmi_granule_delegate(phys)) { + /* + * It's likely we raced with another VCPU on the same + * fault. Assume the other VCPU has handled the fault + * and return to the guest. + */ + return 0; + } + + ret = rmi_data_create_unknown(rd, phys, ipa); + + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + /* Create missing RTTs and retry */ + int level = RMI_RETURN_INDEX(ret); + + WARN_ON(level == RMM_RTT_MAX_LEVEL); + + ret = realm_create_rtt_levels(realm, ipa, level, + RMM_RTT_MAX_LEVEL, + memcache); + if (ret) + goto err_undelegate; + + ret = rmi_data_create_unknown(rd, phys, ipa); + } + + if (WARN_ON(ret)) + goto err_undelegate; + + phys += RMM_PAGE_SIZE; + ipa += RMM_PAGE_SIZE; + } + + if (map_size == RMM_L2_BLOCK_SIZE) { + ret = fold_rtt(realm, base_ipa, map_level + 1); + if (WARN_ON(ret)) + goto err; + } + + return 0; + +err_undelegate: + if (WARN_ON(rmi_granule_undelegate(phys))) { + /* Page can't be returned to NS world so is lost */ + get_page(phys_to_page(phys)); + } +err: + while (size > 0) { + unsigned long data, top; + + phys -= RMM_PAGE_SIZE; + size -= RMM_PAGE_SIZE; + ipa -= RMM_PAGE_SIZE; + + WARN_ON(rmi_data_destroy(rd, ipa, &data, &top)); + + if (WARN_ON(rmi_granule_undelegate(phys))) { + /* Page can't be returned to NS world so is lost */ + get_page(phys_to_page(phys)); + } + } + return -ENXIO; +} + +int realm_map_non_secure(struct realm *realm, + unsigned long ipa, + kvm_pfn_t pfn, + unsigned long size, + struct kvm_mmu_memory_cache *memcache) +{ + phys_addr_t rd = virt_to_phys(realm->rd); + phys_addr_t phys = __pfn_to_phys(pfn); + unsigned long offset; + int map_size, map_level; + int ret = 0; + + if (WARN_ON(!IS_ALIGNED(size, RMM_PAGE_SIZE))) + return -EINVAL; + + if (WARN_ON(!IS_ALIGNED(ipa, size))) + return -EINVAL; + + if (IS_ALIGNED(size, RMM_L2_BLOCK_SIZE)) { + map_level = 2; + map_size = RMM_L2_BLOCK_SIZE; + } else { + map_level = 3; + map_size = RMM_PAGE_SIZE; + } + + for (offset = 0; offset < size; offset += map_size) { + /* + * realm_map_ipa() enforces that the memory is writable, + * so for now we permit both read and write. + */ + unsigned long desc = phys | + PTE_S2_MEMATTR(MT_S2_FWB_NORMAL) | + (3 << 6); + ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc); + + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + /* Create missing RTTs and retry */ + int level = RMI_RETURN_INDEX(ret); + + ret = realm_create_rtt_levels(realm, ipa, level, + map_level, memcache); + if (ret) + return -ENXIO; + + ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc); + } + /* + * RMI_ERROR_RTT can be reported for two reasons: either the + * RTT tables are not there, or there is an RTTE already + * present for the address. The call to + * realm_create_rtt_levels() above handles the first case, and + * in the second case this indicates that another thread has + * already populated the RTTE for us, so we can ignore the + * error and continue. + */ + if (ret && RMI_RETURN_STATUS(ret) != RMI_ERROR_RTT) + return -ENXIO; + + ipa += map_size; + phys += map_size; + } + + return 0; +} + +static int populate_region(struct kvm *kvm, + phys_addr_t ipa_base, + phys_addr_t ipa_end, + unsigned long data_flags) +{ + struct realm *realm = &kvm->arch.realm; + struct kvm_memory_slot *memslot; + gfn_t base_gfn, end_gfn; + int idx; + phys_addr_t ipa = ipa_base; + struct page *tmp_page; + int ret = 0; + + base_gfn = gpa_to_gfn(ipa_base); + end_gfn = gpa_to_gfn(ipa_end); + + idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, base_gfn); + if (!memslot) { + ret = -EFAULT; + goto out; + } + + /* We require the region to be contained within a single memslot */ + if (memslot->base_gfn + memslot->npages < end_gfn) { + ret = -EINVAL; + goto out; + } + + tmp_page = alloc_page(GFP_KERNEL); + if (!tmp_page) { + ret = -ENOMEM; + goto out; + } + + mmap_read_lock(current->mm); + + while (ipa < ipa_end) { + struct vm_area_struct *vma; + unsigned long hva; + struct page *page; + kvm_pfn_t pfn; + + hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa)); + vma = vma_lookup(current->mm, hva); + if (!vma) { + ret = -EFAULT; + break; + } + + pfn = gfn_to_pfn_memslot(memslot, gpa_to_gfn(ipa)); + + if (is_error_pfn(pfn)) { + ret = -EFAULT; + break; + } + + page = pfn_to_page(pfn); + + ret = realm_create_protected_data_page(realm, ipa, + page, + tmp_page, + data_flags); + if (ret) { + kvm_release_page_clean(page); + break; + } + + ipa += PAGE_SIZE; + kvm_release_pfn_dirty(pfn); + } +out: + mmap_read_unlock(current->mm); + __free_page(tmp_page); + srcu_read_unlock(&kvm->srcu, idx); + return ret; +} + +static int kvm_populate_realm(struct kvm *kvm, + struct arm_rme_populate_realm *args) +{ + phys_addr_t ipa_base, ipa_end; + unsigned long data_flags = 0; + + if (kvm_realm_state(kvm) != REALM_STATE_NEW) + return -EPERM; + + if (!IS_ALIGNED(args->base, PAGE_SIZE) || + !IS_ALIGNED(args->size, PAGE_SIZE) || + (args->flags & ~RMI_MEASURE_CONTENT)) + return -EINVAL; + + ipa_base = args->base; + ipa_end = ipa_base + args->size; + + if (ipa_end < ipa_base) + return -EINVAL; + + if (args->flags & RMI_MEASURE_CONTENT) + data_flags |= RMI_MEASURE_CONTENT; + + /* + * Perform the population in parts to ensure locks are not held for too + * long + */ + while (ipa_base < ipa_end) { + phys_addr_t end = min(ipa_end, ipa_base + SZ_2M); + + int ret = populate_region(kvm, ipa_base, end, + args->flags); + + if (ret) + return ret; + + ipa_base = end; + + cond_resched(); + } + + return 0; +} + +static int realm_set_ipa_state(struct kvm_vcpu *vcpu, + unsigned long start, + unsigned long end, + unsigned long ripas, + unsigned long *top_ipa) +{ + struct kvm *kvm = vcpu->kvm; + struct realm *realm = &kvm->arch.realm; + struct realm_rec *rec = &vcpu->arch.rec; + phys_addr_t rd_phys = virt_to_phys(realm->rd); + phys_addr_t rec_phys = virt_to_phys(rec->rec_page); + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + unsigned long ipa = start; + int ret = 0; + + while (ipa < end) { + unsigned long next; + + ret = rmi_rtt_set_ripas(rd_phys, rec_phys, ipa, end, &next); + + if (RMI_RETURN_STATUS(ret) == RMI_SUCCESS) { + ipa = next; + } else if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + int walk_level = RMI_RETURN_INDEX(ret); + int level = find_map_level(realm, ipa, end); + + /* + * If the RMM walk ended early then more tables are + * needed to reach the required depth to set the RIPAS. + */ + if (walk_level >= level) + return -EINVAL; + + ret = realm_create_rtt_levels(realm, ipa, + walk_level, + level, + memcache); + if (ret) + return ret; + /* Retry with the RTT levels in place */ + break; + } else { + WARN(1, "Unexpected error in %s: %#x\n", __func__, + ret); + ret = -ENXIO; + break; + } + } + + *top_ipa = ipa; + + if (ripas == RMI_EMPTY && ipa != start) + realm_unmap_private_range(kvm, start, ipa); + + return ret; +} + +static int realm_init_ipa_state(struct realm *realm, + unsigned long ipa, + unsigned long end) +{ + phys_addr_t rd_phys = virt_to_phys(realm->rd); + int ret; + + while (ipa < end) { + unsigned long next; + + ret = rmi_rtt_init_ripas(rd_phys, ipa, end, &next); + + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) { + int err_level = RMI_RETURN_INDEX(ret); + int level = find_map_level(realm, ipa, end); + + if (WARN_ON(err_level >= level)) + return -ENXIO; + + ret = realm_create_rtt_levels(realm, ipa, + err_level, + level, NULL); + if (ret) + return ret; + /* Retry with the RTT levels in place */ + continue; + } else if (WARN_ON(ret)) { + return -ENXIO; + } + + ipa = next; + } + + return 0; +} + +static int kvm_init_ipa_range_realm(struct kvm *kvm, + struct arm_rme_init_ripas *args) +{ + gpa_t addr, end; + struct realm *realm = &kvm->arch.realm; + + addr = args->base; + end = addr + args->size; + + if (end < addr) + return -EINVAL; + + if (kvm_realm_state(kvm) != REALM_STATE_NEW) + return -EPERM; + + return realm_init_ipa_state(realm, addr, end); +} + +static int kvm_activate_realm(struct kvm *kvm) +{ + struct realm *realm = &kvm->arch.realm; + + if (kvm_realm_state(kvm) != REALM_STATE_NEW) + return -EINVAL; + + if (rmi_realm_activate(virt_to_phys(realm->rd))) + return -ENXIO; + + WRITE_ONCE(realm->state, REALM_STATE_ACTIVE); + return 0; +} + +/* Protects access to rme_vmid_bitmap */ +static DEFINE_SPINLOCK(rme_vmid_lock); +static unsigned long *rme_vmid_bitmap; + +static int rme_vmid_init(void) +{ + unsigned int vmid_count = 1 << kvm_get_vmid_bits(); + + rme_vmid_bitmap = bitmap_zalloc(vmid_count, GFP_KERNEL); + if (!rme_vmid_bitmap) { + kvm_err("%s: Couldn't allocate rme vmid bitmap\n", __func__); + return -ENOMEM; + } + + return 0; +} + +static int rme_vmid_reserve(void) +{ + int ret; + unsigned int vmid_count = 1 << kvm_get_vmid_bits(); + + spin_lock(&rme_vmid_lock); + ret = bitmap_find_free_region(rme_vmid_bitmap, vmid_count, 0); + spin_unlock(&rme_vmid_lock); + + return ret; +} + +static void rme_vmid_release(unsigned int vmid) +{ + spin_lock(&rme_vmid_lock); + bitmap_release_region(rme_vmid_bitmap, vmid, 0); + spin_unlock(&rme_vmid_lock); +} + +static int kvm_create_realm(struct kvm *kvm) +{ + struct realm *realm = &kvm->arch.realm; + int ret; + + if (!kvm_is_realm(kvm)) + return -EINVAL; + if (kvm_realm_is_created(kvm)) + return -EEXIST; + + ret = rme_vmid_reserve(); + if (ret < 0) + return ret; + realm->vmid = ret; + + ret = realm_create_rd(kvm); + if (ret) { + rme_vmid_release(realm->vmid); + return ret; + } + + WRITE_ONCE(realm->state, REALM_STATE_NEW); + + /* The realm is up, free the parameters. */ + free_page((unsigned long)realm->params); + realm->params = NULL; + + return 0; +} + +static int config_realm_hash_algo(struct realm *realm, + struct arm_rme_config *cfg) +{ + switch (cfg->hash_algo) { + case ARM_RME_CONFIG_MEASUREMENT_ALGO_SHA256: + if (!rme_has_feature(RMI_FEATURE_REGISTER_0_HASH_SHA_256)) + return -EINVAL; + break; + case ARM_RME_CONFIG_MEASUREMENT_ALGO_SHA512: + if (!rme_has_feature(RMI_FEATURE_REGISTER_0_HASH_SHA_512)) + return -EINVAL; + break; + default: + return -EINVAL; + } + realm->params->hash_algo = cfg->hash_algo; + return 0; +} + +static int kvm_rme_config_realm(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + struct arm_rme_config cfg; + struct realm *realm = &kvm->arch.realm; + int r = 0; + + if (kvm_realm_is_created(kvm)) + return -EBUSY; + + if (copy_from_user(&cfg, (void __user *)cap->args[1], sizeof(cfg))) + return -EFAULT; + + switch (cfg.cfg) { + case ARM_RME_CONFIG_RPV: + memcpy(&realm->params->rpv, &cfg.rpv, sizeof(cfg.rpv)); + break; + case ARM_RME_CONFIG_HASH_ALGO: + r = config_realm_hash_algo(realm, &cfg); + break; + default: + r = -EINVAL; + } + + return r; +} + +int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + int r = 0; + + if (!kvm_is_realm(kvm)) + return -EINVAL; + + switch (cap->args[0]) { + case KVM_CAP_ARM_RME_CONFIG_REALM: + r = kvm_rme_config_realm(kvm, cap); + break; + case KVM_CAP_ARM_RME_CREATE_REALM: + r = kvm_create_realm(kvm); + break; + case KVM_CAP_ARM_RME_INIT_RIPAS_REALM: { + struct arm_rme_init_ripas args; + void __user *argp = u64_to_user_ptr(cap->args[1]); + + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + + r = kvm_init_ipa_range_realm(kvm, &args); + break; + } + case KVM_CAP_ARM_RME_POPULATE_REALM: { + struct arm_rme_populate_realm args; + void __user *argp = u64_to_user_ptr(cap->args[1]); + + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + + r = kvm_populate_realm(kvm, &args); + break; + } + case KVM_CAP_ARM_RME_ACTIVATE_REALM: + r = kvm_activate_realm(kvm); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +void kvm_destroy_realm(struct kvm *kvm) +{ + struct realm *realm = &kvm->arch.realm; + size_t pgd_size = kvm_pgtable_stage2_pgd_size(kvm->arch.vtcr); + int i; + + if (realm->params) { + free_page((unsigned long)realm->params); + realm->params = NULL; + } + + if (!kvm_realm_is_created(kvm)) + return; + + WRITE_ONCE(realm->state, REALM_STATE_DYING); + + if (realm->rd) { + phys_addr_t rd_phys = virt_to_phys(realm->rd); + + if (WARN_ON(rmi_realm_destroy(rd_phys))) + return; + free_delegated_granule(rd_phys); + realm->rd = NULL; + } + + rme_vmid_release(realm->vmid); + + for (i = 0; i < pgd_size; i += RMM_PAGE_SIZE) { + phys_addr_t pgd_phys = kvm->arch.mmu.pgd_phys + i; + + if (WARN_ON(rmi_granule_undelegate(pgd_phys))) + return; + } + + WRITE_ONCE(realm->state, REALM_STATE_DEAD); + + /* Now that the Realm is destroyed, free the entry level RTTs */ + kvm_free_stage2_pgd(&kvm->arch.mmu); +} + +static void kvm_complete_ripas_change(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long base = rec->run->exit.ripas_base; + unsigned long top = rec->run->exit.ripas_top; + unsigned long ripas = rec->run->exit.ripas_value; + unsigned long top_ipa; + int ret; + + do { + kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, + kvm_mmu_cache_min_pages(kvm)); + write_lock(&kvm->mmu_lock); + ret = realm_set_ipa_state(vcpu, base, top, ripas, &top_ipa); + write_unlock(&kvm->mmu_lock); + + if (WARN_RATELIMIT(ret && ret != -ENOMEM, + "Unable to satisfy RIPAS_CHANGE for %#lx - %#lx, ripas: %#lx\n", + base, top, ripas)) + break; + + base = top_ipa; + } while (base < top); + rec->run->exit.ripas_base = base; +} + +/* + * kvm_rec_pre_enter - Complete operations before entering a REC + * + * Some operations require work to be completed before entering a realm. That + * work may require memory allocation so cannot be done in the kvm_rec_enter() + * call. + * + * Return: 1 if we should enter the guest + * 0 if we should exit to userspace + * < 0 if we should exit to userspace, where the return value indicates + * an error + */ +int kvm_rec_pre_enter(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + if (kvm_realm_state(vcpu->kvm) != REALM_STATE_ACTIVE) + return -EINVAL; + switch (rec->run->exit.exit_reason) { + case RMI_EXIT_HOST_CALL: + case RMI_EXIT_PSCI: + for (int i = 0; i < REC_RUN_GPRS; i++) + rec->run->enter.gprs[i] = vcpu_get_reg(vcpu, i); + break; + case RMI_EXIT_RIPAS_CHANGE: + kvm_complete_ripas_change(vcpu); + break; + } + + return 1; +} + +int kvm_rec_enter(struct kvm_vcpu *vcpu) +{ + struct realm_rec *rec = &vcpu->arch.rec; + + return rmi_rec_enter(virt_to_phys(rec->rec_page), + virt_to_phys(rec->run)); +} + +static void free_rec_aux(struct page **aux_pages, + unsigned int num_aux) +{ + unsigned int i, j; + unsigned int page_count = 0; + + for (i = 0; i < num_aux;) { + struct page *aux_page = aux_pages[page_count++]; + phys_addr_t aux_page_phys = page_to_phys(aux_page); + bool should_free = true; + + for (j = 0; j < PAGE_SIZE && i < num_aux; j += RMM_PAGE_SIZE) { + if (WARN_ON(rmi_granule_undelegate(aux_page_phys))) + should_free = false; + aux_page_phys += RMM_PAGE_SIZE; + i++; + } + /* Only free if all the undelegate calls were successful */ + if (should_free) + __free_page(aux_page); + } +} + +static int alloc_rec_aux(struct page **aux_pages, + u64 *aux_phys_pages, + unsigned int num_aux) +{ + struct page *aux_page; + int page_count = 0; + unsigned int i, j; + int ret; + + for (i = 0; i < num_aux;) { + phys_addr_t aux_page_phys; + + aux_page = alloc_page(GFP_KERNEL); + if (!aux_page) { + ret = -ENOMEM; + goto out_err; + } + + aux_page_phys = page_to_phys(aux_page); + for (j = 0; j < PAGE_SIZE && i < num_aux; j += RMM_PAGE_SIZE) { + if (rmi_granule_delegate(aux_page_phys)) { + ret = -ENXIO; + goto err_undelegate; + } + aux_phys_pages[i++] = aux_page_phys; + aux_page_phys += RMM_PAGE_SIZE; + } + aux_pages[page_count++] = aux_page; + } + + return 0; +err_undelegate: + while (j > 0) { + j -= RMM_PAGE_SIZE; + i--; + if (WARN_ON(rmi_granule_undelegate(aux_phys_pages[i]))) { + /* Leak the page if the undelegate fails */ + goto out_err; + } + } + __free_page(aux_page); +out_err: + free_rec_aux(aux_pages, i); + return ret; +} + +int kvm_create_rec(struct kvm_vcpu *vcpu) +{ + struct user_pt_regs *vcpu_regs = vcpu_gp_regs(vcpu); + unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + struct realm *realm = &vcpu->kvm->arch.realm; + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long rec_page_phys; + struct rec_params *params; + int r, i; + + if (kvm_realm_state(vcpu->kvm) != REALM_STATE_NEW) + return -ENOENT; + + if (rec->run) + return -EBUSY; + + /* + * The RMM will report PSCI v1.0 to Realms and the KVM_ARM_VCPU_PSCI_0_2 + * flag covers v0.2 and onwards. + */ + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) + return -EINVAL; + + if (vcpu->kvm->arch.arm_pmu && !kvm_vcpu_has_pmu(vcpu)) + return -EINVAL; + + BUILD_BUG_ON(sizeof(*params) > PAGE_SIZE); + BUILD_BUG_ON(sizeof(*rec->run) > PAGE_SIZE); + + params = (struct rec_params *)get_zeroed_page(GFP_KERNEL); + rec->rec_page = (void *)__get_free_page(GFP_KERNEL); + rec->run = (void *)get_zeroed_page(GFP_KERNEL); + if (!params || !rec->rec_page || !rec->run) { + r = -ENOMEM; + goto out_free_pages; + } + + for (i = 0; i < ARRAY_SIZE(params->gprs); i++) + params->gprs[i] = vcpu_regs->regs[i]; + + params->pc = vcpu_regs->pc; + + if (vcpu->vcpu_id == 0) + params->flags |= REC_PARAMS_FLAG_RUNNABLE; + + rec_page_phys = virt_to_phys(rec->rec_page); + + if (rmi_granule_delegate(rec_page_phys)) { + r = -ENXIO; + goto out_free_pages; + } + + r = alloc_rec_aux(rec->aux_pages, params->aux, realm->num_aux); + if (r) + goto out_undelegate_rmm_rec; + + params->num_rec_aux = realm->num_aux; + params->mpidr = mpidr; + + if (rmi_rec_create(virt_to_phys(realm->rd), + rec_page_phys, + virt_to_phys(params))) { + r = -ENXIO; + goto out_free_rec_aux; + } + + rec->mpidr = mpidr; + + free_page((unsigned long)params); + return 0; + +out_free_rec_aux: + free_rec_aux(rec->aux_pages, realm->num_aux); +out_undelegate_rmm_rec: + if (WARN_ON(rmi_granule_undelegate(rec_page_phys))) + rec->rec_page = NULL; +out_free_pages: + free_page((unsigned long)rec->run); + free_page((unsigned long)rec->rec_page); + free_page((unsigned long)params); + return r; +} + +void kvm_destroy_rec(struct kvm_vcpu *vcpu) +{ + struct realm *realm = &vcpu->kvm->arch.realm; + struct realm_rec *rec = &vcpu->arch.rec; + unsigned long rec_page_phys; + + if (!vcpu_is_rec(vcpu)) + return; + + if (!rec->run) { + /* Nothing to do if the VCPU hasn't been finalized */ + return; + } + + free_page((unsigned long)rec->run); + + rec_page_phys = virt_to_phys(rec->rec_page); + + /* + * The REC and any AUX pages cannot be reclaimed until the REC is + * destroyed. So if the REC destroy fails then the REC page and any AUX + * pages will be leaked. + */ + if (WARN_ON(rmi_rec_destroy(rec_page_phys))) + return; + + free_rec_aux(rec->aux_pages, realm->num_aux); + + free_delegated_granule(rec_page_phys); +} + +int kvm_init_realm_vm(struct kvm *kvm) +{ + kvm->arch.realm.params = (void *)get_zeroed_page(GFP_KERNEL); + + if (!kvm->arch.realm.params) + return -ENOMEM; + return 0; +} + +void kvm_init_rme(void) +{ + if (PAGE_SIZE != SZ_4K) + /* Only 4k page size on the host is supported */ + return; + + if (rmi_check_version()) + /* Continue without realm support */ + return; + + if (WARN_ON(rmi_features(0, &rmm_feat_reg0))) + return; + + if (rme_vmid_init()) + return; + + static_branch_enable(&kvm_rme_is_available); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index bfcf36c758c22c349a50ebfddb71509b6f831266..2c4f4d4a1f554c27de5cf655394fe80bbb021d02 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -725,14 +725,9 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); + u64 mask = BIT(ARMV8_PMU_CYCLE_IDX); + u8 n = vcpu->kvm->arch.pmcr_n; - /* No PMU available, any PMU reg may UNDEF... */ - if (!kvm_arm_support_pmu_v3()) - return 0; - - n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; - n &= ARMV8_PMU_PMCR_N_MASK; if (n) mask |= GENMASK(n - 1, 0); @@ -768,17 +763,15 @@ static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 pmcr; - - /* No PMU available, PMCR_EL0 may UNDEF... */ - if (!kvm_arm_support_pmu_v3()) - return 0; + u64 pmcr = 0; - /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ - pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); if (!kvm_supports_32bit_el0()) pmcr |= ARMV8_PMU_PMCR_LC; + /* + * The value of PMCR.N field is included when the + * vCPU register is read via kvm_vcpu_read_pmcr(). + */ __vcpu_sys_reg(vcpu, r->reg) = pmcr; return __vcpu_sys_reg(vcpu, r->reg); @@ -828,7 +821,7 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Only update writeable bits of PMCR (continuing into * kvm_pmu_handle_pmcr() as well) */ - val = __vcpu_sys_reg(vcpu, PMCR_EL0); + val = kvm_vcpu_read_pmcr(vcpu); val &= ~ARMV8_PMU_PMCR_MASK; val |= p->regval & ARMV8_PMU_PMCR_MASK; if (!kvm_supports_32bit_el0()) @@ -836,7 +829,7 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, kvm_pmu_handle_pmcr(vcpu, val); } else { /* PMCR.P & PMCR.C are RAZ */ - val = __vcpu_sys_reg(vcpu, PMCR_EL0) + val = kvm_vcpu_read_pmcr(vcpu) & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C); p->regval = val; } @@ -885,8 +878,8 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) { u64 pmcr, val; - pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); - val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; + pmcr = kvm_vcpu_read_pmcr(vcpu); + val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { kvm_inject_undefined(vcpu); return false; @@ -1109,6 +1102,51 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } +static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + *val = kvm_vcpu_read_pmcr(vcpu); + return 0; +} + +static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val); + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&kvm->arch.config_lock); + + /* + * The vCPU can't have more counters than the PMU hardware + * implements. Ignore this error to maintain compatibility + * with the existing KVM behavior. + */ + if (!kvm_vm_has_ran_once(kvm) && !kvm_realm_is_created(kvm) && + new_n <= kvm_arm_pmu_get_max_counters(kvm)) + kvm->arch.pmcr_n = new_n; + + mutex_unlock(&kvm->arch.config_lock); + + /* + * Ignore writes to RES0 bits, read only bits that are cleared on + * vCPU reset, and writable bits that KVM doesn't support yet. + * (i.e. only PMCR.N and bits [7:0] are mutable from userspace) + * The LP bit is RES0 when FEAT_PMUv3p5 is not supported on the vCPU. + * But, we leave the bit as it is here, as the vCPU's PMUver might + * be changed later (NOTE: the bit will be cleared on first vCPU run + * if necessary). + */ + val &= ARMV8_PMU_PMCR_MASK; + + /* The LC bit is RES1 when AArch32 is not supported */ + if (!kvm_supports_32bit_el0()) + val |= ARMV8_PMU_PMCR_LC; + + __vcpu_sys_reg(vcpu, r->reg) = val; + return 0; +} + /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ @@ -1507,14 +1545,18 @@ static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, /* Hide SPE from guests */ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; - return val; + return kvm_realm_reset_id_aa64dfr0_el1(vcpu, val); } static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { + u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val); u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); + u8 bps = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, val); + u8 wps = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, val); + u8 ctx_cmps = SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, val); /* * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the @@ -1533,6 +1575,14 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + /* + * ID_AA64DFR0_EL1.DebugVer, BRPs and WRPs all have to be greater than + * zero. CTX_CMPs is never greater than BRPs. + */ + if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP || !bps || !wps || + ctx_cmps > bps) + return -EINVAL; + return set_id_reg(vcpu, rd, val); } @@ -1643,10 +1693,11 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, mutex_lock(&vcpu->kvm->arch.config_lock); /* - * Once the VM has started the ID registers are immutable. Reject any - * write that does not match the final register value. + * Once the VM has started or the Realm descriptor is created, the ID + * registers are immutable. Reject any write that does not match the + * final register value. */ - if (kvm_vm_has_ran_once(vcpu->kvm)) { + if (kvm_vm_has_ran_once(vcpu->kvm) || kvm_realm_is_created(vcpu->kvm)) { if (val != read_id_reg(vcpu, rd)) ret = -EBUSY; else @@ -2224,8 +2275,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_CTR_EL0), access_ctr }, { SYS_DESC(SYS_SVCR), undef_access }, - { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, - .reset = reset_pmcr, .reg = PMCR_EL0 }, + { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr, + .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr }, { PMU_SYS_REG(PMCNTENSET_EL0), .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, { PMU_SYS_REG(PMCNTENCLR_EL0), @@ -3521,18 +3572,18 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } -static unsigned int num_demux_regs(void) +static unsigned int num_demux_regs(struct kvm_vcpu *vcpu) { - return CSSELR_MAX; + return kvm_is_realm(vcpu->kvm) ? 0 : CSSELR_MAX; } -static int write_demux_regids(u64 __user *uindices) +static int write_demux_regids(struct kvm_vcpu *vcpu, u64 __user *uindices) { u64 val = KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX; unsigned int i; val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; - for (i = 0; i < CSSELR_MAX; i++) { + for (i = 0; i < num_demux_regs(vcpu); i++) { if (put_user(val | i, uindices)) return -EFAULT; uindices++; @@ -3540,6 +3591,24 @@ static int write_demux_regids(u64 __user *uindices) return 0; } +static unsigned int num_invariant_regs(struct kvm_vcpu *vcpu) +{ + return kvm_is_realm(vcpu->kvm) ? 0 : ARRAY_SIZE(invariant_sys_regs); +} + +static int write_invariant_regids(struct kvm_vcpu *vcpu, u64 __user *uindices) +{ + unsigned int i; + + for (i = 0; i < num_invariant_regs(vcpu); i++) { + if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) + return -EFAULT; + uindices++; + } + return 0; +} + + static u64 sys_reg_to_index(const struct sys_reg_desc *reg) { return (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | @@ -3563,11 +3632,27 @@ static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind) return true; } +static bool kvm_realm_sys_reg_hidden_user(const struct kvm_vcpu *vcpu, u64 reg) +{ + if (!kvm_is_realm(vcpu->kvm)) + return false; + + switch (reg) { + case SYS_ID_AA64DFR0_EL1: + case SYS_PMCR_EL0: + return false; + } + return true; +} + static int walk_one_sys_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 __user **uind, unsigned int *total) { + if (kvm_realm_sys_reg_hidden_user(vcpu, reg_to_encoding(rd))) + return 0; + /* * Ignore registers we trap but don't save, * and for which no custom user accessor is provided. @@ -3605,29 +3690,26 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind) unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu) { - return ARRAY_SIZE(invariant_sys_regs) - + num_demux_regs() + return num_invariant_regs(vcpu) + + num_demux_regs(vcpu) + walk_sys_regs(vcpu, (u64 __user *)NULL); } int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { - unsigned int i; int err; - /* Then give them all the invariant registers' indices. */ - for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) { - if (put_user(sys_reg_to_index(&invariant_sys_regs[i]), uindices)) - return -EFAULT; - uindices++; - } + err = write_invariant_regids(vcpu, uindices); + if (err) + return err; + uindices += num_invariant_regs(vcpu); err = walk_sys_regs(vcpu, uindices); if (err < 0) return err; uindices += err; - return write_demux_regids(uindices); + return write_demux_regids(vcpu, uindices); } int __init kvm_sys_reg_table_init(void) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index b7306c588d9d3422efb34682f6fee06102d98688..cffb3030c5242700f81792ad07516c4359839790 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -81,7 +81,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) * the proper checks already. */ if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && - !kvm_vgic_global_state.can_emulate_gicv2) + (!kvm_vgic_global_state.can_emulate_gicv2 || kvm_is_realm(kvm))) return -ENODEV; /* Must be held to avoid race with vCPU creation */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 3dfc8b84e03e67868ff49cb72a97695a9222ef2b..2a03c16b3bb314464e88667fede3bc52a48502ea 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -7,9 +7,11 @@ #include #include #include +#include #include #include #include +#include #include "vgic.h" @@ -749,6 +751,9 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + if (vcpu_is_rec(vcpu)) + cpu_if->vgic_vmcr = vcpu->arch.rec.run->exit.gicv3_vmcr; + WARN_ON(vgic_v4_put(vcpu)); vgic_v3_vmcr_sync(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 8be4c1ebdec27a3e13c12914af80227c8d0eb32b..07d73faacc512bfadb6aec49a8c55c2bf11d6e20 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -10,7 +10,9 @@ #include #include +#include #include +#include #include "vgic.h" @@ -21,6 +23,13 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { .gicv3_cpuif = STATIC_KEY_FALSE_INIT, }; +static inline int kvm_vcpu_vgic_nr_lr(struct kvm_vcpu *vcpu) +{ + if (unlikely(vcpu_is_rec(vcpu))) + return kvm_realm_vgic_nr_lr(); + return kvm_vgic_global_state.nr_lr; +} + /* * Locking order is always: * kvm->lock (mutex) @@ -808,7 +817,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) lockdep_assert_held(&vgic_cpu->ap_list_lock); count = compute_ap_list_depth(vcpu, &multi_sgi); - if (count > kvm_vgic_global_state.nr_lr || multi_sgi) + if (count > kvm_vcpu_vgic_nr_lr(vcpu) || multi_sgi) vgic_sort_ap_list(vcpu); count = 0; @@ -837,7 +846,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) raw_spin_unlock(&irq->irq_lock); - if (count == kvm_vgic_global_state.nr_lr) { + if (count == kvm_vcpu_vgic_nr_lr(vcpu)) { if (!list_is_last(&irq->ap_list, &vgic_cpu->ap_list_head)) vgic_set_underflow(vcpu); @@ -846,7 +855,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) } /* Nuke remaining LRs */ - for (i = count ; i < kvm_vgic_global_state.nr_lr; i++) + for (i = count ; i < kvm_vcpu_vgic_nr_lr(vcpu); i++) vgic_clear_lr(vcpu, i); if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) @@ -865,10 +874,23 @@ static inline bool can_access_vgic_from_kernel(void) return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe(); } +static inline void vgic_rmm_save_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + int i; + + for (i = 0; i < kvm_vcpu_vgic_nr_lr(vcpu); i++) { + cpu_if->vgic_lr[i] = vcpu->arch.rec.run->exit.gicv3_lrs[i]; + vcpu->arch.rec.run->enter.gicv3_lrs[i] = 0; + } +} + static inline void vgic_save_state(struct kvm_vcpu *vcpu) { if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_save_state(vcpu); + else if (vcpu_is_rec(vcpu)) + vgic_rmm_save_state(vcpu); else __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3); } @@ -895,10 +917,28 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) vgic_prune_ap_list(vcpu); } +static inline void vgic_rmm_restore_state(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + int i; + + for (i = 0; i < kvm_vcpu_vgic_nr_lr(vcpu); i++) { + vcpu->arch.rec.run->enter.gicv3_lrs[i] = cpu_if->vgic_lr[i]; + /* + * Also populate the rec.run->exit copies so that a late + * decision to back out from entering the realm doesn't cause + * the state to be lost + */ + vcpu->arch.rec.run->exit.gicv3_lrs[i] = cpu_if->vgic_lr[i]; + } +} + static inline void vgic_restore_state(struct kvm_vcpu *vcpu) { if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_restore_state(vcpu); + else if (vcpu_is_rec(vcpu)) + vgic_rmm_restore_state(vcpu); else __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3); } @@ -939,10 +979,15 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) void kvm_vgic_load(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + !vgic_initialized(vcpu->kvm) || + vcpu_is_rec(vcpu))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_load(vcpu); else vgic_v3_load(vcpu); @@ -950,10 +995,15 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) void kvm_vgic_put(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + !vgic_initialized(vcpu->kvm) || + vcpu_is_rec(vcpu))) { + if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) + __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3); return; + } - if (kvm_vgic_global_state.type == VGIC_V2) + if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) vgic_v2_put(vcpu); else vgic_v3_put(vcpu); diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index dbd1bc95967d00d364e27e27c1f5fff2bc48e81f..b35415be138281ae8ce5f181b9cc302e73af3514 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ - ioremap.o mmap.o pgd.o mmu.o \ + ioremap.o mmap.o pgd.o mem_encrypt.o mmu.o \ context.o proc.o pageattr.o fixmap.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index fb997df204e937287ab789c62da5fb9c09c19ee3..b6550cedd4b751c42e1d7816a26575773fcaa770 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -1728,6 +1728,25 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr, return 0; } +static int do_gpf_ptw(unsigned long far, unsigned long esr, struct pt_regs *regs) +{ + const struct fault_info *inf = esr_to_fault_info(esr); + + die_kernel_fault(inf->name, far, esr, regs); + return 0; +} + +static int do_gpf(unsigned long far, unsigned long esr, struct pt_regs *regs) +{ + const struct fault_info *inf = esr_to_fault_info(esr); + + if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) + return 0; + + arm64_notify_die(inf->name, regs, inf->sig, inf->code, far, esr); + return 0; +} + static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" }, @@ -1764,12 +1783,12 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 32" }, { do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 34" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 35" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 36" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 37" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 38" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 39" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 40" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level -1" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level 0" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level 1" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level 2" }, + { do_gpf_ptw, SIGKILL, SI_KERNEL, "Granule Protection Fault at level 3" }, + { do_gpf, SIGBUS, SI_KERNEL, "Granule Protection Fault not on table walk" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 41" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 42" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 43" }, diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1463dc657a98114a22ea8f6493daba1b6af28bfd..f840dd539cafffe3e9b5b8aecc563270e95943b7 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -495,12 +496,19 @@ void __init bootmem_init(void) */ void __init mem_init(void) { + unsigned int flags = SWIOTLB_VERBOSE; bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit); + if (is_realm_world()) { + swiotlb = true; + flags |= SWIOTLB_FORCE; + } + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC)) swiotlb = true; - swiotlb_init(swiotlb, SWIOTLB_VERBOSE); + swiotlb_init(swiotlb, flags); + swiotlb_update_mem_attributes(); #ifdef CONFIG_PSWIOTLB /* enable pswiotlb default */ diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 3f34dc886f81773a1154e6e10c4f693009c2625b..47859e84755b4a50a6a4428fe9ab3f640749fc28 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -4,10 +4,22 @@ #include #include +static ioremap_prot_hook_t ioremap_prot_hook; + +int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) +{ + if (WARN_ON(ioremap_prot_hook)) + return -EBUSY; + + ioremap_prot_hook = hook; + return 0; +} + void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { unsigned long last_addr = phys_addr + size - 1; + pgprot_t pgprot = __pgprot(prot); /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) @@ -17,7 +29,16 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) return NULL; - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + /* + * If a hook is registered (e.g. for confidential computing + * purposes), call that now and barf if it fails. + */ + if (unlikely(ioremap_prot_hook) && + WARN_ON(ioremap_prot_hook(phys_addr, size, &pgprot))) { + return NULL; + } + + return generic_ioremap_prot(phys_addr, size, pgprot); } EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c new file mode 100644 index 0000000000000000000000000000000000000000..ee3c0ab043845f9dc62715da4e60afa829f8ccb2 --- /dev/null +++ b/arch/arm64/mm/mem_encrypt.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Implementation of the memory encryption/decryption API. + * + * Since the low-level details of the operation depend on the + * Confidential Computing environment (e.g. pKVM, CCA, ...), this just + * acts as a top-level dispatcher to whatever hooks may have been + * registered. + * + * Author: Will Deacon + * Copyright (C) 2024 Google LLC + * + * "Hello, boils and ghouls!" + */ + +#include +#include +#include +#include + +#include + +static const struct arm64_mem_crypt_ops *crypt_ops; + +int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops) +{ + if (WARN_ON(crypt_ops)) + return -EBUSY; + + crypt_ops = ops; + return 0; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->encrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (likely(!crypt_ops) || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return crypt_ops->decrypt(addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0a62f458c5cb023b01c894c555d120ce74941fea..8a81c59c9eff41c4250a4d7690ec191d927e0317 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -5,10 +5,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -23,14 +25,16 @@ bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED bool can_set_direct_map(void) { /* - * rodata_full and DEBUG_PAGEALLOC require linear map to be - * mapped at page granularity, so that it is possible to + * rodata_full, DEBUG_PAGEALLOC and a Realm guest all require linear + * map to be mapped at page granularity, so that it is possible to * protect/unprotect single pages. * * KFENCE pool requires page-granular mapping if initialized late. + * + * Realms need to make pages shared/protected at page granularity. */ return rodata_full || debug_pagealloc_enabled() || - arm64_kfence_can_set_direct_map(); + arm64_kfence_can_set_direct_map() || is_realm_world(); } static int change_page_range(pte_t *ptep, unsigned long addr, void *data) @@ -60,7 +64,13 @@ static int __change_memory_common(unsigned long start, unsigned long size, ret = apply_to_page_range(&init_mm, start, size, change_page_range, &data); - flush_tlb_kernel_range(start, start + size); + /* + * If the memory is being made valid without changing any other bits + * then a TLBI isn't required as a non-valid entry cannot be cached in + * the TLB. + */ + if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask)) + flush_tlb_kernel_range(start, start + size); return ret; } @@ -192,6 +202,86 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, &data); } +static int __set_memory_enc_dec(unsigned long addr, + int numpages, + bool encrypt) +{ + unsigned long set_prot = 0, clear_prot = 0; + phys_addr_t start, end; + int ret; + + if (!is_realm_world()) + return 0; + + if (!__is_lm_address(addr)) + return -EINVAL; + + start = __virt_to_phys(addr); + end = start + numpages * PAGE_SIZE; + + if (encrypt) + clear_prot = PROT_NS_SHARED; + else + set_prot = PROT_NS_SHARED; + + /* + * Break the mapping before we make any changes to avoid stale TLB + * entries or Synchronous External Aborts caused by RIPAS_EMPTY + */ + ret = __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(set_prot), + __pgprot(clear_prot | PTE_VALID)); + + if (ret) + return ret; + + if (encrypt) + ret = rsi_set_memory_range_protected(start, end); + else + ret = rsi_set_memory_range_shared(start, end); + + if (ret) + return ret; + + return __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(PTE_VALID), + __pgprot(0)); +} + +static int realm_set_memory_encrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, true); + + /* + * If the request to change state fails, then the only sensible cause + * of action for the caller is to leak the memory + */ + WARN(ret, "Failed to encrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static int realm_set_memory_decrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, false); + + WARN(ret, "Failed to decrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static const struct arm64_mem_crypt_ops realm_crypt_ops = { + .encrypt = realm_set_memory_encrypted, + .decrypt = realm_set_memory_decrypted, +}; + +int realm_register_memory_enc_ops(void) +{ + return arm64_mem_crypt_ops_register(&realm_crypt_ops); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index a2a3e89f2d9ab371a5a62ed24b7fa29f459726ab..136a7ec1b50ad44e97b1e90d0d282e7657adadd4 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -457,7 +457,6 @@ void free_initmem(void) unsigned long kernel_end = (unsigned long)&_end; /* Remap kernel text and data, but do not touch init section yet. */ - kernel_set_to_readonly = true; map_pages(init_end, __pa(init_end), kernel_end - init_end, PAGE_KERNEL, 0); @@ -491,11 +490,18 @@ void free_initmem(void) #ifdef CONFIG_STRICT_KERNEL_RWX void mark_rodata_ro(void) { - /* rodata memory was already mapped with KERNEL_RO access rights by - pagetable_init() and map_pages(). No need to do additional stuff here */ - unsigned long roai_size = __end_ro_after_init - __start_ro_after_init; + unsigned long start = (unsigned long) &__start_rodata; + unsigned long end = (unsigned long) &__end_rodata; + + pr_info("Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + + kernel_set_to_readonly = true; + map_pages(start, __pa(start), end - start, PAGE_KERNEL, 0); - pr_info("Write protected read-only-after-init data: %luk\n", roai_size >> 10); + /* force the kernel to see the new page table entries */ + flush_cache_all(); + flush_tlb_all(); } #endif diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 13cdb85b5d9b2aa8846bd504598febf80d964474..080d9b8ffbd69c12927f0758bf035e71b23941fb 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -27,6 +29,7 @@ #include #include #include +#include #include #include @@ -167,6 +170,7 @@ struct its_device { struct its_node *its; struct event_lpi_map event_map; void *itt; + u32 itt_sz; u32 nr_ites; u32 device_id; bool shared; @@ -202,6 +206,87 @@ static DEFINE_IDA(its_vpeid_ida); #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) +static struct page *its_alloc_pages_node(int node, gfp_t gfp, + unsigned int order) +{ + struct page *page; + int ret = 0; + + page = alloc_pages_node(node, gfp, order); + + if (!page) + return NULL; + + ret = set_memory_decrypted((unsigned long)page_address(page), + 1 << order); + /* + * If set_memory_decrypted() fails then we don't know what state the + * page is in, so we can't free it. Instead we leak it. + * set_memory_decrypted() will already have WARNed. + */ + if (ret) + return NULL; + + return page; +} + +static struct page *its_alloc_pages(gfp_t gfp, unsigned int order) +{ + return its_alloc_pages_node(NUMA_NO_NODE, gfp, order); +} + +static void its_free_pages(void *addr, unsigned int order) +{ + /* + * If the memory cannot be encrypted again then we must leak the pages. + * set_memory_encrypted() will already have WARNed. + */ + if (set_memory_encrypted((unsigned long)addr, 1 << order)) + return; + free_pages((unsigned long)addr, order); +} + +static struct gen_pool *itt_pool; + +static void *itt_alloc_pool(int node, int size) +{ + unsigned long addr; + struct page *page; + + if (size >= PAGE_SIZE) { + page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, get_order(size)); + + return page ? page_address(page) : NULL; + } + + do { + addr = gen_pool_alloc(itt_pool, size); + if (addr) + break; + + page = its_alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + break; + + gen_pool_add(itt_pool, (unsigned long)page_address(page), PAGE_SIZE, node); + } while (!addr); + + return (void *)addr; +} + +static void itt_free_pool(void *addr, int size) +{ + if (!addr) + return; + + if (size >= PAGE_SIZE) { + its_free_pages(addr, get_order(size)); + return; + } + + gen_pool_free(itt_pool, (unsigned long)addr, size); +} + /* * Skip ITSs that have no vLPIs mapped, unless we're on GICv4.1, as we * always have vSGIs mapped. @@ -624,7 +709,6 @@ static struct its_collection *its_build_mapd_cmd(struct its_node *its, u8 size = ilog2(desc->its_mapd_cmd.dev->nr_ites); itt_addr = virt_to_phys(desc->its_mapd_cmd.dev->itt); - itt_addr = ALIGN(itt_addr, ITS_ITT_ALIGN); its_encode_cmd(cmd, GITS_CMD_MAPD); its_encode_devid(cmd, desc->its_mapd_cmd.dev->device_id); @@ -2201,7 +2285,8 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) { struct page *prop_page; - prop_page = alloc_pages(gfp_flags, get_order(LPI_PROPBASE_SZ)); + prop_page = its_alloc_pages(gfp_flags, + get_order(LPI_PROPBASE_SZ)); if (!prop_page) return NULL; @@ -2212,8 +2297,7 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags) static void its_free_prop_table(struct page *prop_page) { - free_pages((unsigned long)page_address(prop_page), - get_order(LPI_PROPBASE_SZ)); + its_free_pages(page_address(prop_page), get_order(LPI_PROPBASE_SZ)); } static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size) @@ -2335,7 +2419,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, order = get_order(GITS_BASER_PAGES_MAX * psz); } - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); + page = its_alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order); if (!page) return -ENOMEM; @@ -2348,7 +2432,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, /* 52bit PA is supported only when PageSize=64K */ if (psz != SZ_64K) { pr_err("ITS: no 52bit PA support when psz=%d\n", psz); - free_pages((unsigned long)base, order); + its_free_pages(base, order); return -ENXIO; } @@ -2404,7 +2488,7 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, pr_err("ITS@%pa: %s doesn't stick: %llx %llx\n", &its->phys_base, its_base_type_string[type], val, tmp); - free_pages((unsigned long)base, order); + its_free_pages(base, order); return -ENXIO; } @@ -2543,8 +2627,7 @@ static void its_free_tables(struct its_node *its) for (i = 0; i < GITS_BASER_NR_REGS; i++) { if (its->tables[i].base) { - free_pages((unsigned long)its->tables[i].base, - its->tables[i].order); + its_free_pages(its->tables[i].base, its->tables[i].order); its->tables[i].base = NULL; } } @@ -2814,7 +2897,7 @@ static bool allocate_vpe_l2_table(int cpu, u32 id) /* Allocate memory for 2nd level table */ if (!table[idx]) { - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz)); + page = its_alloc_pages(GFP_KERNEL | __GFP_ZERO, get_order(psz)); if (!page) return false; @@ -2933,7 +3016,7 @@ static int allocate_vpe_l1_table(void) pr_debug("np = %d, npg = %lld, psz = %d, epp = %d, esz = %d\n", np, npg, psz, epp, esz); - page = alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE)); + page = its_alloc_pages(GFP_ATOMIC | __GFP_ZERO, get_order(np * PAGE_SIZE)); if (!page) return -ENOMEM; @@ -2979,8 +3062,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) { struct page *pend_page; - pend_page = alloc_pages(gfp_flags | __GFP_ZERO, - get_order(LPI_PENDBASE_SZ)); + pend_page = its_alloc_pages(gfp_flags | __GFP_ZERO, get_order(LPI_PENDBASE_SZ)); if (!pend_page) return NULL; @@ -2992,7 +3074,7 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags) static void its_free_pending_table(struct page *pt) { - free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ)); + its_free_pages(page_address(pt), get_order(LPI_PENDBASE_SZ)); } /* @@ -3327,8 +3409,8 @@ static bool its_alloc_table_entry(struct its_node *its, /* Allocate memory for 2nd level table */ if (!table[idx]) { - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, - get_order(baser->psz)); + page = its_alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, + get_order(baser->psz)); if (!page) return false; @@ -3423,15 +3505,18 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, if (WARN_ON(!is_power_of_2(nvecs))) nvecs = roundup_pow_of_two(nvecs); - dev = kzalloc(sizeof(*dev), GFP_KERNEL); /* * Even if the device wants a single LPI, the ITT must be * sized as a power of two (and you need at least one bit...). */ nr_ites = max(2, nvecs); sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1); - sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1; - itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node); + sz = max(sz, ITS_ITT_ALIGN); + + itt = itt_alloc_pool(its->numa_node, sz); + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (alloc_lpis) { lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis); if (lpi_map) @@ -3443,9 +3528,9 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, lpi_base = 0; } - if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) { + if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) { kfree(dev); - kfree(itt); + itt_free_pool(itt, sz); bitmap_free(lpi_map); kfree(col_map); return NULL; @@ -3455,6 +3540,7 @@ static struct its_device *its_create_device(struct its_node *its, u32 dev_id, dev->its = its; dev->itt = itt; + dev->itt_sz = sz; dev->nr_ites = nr_ites; dev->event_map.lpi_map = lpi_map; dev->event_map.col_map = col_map; @@ -3482,7 +3568,7 @@ static void its_free_device(struct its_device *its_dev) list_del(&its_dev->entry); raw_spin_unlock_irqrestore(&its_dev->its->lock, flags); kfree(its_dev->event_map.col_map); - kfree(its_dev->itt); + itt_free_pool(its_dev->itt, its_dev->itt_sz); kfree(its_dev); } @@ -5125,8 +5211,9 @@ static int __init its_probe_one(struct its_node *its) } } - page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, - get_order(ITS_CMD_QUEUE_SZ)); + page = its_alloc_pages_node(its->numa_node, + GFP_KERNEL | __GFP_ZERO, + get_order(ITS_CMD_QUEUE_SZ)); if (!page) { err = -ENOMEM; goto out_unmap_sgir; @@ -5190,7 +5277,7 @@ static int __init its_probe_one(struct its_node *its) out_free_tables: its_free_tables(its); out_free_cmd: - free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); + its_free_pages(its->cmd_base, get_order(ITS_CMD_QUEUE_SZ)); out_unmap_sgir: if (its->sgir_base) iounmap(its->sgir_base); @@ -5672,6 +5759,10 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, bool has_v4_1 = false; int err; + itt_pool = gen_pool_create(get_order(ITS_ITT_ALIGN), -1); + if (!itt_pool) + return -ENOMEM; + gic_rdists = rdists; its_parent = parent_domain; diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index b3bb4d62f13dcc52c7f6d0fd95cdf6eb76e57930..aa79e76ef21adffb2248833c79c9141dab501f44 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -734,6 +734,21 @@ static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) return 0; } +void arm_pmu_set_phys_irq(bool enable) +{ + int cpu = get_cpu(); + struct arm_pmu *pmu = per_cpu(cpu_armpmu, cpu); + int irq; + + irq = armpmu_get_cpu_irq(pmu, cpu); + if (irq && !enable) + per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); + else if (irq && enable) + per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); + + put_cpu(); +} + #ifdef CONFIG_CPU_PM static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) { diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 0858e6096453ef280e712d77ba2a1e2f85cb5cb2..6c0efa5f2eca7f178b84dce625a5eb60e3457637 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -1111,8 +1112,7 @@ static void __armv8pmu_probe_pmu(void *info) probe->present = true; /* Read the nb of CNTx counters supported from PMNC */ - cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT) - & ARMV8_PMU_PMCR_N_MASK; + cpu_pmu->num_events = FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read()); /* Add the CPU cycles counter */ cpu_pmu->num_events += 1; diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig index b1c4efa00182e0f40a8fa199620da19fb615918a..40129b6f0eca41ced6c6213776571a5ae8374ebe 100644 --- a/drivers/virt/Kconfig +++ b/drivers/virt/Kconfig @@ -48,12 +48,6 @@ source "drivers/virt/nitro_enclaves/Kconfig" source "drivers/virt/acrn/Kconfig" -source "drivers/virt/coco/efi_secret/Kconfig" - -source "drivers/virt/coco/sev-guest/Kconfig" - -source "drivers/virt/coco/tdx-guest/Kconfig" - -source "drivers/virt/coco/csv-guest/Kconfig" +source "drivers/virt/coco/Kconfig" endif diff --git a/drivers/virt/Makefile b/drivers/virt/Makefile index 62681a26075878b0bef4d2513bf9f36120a4cdd2..f29901bd782058d3552cdec2c2128ad47ce6fe27 100644 --- a/drivers/virt/Makefile +++ b/drivers/virt/Makefile @@ -9,7 +9,4 @@ obj-y += vboxguest/ obj-$(CONFIG_NITRO_ENCLAVES) += nitro_enclaves/ obj-$(CONFIG_ACRN_HSM) += acrn/ -obj-$(CONFIG_EFI_SECRET) += coco/efi_secret/ -obj-$(CONFIG_SEV_GUEST) += coco/sev-guest/ -obj-$(CONFIG_INTEL_TDX_GUEST) += coco/tdx-guest/ -obj-$(CONFIG_CSV_GUEST) += coco/csv-guest/ +obj-y += coco/ diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..dd1011c2e59ef52f04c03ebbe39a05757177375b --- /dev/null +++ b/drivers/virt/coco/Kconfig @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Confidential computing related collateral +# + +config TSM_REPORTS + select CONFIGFS_FS + tristate + +source "drivers/virt/coco/efi_secret/Kconfig" + +source "drivers/virt/coco/sev-guest/Kconfig" + +source "drivers/virt/coco/tdx-guest/Kconfig" + +source "drivers/virt/coco/csv-guest/Kconfig" + +source "drivers/virt/coco/arm-cca-guest/Kconfig" diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..a0c5840a3a33712fcbba8bfc0c129c1ee9bb2ad0 --- /dev/null +++ b/drivers/virt/coco/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Confidential computing related collateral +# +obj-$(CONFIG_TSM_REPORTS) += tsm.o +obj-$(CONFIG_EFI_SECRET) += efi_secret/ +obj-$(CONFIG_SEV_GUEST) += sev-guest/ +obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ +obj-$(CONFIG_CSV_GUEST) += csv-guest/ +obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest/ diff --git a/drivers/virt/coco/arm-cca-guest/Kconfig b/drivers/virt/coco/arm-cca-guest/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..9dd27c3ee215275cf985498c5d2731ded0708246 --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/Kconfig @@ -0,0 +1,11 @@ +config ARM_CCA_GUEST + tristate "Arm CCA Guest driver" + depends on ARM64 + default m + select TSM_REPORTS + help + The driver provides userspace interface to request and + attestation report from the Realm Management Monitor(RMM). + + If you choose 'M' here, this module will be called + arm-cca-guest. diff --git a/drivers/virt/coco/arm-cca-guest/Makefile b/drivers/virt/coco/arm-cca-guest/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..69eeba08e98a85259e608a44ba62a84ba4de583f --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest.o diff --git a/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c new file mode 100644 index 0000000000000000000000000000000000000000..f937f6e1009e02b20116401be8aae4333a2cea69 --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/** + * struct arm_cca_token_info - a descriptor for the token buffer. + * @challenge: Pointer to the challenge data + * @challenge_size: Size of the challenge data + * @granule: PA of the granule to which the token will be written + * @offset: Offset within granule to start of buffer in bytes + * @result: result of rsi_attestation_token_continue operation + */ +struct arm_cca_token_info { + void *challenge; + unsigned long challenge_size; + phys_addr_t granule; + unsigned long offset; + unsigned long result; +}; + +static void arm_cca_attestation_init(void *param) +{ + struct arm_cca_token_info *info; + + info = (struct arm_cca_token_info *)param; + + info->result = rsi_attestation_token_init(info->challenge, + info->challenge_size); +} + +/** + * arm_cca_attestation_continue - Retrieve the attestation token data. + * + * @param: pointer to the arm_cca_token_info + * + * Attestation token generation is a long running operation and therefore + * the token data may not be retrieved in a single call. Moreover, the + * token retrieval operation must be requested on the same CPU on which the + * attestation token generation was initialised. + * This helper function is therefore scheduled on the same CPU multiple + * times until the entire token data is retrieved. + */ +static void arm_cca_attestation_continue(void *param) +{ + unsigned long len; + unsigned long size; + struct arm_cca_token_info *info; + + info = (struct arm_cca_token_info *)param; + + size = RSI_GRANULE_SIZE - info->offset; + info->result = rsi_attestation_token_continue(info->granule, + info->offset, size, &len); + info->offset += len; +} + +/** + * arm_cca_report_new - Generate a new attestation token. + * + * @report: pointer to the TSM report context information. + * @data: pointer to the context specific data for this module. + * + * Initialise the attestation token generation using the challenge data + * passed in the TSM descriptor. Allocate memory for the attestation token + * and schedule calls to retrieve the attestation token on the same CPU + * on which the attestation token generation was initialised. + * + * The challenge data must be at least 32 bytes and no more than 64 bytes. If + * less than 64 bytes are provided it will be zero padded to 64 bytes. + * + * Return: + * * %0 - Attestation token generated successfully. + * * %-EINVAL - A parameter was not valid. + * * %-ENOMEM - Out of memory. + * * %-EFAULT - Failed to get IPA for memory page(s). + * * A negative status code as returned by smp_call_function_single(). + */ +static int arm_cca_report_new(struct tsm_report *report, void *data) +{ + int ret; + int cpu; + long max_size; + unsigned long token_size = 0; + struct arm_cca_token_info info; + void *buf; + u8 *token __free(kvfree) = NULL; + struct tsm_desc *desc = &report->desc; + + if (desc->inblob_len < 32 || desc->inblob_len > 64) + return -EINVAL; + + /* + * The attestation token 'init' and 'continue' calls must be + * performed on the same CPU. smp_call_function_single() is used + * instead of simply calling get_cpu() because of the need to + * allocate outblob based on the returned value from the 'init' + * call and that cannot be done in an atomic context. + */ + cpu = smp_processor_id(); + + info.challenge = desc->inblob; + info.challenge_size = desc->inblob_len; + + ret = smp_call_function_single(cpu, arm_cca_attestation_init, + &info, true); + if (ret) + return ret; + max_size = info.result; + + if (max_size <= 0) + return -EINVAL; + + /* Allocate outblob */ + token = kvzalloc(max_size, GFP_KERNEL); + if (!token) + return -ENOMEM; + + /* + * Since the outblob may not be physically contiguous, use a page + * to bounce the buffer from RMM. + */ + buf = alloc_pages_exact(RSI_GRANULE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Get the PA of the memory page(s) that were allocated */ + info.granule = (unsigned long)virt_to_phys(buf); + + /* Loop until the token is ready or there is an error */ + do { + /* Retrieve one RSI_GRANULE_SIZE data per loop iteration */ + info.offset = 0; + do { + /* + * Schedule a call to retrieve a sub-granule chunk + * of data per loop iteration. + */ + ret = smp_call_function_single(cpu, + arm_cca_attestation_continue, + (void *)&info, true); + if (ret != 0) { + token_size = 0; + goto exit_free_granule_page; + } + } while (info.result == RSI_INCOMPLETE && + info.offset < RSI_GRANULE_SIZE); + + if (info.result != RSI_SUCCESS) { + ret = -ENXIO; + token_size = 0; + goto exit_free_granule_page; + } + + /* + * Copy the retrieved token data from the granule + * to the token buffer, ensuring that the RMM doesn't + * overflow the buffer. + */ + if (WARN_ON(token_size + info.offset > max_size)) + break; + memcpy(&token[token_size], buf, info.offset); + token_size += info.offset; + } while (info.result == RSI_INCOMPLETE); + + report->outblob = no_free_ptr(token); +exit_free_granule_page: + report->outblob_len = token_size; + free_pages_exact(buf, RSI_GRANULE_SIZE); + return ret; +} + +static const struct tsm_ops arm_cca_tsm_ops = { + .name = KBUILD_MODNAME, + .report_new = arm_cca_report_new, +}; + +/** + * arm_cca_guest_init - Register with the Trusted Security Module (TSM) + * interface. + * + * Return: + * * %0 - Registered successfully with the TSM interface. + * * %-ENODEV - The execution context is not an Arm Realm. + * * %-EBUSY - Already registered. + */ +static int __init arm_cca_guest_init(void) +{ + int ret; + + if (!is_realm_world()) + return -ENODEV; + + ret = tsm_register(&arm_cca_tsm_ops, NULL, NULL); + if (ret < 0) + pr_err("Error %d registering with TSM\n", ret); + + return ret; +} +module_init(arm_cca_guest_init); + +/** + * arm_cca_guest_exit - unregister with the Trusted Security Module (TSM) + * interface. + */ +static void __exit arm_cca_guest_exit(void) +{ + tsm_unregister(&arm_cca_tsm_ops); +} +module_exit(arm_cca_guest_exit); + +MODULE_AUTHOR("Sami Mujawar "); +MODULE_DESCRIPTION("Arm CCA Guest TSM Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/virt/coco/tsm.c b/drivers/virt/coco/tsm.c new file mode 100644 index 0000000000000000000000000000000000000000..9c80966dd7b69b178fb0a70b26281451c3764217 --- /dev/null +++ b/drivers/virt/coco/tsm.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Intel Corporation. All rights reserved. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct tsm_provider { + const struct tsm_ops *ops; + const struct config_item_type *type; + void *data; + atomic_t count; +} provider; +static DECLARE_RWSEM(tsm_rwsem); + +/** + * DOC: Trusted Security Module (TSM) Attestation Report Interface + * + * The TSM report interface is a common provider of blobs that facilitate + * attestation of a TVM (confidential computing guest) by an attestation + * service. A TSM report combines a user-defined blob (likely a public-key with + * a nonce for a key-exchange protocol) with a signed attestation report. That + * combined blob is then used to obtain secrets provided by an agent that can + * validate the attestation report. The expectation is that this interface is + * invoked infrequently, however configfs allows for multiple agents to + * own their own report generation instances to generate reports as + * often as needed. + * + * The attestation report format is TSM provider specific, when / if a standard + * materializes that can be published instead of the vendor layout. Until then + * the 'provider' attribute indicates the format of 'outblob', and optionally + * 'auxblob'. + */ + +struct tsm_report_state { + struct tsm_report report; + unsigned long write_generation; + unsigned long read_generation; + struct config_item cfg; +}; + +enum tsm_data_select { + TSM_REPORT, + TSM_CERTS, +}; + +static struct tsm_report *to_tsm_report(struct config_item *cfg) +{ + struct tsm_report_state *state = + container_of(cfg, struct tsm_report_state, cfg); + + return &state->report; +} + +static struct tsm_report_state *to_state(struct tsm_report *report) +{ + return container_of(report, struct tsm_report_state, report); +} + +static int try_advance_write_generation(struct tsm_report *report) +{ + struct tsm_report_state *state = to_state(report); + + lockdep_assert_held_write(&tsm_rwsem); + + /* + * Malicious or broken userspace has written enough times for + * read_generation == write_generation by modular arithmetic without an + * interim read. Stop accepting updates until the current report + * configuration is read. + */ + if (state->write_generation == state->read_generation - 1) + return -EBUSY; + state->write_generation++; + return 0; +} + +static ssize_t tsm_report_privlevel_store(struct config_item *cfg, + const char *buf, size_t len) +{ + struct tsm_report *report = to_tsm_report(cfg); + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + guard(rwsem_write)(&tsm_rwsem); + if (!provider.ops) + return -ENXIO; + + /* + * The valid privilege levels that a TSM might accept, if it accepts a + * privilege level setting at all, are a max of TSM_PRIVLEVEL_MAX (see + * SEV-SNP GHCB) and a minimum of a TSM selected floor value no less + * than 0. + */ + if (provider.ops->privlevel_floor > val || val > TSM_PRIVLEVEL_MAX) + return -EINVAL; + + rc = try_advance_write_generation(report); + if (rc) + return rc; + report->desc.privlevel = val; + + return len; +} +CONFIGFS_ATTR_WO(tsm_report_, privlevel); + +static ssize_t tsm_report_privlevel_floor_show(struct config_item *cfg, + char *buf) +{ + guard(rwsem_read)(&tsm_rwsem); + + if (!provider.ops) + return -ENXIO; + + return sysfs_emit(buf, "%u\n", provider.ops->privlevel_floor); +} +CONFIGFS_ATTR_RO(tsm_report_, privlevel_floor); + +static ssize_t tsm_report_inblob_write(struct config_item *cfg, + const void *buf, size_t count) +{ + struct tsm_report *report = to_tsm_report(cfg); + int rc; + + guard(rwsem_write)(&tsm_rwsem); + rc = try_advance_write_generation(report); + if (rc) + return rc; + + report->desc.inblob_len = count; + memcpy(report->desc.inblob, buf, count); + return count; +} +CONFIGFS_BIN_ATTR_WO(tsm_report_, inblob, NULL, TSM_INBLOB_MAX); + +static ssize_t tsm_report_generation_show(struct config_item *cfg, char *buf) +{ + struct tsm_report *report = to_tsm_report(cfg); + struct tsm_report_state *state = to_state(report); + + guard(rwsem_read)(&tsm_rwsem); + return sysfs_emit(buf, "%lu\n", state->write_generation); +} +CONFIGFS_ATTR_RO(tsm_report_, generation); + +static ssize_t tsm_report_provider_show(struct config_item *cfg, char *buf) +{ + guard(rwsem_read)(&tsm_rwsem); + if (!provider.ops) + return -ENXIO; + + return sysfs_emit(buf, "%s\n", provider.ops->name); +} +CONFIGFS_ATTR_RO(tsm_report_, provider); + +static ssize_t __read_report(struct tsm_report *report, void *buf, size_t count, + enum tsm_data_select select) +{ + loff_t offset = 0; + ssize_t len; + u8 *out; + + if (select == TSM_REPORT) { + out = report->outblob; + len = report->outblob_len; + } else { + out = report->auxblob; + len = report->auxblob_len; + } + + /* + * Recall that a NULL @buf is configfs requesting the size of + * the buffer. + */ + if (!buf) + return len; + return memory_read_from_buffer(buf, count, &offset, out, len); +} + +static ssize_t read_cached_report(struct tsm_report *report, void *buf, + size_t count, enum tsm_data_select select) +{ + struct tsm_report_state *state = to_state(report); + + guard(rwsem_read)(&tsm_rwsem); + if (!report->desc.inblob_len) + return -EINVAL; + + /* + * A given TSM backend always fills in ->outblob regardless of + * whether the report includes an auxblob or not. + */ + if (!report->outblob || + state->read_generation != state->write_generation) + return -EWOULDBLOCK; + + return __read_report(report, buf, count, select); +} + +static ssize_t tsm_report_read(struct tsm_report *report, void *buf, + size_t count, enum tsm_data_select select) +{ + struct tsm_report_state *state = to_state(report); + const struct tsm_ops *ops; + ssize_t rc; + + /* try to read from the existing report if present and valid... */ + rc = read_cached_report(report, buf, count, select); + if (rc >= 0 || rc != -EWOULDBLOCK) + return rc; + + /* slow path, report may need to be regenerated... */ + guard(rwsem_write)(&tsm_rwsem); + ops = provider.ops; + if (!ops) + return -ENXIO; + if (!report->desc.inblob_len) + return -EINVAL; + + /* did another thread already generate this report? */ + if (report->outblob && + state->read_generation == state->write_generation) + goto out; + + kvfree(report->outblob); + kvfree(report->auxblob); + report->outblob = NULL; + report->auxblob = NULL; + rc = ops->report_new(report, provider.data); + if (rc < 0) + return rc; + state->read_generation = state->write_generation; +out: + return __read_report(report, buf, count, select); +} + +static ssize_t tsm_report_outblob_read(struct config_item *cfg, void *buf, + size_t count) +{ + struct tsm_report *report = to_tsm_report(cfg); + + return tsm_report_read(report, buf, count, TSM_REPORT); +} +CONFIGFS_BIN_ATTR_RO(tsm_report_, outblob, NULL, TSM_OUTBLOB_MAX); + +static ssize_t tsm_report_auxblob_read(struct config_item *cfg, void *buf, + size_t count) +{ + struct tsm_report *report = to_tsm_report(cfg); + + return tsm_report_read(report, buf, count, TSM_CERTS); +} +CONFIGFS_BIN_ATTR_RO(tsm_report_, auxblob, NULL, TSM_OUTBLOB_MAX); + +#define TSM_DEFAULT_ATTRS() \ + &tsm_report_attr_generation, \ + &tsm_report_attr_provider + +static struct configfs_attribute *tsm_report_attrs[] = { + TSM_DEFAULT_ATTRS(), + NULL, +}; + +static struct configfs_attribute *tsm_report_extra_attrs[] = { + TSM_DEFAULT_ATTRS(), + &tsm_report_attr_privlevel, + &tsm_report_attr_privlevel_floor, + NULL, +}; + +#define TSM_DEFAULT_BIN_ATTRS() \ + &tsm_report_attr_inblob, \ + &tsm_report_attr_outblob + +static struct configfs_bin_attribute *tsm_report_bin_attrs[] = { + TSM_DEFAULT_BIN_ATTRS(), + NULL, +}; + +static struct configfs_bin_attribute *tsm_report_bin_extra_attrs[] = { + TSM_DEFAULT_BIN_ATTRS(), + &tsm_report_attr_auxblob, + NULL, +}; + +static void tsm_report_item_release(struct config_item *cfg) +{ + struct tsm_report *report = to_tsm_report(cfg); + struct tsm_report_state *state = to_state(report); + + kvfree(report->auxblob); + kvfree(report->outblob); + kfree(state); +} + +static struct configfs_item_operations tsm_report_item_ops = { + .release = tsm_report_item_release, +}; + +const struct config_item_type tsm_report_default_type = { + .ct_owner = THIS_MODULE, + .ct_bin_attrs = tsm_report_bin_attrs, + .ct_attrs = tsm_report_attrs, + .ct_item_ops = &tsm_report_item_ops, +}; +EXPORT_SYMBOL_GPL(tsm_report_default_type); + +const struct config_item_type tsm_report_extra_type = { + .ct_owner = THIS_MODULE, + .ct_bin_attrs = tsm_report_bin_extra_attrs, + .ct_attrs = tsm_report_extra_attrs, + .ct_item_ops = &tsm_report_item_ops, +}; +EXPORT_SYMBOL_GPL(tsm_report_extra_type); + +static struct config_item *tsm_report_make_item(struct config_group *group, + const char *name) +{ + struct tsm_report_state *state; + + guard(rwsem_read)(&tsm_rwsem); + if (!provider.ops) + return ERR_PTR(-ENXIO); + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return ERR_PTR(-ENOMEM); + + atomic_inc(&provider.count); + config_item_init_type_name(&state->cfg, name, provider.type); + return &state->cfg; +} + +static void tsm_report_drop_item(struct config_group *group, struct config_item *item) +{ + config_item_put(item); + atomic_dec(&provider.count); +} + +static struct configfs_group_operations tsm_report_group_ops = { + .make_item = tsm_report_make_item, + .drop_item = tsm_report_drop_item, +}; + +static const struct config_item_type tsm_reports_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &tsm_report_group_ops, +}; + +static const struct config_item_type tsm_root_group_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem tsm_configfs = { + .su_group = { + .cg_item = { + .ci_namebuf = "tsm", + .ci_type = &tsm_root_group_type, + }, + }, + .su_mutex = __MUTEX_INITIALIZER(tsm_configfs.su_mutex), +}; + +int tsm_register(const struct tsm_ops *ops, void *priv, + const struct config_item_type *type) +{ + const struct tsm_ops *conflict; + + if (!type) + type = &tsm_report_default_type; + if (!(type == &tsm_report_default_type || type == &tsm_report_extra_type)) + return -EINVAL; + + guard(rwsem_write)(&tsm_rwsem); + conflict = provider.ops; + if (conflict) { + pr_err("\"%s\" ops already registered\n", conflict->name); + return -EBUSY; + } + + if (atomic_read(&provider.count)) { + pr_err("configfs/tsm/report not empty\n"); + return -EBUSY; + } + + provider.ops = ops; + provider.data = priv; + provider.type = type; + return 0; +} +EXPORT_SYMBOL_GPL(tsm_register); + +int tsm_unregister(const struct tsm_ops *ops) +{ + guard(rwsem_write)(&tsm_rwsem); + if (ops != provider.ops) + return -EBUSY; + if (atomic_read(&provider.count)) + pr_warn("\"%s\" unregistered with items present in configfs/tsm/report\n", + provider.ops->name); + provider.ops = NULL; + provider.data = NULL; + provider.type = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(tsm_unregister); + +static struct config_group *tsm_report_group; + +static int __init tsm_init(void) +{ + struct config_group *root = &tsm_configfs.su_group; + struct config_group *tsm; + int rc; + + config_group_init(root); + rc = configfs_register_subsystem(&tsm_configfs); + if (rc) + return rc; + + tsm = configfs_register_default_group(root, "report", + &tsm_reports_type); + if (IS_ERR(tsm)) { + configfs_unregister_subsystem(&tsm_configfs); + return PTR_ERR(tsm); + } + tsm_report_group = tsm; + + return 0; +} +module_init(tsm_init); + +static void __exit tsm_exit(void) +{ + configfs_unregister_default_group(tsm_report_group); + configfs_unregister_subsystem(&tsm_configfs); +} +module_exit(tsm_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Provide Trusted Security Module attestation reports via configfs"); diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index db13bb620f527e2a0d8204cfdafe877975c35394..c768de6f19a9a90d04eab66f522bad2f10ef1fd9 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -180,6 +180,11 @@ static inline bool is_kernel_rodata(unsigned long addr) addr < (unsigned long)__end_rodata; } +static inline bool is_kernel_ro_after_init(unsigned long addr) +{ + return addr >= (unsigned long)__start_ro_after_init && + addr < (unsigned long)__end_ro_after_init; +} /** * is_kernel_inittext - checks if the pointer address is located in the * .init.text section diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index e748bc957d83262233b3c8a655ed68b0ad8f34dd..bde9bdb8233f793fc91dee282e37054c425b66b0 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -112,6 +112,8 @@ int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); +void kvm_realm_timers_update(struct kvm_vcpu *vcpu); + u64 kvm_phys_timer_read(void); void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index c4aabbf002f7cf06cd02ec1fa23685490b15695d..34204f0a8701d9353dfb9fe9bca7a34a0fc148c2 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -76,6 +76,8 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_resync_el0(void); +#define kvm_pmu_get_irq_level(vcpu) ((vcpu)->arch.pmu.irq_level) + #define kvm_vcpu_has_pmu(vcpu) \ (test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features)) @@ -101,7 +103,10 @@ void kvm_vcpu_pmu_resync_el0(void); }) u8 kvm_arm_pmu_get_pmuver_limit(void); +int kvm_arm_set_default_pmu(struct kvm *kvm); +u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm); +u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu); #else struct kvm_pmu { }; @@ -163,6 +168,8 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) return 0; } +#define kvm_pmu_get_irq_level(vcpu) (false) + #define kvm_vcpu_has_pmu(vcpu) ({ false; }) #define kvm_pmu_is_3p5(vcpu) ({ false; }) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} @@ -173,6 +180,20 @@ static inline u8 kvm_arm_pmu_get_pmuver_limit(void) return 0; } static inline void kvm_vcpu_pmu_resync_el0(void) {} +static inline int kvm_arm_set_default_pmu(struct kvm *kvm) +{ + return -ENODEV; +} + +static inline u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) +{ + return 0; +} + +static inline u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) +{ + return 0; +} #endif diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h index 6e55b9283789b148f76030e63de34de03fc35cc0..bbeb68f031be44cad3e0f846c84c2b1d5e146f02 100644 --- a/include/kvm/arm_psci.h +++ b/include/kvm/arm_psci.h @@ -10,6 +10,8 @@ #include #include +#include + #define KVM_ARM_PSCI_0_1 PSCI_VERSION(0, 1) #define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2) #define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 18aade195884d7b3e3899dc89175e300f58434e0..8589b0dafdb0cad77d49af2acddba6f19dc08df2 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -55,14 +55,18 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev, #define phys_to_dma_unencrypted phys_to_dma #endif #else -static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, - phys_addr_t paddr) +static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { if (dev->dma_range_map) return translate_phys_to_dma(dev, paddr); return paddr; } +static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, + phys_addr_t paddr) +{ + return dma_addr_unencrypted(__phys_to_dma(dev, paddr)); +} /* * If memory encryption is supported, phys_to_dma will set the memory encryption * bit in the DMA address, and dma_to_phys will clear it. @@ -71,19 +75,20 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, */ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return __sme_set(phys_to_dma_unencrypted(dev, paddr)); + return dma_addr_encrypted(__phys_to_dma(dev, paddr)); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; + dma_addr = dma_addr_canonical(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else paddr = dma_addr; - return __sme_clr(paddr); + return paddr; } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f0a949b7c9733ce3209fb3adc561ac2f4d93d3ca..f5a2727ca4a9a9a8d675656632cfad4ff045c231 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -216,6 +216,7 @@ extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); +extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, @@ -265,6 +266,8 @@ static __always_inline void jump_label_init(void) static_key_initialized = true; } +static __always_inline void jump_label_init_ro(void) { } + static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index de2b643ac2177d8b4deaa6fd5468589401e36f6a..cebf74319ffc465eadd55a1bfbbef69d2f606029 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -259,11 +259,17 @@ union kvm_mmu_notifier_arg { pte_t pte; }; +enum kvm_gfn_range_filter { + KVM_FILTER_SHARED = BIT(0), + KVM_FILTER_PRIVATE = BIT(1), +}; + struct kvm_gfn_range { struct kvm_memory_slot *slot; gfn_t start; gfn_t end; union kvm_mmu_notifier_arg arg; + enum kvm_gfn_range_filter attr_filter; bool may_block; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index ae45263892611d95dd24789c7770ea3eddc1a1b3..07584c5e36fb40f8b63fb0388308ec664e0889d1 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -26,11 +26,34 @@ */ #define __sme_set(x) ((x) | sme_me_mask) #define __sme_clr(x) ((x) & ~sme_me_mask) + +#define dma_addr_encrypted(x) __sme_set(x) +#define dma_addr_canonical(x) __sme_clr(x) + #else #define __sme_set(x) (x) #define __sme_clr(x) (x) #endif +/* + * dma_addr_encrypted() and dma_addr_unencrypted() are for converting a given DMA + * address to the respective type of addressing. + * + * dma_addr_canonical() is used to reverse any conversions for encrypted/decrypted + * back to the canonical address. + */ +#ifndef dma_addr_encrypted +#define dma_addr_encrypted(x) (x) +#endif + +#ifndef dma_addr_unencrypted +#define dma_addr_unencrypted(x) (x) +#endif + +#ifndef dma_addr_canonical +#define dma_addr_canonical(x) (x) +#endif + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 143fbc10ecfe032e593e040514f7225bf316a2d1..266c7c82affd84618a519b35f4f1a42a4292af34 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -174,6 +174,7 @@ void kvm_host_pmu_init(struct arm_pmu *pmu); #endif bool arm_pmu_irq_is_nmi(void); +void arm_pmu_set_phys_irq(bool enable); /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); @@ -184,7 +185,11 @@ void armpmu_free_irq(int irq, int cpu); #define ARMV8_PMU_PDEV_NAME "armv8-pmu" -#endif /* CONFIG_ARM_PMU */ +#else /* CONFIG_ARM_PMU */ + +static inline void arm_pmu_set_phys_irq(bool enable) {} + +#endif #define ARMV8_SPE_PDEV_NAME "arm,spe-v1" #define ARMV8_TRBE_PDEV_NAME "arm,trbe" diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index e3899bd77f5cc5860e8c3182bd3797f1ca0002e7..28a21d13908a3400dd460b8bdca4ab2edb542448 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -215,8 +215,7 @@ #define ARMV8_PMU_PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ #define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ -#define ARMV8_PMU_PMCR_N_SHIFT 11 /* Number of counters supported */ -#define ARMV8_PMU_PMCR_N_MASK 0x1f +#define ARMV8_PMU_PMCR_N GENMASK(15, 11) /* Number of counters supported */ #define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ /* diff --git a/include/linux/slab.h b/include/linux/slab.h index a761cc3559f295ccf13f5de282b1cd0b08c44d6e..f30f3ade52cf4b9de2414432f4d72072af36675c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -764,6 +764,8 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) __realloc_size(3); extern void kvfree(const void *addr); +DEFINE_FREE(kvfree, void *, if (_T) kvfree(_T)) + extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); diff --git a/include/linux/tsm.h b/include/linux/tsm.h new file mode 100644 index 0000000000000000000000000000000000000000..de8324a2223c52597e476b6fb67f745281ca193a --- /dev/null +++ b/include/linux/tsm.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TSM_H +#define __TSM_H + +#include +#include + +#define TSM_INBLOB_MAX 64 +#define TSM_OUTBLOB_MAX SZ_32K + +/* + * Privilege level is a nested permission concept to allow confidential + * guests to partition address space, 4-levels are supported. + */ +#define TSM_PRIVLEVEL_MAX 3 + +/** + * struct tsm_desc - option descriptor for generating tsm report blobs + * @privlevel: optional privilege level to associate with @outblob + * @inblob_len: sizeof @inblob + * @inblob: arbitrary input data + */ +struct tsm_desc { + unsigned int privlevel; + size_t inblob_len; + u8 inblob[TSM_INBLOB_MAX]; +}; + +/** + * struct tsm_report - track state of report generation relative to options + * @desc: input parameters to @report_new() + * @outblob_len: sizeof(@outblob) + * @outblob: generated evidence to provider to the attestation agent + * @auxblob_len: sizeof(@auxblob) + * @auxblob: (optional) auxiliary data to the report (e.g. certificate data) + */ +struct tsm_report { + struct tsm_desc desc; + size_t outblob_len; + u8 *outblob; + size_t auxblob_len; + u8 *auxblob; +}; + +/** + * struct tsm_ops - attributes and operations for tsm instances + * @name: tsm id reflected in /sys/kernel/config/tsm/report/$report/provider + * @privlevel_floor: convey base privlevel for nested scenarios + * @report_new: Populate @report with the report blob and auxblob + * (optional), return 0 on successful population, or -errno otherwise + * + * Implementation specific ops, only one is expected to be registered at + * a time i.e. only one of "sev-guest", "tdx-guest", etc. + */ +struct tsm_ops { + const char *name; + const unsigned int privlevel_floor; + int (*report_new)(struct tsm_report *report, void *data); +}; + +extern const struct config_item_type tsm_report_default_type; + +/* publish @privlevel, @privlevel_floor, and @auxblob attributes */ +extern const struct config_item_type tsm_report_extra_type; + +int tsm_register(const struct tsm_ops *ops, void *priv, + const struct config_item_type *type); +int tsm_unregister(const struct tsm_ops *ops); +#endif /* __TSM_H */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 23fae6de3caa96ad24a7a10af0f0c65726ed3545..e0f07ddd42d1dcb5268c45054dece67c162acafd 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -925,14 +925,25 @@ struct kvm_ppc_resize_hpt { #define KVM_S390_SIE_PAGE_OFFSET 1 /* - * On arm64, machine type can be used to request the physical - * address size for the VM. Bits[7-0] are reserved for the guest - * PA size shift (i.e, log2(PA_Size)). For backward compatibility, - * value 0 implies the default IPA size, 40bits. + * On arm64, machine type can be used to request both the machine type and + * the physical address size for the VM. + * + * Bits[11-8] are reserved for the ARM specific machine type. + * + * Bits[7-0] are reserved for the guest PA size shift (i.e, log2(PA_Size)). + * For backward compatibility, value 0 implies the default IPA size, 40bits. */ +#define KVM_VM_TYPE_ARM_SHIFT 8 +#define KVM_VM_TYPE_ARM_MASK (0xfULL << KVM_VM_TYPE_ARM_SHIFT) +#define KVM_VM_TYPE_ARM(_type) \ + (((_type) << KVM_VM_TYPE_ARM_SHIFT) & KVM_VM_TYPE_ARM_MASK) +#define KVM_VM_TYPE_ARM_NORMAL KVM_VM_TYPE_ARM(0) +#define KVM_VM_TYPE_ARM_REALM KVM_VM_TYPE_ARM(1) + #define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL #define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + /* * ioctls for /dev/kvm fds: */ @@ -1207,6 +1218,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_USER_MEMORY2 231 +#define KVM_CAP_ARM_RME 240 #define KVM_CAP_SEV_ES_GHCB 500 #define KVM_CAP_HYGON_COCO_EXT 501 @@ -2407,4 +2419,13 @@ struct kvm_csv3_handle_memory { __u32 opcode; }; +/* Available with KVM_CAP_ARM_RME, only for VMs with KVM_VM_TYPE_ARM_REALM */ +struct kvm_arm_rmm_psci_complete { + __u64 target_mpidr; + __u32 psci_status; + __u32 padding[3]; +}; + +#define KVM_ARM_VCPU_RMM_PSCI_COMPLETE _IOW(KVMIO, 0xd6, struct kvm_arm_rmm_psci_complete) + #endif /* __LINUX_KVM_H */ diff --git a/init/main.c b/init/main.c index 7254231225675434f005f215e8ca9d32d90e2208..a6b581b548931e83e635c7e71b0589318fcafcbb 100644 --- a/init/main.c +++ b/init/main.c @@ -1416,6 +1416,7 @@ static void mark_readonly(void) * insecure pages which are W+X. */ flush_module_init_free_work(); + jump_label_init_ro(); mark_rodata_ro(); rodata_test(); } else diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 554e04b25b13a8e27694e0ea783be22f171f6374..101572d6a9083679490c9bab91263b974efe354a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -565,6 +565,45 @@ void __init jump_label_init(void) cpus_read_unlock(); } +static inline bool static_key_sealed(struct static_key *key) +{ + return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); +} + +static inline void static_key_seal(struct static_key *key) +{ + unsigned long type = key->type & JUMP_TYPE_TRUE; + key->type = JUMP_TYPE_LINKED | type; +} + +void jump_label_init_ro(void) +{ + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + if (WARN_ON_ONCE(!static_key_initialized)) + return; + + cpus_read_lock(); + jump_label_lock(); + + for (iter = iter_start; iter < iter_stop; iter++) { + struct static_key *iterk = jump_entry_key(iter); + + if (!is_kernel_ro_after_init((unsigned long)iterk)) + continue; + + if (static_key_sealed(iterk)) + continue; + + static_key_seal(iterk); + } + + jump_label_unlock(); + cpus_read_unlock(); +} + #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) @@ -685,6 +724,15 @@ static int jump_label_add_module(struct module *mod) static_key_set_entries(key, iter); continue; } + + /* + * If the key was sealed at init, then there's no need to keep a + * reference to its module entries - just patch them now and be + * done with it. + */ + if (static_key_sealed(key)) + goto do_poke; + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; @@ -710,6 +758,7 @@ static int jump_label_add_module(struct module *mod) static_key_set_linked(key); /* Only update if we've changed from our initial state */ +do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } @@ -734,6 +783,10 @@ static void jump_label_del_module(struct module *mod) if (within_module((unsigned long)key, mod)) continue; + /* No @jlm allocated because key was sealed at init. */ + if (static_key_sealed(key)) + continue; + /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e4077b389c80c29d034ae9ec25f958934f0811be..de586c589f0578ae452383b88f5efac2e90ac950 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -625,6 +625,11 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, */ gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; + /* + * HVA-based notifications aren't relevant to private + * mappings as they don't have a userspace mapping. + */ + gfn_range.attr_filter = KVM_FILTER_SHARED; /* * {gfn(page) | page intersects with [hva_start, hva_end)} =